/* simulator.c -- Interface for the AArch64 simulator.
- Copyright (C) 2015-2016 Free Software Foundation, Inc.
+ Copyright (C) 2015-2020 Free Software Foundation, Inc.
Contributed by Red Hat.
" exe addr %" PRIx64, \
__LINE__, aarch64_get_PC (cpu)); \
if (! TRACE_ANY_P (cpu)) \
- { \
- sim_io_eprintf (CPU_STATE (cpu), "SIM Error: Unimplemented instruction: "); \
- trace_disasm (CPU_STATE (cpu), cpu, aarch64_get_PC (cpu)); \
- } \
+ sim_io_eprintf (CPU_STATE (cpu), "SIM Error: Unimplemented instruction: %#08x\n", \
+ aarch64_get_instr (cpu)); \
sim_engine_halt (CPU_STATE (cpu), cpu, NULL, aarch64_get_PC (cpu),\
sim_stopped, SIM_SIGABRT); \
} \
if (result & (1 << 31))
flags |= N;
- if (uresult != result)
+ if (uresult != (uint32_t)result)
flags |= C;
if (sresult != result)
flags |= V;
aarch64_set_CPSR (cpu, flags);
}
+#define NEG(a) (((a) & signbit) == signbit)
+#define POS(a) (((a) & signbit) == 0)
+
static void
set_flags_for_add64 (sim_cpu *cpu, uint64_t value1, uint64_t value2)
{
- int64_t sval1 = value1;
- int64_t sval2 = value2;
- uint64_t result = value1 + value2;
- int64_t sresult = sval1 + sval2;
- uint32_t flags = 0;
+ uint64_t result = value1 + value2;
+ uint32_t flags = 0;
+ uint64_t signbit = 1ULL << 63;
if (result == 0)
flags |= Z;
- if (result & (1ULL << 63))
+ if (NEG (result))
flags |= N;
- if (sval1 < 0)
- {
- if (sval2 < 0)
- {
- /* Negative plus a negative. Overflow happens if
- the result is greater than either of the operands. */
- if (sresult > sval1 || sresult > sval2)
- flags |= V;
- }
- /* else Negative plus a positive. Overflow cannot happen. */
- }
- else /* value1 is +ve. */
- {
- if (sval2 < 0)
- {
- /* Overflow can only occur if we computed "0 - MININT". */
- if (sval1 == 0 && sval2 == (1LL << 63))
- flags |= V;
- }
- else
- {
- /* Postive plus positive - overflow has happened if the
- result is smaller than either of the operands. */
- if (result < value1 || result < value2)
- flags |= V | C;
- }
- }
+ if ( (NEG (value1) && NEG (value2))
+ || (NEG (value1) && POS (result))
+ || (NEG (value2) && POS (result)))
+ flags |= C;
+
+ if ( (NEG (value1) && NEG (value2) && POS (result))
+ || (POS (value1) && POS (value2) && NEG (result)))
+ flags |= V;
aarch64_set_CPSR (cpu, flags);
}
-#define NEG(a) (((a) & signbit) == signbit)
-#define POS(a) (((a) & signbit) == 0)
-
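The carry and overflow identities above can be exercised in isolation. A
minimal standalone sketch (plain C, independent of the simulator's
sim_cpu/CPSR plumbing; the helper names are illustrative):

#include <assert.h>
#include <stdint.h>

#define SIGN63(a) (((a) >> 63) & 1)

/* Carry out of a 64-bit add, same identity as the C-flag logic above:
   carry iff both operands are negative, or either operand is negative
   and the truncated result is not.  */
static int
add64_carry (uint64_t v1, uint64_t v2)
{
  uint64_t result = v1 + v2;
  return (SIGN63 (v1) && SIGN63 (v2))
         || (SIGN63 (v1) && !SIGN63 (result))
         || (SIGN63 (v2) && !SIGN63 (result));
}

/* Signed overflow: both operands share a sign that the result lacks.  */
static int
add64_overflow (uint64_t v1, uint64_t v2)
{
  uint64_t result = v1 + v2;
  return (SIGN63 (v1) && SIGN63 (v2) && !SIGN63 (result))
         || (!SIGN63 (v1) && !SIGN63 (v2) && SIGN63 (result));
}

int
main (void)
{
  assert (add64_carry (~0ULL, 1) && !add64_overflow (~0ULL, 1));
  assert (add64_carry (1ULL << 63, 1ULL << 63));    /* MININT + MININT.  */
  assert (add64_overflow (1ULL << 63, 1ULL << 63));
  assert (!add64_carry (5, 7) && !add64_overflow (5, 7));
  return 0;
}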
static void
set_flags_for_sub32 (sim_cpu *cpu, uint32_t value1, uint32_t value2)
{
}
static void
-do_vec_MOV_into_scalar (sim_cpu *cpu)
+do_vec_SMOV_into_scalar (sim_cpu *cpu)
{
/* instr[31] = 0
instr[30] = word(0)/long(1)
instr[29,21] = 00 1110 000
- instr[20,18] = element size and index
- instr[17,10] = 00 0011 11
+ instr[20,16] = element size and index
+ instr[15,10] = 00 0010 11
instr[9,5] = V source
instr[4,0] = R dest */
unsigned vs = INSTR (9, 5);
unsigned rd = INSTR (4, 0);
+ unsigned imm5 = INSTR (20, 16);
+ unsigned full = INSTR (30, 30);
+ int size, index;
NYI_assert (29, 21, 0x070);
- NYI_assert (17, 10, 0x0F);
+ NYI_assert (15, 10, 0x0B);
TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
- switch (INSTR (20, 18))
+
+ if (imm5 & 0x1)
+ {
+ size = 0;
+ index = (imm5 >> 1) & 0xF;
+ }
+ else if (imm5 & 0x2)
+ {
+ size = 1;
+ index = (imm5 >> 2) & 0x7;
+ }
+ else if (full && (imm5 & 0x4))
+ {
+ size = 2;
+ index = (imm5 >> 3) & 0x3;
+ }
+ else
+ HALT_UNALLOC;
+
+ switch (size)
{
- case 0x2:
- aarch64_set_reg_u64 (cpu, rd, NO_SP, aarch64_get_vec_u64 (cpu, vs, 0));
+ case 0:
+ if (full)
+ aarch64_set_reg_s64 (cpu, rd, NO_SP,
+ aarch64_get_vec_s8 (cpu, vs, index));
+ else
+ aarch64_set_reg_s32 (cpu, rd, NO_SP,
+ aarch64_get_vec_s8 (cpu, vs, index));
break;
- case 0x6:
- aarch64_set_reg_u64 (cpu, rd, NO_SP, aarch64_get_vec_u64 (cpu, vs, 1));
+ case 1:
+ if (full)
+ aarch64_set_reg_s64 (cpu, rd, NO_SP,
+ aarch64_get_vec_s16 (cpu, vs, index));
+ else
+ aarch64_set_reg_s32 (cpu, rd, NO_SP,
+ aarch64_get_vec_s16 (cpu, vs, index));
break;
- case 0x1:
- case 0x3:
- case 0x5:
- case 0x7:
- aarch64_set_reg_u64 (cpu, rd, NO_SP, aarch64_get_vec_u32
- (cpu, vs, INSTR (20, 19)));
+ case 2:
+ aarch64_set_reg_s64 (cpu, rd, NO_SP,
+ aarch64_get_vec_s32 (cpu, vs, index));
break;
default:
- HALT_NYI;
+ HALT_UNALLOC;
+ }
+}
+
+static void
+do_vec_UMOV_into_scalar (sim_cpu *cpu)
+{
+ /* instr[31] = 0
+ instr[30] = word(0)/long(1)
+ instr[29,21] = 00 1110 000
+ instr[20,16] = element size and index
+ instr[15,10] = 00 0011 11
+ instr[9,5] = V source
+ instr[4,0] = R dest */
+
+ unsigned vs = INSTR (9, 5);
+ unsigned rd = INSTR (4, 0);
+ unsigned imm5 = INSTR (20, 16);
+ unsigned full = INSTR (30, 30);
+ int size, index;
+
+ NYI_assert (29, 21, 0x070);
+ NYI_assert (15, 10, 0x0F);
+
+ TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
+
+ if (!full)
+ {
+ if (imm5 & 0x1)
+ {
+ size = 0;
+ index = (imm5 >> 1) & 0xF;
+ }
+ else if (imm5 & 0x2)
+ {
+ size = 1;
+ index = (imm5 >> 2) & 0x7;
+ }
+ else if (imm5 & 0x4)
+ {
+ size = 2;
+ index = (imm5 >> 3) & 0x3;
+ }
+ else
+ HALT_UNALLOC;
+ }
+ else if (imm5 & 0x8)
+ {
+ size = 3;
+ index = (imm5 >> 4) & 0x1;
+ }
+ else
+ HALT_UNALLOC;
+
+ switch (size)
+ {
+ case 0:
+ aarch64_set_reg_u32 (cpu, rd, NO_SP,
+ aarch64_get_vec_u8 (cpu, vs, index));
+ break;
+
+ case 1:
+ aarch64_set_reg_u32 (cpu, rd, NO_SP,
+ aarch64_get_vec_u16 (cpu, vs, index));
+ break;
+
+ case 2:
+ aarch64_set_reg_u32 (cpu, rd, NO_SP,
+ aarch64_get_vec_u32 (cpu, vs, index));
+ break;
+
+ case 3:
+ aarch64_set_reg_u64 (cpu, rd, NO_SP,
+ aarch64_get_vec_u64 (cpu, vs, index));
+ break;
+
+ default:
+ HALT_UNALLOC;
}
}
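Both MOV-into-scalar forms share one decode rule: the position of the
lowest set bit of imm5 selects the element size, and the bits above it
give the lane index. The same decode as a standalone sketch (the helper
name is hypothetical):

#include <assert.h>

/* Decode SMOV/UMOV imm5 into an element size (0=B, 1=H, 2=S, 3=D),
   writing the lane index through INDEX; -1 means unallocated.  */
static int
decode_imm5 (unsigned imm5, int *index)
{
  if (imm5 & 0x1)
    { *index = (imm5 >> 1) & 0xF; return 0; }
  else if (imm5 & 0x2)
    { *index = (imm5 >> 2) & 0x7; return 1; }
  else if (imm5 & 0x4)
    { *index = (imm5 >> 3) & 0x3; return 2; }
  else if (imm5 & 0x8)
    { *index = (imm5 >> 4) & 0x1; return 3; }
  return -1;
}

int
main (void)
{
  int index;
  assert (decode_imm5 (0x1F, &index) == 0 && index == 15);  /* B[15].  */
  assert (decode_imm5 (0x0E, &index) == 1 && index == 3);   /* H[3].  */
  assert (decode_imm5 (0x14, &index) == 2 && index == 2);   /* S[2].  */
  assert (decode_imm5 (0x18, &index) == 3 && index == 1);   /* D[1].  */
  assert (decode_imm5 (0x00, &index) == -1);                /* Unallocated.  */
  return 0;
}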
uint64_t val_n1 = aarch64_get_vec_u64 (cpu, vn, 0);
uint64_t val_n2 = aarch64_get_vec_u64 (cpu, vn, 1);
- uint64_t val1 = 0;
- uint64_t val2 = 0;
+ uint64_t val1;
+ uint64_t val2;
- uint64_t input1 = upper ? val_n1 : val_m1;
- uint64_t input2 = upper ? val_n2 : val_m2;
- unsigned i;
+ uint64_t input2 = full ? val_n2 : val_m1;
NYI_assert (29, 24, 0x0E);
NYI_assert (21, 21, 0);
NYI_assert (13, 10, 6);
TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
- switch (INSTR (23, 23))
+ switch (INSTR (23, 22))
{
case 0:
- for (i = 0; i < 8; i++)
+ val1 = (val_n1 >> (upper * 8)) & 0xFFULL;
+ val1 |= (val_n1 >> ((upper * 8) + 8)) & 0xFF00ULL;
+ val1 |= (val_n1 >> ((upper * 8) + 16)) & 0xFF0000ULL;
+ val1 |= (val_n1 >> ((upper * 8) + 24)) & 0xFF000000ULL;
+
+ val1 |= (input2 << (32 - (upper * 8))) & 0xFF00000000ULL;
+ val1 |= (input2 << (24 - (upper * 8))) & 0xFF0000000000ULL;
+ val1 |= (input2 << (16 - (upper * 8))) & 0xFF000000000000ULL;
+ val1 |= (input2 << (8 - (upper * 8))) & 0xFF00000000000000ULL;
+
+ if (full)
{
- val1 |= (input1 >> (i * 8)) & (0xFFULL << (i * 8));
- val2 |= (input2 >> (i * 8)) & (0xFFULL << (i * 8));
+ val2 = (val_m1 >> (upper * 8)) & 0xFFULL;
+ val2 |= (val_m1 >> ((upper * 8) + 8)) & 0xFF00ULL;
+ val2 |= (val_m1 >> ((upper * 8) + 16)) & 0xFF0000ULL;
+ val2 |= (val_m1 >> ((upper * 8) + 24)) & 0xFF000000ULL;
+
+ val2 |= (val_m2 << (32 - (upper * 8))) & 0xFF00000000ULL;
+ val2 |= (val_m2 << (24 - (upper * 8))) & 0xFF0000000000ULL;
+ val2 |= (val_m2 << (16 - (upper * 8))) & 0xFF000000000000ULL;
+ val2 |= (val_m2 << (8 - (upper * 8))) & 0xFF00000000000000ULL;
}
break;
case 1:
- for (i = 0; i < 4; i++)
+ val1 = (val_n1 >> (upper * 16)) & 0xFFFFULL;
+ val1 |= (val_n1 >> ((upper * 16) + 16)) & 0xFFFF0000ULL;
+
+ val1 |= (input2 << (32 - (upper * 16))) & 0xFFFF00000000ULL;
+ val1 |= (input2 << (16 - (upper * 16))) & 0xFFFF000000000000ULL;
+
+ if (full)
{
- val1 |= (input1 >> (i * 16)) & (0xFFFFULL << (i * 16));
- val2 |= (input2 >> (i * 16)) & (0xFFFFULL << (i * 16));
+ val2 = (val_m1 >> (upper * 16)) & 0xFFFFULL;
+ val2 |= (val_m1 >> ((upper * 16) + 16)) & 0xFFFF0000ULL;
+
+ val2 |= (val_m2 << (32 - (upper * 16))) & 0xFFFF00000000ULL;
+ val2 |= (val_m2 << (16 - (upper * 16))) & 0xFFFF000000000000ULL;
}
break;
case 2:
- val1 = ((input1 & 0xFFFFFFFF) | ((input1 >> 32) & 0xFFFFFFFF00000000ULL));
- val2 = ((input2 & 0xFFFFFFFF) | ((input2 >> 32) & 0xFFFFFFFF00000000ULL));
+ val1 = (val_n1 >> (upper * 32)) & 0xFFFFFFFF;
+ val1 |= (input2 << (32 - (upper * 32))) & 0xFFFFFFFF00000000ULL;
+
+ if (full)
+ {
+ val2 = (val_m1 >> (upper * 32)) & 0xFFFFFFFF;
+ val2 |= (val_m2 << (32 - (upper * 32))) & 0xFFFFFFFF00000000ULL;
+ }
+ break;
case 3:
- val1 = input1;
- val2 = input2;
- break;
+ if (! full)
+ HALT_UNALLOC;
+
+ val1 = upper ? val_n2 : val_n1;
+ val2 = upper ? val_m2 : val_m1;
+ break;
}
aarch64_set_vec_u64 (cpu, vd, 0, val1);
case 0x8: /* 16-bit, no shift. */
for (i = 0; i < (full ? 8 : 4); i++)
aarch64_set_vec_u16 (cpu, vd, i, val);
- /* Fall through. */
+ break;
+
case 0xd: /* 32-bit, mask shift by 16. */
val <<= 8;
val |= 0xFF;
unsigned vm = INSTR (9, 5);
unsigned rd = INSTR (4, 0);
unsigned i;
- uint64_t val = 0;
int full = INSTR (30, 30);
NYI_assert (29, 24, 0x0E);
switch (INSTR (23, 22))
{
case 0:
- for (i = 0; i < (full ? 16 : 8); i++)
- val += aarch64_get_vec_u8 (cpu, vm, i);
- aarch64_set_reg_u64 (cpu, rd, NO_SP, val);
- return;
+ {
+ uint8_t val = 0;
+ for (i = 0; i < (full ? 16 : 8); i++)
+ val += aarch64_get_vec_u8 (cpu, vm, i);
+ aarch64_set_vec_u64 (cpu, rd, 0, val);
+ return;
+ }
case 1:
- for (i = 0; i < (full ? 8 : 4); i++)
- val += aarch64_get_vec_u16 (cpu, vm, i);
- aarch64_set_reg_u64 (cpu, rd, NO_SP, val);
- return;
+ {
+ uint16_t val = 0;
+ for (i = 0; i < (full ? 8 : 4); i++)
+ val += aarch64_get_vec_u16 (cpu, vm, i);
+ aarch64_set_vec_u64 (cpu, rd, 0, val);
+ return;
+ }
case 2:
- for (i = 0; i < (full ? 4 : 2); i++)
- val += aarch64_get_vec_u32 (cpu, vm, i);
- aarch64_set_reg_u64 (cpu, rd, NO_SP, val);
- return;
+ {
+ uint32_t val = 0;
+ if (! full)
+ HALT_UNALLOC;
+ for (i = 0; i < 4; i++)
+ val += aarch64_get_vec_u32 (cpu, vm, i);
+ aarch64_set_vec_u64 (cpu, rd, 0, val);
+ return;
+ }
case 3:
- if (! full)
- HALT_UNALLOC;
- val = aarch64_get_vec_u64 (cpu, vm, 0);
- val += aarch64_get_vec_u64 (cpu, vm, 1);
- aarch64_set_reg_u64 (cpu, rd, NO_SP, val);
- return;
+ HALT_UNALLOC;
}
}
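The switch to fixed-width accumulators is deliberate: ADDV's destination
element has the same width as the source lanes, so the sum wraps at that
width before being zero-extended into the vector register. A small
self-check of the byte case:

#include <assert.h>
#include <stdint.h>

int
main (void)
{
  uint8_t sum8 = 0;     /* Element-width accumulator, as above.  */
  unsigned wide = 0;    /* What a register-width accumulator computes.  */
  int i;

  for (i = 0; i < 16; i++)
    {
      sum8 += 200;      /* Sixteen byte lanes, each holding 200.  */
      wide += 200;
    }

  assert (wide == 3200);
  assert (sum8 == 3200 % 256);  /* Wraps to 128, the architected result.  */
  return 0;
}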
switch (INSTR (23, 22))
{
case 0:
- DO_VEC_WIDENING_MUL (full ? 16 : 8, uint16_t, u8, u16);
+ DO_VEC_WIDENING_MUL (full ? 16 : 8, uint8_t, u8, u8);
return;
case 1:
- DO_VEC_WIDENING_MUL (full ? 8 : 4, uint32_t, u16, u32);
+ DO_VEC_WIDENING_MUL (full ? 8 : 4, uint16_t, u16, u16);
return;
case 2:
- DO_VEC_WIDENING_MUL (full ? 4 : 2, uint64_t, u32, u64);
+ DO_VEC_WIDENING_MUL (full ? 4 : 2, uint32_t, u32, u32);
return;
case 3:
switch (INSTR (23, 22))
{
case 0:
- {
- uint16_t a[16], b[16];
-
- for (i = 0; i < (full ? 16 : 8); i++)
- {
- a[i] = aarch64_get_vec_u8 (cpu, vn, i);
- b[i] = aarch64_get_vec_u8 (cpu, vm, i);
- }
-
- for (i = 0; i < (full ? 16 : 8); i++)
- {
- uint16_t v = aarch64_get_vec_u8 (cpu, vd, i);
-
- aarch64_set_vec_u16 (cpu, vd, i, v + (a[i] * b[i]));
- }
- }
+ for (i = 0; i < (full ? 16 : 8); i++)
+ aarch64_set_vec_u8 (cpu, vd, i,
+ aarch64_get_vec_u8 (cpu, vd, i)
+ + (aarch64_get_vec_u8 (cpu, vn, i)
+ * aarch64_get_vec_u8 (cpu, vm, i)));
return;
case 1:
- {
- uint32_t a[8], b[8];
-
- for (i = 0; i < (full ? 8 : 4); i++)
- {
- a[i] = aarch64_get_vec_u16 (cpu, vn, i);
- b[i] = aarch64_get_vec_u16 (cpu, vm, i);
- }
-
- for (i = 0; i < (full ? 8 : 4); i++)
- {
- uint32_t v = aarch64_get_vec_u16 (cpu, vd, i);
-
- aarch64_set_vec_u32 (cpu, vd, i, v + (a[i] * b[i]));
- }
- }
+ for (i = 0; i < (full ? 8 : 4); i++)
+ aarch64_set_vec_u16 (cpu, vd, i,
+ aarch64_get_vec_u16 (cpu, vd, i)
+ + (aarch64_get_vec_u16 (cpu, vn, i)
+ * aarch64_get_vec_u16 (cpu, vm, i)));
return;
case 2:
- {
- uint64_t a[4], b[4];
-
- for (i = 0; i < (full ? 4 : 2); i++)
- {
- a[i] = aarch64_get_vec_u32 (cpu, vn, i);
- b[i] = aarch64_get_vec_u32 (cpu, vm, i);
- }
-
- for (i = 0; i < (full ? 4 : 2); i++)
- {
- uint64_t v = aarch64_get_vec_u32 (cpu, vd, i);
-
- aarch64_set_vec_u64 (cpu, vd, i, v + (a[i] * b[i]));
- }
- }
+ for (i = 0; i < (full ? 4 : 2); i++)
+ aarch64_set_vec_u32 (cpu, vd, i,
+ aarch64_get_vec_u32 (cpu, vd, i)
+ + (aarch64_get_vec_u32 (cpu, vn, i)
+ * aarch64_get_vec_u32 (cpu, vm, i)));
return;
- case 3:
+ default:
HALT_UNALLOC;
}
}
static float
fmaxnm (float a, float b)
{
- if (fpclassify (a) == FP_NORMAL)
+ if (! isnan (a))
{
- if (fpclassify (b) == FP_NORMAL)
+ if (! isnan (b))
return a > b ? a : b;
return a;
}
- else if (fpclassify (b) == FP_NORMAL)
+ else if (! isnan (b))
return b;
return a;
}
static float
fminnm (float a, float b)
{
- if (fpclassify (a) == FP_NORMAL)
+ if (! isnan (a))
{
- if (fpclassify (b) == FP_NORMAL)
+ if (! isnan (b))
return a < b ? a : b;
return a;
}
- else if (fpclassify (b) == FP_NORMAL)
+ else if (! isnan (b))
return b;
return a;
}
static double
dmaxnm (double a, double b)
{
- if (fpclassify (a) == FP_NORMAL)
+ if (! isnan (a))
{
- if (fpclassify (b) == FP_NORMAL)
+ if (! isnan (b))
return a > b ? a : b;
return a;
}
- else if (fpclassify (b) == FP_NORMAL)
+ else if (! isnan (b))
return b;
return a;
}
static double
dminnm (double a, double b)
{
- if (fpclassify (a) == FP_NORMAL)
+ if (! isnan (a))
{
- if (fpclassify (b) == FP_NORMAL)
+ if (! isnan (b))
return a < b ? a : b;
return a;
}
- else if (fpclassify (b) == FP_NORMAL)
+ else if (! isnan (b))
return b;
return a;
}
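All four helpers now implement the IEEE 754-2008 maxNum/minNum rule that
FMAXNM/FMINNM require: a NaN in one operand is ignored in favour of the
numeric operand (the old FP_NORMAL test also rejected zeros, infinities
and subnormals). A minimal check of that behaviour:

#include <assert.h>
#include <math.h>

/* Same shape as fmaxnm above.  */
static float
maxnm (float a, float b)
{
  if (!isnan (a))
    return !isnan (b) ? (a > b ? a : b) : a;
  return !isnan (b) ? b : a;
}

int
main (void)
{
  assert (maxnm (1.0f, 2.0f) == 2.0f);
  assert (maxnm (NAN, 2.0f) == 2.0f);   /* The NaN operand is ignored...  */
  assert (maxnm (1.0f, NAN) == 1.0f);
  assert (isnan (maxnm (NAN, NAN)));    /* ...unless both are NaN.  */
  assert (maxnm (0.0f, INFINITY) == INFINITY);  /* Non-normals count too.  */
  return 0;
}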
NYI_assert (15, 10, 0x07);
TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
- if (test_false)
- {
- for (i = 0; i < (full ? 16 : 8); i++)
- if (aarch64_get_vec_u32 (cpu, vn, i) == 0)
- aarch64_set_vec_u32 (cpu, vd, i, aarch64_get_vec_u32 (cpu, vm, i));
- }
- else
+ for (i = 0; i < (full ? 4 : 2); i++)
{
- for (i = 0; i < (full ? 16 : 8); i++)
- if (aarch64_get_vec_u32 (cpu, vn, i) != 0)
- aarch64_set_vec_u32 (cpu, vd, i, aarch64_get_vec_u32 (cpu, vm, i));
+ uint32_t vd_val = aarch64_get_vec_u32 (cpu, vd, i);
+ uint32_t vn_val = aarch64_get_vec_u32 (cpu, vn, i);
+ uint32_t vm_val = aarch64_get_vec_u32 (cpu, vm, i);
+ if (test_false)
+ aarch64_set_vec_u32 (cpu, vd, i,
+ (vd_val & vm_val) | (vn_val & ~vm_val));
+ else
+ aarch64_set_vec_u32 (cpu, vd, i,
+ (vd_val & ~vm_val) | (vn_val & vm_val));
}
}
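The rewrite makes BIT/BIF pure bitwise selects, so each mask bit acts
independently (the old code tested whole 32-bit lanes against zero). The
two identities in isolation:

#include <assert.h>
#include <stdint.h>

/* BIT: insert Vn bits into Vd where the Vm mask bit is 1.  */
static uint32_t
bit_select (uint32_t vd, uint32_t vn, uint32_t vm)
{
  return (vd & ~vm) | (vn & vm);
}

/* BIF: insert Vn bits into Vd where the Vm mask bit is 0.  */
static uint32_t
bif_select (uint32_t vd, uint32_t vn, uint32_t vm)
{
  return (vd & vm) | (vn & ~vm);
}

int
main (void)
{
  assert (bit_select (0x00000000, 0xFFFFFFFF, 0x0000FFFF) == 0x0000FFFF);
  assert (bif_select (0x00000000, 0xFFFFFFFF, 0x0000FFFF) == 0xFFFF0000);
  /* The two selects are complementary and partition every bit.  */
  assert ((bit_select (0x12345678, 0x9ABCDEF0, 0x0F0F0F0F)
           ^ bif_select (0x12345678, 0x9ABCDEF0, 0x0F0F0F0F))
          == (0x12345678 ^ 0x9ABCDEF0));
  return 0;
}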
switch (INSTR (23, 22))
{
case 0:
- if (bias)
- for (i = 0; i < 8; i++)
- aarch64_set_vec_u8 (cpu, vd, i + 8,
- aarch64_get_vec_u16 (cpu, vs, i) >> 8);
- else
- for (i = 0; i < 8; i++)
- aarch64_set_vec_u8 (cpu, vd, i, aarch64_get_vec_u16 (cpu, vs, i));
+ for (i = 0; i < 8; i++)
+ aarch64_set_vec_u8 (cpu, vd, i + (bias * 8),
+ aarch64_get_vec_u16 (cpu, vs, i));
return;
case 1:
- if (bias)
- for (i = 0; i < 4; i++)
- aarch64_set_vec_u16 (cpu, vd, i + 4,
- aarch64_get_vec_u32 (cpu, vs, i) >> 16);
- else
- for (i = 0; i < 4; i++)
- aarch64_set_vec_u16 (cpu, vd, i, aarch64_get_vec_u32 (cpu, vs, i));
+ for (i = 0; i < 4; i++)
+ aarch64_set_vec_u16 (cpu, vd, i + (bias * 4),
+ aarch64_get_vec_u32 (cpu, vs, i));
return;
case 2:
- if (bias)
- for (i = 0; i < 2; i++)
- aarch64_set_vec_u32 (cpu, vd, i + 4,
- aarch64_get_vec_u64 (cpu, vs, i) >> 32);
- else
- for (i = 0; i < 2; i++)
- aarch64_set_vec_u32 (cpu, vd, i, aarch64_get_vec_u64 (cpu, vs, i));
+ for (i = 0; i < 2; i++)
+ aarch64_set_vec_u32 (cpu, vd, i + (bias * 2),
+ aarch64_get_vec_u64 (cpu, vs, i));
return;
}
}
+/* Return the number of bits set in the input value. */
+#if __GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 4)
+# define popcount __builtin_popcount
+#else
+static int
+popcount (unsigned char x)
+{
+ static const unsigned char popcnt[16] =
+ {
+ 0, 1, 1, 2,
+ 1, 2, 2, 3,
+ 1, 2, 2, 3,
+ 2, 3, 3, 4
+ };
+
+ /* Only counts the low 8 bits of the input as that is all we need. */
+ return popcnt[x % 16] + popcnt[x / 16];
+}
+#endif
+
+static void
+do_vec_CNT (sim_cpu *cpu)
+{
+ /* instr[31] = 0
+ instr[30] = half (0)/ full (1)
+ instr[29,24] = 00 1110
+ instr[23,22] = size: byte(00)
+ instr[21,10] = 1000 0001 0110
+ instr[9,5] = Vs
+ instr[4,0] = Vd. */
+
+ unsigned vs = INSTR (9, 5);
+ unsigned vd = INSTR (4, 0);
+ int full = INSTR (30, 30);
+ int size = INSTR (23, 22);
+ int i;
+
+ NYI_assert (29, 24, 0x0E);
+ NYI_assert (21, 10, 0x816);
+
+ if (size != 0)
+ HALT_UNALLOC;
+
+ TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
+
+ for (i = 0; i < (full ? 16 : 8); i++)
+ aarch64_set_vec_u8 (cpu, vd, i,
+ popcount (aarch64_get_vec_u8 (cpu, vs, i)));
+}
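The nibble-table fallback can be cross-checked against a naive bit loop
over every byte value, which is exactly the domain CNT needs:

#include <assert.h>

/* Per-byte popcount via a 16-entry nibble table, as above.  */
static int
byte_popcount (unsigned char x)
{
  static const unsigned char popcnt[16] =
    { 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4 };
  return popcnt[x & 0xF] + popcnt[x >> 4];
}

int
main (void)
{
  int i, bits, total = 0;

  for (i = 0; i < 256; i++)
    {
      unsigned char v = i;
      for (bits = 0; v != 0; v >>= 1)  /* Reference: one bit at a time.  */
        bits += v & 1;
      assert (byte_popcount (i) == bits);
      total += bits;
    }
  assert (total == 1024);  /* Each of 8 bit positions is set 128 times.  */
  return 0;
}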
+
static void
do_vec_maxv (sim_cpu *cpu)
{
case 0x0D: /* 0001101 GT */ VEC_CMP (s, > );
case 0x0F: /* 0001111 GE */ VEC_CMP (s, >= );
case 0x22: /* 0100010 GT #0 */ VEC_CMP0 (s, > );
+ case 0x23: /* 0100011 TST */ VEC_CMP (u, & );
case 0x26: /* 0100110 EQ #0 */ VEC_CMP0 (s, == );
case 0x2A: /* 0101010 LT #0 */ VEC_CMP0 (s, < );
case 0x4D: /* 1001101 HI */ VEC_CMP (u, > );
}
}
+/* Floating point vector convert to longer (higher precision). */
static void
-do_vec_UMOV (sim_cpu *cpu)
+do_vec_FCVTL (sim_cpu *cpu)
{
/* instr[31] = 0
- instr[30] = 32-bit(0)/64-bit(1)
- instr[29,21] = 00 1110 000
- insrt[20,16] = size & index
- instr[15,10] = 0011 11
- instr[9,5] = V source
- instr[4,0] = R dest. */
+ instr[30] = half (0) / all (1)
+ instr[29,23] = 00 1110 0
+ instr[22] = single (0) / double (1)
+ instr[21,10] = 10 0001 0111 10
+ instr[9,5] = Rn
+ instr[4,0] = Rd. */
- unsigned vs = INSTR (9, 5);
+ unsigned rn = INSTR (9, 5);
unsigned rd = INSTR (4, 0);
- unsigned index;
+ unsigned full = INSTR (30, 30);
+ unsigned i;
- NYI_assert (29, 21, 0x070);
- NYI_assert (15, 10, 0x0F);
+ NYI_assert (31, 31, 0);
+ NYI_assert (29, 23, 0x1C);
+ NYI_assert (21, 10, 0x85E);
TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
- if (INSTR (16, 16))
- {
- /* Byte transfer. */
- index = INSTR (20, 17);
- aarch64_set_reg_u64 (cpu, rd, NO_SP,
- aarch64_get_vec_u8 (cpu, vs, index));
- }
- else if (INSTR (17, 17))
- {
- index = INSTR (20, 18);
- aarch64_set_reg_u64 (cpu, rd, NO_SP,
- aarch64_get_vec_u16 (cpu, vs, index));
- }
- else if (INSTR (18, 18))
+ if (INSTR (22, 22))
{
- index = INSTR (20, 19);
- aarch64_set_reg_u64 (cpu, rd, NO_SP,
- aarch64_get_vec_u32 (cpu, vs, index));
+ for (i = 0; i < 2; i++)
+ aarch64_set_vec_double (cpu, rd, i,
+ aarch64_get_vec_float (cpu, rn, i + 2*full));
}
else
{
- if (INSTR (30, 30) != 1)
- HALT_UNALLOC;
+ HALT_NYI;
- index = INSTR (20, 20);
- aarch64_set_reg_u64 (cpu, rd, NO_SP,
- aarch64_get_vec_u64 (cpu, vs, index));
+#if 0
+ /* TODO: Implement missing half-float support. */
+ for (i = 0; i < 4; i++)
+ aarch64_set_vec_float (cpu, rd, i,
+ aarch64_get_vec_halffloat (cpu, rn, i + 4*full));
+#endif
}
}
case 0x01: do_vec_DUP_vector_into_vector (cpu); return;
case 0x03: do_vec_DUP_scalar_into_vector (cpu); return;
case 0x07: do_vec_INS (cpu); return;
- case 0x0A: do_vec_TRN (cpu); return;
-
- case 0x0F:
- if (INSTR (17, 16) == 0)
- {
- do_vec_MOV_into_scalar (cpu);
- return;
- }
- break;
+ case 0x0B: do_vec_SMOV_into_scalar (cpu); return;
+ case 0x0F: do_vec_UMOV_into_scalar (cpu); return;
case 0x00:
case 0x08:
case 0x16:
do_vec_UZP (cpu); return;
+ case 0x0A: do_vec_TRN (cpu); return;
+
case 0x0E:
case 0x1E:
do_vec_ZIP (cpu); return;
case 0x6: do_vec_UZP (cpu); return;
case 0xE: do_vec_ZIP (cpu); return;
case 0xA: do_vec_TRN (cpu); return;
- case 0xF: do_vec_UMOV (cpu); return;
default: HALT_NYI;
}
}
case 0x08: do_vec_sub_long (cpu); return;
case 0x0a: do_vec_XTN (cpu); return;
case 0x11: do_vec_SSHL (cpu); return;
+ case 0x16: do_vec_CNT (cpu); return;
case 0x19: do_vec_max (cpu); return;
case 0x1B: do_vec_min (cpu); return;
case 0x21: do_vec_add (cpu); return;
case 0x33: do_vec_FMLA (cpu); return;
case 0x35: do_vec_fadd (cpu); return;
+ case 0x1E:
+ switch (INSTR (20, 16))
+ {
+ case 0x01: do_vec_FCVTL (cpu); return;
+ default: HALT_NYI;
+ }
+
case 0x2E:
switch (INSTR (20, 16))
{
NYI_assert (19, 19, 1);
shift = INSTR (18, 16);
- bias *= 3;
+ bias *= 4;
for (i = 0; i < 8; i++)
v[i] = aarch64_get_vec_s8 (cpu, vs, i + bias) << shift;
for (i = 0; i < 8; i++)
NYI_assert (19, 19, 1);
shift = INSTR (18, 16);
- bias *= 3;
+ bias *= 4;
for (i = 0; i < 8; i++)
v[i] = aarch64_get_vec_u8 (cpu, vs, i + bias) << shift;
for (i = 0; i < 8; i++)
case 0:
for (i = 0; i < (full ? 16 : 8); i++)
aarch64_set_vec_u8 (cpu, vd, i,
- (aarch64_get_vec_u8 (cpu, vn, i)
- * aarch64_get_vec_u8 (cpu, vm, i))
- - aarch64_get_vec_u8 (cpu, vd, i));
+ aarch64_get_vec_u8 (cpu, vd, i)
+ - (aarch64_get_vec_u8 (cpu, vn, i)
+ * aarch64_get_vec_u8 (cpu, vm, i)));
return;
case 1:
for (i = 0; i < (full ? 8 : 4); i++)
aarch64_set_vec_u16 (cpu, vd, i,
- (aarch64_get_vec_u16 (cpu, vn, i)
- * aarch64_get_vec_u16 (cpu, vm, i))
- - aarch64_get_vec_u16 (cpu, vd, i));
+ aarch64_get_vec_u16 (cpu, vd, i)
+ - (aarch64_get_vec_u16 (cpu, vn, i)
+ * aarch64_get_vec_u16 (cpu, vm, i)));
return;
case 2:
for (i = 0; i < (full ? 4 : 2); i++)
aarch64_set_vec_u32 (cpu, vd, i,
- (aarch64_get_vec_u32 (cpu, vn, i)
- * aarch64_get_vec_u32 (cpu, vm, i))
- - aarch64_get_vec_u32 (cpu, vd, i));
+ aarch64_get_vec_u32 (cpu, vd, i)
+ - (aarch64_get_vec_u32 (cpu, vn, i)
+ * aarch64_get_vec_u32 (cpu, vm, i)));
return;
default:
TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
if (INSTR (22, 22))
- aarch64_set_FP_double (cpu, sd, set ? sn : sm);
+ aarch64_set_FP_double (cpu, sd, (set ? aarch64_get_FP_double (cpu, sn)
+ : aarch64_get_FP_double (cpu, sm)));
else
- aarch64_set_FP_float (cpu, sd, set ? sn : sm);
+ aarch64_set_FP_float (cpu, sd, (set ? aarch64_get_FP_float (cpu, sn)
+ : aarch64_get_FP_float (cpu, sm)));
}
/* Store 32 bit unscaled signed 9 bit. */
unsigned int st = INSTR (4, 0);
TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
- aarch64_set_mem_u32 (cpu, aarch64_get_reg_u64 (cpu, st, 1) + offset,
- aarch64_get_vec_u32 (cpu, rn, 0));
+ aarch64_set_mem_u32 (cpu, aarch64_get_reg_u64 (cpu, rn, 1) + offset,
+ aarch64_get_vec_u32 (cpu, st, 0));
}
/* Store 64 bit unscaled signed 9 bit. */
unsigned int st = INSTR (4, 0);
TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
- aarch64_set_mem_u64 (cpu, aarch64_get_reg_u64 (cpu, st, 1) + offset,
- aarch64_get_vec_u64 (cpu, rn, 0));
+ aarch64_set_mem_u64 (cpu, aarch64_get_reg_u64 (cpu, rn, 1) + offset,
+ aarch64_get_vec_u64 (cpu, st, 0));
}
/* Store 128 bit unscaled signed 9 bit. */
FRegister a;
TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
- aarch64_get_FP_long_double (cpu, rn, & a);
+ aarch64_get_FP_long_double (cpu, st, & a);
aarch64_set_mem_long_double (cpu,
- aarch64_get_reg_u64 (cpu, st, 1)
+ aarch64_get_reg_u64 (cpu, rn, 1)
+ offset, a);
}
unsigned sd = INSTR (4, 0);
TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
- aarch64_set_FP_float (cpu, sd, sqrt (aarch64_get_FP_float (cpu, sn)));
+ aarch64_set_FP_float (cpu, sd, sqrtf (aarch64_get_FP_float (cpu, sn)));
}
/* Double square root. */
static const double DOUBLE_LONG_MAX = (double) LONG_MAX;
static const double DOUBLE_LONG_MIN = (double) LONG_MIN;
+#define UINT_MIN 0
+#define ULONG_MIN 0
+static const float FLOAT_UINT_MAX = (float) UINT_MAX;
+static const float FLOAT_UINT_MIN = (float) UINT_MIN;
+static const double DOUBLE_UINT_MAX = (double) UINT_MAX;
+static const double DOUBLE_UINT_MIN = (double) UINT_MIN;
+static const float FLOAT_ULONG_MAX = (float) ULONG_MAX;
+static const float FLOAT_ULONG_MIN = (float) ULONG_MIN;
+static const double DOUBLE_ULONG_MAX = (double) ULONG_MAX;
+static const double DOUBLE_ULONG_MIN = (double) ULONG_MIN;
+
/* Check for FP exception conditions:
NaN raises IO
Infinity raises IO
/* Do not raise an exception if we have reached ULONG_MAX. */
if (value != (1UL << 63))
- RAISE_EXCEPTIONS (d, value, DOUBLE, LONG);
+ RAISE_EXCEPTIONS (d, value, DOUBLE, ULONG);
aarch64_set_reg_u64 (cpu, rd, NO_SP, value);
}
/* Do not raise an exception if we have reached ULONG_MAX. */
if (value != (1UL << 63))
- RAISE_EXCEPTIONS (f, value, FLOAT, LONG);
+ RAISE_EXCEPTIONS (f, value, FLOAT, ULONG);
aarch64_set_reg_u64 (cpu, rd, NO_SP, value);
}
value = (uint32_t) d;
/* Do not raise an exception if we have reached UINT_MAX. */
if (value != (1UL << 31))
- RAISE_EXCEPTIONS (d, value, DOUBLE, INT);
+ RAISE_EXCEPTIONS (d, value, DOUBLE, UINT);
}
else
{
value = (uint32_t) f;
/* Do not raise an exception if we have reached UINT_MAX. */
if (value != (1UL << 31))
- RAISE_EXCEPTIONS (f, value, FLOAT, INT);
+ RAISE_EXCEPTIONS (f, value, FLOAT, UINT);
}
aarch64_set_reg_u64 (cpu, rd, NO_SP, value);
{
uint32_t flags;
+ /* FIXME: Add exception raising. */
if (isnan (fvalue1) || isnan (fvalue2))
flags = C|V;
+ else if (isinf (fvalue1) && isinf (fvalue2))
+ {
+ /* Subtracting two infinities may give a NaN, so compare the signs
+ instead; glibc's isinf returns -1 for -inf and +1 for +inf. */
+ int result = isinf (fvalue1) - isinf (fvalue2);
+
+ if (result == 0)
+ flags = Z|C;
+ else if (result < 0)
+ flags = N;
+ else /* (result > 0). */
+ flags = C;
+ }
else
{
float result = fvalue1 - fvalue2;
{
uint32_t flags;
+ /* FIXME: Add exception raising. */
if (isnan (dval1) || isnan (dval2))
flags = C|V;
+ else if (isinf (dval1) && isinf (dval2))
+ {
+ /* Subtracting two infinities may give a NaN, so compare the signs
+ instead; glibc's isinf returns -1 for -inf and +1 for +inf. */
+ int result = isinf (dval1) - isinf (dval2);
+
+ if (result == 0)
+ flags = Z|C;
+ else if (result < 0)
+ flags = N;
+ else /* (result > 0). */
+ flags = C;
+ }
else
{
double result = dval1 - dval2;
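The sign comparison above leans on glibc's sign-bearing isinf. A
standalone sketch of the resulting NZCV assignment, written with signbit
so it stays portable:

#include <assert.h>
#include <math.h>

#define N 8
#define Z 4
#define C 2
#define V 1

/* Flags for comparing two infinities, mirroring the logic above.  */
static unsigned
fcmp_inf_flags (double a, double b)
{
  int sa = signbit (a) ? -1 : 1;
  int sb = signbit (b) ? -1 : 1;

  if (sa == sb)
    return Z | C;   /* Equal infinities.  */
  if (sa < sb)
    return N;       /* -inf < +inf.  */
  return C;         /* +inf > -inf.  */
}

int
main (void)
{
  assert (fcmp_inf_flags (INFINITY, INFINITY) == (Z | C));
  assert (fcmp_inf_flags (-INFINITY, INFINITY) == N);
  assert (fcmp_inf_flags (INFINITY, -INFINITY) == C);
  return 0;
}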
aarch64_get_vec_s64 (cpu, rn, 0) >> - shift);
}
+/* Floating point scalar compare greater than or equal to 0. */
+static void
+do_scalar_FCMGE_zero (sim_cpu *cpu)
+{
+ /* instr [31,23] = 0111 1110 1
+ instr [22,22] = size
+ instr [21,16] = 1000 00
+ instr [15,10] = 1100 10
+ instr [9, 5] = Rn
+ instr [4, 0] = Rd. */
+
+ unsigned size = INSTR (22, 22);
+ unsigned rn = INSTR (9, 5);
+ unsigned rd = INSTR (4, 0);
+
+ NYI_assert (31, 23, 0x0FD);
+ NYI_assert (21, 16, 0x20);
+ NYI_assert (15, 10, 0x32);
+
+ TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
+ if (size)
+ aarch64_set_vec_u64 (cpu, rd, 0,
+ aarch64_get_vec_double (cpu, rn, 0) >= 0.0 ? -1 : 0);
+ else
+ aarch64_set_vec_u32 (cpu, rd, 0,
+ aarch64_get_vec_float (cpu, rn, 0) >= 0.0 ? -1 : 0);
+}
+
+/* Floating point scalar compare less than or equal to 0. */
+static void
+do_scalar_FCMLE_zero (sim_cpu *cpu)
+{
+ /* instr [31,23] = 0111 1110 1
+ instr [22,22] = size
+ instr [21,16] = 1000 00
+ instr [15,10] = 1101 10
+ instr [9, 5] = Rn
+ instr [4, 0] = Rd. */
+
+ unsigned size = INSTR (22, 22);
+ unsigned rn = INSTR (9, 5);
+ unsigned rd = INSTR (4, 0);
+
+ NYI_assert (31, 23, 0x0FD);
+ NYI_assert (21, 16, 0x20);
+ NYI_assert (15, 10, 0x36);
+
+ TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
+ if (size)
+ aarch64_set_vec_u64 (cpu, rd, 0,
+ aarch64_get_vec_double (cpu, rn, 0) <= 0.0 ? -1 : 0);
+ else
+ aarch64_set_vec_u32 (cpu, rd, 0,
+ aarch64_get_vec_float (cpu, rn, 0) <= 0.0 ? -1 : 0);
+}
+
+/* Floating point scalar compare greater than 0. */
+static void
+do_scalar_FCMGT_zero (sim_cpu *cpu)
+{
+ /* instr [31,23] = 0101 1110 1
+ instr [22,22] = size
+ instr [21,16] = 1000 00
+ instr [15,10] = 1100 10
+ instr [9, 5] = Rn
+ instr [4, 0] = Rd. */
+
+ unsigned size = INSTR (22, 22);
+ unsigned rn = INSTR (9, 5);
+ unsigned rd = INSTR (4, 0);
+
+ NYI_assert (31, 23, 0x0BD);
+ NYI_assert (21, 16, 0x20);
+ NYI_assert (15, 10, 0x32);
+
+ TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
+ if (size)
+ aarch64_set_vec_u64 (cpu, rd, 0,
+ aarch64_get_vec_double (cpu, rn, 0) > 0.0 ? -1 : 0);
+ else
+ aarch64_set_vec_u32 (cpu, rd, 0,
+ aarch64_get_vec_float (cpu, rn, 0) > 0.0 ? -1 : 0);
+}
+
+/* Floating point scalar compare equal to 0. */
+static void
+do_scalar_FCMEQ_zero (sim_cpu *cpu)
+{
+ /* instr [31,23] = 0101 1110 1
+ instr [22,22] = size
+ instr [21,16] = 1000 00
+ instr [15,10] = 1101 10
+ instr [9, 5] = Rn
+ instr [4, 0] = Rd. */
+
+ unsigned size = INSTR (22, 22);
+ unsigned rn = INSTR (9, 5);
+ unsigned rd = INSTR (4, 0);
+
+ NYI_assert (31, 23, 0x0BD);
+ NYI_assert (21, 16, 0x20);
+ NYI_assert (15, 10, 0x36);
+
+ TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
+ if (size)
+ aarch64_set_vec_u64 (cpu, rd, 0,
+ aarch64_get_vec_double (cpu, rn, 0) == 0.0 ? -1 : 0);
+ else
+ aarch64_set_vec_u32 (cpu, rd, 0,
+ aarch64_get_vec_float (cpu, rn, 0) == 0.0 ? -1 : 0);
+}
+
+/* Floating point scalar compare less than 0. */
+static void
+do_scalar_FCMLT_zero (sim_cpu *cpu)
+{
+ /* instr [31,23] = 0101 1110 1
+ instr [22,22] = size
+ instr [21,16] = 1000 00
+ instr [15,10] = 1110 10
+ instr [9, 5] = Rn
+ instr [4, 0] = Rd. */
+
+ unsigned size = INSTR (22, 22);
+ unsigned rn = INSTR (9, 5);
+ unsigned rd = INSTR (4, 0);
+
+ NYI_assert (31, 23, 0x0BD);
+ NYI_assert (21, 16, 0x20);
+ NYI_assert (15, 10, 0x3A);
+
+ TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
+ if (size)
+ aarch64_set_vec_u64 (cpu, rd, 0,
+ aarch64_get_vec_double (cpu, rn, 0) < 0.0 ? -1 : 0);
+ else
+ aarch64_set_vec_u32 (cpu, rd, 0,
+ aarch64_get_vec_float (cpu, rn, 0) < 0.0 ? -1 : 0);
+}
+
static void
do_scalar_shift (sim_cpu *cpu)
{
case 0x0D: do_scalar_CMGT (cpu); return;
case 0x11: do_scalar_USHL (cpu); return;
case 0x2E: do_scalar_NEG (cpu); return;
+ case 0x32: do_scalar_FCMGE_zero (cpu); return;
case 0x35: do_scalar_FABD (cpu); return;
+ case 0x36: do_scalar_FCMLE_zero (cpu); return;
case 0x39: do_scalar_FCM (cpu); return;
case 0x3B: do_scalar_FCM (cpu); return;
default:
{
case 0x21: do_double_add (cpu); return;
case 0x11: do_scalar_SSHL (cpu); return;
+ case 0x32: do_scalar_FCMGT_zero (cpu); return;
+ case 0x36: do_scalar_FCMEQ_zero (cpu); return;
+ case 0x3A: do_scalar_FCMLT_zero (cpu); return;
default:
HALT_NYI;
}
return (v + o) & 0x3F;
}
-/* Load multiple N-element structures to N consecutive registers. */
+/* Load multiple N-element structures to M consecutive registers. */
static void
-vec_load (sim_cpu *cpu, uint64_t address, unsigned N)
+vec_load (sim_cpu *cpu, uint64_t address, unsigned N, unsigned M)
{
int all = INSTR (30, 30);
unsigned size = INSTR (11, 10);
unsigned vd = INSTR (4, 0);
- unsigned i;
+ unsigned rpt = (N == M) ? 1 : M;
+ unsigned selem = N;
+ unsigned i, j, k;
switch (size)
{
case 0: /* 8-bit operations. */
- if (all)
- for (i = 0; i < (16 * N); i++)
- aarch64_set_vec_u8 (cpu, vec_reg (vd, i >> 4), i & 15,
- aarch64_get_mem_u8 (cpu, address + i));
- else
- for (i = 0; i < (8 * N); i++)
- aarch64_set_vec_u8 (cpu, vec_reg (vd, i >> 3), i & 7,
- aarch64_get_mem_u8 (cpu, address + i));
+ for (i = 0; i < rpt; i++)
+ for (j = 0; j < (8 + (8 * all)); j++)
+ for (k = 0; k < selem; k++)
+ {
+ aarch64_set_vec_u8 (cpu, vec_reg (vd, i + k), j,
+ aarch64_get_mem_u8 (cpu, address));
+ address += 1;
+ }
return;
case 1: /* 16-bit operations. */
- if (all)
- for (i = 0; i < (8 * N); i++)
- aarch64_set_vec_u16 (cpu, vec_reg (vd, i >> 3), i & 7,
- aarch64_get_mem_u16 (cpu, address + i * 2));
- else
- for (i = 0; i < (4 * N); i++)
- aarch64_set_vec_u16 (cpu, vec_reg (vd, i >> 2), i & 3,
- aarch64_get_mem_u16 (cpu, address + i * 2));
+ for (i = 0; i < rpt; i++)
+ for (j = 0; j < (4 + (4 * all)); j++)
+ for (k = 0; k < selem; k++)
+ {
+ aarch64_set_vec_u16 (cpu, vec_reg (vd, i + k), j,
+ aarch64_get_mem_u16 (cpu, address));
+ address += 2;
+ }
return;
case 2: /* 32-bit operations. */
- if (all)
- for (i = 0; i < (4 * N); i++)
- aarch64_set_vec_u32 (cpu, vec_reg (vd, i >> 2), i & 3,
- aarch64_get_mem_u32 (cpu, address + i * 4));
- else
- for (i = 0; i < (2 * N); i++)
- aarch64_set_vec_u32 (cpu, vec_reg (vd, i >> 1), i & 1,
- aarch64_get_mem_u32 (cpu, address + i * 4));
+ for (i = 0; i < rpt; i++)
+ for (j = 0; j < (2 + (2 * all)); j++)
+ for (k = 0; k < selem; k++)
+ {
+ aarch64_set_vec_u32 (cpu, vec_reg (vd, i + k), j,
+ aarch64_get_mem_u32 (cpu, address));
+ address += 4;
+ }
return;
case 3: /* 64-bit operations. */
- if (all)
- for (i = 0; i < (2 * N); i++)
- aarch64_set_vec_u64 (cpu, vec_reg (vd, i >> 1), i & 1,
- aarch64_get_mem_u64 (cpu, address + i * 8));
- else
- for (i = 0; i < N; i++)
- aarch64_set_vec_u64 (cpu, vec_reg (vd, i), 0,
- aarch64_get_mem_u64 (cpu, address + i * 8));
+ for (i = 0; i < rpt; i++)
+ for (j = 0; j < (1 + all); j++)
+ for (k = 0; k < selem; k++)
+ {
+ aarch64_set_vec_u64 (cpu, vec_reg (vd, i + k), j,
+ aarch64_get_mem_u64 (cpu, address));
+ address += 8;
+ }
return;
}
}
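With rpt/selem factored this way, LD2 (N == M == 2) makes a single pass
that de-interleaves element pairs across two registers, while LD1 with
two registers (N == 1, M == 2) fills the registers back to back. A
sketch of the byte-to-register mapping, assuming 8-bit elements and a
64-bit (half) vector:

#include <assert.h>

/* Mirror the rpt/selem loop structure above for size==0, all==0,
   recording which register each consecutive memory byte lands in.  */
static void
map_bytes (unsigned N, unsigned M, int reg_of_byte[16])
{
  unsigned rpt = (N == M) ? 1 : M;
  unsigned selem = N, i, j, k, addr = 0;

  for (i = 0; i < rpt; i++)
    for (j = 0; j < 8; j++)
      for (k = 0; k < selem; k++)
        reg_of_byte[addr++] = i + k;
}

int
main (void)
{
  int map[16];

  map_bytes (2, 2, map);   /* LD2: de-interleave.  */
  assert (map[0] == 0 && map[1] == 1 && map[2] == 0 && map[3] == 1);

  map_bytes (1, 2, map);   /* LD1 {Vd, Vd+1}: consecutive.  */
  assert (map[0] == 0 && map[7] == 0 && map[8] == 1 && map[15] == 1);
  return 0;
}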
-/* LD4: load multiple 4-element to four consecutive registers. */
+/* Load multiple 4-element structures into four consecutive registers. */
static void
LD4 (sim_cpu *cpu, uint64_t address)
{
- vec_load (cpu, address, 4);
+ vec_load (cpu, address, 4, 4);
}
-/* LD3: load multiple 3-element structures to three consecutive registers. */
+/* Load multiple 3-element structures into three consecutive registers. */
static void
LD3 (sim_cpu *cpu, uint64_t address)
{
- vec_load (cpu, address, 3);
+ vec_load (cpu, address, 3, 3);
}
-/* LD2: load multiple 2-element structures to two consecutive registers. */
+/* Load multiple 2-element structures into two consecutive registers. */
static void
LD2 (sim_cpu *cpu, uint64_t address)
{
- vec_load (cpu, address, 2);
+ vec_load (cpu, address, 2, 2);
}
/* Load multiple 1-element structures into one register. */
static void
LD1_1 (sim_cpu *cpu, uint64_t address)
{
- int all = INSTR (30, 30);
- unsigned size = INSTR (11, 10);
- unsigned vd = INSTR (4, 0);
- unsigned i;
-
- switch (size)
- {
- case 0:
- /* LD1 {Vd.16b}, addr, #16 */
- /* LD1 {Vd.8b}, addr, #8 */
- for (i = 0; i < (all ? 16 : 8); i++)
- aarch64_set_vec_u8 (cpu, vd, i,
- aarch64_get_mem_u8 (cpu, address + i));
- return;
-
- case 1:
- /* LD1 {Vd.8h}, addr, #16 */
- /* LD1 {Vd.4h}, addr, #8 */
- for (i = 0; i < (all ? 8 : 4); i++)
- aarch64_set_vec_u16 (cpu, vd, i,
- aarch64_get_mem_u16 (cpu, address + i * 2));
- return;
-
- case 2:
- /* LD1 {Vd.4s}, addr, #16 */
- /* LD1 {Vd.2s}, addr, #8 */
- for (i = 0; i < (all ? 4 : 2); i++)
- aarch64_set_vec_u32 (cpu, vd, i,
- aarch64_get_mem_u32 (cpu, address + i * 4));
- return;
-
- case 3:
- /* LD1 {Vd.2d}, addr, #16 */
- /* LD1 {Vd.1d}, addr, #8 */
- for (i = 0; i < (all ? 2 : 1); i++)
- aarch64_set_vec_u64 (cpu, vd, i,
- aarch64_get_mem_u64 (cpu, address + i * 8));
- return;
- }
+ vec_load (cpu, address, 1, 1);
}
/* Load multiple 1-element structures into two registers. */
static void
LD1_2 (sim_cpu *cpu, uint64_t address)
{
- /* FIXME: This algorithm is *exactly* the same as the LD2 version.
- So why have two different instructions ? There must be something
- wrong somewhere. */
- vec_load (cpu, address, 2);
+ vec_load (cpu, address, 1, 2);
}
/* Load multiple 1-element structures into three registers. */
static void
LD1_3 (sim_cpu *cpu, uint64_t address)
{
- /* FIXME: This algorithm is *exactly* the same as the LD3 version.
- So why have two different instructions ? There must be something
- wrong somewhere. */
- vec_load (cpu, address, 3);
+ vec_load (cpu, address, 1, 3);
}
/* Load multiple 1-element structures into four registers. */
static void
LD1_4 (sim_cpu *cpu, uint64_t address)
{
- /* FIXME: This algorithm is *exactly* the same as the LD4 version.
- So why have two different instructions ? There must be something
- wrong somewhere. */
- vec_load (cpu, address, 4);
+ vec_load (cpu, address, 1, 4);
}
-/* Store multiple N-element structures to N consecutive registers. */
+/* Store multiple N-element structures from M consecutive registers. */
static void
-vec_store (sim_cpu *cpu, uint64_t address, unsigned N)
+vec_store (sim_cpu *cpu, uint64_t address, unsigned N, unsigned M)
{
int all = INSTR (30, 30);
unsigned size = INSTR (11, 10);
unsigned vd = INSTR (4, 0);
- unsigned i;
+ unsigned rpt = (N == M) ? 1 : M;
+ unsigned selem = N;
+ unsigned i, j, k;
switch (size)
{
case 0: /* 8-bit operations. */
- if (all)
- for (i = 0; i < (16 * N); i++)
- aarch64_set_mem_u8
- (cpu, address + i,
- aarch64_get_vec_u8 (cpu, vec_reg (vd, i >> 4), i & 15));
- else
- for (i = 0; i < (8 * N); i++)
- aarch64_set_mem_u8
- (cpu, address + i,
- aarch64_get_vec_u8 (cpu, vec_reg (vd, i >> 3), i & 7));
+ for (i = 0; i < rpt; i++)
+ for (j = 0; j < (8 + (8 * all)); j++)
+ for (k = 0; k < selem; k++)
+ {
+ aarch64_set_mem_u8
+ (cpu, address,
+ aarch64_get_vec_u8 (cpu, vec_reg (vd, i + k), j));
+ address += 1;
+ }
return;
case 1: /* 16-bit operations. */
- if (all)
- for (i = 0; i < (8 * N); i++)
- aarch64_set_mem_u16
- (cpu, address + i * 2,
- aarch64_get_vec_u16 (cpu, vec_reg (vd, i >> 3), i & 7));
- else
- for (i = 0; i < (4 * N); i++)
- aarch64_set_mem_u16
- (cpu, address + i * 2,
- aarch64_get_vec_u16 (cpu, vec_reg (vd, i >> 2), i & 3));
+ for (i = 0; i < rpt; i++)
+ for (j = 0; j < (4 + (4 * all)); j++)
+ for (k = 0; k < selem; k++)
+ {
+ aarch64_set_mem_u16
+ (cpu, address,
+ aarch64_get_vec_u16 (cpu, vec_reg (vd, i + k), j));
+ address += 2;
+ }
return;
case 2: /* 32-bit operations. */
- if (all)
- for (i = 0; i < (4 * N); i++)
- aarch64_set_mem_u32
- (cpu, address + i * 4,
- aarch64_get_vec_u32 (cpu, vec_reg (vd, i >> 2), i & 3));
- else
- for (i = 0; i < (2 * N); i++)
- aarch64_set_mem_u32
- (cpu, address + i * 4,
- aarch64_get_vec_u32 (cpu, vec_reg (vd, i >> 1), i & 1));
+ for (i = 0; i < rpt; i++)
+ for (j = 0; j < (2 + (2 * all)); j++)
+ for (k = 0; k < selem; k++)
+ {
+ aarch64_set_mem_u32
+ (cpu, address,
+ aarch64_get_vec_u32 (cpu, vec_reg (vd, i + k), j));
+ address += 4;
+ }
return;
case 3: /* 64-bit operations. */
- if (all)
- for (i = 0; i < (2 * N); i++)
- aarch64_set_mem_u64
- (cpu, address + i * 8,
- aarch64_get_vec_u64 (cpu, vec_reg (vd, i >> 1), i & 1));
- else
- for (i = 0; i < N; i++)
- aarch64_set_mem_u64
- (cpu, address + i * 8,
- aarch64_get_vec_u64 (cpu, vec_reg (vd, i), 0));
+ for (i = 0; i < rpt; i++)
+ for (j = 0; j < (1 + all); j++)
+ for (k = 0; k < selem; k++)
+ {
+ aarch64_set_mem_u64
+ (cpu, address,
+ aarch64_get_vec_u64 (cpu, vec_reg (vd, i + k), j));
+ address += 8;
+ }
return;
}
}
-/* Store multiple 4-element structure to four consecutive registers. */
+/* Store multiple 4-element structures from four consecutive registers. */
static void
ST4 (sim_cpu *cpu, uint64_t address)
{
- vec_store (cpu, address, 4);
+ vec_store (cpu, address, 4, 4);
}
-/* Store multiple 3-element structures to three consecutive registers. */
+/* Store multiple 3-element structures from three consecutive registers. */
static void
ST3 (sim_cpu *cpu, uint64_t address)
{
- vec_store (cpu, address, 3);
+ vec_store (cpu, address, 3, 3);
}
-/* Store multiple 2-element structures to two consecutive registers. */
+/* Store multiple 2-element structures from two consecutive registers. */
static void
ST2 (sim_cpu *cpu, uint64_t address)
{
- vec_store (cpu, address, 2);
+ vec_store (cpu, address, 2, 2);
}
-/* Store multiple 1-element structures into one register. */
+/* Store multiple 1-element structures from one register. */
static void
ST1_1 (sim_cpu *cpu, uint64_t address)
{
- int all = INSTR (30, 30);
- unsigned size = INSTR (11, 10);
- unsigned vd = INSTR (4, 0);
- unsigned i;
-
- switch (size)
- {
- case 0:
- for (i = 0; i < (all ? 16 : 8); i++)
- aarch64_set_mem_u8 (cpu, address + i,
- aarch64_get_vec_u8 (cpu, vd, i));
- return;
-
- case 1:
- for (i = 0; i < (all ? 8 : 4); i++)
- aarch64_set_mem_u16 (cpu, address + i * 2,
- aarch64_get_vec_u16 (cpu, vd, i));
- return;
-
- case 2:
- for (i = 0; i < (all ? 4 : 2); i++)
- aarch64_set_mem_u32 (cpu, address + i * 4,
- aarch64_get_vec_u32 (cpu, vd, i));
- return;
-
- case 3:
- for (i = 0; i < (all ? 2 : 1); i++)
- aarch64_set_mem_u64 (cpu, address + i * 8,
- aarch64_get_vec_u64 (cpu, vd, i));
- return;
- }
+ vec_store (cpu, address, 1, 1);
}
-/* Store multiple 1-element structures into two registers. */
+/* Store multiple 1-element structures from two registers. */
static void
ST1_2 (sim_cpu *cpu, uint64_t address)
{
- /* FIXME: This algorithm is *exactly* the same as the ST2 version.
- So why have two different instructions ? There must be
- something wrong somewhere. */
- vec_store (cpu, address, 2);
+ vec_store (cpu, address, 1, 2);
}
-/* Store multiple 1-element structures into three registers. */
+/* Store multiple 1-element structures from three registers. */
static void
ST1_3 (sim_cpu *cpu, uint64_t address)
{
- /* FIXME: This algorithm is *exactly* the same as the ST3 version.
- So why have two different instructions ? There must be
- something wrong somewhere. */
- vec_store (cpu, address, 3);
+ vec_store (cpu, address, 1, 3);
}
-/* Store multiple 1-element structures into four registers. */
+/* Store multiple 1-element structures from four registers. */
static void
ST1_4 (sim_cpu *cpu, uint64_t address)
{
- /* FIXME: This algorithm is *exactly* the same as the ST4 version.
- So why have two different instructions ? There must be
- something wrong somewhere. */
- vec_store (cpu, address, 4);
-}
+ vec_store (cpu, address, 1, 4);
+}
+
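+/* Shared by do_vec_LDn_single and do_vec_STn_single below: decode the
+   opcode<15:14>, S and size fields into the lane number, then reuse
+   SIZE as the element-size selector for the transfer switch. */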
+#define LDn_STn_SINGLE_LANE_AND_SIZE() \
+ do \
+ { \
+ switch (INSTR (15, 14)) \
+ { \
+ case 0: \
+ lane = (full << 3) | (s << 2) | size; \
+ size = 0; \
+ break; \
+ \
+ case 1: \
+ if ((size & 1) == 1) \
+ HALT_UNALLOC; \
+ lane = (full << 2) | (s << 1) | (size >> 1); \
+ size = 1; \
+ break; \
+ \
+ case 2: \
+ if ((size & 2) == 2) \
+ HALT_UNALLOC; \
+ \
+ if ((size & 1) == 0) \
+ { \
+ lane = (full << 1) | s; \
+ size = 2; \
+ } \
+ else \
+ { \
+ if (s) \
+ HALT_UNALLOC; \
+ lane = full; \
+ size = 3; \
+ } \
+ break; \
+ \
+ default: \
+ HALT_UNALLOC; \
+ } \
+ } \
+ while (0)
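The macro folds the Q:S:size fields into a single lane number, per the
architectural LD/ST single-structure encoding. The same decode as a
standalone function, checked against a few encodings:

#include <assert.h>

/* Decode opcode<15:14>, Q, S and size into an element size and lane;
   returns 0 on success, -1 for an unallocated encoding.  */
static int
decode_lane (unsigned opc, unsigned q, unsigned s, unsigned size,
             unsigned *esize, unsigned *lane)
{
  switch (opc)
    {
    case 0:                       /* 8-bit element.  */
      *lane = (q << 3) | (s << 2) | size;
      *esize = 0;
      return 0;
    case 1:                       /* 16-bit element.  */
      if (size & 1)
        return -1;
      *lane = (q << 2) | (s << 1) | (size >> 1);
      *esize = 1;
      return 0;
    case 2:
      if (size & 2)
        return -1;
      if ((size & 1) == 0)        /* 32-bit element.  */
        {
          *lane = (q << 1) | s;
          *esize = 2;
          return 0;
        }
      if (s)                      /* 64-bit element.  */
        return -1;
      *lane = q;
      *esize = 3;
      return 0;
    default:
      return -1;
    }
}

int
main (void)
{
  unsigned esize, lane;
  assert (decode_lane (0, 1, 1, 3, &esize, &lane) == 0
          && esize == 0 && lane == 15);            /* B[15].  */
  assert (decode_lane (2, 1, 0, 1, &esize, &lane) == 0
          && esize == 3 && lane == 1);             /* D[1].  */
  assert (decode_lane (1, 0, 0, 1, &esize, &lane) == -1);  /* Unallocated.  */
  return 0;
}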
+/* Load single structure into one lane of N registers. */
static void
-do_vec_LDnR (sim_cpu *cpu, uint64_t address)
+do_vec_LDn_single (sim_cpu *cpu, uint64_t address)
{
/* instr[31] = 0
instr[30] = element selector 0=>half, 1=>all elements
instr[29,24] = 00 1101
instr[23] = 0=>simple, 1=>post
instr[22] = 1
- instr[21] = width: LD1R-or-LD3R (0) / LD2R-or-LD4R (1)
+ instr[21] = width: LD1-or-LD3 (0) / LD2-or-LD4 (1)
instr[20,16] = 0 0000 (simple), Vinc (reg-post-inc, no SP),
11111 (immediate post inc)
- instr[15,14] = 11
- instr[13] = width: LD1R-or-LD2R (0) / LD3R-or-LD4R (1)
- instr[12] = 0
- instr[11,10] = element size 00=> byte(b), 01=> half(h),
- 10=> word(s), 11=> double(d)
+ instr[15,13] = opcode
+ instr[12] = S, used for lane number
+ instr[11,10] = size, also used for lane number
instr[9,5] = address
instr[4,0] = Vd */
unsigned full = INSTR (30, 30);
unsigned vd = INSTR (4, 0);
unsigned size = INSTR (11, 10);
+ unsigned s = INSTR (12, 12);
+ int nregs = ((INSTR (13, 13) << 1) | INSTR (21, 21)) + 1;
+ int lane = 0;
int i;
NYI_assert (29, 24, 0x0D);
NYI_assert (22, 22, 1);
- NYI_assert (15, 14, 3);
- NYI_assert (12, 12, 0);
-
- switch ((INSTR (13, 13) << 1) | INSTR (21, 21))
- {
- case 0: /* LD1R. */
- switch (size)
- {
- case 0:
- {
- uint8_t val = aarch64_get_mem_u8 (cpu, address);
- for (i = 0; i < (full ? 16 : 8); i++)
- aarch64_set_vec_u8 (cpu, vd, i, val);
- break;
- }
-
- case 1:
- {
- uint16_t val = aarch64_get_mem_u16 (cpu, address);
- for (i = 0; i < (full ? 8 : 4); i++)
- aarch64_set_vec_u16 (cpu, vd, i, val);
- break;
- }
- case 2:
- {
- uint32_t val = aarch64_get_mem_u32 (cpu, address);
- for (i = 0; i < (full ? 4 : 2); i++)
- aarch64_set_vec_u32 (cpu, vd, i, val);
- break;
- }
-
- case 3:
- {
- uint64_t val = aarch64_get_mem_u64 (cpu, address);
- for (i = 0; i < (full ? 2 : 1); i++)
- aarch64_set_vec_u64 (cpu, vd, i, val);
- break;
- }
+ /* Compute the lane number first (using size), and then compute size. */
+ LDn_STn_SINGLE_LANE_AND_SIZE ();
- default:
- HALT_UNALLOC;
+ for (i = 0; i < nregs; i++)
+ switch (size)
+ {
+ case 0:
+ {
+ uint8_t val = aarch64_get_mem_u8 (cpu, address + i);
+ aarch64_set_vec_u8 (cpu, vd + i, lane, val);
+ break;
}
- break;
- case 1: /* LD2R. */
- switch (size)
+ case 1:
{
- case 0:
- {
- uint8_t val1 = aarch64_get_mem_u8 (cpu, address);
- uint8_t val2 = aarch64_get_mem_u8 (cpu, address + 1);
-
- for (i = 0; i < (full ? 16 : 8); i++)
- {
- aarch64_set_vec_u8 (cpu, vd, 0, val1);
- aarch64_set_vec_u8 (cpu, vd + 1, 0, val2);
- }
- break;
- }
+ uint16_t val = aarch64_get_mem_u16 (cpu, address + (i * 2));
+ aarch64_set_vec_u16 (cpu, vd + i, lane, val);
+ break;
+ }
- case 1:
- {
- uint16_t val1 = aarch64_get_mem_u16 (cpu, address);
- uint16_t val2 = aarch64_get_mem_u16 (cpu, address + 2);
+ case 2:
+ {
+ uint32_t val = aarch64_get_mem_u32 (cpu, address + (i * 4));
+ aarch64_set_vec_u32 (cpu, vd + i, lane, val);
+ break;
+ }
- for (i = 0; i < (full ? 8 : 4); i++)
- {
- aarch64_set_vec_u16 (cpu, vd, 0, val1);
- aarch64_set_vec_u16 (cpu, vd + 1, 0, val2);
- }
- break;
- }
+ case 3:
+ {
+ uint64_t val = aarch64_get_mem_u64 (cpu, address + (i * 8));
+ aarch64_set_vec_u64 (cpu, vd + i, lane, val);
+ break;
+ }
+ }
+}
- case 2:
- {
- uint32_t val1 = aarch64_get_mem_u32 (cpu, address);
- uint32_t val2 = aarch64_get_mem_u32 (cpu, address + 4);
+/* Store single structure from one lane of N registers. */
+static void
+do_vec_STn_single (sim_cpu *cpu, uint64_t address)
+{
+ /* instr[31] = 0
+ instr[30] = element selector 0=>half, 1=>all elements
+ instr[29,24] = 00 1101
+ instr[23] = 0=>simple, 1=>post
+ instr[22] = 0
+ instr[21] = width: LD1-or-LD3 (0) / LD2-or-LD4 (1)
+ instr[20,16] = 0 0000 (simple), Vinc (reg-post-inc, no SP),
+ 11111 (immediate post inc)
+ instr[15,13] = opcode
+ instr[12] = S, used for lane number
+ instr[11,10] = size, also used for lane number
+ instr[9,5] = address
+ instr[4,0] = Vd */
- for (i = 0; i < (full ? 4 : 2); i++)
- {
- aarch64_set_vec_u32 (cpu, vd, 0, val1);
- aarch64_set_vec_u32 (cpu, vd + 1, 0, val2);
- }
- break;
- }
+ unsigned full = INSTR (30, 30);
+ unsigned vd = INSTR (4, 0);
+ unsigned size = INSTR (11, 10);
+ unsigned s = INSTR (12, 12);
+ int nregs = ((INSTR (13, 13) << 1) | INSTR (21, 21)) + 1;
+ int lane = 0;
+ int i;
- case 3:
- {
- uint64_t val1 = aarch64_get_mem_u64 (cpu, address);
- uint64_t val2 = aarch64_get_mem_u64 (cpu, address + 8);
+ NYI_assert (29, 24, 0x0D);
+ NYI_assert (22, 22, 0);
- for (i = 0; i < (full ? 2 : 1); i++)
- {
- aarch64_set_vec_u64 (cpu, vd, 0, val1);
- aarch64_set_vec_u64 (cpu, vd + 1, 0, val2);
- }
- break;
- }
+ /* Compute the lane number first (using size), and then compute size. */
+ LDn_STn_SINGLE_LANE_AND_SIZE ();
- default:
- HALT_UNALLOC;
+ for (i = 0; i < nregs; i++)
+ switch (size)
+ {
+ case 0:
+ {
+ uint8_t val = aarch64_get_vec_u8 (cpu, vd + i, lane);
+ aarch64_set_mem_u8 (cpu, address + i, val);
+ break;
}
- break;
- case 2: /* LD3R. */
- switch (size)
+ case 1:
{
- case 0:
- {
- uint8_t val1 = aarch64_get_mem_u8 (cpu, address);
- uint8_t val2 = aarch64_get_mem_u8 (cpu, address + 1);
- uint8_t val3 = aarch64_get_mem_u8 (cpu, address + 2);
-
- for (i = 0; i < (full ? 16 : 8); i++)
- {
- aarch64_set_vec_u8 (cpu, vd, 0, val1);
- aarch64_set_vec_u8 (cpu, vd + 1, 0, val2);
- aarch64_set_vec_u8 (cpu, vd + 2, 0, val3);
- }
- }
+ uint16_t val = aarch64_get_vec_u16 (cpu, vd + i, lane);
+ aarch64_set_mem_u16 (cpu, address + (i * 2), val);
break;
+ }
- case 1:
- {
- uint32_t val1 = aarch64_get_mem_u16 (cpu, address);
- uint32_t val2 = aarch64_get_mem_u16 (cpu, address + 2);
- uint32_t val3 = aarch64_get_mem_u16 (cpu, address + 4);
-
- for (i = 0; i < (full ? 8 : 4); i++)
- {
- aarch64_set_vec_u16 (cpu, vd, 0, val1);
- aarch64_set_vec_u16 (cpu, vd + 1, 0, val2);
- aarch64_set_vec_u16 (cpu, vd + 2, 0, val3);
- }
- }
+ case 2:
+ {
+ uint32_t val = aarch64_get_vec_u32 (cpu, vd + i, lane);
+ aarch64_set_mem_u32 (cpu, address + (i * 4), val);
break;
+ }
- case 2:
- {
- uint32_t val1 = aarch64_get_mem_u32 (cpu, address);
- uint32_t val2 = aarch64_get_mem_u32 (cpu, address + 4);
- uint32_t val3 = aarch64_get_mem_u32 (cpu, address + 8);
-
- for (i = 0; i < (full ? 4 : 2); i++)
- {
- aarch64_set_vec_u32 (cpu, vd, 0, val1);
- aarch64_set_vec_u32 (cpu, vd + 1, 0, val2);
- aarch64_set_vec_u32 (cpu, vd + 2, 0, val3);
- }
- }
+ case 3:
+ {
+ uint64_t val = aarch64_get_vec_u64 (cpu, vd + i, lane);
+ aarch64_set_mem_u64 (cpu, address + (i * 8), val);
break;
+ }
+ }
+}
- case 3:
- {
- uint64_t val1 = aarch64_get_mem_u64 (cpu, address);
- uint64_t val2 = aarch64_get_mem_u64 (cpu, address + 8);
- uint64_t val3 = aarch64_get_mem_u64 (cpu, address + 16);
+/* Load single structure into all lanes of N registers. */
+static void
+do_vec_LDnR (sim_cpu *cpu, uint64_t address)
+{
+ /* instr[31] = 0
+ instr[30] = element selector 0=>half, 1=>all elements
+ instr[29,24] = 00 1101
+ instr[23] = 0=>simple, 1=>post
+ instr[22] = 1
+ instr[21] = width: LD1R-or-LD3R (0) / LD2R-or-LD4R (1)
+ instr[20,16] = 0 0000 (simple), Vinc (reg-post-inc, no SP),
+ 11111 (immediate post inc)
+ instr[15,14] = 11
+ instr[13] = width: LD1R-or-LD2R (0) / LD3R-or-LD4R (1)
+ instr[12] = 0
+ instr[11,10] = element size 00=> byte(b), 01=> half(h),
+ 10=> word(s), 11=> double(d)
+ instr[9,5] = address
+ instr[4,0] = Vd */
- for (i = 0; i < (full ? 2 : 1); i++)
- {
- aarch64_set_vec_u64 (cpu, vd, 0, val1);
- aarch64_set_vec_u64 (cpu, vd + 1, 0, val2);
- aarch64_set_vec_u64 (cpu, vd + 2, 0, val3);
- }
- }
- break;
+ unsigned full = INSTR (30, 30);
+ unsigned vd = INSTR (4, 0);
+ unsigned size = INSTR (11, 10);
+ int nregs = ((INSTR (13, 13) << 1) | INSTR (21, 21)) + 1;
+ int i, n;
- default:
- HALT_UNALLOC;
- }
- break;
+ NYI_assert (29, 24, 0x0D);
+ NYI_assert (22, 22, 1);
+ NYI_assert (15, 14, 3);
+ NYI_assert (12, 12, 0);
- case 3: /* LD4R. */
- switch (size)
+ for (n = 0; n < nregs; n++)
+ switch (size)
+ {
+ case 0:
{
- case 0:
- {
- uint8_t val1 = aarch64_get_mem_u8 (cpu, address);
- uint8_t val2 = aarch64_get_mem_u8 (cpu, address + 1);
- uint8_t val3 = aarch64_get_mem_u8 (cpu, address + 2);
- uint8_t val4 = aarch64_get_mem_u8 (cpu, address + 3);
-
- for (i = 0; i < (full ? 16 : 8); i++)
- {
- aarch64_set_vec_u8 (cpu, vd, 0, val1);
- aarch64_set_vec_u8 (cpu, vd + 1, 0, val2);
- aarch64_set_vec_u8 (cpu, vd + 2, 0, val3);
- aarch64_set_vec_u8 (cpu, vd + 3, 0, val4);
- }
- }
+ uint8_t val = aarch64_get_mem_u8 (cpu, address + n);
+ for (i = 0; i < (full ? 16 : 8); i++)
+ aarch64_set_vec_u8 (cpu, vd + n, i, val);
break;
+ }
- case 1:
- {
- uint32_t val1 = aarch64_get_mem_u16 (cpu, address);
- uint32_t val2 = aarch64_get_mem_u16 (cpu, address + 2);
- uint32_t val3 = aarch64_get_mem_u16 (cpu, address + 4);
- uint32_t val4 = aarch64_get_mem_u16 (cpu, address + 6);
-
- for (i = 0; i < (full ? 8 : 4); i++)
- {
- aarch64_set_vec_u16 (cpu, vd, 0, val1);
- aarch64_set_vec_u16 (cpu, vd + 1, 0, val2);
- aarch64_set_vec_u16 (cpu, vd + 2, 0, val3);
- aarch64_set_vec_u16 (cpu, vd + 3, 0, val4);
- }
- }
+ case 1:
+ {
+ uint16_t val = aarch64_get_mem_u16 (cpu, address + (n * 2));
+ for (i = 0; i < (full ? 8 : 4); i++)
+ aarch64_set_vec_u16 (cpu, vd + n, i, val);
break;
+ }
- case 2:
- {
- uint32_t val1 = aarch64_get_mem_u32 (cpu, address);
- uint32_t val2 = aarch64_get_mem_u32 (cpu, address + 4);
- uint32_t val3 = aarch64_get_mem_u32 (cpu, address + 8);
- uint32_t val4 = aarch64_get_mem_u32 (cpu, address + 12);
-
- for (i = 0; i < (full ? 4 : 2); i++)
- {
- aarch64_set_vec_u32 (cpu, vd, 0, val1);
- aarch64_set_vec_u32 (cpu, vd + 1, 0, val2);
- aarch64_set_vec_u32 (cpu, vd + 2, 0, val3);
- aarch64_set_vec_u32 (cpu, vd + 3, 0, val4);
- }
- }
+ case 2:
+ {
+ uint32_t val = aarch64_get_mem_u32 (cpu, address + (n * 4));
+ for (i = 0; i < (full ? 4 : 2); i++)
+ aarch64_set_vec_u32 (cpu, vd + n, i, val);
break;
+ }
- case 3:
- {
- uint64_t val1 = aarch64_get_mem_u64 (cpu, address);
- uint64_t val2 = aarch64_get_mem_u64 (cpu, address + 8);
- uint64_t val3 = aarch64_get_mem_u64 (cpu, address + 16);
- uint64_t val4 = aarch64_get_mem_u64 (cpu, address + 24);
-
- for (i = 0; i < (full ? 2 : 1); i++)
- {
- aarch64_set_vec_u64 (cpu, vd, 0, val1);
- aarch64_set_vec_u64 (cpu, vd + 1, 0, val2);
- aarch64_set_vec_u64 (cpu, vd + 2, 0, val3);
- aarch64_set_vec_u64 (cpu, vd + 3, 0, val4);
- }
- }
+ case 3:
+ {
+ uint64_t val = aarch64_get_mem_u64 (cpu, address + (n * 8));
+ for (i = 0; i < (full ? 2 : 1); i++)
+ aarch64_set_vec_u64 (cpu, vd + n, i, val);
break;
-
- default:
- HALT_UNALLOC;
}
- break;
- default:
- HALT_UNALLOC;
- }
+ default:
+ HALT_UNALLOC;
+ }
}
static void
instr[31] = 0
instr[30] = element selector 0=>half, 1=>all elements
instr[29,25] = 00110
- instr[24] = ?
+ instr[24] = 0=>multiple struct, 1=>single struct
instr[23] = 0=>simple, 1=>post
instr[22] = 0=>store, 1=>load
instr[21] = 0 (LDn) / small(0)-large(1) selector (LDnR)
instr[9,5] = Vn, can be SP
instr[4,0] = Vd */
+ int single;
int post;
int load;
unsigned vn;
if (INSTR (31, 31) != 0 || INSTR (29, 25) != 0x06)
HALT_NYI;
- type = INSTR (15, 12);
- if (type != 0xE && type != 0xE && INSTR (21, 21) != 0)
- HALT_NYI;
-
+ single = INSTR (24, 24);
post = INSTR (23, 23);
load = INSTR (22, 22);
+ type = INSTR (15, 12);
vn = INSTR (9, 5);
address = aarch64_get_reg_u64 (cpu, vn, SP_OK);
+ if (! single && INSTR (21, 21) != 0)
+ HALT_UNALLOC;
+
if (post)
{
unsigned vm = INSTR (20, 16);
{
unsigned sizeof_operation;
- switch (type)
+ if (single)
{
- case 0: sizeof_operation = 32; break;
- case 4: sizeof_operation = 24; break;
- case 8: sizeof_operation = 16; break;
-
- case 0xC:
- sizeof_operation = INSTR (21, 21) ? 2 : 1;
- sizeof_operation <<= INSTR (11, 10);
- break;
+ if ((type >= 0) && (type <= 11))
+ {
+ int nregs = ((INSTR (13, 13) << 1) | INSTR (21, 21)) + 1;
+ switch (INSTR (15, 14))
+ {
+ case 0:
+ sizeof_operation = nregs * 1;
+ break;
+ case 1:
+ sizeof_operation = nregs * 2;
+ break;
+ case 2:
+ if (INSTR (10, 10) == 0)
+ sizeof_operation = nregs * 4;
+ else
+ sizeof_operation = nregs * 8;
+ break;
+ default:
+ HALT_UNALLOC;
+ }
+ }
+ else if (type == 0xC)
+ {
+ sizeof_operation = INSTR (21, 21) ? 2 : 1;
+ sizeof_operation <<= INSTR (11, 10);
+ }
+ else if (type == 0xE)
+ {
+ sizeof_operation = INSTR (21, 21) ? 4 : 3;
+ sizeof_operation <<= INSTR (11, 10);
+ }
+ else
+ HALT_UNALLOC;
+ }
+ else
+ {
+ switch (type)
+ {
+ case 0: sizeof_operation = 32; break;
+ case 4: sizeof_operation = 24; break;
+ case 8: sizeof_operation = 16; break;
- case 0xE:
- sizeof_operation = INSTR (21, 21) ? 8 : 4;
- sizeof_operation <<= INSTR (11, 10);
- break;
+ case 7:
+ /* One register, immediate offset variant. */
+ sizeof_operation = 8;
+ break;
- case 7:
- /* One register, immediate offset variant. */
- sizeof_operation = 8;
- break;
+ case 10:
+ /* Two registers, immediate offset variant. */
+ sizeof_operation = 16;
+ break;
- case 10:
- /* Two registers, immediate offset variant. */
- sizeof_operation = 16;
- break;
+ case 6:
+ /* Three registers, immediate offset variant. */
+ sizeof_operation = 24;
+ break;
- case 6:
- /* Three registers, immediate offset variant. */
- sizeof_operation = 24;
- break;
+ case 2:
+ /* Four registers, immediate offset variant. */
+ sizeof_operation = 32;
+ break;
- case 2:
- /* Four registers, immediate offset variant. */
- sizeof_operation = 32;
- break;
+ default:
+ HALT_UNALLOC;
+ }
- default:
- HALT_UNALLOC;
+ if (INSTR (30, 30))
+ sizeof_operation *= 2;
}
- if (INSTR (30, 30))
- sizeof_operation *= 2;
-
aarch64_set_reg_u64 (cpu, vn, SP_OK, address + sizeof_operation);
}
else
NYI_assert (20, 16, 0);
}
+ if (single)
+ {
+ if (load)
+ {
+ if ((type >= 0) && (type <= 11))
+ do_vec_LDn_single (cpu, address);
+ else if ((type == 0xC) || (type == 0xE))
+ do_vec_LDnR (cpu, address);
+ else
+ HALT_UNALLOC;
+ return;
+ }
+
+ /* Stores. */
+ if ((type >= 0) && (type <= 11))
+ {
+ do_vec_STn_single (cpu, address);
+ return;
+ }
+
+ HALT_UNALLOC;
+ }
+
if (load)
{
switch (type)
case 10: LD1_2 (cpu, address); return;
case 7: LD1_1 (cpu, address); return;
- case 0xE:
- case 0xC: do_vec_LDnR (cpu, address); return;
-
default:
- HALT_NYI;
+ HALT_UNALLOC;
}
}
case 10: ST1_2 (cpu, address); return;
case 7: ST1_1 (cpu, address); return;
default:
- HALT_NYI;
+ HALT_UNALLOC;
}
}
/* Drop lowest 32 bits of middle cross-product. */
result = resultmid1 >> 32;
+ /* Move carry bit to just above middle cross-product highest bit. */
+ carry = carry << 32;
/* Add top cross-product plus any carry. */
result += xproducthi + carry;
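Schoolbook 64x64->128 multiplication sums two middle cross-products, and
that sum can itself carry out of 64 bits; the shift above weights the
carry correctly before it is added to the high half. A standalone
mul64hi sketch with the same carry handling (names are illustrative):

#include <assert.h>
#include <stdint.h>

/* High 64 bits of an unsigned 64x64 multiply.  */
static uint64_t
mul64hi (uint64_t a, uint64_t b)
{
  uint64_t alo = a & 0xFFFFFFFFULL, ahi = a >> 32;
  uint64_t blo = b & 0xFFFFFFFFULL, bhi = b >> 32;

  uint64_t pll = alo * blo;
  uint64_t plh = alo * bhi;
  uint64_t phl = ahi * blo;
  uint64_t phh = ahi * bhi;

  /* Sum the middle cross-products; each wrap is worth 2^32 in the
     high half, hence the shifted carry.  */
  uint64_t mid = plh + phl;
  uint64_t carry = (mid < plh) ? 1ULL << 32 : 0;

  uint64_t mid2 = mid + (pll >> 32);
  carry += (mid2 < mid) ? 1ULL << 32 : 0;

  return phh + (mid2 >> 32) + carry;
}

int
main (void)
{
  /* (2^64 - 1)^2 = 2^128 - 2^65 + 1, so the high half is 2^64 - 2;
     the middle sum overflows here, exercising the carry path.  */
  assert (mul64hi (~0ULL, ~0ULL) == ~0ULL - 1);
  assert (mul64hi (1ULL << 32, 1ULL << 32) == 1);
  assert (mul64hi (123, 456) == 0);
  return 0;
}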
int64_t value2 = aarch64_get_reg_u64 (cpu, rm, NO_SP);
uint64_t uvalue1;
uint64_t uvalue2;
- int64_t signum = 1;
+ int negate = 0;
if (ra != R31)
HALT_UNALLOC;
then fix the sign up afterwards. */
if (value1 < 0)
{
- signum *= -1L;
+ negate = !negate;
uvalue1 = -value1;
}
else
if (value2 < 0)
{
- signum *= -1L;
+ negate = !negate;
uvalue2 = -value2;
}
else
}
TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
+
uresult = mul64hi (uvalue1, uvalue2);
result = uresult;
- result *= signum;
+
+ if (negate)
+ {
+ /* Multiply 128-bit result by -1, which means highpart gets inverted,
+ and a carry is added in only if the low part is 0. */
+ result = ~result;
+ if ((uvalue1 * uvalue2) == 0)
+ result += 1;
+ }
aarch64_set_reg_s64 (cpu, rd, NO_SP, result);
}
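The negation step follows from -x == ~x + 1 in two's complement: split
across two 64-bit halves, the +1 reaches the high half only when the
low half is zero. A quick self-check, assuming a GCC/Clang-style host
that provides unsigned __int128 (the simulator itself avoids that
dependency):

#include <assert.h>
#include <stdint.h>

int
main (void)
{
  uint64_t lo_vals[4] = { 0, 1, 0xDEADBEEFULL, ~0ULL };
  uint64_t hi = 0x0123456789ABCDEFULL;
  int i;

  for (i = 0; i < 4; i++)
    {
      unsigned __int128 x = ((unsigned __int128) hi << 64) | lo_vals[i];
      unsigned __int128 neg = -x;

      /* Half-wise negation: invert the high part, add the carry in
         only when the low part is zero.  */
      uint64_t neg_hi = ~hi + (lo_vals[i] == 0 ? 1 : 0);
      uint64_t neg_lo = ~lo_vals[i] + 1;

      assert ((uint64_t) (neg >> 64) == neg_hi);
      assert ((uint64_t) neg == neg_lo);
    }
  return 0;
}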
" %*scall %" PRIx64 " [%s]"
" [args: %" PRIx64 " %" PRIx64 " %" PRIx64 "]",
stack_depth, " ", aarch64_get_next_PC (cpu),
- aarch64_get_func (aarch64_get_next_PC (cpu)),
+ aarch64_get_func (CPU_STATE (cpu),
+ aarch64_get_next_PC (cpu)),
aarch64_get_reg_u64 (cpu, 0, NO_SP),
aarch64_get_reg_u64 (cpu, 1, NO_SP),
aarch64_get_reg_u64 (cpu, 2, NO_SP)
static void
blr (sim_cpu *cpu)
{
- unsigned rn = INSTR (9, 5);
+ /* Ensure we read the destination before we write LR. */
+ uint64_t target = aarch64_get_reg_u64 (cpu, INSTR (9, 5), NO_SP);
TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
- /* The pseudo code in the spec says we update LR before fetching.
- the value from the rn. */
aarch64_save_LR (cpu);
- aarch64_set_next_PC (cpu, aarch64_get_reg_u64 (cpu, rn, NO_SP));
+ aarch64_set_next_PC (cpu, target);
if (TRACE_BRANCH_P (cpu))
{
" %*scall %" PRIx64 " [%s]"
" [args: %" PRIx64 " %" PRIx64 " %" PRIx64 "]",
stack_depth, " ", aarch64_get_next_PC (cpu),
- aarch64_get_func (aarch64_get_next_PC (cpu)),
+ aarch64_get_func (CPU_STATE (cpu),
+ aarch64_get_next_PC (cpu)),
aarch64_get_reg_u64 (cpu, 0, NO_SP),
aarch64_get_reg_u64 (cpu, 1, NO_SP),
aarch64_get_reg_u64 (cpu, 2, NO_SP)
unsigned rt = INSTR (4, 0);
TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
- if (aarch64_get_reg_u64 (cpu, rt, NO_SP) & (1 << pos))
+ if (aarch64_get_reg_u64 (cpu, rt, NO_SP) & (((uint64_t) 1) << pos))
aarch64_set_next_PC_by_offset (cpu, offset);
}
unsigned rt = INSTR (4, 0);
TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
- if (!(aarch64_get_reg_u64 (cpu, rt, NO_SP) & (1 << pos)))
+ if (!(aarch64_get_reg_u64 (cpu, rt, NO_SP) & (((uint64_t) 1) << pos)))
aarch64_set_next_PC_by_offset (cpu, offset);
}
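The cast matters because the literal 1 has type int: for bit positions
of 31 and up the shift is undefined or truncated before it ever meets
the 64-bit register value. A minimal illustration (shift amounts kept
in range so the narrow variant is still well defined):

#include <assert.h>
#include <stdint.h>

int
main (void)
{
  uint64_t reg = 1ULL << 40;   /* TBZ/TBNZ testing bit 40.  */
  unsigned pos = 40;

  /* Correct: widen the mask operand to 64 bits before shifting.  */
  assert ((reg & ((uint64_t) 1 << pos)) != 0);

  /* A 32-bit mask can never reach bits 32-63: this tests bit 8.  */
  assert ((reg & (uint64_t) ((uint32_t) 1 << (pos & 31))) == 0);
  return 0;
}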
instr[18,5] = simm14 : signed offset counted in words
instr[4,0] = uimm5 */
- uint32_t pos = ((INSTR (31, 31) << 4) | INSTR (23, 19));
+ uint32_t pos = ((INSTR (31, 31) << 5) | INSTR (23, 19));
int32_t offset = simm32 (aarch64_get_instr (cpu), 18, 5) << 2;
NYI_assert (30, 25, 0x1b);
return FALSE;
aarch64_set_next_PC (cpu, pc + 4);
- aarch64_get_instr (cpu) = aarch64_get_mem_u32 (cpu, pc);
+
+ /* Code is always little-endian. */
+ sim_core_read_buffer (CPU_STATE (cpu), cpu, read_map,
+ & aarch64_get_instr (cpu), pc, 4);
+ aarch64_get_instr (cpu) = endian_le2h_4 (aarch64_get_instr (cpu));
TRACE_INSN (cpu, " pc = %" PRIx64 " instr = %08x", pc,
aarch64_get_instr (cpu));
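AArch64 instruction memory is always little-endian, whatever the data
endianness of the target or host, so the fetch now goes through an
explicit byte-order conversion. What endian_le2h_4 achieves, as a
host-independent sketch:

#include <assert.h>
#include <stdint.h>

/* Assemble a 32-bit instruction word from little-endian code bytes.  */
static uint32_t
le32_to_host (const uint8_t *p)
{
  return (uint32_t) p[0]
         | ((uint32_t) p[1] << 8)
         | ((uint32_t) p[2] << 16)
         | ((uint32_t) p[3] << 24);
}

int
main (void)
{
  /* 0xd65f03c0 is the encoding of RET.  */
  const uint8_t code[4] = { 0xc0, 0x03, 0x5f, 0xd6 };
  assert (le32_to_host (code) == 0xd65f03c0);
  return 0;
}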
sim_cpu *cpu = STATE_CPU (sd, 0);
while (aarch64_step (cpu))
- aarch64_update_PC (cpu);
+ {
+ aarch64_update_PC (cpu);
+
+ if (sim_events_tick (sd))
+ sim_events_process (sd);
+ }
- sim_engine_halt (sd, NULL, NULL, aarch64_get_PC (cpu),
- sim_exited, aarch64_get_reg_s32 (cpu, R0, SP_OK));
+ sim_engine_halt (sd, cpu, NULL, aarch64_get_PC (cpu),
+ sim_exited, aarch64_get_reg_s32 (cpu, R0, NO_SP));
}
void