1 /* simulator.c -- Interface for the AArch64 simulator.
2
3 Copyright (C) 2015-2017 Free Software Foundation, Inc.
4
5 Contributed by Red Hat.
6
7 This file is part of GDB.
8
9 This program is free software; you can redistribute it and/or modify
10 it under the terms of the GNU General Public License as published by
11 the Free Software Foundation; either version 3 of the License, or
12 (at your option) any later version.
13
14 This program is distributed in the hope that it will be useful,
15 but WITHOUT ANY WARRANTY; without even the implied warranty of
16 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 GNU General Public License for more details.
18
19 You should have received a copy of the GNU General Public License
20 along with this program. If not, see <http://www.gnu.org/licenses/>. */
21
22 #include "config.h"
23 #include <stdlib.h>
24 #include <stdio.h>
25 #include <string.h>
26 #include <sys/types.h>
27 #include <math.h>
28 #include <time.h>
29 #include <limits.h>
30
31 #include "simulator.h"
32 #include "cpustate.h"
33 #include "memory.h"
34
35 #define NO_SP 0
36 #define SP_OK 1
37
38 #define TST(_flag) (aarch64_test_CPSR_bit (cpu, _flag))
39 #define IS_SET(_X) (TST (( _X )) ? 1 : 0)
40 #define IS_CLEAR(_X) (TST (( _X )) ? 0 : 1)
41
42 /* Space saver macro. */
43 #define INSTR(HIGH, LOW) uimm (aarch64_get_instr (cpu), (HIGH), (LOW))
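/* For example, INSTR (4, 0) extracts the five low order bits, which
   hold the Rt/Rd register number in most of the encodings handled
   below.  */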
44
45 #define HALT_UNALLOC \
46 do \
47 { \
48 TRACE_DISASM (cpu, aarch64_get_PC (cpu)); \
49 TRACE_INSN (cpu, \
50 "Unallocated instruction detected at sim line %d," \
51 " exe addr %" PRIx64, \
52 __LINE__, aarch64_get_PC (cpu)); \
53 sim_engine_halt (CPU_STATE (cpu), cpu, NULL, aarch64_get_PC (cpu),\
54 sim_stopped, SIM_SIGILL); \
55 } \
56 while (0)
57
58 #define HALT_NYI \
59 do \
60 { \
61 TRACE_DISASM (cpu, aarch64_get_PC (cpu)); \
62 TRACE_INSN (cpu, \
63 "Unimplemented instruction detected at sim line %d," \
64 " exe addr %" PRIx64, \
65 __LINE__, aarch64_get_PC (cpu)); \
66 if (! TRACE_ANY_P (cpu)) \
67 sim_io_eprintf (CPU_STATE (cpu), "SIM Error: Unimplemented instruction: %#08x\n", \
68 aarch64_get_instr (cpu)); \
69 sim_engine_halt (CPU_STATE (cpu), cpu, NULL, aarch64_get_PC (cpu),\
70 sim_stopped, SIM_SIGABRT); \
71 } \
72 while (0)
73
74 #define NYI_assert(HI, LO, EXPECTED) \
75 do \
76 { \
77 if (INSTR ((HI), (LO)) != (EXPECTED)) \
78 HALT_NYI; \
79 } \
80 while (0)
81
82 /* Helper functions used by expandLogicalImmediate. */
83
84 /* Return a mask with the low N bits set and all other bits zero. */
85 static inline uint64_t
86 ones (int N)
87 {
88 return (N == 64 ? ~(uint64_t) 0 : ((1ULL << N) - 1));
89 }
90
91 /* Return bit N of VAL, shifted down into bit 0. */
92 static inline uint64_t
93 pickbit (uint64_t val, int N)
94 {
95 return pickbits64 (val, N, N);
96 }
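/* For example, pickbit (0x4, 2) yields 1 while pickbit (0x4, 1)
   yields 0.  */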
97
98 static uint64_t
99 expand_logical_immediate (uint32_t S, uint32_t R, uint32_t N)
100 {
101 uint64_t mask;
102 uint64_t imm;
103 unsigned simd_size;
104
105 /* The immediate value consists of S+1 bits set to 1, left rotated
106 by simd_size - R (in other words, right rotated by R), then replicated. */
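/* For example: N = 0, S = 0x01, R = 1 selects simd_size = 32;
   S+1 = 2 set bits give imm = 0b11; rotating right by 1 gives
   0x80000001, which replicates to 0x8000000180000001.  */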
107 if (N != 0)
108 {
109 simd_size = 64;
110 mask = 0xffffffffffffffffull;
111 }
112 else
113 {
114 switch (S)
115 {
116 case 0x00 ... 0x1f: /* 0xxxxx */ simd_size = 32; break;
117 case 0x20 ... 0x2f: /* 10xxxx */ simd_size = 16; S &= 0xf; break;
118 case 0x30 ... 0x37: /* 110xxx */ simd_size = 8; S &= 0x7; break;
119 case 0x38 ... 0x3b: /* 1110xx */ simd_size = 4; S &= 0x3; break;
120 case 0x3c ... 0x3d: /* 11110x */ simd_size = 2; S &= 0x1; break;
121 default: return 0;
122 }
123 mask = (1ull << simd_size) - 1;
124 /* Top bits are IGNORED. */
125 R &= simd_size - 1;
126 }
127
128 /* NOTE: if S = simd_size - 1 we get 0xf..f which is rejected. */
129 if (S == simd_size - 1)
130 return 0;
131
132 /* S+1 consecutive bits to 1. */
133 /* NOTE: S can't be 63 due to detection above. */
134 imm = (1ull << (S + 1)) - 1;
135
136 /* Rotate to the left by simd_size - R. */
137 if (R != 0)
138 imm = ((imm << (simd_size - R)) & mask) | (imm >> R);
139
140 /* Replicate the value according to SIMD size. */
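/* N.B. each case deliberately falls through to the one below,
   doubling the pattern until all 64 bits are populated.  */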
141 switch (simd_size)
142 {
143 case 2: imm = (imm << 2) | imm;
144 case 4: imm = (imm << 4) | imm;
145 case 8: imm = (imm << 8) | imm;
146 case 16: imm = (imm << 16) | imm;
147 case 32: imm = (imm << 32) | imm;
148 case 64: break;
149 default: return 0;
150 }
151
152 return imm;
153 }
154
155 /* Instr[22,10] encodes N, immr and imms. We want a lookup table
156 entry for each possible combination, i.e. 13 bits worth of entries. */
157 #define LI_TABLE_SIZE (1 << 13)
158 static uint64_t LITable[LI_TABLE_SIZE];
159
160 void
161 aarch64_init_LIT_table (void)
162 {
163 unsigned index;
164
165 for (index = 0; index < LI_TABLE_SIZE; index++)
166 {
167 uint32_t N = uimm (index, 12, 12);
168 uint32_t immr = uimm (index, 11, 6);
169 uint32_t imms = uimm (index, 5, 0);
170
171 LITable [index] = expand_logical_immediate (imms, immr, N);
172 }
173 }
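/* N.B. a zero entry in LITable marks an unallocated immediate
   encoding, since expand_logical_immediate returns 0 for every
   invalid combination of N, immr and imms.  */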
174
175 static void
176 dexNotify (sim_cpu *cpu)
177 {
178 /* instr[14,0] == type : 0 ==> method entry, 1 ==> method reentry
179 2 ==> exit Java, 3 ==> start next bytecode. */
180 uint32_t type = INSTR (14, 0);
181
182 TRACE_EVENTS (cpu, "Notify Insn encountered, type = 0x%x", type);
183
184 switch (type)
185 {
186 case 0:
187 /* aarch64_notifyMethodEntry (aarch64_get_reg_u64 (cpu, R23, 0),
188 aarch64_get_reg_u64 (cpu, R22, 0)); */
189 break;
190 case 1:
191 /* aarch64_notifyMethodReentry (aarch64_get_reg_u64 (cpu, R23, 0),
192 aarch64_get_reg_u64 (cpu, R22, 0)); */
193 break;
194 case 2:
195 /* aarch64_notifyMethodExit (); */
196 break;
197 case 3:
198 /* aarch64_notifyBCStart (aarch64_get_reg_u64 (cpu, R23, 0),
199 aarch64_get_reg_u64 (cpu, R22, 0)); */
200 break;
201 }
202 }
203
204 /* Secondary decode within top-level groups. */
205
206 static void
207 dexPseudo (sim_cpu *cpu)
208 {
209 /* assert instr[28,27] = 00
210
211 We provide 2 pseudo instructions:
212
213 HALT stops execution of the simulator causing an immediate
214 return to the x86 code which entered it.
215
216 CALLOUT initiates recursive entry into x86 code. A register
217 argument holds the address of the x86 routine. Immediate
218 values in the instruction identify the number of general
219 purpose and floating point register arguments to be passed
220 and the type of any value to be returned. */
221
222 uint32_t PSEUDO_HALT = 0xE0000000U;
223 uint32_t PSEUDO_CALLOUT = 0x00018000U;
224 uint32_t PSEUDO_CALLOUTR = 0x00018001U;
225 uint32_t PSEUDO_NOTIFY = 0x00014000U;
226 uint32_t dispatch;
227
228 if (aarch64_get_instr (cpu) == PSEUDO_HALT)
229 {
230 TRACE_EVENTS (cpu, " Pseudo Halt Instruction");
231 sim_engine_halt (CPU_STATE (cpu), cpu, NULL, aarch64_get_PC (cpu),
232 sim_stopped, SIM_SIGTRAP);
233 }
234
235 dispatch = INSTR (31, 15);
236
237 /* We do not handle callouts at the moment. */
238 if (dispatch == PSEUDO_CALLOUT || dispatch == PSEUDO_CALLOUTR)
239 {
240 TRACE_EVENTS (cpu, " Callout");
241 sim_engine_halt (CPU_STATE (cpu), cpu, NULL, aarch64_get_PC (cpu),
242 sim_stopped, SIM_SIGABRT);
243 }
244
245 else if (dispatch == PSEUDO_NOTIFY)
246 dexNotify (cpu);
247
248 else
249 HALT_UNALLOC;
250 }
251
252 /* Load-store single register (unscaled offset)
253 These instructions employ a base register plus an unscaled signed
254 9 bit offset.
255
256 N.B. the base register (source) can be Xn or SP. All other
257 registers may not be SP. */
258
259 /* 32 bit load 32 bit unscaled signed 9 bit. */
260 static void
261 ldur32 (sim_cpu *cpu, int32_t offset)
262 {
263 unsigned rn = INSTR (9, 5);
264 unsigned rt = INSTR (4, 0);
265
266 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
267 aarch64_set_reg_u64 (cpu, rt, NO_SP, aarch64_get_mem_u32
268 (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK)
269 + offset));
270 }
271
272 /* 64 bit load 64 bit unscaled signed 9 bit. */
273 static void
274 ldur64 (sim_cpu *cpu, int32_t offset)
275 {
276 unsigned rn = INSTR (9, 5);
277 unsigned rt = INSTR (4, 0);
278
279 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
280 aarch64_set_reg_u64 (cpu, rt, NO_SP, aarch64_get_mem_u64
281 (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK)
282 + offset));
283 }
284
285 /* 32 bit load zero-extended byte unscaled signed 9 bit. */
286 static void
287 ldurb32 (sim_cpu *cpu, int32_t offset)
288 {
289 unsigned rn = INSTR (9, 5);
290 unsigned rt = INSTR (4, 0);
291
292 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
293 aarch64_set_reg_u64 (cpu, rt, NO_SP, aarch64_get_mem_u8
294 (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK)
295 + offset));
296 }
297
298 /* 32 bit load sign-extended byte unscaled signed 9 bit. */
299 static void
300 ldursb32 (sim_cpu *cpu, int32_t offset)
301 {
302 unsigned rn = INSTR (9, 5);
303 unsigned rt = INSTR (4, 0);
304
305 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
306 aarch64_set_reg_u64 (cpu, rt, NO_SP, (uint32_t) aarch64_get_mem_s8
307 (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK)
308 + offset));
309 }
310
311 /* 64 bit load sign-extended byte unscaled signed 9 bit. */
312 static void
313 ldursb64 (sim_cpu *cpu, int32_t offset)
314 {
315 unsigned rn = INSTR (9, 5);
316 unsigned rt = INSTR (4, 0);
317
318 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
319 aarch64_set_reg_s64 (cpu, rt, NO_SP, aarch64_get_mem_s8
320 (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK)
321 + offset));
322 }
323
324 /* 32 bit load zero-extended short unscaled signed 9 bit */
325 static void
326 ldurh32 (sim_cpu *cpu, int32_t offset)
327 {
328 unsigned rn = INSTR (9, 5);
329 unsigned rd = INSTR (4, 0);
330
331 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
332 aarch64_set_reg_u64 (cpu, rd, NO_SP, aarch64_get_mem_u16
333 (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK)
334 + offset));
335 }
336
337 /* 32 bit load sign-extended short unscaled signed 9 bit */
338 static void
339 ldursh32 (sim_cpu *cpu, int32_t offset)
340 {
341 unsigned rn = INSTR (9, 5);
342 unsigned rd = INSTR (4, 0);
343
344 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
345 aarch64_set_reg_u64 (cpu, rd, NO_SP, (uint32_t) aarch64_get_mem_s16
346 (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK)
347 + offset));
348 }
349
350 /* 64 bit load sign-extended short unscaled signed 9 bit */
351 static void
352 ldursh64 (sim_cpu *cpu, int32_t offset)
353 {
354 unsigned rn = INSTR (9, 5);
355 unsigned rt = INSTR (4, 0);
356
357 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
358 aarch64_set_reg_s64 (cpu, rt, NO_SP, aarch64_get_mem_s16
359 (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK)
360 + offset));
361 }
362
363 /* 64 bit load sign-extended word unscaled signed 9 bit */
364 static void
365 ldursw (sim_cpu *cpu, int32_t offset)
366 {
367 unsigned rn = INSTR (9, 5);
368 unsigned rd = INSTR (4, 0);
369
370 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
371 aarch64_set_reg_s64 (cpu, rd, NO_SP, aarch64_get_mem_s32
372 (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK)
373 + offset));
374 }
375
376 /* N.B. with stores the value in source is written to the address
377 identified by source2 modified by offset. */
378
379 /* 32 bit store 32 bit unscaled signed 9 bit. */
380 static void
381 stur32 (sim_cpu *cpu, int32_t offset)
382 {
383 unsigned rn = INSTR (9, 5);
384 unsigned rd = INSTR (4, 0);
385
386 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
387 aarch64_set_mem_u32 (cpu,
388 aarch64_get_reg_u64 (cpu, rn, SP_OK) + offset,
389 aarch64_get_reg_u32 (cpu, rd, NO_SP));
390 }
391
392 /* 64 bit store 64 bit unscaled signed 9 bit */
393 static void
394 stur64 (sim_cpu *cpu, int32_t offset)
395 {
396 unsigned rn = INSTR (9, 5);
397 unsigned rd = INSTR (4, 0);
398
399 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
400 aarch64_set_mem_u64 (cpu,
401 aarch64_get_reg_u64 (cpu, rn, SP_OK) + offset,
402 aarch64_get_reg_u64 (cpu, rd, NO_SP));
403 }
404
405 /* 32 bit store byte unscaled signed 9 bit */
406 static void
407 sturb (sim_cpu *cpu, int32_t offset)
408 {
409 unsigned rn = INSTR (9, 5);
410 unsigned rd = INSTR (4, 0);
411
412 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
413 aarch64_set_mem_u8 (cpu,
414 aarch64_get_reg_u64 (cpu, rn, SP_OK) + offset,
415 aarch64_get_reg_u8 (cpu, rd, NO_SP));
416 }
417
418 /* 32 bit store short unscaled signed 9 bit */
419 static void
420 sturh (sim_cpu *cpu, int32_t offset)
421 {
422 unsigned rn = INSTR (9, 5);
423 unsigned rd = INSTR (4, 0);
424
425 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
426 aarch64_set_mem_u16 (cpu,
427 aarch64_get_reg_u64 (cpu, rn, SP_OK) + offset,
428 aarch64_get_reg_u16 (cpu, rd, NO_SP));
429 }
430
431 /* Load single register pc-relative label
432 Offset is a signed 19 bit immediate count in words.
433 Rt may not be SP. */
434
435 /* 32 bit pc-relative load */
436 static void
437 ldr32_pcrel (sim_cpu *cpu, int32_t offset)
438 {
439 unsigned rd = INSTR (4, 0);
440
441 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
442 aarch64_set_reg_u64 (cpu, rd, NO_SP,
443 aarch64_get_mem_u32
444 (cpu, aarch64_get_PC (cpu) + offset * 4));
445 }
446
447 /* 64 bit pc-relative load */
448 static void
449 ldr_pcrel (sim_cpu *cpu, int32_t offset)
450 {
451 unsigned rd = INSTR (4, 0);
452
453 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
454 aarch64_set_reg_u64 (cpu, rd, NO_SP,
455 aarch64_get_mem_u64
456 (cpu, aarch64_get_PC (cpu) + offset * 4));
457 }
458
459 /* sign extended 32 bit pc-relative load */
460 static void
461 ldrsw_pcrel (sim_cpu *cpu, int32_t offset)
462 {
463 unsigned rd = INSTR (4, 0);
464
465 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
466 aarch64_set_reg_u64 (cpu, rd, NO_SP,
467 aarch64_get_mem_s32
468 (cpu, aarch64_get_PC (cpu) + offset * 4));
469 }
470
471 /* float pc-relative load */
472 static void
473 fldrs_pcrel (sim_cpu *cpu, int32_t offset)
474 {
475 unsigned int rd = INSTR (4, 0);
476
477 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
478 aarch64_set_vec_u32 (cpu, rd, 0,
479 aarch64_get_mem_u32
480 (cpu, aarch64_get_PC (cpu) + offset * 4));
481 }
482
483 /* double pc-relative load */
484 static void
485 fldrd_pcrel (sim_cpu *cpu, int32_t offset)
486 {
487 unsigned int st = INSTR (4, 0);
488
489 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
490 aarch64_set_vec_u64 (cpu, st, 0,
491 aarch64_get_mem_u64
492 (cpu, aarch64_get_PC (cpu) + offset * 4));
493 }
494
495 /* long double pc-relative load. */
496 static void
497 fldrq_pcrel (sim_cpu *cpu, int32_t offset)
498 {
499 unsigned int st = INSTR (4, 0);
500 uint64_t addr = aarch64_get_PC (cpu) + offset * 4;
501 FRegister a;
502
503 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
504 aarch64_get_mem_long_double (cpu, addr, & a);
505 aarch64_set_FP_long_double (cpu, st, a);
506 }
507
508 /* This can be used to scale an offset by applying
509 the requisite shift. The second argument is either
510 16, 32, 64 or 128. */
511
512 #define SCALE(_offset, _elementSize) \
513 ((_offset) << ScaleShift ## _elementSize)
514
515 /* This can be used to optionally scale a register derived offset
516 by applying the requisite shift as indicated by the Scaling
517 argument. The second argument is the element size in bits,
518 one of 16, 32, 64 or 128. The third argument is either Scaled
519 or Unscaled. N.B. when _Scaling is Scaled the offset is shifted
520 left by the requisite scale shift; when it is Unscaled no shift occurs. */
521
522 #define OPT_SCALE(_offset, _elementType, _Scaling) \
523 ((_offset) << (_Scaling ? ScaleShift ## _elementType : 0))
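/* For example, assuming ScaleShift32 is 2 (log2 of a 4 byte element),
   OPT_SCALE (ix, 32, Scaled) yields ix << 2 while
   OPT_SCALE (ix, 32, Unscaled) yields ix unchanged.  */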
524
525 /* This can be used to zero or sign extend a 32 bit register derived
526 value to a 64 bit value. The first argument must be the value as
527 a uint32_t and the second must be either UXTW or SXTW. The result
528 is returned as an int64_t. */
529
530 static inline int64_t
531 extend (uint32_t value, Extension extension)
532 {
533 union
534 {
535 uint32_t u;
536 int32_t n;
537 } x;
538
539 /* A branchless variant of this ought to be possible. */
540 if (extension == UXTW || extension == NoExtension)
541 return value;
542
543 x.u = value;
544 return x.n;
545 }
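/* For example, extend (0x80000000, SXTW) yields 0xffffffff80000000
   while extend (0x80000000, UXTW) yields 0x0000000080000000.  */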
546
547 /* Scalar Floating Point
548
549 FP load/store single register (4 addressing modes)
550
551 N.B. the base register (source) can be the stack pointer.
552 The secondary source register (source2) can only be an Xn register. */
553
554 /* Load 32 bit unscaled signed 9 bit with pre- or post-writeback. */
555 static void
556 fldrs_wb (sim_cpu *cpu, int32_t offset, WriteBack wb)
557 {
558 unsigned rn = INSTR (9, 5);
559 unsigned st = INSTR (4, 0);
560 uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
561
562 if (wb != Post)
563 address += offset;
564
565 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
566 aarch64_set_vec_u32 (cpu, st, 0, aarch64_get_mem_u32 (cpu, address));
567 if (wb == Post)
568 address += offset;
569
570 if (wb != NoWriteBack)
571 aarch64_set_reg_u64 (cpu, rn, SP_OK, address);
572 }
573
574 /* Load 8 bit with unsigned 12 bit offset. */
575 static void
576 fldrb_abs (sim_cpu *cpu, uint32_t offset)
577 {
578 unsigned rd = INSTR (4, 0);
579 unsigned rn = INSTR (9, 5);
580 uint64_t addr = aarch64_get_reg_u64 (cpu, rn, SP_OK) + offset;
581
582 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
583 aarch64_set_vec_u8 (cpu, rd, 0, aarch64_get_mem_u8 (cpu, addr));
584 }
585
586 /* Load 16 bit scaled unsigned 12 bit. */
587 static void
588 fldrh_abs (sim_cpu *cpu, uint32_t offset)
589 {
590 unsigned rd = INSTR (4, 0);
591 unsigned rn = INSTR (9, 5);
592 uint64_t addr = aarch64_get_reg_u64 (cpu, rn, SP_OK) + SCALE (offset, 16);
593
594 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
595 aarch64_set_vec_u16 (cpu, rd, 0, aarch64_get_mem_u16 (cpu, addr));
596 }
597
598 /* Load 32 bit scaled unsigned 12 bit. */
599 static void
600 fldrs_abs (sim_cpu *cpu, uint32_t offset)
601 {
602 unsigned rd = INSTR (4, 0);
603 unsigned rn = INSTR (9, 5);
604 uint64_t addr = aarch64_get_reg_u64 (cpu, rn, SP_OK) + SCALE (offset, 32);
605
606 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
607 aarch64_set_vec_u32 (cpu, rd, 0, aarch64_get_mem_u32 (cpu, addr));
608 }
609
610 /* Load 64 bit scaled unsigned 12 bit. */
611 static void
612 fldrd_abs (sim_cpu *cpu, uint32_t offset)
613 {
614 unsigned rd = INSTR (4, 0);
615 unsigned rn = INSTR (9, 5);
616 uint64_t addr = aarch64_get_reg_u64 (cpu, rn, SP_OK) + SCALE (offset, 64);
617
618 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
619 aarch64_set_vec_u64 (cpu, rd, 0, aarch64_get_mem_u64 (cpu, addr));
620 }
621
622 /* Load 128 bit scaled unsigned 12 bit. */
623 static void
624 fldrq_abs (sim_cpu *cpu, uint32_t offset)
625 {
626 unsigned rd = INSTR (4, 0);
627 unsigned rn = INSTR (9, 5);
628 uint64_t addr = aarch64_get_reg_u64 (cpu, rn, SP_OK) + SCALE (offset, 128);
629
630 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
631 aarch64_set_vec_u64 (cpu, rd, 0, aarch64_get_mem_u64 (cpu, addr));
632 aarch64_set_vec_u64 (cpu, rd, 1, aarch64_get_mem_u64 (cpu, addr + 8));
633 }
634
635 /* Load 32 bit scaled or unscaled zero- or sign-extended
636 32-bit register offset. */
637 static void
638 fldrs_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
639 {
640 unsigned rm = INSTR (20, 16);
641 unsigned rn = INSTR (9, 5);
642 unsigned st = INSTR (4, 0);
643 uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
644 int64_t extended = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP), extension);
645 uint64_t displacement = OPT_SCALE (extended, 32, scaling);
646
647 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
648 aarch64_set_vec_u32 (cpu, st, 0, aarch64_get_mem_u32
649 (cpu, address + displacement));
650 }
651
652 /* Load 64 bit unscaled signed 9 bit with pre- or post-writeback. */
653 static void
654 fldrd_wb (sim_cpu *cpu, int32_t offset, WriteBack wb)
655 {
656 unsigned rn = INSTR (9, 5);
657 unsigned st = INSTR (4, 0);
658 uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
659
660 if (wb != Post)
661 address += offset;
662
663 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
664 aarch64_set_vec_u64 (cpu, st, 0, aarch64_get_mem_u64 (cpu, address));
665
666 if (wb == Post)
667 address += offset;
668
669 if (wb != NoWriteBack)
670 aarch64_set_reg_u64 (cpu, rn, SP_OK, address);
671 }
672
673 /* Load 64 bit scaled or unscaled zero- or sign-extended 32-bit register offset. */
674 static void
675 fldrd_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
676 {
677 unsigned rm = INSTR (20, 16);
678 int64_t extended = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP), extension);
679 uint64_t displacement = OPT_SCALE (extended, 64, scaling);
680
681 fldrd_wb (cpu, displacement, NoWriteBack);
682 }
683
684 /* Load 128 bit unscaled signed 9 bit with pre- or post-writeback. */
685 static void
686 fldrq_wb (sim_cpu *cpu, int32_t offset, WriteBack wb)
687 {
688 FRegister a;
689 unsigned rn = INSTR (9, 5);
690 unsigned st = INSTR (4, 0);
691 uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
692
693 if (wb != Post)
694 address += offset;
695
696 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
697 aarch64_get_mem_long_double (cpu, address, & a);
698 aarch64_set_FP_long_double (cpu, st, a);
699
700 if (wb == Post)
701 address += offset;
702
703 if (wb != NoWriteBack)
704 aarch64_set_reg_u64 (cpu, rn, SP_OK, address);
705 }
706
707 /* Load 128 bit scaled or unscaled zero- or sign-extended 32-bit register offset */
708 static void
709 fldrq_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
710 {
711 unsigned rm = INSTR (20, 16);
712 int64_t extended = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP), extension);
713 uint64_t displacement = OPT_SCALE (extended, 128, scaling);
714
715 fldrq_wb (cpu, displacement, NoWriteBack);
716 }
717
718 /* Memory Access
719
720 load-store single register
721 There are four addressing modes available here which all employ a
722 64 bit source (base) register.
723
724 N.B. the base register (source) can be the stack pointer.
725 The secondary source register (source2) can only be an Xn register.
726
727 Scaled, 12-bit, unsigned immediate offset, without pre- and
728 post-index options.
729 Unscaled, 9-bit, signed immediate offset with pre- or post-index
730 writeback.
731 Scaled or unscaled 64-bit register offset.
732 Scaled or unscaled 32-bit extended register offset.
733
734 All offsets are assumed to be raw from the decode, i.e. the
735 simulator is expected to adjust scaled offsets based on the
736 accessed data size. With register or extended register offset
737 versions the same applies, except that in the latter case the
738 operation may also require a sign extend.
739
740 A separate method is provided for each possible addressing mode. */
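/* For the 32 bit integer loads, for instance, these modes are
   implemented by ldr32_abs, ldr32_wb (pre- and post-index) and
   ldr32_scale_ext (register and extended register offsets) below;
   the other access widths follow the same naming pattern.  */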
741
742 /* 32 bit load 32 bit scaled unsigned 12 bit */
743 static void
744 ldr32_abs (sim_cpu *cpu, uint32_t offset)
745 {
746 unsigned rn = INSTR (9, 5);
747 unsigned rt = INSTR (4, 0);
748
749 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
750 /* The target register may not be SP but the source may be. */
751 aarch64_set_reg_u64 (cpu, rt, NO_SP, aarch64_get_mem_u32
752 (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK)
753 + SCALE (offset, 32)));
754 }
755
756 /* 32 bit load 32 bit unscaled signed 9 bit with pre- or post-writeback. */
757 static void
758 ldr32_wb (sim_cpu *cpu, int32_t offset, WriteBack wb)
759 {
760 unsigned rn = INSTR (9, 5);
761 unsigned rt = INSTR (4, 0);
762 uint64_t address;
763
764 if (rn == rt && wb != NoWriteBack)
765 HALT_UNALLOC;
766
767 address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
768
769 if (wb != Post)
770 address += offset;
771
772 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
773 aarch64_set_reg_u64 (cpu, rt, NO_SP, aarch64_get_mem_u32 (cpu, address));
774
775 if (wb == Post)
776 address += offset;
777
778 if (wb != NoWriteBack)
779 aarch64_set_reg_u64 (cpu, rn, SP_OK, address);
780 }
781
782 /* 32 bit load 32 bit scaled or unscaled
783 zero- or sign-extended 32-bit register offset */
784 static void
785 ldr32_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
786 {
787 unsigned rm = INSTR (20, 16);
788 unsigned rn = INSTR (9, 5);
789 unsigned rt = INSTR (4, 0);
790 /* rn may reference SP, rm and rt must reference ZR */
791
792 uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
793 int64_t extended = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP), extension);
794 uint64_t displacement = OPT_SCALE (extended, 32, scaling);
795
796 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
797 aarch64_set_reg_u64 (cpu, rt, NO_SP,
798 aarch64_get_mem_u32 (cpu, address + displacement));
799 }
800
801 /* 64 bit load 64 bit scaled unsigned 12 bit */
802 static void
803 ldr_abs (sim_cpu *cpu, uint32_t offset)
804 {
805 unsigned rn = INSTR (9, 5);
806 unsigned rt = INSTR (4, 0);
807
808 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
809 /* The target register may not be SP but the source may be. */
810 aarch64_set_reg_u64 (cpu, rt, NO_SP, aarch64_get_mem_u64
811 (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK)
812 + SCALE (offset, 64)));
813 }
814
815 /* 64 bit load 64 bit unscaled signed 9 bit with pre- or post-writeback. */
816 static void
817 ldr_wb (sim_cpu *cpu, int32_t offset, WriteBack wb)
818 {
819 unsigned rn = INSTR (9, 5);
820 unsigned rt = INSTR (4, 0);
821 uint64_t address;
822
823 if (rn == rt && wb != NoWriteBack)
824 HALT_UNALLOC;
825
826 address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
827
828 if (wb != Post)
829 address += offset;
830
831 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
832 aarch64_set_reg_u64 (cpu, rt, NO_SP, aarch64_get_mem_u64 (cpu, address));
833
834 if (wb == Post)
835 address += offset;
836
837 if (wb != NoWriteBack)
838 aarch64_set_reg_u64 (cpu, rn, SP_OK, address);
839 }
840
841 /* 64 bit load 64 bit scaled or unscaled zero-
842 or sign-extended 32-bit register offset. */
843 static void
844 ldr_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
845 {
846 unsigned rm = INSTR (20, 16);
847 unsigned rn = INSTR (9, 5);
848 unsigned rt = INSTR (4, 0);
849 /* rn may reference SP, rm and rt must reference ZR */
850
851 uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
852 int64_t extended = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP), extension);
853 uint64_t displacement = OPT_SCALE (extended, 64, scaling);
854
855 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
856 aarch64_set_reg_u64 (cpu, rt, NO_SP,
857 aarch64_get_mem_u64 (cpu, address + displacement));
858 }
859
860 /* 32 bit load zero-extended byte scaled unsigned 12 bit. */
861 static void
862 ldrb32_abs (sim_cpu *cpu, uint32_t offset)
863 {
864 unsigned rn = INSTR (9, 5);
865 unsigned rt = INSTR (4, 0);
866
867 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
868 /* The target register may not be SP but the source may be;
869 there is no scaling required for a byte load. */
870 aarch64_set_reg_u64 (cpu, rt, NO_SP,
871 aarch64_get_mem_u8
872 (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK) + offset));
873 }
874
875 /* 32 bit load zero-extended byte unscaled signed 9 bit with pre- or post-writeback. */
876 static void
877 ldrb32_wb (sim_cpu *cpu, int32_t offset, WriteBack wb)
878 {
879 unsigned rn = INSTR (9, 5);
880 unsigned rt = INSTR (4, 0);
881 uint64_t address;
882
883 if (rn == rt && wb != NoWriteBack)
884 HALT_UNALLOC;
885
886 address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
887
888 if (wb != Post)
889 address += offset;
890
891 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
892 aarch64_set_reg_u64 (cpu, rt, NO_SP, aarch64_get_mem_u8 (cpu, address));
893
894 if (wb == Post)
895 address += offset;
896
897 if (wb != NoWriteBack)
898 aarch64_set_reg_u64 (cpu, rn, SP_OK, address);
899 }
900
901 /* 32 bit load zero-extended byte scaled or unscaled zero-
902 or sign-extended 32-bit register offset. */
903 static void
904 ldrb32_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
905 {
906 unsigned rm = INSTR (20, 16);
907 unsigned rn = INSTR (9, 5);
908 unsigned rt = INSTR (4, 0);
909 /* rn may reference SP, rm and rt must reference ZR */
910
911 uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
912 int64_t displacement = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP),
913 extension);
914
915 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
916 /* There is no scaling required for a byte load. */
917 aarch64_set_reg_u64 (cpu, rt, NO_SP,
918 aarch64_get_mem_u8 (cpu, address + displacement));
919 }
920
921 /* 64 bit load sign-extended byte unscaled signed 9 bit
922 with pre- or post-writeback. */
923 static void
924 ldrsb_wb (sim_cpu *cpu, int32_t offset, WriteBack wb)
925 {
926 unsigned rn = INSTR (9, 5);
927 unsigned rt = INSTR (4, 0);
928 uint64_t address;
929 int64_t val;
930
931 if (rn == rt && wb != NoWriteBack)
932 HALT_UNALLOC;
933
934 address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
935
936 if (wb != Post)
937 address += offset;
938
939 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
940 val = aarch64_get_mem_s8 (cpu, address);
941 aarch64_set_reg_s64 (cpu, rt, NO_SP, val);
942
943 if (wb == Post)
944 address += offset;
945
946 if (wb != NoWriteBack)
947 aarch64_set_reg_u64 (cpu, rn, SP_OK, address);
948 }
949
950 /* 64 bit load sign-extended byte scaled unsigned 12 bit. */
951 static void
952 ldrsb_abs (sim_cpu *cpu, uint32_t offset)
953 {
954 ldrsb_wb (cpu, offset, NoWriteBack);
955 }
956
957 /* 64 bit load sign-extended byte scaled or unscaled zero-
958 or sign-extended 32-bit register offset. */
959 static void
960 ldrsb_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
961 {
962 unsigned rm = INSTR (20, 16);
963 unsigned rn = INSTR (9, 5);
964 unsigned rt = INSTR (4, 0);
965 /* rn may reference SP, rm and rt must reference ZR */
966
967 uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
968 int64_t displacement = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP),
969 extension);
970 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
971 /* There is no scaling required for a byte load. */
972 aarch64_set_reg_s64 (cpu, rt, NO_SP,
973 aarch64_get_mem_s8 (cpu, address + displacement));
974 }
975
976 /* 32 bit load zero-extended short scaled unsigned 12 bit. */
977 static void
978 ldrh32_abs (sim_cpu *cpu, uint32_t offset)
979 {
980 unsigned rn = INSTR (9, 5);
981 unsigned rt = INSTR (4, 0);
982 uint32_t val;
983
984 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
985 /* The target register may not be SP but the source may be. */
986 val = aarch64_get_mem_u16 (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK)
987 + SCALE (offset, 16));
988 aarch64_set_reg_u32 (cpu, rt, NO_SP, val);
989 }
990
991 /* 32 bit load zero-extended short unscaled signed 9 bit
992 with pre- or post-writeback. */
993 static void
994 ldrh32_wb (sim_cpu *cpu, int32_t offset, WriteBack wb)
995 {
996 unsigned rn = INSTR (9, 5);
997 unsigned rt = INSTR (4, 0);
998 uint64_t address;
999
1000 if (rn == rt && wb != NoWriteBack)
1001 HALT_UNALLOC;
1002
1003 address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
1004
1005 if (wb != Post)
1006 address += offset;
1007
1008 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
1009 aarch64_set_reg_u32 (cpu, rt, NO_SP, aarch64_get_mem_u16 (cpu, address));
1010
1011 if (wb == Post)
1012 address += offset;
1013
1014 if (wb != NoWriteBack)
1015 aarch64_set_reg_u64 (cpu, rn, SP_OK, address);
1016 }
1017
1018 /* 32 bit load zero-extended short scaled or unscaled zero-
1019 or sign-extended 32-bit register offset. */
1020 static void
1021 ldrh32_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
1022 {
1023 unsigned rm = INSTR (20, 16);
1024 unsigned rn = INSTR (9, 5);
1025 unsigned rt = INSTR (4, 0);
1026 /* rn may reference SP, rm and rt must reference ZR */
1027
1028 uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
1029 int64_t extended = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP), extension);
1030 uint64_t displacement = OPT_SCALE (extended, 16, scaling);
1031
1032 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
1033 aarch64_set_reg_u32 (cpu, rt, NO_SP,
1034 aarch64_get_mem_u16 (cpu, address + displacement));
1035 }
1036
1037 /* 32 bit load sign-extended short scaled unsigned 12 bit. */
1038 static void
1039 ldrsh32_abs (sim_cpu *cpu, uint32_t offset)
1040 {
1041 unsigned rn = INSTR (9, 5);
1042 unsigned rt = INSTR (4, 0);
1043 int32_t val;
1044
1045 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
1046 /* The target register may not be SP but the source may be. */
1047 val = aarch64_get_mem_s16 (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK)
1048 + SCALE (offset, 16));
1049 aarch64_set_reg_s32 (cpu, rt, NO_SP, val);
1050 }
1051
1052 /* 32 bit load sign-extended short unscaled signed 9 bit
1053 with pre- or post-writeback. */
1054 static void
1055 ldrsh32_wb (sim_cpu *cpu, int32_t offset, WriteBack wb)
1056 {
1057 unsigned rn = INSTR (9, 5);
1058 unsigned rt = INSTR (4, 0);
1059 uint64_t address;
1060
1061 if (rn == rt && wb != NoWriteBack)
1062 HALT_UNALLOC;
1063
1064 address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
1065
1066 if (wb != Post)
1067 address += offset;
1068
1069 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
1070 aarch64_set_reg_s32 (cpu, rt, NO_SP,
1071 (int32_t) aarch64_get_mem_s16 (cpu, address));
1072
1073 if (wb == Post)
1074 address += offset;
1075
1076 if (wb != NoWriteBack)
1077 aarch64_set_reg_u64 (cpu, rn, SP_OK, address);
1078 }
1079
1080 /* 32 bit load sign-extended short scaled or unscaled zero-
1081 or sign-extended 32-bit register offset. */
1082 static void
1083 ldrsh32_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
1084 {
1085 unsigned rm = INSTR (20, 16);
1086 unsigned rn = INSTR (9, 5);
1087 unsigned rt = INSTR (4, 0);
1088 /* rn may reference SP, rm and rt must reference ZR */
1089
1090 uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
1091 int64_t extended = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP), extension);
1092 uint64_t displacement = OPT_SCALE (extended, 16, scaling);
1093
1094 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
1095 aarch64_set_reg_s32 (cpu, rt, NO_SP,
1096 (int32_t) aarch64_get_mem_s16
1097 (cpu, address + displacement));
1098 }
1099
1100 /* 64 bit load sign-extended short scaled unsigned 12 bit. */
1101 static void
1102 ldrsh_abs (sim_cpu *cpu, uint32_t offset)
1103 {
1104 unsigned rn = INSTR (9, 5);
1105 unsigned rt = INSTR (4, 0);
1106 int64_t val;
1107
1108 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
1109 /* The target register may not be SP but the source may be. */
1110 val = aarch64_get_mem_s16 (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK)
1111 + SCALE (offset, 16));
1112 aarch64_set_reg_s64 (cpu, rt, NO_SP, val);
1113 }
1114
1115 /* 64 bit load sign-extended short unscaled signed 9 bit
1116 with pre- or post-writeback. */
1117 static void
1118 ldrsh64_wb (sim_cpu *cpu, int32_t offset, WriteBack wb)
1119 {
1120 unsigned rn = INSTR (9, 5);
1121 unsigned rt = INSTR (4, 0);
1122 uint64_t address;
1123 int64_t val;
1124
1125 if (rn == rt && wb != NoWriteBack)
1126 HALT_UNALLOC;
1127
1128 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
1129 address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
1130
1131 if (wb != Post)
1132 address += offset;
1133
1134 val = aarch64_get_mem_s16 (cpu, address);
1135 aarch64_set_reg_s64 (cpu, rt, NO_SP, val);
1136
1137 if (wb == Post)
1138 address += offset;
1139
1140 if (wb != NoWriteBack)
1141 aarch64_set_reg_u64 (cpu, rn, SP_OK, address);
1142 }
1143
1144 /* 64 bit load sign-extended short scaled or unscaled zero-
1145 or sign-extended 32-bit register offset. */
1146 static void
1147 ldrsh_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
1148 {
1149 unsigned rm = INSTR (20, 16);
1150 unsigned rn = INSTR (9, 5);
1151 unsigned rt = INSTR (4, 0);
1152
1153 /* rn may reference SP, rm and rt must reference ZR */
1154
1155 uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
1156 int64_t extended = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP), extension);
1157 uint64_t displacement = OPT_SCALE (extended, 16, scaling);
1158 int64_t val;
1159
1160 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
1161 val = aarch64_get_mem_s16 (cpu, address + displacement);
1162 aarch64_set_reg_s64 (cpu, rt, NO_SP, val);
1163 }
1164
1165 /* 64 bit load sign-extended 32 bit scaled unsigned 12 bit. */
1166 static void
1167 ldrsw_abs (sim_cpu *cpu, uint32_t offset)
1168 {
1169 unsigned rn = INSTR (9, 5);
1170 unsigned rt = INSTR (4, 0);
1171 int64_t val;
1172
1173 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
1174 val = aarch64_get_mem_s32 (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK)
1175 + SCALE (offset, 32));
1176 /* The target register may not be SP but the source may be. */
1177 aarch64_set_reg_s64 (cpu, rt, NO_SP, val);
1178 }
1179
1180 /* 64 bit load sign-extended 32 bit unscaled signed 9 bit
1181 with pre- or post-writeback. */
1182 static void
1183 ldrsw_wb (sim_cpu *cpu, int32_t offset, WriteBack wb)
1184 {
1185 unsigned rn = INSTR (9, 5);
1186 unsigned rt = INSTR (4, 0);
1187 uint64_t address;
1188
1189 if (rn == rt && wb != NoWriteBack)
1190 HALT_UNALLOC;
1191
1192 address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
1193
1194 if (wb != Post)
1195 address += offset;
1196
1197 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
1198 aarch64_set_reg_s64 (cpu, rt, NO_SP, aarch64_get_mem_s32 (cpu, address));
1199
1200 if (wb == Post)
1201 address += offset;
1202
1203 if (wb != NoWriteBack)
1204 aarch64_set_reg_u64 (cpu, rn, SP_OK, address);
1205 }
1206
1207 /* 64 bit load sign-extended 32 bit scaled or unscaled zero-
1208 or sign-extended 32-bit register offset. */
1209 static void
1210 ldrsw_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
1211 {
1212 unsigned rm = INSTR (20, 16);
1213 unsigned rn = INSTR (9, 5);
1214 unsigned rt = INSTR (4, 0);
1215 /* rn may reference SP, rm and rt must reference ZR */
1216
1217 uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
1218 int64_t extended = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP), extension);
1219 uint64_t displacement = OPT_SCALE (extended, 32, scaling);
1220
1221 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
1222 aarch64_set_reg_s64 (cpu, rt, NO_SP,
1223 aarch64_get_mem_s32 (cpu, address + displacement));
1224 }
1225
1226 /* N.B. with stores the value in source is written to the
1227 address identified by source2 modified by source3/offset. */
1228
1229 /* 32 bit store scaled unsigned 12 bit. */
1230 static void
1231 str32_abs (sim_cpu *cpu, uint32_t offset)
1232 {
1233 unsigned rn = INSTR (9, 5);
1234 unsigned rt = INSTR (4, 0);
1235
1236 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
1237 /* The target register may not be SP but the source may be. */
1238 aarch64_set_mem_u32 (cpu, (aarch64_get_reg_u64 (cpu, rn, SP_OK)
1239 + SCALE (offset, 32)),
1240 aarch64_get_reg_u32 (cpu, rt, NO_SP));
1241 }
1242
1243 /* 32 bit store unscaled signed 9 bit with pre- or post-writeback. */
1244 static void
1245 str32_wb (sim_cpu *cpu, int32_t offset, WriteBack wb)
1246 {
1247 unsigned rn = INSTR (9, 5);
1248 unsigned rt = INSTR (4, 0);
1249 uint64_t address;
1250
1251 if (rn == rt && wb != NoWriteBack)
1252 HALT_UNALLOC;
1253
1254 address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
1255 if (wb != Post)
1256 address += offset;
1257
1258 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
1259 aarch64_set_mem_u32 (cpu, address, aarch64_get_reg_u32 (cpu, rt, NO_SP));
1260
1261 if (wb == Post)
1262 address += offset;
1263
1264 if (wb != NoWriteBack)
1265 aarch64_set_reg_u64 (cpu, rn, SP_OK, address);
1266 }
1267
1268 /* 32 bit store scaled or unscaled zero- or
1269 sign-extended 32-bit register offset. */
1270 static void
1271 str32_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
1272 {
1273 unsigned rm = INSTR (20, 16);
1274 unsigned rn = INSTR (9, 5);
1275 unsigned rt = INSTR (4, 0);
1276
1277 uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
1278 int64_t extended = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP), extension);
1279 uint64_t displacement = OPT_SCALE (extended, 32, scaling);
1280
1281 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
1282 aarch64_set_mem_u32 (cpu, address + displacement,
1283 aarch64_get_reg_u32 (cpu, rt, NO_SP));
1284 }
1285
1286 /* 64 bit store scaled unsigned 12 bit. */
1287 static void
1288 str_abs (sim_cpu *cpu, uint32_t offset)
1289 {
1290 unsigned rn = INSTR (9, 5);
1291 unsigned rt = INSTR (4, 0);
1292
1293 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
1294 aarch64_set_mem_u64 (cpu,
1295 aarch64_get_reg_u64 (cpu, rn, SP_OK)
1296 + SCALE (offset, 64),
1297 aarch64_get_reg_u64 (cpu, rt, NO_SP));
1298 }
1299
1300 /* 64 bit store unscaled signed 9 bit with pre- or post-writeback. */
1301 static void
1302 str_wb (sim_cpu *cpu, int32_t offset, WriteBack wb)
1303 {
1304 unsigned rn = INSTR (9, 5);
1305 unsigned rt = INSTR (4, 0);
1306 uint64_t address;
1307
1308 if (rn == rt && wb != NoWriteBack)
1309 HALT_UNALLOC;
1310
1311 address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
1312
1313 if (wb != Post)
1314 address += offset;
1315
1316 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
1317 aarch64_set_mem_u64 (cpu, address, aarch64_get_reg_u64 (cpu, rt, NO_SP));
1318
1319 if (wb == Post)
1320 address += offset;
1321
1322 if (wb != NoWriteBack)
1323 aarch64_set_reg_u64 (cpu, rn, SP_OK, address);
1324 }
1325
1326 /* 64 bit store scaled or unscaled zero-
1327 or sign-extended 32-bit register offset. */
1328 static void
1329 str_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
1330 {
1331 unsigned rm = INSTR (20, 16);
1332 unsigned rn = INSTR (9, 5);
1333 unsigned rt = INSTR (4, 0);
1334 /* rn may reference SP, rm and rt must reference ZR */
1335
1336 uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
1337 int64_t extended = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP),
1338 extension);
1339 uint64_t displacement = OPT_SCALE (extended, 64, scaling);
1340
1341 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
1342 aarch64_set_mem_u64 (cpu, address + displacement,
1343 aarch64_get_reg_u64 (cpu, rt, NO_SP));
1344 }
1345
1346 /* 32 bit store byte scaled unsigned 12 bit. */
1347 static void
1348 strb_abs (sim_cpu *cpu, uint32_t offset)
1349 {
1350 unsigned rn = INSTR (9, 5);
1351 unsigned rt = INSTR (4, 0);
1352
1353 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
1354 /* The target register may not be SP but the source may be.
1355 There is no scaling required for a byte load. */
1356 aarch64_set_mem_u8 (cpu,
1357 aarch64_get_reg_u64 (cpu, rn, SP_OK) + offset,
1358 aarch64_get_reg_u8 (cpu, rt, NO_SP));
1359 }
1360
1361 /* 32 bit store byte unscaled signed 9 bit with pre- or post-writeback. */
1362 static void
1363 strb_wb (sim_cpu *cpu, int32_t offset, WriteBack wb)
1364 {
1365 unsigned rn = INSTR (9, 5);
1366 unsigned rt = INSTR (4, 0);
1367 uint64_t address;
1368
1369 if (rn == rt && wb != NoWriteBack)
1370 HALT_UNALLOC;
1371
1372 address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
1373
1374 if (wb != Post)
1375 address += offset;
1376
1377 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
1378 aarch64_set_mem_u8 (cpu, address, aarch64_get_reg_u8 (cpu, rt, NO_SP));
1379
1380 if (wb == Post)
1381 address += offset;
1382
1383 if (wb != NoWriteBack)
1384 aarch64_set_reg_u64 (cpu, rn, SP_OK, address);
1385 }
1386
1387 /* 32 bit store byte scaled or unscaled zero-
1388 or sign-extended 32-bit register offset. */
1389 static void
1390 strb_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
1391 {
1392 unsigned rm = INSTR (20, 16);
1393 unsigned rn = INSTR (9, 5);
1394 unsigned rt = INSTR (4, 0);
1395 /* rn may reference SP, rm and rt must reference ZR */
1396
1397 uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
1398 int64_t displacement = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP),
1399 extension);
1400
1401 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
1402 /* There is no scaling required for a byte load. */
1403 aarch64_set_mem_u8 (cpu, address + displacement,
1404 aarch64_get_reg_u8 (cpu, rt, NO_SP));
1405 }
1406
1407 /* 32 bit store short scaled unsigned 12 bit. */
1408 static void
1409 strh_abs (sim_cpu *cpu, uint32_t offset)
1410 {
1411 unsigned rn = INSTR (9, 5);
1412 unsigned rt = INSTR (4, 0);
1413
1414 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
1415 /* The target register may not be SP but the source may be. */
1416 aarch64_set_mem_u16 (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK)
1417 + SCALE (offset, 16),
1418 aarch64_get_reg_u16 (cpu, rt, NO_SP));
1419 }
1420
1421 /* 32 bit store short unscaled signed 9 bit with pre- or post-writeback. */
1422 static void
1423 strh_wb (sim_cpu *cpu, int32_t offset, WriteBack wb)
1424 {
1425 unsigned rn = INSTR (9, 5);
1426 unsigned rt = INSTR (4, 0);
1427 uint64_t address;
1428
1429 if (rn == rt && wb != NoWriteBack)
1430 HALT_UNALLOC;
1431
1432 address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
1433
1434 if (wb != Post)
1435 address += offset;
1436
1437 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
1438 aarch64_set_mem_u16 (cpu, address, aarch64_get_reg_u16 (cpu, rt, NO_SP));
1439
1440 if (wb == Post)
1441 address += offset;
1442
1443 if (wb != NoWriteBack)
1444 aarch64_set_reg_u64 (cpu, rn, SP_OK, address);
1445 }
1446
1447 /* 32 bit store short scaled or unscaled zero-
1448 or sign-extended 32-bit register offset. */
1449 static void
1450 strh_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
1451 {
1452 unsigned rm = INSTR (20, 16);
1453 unsigned rn = INSTR (9, 5);
1454 unsigned rt = INSTR (4, 0);
1455 /* rn may reference SP, rm and rt must reference ZR */
1456
1457 uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
1458 int64_t extended = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP), extension);
1459 uint64_t displacement = OPT_SCALE (extended, 16, scaling);
1460
1461 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
1462 aarch64_set_mem_u16 (cpu, address + displacement,
1463 aarch64_get_reg_u16 (cpu, rt, NO_SP));
1464 }
1465
1466 /* Prefetch unsigned 12 bit. */
1467 static void
1468 prfm_abs (sim_cpu *cpu, uint32_t offset)
1469 {
1470 /* instr[4,0] = prfop : 00000 ==> PLDL1KEEP, 00001 ==> PLDL1STRM,
1471 00010 ==> PLDL2KEEP, 00011 ==> PLDL2STRM,
1472 00100 ==> PLDL3KEEP, 00101 ==> PLDL3STRM,
1473 10000 ==> PSTL1KEEP, 10001 ==> PSTL1STRM,
1474 10010 ==> PSTL2KEEP, 10011 ==> PSTL2STRM,
1475 10100 ==> PSTL3KEEP, 10101 ==> PSTL3STRM,
1476 ow ==> UNALLOC
1477 PrfOp prfop = prfop (instr, 4, 0);
1478 uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK)
1479 + SCALE (offset, 64). */
1480
1481 /* TODO : implement prefetch of address. */
1482 }
1483
1484 /* Prefetch scaled or unscaled zero- or sign-extended 32-bit register offset. */
1485 static void
1486 prfm_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
1487 {
1488 /* instr[4,0] = prfop : 00000 ==> PLDL1KEEP, 00001 ==> PLDL1STRM,
1489 00010 ==> PLDL2KEEP, 00011 ==> PLDL2STRM,
1490 00100 ==> PLDL3KEEP, 00101 ==> PLDL3STRM,
1491 10000 ==> PSTL1KEEP, 10001 ==> PSTL1STRM,
1492 10010 ==> PSTL2KEEP, 10011 ==> PSTL2STRM,
1493 10100 ==> PSTL3KEEP, 10101 ==> PSTL3STRM,
1494 ow ==> UNALLOC
1495 rn may reference SP, rm may only reference ZR
1496 PrfOp prfop = prfop (instr, 4, 0);
1497 uint64_t base = aarch64_get_reg_u64 (cpu, rn, SP_OK);
1498 int64_t extended = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP),
1499 extension);
1500 uint64_t displacement = OPT_SCALE (extended, 64, scaling);
1501 uint64_t address = base + displacement. */
1502
1503 /* TODO : implement prefetch of address */
1504 }
1505
1506 /* 64 bit pc-relative prefetch. */
1507 static void
1508 prfm_pcrel (sim_cpu *cpu, int32_t offset)
1509 {
1510 /* instr[4,0] = prfop : 00000 ==> PLDL1KEEP, 00001 ==> PLDL1STRM,
1511 00010 ==> PLDL2KEEP, 00011 ==> PLDL2STRM,
1512 00100 ==> PLDL3KEEP, 00101 ==> PLDL3STRM,
1513 10000 ==> PSTL1KEEP, 10001 ==> PSTL1STRM,
1514 10010 ==> PSTL2KEEP, 10011 ==> PSTL2STRM,
1515 10100 ==> PSTL3KEEP, 10101 ==> PSTL3STRM,
1516 ow ==> UNALLOC
1517 PrfOp prfop = prfop (instr, 4, 0);
1518 uint64_t address = aarch64_get_PC (cpu) + offset. */
1519
1520 /* TODO : implement this */
1521 }
1522
1523 /* Load-store exclusive. */
1524
1525 static void
1526 ldxr (sim_cpu *cpu)
1527 {
1528 unsigned rn = INSTR (9, 5);
1529 unsigned rt = INSTR (4, 0);
1530 uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
1531 int size = INSTR (31, 30);
1532 /* int ordered = INSTR (15, 15); */
1533 /* int exclusive = ! INSTR (23, 23); */
1534
1535 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
1536 switch (size)
1537 {
1538 case 0:
1539 aarch64_set_reg_u64 (cpu, rt, NO_SP, aarch64_get_mem_u8 (cpu, address));
1540 break;
1541 case 1:
1542 aarch64_set_reg_u64 (cpu, rt, NO_SP, aarch64_get_mem_u16 (cpu, address));
1543 break;
1544 case 2:
1545 aarch64_set_reg_u64 (cpu, rt, NO_SP, aarch64_get_mem_u32 (cpu, address));
1546 break;
1547 case 3:
1548 aarch64_set_reg_u64 (cpu, rt, NO_SP, aarch64_get_mem_u64 (cpu, address));
1549 break;
1550 }
1551 }
1552
1553 static void
1554 stxr (sim_cpu *cpu)
1555 {
1556 unsigned rn = INSTR (9, 5);
1557 unsigned rt = INSTR (4, 0);
1558 unsigned rs = INSTR (20, 16);
1559 uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
1560 int size = INSTR (31, 30);
1561 uint64_t data = aarch64_get_reg_u64 (cpu, rt, NO_SP);
1562
1563 switch (size)
1564 {
1565 case 0: aarch64_set_mem_u8 (cpu, address, data); break;
1566 case 1: aarch64_set_mem_u16 (cpu, address, data); break;
1567 case 2: aarch64_set_mem_u32 (cpu, address, data); break;
1568 case 3: aarch64_set_mem_u64 (cpu, address, data); break;
1569 }
1570
1571 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
1572 aarch64_set_reg_u64 (cpu, rs, NO_SP, 0); /* Always report success; exclusivity is not modelled. */
1573 }
1574
1575 static void
1576 dexLoadLiteral (sim_cpu *cpu)
1577 {
1578 /* instr[29,27] == 011
1579 instr[25,24] == 00
1580 instr[31,30:26] = opc: 000 ==> LDRW, 001 ==> FLDRS
1581 010 ==> LDRX, 011 ==> FLDRD
1582 100 ==> LDRSW, 101 ==> FLDRQ
1583 110 ==> PRFM, 111 ==> UNALLOC
1584 instr[26] ==> V : 0 ==> GReg, 1 ==> FReg
1585 instr[23, 5] == simm19 */
1586
1587 /* unsigned rt = INSTR (4, 0); */
1588 uint32_t dispatch = (INSTR (31, 30) << 1) | INSTR (26, 26);
1589 int32_t imm = simm32 (aarch64_get_instr (cpu), 23, 5);
1590
1591 switch (dispatch)
1592 {
1593 case 0: ldr32_pcrel (cpu, imm); break;
1594 case 1: fldrs_pcrel (cpu, imm); break;
1595 case 2: ldr_pcrel (cpu, imm); break;
1596 case 3: fldrd_pcrel (cpu, imm); break;
1597 case 4: ldrsw_pcrel (cpu, imm); break;
1598 case 5: fldrq_pcrel (cpu, imm); break;
1599 case 6: prfm_pcrel (cpu, imm); break;
1600 case 7:
1601 default:
1602 HALT_UNALLOC;
1603 }
1604 }
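/* For example, a 64 bit LDR (literal) has opc = 01 and V = 0, giving
   dispatch = (1 << 1) | 0 = 2, so it is routed to ldr_pcrel.  */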
1605
1606 /* Immediate arithmetic
1607 The aimm argument is a 12 bit unsigned value or a 12 bit unsigned
1608 value left shifted by 12 bits (done at decode).
1609
1610 N.B. the register args (dest, source) can normally be Xn or SP.
1611 The exception occurs for flag setting instructions which may
1612 only use Xn for the output (dest). */
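/* For example, ADD W0, W1, #1, LSL #12 arrives here with aimm
   already decoded to 0x1000.  */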
1613
1614 /* 32 bit add immediate. */
1615 static void
1616 add32 (sim_cpu *cpu, uint32_t aimm)
1617 {
1618 unsigned rn = INSTR (9, 5);
1619 unsigned rd = INSTR (4, 0);
1620
1621 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
1622 aarch64_set_reg_u64 (cpu, rd, SP_OK,
1623 aarch64_get_reg_u32 (cpu, rn, SP_OK) + aimm);
1624 }
1625
1626 /* 64 bit add immediate. */
1627 static void
1628 add64 (sim_cpu *cpu, uint32_t aimm)
1629 {
1630 unsigned rn = INSTR (9, 5);
1631 unsigned rd = INSTR (4, 0);
1632
1633 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
1634 aarch64_set_reg_u64 (cpu, rd, SP_OK,
1635 aarch64_get_reg_u64 (cpu, rn, SP_OK) + aimm);
1636 }
1637
1638 static void
1639 set_flags_for_add32 (sim_cpu *cpu, int32_t value1, int32_t value2)
1640 {
1641 int32_t result = value1 + value2;
1642 int64_t sresult = (int64_t) value1 + (int64_t) value2;
1643 uint64_t uresult = (uint64_t)(uint32_t) value1
1644 + (uint64_t)(uint32_t) value2;
1645 uint32_t flags = 0;
1646
1647 if (result == 0)
1648 flags |= Z;
1649
1650 if (result & (1 << 31))
1651 flags |= N;
1652
1653 if (uresult != (uint32_t) result)
1654 flags |= C;
1655
1656 if (sresult != result)
1657 flags |= V;
1658
1659 aarch64_set_CPSR (cpu, flags);
1660 }
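/* Worked example: 0x7fffffff + 1 gives result 0x80000000, so N and V
   are set (positive operands, negative result) while C and Z stay
   clear.  */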
1661
1662 #define NEG(a) (((a) & signbit) == signbit)
1663 #define POS(a) (((a) & signbit) == 0)
1664
1665 static void
1666 set_flags_for_add64 (sim_cpu *cpu, uint64_t value1, uint64_t value2)
1667 {
1668 uint64_t result = value1 + value2;
1669 uint32_t flags = 0;
1670 uint64_t signbit = 1ULL << 63;
1671
1672 if (result == 0)
1673 flags |= Z;
1674
1675 if (NEG (result))
1676 flags |= N;
1677
1678 if ( (NEG (value1) && NEG (value2))
1679 || (NEG (value1) && POS (result))
1680 || (NEG (value2) && POS (result)))
1681 flags |= C;
1682
1683 if ( (NEG (value1) && NEG (value2) && POS (result))
1684 || (POS (value1) && POS (value2) && NEG (result)))
1685 flags |= V;
1686
1687 aarch64_set_CPSR (cpu, flags);
1688 }
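/* Worked example: 0x8000000000000000 + 0x8000000000000000 gives a
   result of zero, so Z, C and V are all set while N is clear.  */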
1689
1690 static void
1691 set_flags_for_sub32 (sim_cpu *cpu, uint32_t value1, uint32_t value2)
1692 {
1693 uint32_t result = value1 - value2;
1694 uint32_t flags = 0;
1695 uint32_t signbit = 1U << 31;
1696
1697 if (result == 0)
1698 flags |= Z;
1699
1700 if (NEG (result))
1701 flags |= N;
1702
1703 if ( (NEG (value1) && POS (value2))
1704 || (NEG (value1) && POS (result))
1705 || (POS (value2) && POS (result)))
1706 flags |= C;
1707
1708 if ( (NEG (value1) && POS (value2) && POS (result))
1709 || (POS (value1) && NEG (value2) && NEG (result)))
1710 flags |= V;
1711
1712 aarch64_set_CPSR (cpu, flags);
1713 }
1714
1715 static void
1716 set_flags_for_sub64 (sim_cpu *cpu, uint64_t value1, uint64_t value2)
1717 {
1718 uint64_t result = value1 - value2;
1719 uint32_t flags = 0;
1720 uint64_t signbit = 1ULL << 63;
1721
1722 if (result == 0)
1723 flags |= Z;
1724
1725 if (NEG (result))
1726 flags |= N;
1727
1728 if ( (NEG (value1) && POS (value2))
1729 || (NEG (value1) && POS (result))
1730 || (POS (value2) && POS (result)))
1731 flags |= C;
1732
1733 if ( (NEG (value1) && POS (value2) && POS (result))
1734 || (POS (value1) && NEG (value2) && NEG (result)))
1735 flags |= V;
1736
1737 aarch64_set_CPSR (cpu, flags);
1738 }
1739
1740 static void
1741 set_flags_for_binop32 (sim_cpu *cpu, uint32_t result)
1742 {
1743 uint32_t flags = 0;
1744
1745 if (result == 0)
1746 flags |= Z;
1747 else
1748 flags &= ~ Z;
1749
1750 if (result & (1 << 31))
1751 flags |= N;
1752 else
1753 flags &= ~ N;
1754
1755 aarch64_set_CPSR (cpu, flags);
1756 }
1757
1758 static void
1759 set_flags_for_binop64 (sim_cpu *cpu, uint64_t result)
1760 {
1761 uint32_t flags = 0;
1762
1763 if (result == 0)
1764 flags |= Z;
1765 else
1766 flags &= ~ Z;
1767
1768 if (result & (1ULL << 63))
1769 flags |= N;
1770 else
1771 flags &= ~ N;
1772
1773 aarch64_set_CPSR (cpu, flags);
1774 }
1775
1776 /* 32 bit add immediate set flags. */
1777 static void
1778 adds32 (sim_cpu *cpu, uint32_t aimm)
1779 {
1780 unsigned rn = INSTR (9, 5);
1781 unsigned rd = INSTR (4, 0);
1782 /* TODO : do we need to worry about signs here? */
1783 int32_t value1 = aarch64_get_reg_s32 (cpu, rn, SP_OK);
1784
1785 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
1786 aarch64_set_reg_u64 (cpu, rd, NO_SP, value1 + aimm);
1787 set_flags_for_add32 (cpu, value1, aimm);
1788 }
1789
1790 /* 64 bit add immediate set flags. */
1791 static void
1792 adds64 (sim_cpu *cpu, uint32_t aimm)
1793 {
1794 unsigned rn = INSTR (9, 5);
1795 unsigned rd = INSTR (4, 0);
1796 uint64_t value1 = aarch64_get_reg_u64 (cpu, rn, SP_OK);
1797 uint64_t value2 = aimm;
1798
1799 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
1800 aarch64_set_reg_u64 (cpu, rd, NO_SP, value1 + value2);
1801 set_flags_for_add64 (cpu, value1, value2);
1802 }
1803
1804 /* 32 bit sub immediate. */
1805 static void
1806 sub32 (sim_cpu *cpu, uint32_t aimm)
1807 {
1808 unsigned rn = INSTR (9, 5);
1809 unsigned rd = INSTR (4, 0);
1810
1811 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
1812 aarch64_set_reg_u64 (cpu, rd, SP_OK,
1813 aarch64_get_reg_u32 (cpu, rn, SP_OK) - aimm);
1814 }
1815
1816 /* 64 bit sub immediate. */
1817 static void
1818 sub64 (sim_cpu *cpu, uint32_t aimm)
1819 {
1820 unsigned rn = INSTR (9, 5);
1821 unsigned rd = INSTR (4, 0);
1822
1823 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
1824 aarch64_set_reg_u64 (cpu, rd, SP_OK,
1825 aarch64_get_reg_u64 (cpu, rn, SP_OK) - aimm);
1826 }
1827
1828 /* 32 bit sub immediate set flags. */
1829 static void
1830 subs32 (sim_cpu *cpu, uint32_t aimm)
1831 {
1832 unsigned rn = INSTR (9, 5);
1833 unsigned rd = INSTR (4, 0);
1834 uint32_t value1 = aarch64_get_reg_u64 (cpu, rn, SP_OK);
1835 uint32_t value2 = aimm;
1836
1837 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
1838 aarch64_set_reg_u64 (cpu, rd, NO_SP, value1 - value2);
1839 set_flags_for_sub32 (cpu, value1, value2);
1840 }
1841
1842 /* 64 bit sub immediate set flags. */
1843 static void
1844 subs64 (sim_cpu *cpu, uint32_t aimm)
1845 {
1846 unsigned rn = INSTR (9, 5);
1847 unsigned rd = INSTR (4, 0);
1848 uint64_t value1 = aarch64_get_reg_u64 (cpu, rn, SP_OK);
1849 uint32_t value2 = aimm;
1850
1851 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
1852 aarch64_set_reg_u64 (cpu, rd, NO_SP, value1 - value2);
1853 set_flags_for_sub64 (cpu, value1, value2);
1854 }
1855
1856 /* Data Processing Register. */
1857
1858 /* First two helpers to perform the shift operations. */
1859
1860 static inline uint32_t
1861 shifted32 (uint32_t value, Shift shift, uint32_t count)
1862 {
1863 switch (shift)
1864 {
1865 default:
1866 case LSL:
1867 return (value << count);
1868 case LSR:
1869 return (value >> count);
1870 case ASR:
1871 {
1872 int32_t svalue = value;
1873 return (svalue >> count);
1874 }
1875 case ROR:
1876 {
1877 uint32_t top = value >> count;
1878 uint32_t bottom = value << (32 - count);
1879 return (bottom | top);
1880 }
1881 }
1882 }
1883
1884 static inline uint64_t
1885 shifted64 (uint64_t value, Shift shift, uint32_t count)
1886 {
1887 switch (shift)
1888 {
1889 default:
1890 case LSL:
1891 return (value << count);
1892 case LSR:
1893 return (value >> count);
1894 case ASR:
1895 {
1896 int64_t svalue = value;
1897 return (svalue >> count);
1898 }
1899 case ROR:
1900 {
1901 uint64_t top = value >> count;
1902 uint64_t bottom = value << (64 - count);
1903 return (bottom | top);
1904 }
1905 }
1906 }
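
/* Worked example: shifted64 (0x123456789abcdef0, ROR, 4) computes
   top = 0x0123456789abcdef and bottom = 0x0000000000000000 (the low
   nibble of the input is zero here), yielding 0x0123456789abcdef.  */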
1907
1908 /* Arithmetic shifted register.
1909 These allow an optional LSL, ASR or LSR to the second source
1910 register with a count up to the register bit count.
1911
   N.B. register args may not be SP.  */
1913
1914 /* 32 bit ADD shifted register. */
1915 static void
1916 add32_shift (sim_cpu *cpu, Shift shift, uint32_t count)
1917 {
1918 unsigned rm = INSTR (20, 16);
1919 unsigned rn = INSTR (9, 5);
1920 unsigned rd = INSTR (4, 0);
1921
1922 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
1923 aarch64_set_reg_u64 (cpu, rd, NO_SP,
1924 aarch64_get_reg_u32 (cpu, rn, NO_SP)
1925 + shifted32 (aarch64_get_reg_u32 (cpu, rm, NO_SP),
1926 shift, count));
1927 }
1928
1929 /* 64 bit ADD shifted register. */
1930 static void
1931 add64_shift (sim_cpu *cpu, Shift shift, uint32_t count)
1932 {
1933 unsigned rm = INSTR (20, 16);
1934 unsigned rn = INSTR (9, 5);
1935 unsigned rd = INSTR (4, 0);
1936
1937 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
1938 aarch64_set_reg_u64 (cpu, rd, NO_SP,
1939 aarch64_get_reg_u64 (cpu, rn, NO_SP)
1940 + shifted64 (aarch64_get_reg_u64 (cpu, rm, NO_SP),
1941 shift, count));
1942 }
1943
1944 /* 32 bit ADD shifted register setting flags. */
1945 static void
1946 adds32_shift (sim_cpu *cpu, Shift shift, uint32_t count)
1947 {
1948 unsigned rm = INSTR (20, 16);
1949 unsigned rn = INSTR (9, 5);
1950 unsigned rd = INSTR (4, 0);
1951
1952 uint32_t value1 = aarch64_get_reg_u32 (cpu, rn, NO_SP);
1953 uint32_t value2 = shifted32 (aarch64_get_reg_u32 (cpu, rm, NO_SP),
1954 shift, count);
1955
1956 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
1957 aarch64_set_reg_u64 (cpu, rd, NO_SP, value1 + value2);
1958 set_flags_for_add32 (cpu, value1, value2);
1959 }
1960
1961 /* 64 bit ADD shifted register setting flags. */
1962 static void
1963 adds64_shift (sim_cpu *cpu, Shift shift, uint32_t count)
1964 {
1965 unsigned rm = INSTR (20, 16);
1966 unsigned rn = INSTR (9, 5);
1967 unsigned rd = INSTR (4, 0);
1968
1969 uint64_t value1 = aarch64_get_reg_u64 (cpu, rn, NO_SP);
1970 uint64_t value2 = shifted64 (aarch64_get_reg_u64 (cpu, rm, NO_SP),
1971 shift, count);
1972
1973 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
1974 aarch64_set_reg_u64 (cpu, rd, NO_SP, value1 + value2);
1975 set_flags_for_add64 (cpu, value1, value2);
1976 }
1977
1978 /* 32 bit SUB shifted register. */
1979 static void
1980 sub32_shift (sim_cpu *cpu, Shift shift, uint32_t count)
1981 {
1982 unsigned rm = INSTR (20, 16);
1983 unsigned rn = INSTR (9, 5);
1984 unsigned rd = INSTR (4, 0);
1985
1986 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
1987 aarch64_set_reg_u64 (cpu, rd, NO_SP,
1988 aarch64_get_reg_u32 (cpu, rn, NO_SP)
1989 - shifted32 (aarch64_get_reg_u32 (cpu, rm, NO_SP),
1990 shift, count));
1991 }
1992
1993 /* 64 bit SUB shifted register. */
1994 static void
1995 sub64_shift (sim_cpu *cpu, Shift shift, uint32_t count)
1996 {
1997 unsigned rm = INSTR (20, 16);
1998 unsigned rn = INSTR (9, 5);
1999 unsigned rd = INSTR (4, 0);
2000
2001 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2002 aarch64_set_reg_u64 (cpu, rd, NO_SP,
2003 aarch64_get_reg_u64 (cpu, rn, NO_SP)
2004 - shifted64 (aarch64_get_reg_u64 (cpu, rm, NO_SP),
2005 shift, count));
2006 }
2007
2008 /* 32 bit SUB shifted register setting flags. */
2009 static void
2010 subs32_shift (sim_cpu *cpu, Shift shift, uint32_t count)
2011 {
2012 unsigned rm = INSTR (20, 16);
2013 unsigned rn = INSTR (9, 5);
2014 unsigned rd = INSTR (4, 0);
2015
2016 uint32_t value1 = aarch64_get_reg_u32 (cpu, rn, NO_SP);
2017 uint32_t value2 = shifted32 (aarch64_get_reg_u32 (cpu, rm, NO_SP),
2018 shift, count);
2019
2020 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2021 aarch64_set_reg_u64 (cpu, rd, NO_SP, value1 - value2);
2022 set_flags_for_sub32 (cpu, value1, value2);
2023 }
2024
2025 /* 64 bit SUB shifted register setting flags. */
2026 static void
2027 subs64_shift (sim_cpu *cpu, Shift shift, uint32_t count)
2028 {
2029 unsigned rm = INSTR (20, 16);
2030 unsigned rn = INSTR (9, 5);
2031 unsigned rd = INSTR (4, 0);
2032
2033 uint64_t value1 = aarch64_get_reg_u64 (cpu, rn, NO_SP);
2034 uint64_t value2 = shifted64 (aarch64_get_reg_u64 (cpu, rm, NO_SP),
2035 shift, count);
2036
2037 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2038 aarch64_set_reg_u64 (cpu, rd, NO_SP, value1 - value2);
2039 set_flags_for_sub64 (cpu, value1, value2);
2040 }
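
/* N.B. these flag-setting forms also implement the CMP/CMN aliases
   (Rd == 31): with NO_SP the write to register 31 should be
   discarded by aarch64_set_reg_u64 (zero-register semantics,
   assuming the register-file helper honours NO_SP that way), so
   only the flag update survives.  */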
2041
2042 /* First a couple more helpers to fetch the
2043 relevant source register element either
2044 sign or zero extended as required by the
2045 extension value. */
2046
2047 static uint32_t
2048 extreg32 (sim_cpu *cpu, unsigned int lo, Extension extension)
2049 {
2050 switch (extension)
2051 {
2052 case UXTB: return aarch64_get_reg_u8 (cpu, lo, NO_SP);
2053 case UXTH: return aarch64_get_reg_u16 (cpu, lo, NO_SP);
2054 case UXTW: /* Fall through. */
2055 case UXTX: return aarch64_get_reg_u32 (cpu, lo, NO_SP);
2056 case SXTB: return aarch64_get_reg_s8 (cpu, lo, NO_SP);
2057 case SXTH: return aarch64_get_reg_s16 (cpu, lo, NO_SP);
2058 case SXTW: /* Fall through. */
2059 case SXTX: /* Fall through. */
2060 default: return aarch64_get_reg_s32 (cpu, lo, NO_SP);
2061 }
2062 }
2063
2064 static uint64_t
2065 extreg64 (sim_cpu *cpu, unsigned int lo, Extension extension)
2066 {
2067 switch (extension)
2068 {
2069 case UXTB: return aarch64_get_reg_u8 (cpu, lo, NO_SP);
2070 case UXTH: return aarch64_get_reg_u16 (cpu, lo, NO_SP);
2071 case UXTW: return aarch64_get_reg_u32 (cpu, lo, NO_SP);
2072 case UXTX: return aarch64_get_reg_u64 (cpu, lo, NO_SP);
2073 case SXTB: return aarch64_get_reg_s8 (cpu, lo, NO_SP);
2074 case SXTH: return aarch64_get_reg_s16 (cpu, lo, NO_SP);
2075 case SXTW: return aarch64_get_reg_s32 (cpu, lo, NO_SP);
2076 case SXTX:
2077 default: return aarch64_get_reg_s64 (cpu, lo, NO_SP);
2078 }
2079 }
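
/* For example, if the low byte of register rm is 0x80 then
   extreg64 (cpu, rm, SXTB) sign extends it to 0xffffffffffffff80,
   whereas extreg64 (cpu, rm, UXTB) zero extends it to 0x80.  */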
2080
2081 /* Arithmetic extending register
   These allow an optional sign or zero extension of some portion of
   the second source register followed by an optional left shift of
   between 0 and 4 bits.

   N.B. the output (dest) and first input arg (source) may normally
   be Xn or SP.  However, for flag setting operations dest can only
   be Xn.  Second input registers are always Xn.  */
2089
2090 /* 32 bit ADD extending register. */
2091 static void
2092 add32_ext (sim_cpu *cpu, Extension extension, uint32_t shift)
2093 {
2094 unsigned rm = INSTR (20, 16);
2095 unsigned rn = INSTR (9, 5);
2096 unsigned rd = INSTR (4, 0);
2097
2098 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2099 aarch64_set_reg_u64 (cpu, rd, SP_OK,
2100 aarch64_get_reg_u32 (cpu, rn, SP_OK)
2101 + (extreg32 (cpu, rm, extension) << shift));
2102 }
2103
2104 /* 64 bit ADD extending register.
2105 N.B. This subsumes the case with 64 bit source2 and UXTX #n or LSL #0. */
2106 static void
2107 add64_ext (sim_cpu *cpu, Extension extension, uint32_t shift)
2108 {
2109 unsigned rm = INSTR (20, 16);
2110 unsigned rn = INSTR (9, 5);
2111 unsigned rd = INSTR (4, 0);
2112
2113 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2114 aarch64_set_reg_u64 (cpu, rd, SP_OK,
2115 aarch64_get_reg_u64 (cpu, rn, SP_OK)
2116 + (extreg64 (cpu, rm, extension) << shift));
2117 }
2118
2119 /* 32 bit ADD extending register setting flags. */
2120 static void
2121 adds32_ext (sim_cpu *cpu, Extension extension, uint32_t shift)
2122 {
2123 unsigned rm = INSTR (20, 16);
2124 unsigned rn = INSTR (9, 5);
2125 unsigned rd = INSTR (4, 0);
2126
2127 uint32_t value1 = aarch64_get_reg_u32 (cpu, rn, SP_OK);
2128 uint32_t value2 = extreg32 (cpu, rm, extension) << shift;
2129
2130 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2131 aarch64_set_reg_u64 (cpu, rd, NO_SP, value1 + value2);
2132 set_flags_for_add32 (cpu, value1, value2);
2133 }
2134
2135 /* 64 bit ADD extending register setting flags */
2136 /* N.B. this subsumes the case with 64 bit source2 and UXTX #n or LSL #0 */
2137 static void
2138 adds64_ext (sim_cpu *cpu, Extension extension, uint32_t shift)
2139 {
2140 unsigned rm = INSTR (20, 16);
2141 unsigned rn = INSTR (9, 5);
2142 unsigned rd = INSTR (4, 0);
2143
2144 uint64_t value1 = aarch64_get_reg_u64 (cpu, rn, SP_OK);
2145 uint64_t value2 = extreg64 (cpu, rm, extension) << shift;
2146
2147 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2148 aarch64_set_reg_u64 (cpu, rd, NO_SP, value1 + value2);
2149 set_flags_for_add64 (cpu, value1, value2);
2150 }
2151
2152 /* 32 bit SUB extending register. */
2153 static void
2154 sub32_ext (sim_cpu *cpu, Extension extension, uint32_t shift)
2155 {
2156 unsigned rm = INSTR (20, 16);
2157 unsigned rn = INSTR (9, 5);
2158 unsigned rd = INSTR (4, 0);
2159
2160 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2161 aarch64_set_reg_u64 (cpu, rd, SP_OK,
2162 aarch64_get_reg_u32 (cpu, rn, SP_OK)
2163 - (extreg32 (cpu, rm, extension) << shift));
2164 }
2165
2166 /* 64 bit SUB extending register. */
2167 /* N.B. this subsumes the case with 64 bit source2 and UXTX #n or LSL #0. */
2168 static void
2169 sub64_ext (sim_cpu *cpu, Extension extension, uint32_t shift)
2170 {
2171 unsigned rm = INSTR (20, 16);
2172 unsigned rn = INSTR (9, 5);
2173 unsigned rd = INSTR (4, 0);
2174
2175 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2176 aarch64_set_reg_u64 (cpu, rd, SP_OK,
2177 aarch64_get_reg_u64 (cpu, rn, SP_OK)
2178 - (extreg64 (cpu, rm, extension) << shift));
2179 }
2180
2181 /* 32 bit SUB extending register setting flags. */
2182 static void
2183 subs32_ext (sim_cpu *cpu, Extension extension, uint32_t shift)
2184 {
2185 unsigned rm = INSTR (20, 16);
2186 unsigned rn = INSTR (9, 5);
2187 unsigned rd = INSTR (4, 0);
2188
2189 uint32_t value1 = aarch64_get_reg_u32 (cpu, rn, SP_OK);
2190 uint32_t value2 = extreg32 (cpu, rm, extension) << shift;
2191
2192 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2193 aarch64_set_reg_u64 (cpu, rd, NO_SP, value1 - value2);
2194 set_flags_for_sub32 (cpu, value1, value2);
2195 }
2196
2197 /* 64 bit SUB extending register setting flags */
2198 /* N.B. this subsumes the case with 64 bit source2 and UXTX #n or LSL #0 */
2199 static void
2200 subs64_ext (sim_cpu *cpu, Extension extension, uint32_t shift)
2201 {
2202 unsigned rm = INSTR (20, 16);
2203 unsigned rn = INSTR (9, 5);
2204 unsigned rd = INSTR (4, 0);
2205
2206 uint64_t value1 = aarch64_get_reg_u64 (cpu, rn, SP_OK);
2207 uint64_t value2 = extreg64 (cpu, rm, extension) << shift;
2208
2209 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2210 aarch64_set_reg_u64 (cpu, rd, NO_SP, value1 - value2);
2211 set_flags_for_sub64 (cpu, value1, value2);
2212 }
2213
2214 static void
2215 dexAddSubtractImmediate (sim_cpu *cpu)
2216 {
2217 /* instr[31] = size : 0 ==> 32 bit, 1 ==> 64 bit
2218 instr[30] = op : 0 ==> ADD, 1 ==> SUB
2219 instr[29] = set : 0 ==> no flags, 1 ==> set flags
2220 instr[28,24] = 10001
     instr[23,22] = shift : 00 ==> LSL#0, 01 ==> LSL#12, 1x ==> UNALLOC
2222 instr[21,10] = uimm12
2223 instr[9,5] = Rn
2224 instr[4,0] = Rd */
2225
2226 /* N.B. the shift is applied at decode before calling the add/sub routine. */
2227 uint32_t shift = INSTR (23, 22);
2228 uint32_t imm = INSTR (21, 10);
2229 uint32_t dispatch = INSTR (31, 29);
2230
2231 NYI_assert (28, 24, 0x11);
2232
2233 if (shift > 1)
2234 HALT_UNALLOC;
2235
2236 if (shift)
2237 imm <<= 12;
2238
2239 switch (dispatch)
2240 {
2241 case 0: add32 (cpu, imm); break;
2242 case 1: adds32 (cpu, imm); break;
2243 case 2: sub32 (cpu, imm); break;
2244 case 3: subs32 (cpu, imm); break;
2245 case 4: add64 (cpu, imm); break;
2246 case 5: adds64 (cpu, imm); break;
2247 case 6: sub64 (cpu, imm); break;
2248 case 7: subs64 (cpu, imm); break;
2249 }
2250 }
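
/* Decode example: the encoding 0x91400420 (ADD X0, X1, #1, LSL #12)
   has shift = 1 and uimm12 = 1, so imm becomes 0x1000 before add64
   is dispatched with size:op:set = 100.  */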
2251
2252 static void
2253 dexAddSubtractShiftedRegister (sim_cpu *cpu)
2254 {
2255 /* instr[31] = size : 0 ==> 32 bit, 1 ==> 64 bit
2256 instr[30,29] = op : 00 ==> ADD, 01 ==> ADDS, 10 ==> SUB, 11 ==> SUBS
2257 instr[28,24] = 01011
2258 instr[23,22] = shift : 0 ==> LSL, 1 ==> LSR, 2 ==> ASR, 3 ==> UNALLOC
2259 instr[21] = 0
2260 instr[20,16] = Rm
2261 instr[15,10] = count : must be 0xxxxx for 32 bit
2262 instr[9,5] = Rn
2263 instr[4,0] = Rd */
2264
2265 uint32_t size = INSTR (31, 31);
2266 uint32_t count = INSTR (15, 10);
2267 Shift shiftType = INSTR (23, 22);
2268
2269 NYI_assert (28, 24, 0x0B);
2270 NYI_assert (21, 21, 0);
2271
2272 /* Shift encoded as ROR is unallocated. */
2273 if (shiftType == ROR)
2274 HALT_UNALLOC;
2275
2276 /* 32 bit operations must have count[5] = 0
2277 or else we have an UNALLOC. */
2278 if (size == 0 && uimm (count, 5, 5))
2279 HALT_UNALLOC;
2280
2281 /* Dispatch on size:op i.e instr [31,29]. */
2282 switch (INSTR (31, 29))
2283 {
2284 case 0: add32_shift (cpu, shiftType, count); break;
2285 case 1: adds32_shift (cpu, shiftType, count); break;
2286 case 2: sub32_shift (cpu, shiftType, count); break;
2287 case 3: subs32_shift (cpu, shiftType, count); break;
2288 case 4: add64_shift (cpu, shiftType, count); break;
2289 case 5: adds64_shift (cpu, shiftType, count); break;
2290 case 6: sub64_shift (cpu, shiftType, count); break;
2291 case 7: subs64_shift (cpu, shiftType, count); break;
2292 }
2293 }
2294
2295 static void
2296 dexAddSubtractExtendedRegister (sim_cpu *cpu)
2297 {
2298 /* instr[31] = size : 0 ==> 32 bit, 1 ==> 64 bit
2299 instr[30] = op : 0 ==> ADD, 1 ==> SUB
2300 instr[29] = set? : 0 ==> no flags, 1 ==> set flags
2301 instr[28,24] = 01011
2302 instr[23,22] = opt : 0 ==> ok, 1,2,3 ==> UNALLOC
2303 instr[21] = 1
2304 instr[20,16] = Rm
     instr[15,13] = option : 000 ==> UXTB, 001 ==> UXTH,
                             010 ==> UXTW|LSL, 011 ==> UXTX,
                             100 ==> SXTB, 101 ==> SXTH,
                             110 ==> SXTW, 111 ==> SXTX
2309 instr[12,10] = shift : 0,1,2,3,4 ==> ok, 5,6,7 ==> UNALLOC
2310 instr[9,5] = Rn
2311 instr[4,0] = Rd */
2312
2313 Extension extensionType = INSTR (15, 13);
2314 uint32_t shift = INSTR (12, 10);
2315
2316 NYI_assert (28, 24, 0x0B);
2317 NYI_assert (21, 21, 1);
2318
2319 /* Shift may not exceed 4. */
2320 if (shift > 4)
2321 HALT_UNALLOC;
2322
2323 /* Dispatch on size:op:set?. */
2324 switch (INSTR (31, 29))
2325 {
2326 case 0: add32_ext (cpu, extensionType, shift); break;
2327 case 1: adds32_ext (cpu, extensionType, shift); break;
2328 case 2: sub32_ext (cpu, extensionType, shift); break;
2329 case 3: subs32_ext (cpu, extensionType, shift); break;
2330 case 4: add64_ext (cpu, extensionType, shift); break;
2331 case 5: adds64_ext (cpu, extensionType, shift); break;
2332 case 6: sub64_ext (cpu, extensionType, shift); break;
2333 case 7: subs64_ext (cpu, extensionType, shift); break;
2334 }
2335 }
2336
2337 /* Conditional data processing
2338 Condition register is implicit 3rd source. */
2339
2340 /* 32 bit add with carry. */
/* N.B. register args may not be SP.  */
2342
2343 static void
2344 adc32 (sim_cpu *cpu)
2345 {
2346 unsigned rm = INSTR (20, 16);
2347 unsigned rn = INSTR (9, 5);
2348 unsigned rd = INSTR (4, 0);
2349
2350 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2351 aarch64_set_reg_u64 (cpu, rd, NO_SP,
2352 aarch64_get_reg_u32 (cpu, rn, NO_SP)
2353 + aarch64_get_reg_u32 (cpu, rm, NO_SP)
2354 + IS_SET (C));
2355 }
2356
2357 /* 64 bit add with carry */
2358 static void
2359 adc64 (sim_cpu *cpu)
2360 {
2361 unsigned rm = INSTR (20, 16);
2362 unsigned rn = INSTR (9, 5);
2363 unsigned rd = INSTR (4, 0);
2364
2365 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2366 aarch64_set_reg_u64 (cpu, rd, NO_SP,
2367 aarch64_get_reg_u64 (cpu, rn, NO_SP)
2368 + aarch64_get_reg_u64 (cpu, rm, NO_SP)
2369 + IS_SET (C));
2370 }
2371
2372 /* 32 bit add with carry setting flags. */
2373 static void
2374 adcs32 (sim_cpu *cpu)
2375 {
2376 unsigned rm = INSTR (20, 16);
2377 unsigned rn = INSTR (9, 5);
2378 unsigned rd = INSTR (4, 0);
2379
2380 uint32_t value1 = aarch64_get_reg_u32 (cpu, rn, NO_SP);
2381 uint32_t value2 = aarch64_get_reg_u32 (cpu, rm, NO_SP);
2382 uint32_t carry = IS_SET (C);
2383
2384 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2385 aarch64_set_reg_u64 (cpu, rd, NO_SP, value1 + value2 + carry);
2386 set_flags_for_add32 (cpu, value1, value2 + carry);
2387 }
2388
2389 /* 64 bit add with carry setting flags. */
2390 static void
2391 adcs64 (sim_cpu *cpu)
2392 {
2393 unsigned rm = INSTR (20, 16);
2394 unsigned rn = INSTR (9, 5);
2395 unsigned rd = INSTR (4, 0);
2396
2397 uint64_t value1 = aarch64_get_reg_u64 (cpu, rn, NO_SP);
2398 uint64_t value2 = aarch64_get_reg_u64 (cpu, rm, NO_SP);
2399 uint64_t carry = IS_SET (C);
2400
2401 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2402 aarch64_set_reg_u64 (cpu, rd, NO_SP, value1 + value2 + carry);
2403 set_flags_for_add64 (cpu, value1, value2 + carry);
2404 }
2405
2406 /* 32 bit sub with carry. */
2407 static void
2408 sbc32 (sim_cpu *cpu)
2409 {
2410 unsigned rm = INSTR (20, 16);
2411 unsigned rn = INSTR (9, 5); /* ngc iff rn == 31. */
2412 unsigned rd = INSTR (4, 0);
2413
2414 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2415 aarch64_set_reg_u64 (cpu, rd, NO_SP,
2416 aarch64_get_reg_u32 (cpu, rn, NO_SP)
2417 - aarch64_get_reg_u32 (cpu, rm, NO_SP)
2418 - 1 + IS_SET (C));
2419 }
2420
2421 /* 64 bit sub with carry */
2422 static void
2423 sbc64 (sim_cpu *cpu)
2424 {
2425 unsigned rm = INSTR (20, 16);
2426 unsigned rn = INSTR (9, 5);
2427 unsigned rd = INSTR (4, 0);
2428
2429 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2430 aarch64_set_reg_u64 (cpu, rd, NO_SP,
2431 aarch64_get_reg_u64 (cpu, rn, NO_SP)
2432 - aarch64_get_reg_u64 (cpu, rm, NO_SP)
2433 - 1 + IS_SET (C));
2434 }
2435
2436 /* 32 bit sub with carry setting flags */
2437 static void
2438 sbcs32 (sim_cpu *cpu)
2439 {
2440 unsigned rm = INSTR (20, 16);
2441 unsigned rn = INSTR (9, 5);
2442 unsigned rd = INSTR (4, 0);
2443
2444 uint32_t value1 = aarch64_get_reg_u32 (cpu, rn, NO_SP);
2445 uint32_t value2 = aarch64_get_reg_u32 (cpu, rm, NO_SP);
2446 uint32_t carry = IS_SET (C);
  uint32_t result = value1 - value2 - 1 + carry;
2448
2449 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2450 aarch64_set_reg_u64 (cpu, rd, NO_SP, result);
2451 set_flags_for_sub32 (cpu, value1, value2 + 1 - carry);
2452 }
2453
2454 /* 64 bit sub with carry setting flags */
2455 static void
2456 sbcs64 (sim_cpu *cpu)
2457 {
2458 unsigned rm = INSTR (20, 16);
2459 unsigned rn = INSTR (9, 5);
2460 unsigned rd = INSTR (4, 0);
2461
2462 uint64_t value1 = aarch64_get_reg_u64 (cpu, rn, NO_SP);
2463 uint64_t value2 = aarch64_get_reg_u64 (cpu, rm, NO_SP);
2464 uint64_t carry = IS_SET (C);
  uint64_t result = value1 - value2 - 1 + carry;
2466
2467 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2468 aarch64_set_reg_u64 (cpu, rd, NO_SP, result);
2469 set_flags_for_sub64 (cpu, value1, value2 + 1 - carry);
2470 }
2471
2472 static void
2473 dexAddSubtractWithCarry (sim_cpu *cpu)
2474 {
2475 /* instr[31] = size : 0 ==> 32 bit, 1 ==> 64 bit
2476 instr[30] = op : 0 ==> ADC, 1 ==> SBC
2477 instr[29] = set? : 0 ==> no flags, 1 ==> set flags
2478 instr[28,21] = 1 1010 000
2479 instr[20,16] = Rm
     instr[15,10] = op2 : 000000 ==> ok, otherwise ==> UNALLOC
2481 instr[9,5] = Rn
2482 instr[4,0] = Rd */
2483
2484 uint32_t op2 = INSTR (15, 10);
2485
2486 NYI_assert (28, 21, 0xD0);
2487
2488 if (op2 != 0)
2489 HALT_UNALLOC;
2490
2491 /* Dispatch on size:op:set?. */
2492 switch (INSTR (31, 29))
2493 {
2494 case 0: adc32 (cpu); break;
2495 case 1: adcs32 (cpu); break;
2496 case 2: sbc32 (cpu); break;
2497 case 3: sbcs32 (cpu); break;
2498 case 4: adc64 (cpu); break;
2499 case 5: adcs64 (cpu); break;
2500 case 6: sbc64 (cpu); break;
2501 case 7: sbcs64 (cpu); break;
2502 }
2503 }
2504
2505 static uint32_t
2506 testConditionCode (sim_cpu *cpu, CondCode cc)
2507 {
  /* This should be reducible to branchless logic
2509 by some careful testing of bits in CC followed
2510 by the requisite masking and combining of bits
2511 from the flag register.
2512
2513 For now we do it with a switch. */
2514 int res;
2515
2516 switch (cc)
2517 {
2518 case EQ: res = IS_SET (Z); break;
2519 case NE: res = IS_CLEAR (Z); break;
2520 case CS: res = IS_SET (C); break;
2521 case CC: res = IS_CLEAR (C); break;
2522 case MI: res = IS_SET (N); break;
2523 case PL: res = IS_CLEAR (N); break;
2524 case VS: res = IS_SET (V); break;
2525 case VC: res = IS_CLEAR (V); break;
2526 case HI: res = IS_SET (C) && IS_CLEAR (Z); break;
2527 case LS: res = IS_CLEAR (C) || IS_SET (Z); break;
2528 case GE: res = IS_SET (N) == IS_SET (V); break;
2529 case LT: res = IS_SET (N) != IS_SET (V); break;
2530 case GT: res = IS_CLEAR (Z) && (IS_SET (N) == IS_SET (V)); break;
2531 case LE: res = IS_SET (Z) || (IS_SET (N) != IS_SET (V)); break;
2532 case AL:
2533 case NV:
2534 default:
2535 res = 1;
2536 break;
2537 }
2538 return res;
2539 }
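
/* Example: after a compare that leaves N=1, Z=0, C=0, V=0 (say a
   32-bit CMP of 1 against 2), LT holds because N != V, while GE, GT,
   HI and EQ all report 0.  */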
2540
2541 static void
2542 CondCompare (sim_cpu *cpu) /* aka: ccmp and ccmn */
2543 {
2544 /* instr[31] = size : 0 ==> 32 bit, 1 ==> 64 bit
2545 instr[30] = compare with positive (1) or negative value (0)
2546 instr[29,21] = 1 1101 0010
2547 instr[20,16] = Rm or const
2548 instr[15,12] = cond
2549 instr[11] = compare reg (0) or const (1)
2550 instr[10] = 0
2551 instr[9,5] = Rn
2552 instr[4] = 0
2553 instr[3,0] = value for CPSR bits if the comparison does not take place. */
2554 signed int negate;
2555 unsigned rm;
2556 unsigned rn;
2557
2558 NYI_assert (29, 21, 0x1d2);
2559 NYI_assert (10, 10, 0);
2560 NYI_assert (4, 4, 0);
2561
2562 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2563 if (! testConditionCode (cpu, INSTR (15, 12)))
2564 {
2565 aarch64_set_CPSR (cpu, INSTR (3, 0));
2566 return;
2567 }
2568
2569 negate = INSTR (30, 30) ? 1 : -1;
2570 rm = INSTR (20, 16);
2571 rn = INSTR ( 9, 5);
2572
2573 if (INSTR (31, 31))
2574 {
2575 if (INSTR (11, 11))
2576 set_flags_for_sub64 (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK),
2577 negate * (uint64_t) rm);
2578 else
2579 set_flags_for_sub64 (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK),
2580 negate * aarch64_get_reg_u64 (cpu, rm, SP_OK));
2581 }
2582 else
2583 {
2584 if (INSTR (11, 11))
2585 set_flags_for_sub32 (cpu, aarch64_get_reg_u32 (cpu, rn, SP_OK),
2586 negate * rm);
2587 else
2588 set_flags_for_sub32 (cpu, aarch64_get_reg_u32 (cpu, rn, SP_OK),
2589 negate * aarch64_get_reg_u32 (cpu, rm, SP_OK));
2590 }
2591 }
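
/* Example: CCMP X1, #2, #8, EQ first evaluates EQ.  If the condition
   fails, the CPSR is simply loaded with the literal nzcv field, here
   8 (N set, matching the direct CPSR assignment above); if it holds,
   the flags are set as for X1 - 2, the rm field doubling as the
   5-bit comparison constant.  */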
2592
2593 static void
2594 do_vec_MOV_whole_vector (sim_cpu *cpu)
2595 {
2596 /* MOV Vd.T, Vs.T (alias for ORR Vd.T, Vn.T, Vm.T where Vn == Vm)
2597
2598 instr[31] = 0
2599 instr[30] = half(0)/full(1)
2600 instr[29,21] = 001110101
2601 instr[20,16] = Vs
2602 instr[15,10] = 000111
2603 instr[9,5] = Vs
2604 instr[4,0] = Vd */
2605
2606 unsigned vs = INSTR (9, 5);
2607 unsigned vd = INSTR (4, 0);
2608
2609 NYI_assert (29, 21, 0x075);
2610 NYI_assert (15, 10, 0x07);
2611
2612 if (INSTR (20, 16) != vs)
2613 HALT_NYI;
2614
2615 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2616 if (INSTR (30, 30))
2617 aarch64_set_vec_u64 (cpu, vd, 1, aarch64_get_vec_u64 (cpu, vs, 1));
2618
2619 aarch64_set_vec_u64 (cpu, vd, 0, aarch64_get_vec_u64 (cpu, vs, 0));
2620 }
2621
2622 static void
2623 do_vec_MOV_into_scalar (sim_cpu *cpu)
2624 {
2625 /* instr[31] = 0
2626 instr[30] = word(0)/long(1)
2627 instr[29,21] = 00 1110 000
2628 instr[20,18] = element size and index
2629 instr[17,10] = 00 0011 11
2630 instr[9,5] = V source
2631 instr[4,0] = R dest */
2632
2633 unsigned vs = INSTR (9, 5);
2634 unsigned rd = INSTR (4, 0);
2635
2636 NYI_assert (29, 21, 0x070);
2637 NYI_assert (17, 10, 0x0F);
2638
2639 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2640 switch (INSTR (20, 18))
2641 {
2642 case 0x2:
2643 aarch64_set_reg_u64 (cpu, rd, NO_SP, aarch64_get_vec_u64 (cpu, vs, 0));
2644 break;
2645
2646 case 0x6:
2647 aarch64_set_reg_u64 (cpu, rd, NO_SP, aarch64_get_vec_u64 (cpu, vs, 1));
2648 break;
2649
2650 case 0x1:
2651 case 0x3:
2652 case 0x5:
2653 case 0x7:
2654 aarch64_set_reg_u64 (cpu, rd, NO_SP, aarch64_get_vec_u32
2655 (cpu, vs, INSTR (20, 19)));
2656 break;
2657
2658 default:
2659 HALT_NYI;
2660 }
2661 }
2662
2663 static void
2664 do_vec_INS (sim_cpu *cpu)
2665 {
2666 /* instr[31,21] = 01001110000
2667 instr[20,16] = element size and index
2668 instr[15,10] = 000111
2669 instr[9,5] = W source
2670 instr[4,0] = V dest */
2671
2672 int index;
2673 unsigned rs = INSTR (9, 5);
2674 unsigned vd = INSTR (4, 0);
2675
2676 NYI_assert (31, 21, 0x270);
2677 NYI_assert (15, 10, 0x07);
2678
2679 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2680 if (INSTR (16, 16))
2681 {
2682 index = INSTR (20, 17);
2683 aarch64_set_vec_u8 (cpu, vd, index,
2684 aarch64_get_reg_u8 (cpu, rs, NO_SP));
2685 }
2686 else if (INSTR (17, 17))
2687 {
2688 index = INSTR (20, 18);
2689 aarch64_set_vec_u16 (cpu, vd, index,
2690 aarch64_get_reg_u16 (cpu, rs, NO_SP));
2691 }
2692 else if (INSTR (18, 18))
2693 {
2694 index = INSTR (20, 19);
2695 aarch64_set_vec_u32 (cpu, vd, index,
2696 aarch64_get_reg_u32 (cpu, rs, NO_SP));
2697 }
2698 else if (INSTR (19, 19))
2699 {
2700 index = INSTR (20, 20);
2701 aarch64_set_vec_u64 (cpu, vd, index,
2702 aarch64_get_reg_u64 (cpu, rs, NO_SP));
2703 }
2704 else
2705 HALT_NYI;
2706 }
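
/* Worked example: for INS Vd.S[2], Wn the imm5 field (instr[20,16])
   is 10100: bits 16 and 17 are clear and bit 18 is set, so the
   32-bit arm is taken and the index INSTR (20, 19) = 0b10 selects
   element 2.  */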
2707
2708 static void
2709 do_vec_DUP_vector_into_vector (sim_cpu *cpu)
2710 {
2711 /* instr[31] = 0
2712 instr[30] = half(0)/full(1)
2713 instr[29,21] = 00 1110 000
2714 instr[20,16] = element size and index
2715 instr[15,10] = 0000 01
2716 instr[9,5] = V source
2717 instr[4,0] = V dest. */
2718
2719 unsigned full = INSTR (30, 30);
2720 unsigned vs = INSTR (9, 5);
2721 unsigned vd = INSTR (4, 0);
2722 int i, index;
2723
2724 NYI_assert (29, 21, 0x070);
2725 NYI_assert (15, 10, 0x01);
2726
2727 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2728 if (INSTR (16, 16))
2729 {
2730 index = INSTR (20, 17);
2731
2732 for (i = 0; i < (full ? 16 : 8); i++)
2733 aarch64_set_vec_u8 (cpu, vd, i, aarch64_get_vec_u8 (cpu, vs, index));
2734 }
2735 else if (INSTR (17, 17))
2736 {
2737 index = INSTR (20, 18);
2738
2739 for (i = 0; i < (full ? 8 : 4); i++)
2740 aarch64_set_vec_u16 (cpu, vd, i, aarch64_get_vec_u16 (cpu, vs, index));
2741 }
2742 else if (INSTR (18, 18))
2743 {
2744 index = INSTR (20, 19);
2745
2746 for (i = 0; i < (full ? 4 : 2); i++)
2747 aarch64_set_vec_u32 (cpu, vd, i, aarch64_get_vec_u32 (cpu, vs, index));
2748 }
2749 else
2750 {
2751 if (INSTR (19, 19) == 0)
2752 HALT_UNALLOC;
2753
2754 if (! full)
2755 HALT_UNALLOC;
2756
2757 index = INSTR (20, 20);
2758
2759 for (i = 0; i < 2; i++)
2760 aarch64_set_vec_u64 (cpu, vd, i, aarch64_get_vec_u64 (cpu, vs, index));
2761 }
2762 }
2763
2764 static void
2765 do_vec_TBL (sim_cpu *cpu)
2766 {
2767 /* instr[31] = 0
2768 instr[30] = half(0)/full(1)
2769 instr[29,21] = 00 1110 000
2770 instr[20,16] = Vm
2771 instr[15] = 0
2772 instr[14,13] = vec length
2773 instr[12,10] = 000
2774 instr[9,5] = V start
2775 instr[4,0] = V dest */
2776
2777 int full = INSTR (30, 30);
2778 int len = INSTR (14, 13) + 1;
2779 unsigned vm = INSTR (20, 16);
2780 unsigned vn = INSTR (9, 5);
2781 unsigned vd = INSTR (4, 0);
2782 unsigned i;
2783
2784 NYI_assert (29, 21, 0x070);
2785 NYI_assert (12, 10, 0);
2786
2787 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2788 for (i = 0; i < (full ? 16 : 8); i++)
2789 {
2790 unsigned int selector = aarch64_get_vec_u8 (cpu, vm, i);
2791 uint8_t val;
2792
2793 if (selector < 16)
2794 val = aarch64_get_vec_u8 (cpu, vn, selector);
2795 else if (selector < 32)
2796 val = len < 2 ? 0 : aarch64_get_vec_u8 (cpu, vn + 1, selector - 16);
2797 else if (selector < 48)
2798 val = len < 3 ? 0 : aarch64_get_vec_u8 (cpu, vn + 2, selector - 32);
2799 else if (selector < 64)
2800 val = len < 4 ? 0 : aarch64_get_vec_u8 (cpu, vn + 3, selector - 48);
2801 else
2802 val = 0;
2803
2804 aarch64_set_vec_u8 (cpu, vd, i, val);
2805 }
2806 }
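
/* Example: with len == 2 (a two-register table) a selector byte of
   0x13 (19) reads byte 3 of register vn + 1, while any selector of
   32 or more is out of range for that table length and yields 0.  */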
2807
2808 static void
2809 do_vec_TRN (sim_cpu *cpu)
2810 {
2811 /* instr[31] = 0
2812 instr[30] = half(0)/full(1)
2813 instr[29,24] = 00 1110
2814 instr[23,22] = size
2815 instr[21] = 0
2816 instr[20,16] = Vm
2817 instr[15] = 0
2818 instr[14] = TRN1 (0) / TRN2 (1)
2819 instr[13,10] = 1010
2820 instr[9,5] = V source
2821 instr[4,0] = V dest. */
2822
2823 int full = INSTR (30, 30);
2824 int second = INSTR (14, 14);
2825 unsigned vm = INSTR (20, 16);
2826 unsigned vn = INSTR (9, 5);
2827 unsigned vd = INSTR (4, 0);
2828 unsigned i;
2829
2830 NYI_assert (29, 24, 0x0E);
2831 NYI_assert (13, 10, 0xA);
2832
2833 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2834 switch (INSTR (23, 22))
2835 {
      /* TRN1 (second == 0) pairs the even-numbered source elements,
	 TRN2 (second == 1) the odd-numbered ones:
	 Vd[2i] = Vn[2i + second], Vd[2i + 1] = Vm[2i + second].  */
    case 0:
      for (i = 0; i < (full ? 8 : 4); i++)
	{
	  aarch64_set_vec_u8
	    (cpu, vd, i * 2,
	     aarch64_get_vec_u8 (cpu, vn, i * 2 + second));
	  aarch64_set_vec_u8
	    (cpu, vd, i * 2 + 1,
	     aarch64_get_vec_u8 (cpu, vm, i * 2 + second));
	}
      break;

    case 1:
      for (i = 0; i < (full ? 4 : 2); i++)
	{
	  aarch64_set_vec_u16
	    (cpu, vd, i * 2,
	     aarch64_get_vec_u16 (cpu, vn, i * 2 + second));
	  aarch64_set_vec_u16
	    (cpu, vd, i * 2 + 1,
	     aarch64_get_vec_u16 (cpu, vm, i * 2 + second));
	}
      break;

    case 2:
      aarch64_set_vec_u32
	(cpu, vd, 0, aarch64_get_vec_u32 (cpu, vn, second));
      aarch64_set_vec_u32
	(cpu, vd, 1, aarch64_get_vec_u32 (cpu, vm, second));
      aarch64_set_vec_u32
	(cpu, vd, 2, aarch64_get_vec_u32 (cpu, vn, 2 + second));
      aarch64_set_vec_u32
	(cpu, vd, 3, aarch64_get_vec_u32 (cpu, vm, 2 + second));
      break;

    case 3:
      if (! full)
	HALT_UNALLOC;

      aarch64_set_vec_u64 (cpu, vd, 0,
			   aarch64_get_vec_u64 (cpu, vn, second));
      aarch64_set_vec_u64 (cpu, vd, 1,
			   aarch64_get_vec_u64 (cpu, vm, second));
      break;
2880 }
2881 }
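
/* So, for example, TRN1 Vd.4H, Vn.4H, Vm.4H produces
   { Vn.h[0], Vm.h[0], Vn.h[2], Vm.h[2] }, and TRN2 the odd-numbered
   counterparts.  */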
2882
2883 static void
2884 do_vec_DUP_scalar_into_vector (sim_cpu *cpu)
2885 {
2886 /* instr[31] = 0
2887 instr[30] = 0=> zero top 64-bits, 1=> duplicate into top 64-bits
2888 [must be 1 for 64-bit xfer]
2889 instr[29,20] = 00 1110 0000
2890 instr[19,16] = element size: 0001=> 8-bits, 0010=> 16-bits,
                    0100=> 32-bits, 1000=> 64-bits
2892 instr[15,10] = 0000 11
2893 instr[9,5] = W source
2894 instr[4,0] = V dest. */
2895
2896 unsigned i;
2897 unsigned Vd = INSTR (4, 0);
2898 unsigned Rs = INSTR (9, 5);
2899 int both = INSTR (30, 30);
2900
2901 NYI_assert (29, 20, 0x0E0);
2902 NYI_assert (15, 10, 0x03);
2903
2904 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2905 switch (INSTR (19, 16))
2906 {
2907 case 1:
2908 for (i = 0; i < (both ? 16 : 8); i++)
2909 aarch64_set_vec_u8 (cpu, Vd, i, aarch64_get_reg_u8 (cpu, Rs, NO_SP));
2910 break;
2911
2912 case 2:
2913 for (i = 0; i < (both ? 8 : 4); i++)
2914 aarch64_set_vec_u16 (cpu, Vd, i, aarch64_get_reg_u16 (cpu, Rs, NO_SP));
2915 break;
2916
2917 case 4:
2918 for (i = 0; i < (both ? 4 : 2); i++)
2919 aarch64_set_vec_u32 (cpu, Vd, i, aarch64_get_reg_u32 (cpu, Rs, NO_SP));
2920 break;
2921
2922 case 8:
2923 if (!both)
2924 HALT_NYI;
2925 aarch64_set_vec_u64 (cpu, Vd, 0, aarch64_get_reg_u64 (cpu, Rs, NO_SP));
2926 aarch64_set_vec_u64 (cpu, Vd, 1, aarch64_get_reg_u64 (cpu, Rs, NO_SP));
2927 break;
2928
2929 default:
2930 HALT_NYI;
2931 }
2932 }
2933
2934 static void
2935 do_vec_UZP (sim_cpu *cpu)
2936 {
2937 /* instr[31] = 0
2938 instr[30] = half(0)/full(1)
2939 instr[29,24] = 00 1110
2940 instr[23,22] = size: byte(00), half(01), word (10), long (11)
2941 instr[21] = 0
2942 instr[20,16] = Vm
2943 instr[15] = 0
2944 instr[14] = lower (0) / upper (1)
2945 instr[13,10] = 0110
2946 instr[9,5] = Vn
2947 instr[4,0] = Vd. */
2948
2949 int full = INSTR (30, 30);
2950 int upper = INSTR (14, 14);
2951
2952 unsigned vm = INSTR (20, 16);
2953 unsigned vn = INSTR (9, 5);
2954 unsigned vd = INSTR (4, 0);
2955
2956 uint64_t val_m1 = aarch64_get_vec_u64 (cpu, vm, 0);
2957 uint64_t val_m2 = aarch64_get_vec_u64 (cpu, vm, 1);
2958 uint64_t val_n1 = aarch64_get_vec_u64 (cpu, vn, 0);
2959 uint64_t val_n2 = aarch64_get_vec_u64 (cpu, vn, 1);
2960
2961 uint64_t val1;
2962 uint64_t val2;
2963
2964 uint64_t input2 = full ? val_n2 : val_m1;
2965
2966 NYI_assert (29, 24, 0x0E);
2967 NYI_assert (21, 21, 0);
2968 NYI_assert (15, 15, 0);
2969 NYI_assert (13, 10, 6);
2970
2971 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
2972 switch (INSTR (23, 22))
2973 {
2974 case 0:
2975 val1 = (val_n1 >> (upper * 8)) & 0xFFULL;
2976 val1 |= (val_n1 >> ((upper * 8) + 8)) & 0xFF00ULL;
2977 val1 |= (val_n1 >> ((upper * 8) + 16)) & 0xFF0000ULL;
2978 val1 |= (val_n1 >> ((upper * 8) + 24)) & 0xFF000000ULL;
2979
2980 val1 |= (input2 << (32 - (upper * 8))) & 0xFF00000000ULL;
2981 val1 |= (input2 << (24 - (upper * 8))) & 0xFF0000000000ULL;
2982 val1 |= (input2 << (16 - (upper * 8))) & 0xFF000000000000ULL;
2983 val1 |= (input2 << (8 - (upper * 8))) & 0xFF00000000000000ULL;
2984
2985 if (full)
2986 {
2987 val2 = (val_m1 >> (upper * 8)) & 0xFFULL;
2988 val2 |= (val_m1 >> ((upper * 8) + 8)) & 0xFF00ULL;
2989 val2 |= (val_m1 >> ((upper * 8) + 16)) & 0xFF0000ULL;
2990 val2 |= (val_m1 >> ((upper * 8) + 24)) & 0xFF000000ULL;
2991
2992 val2 |= (val_m2 << (32 - (upper * 8))) & 0xFF00000000ULL;
2993 val2 |= (val_m2 << (24 - (upper * 8))) & 0xFF0000000000ULL;
2994 val2 |= (val_m2 << (16 - (upper * 8))) & 0xFF000000000000ULL;
2995 val2 |= (val_m2 << (8 - (upper * 8))) & 0xFF00000000000000ULL;
2996 }
2997 break;
2998
2999 case 1:
3000 val1 = (val_n1 >> (upper * 16)) & 0xFFFFULL;
3001 val1 |= (val_n1 >> ((upper * 16) + 16)) & 0xFFFF0000ULL;
3002
      val1 |= (input2 << (32 - (upper * 16))) & 0xFFFF00000000ULL;
3004 val1 |= (input2 << (16 - (upper * 16))) & 0xFFFF000000000000ULL;
3005
3006 if (full)
3007 {
3008 val2 = (val_m1 >> (upper * 16)) & 0xFFFFULL;
3009 val2 |= (val_m1 >> ((upper * 16) + 16)) & 0xFFFF0000ULL;
3010
3011 val2 |= (val_m2 << (32 - (upper * 16))) & 0xFFFF00000000ULL;
3012 val2 |= (val_m2 << (16 - (upper * 16))) & 0xFFFF000000000000ULL;
3013 }
3014 break;
3015
3016 case 2:
3017 val1 = (val_n1 >> (upper * 32)) & 0xFFFFFFFF;
3018 val1 |= (input2 << (32 - (upper * 32))) & 0xFFFFFFFF00000000ULL;
3019
3020 if (full)
3021 {
3022 val2 = (val_m1 >> (upper * 32)) & 0xFFFFFFFF;
3023 val2 |= (val_m2 << (32 - (upper * 32))) & 0xFFFFFFFF00000000ULL;
3024 }
3025 break;
3026
3027 case 3:
3028 if (! full)
3029 HALT_UNALLOC;
3030
3031 val1 = upper ? val_n2 : val_n1;
3032 val2 = upper ? val_m2 : val_m1;
3033 break;
3034 }
3035
3036 aarch64_set_vec_u64 (cpu, vd, 0, val1);
3037 if (full)
3038 aarch64_set_vec_u64 (cpu, vd, 1, val2);
3039 }
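
/* Example: UZP1 Vd.8B, Vn.8B, Vm.8B concatenates the even-numbered
   bytes of each source, giving { n0, n2, n4, n6, m0, m2, m4, m6 };
   UZP2 (upper == 1) takes the odd-numbered bytes instead.  */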
3040
3041 static void
3042 do_vec_ZIP (sim_cpu *cpu)
3043 {
3044 /* instr[31] = 0
3045 instr[30] = half(0)/full(1)
3046 instr[29,24] = 00 1110
     instr[23,22] = size: byte(00), half(01), word (10), long (11)
3048 instr[21] = 0
3049 instr[20,16] = Vm
3050 instr[15] = 0
3051 instr[14] = lower (0) / upper (1)
3052 instr[13,10] = 1110
3053 instr[9,5] = Vn
3054 instr[4,0] = Vd. */
3055
3056 int full = INSTR (30, 30);
3057 int upper = INSTR (14, 14);
3058
3059 unsigned vm = INSTR (20, 16);
3060 unsigned vn = INSTR (9, 5);
3061 unsigned vd = INSTR (4, 0);
3062
3063 uint64_t val_m1 = aarch64_get_vec_u64 (cpu, vm, 0);
3064 uint64_t val_m2 = aarch64_get_vec_u64 (cpu, vm, 1);
3065 uint64_t val_n1 = aarch64_get_vec_u64 (cpu, vn, 0);
3066 uint64_t val_n2 = aarch64_get_vec_u64 (cpu, vn, 1);
3067
3068 uint64_t val1 = 0;
3069 uint64_t val2 = 0;
3070
  /* ZIP1 (upper == 0) interleaves the lower 64-bit halves of Vn and
     Vm; ZIP2 (upper == 1) interleaves the upper halves.  */
  uint64_t input1 = upper ? val_n2 : val_n1;
  uint64_t input2 = upper ? val_m2 : val_m1;
3073
3074 NYI_assert (29, 24, 0x0E);
3075 NYI_assert (21, 21, 0);
3076 NYI_assert (15, 15, 0);
3077 NYI_assert (13, 10, 0xE);
3078
3079 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
  switch (INSTR (23, 22))
3081 {
3082 case 0:
3083 val1 =
3084 ((input1 << 0) & (0xFF << 0))
3085 | ((input2 << 8) & (0xFF << 8))
3086 | ((input1 << 8) & (0xFF << 16))
3087 | ((input2 << 16) & (0xFF << 24))
3088 | ((input1 << 16) & (0xFFULL << 32))
3089 | ((input2 << 24) & (0xFFULL << 40))
3090 | ((input1 << 24) & (0xFFULL << 48))
3091 | ((input2 << 32) & (0xFFULL << 56));
3092
3093 val2 =
3094 ((input1 >> 32) & (0xFF << 0))
3095 | ((input2 >> 24) & (0xFF << 8))
3096 | ((input1 >> 24) & (0xFF << 16))
3097 | ((input2 >> 16) & (0xFF << 24))
3098 | ((input1 >> 16) & (0xFFULL << 32))
3099 | ((input2 >> 8) & (0xFFULL << 40))
3100 | ((input1 >> 8) & (0xFFULL << 48))
3101 | ((input2 >> 0) & (0xFFULL << 56));
3102 break;
3103
3104 case 1:
3105 val1 =
3106 ((input1 << 0) & (0xFFFF << 0))
3107 | ((input2 << 16) & (0xFFFF << 16))
3108 | ((input1 << 16) & (0xFFFFULL << 32))
3109 | ((input2 << 32) & (0xFFFFULL << 48));
3110
3111 val2 =
3112 ((input1 >> 32) & (0xFFFF << 0))
3113 | ((input2 >> 16) & (0xFFFF << 16))
3114 | ((input1 >> 16) & (0xFFFFULL << 32))
3115 | ((input2 >> 0) & (0xFFFFULL << 48));
3116 break;
3117
3118 case 2:
3119 val1 = (input1 & 0xFFFFFFFFULL) | (input2 << 32);
      val2 = ((input1 >> 32) & 0xFFFFFFFFULL)
	| (input2 & 0xFFFFFFFF00000000ULL);
3121 break;
3122
3123 case 3:
3124 val1 = input1;
3125 val2 = input2;
3126 break;
3127 }
3128
3129 aarch64_set_vec_u64 (cpu, vd, 0, val1);
3130 if (full)
3131 aarch64_set_vec_u64 (cpu, vd, 1, val2);
3132 }
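
/* Example: ZIP1 Vd.16B, Vn.16B, Vm.16B interleaves the lower halves,
   giving { n0, m0, n1, m1, ..., n7, m7 }; ZIP2 does the same with
   bytes 8-15 of each source.  */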
3133
3134 /* Floating point immediates are encoded in 8 bits.
3135 fpimm[7] = sign bit.
3136 fpimm[6:4] = signed exponent.
3137 fpimm[3:0] = fraction (assuming leading 1).
3138 i.e. F = s * 1.f * 2^(e - b). */
3139
3140 static float
3141 fp_immediate_for_encoding_32 (uint32_t imm8)
3142 {
3143 float u;
3144 uint32_t s, e, f, i;
3145
3146 s = (imm8 >> 7) & 0x1;
3147 e = (imm8 >> 4) & 0x7;
3148 f = imm8 & 0xf;
3149
  /* The fp value is (-1)^s * n/16 * 2^e where n is 16+f and e is
     the signed exponent.  */
3151 u = (16.0 + f) / 16.0;
3152
3153 /* N.B. exponent is signed. */
3154 if (e < 4)
3155 {
3156 int epos = e;
3157
3158 for (i = 0; i <= epos; i++)
3159 u *= 2.0;
3160 }
3161 else
3162 {
3163 int eneg = 7 - e;
3164
3165 for (i = 0; i < eneg; i++)
3166 u /= 2.0;
3167 }
3168
3169 if (s)
3170 u = - u;
3171
3172 return u;
3173 }
3174
3175 static double
3176 fp_immediate_for_encoding_64 (uint32_t imm8)
3177 {
3178 double u;
3179 uint32_t s, e, f, i;
3180
3181 s = (imm8 >> 7) & 0x1;
3182 e = (imm8 >> 4) & 0x7;
3183 f = imm8 & 0xf;
3184
  /* The fp value is (-1)^s * n/16 * 2^e where n is 16+f and e is
     the signed exponent.  */
3186 u = (16.0 + f) / 16.0;
3187
3188 /* N.B. exponent is signed. */
3189 if (e < 4)
3190 {
3191 int epos = e;
3192
3193 for (i = 0; i <= epos; i++)
3194 u *= 2.0;
3195 }
3196 else
3197 {
3198 int eneg = 7 - e;
3199
3200 for (i = 0; i < eneg; i++)
3201 u /= 2.0;
3202 }
3203
3204 if (s)
3205 u = - u;
3206
3207 return u;
3208 }
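
/* Worked example: imm8 = 0x70 has s = 0, e = 7, f = 0, so
   u = 16/16 = 1.0 and eneg = 7 - 7 = 0, giving 1.0 -- the encoding
   used by FMOV #1.0.  Likewise imm8 = 0x10 (s = 0, e = 1, f = 0)
   doubles u twice, giving 4.0.  */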
3209
3210 static void
3211 do_vec_MOV_immediate (sim_cpu *cpu)
3212 {
3213 /* instr[31] = 0
3214 instr[30] = full/half selector
3215 instr[29,19] = 00111100000
3216 instr[18,16] = high 3 bits of uimm8
3217 instr[15,12] = size & shift:
3218 0000 => 32-bit
3219 0010 => 32-bit + LSL#8
3220 0100 => 32-bit + LSL#16
3221 0110 => 32-bit + LSL#24
3222 1010 => 16-bit + LSL#8
3223 1000 => 16-bit
3224 1101 => 32-bit + MSL#16
3225 1100 => 32-bit + MSL#8
3226 1110 => 8-bit
3227 1111 => double
3228 instr[11,10] = 01
3229 instr[9,5] = low 5-bits of uimm8
3230 instr[4,0] = Vd. */
3231
3232 int full = INSTR (30, 30);
3233 unsigned vd = INSTR (4, 0);
3234 unsigned val = (INSTR (18, 16) << 5) | INSTR (9, 5);
3235 unsigned i;
3236
3237 NYI_assert (29, 19, 0x1E0);
3238 NYI_assert (11, 10, 1);
3239
3240 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
3241 switch (INSTR (15, 12))
3242 {
3243 case 0x0: /* 32-bit, no shift. */
3244 case 0x2: /* 32-bit, shift by 8. */
3245 case 0x4: /* 32-bit, shift by 16. */
3246 case 0x6: /* 32-bit, shift by 24. */
3247 val <<= (8 * INSTR (14, 13));
3248 for (i = 0; i < (full ? 4 : 2); i++)
3249 aarch64_set_vec_u32 (cpu, vd, i, val);
3250 break;
3251
3252 case 0xa: /* 16-bit, shift by 8. */
3253 val <<= 8;
3254 /* Fall through. */
3255 case 0x8: /* 16-bit, no shift. */
3256 for (i = 0; i < (full ? 8 : 4); i++)
3257 aarch64_set_vec_u16 (cpu, vd, i, val);
3258 break;
3259
3260 case 0xd: /* 32-bit, mask shift by 16. */
3261 val <<= 8;
3262 val |= 0xFF;
3263 /* Fall through. */
3264 case 0xc: /* 32-bit, mask shift by 8. */
3265 val <<= 8;
3266 val |= 0xFF;
3267 for (i = 0; i < (full ? 4 : 2); i++)
3268 aarch64_set_vec_u32 (cpu, vd, i, val);
3269 break;
3270
3271 case 0xe: /* 8-bit, no shift. */
3272 for (i = 0; i < (full ? 16 : 8); i++)
3273 aarch64_set_vec_u8 (cpu, vd, i, val);
3274 break;
3275
3276 case 0xf: /* FMOV Vs.{2|4}S, #fpimm. */
3277 {
3278 float u = fp_immediate_for_encoding_32 (val);
3279 for (i = 0; i < (full ? 4 : 2); i++)
3280 aarch64_set_vec_float (cpu, vd, i, u);
3281 break;
3282 }
3283
3284 default:
3285 HALT_NYI;
3286 }
3287 }
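
/* Example of the MSL forms: with uimm8 = 0x12, case 0xd shifts in
   one byte of ones and falls into case 0xc, which shifts in another,
   replicating 0x0012ffff across the 32-bit lanes; case 0xc alone
   would give 0x000012ff.  */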
3288
3289 static void
3290 do_vec_MVNI (sim_cpu *cpu)
3291 {
3292 /* instr[31] = 0
3293 instr[30] = full/half selector
3294 instr[29,19] = 10111100000
3295 instr[18,16] = high 3 bits of uimm8
3296 instr[15,12] = selector
3297 instr[11,10] = 01
3298 instr[9,5] = low 5-bits of uimm8
3299 instr[4,0] = Vd. */
3300
3301 int full = INSTR (30, 30);
3302 unsigned vd = INSTR (4, 0);
3303 unsigned val = (INSTR (18, 16) << 5) | INSTR (9, 5);
3304 unsigned i;
3305
3306 NYI_assert (29, 19, 0x5E0);
3307 NYI_assert (11, 10, 1);
3308
3309 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
3310 switch (INSTR (15, 12))
3311 {
3312 case 0x0: /* 32-bit, no shift. */
3313 case 0x2: /* 32-bit, shift by 8. */
3314 case 0x4: /* 32-bit, shift by 16. */
3315 case 0x6: /* 32-bit, shift by 24. */
3316 val <<= (8 * INSTR (14, 13));
3317 val = ~ val;
3318 for (i = 0; i < (full ? 4 : 2); i++)
3319 aarch64_set_vec_u32 (cpu, vd, i, val);
3320 return;
3321
    case 0xa: /* 16-bit, 8 bit shift. */
      val <<= 8;
      /* Fall through.  */
    case 0x8: /* 16-bit, no shift. */
3325 val = ~ val;
3326 for (i = 0; i < (full ? 8 : 4); i++)
3327 aarch64_set_vec_u16 (cpu, vd, i, val);
3328 return;
3329
    case 0xd: /* 32-bit, mask shift by 16. */
      val <<= 8;
      val |= 0xFF;
      /* Fall through.  */
    case 0xc: /* 32-bit, mask shift by 8. */
3334 val <<= 8;
3335 val |= 0xFF;
3336 val = ~ val;
3337 for (i = 0; i < (full ? 4 : 2); i++)
3338 aarch64_set_vec_u32 (cpu, vd, i, val);
3339 return;
3340
3341 case 0xE: /* MOVI Dn, #mask64 */
3342 {
3343 uint64_t mask = 0;
3344
3345 for (i = 0; i < 8; i++)
3346 if (val & (1 << i))
	  mask |= (0xFFULL << (i * 8));
3348 aarch64_set_vec_u64 (cpu, vd, 0, mask);
3349 aarch64_set_vec_u64 (cpu, vd, 1, mask);
3350 return;
3351 }
3352
3353 case 0xf: /* FMOV Vd.2D, #fpimm. */
3354 {
3355 double u = fp_immediate_for_encoding_64 (val);
3356
3357 if (! full)
3358 HALT_UNALLOC;
3359
3360 aarch64_set_vec_double (cpu, vd, 0, u);
3361 aarch64_set_vec_double (cpu, vd, 1, u);
3362 return;
3363 }
3364
3365 default:
3366 HALT_NYI;
3367 }
3368 }
3369
3370 #define ABS(A) ((A) < 0 ? - (A) : (A))
3371
3372 static void
3373 do_vec_ABS (sim_cpu *cpu)
3374 {
3375 /* instr[31] = 0
3376 instr[30] = half(0)/full(1)
3377 instr[29,24] = 00 1110
3378 instr[23,22] = size: 00=> 8-bit, 01=> 16-bit, 10=> 32-bit, 11=> 64-bit
3379 instr[21,10] = 10 0000 1011 10
3380 instr[9,5] = Vn
     instr[4,0] = Vd.  */
3382
3383 unsigned vn = INSTR (9, 5);
3384 unsigned vd = INSTR (4, 0);
3385 unsigned full = INSTR (30, 30);
3386 unsigned i;
3387
3388 NYI_assert (29, 24, 0x0E);
3389 NYI_assert (21, 10, 0x82E);
3390
3391 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
3392 switch (INSTR (23, 22))
3393 {
3394 case 0:
3395 for (i = 0; i < (full ? 16 : 8); i++)
3396 aarch64_set_vec_s8 (cpu, vd, i,
3397 ABS (aarch64_get_vec_s8 (cpu, vn, i)));
3398 break;
3399
3400 case 1:
3401 for (i = 0; i < (full ? 8 : 4); i++)
3402 aarch64_set_vec_s16 (cpu, vd, i,
3403 ABS (aarch64_get_vec_s16 (cpu, vn, i)));
3404 break;
3405
3406 case 2:
3407 for (i = 0; i < (full ? 4 : 2); i++)
3408 aarch64_set_vec_s32 (cpu, vd, i,
3409 ABS (aarch64_get_vec_s32 (cpu, vn, i)));
3410 break;
3411
3412 case 3:
3413 if (! full)
3414 HALT_NYI;
3415 for (i = 0; i < 2; i++)
3416 aarch64_set_vec_s64 (cpu, vd, i,
3417 ABS (aarch64_get_vec_s64 (cpu, vn, i)));
3418 break;
3419 }
3420 }
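
/* N.B. as on hardware, ABS of the most negative value wraps to
   itself: in the byte case, ABS (-128) is evaluated in int as 128
   and truncates back to -128 when stored.  */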
3421
3422 static void
3423 do_vec_ADDV (sim_cpu *cpu)
3424 {
3425 /* instr[31] = 0
3426 instr[30] = full/half selector
3427 instr[29,24] = 00 1110
3428 instr[23,22] = size: 00=> 8-bit, 01=> 16-bit, 10=> 32-bit, 11=> 64-bit
3429 instr[21,10] = 11 0001 1011 10
3430 instr[9,5] = Vm
     instr[4,0] = Rd.  */
3432
3433 unsigned vm = INSTR (9, 5);
3434 unsigned rd = INSTR (4, 0);
3435 unsigned i;
3436 uint64_t val = 0;
3437 int full = INSTR (30, 30);
3438
3439 NYI_assert (29, 24, 0x0E);
3440 NYI_assert (21, 10, 0xC6E);
3441
3442 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
3443 switch (INSTR (23, 22))
3444 {
3445 case 0:
3446 for (i = 0; i < (full ? 16 : 8); i++)
3447 val += aarch64_get_vec_u8 (cpu, vm, i);
3448 aarch64_set_vec_u64 (cpu, rd, 0, val);
3449 return;
3450
3451 case 1:
3452 for (i = 0; i < (full ? 8 : 4); i++)
3453 val += aarch64_get_vec_u16 (cpu, vm, i);
3454 aarch64_set_vec_u64 (cpu, rd, 0, val);
3455 return;
3456
3457 case 2:
3458 if (! full)
3459 HALT_UNALLOC;
3460 for (i = 0; i < 4; i++)
3461 val += aarch64_get_vec_u32 (cpu, vm, i);
3462 aarch64_set_vec_u64 (cpu, rd, 0, val);
3463 return;
3464
3465 case 3:
3466 HALT_UNALLOC;
3467 }
3468 }
3469
3470 static void
3471 do_vec_ins_2 (sim_cpu *cpu)
3472 {
3473 /* instr[31,21] = 01001110000
3474 instr[20,18] = size & element selector
3475 instr[17,14] = 0000
3476 instr[13] = direction: to vec(0), from vec (1)
3477 instr[12,10] = 111
3478 instr[9,5] = Vm
3479 instr[4,0] = Vd. */
3480
3481 unsigned elem;
3482 unsigned vm = INSTR (9, 5);
3483 unsigned vd = INSTR (4, 0);
3484
3485 NYI_assert (31, 21, 0x270);
3486 NYI_assert (17, 14, 0);
3487 NYI_assert (12, 10, 7);
3488
3489 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
3490 if (INSTR (13, 13) == 1)
3491 {
3492 if (INSTR (18, 18) == 1)
3493 {
3494 /* 32-bit moves. */
3495 elem = INSTR (20, 19);
3496 aarch64_set_reg_u64 (cpu, vd, NO_SP,
3497 aarch64_get_vec_u32 (cpu, vm, elem));
3498 }
3499 else
3500 {
3501 /* 64-bit moves. */
3502 if (INSTR (19, 19) != 1)
3503 HALT_NYI;
3504
3505 elem = INSTR (20, 20);
3506 aarch64_set_reg_u64 (cpu, vd, NO_SP,
3507 aarch64_get_vec_u64 (cpu, vm, elem));
3508 }
3509 }
3510 else
3511 {
3512 if (INSTR (18, 18) == 1)
3513 {
3514 /* 32-bit moves. */
3515 elem = INSTR (20, 19);
3516 aarch64_set_vec_u32 (cpu, vd, elem,
3517 aarch64_get_reg_u32 (cpu, vm, NO_SP));
3518 }
3519 else
3520 {
3521 /* 64-bit moves. */
3522 if (INSTR (19, 19) != 1)
3523 HALT_NYI;
3524
3525 elem = INSTR (20, 20);
3526 aarch64_set_vec_u64 (cpu, vd, elem,
3527 aarch64_get_reg_u64 (cpu, vm, NO_SP));
3528 }
3529 }
3530 }
3531
3532 #define DO_VEC_WIDENING_MUL(N, DST_TYPE, READ_TYPE, WRITE_TYPE) \
3533 do \
3534 { \
3535 DST_TYPE a[N], b[N]; \
3536 \
3537 for (i = 0; i < (N); i++) \
3538 { \
3539 a[i] = aarch64_get_vec_##READ_TYPE (cpu, vn, i + bias); \
3540 b[i] = aarch64_get_vec_##READ_TYPE (cpu, vm, i + bias); \
3541 } \
3542 for (i = 0; i < (N); i++) \
3543 aarch64_set_vec_##WRITE_TYPE (cpu, vd, i, a[i] * b[i]); \
3544 } \
3545 while (0)
3546
3547 static void
3548 do_vec_mull (sim_cpu *cpu)
3549 {
3550 /* instr[31] = 0
3551 instr[30] = lower(0)/upper(1) selector
3552 instr[29] = signed(0)/unsigned(1)
3553 instr[28,24] = 0 1110
3554 instr[23,22] = size: 8-bit (00), 16-bit (01), 32-bit (10)
3555 instr[21] = 1
3556 instr[20,16] = Vm
3557 instr[15,10] = 11 0000
3558 instr[9,5] = Vn
     instr[4,0] = Vd.  */
3560
3561 int unsign = INSTR (29, 29);
3562 int bias = INSTR (30, 30);
3563 unsigned vm = INSTR (20, 16);
3564 unsigned vn = INSTR ( 9, 5);
3565 unsigned vd = INSTR ( 4, 0);
3566 unsigned i;
3567
3568 NYI_assert (28, 24, 0x0E);
3569 NYI_assert (15, 10, 0x30);
3570
3571 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
3572 /* NB: Read source values before writing results, in case
3573 the source and destination vectors are the same. */
3574 switch (INSTR (23, 22))
3575 {
3576 case 0:
3577 if (bias)
3578 bias = 8;
3579 if (unsign)
3580 DO_VEC_WIDENING_MUL (8, uint16_t, u8, u16);
3581 else
3582 DO_VEC_WIDENING_MUL (8, int16_t, s8, s16);
3583 return;
3584
3585 case 1:
3586 if (bias)
3587 bias = 4;
3588 if (unsign)
3589 DO_VEC_WIDENING_MUL (4, uint32_t, u16, u32);
3590 else
3591 DO_VEC_WIDENING_MUL (4, int32_t, s16, s32);
3592 return;
3593
3594 case 2:
3595 if (bias)
3596 bias = 2;
3597 if (unsign)
3598 DO_VEC_WIDENING_MUL (2, uint64_t, u32, u64);
3599 else
3600 DO_VEC_WIDENING_MUL (2, int64_t, s32, s64);
3601 return;
3602
3603 case 3:
3604 HALT_NYI;
3605 }
3606 }
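
/* The bias trick implements the "2" (upper-half) variants: e.g. for
   UMULL2 Vd.8H, Vn.16B, Vm.16B, bias is set to 8 so the macro reads
   source elements 8-15 while still writing results 0-7.  */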
3607
3608 static void
3609 do_vec_fadd (sim_cpu *cpu)
3610 {
3611 /* instr[31] = 0
3612 instr[30] = half(0)/full(1)
3613 instr[29,24] = 001110
3614 instr[23] = FADD(0)/FSUB(1)
3615 instr[22] = float (0)/double(1)
3616 instr[21] = 1
3617 instr[20,16] = Vm
3618 instr[15,10] = 110101
3619 instr[9,5] = Vn
     instr[4,0] = Vd.  */
3621
3622 unsigned vm = INSTR (20, 16);
3623 unsigned vn = INSTR (9, 5);
3624 unsigned vd = INSTR (4, 0);
3625 unsigned i;
3626 int full = INSTR (30, 30);
3627
3628 NYI_assert (29, 24, 0x0E);
3629 NYI_assert (21, 21, 1);
3630 NYI_assert (15, 10, 0x35);
3631
3632 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
3633 if (INSTR (23, 23))
3634 {
3635 if (INSTR (22, 22))
3636 {
3637 if (! full)
3638 HALT_NYI;
3639
3640 for (i = 0; i < 2; i++)
3641 aarch64_set_vec_double (cpu, vd, i,
3642 aarch64_get_vec_double (cpu, vn, i)
3643 - aarch64_get_vec_double (cpu, vm, i));
3644 }
3645 else
3646 {
3647 for (i = 0; i < (full ? 4 : 2); i++)
3648 aarch64_set_vec_float (cpu, vd, i,
3649 aarch64_get_vec_float (cpu, vn, i)
3650 - aarch64_get_vec_float (cpu, vm, i));
3651 }
3652 }
3653 else
3654 {
3655 if (INSTR (22, 22))
3656 {
3657 if (! full)
3658 HALT_NYI;
3659
3660 for (i = 0; i < 2; i++)
3661 aarch64_set_vec_double (cpu, vd, i,
3662 aarch64_get_vec_double (cpu, vm, i)
3663 + aarch64_get_vec_double (cpu, vn, i));
3664 }
3665 else
3666 {
3667 for (i = 0; i < (full ? 4 : 2); i++)
3668 aarch64_set_vec_float (cpu, vd, i,
3669 aarch64_get_vec_float (cpu, vm, i)
3670 + aarch64_get_vec_float (cpu, vn, i));
3671 }
3672 }
3673 }
3674
3675 static void
3676 do_vec_add (sim_cpu *cpu)
3677 {
3678 /* instr[31] = 0
3679 instr[30] = full/half selector
3680 instr[29,24] = 001110
3681 instr[23,22] = size: 00=> 8-bit, 01=> 16-bit, 10=> 32-bit, 11=> 64-bit
3682 instr[21] = 1
     instr[20,16] = Vm
     instr[15,10] = 100001
     instr[9,5] = Vn
     instr[4,0] = Vd.  */
3687
3688 unsigned vm = INSTR (20, 16);
3689 unsigned vn = INSTR (9, 5);
3690 unsigned vd = INSTR (4, 0);
3691 unsigned i;
3692 int full = INSTR (30, 30);
3693
3694 NYI_assert (29, 24, 0x0E);
3695 NYI_assert (21, 21, 1);
3696 NYI_assert (15, 10, 0x21);
3697
3698 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
3699 switch (INSTR (23, 22))
3700 {
3701 case 0:
3702 for (i = 0; i < (full ? 16 : 8); i++)
3703 aarch64_set_vec_u8 (cpu, vd, i, aarch64_get_vec_u8 (cpu, vn, i)
3704 + aarch64_get_vec_u8 (cpu, vm, i));
3705 return;
3706
3707 case 1:
3708 for (i = 0; i < (full ? 8 : 4); i++)
3709 aarch64_set_vec_u16 (cpu, vd, i, aarch64_get_vec_u16 (cpu, vn, i)
3710 + aarch64_get_vec_u16 (cpu, vm, i));
3711 return;
3712
3713 case 2:
3714 for (i = 0; i < (full ? 4 : 2); i++)
3715 aarch64_set_vec_u32 (cpu, vd, i, aarch64_get_vec_u32 (cpu, vn, i)
3716 + aarch64_get_vec_u32 (cpu, vm, i));
3717 return;
3718
3719 case 3:
3720 if (! full)
3721 HALT_UNALLOC;
3722 aarch64_set_vec_u64 (cpu, vd, 0, aarch64_get_vec_u64 (cpu, vn, 0)
3723 + aarch64_get_vec_u64 (cpu, vm, 0));
3724 aarch64_set_vec_u64 (cpu, vd, 1,
3725 aarch64_get_vec_u64 (cpu, vn, 1)
3726 + aarch64_get_vec_u64 (cpu, vm, 1));
3727 return;
3728 }
3729 }
3730
3731 static void
3732 do_vec_mul (sim_cpu *cpu)
3733 {
3734 /* instr[31] = 0
3735 instr[30] = full/half selector
3736 instr[29,24] = 00 1110
3737 instr[23,22] = size: 00=> 8-bit, 01=> 16-bit, 10=> 32-bit
3738 instr[21] = 1
     instr[20,16] = Vm
     instr[15,10] = 10 0111
     instr[9,5] = Vn
     instr[4,0] = Vd.  */
3743
3744 unsigned vm = INSTR (20, 16);
3745 unsigned vn = INSTR (9, 5);
3746 unsigned vd = INSTR (4, 0);
3747 unsigned i;
3748 int full = INSTR (30, 30);
3749 int bias = 0;
3750
3751 NYI_assert (29, 24, 0x0E);
3752 NYI_assert (21, 21, 1);
3753 NYI_assert (15, 10, 0x27);
3754
3755 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
3756 switch (INSTR (23, 22))
3757 {
3758 case 0:
3759 DO_VEC_WIDENING_MUL (full ? 16 : 8, uint8_t, u8, u8);
3760 return;
3761
3762 case 1:
3763 DO_VEC_WIDENING_MUL (full ? 8 : 4, uint16_t, u16, u16);
3764 return;
3765
3766 case 2:
3767 DO_VEC_WIDENING_MUL (full ? 4 : 2, uint32_t, u32, u32);
3768 return;
3769
3770 case 3:
3771 HALT_UNALLOC;
3772 }
3773 }
3774
3775 static void
3776 do_vec_MLA (sim_cpu *cpu)
3777 {
3778 /* instr[31] = 0
3779 instr[30] = full/half selector
3780 instr[29,24] = 00 1110
3781 instr[23,22] = size: 00=> 8-bit, 01=> 16-bit, 10=> 32-bit
3782 instr[21] = 1
     instr[20,16] = Vm
     instr[15,10] = 1001 01
     instr[9,5] = Vn
     instr[4,0] = Vd.  */
3787
3788 unsigned vm = INSTR (20, 16);
3789 unsigned vn = INSTR (9, 5);
3790 unsigned vd = INSTR (4, 0);
3791 unsigned i;
3792 int full = INSTR (30, 30);
3793
3794 NYI_assert (29, 24, 0x0E);
3795 NYI_assert (21, 21, 1);
3796 NYI_assert (15, 10, 0x25);
3797
3798 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
3799 switch (INSTR (23, 22))
3800 {
3801 case 0:
3802 for (i = 0; i < (full ? 16 : 8); i++)
3803 aarch64_set_vec_u8 (cpu, vd, i,
3804 aarch64_get_vec_u8 (cpu, vd, i)
3805 + (aarch64_get_vec_u8 (cpu, vn, i)
3806 * aarch64_get_vec_u8 (cpu, vm, i)));
3807 return;
3808
3809 case 1:
3810 for (i = 0; i < (full ? 8 : 4); i++)
3811 aarch64_set_vec_u16 (cpu, vd, i,
3812 aarch64_get_vec_u16 (cpu, vd, i)
3813 + (aarch64_get_vec_u16 (cpu, vn, i)
3814 * aarch64_get_vec_u16 (cpu, vm, i)));
3815 return;
3816
3817 case 2:
3818 for (i = 0; i < (full ? 4 : 2); i++)
3819 aarch64_set_vec_u32 (cpu, vd, i,
3820 aarch64_get_vec_u32 (cpu, vd, i)
3821 + (aarch64_get_vec_u32 (cpu, vn, i)
3822 * aarch64_get_vec_u32 (cpu, vm, i)));
3823 return;
3824
3825 default:
3826 HALT_UNALLOC;
3827 }
3828 }
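
/* Example: if a lane of vd holds 10 while the same lanes of vn and
   vm hold 3 and 4, that destination lane becomes 10 + 3 * 4 = 22;
   the previous destination contents act as the accumulator.  */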
3829
3830 static float
3831 fmaxnm (float a, float b)
3832 {
3833 if (! isnan (a))
3834 {
3835 if (! isnan (b))
3836 return a > b ? a : b;
3837 return a;
3838 }
3839 else if (! isnan (b))
3840 return b;
3841 return a;
3842 }
3843
3844 static float
3845 fminnm (float a, float b)
3846 {
3847 if (! isnan (a))
3848 {
3849 if (! isnan (b))
3850 return a < b ? a : b;
3851 return a;
3852 }
3853 else if (! isnan (b))
3854 return b;
3855 return a;
3856 }
3857
3858 static double
3859 dmaxnm (double a, double b)
3860 {
3861 if (! isnan (a))
3862 {
3863 if (! isnan (b))
3864 return a > b ? a : b;
3865 return a;
3866 }
3867 else if (! isnan (b))
3868 return b;
3869 return a;
3870 }
3871
3872 static double
3873 dminnm (double a, double b)
3874 {
3875 if (! isnan (a))
3876 {
3877 if (! isnan (b))
3878 return a < b ? a : b;
3879 return a;
3880 }
3881 else if (! isnan (b))
3882 return b;
3883 return a;
3884 }
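
/* These four helpers give IEEE-754 minNum/maxNum behaviour (modulo
   signalling-NaN details): a NaN in one operand is ignored in favour
   of the other, e.g. fmaxnm (NAN, 1.0f) == 1.0f, and only
   NaN-vs-NaN returns NaN.  */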
3885
3886 static void
3887 do_vec_FminmaxNMP (sim_cpu *cpu)
3888 {
3889 /* instr [31] = 0
3890 instr [30] = half (0)/full (1)
3891 instr [29,24] = 10 1110
3892 instr [23] = max(0)/min(1)
3893 instr [22] = float (0)/double (1)
3894 instr [21] = 1
     instr [20,16] = Vm
     instr [15,10] = 1100 01
     instr [9,5] = Vn
     instr [4,0] = Vd.  */
3899
3900 unsigned vm = INSTR (20, 16);
3901 unsigned vn = INSTR (9, 5);
3902 unsigned vd = INSTR (4, 0);
3903 int full = INSTR (30, 30);
3904
3905 NYI_assert (29, 24, 0x2E);
3906 NYI_assert (21, 21, 1);
3907 NYI_assert (15, 10, 0x31);
3908
3909 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
3910 if (INSTR (22, 22))
3911 {
3912 double (* fn)(double, double) = INSTR (23, 23)
3913 ? dminnm : dmaxnm;
3914
3915 if (! full)
3916 HALT_NYI;
3917 aarch64_set_vec_double (cpu, vd, 0,
3918 fn (aarch64_get_vec_double (cpu, vn, 0),
3919 aarch64_get_vec_double (cpu, vn, 1)));
      aarch64_set_vec_double (cpu, vd, 1,
3921 fn (aarch64_get_vec_double (cpu, vm, 0),
3922 aarch64_get_vec_double (cpu, vm, 1)));
3923 }
3924 else
3925 {
3926 float (* fn)(float, float) = INSTR (23, 23)
3927 ? fminnm : fmaxnm;
3928
3929 aarch64_set_vec_float (cpu, vd, 0,
3930 fn (aarch64_get_vec_float (cpu, vn, 0),
3931 aarch64_get_vec_float (cpu, vn, 1)));
3932 if (full)
3933 aarch64_set_vec_float (cpu, vd, 1,
3934 fn (aarch64_get_vec_float (cpu, vn, 2),
3935 aarch64_get_vec_float (cpu, vn, 3)));
3936
3937 aarch64_set_vec_float (cpu, vd, (full ? 2 : 1),
3938 fn (aarch64_get_vec_float (cpu, vm, 0),
3939 aarch64_get_vec_float (cpu, vm, 1)));
3940 if (full)
3941 aarch64_set_vec_float (cpu, vd, 3,
3942 fn (aarch64_get_vec_float (cpu, vm, 2),
3943 aarch64_get_vec_float (cpu, vm, 3)));
3944 }
3945 }
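/* Destination layout sketch for the pairwise reduction above, shown
   for the full-width float case (indices illustrative):
     vd[0] = fn (vn[0], vn[1]);   vd[1] = fn (vn[2], vn[3]);
     vd[2] = fn (vm[0], vm[1]);   vd[3] = fn (vm[2], vm[3]);
   i.e. the lower half comes from Vn pairs, the upper from Vm.  */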
3946
3947 static void
3948 do_vec_AND (sim_cpu *cpu)
3949 {
3950 /* instr[31] = 0
3951 instr[30] = half (0)/full (1)
3952 instr[29,21] = 001110001
3953 instr[20,16] = Vm
3954 instr[15,10] = 000111
3955 instr[9,5] = Vn
3956 instr[4,0] = Vd. */
3957
3958 unsigned vm = INSTR (20, 16);
3959 unsigned vn = INSTR (9, 5);
3960 unsigned vd = INSTR (4, 0);
3961 unsigned i;
3962 int full = INSTR (30, 30);
3963
3964 NYI_assert (29, 21, 0x071);
3965 NYI_assert (15, 10, 0x07);
3966
3967 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
3968 for (i = 0; i < (full ? 4 : 2); i++)
3969 aarch64_set_vec_u32 (cpu, vd, i,
3970 aarch64_get_vec_u32 (cpu, vn, i)
3971 & aarch64_get_vec_u32 (cpu, vm, i));
3972 }
3973
3974 static void
3975 do_vec_BSL (sim_cpu *cpu)
3976 {
3977 /* instr[31] = 0
3978 instr[30] = half (0)/full (1)
3979 instr[29,21] = 101110011
3980 instr[20,16] = Vm
3981 instr[15,10] = 000111
3982 instr[9,5] = Vn
3983 instr[4,0] = Vd. */
3984
3985 unsigned vm = INSTR (20, 16);
3986 unsigned vn = INSTR (9, 5);
3987 unsigned vd = INSTR (4, 0);
3988 unsigned i;
3989 int full = INSTR (30, 30);
3990
3991 NYI_assert (29, 21, 0x173);
3992 NYI_assert (15, 10, 0x07);
3993
3994 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
3995 for (i = 0; i < (full ? 16 : 8); i++)
3996 aarch64_set_vec_u8 (cpu, vd, i,
3997 ( aarch64_get_vec_u8 (cpu, vd, i)
3998 & aarch64_get_vec_u8 (cpu, vn, i))
3999 | ((~ aarch64_get_vec_u8 (cpu, vd, i))
4000 & aarch64_get_vec_u8 (cpu, vm, i)));
4001 }
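/* Worked BSL example (byte values chosen for illustration): with
   vd = 0xF0, vn = 0xAA and vm = 0xCC the new lane value is
   (0xF0 & 0xAA) | (~0xF0 & 0xCC) = 0xA0 | 0x0C = 0xAC, so vn
   supplies the bits where the old vd bit is 1 and vm the rest.  */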
4002
4003 static void
4004 do_vec_EOR (sim_cpu *cpu)
4005 {
4006 /* instr[31] = 0
4007 instr[30] = half (0)/full (1)
4008 instr[29,21] = 10 1110 001
4009 instr[20,16] = Vm
4010 instr[15,10] = 000111
4011 instr[9,5] = Vn
4012 instr[4,0] = Vd. */
4013
4014 unsigned vm = INSTR (20, 16);
4015 unsigned vn = INSTR (9, 5);
4016 unsigned vd = INSTR (4, 0);
4017 unsigned i;
4018 int full = INSTR (30, 30);
4019
4020 NYI_assert (29, 21, 0x171);
4021 NYI_assert (15, 10, 0x07);
4022
4023 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
4024 for (i = 0; i < (full ? 4 : 2); i++)
4025 aarch64_set_vec_u32 (cpu, vd, i,
4026 aarch64_get_vec_u32 (cpu, vn, i)
4027 ^ aarch64_get_vec_u32 (cpu, vm, i));
4028 }
4029
4030 static void
4031 do_vec_bit (sim_cpu *cpu)
4032 {
4033 /* instr[31] = 0
4034 instr[30] = half (0)/full (1)
4035 instr[29,23] = 10 1110 1
4036 instr[22] = BIT (0) / BIF (1)
4037 instr[21] = 1
4038 instr[20,16] = Vm
4039 instr[15,10] = 0001 11
4040 instr[9,5] = Vn
4041 instr[4,0] = Vd. */
4042
4043 unsigned vm = INSTR (20, 16);
4044 unsigned vn = INSTR (9, 5);
4045 unsigned vd = INSTR (4, 0);
4046 unsigned full = INSTR (30, 30);
4047 unsigned test_false = INSTR (22, 22);
4048 unsigned i;
4049
4050 NYI_assert (29, 23, 0x5D);
4051 NYI_assert (21, 21, 1);
4052 NYI_assert (15, 10, 0x07);
4053
4054 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
4055 for (i = 0; i < (full ? 4 : 2); i++)
4056 {
4057 uint32_t vd_val = aarch64_get_vec_u32 (cpu, vd, i);
4058 uint32_t vn_val = aarch64_get_vec_u32 (cpu, vn, i);
4059 uint32_t vm_val = aarch64_get_vec_u32 (cpu, vm, i);
4060 if (test_false)
4061 aarch64_set_vec_u32 (cpu, vd, i,
4062 (vd_val & vm_val) | (vn_val & ~vm_val));
4063 else
4064 aarch64_set_vec_u32 (cpu, vd, i,
4065 (vd_val & ~vm_val) | (vn_val & vm_val));
4066 }
4067 }
4068
4069 static void
4070 do_vec_ORN (sim_cpu *cpu)
4071 {
4072 /* instr[31] = 0
4073 instr[30] = half (0)/full (1)
4074 instr[29,21] = 00 1110 111
4075 instr[20,16] = Vm
4076 instr[15,10] = 00 0111
4077 instr[9,5] = Vn
4078 instr[4,0] = Vd. */
4079
4080 unsigned vm = INSTR (20, 16);
4081 unsigned vn = INSTR (9, 5);
4082 unsigned vd = INSTR (4, 0);
4083 unsigned i;
4084 int full = INSTR (30, 30);
4085
4086 NYI_assert (29, 21, 0x077);
4087 NYI_assert (15, 10, 0x07);
4088
4089 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
4090 for (i = 0; i < (full ? 16 : 8); i++)
4091 aarch64_set_vec_u8 (cpu, vd, i,
4092 aarch64_get_vec_u8 (cpu, vn, i)
4093 | ~ aarch64_get_vec_u8 (cpu, vm, i));
4094 }
4095
4096 static void
4097 do_vec_ORR (sim_cpu *cpu)
4098 {
4099 /* instr[31] = 0
4100 instr[30] = half (0)/full (1)
4101 instr[29,21] = 00 1110 101
4102 instr[20,16] = Vm
4103 instr[15,10] = 0001 11
4104 instr[9,5] = Vn
4105 instr[4,0] = Vd. */
4106
4107 unsigned vm = INSTR (20, 16);
4108 unsigned vn = INSTR (9, 5);
4109 unsigned vd = INSTR (4, 0);
4110 unsigned i;
4111 int full = INSTR (30, 30);
4112
4113 NYI_assert (29, 21, 0x075);
4114 NYI_assert (15, 10, 0x07);
4115
4116 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
4117 for (i = 0; i < (full ? 16 : 8); i++)
4118 aarch64_set_vec_u8 (cpu, vd, i,
4119 aarch64_get_vec_u8 (cpu, vn, i)
4120 | aarch64_get_vec_u8 (cpu, vm, i));
4121 }
4122
4123 static void
4124 do_vec_BIC (sim_cpu *cpu)
4125 {
4126 /* instr[31] = 0
4127 instr[30] = half (0)/full (1)
4128 instr[29,21] = 00 1110 011
4129 instr[20,16] = Vm
4130 instr[15,10] = 00 0111
4131 instr[9,5] = Vn
4132 instr[4,0] = Vd. */
4133
4134 unsigned vm = INSTR (20, 16);
4135 unsigned vn = INSTR (9, 5);
4136 unsigned vd = INSTR (4, 0);
4137 unsigned i;
4138 int full = INSTR (30, 30);
4139
4140 NYI_assert (29, 21, 0x073);
4141 NYI_assert (15, 10, 0x07);
4142
4143 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
4144 for (i = 0; i < (full ? 16 : 8); i++)
4145 aarch64_set_vec_u8 (cpu, vd, i,
4146 aarch64_get_vec_u8 (cpu, vn, i)
4147 & ~ aarch64_get_vec_u8 (cpu, vm, i));
4148 }
4149
4150 static void
4151 do_vec_XTN (sim_cpu *cpu)
4152 {
4153 /* instr[31] = 0
4154 instr[30] = first part (0)/ second part (1)
4155 instr[29,24] = 00 1110
4156 instr[23,22] = size: byte(00), half(01), word (10)
4157 instr[21,10] = 1000 0100 1010
4158 instr[9,5] = Vs
4159 instr[4,0] = Vd. */
4160
4161 unsigned vs = INSTR (9, 5);
4162 unsigned vd = INSTR (4, 0);
4163 unsigned bias = INSTR (30, 30);
4164 unsigned i;
4165
4166 NYI_assert (29, 24, 0x0E);
4167 NYI_assert (21, 10, 0x84A);
4168
4169 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
4170 switch (INSTR (23, 22))
4171 {
4172 case 0:
4173 for (i = 0; i < 8; i++)
4174 aarch64_set_vec_u8 (cpu, vd, i + (bias * 8),
4175 aarch64_get_vec_u16 (cpu, vs, i));
4176 return;
4177
4178 case 1:
4179 for (i = 0; i < 4; i++)
4180 aarch64_set_vec_u16 (cpu, vd, i + (bias * 4),
4181 aarch64_get_vec_u32 (cpu, vs, i));
4182 return;
4183
4184 case 2:
4185 for (i = 0; i < 2; i++)
4186 aarch64_set_vec_u32 (cpu, vd, i + (bias * 2),
4187 aarch64_get_vec_u64 (cpu, vs, i));
4188 return;
4189 }
4190 }
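/* Illustrative XTN2 case: for 64-bit source lanes (size == 2) with
   bias == 1, vs.d[0] and vs.d[1] are truncated to 32 bits and
   stored in vd.s[2] and vd.s[3]; the narrowing truncates rather
   than saturates, so 0x100000003 becomes 0x00000003.  */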
4191
4192 static void
4193 do_vec_maxv (sim_cpu *cpu)
4194 {
4195 /* instr[31] = 0
4196 instr[30] = half(0)/full(1)
4197 instr[29] = signed (0)/unsigned(1)
4198 instr[28,24] = 0 1110
4199 instr[23,22] = size: byte(00), half(01), word (10)
4200 instr[21] = 1
4201 instr[20,17] = 1 000
4202 instr[16] = max(0)/min(1)
4203 instr[15,10] = 1010 10
4204 instr[9,5] = V source
4205 instr[4,0] = R dest. */
4206
4207 unsigned vs = INSTR (9, 5);
4208 unsigned rd = INSTR (4, 0);
4209 unsigned full = INSTR (30, 30);
4210 unsigned i;
4211
4212 NYI_assert (28, 24, 0x0E);
4213 NYI_assert (21, 21, 1);
4214 NYI_assert (20, 17, 8);
4215 NYI_assert (15, 10, 0x2A);
4216
4217 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
4218 switch ((INSTR (29, 29) << 1) | INSTR (16, 16))
4219 {
4220 case 0: /* SMAXV. */
4221 {
4222 int64_t smax;
4223 switch (INSTR (23, 22))
4224 {
4225 case 0:
4226 smax = aarch64_get_vec_s8 (cpu, vs, 0);
4227 for (i = 1; i < (full ? 16 : 8); i++)
4228 smax = max (smax, aarch64_get_vec_s8 (cpu, vs, i));
4229 break;
4230 case 1:
4231 smax = aarch64_get_vec_s16 (cpu, vs, 0);
4232 for (i = 1; i < (full ? 8 : 4); i++)
4233 smax = max (smax, aarch64_get_vec_s16 (cpu, vs, i));
4234 break;
4235 case 2:
4236 smax = aarch64_get_vec_s32 (cpu, vs, 0);
4237 for (i = 1; i < (full ? 4 : 2); i++)
4238 smax = max (smax, aarch64_get_vec_s32 (cpu, vs, i));
4239 break;
4240 case 3:
4241 HALT_UNALLOC;
4242 }
4243 aarch64_set_reg_s64 (cpu, rd, NO_SP, smax);
4244 return;
4245 }
4246
4247 case 1: /* SMINV. */
4248 {
4249 int64_t smin;
4250 switch (INSTR (23, 22))
4251 {
4252 case 0:
4253 smin = aarch64_get_vec_s8 (cpu, vs, 0);
4254 for (i = 1; i < (full ? 16 : 8); i++)
4255 smin = min (smin, aarch64_get_vec_s8 (cpu, vs, i));
4256 break;
4257 case 1:
4258 smin = aarch64_get_vec_s16 (cpu, vs, 0);
4259 for (i = 1; i < (full ? 8 : 4); i++)
4260 smin = min (smin, aarch64_get_vec_s16 (cpu, vs, i));
4261 break;
4262 case 2:
4263 smin = aarch64_get_vec_s32 (cpu, vs, 0);
4264 for (i = 1; i < (full ? 4 : 2); i++)
4265 smin = min (smin, aarch64_get_vec_s32 (cpu, vs, i));
4266 break;
4267
4268 case 3:
4269 HALT_UNALLOC;
4270 }
4271 aarch64_set_reg_s64 (cpu, rd, NO_SP, smin);
4272 return;
4273 }
4274
4275 case 2: /* UMAXV. */
4276 {
4277 uint64_t umax;
4278 switch (INSTR (23, 22))
4279 {
4280 case 0:
4281 umax = aarch64_get_vec_u8 (cpu, vs, 0);
4282 for (i = 1; i < (full ? 16 : 8); i++)
4283 umax = max (umax, aarch64_get_vec_u8 (cpu, vs, i));
4284 break;
4285 case 1:
4286 umax = aarch64_get_vec_u16 (cpu, vs, 0);
4287 for (i = 1; i < (full ? 8 : 4); i++)
4288 umax = max (umax, aarch64_get_vec_u16 (cpu, vs, i));
4289 break;
4290 case 2:
4291 umax = aarch64_get_vec_u32 (cpu, vs, 0);
4292 for (i = 1; i < (full ? 4 : 2); i++)
4293 umax = max (umax, aarch64_get_vec_u32 (cpu, vs, i));
4294 break;
4295
4296 case 3:
4297 HALT_UNALLOC;
4298 }
4299 aarch64_set_reg_u64 (cpu, rd, NO_SP, umax);
4300 return;
4301 }
4302
4303 case 3: /* UMINV. */
4304 {
4305 uint64_t umin;
4306 switch (INSTR (23, 22))
4307 {
4308 case 0:
4309 umin = aarch64_get_vec_u8 (cpu, vs, 0);
4310 for (i = 1; i < (full ? 16 : 8); i++)
4311 umin = min (umin, aarch64_get_vec_u8 (cpu, vs, i));
4312 break;
4313 case 1:
4314 umin = aarch64_get_vec_u16 (cpu, vs, 0);
4315 for (i = 1; i < (full ? 8 : 4); i++)
4316 umin = min (umin, aarch64_get_vec_u16 (cpu, vs, i));
4317 break;
4318 case 2:
4319 umin = aarch64_get_vec_u32 (cpu, vs, 0);
4320 for (i = 1; i < (full ? 4 : 2); i++)
4321 umin = min (umin, aarch64_get_vec_u32 (cpu, vs, i));
4322 break;
4323
4324 case 3:
4325 HALT_UNALLOC;
4326 }
4327 aarch64_set_reg_u64 (cpu, rd, NO_SP, umin);
4328 return;
4329 }
4330 }
4331 }
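/* Selector example for the dispatch above: bit 29 (unsigned) and
   bit 16 (min) form a two-bit code, so an instruction with bit 29
   set and bit 16 clear lands in case 2 and performs UMAXV, an
   unsigned maximum across all lanes into a general register.  */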
4332
4333 static void
4334 do_vec_fminmaxV (sim_cpu *cpu)
4335 {
4336 /* instr[31,24] = 0110 1110
4337 instr[23] = max(0)/min(1)
4338 instr[22,14] = 011 0000 11
4339 instr[13,12] = nm(00)/normal(11)
4340 instr[11,10] = 10
4341 instr[9,5] = V source
4342 instr[4,0] = R dest. */
4343
4344 unsigned vs = INSTR (9, 5);
4345 unsigned rd = INSTR (4, 0);
4346 unsigned i;
4347 float res = aarch64_get_vec_float (cpu, vs, 0);
4348
4349 NYI_assert (31, 24, 0x6E);
4350 NYI_assert (22, 14, 0x0C3);
4351 NYI_assert (11, 10, 2);
4352
4353 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
4354 if (INSTR (23, 23))
4355 {
4356 switch (INSTR (13, 12))
4357 {
4358 case 0: /* FMINNMV. */
4359 for (i = 1; i < 4; i++)
4360 res = fminnm (res, aarch64_get_vec_float (cpu, vs, i));
4361 break;
4362
4363 case 3: /* FMINV. */
4364 for (i = 1; i < 4; i++)
4365 res = min (res, aarch64_get_vec_float (cpu, vs, i));
4366 break;
4367
4368 default:
4369 HALT_NYI;
4370 }
4371 }
4372 else
4373 {
4374 switch (INSTR (13, 12))
4375 {
4376 case 0: /* FMAXNMV. */
4377 for (i = 1; i < 4; i++)
4378 res = fmaxnm (res, aarch64_get_vec_float (cpu, vs, i));
4379 break;
4380
4381 case 3: /* FMAXV. */
4382 for (i = 1; i < 4; i++)
4383 res = max (res, aarch64_get_vec_float (cpu, vs, i));
4384 break;
4385
4386 default:
4387 HALT_NYI;
4388 }
4389 }
4390
4391 aarch64_set_FP_float (cpu, rd, res);
4392 }
4393
4394 static void
4395 do_vec_Fminmax (sim_cpu *cpu)
4396 {
4397 /* instr[31] = 0
4398 instr[30] = half(0)/full(1)
4399 instr[29,24] = 00 1110
4400 instr[23] = max(0)/min(1)
4401 instr[22] = float(0)/double(1)
4402 instr[21] = 1
4403 instr[20,16] = Vm
4404 instr[15,14] = 11
4405 instr[13,12] = nm(00)/normal(11)
4406 instr[11,10] = 01
4407 instr[9,5] = Vn
4408 instr[4,0] = Vd. */
4409
4410 unsigned vm = INSTR (20, 16);
4411 unsigned vn = INSTR (9, 5);
4412 unsigned vd = INSTR (4, 0);
4413 unsigned full = INSTR (30, 30);
4414 unsigned min = INSTR (23, 23);
4415 unsigned i;
4416
4417 NYI_assert (29, 24, 0x0E);
4418 NYI_assert (21, 21, 1);
4419 NYI_assert (15, 14, 3);
4420 NYI_assert (11, 10, 1);
4421
4422 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
4423 if (INSTR (22, 22))
4424 {
4425 double (* func)(double, double);
4426
4427 if (! full)
4428 HALT_NYI;
4429
4430 if (INSTR (13, 12) == 0)
4431 func = min ? dminnm : dmaxnm;
4432 else if (INSTR (13, 12) == 3)
4433 func = min ? fmin : fmax;
4434 else
4435 HALT_NYI;
4436
4437 for (i = 0; i < 2; i++)
4438 aarch64_set_vec_double (cpu, vd, i,
4439 func (aarch64_get_vec_double (cpu, vn, i),
4440 aarch64_get_vec_double (cpu, vm, i)));
4441 }
4442 else
4443 {
4444 float (* func)(float, float);
4445
4446 if (INSTR (13, 12) == 0)
4447 func = min ? fminnm : fmaxnm;
4448 else if (INSTR (13, 12) == 3)
4449 func = min ? fminf : fmaxf;
4450 else
4451 HALT_NYI;
4452
4453 for (i = 0; i < (full ? 4 : 2); i++)
4454 aarch64_set_vec_float (cpu, vd, i,
4455 func (aarch64_get_vec_float (cpu, vn, i),
4456 aarch64_get_vec_float (cpu, vm, i)));
4457 }
4458 }
4459
4460 static void
4461 do_vec_SCVTF (sim_cpu *cpu)
4462 {
4463 /* instr[31] = 0
4464 instr[30] = Q
4465 instr[29,23] = 00 1110 0
4466 instr[22] = float(0)/double(1)
4467 instr[21,10] = 10 0001 1101 10
4468 instr[9,5] = Vn
4469 instr[4,0] = Vd. */
4470
4471 unsigned vn = INSTR (9, 5);
4472 unsigned vd = INSTR (4, 0);
4473 unsigned full = INSTR (30, 30);
4474 unsigned size = INSTR (22, 22);
4475 unsigned i;
4476
4477 NYI_assert (29, 23, 0x1C);
4478 NYI_assert (21, 10, 0x876);
4479
4480 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
4481 if (size)
4482 {
4483 if (! full)
4484 HALT_UNALLOC;
4485
4486 for (i = 0; i < 2; i++)
4487 {
4488 double val = (double) aarch64_get_vec_s64 (cpu, vn, i);
4489 aarch64_set_vec_double (cpu, vd, i, val);
4490 }
4491 }
4492 else
4493 {
4494 for (i = 0; i < (full ? 4 : 2); i++)
4495 {
4496 float val = (float) aarch64_get_vec_s32 (cpu, vn, i);
4497 aarch64_set_vec_float (cpu, vd, i, val);
4498 }
4499 }
4500 }
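/* Signedness example for the conversion above (using the s32/s64
   accessors as elsewhere in this file): a 32-bit lane holding
   0xFFFFFFFF is the signed value -1 and converts to -1.0f; reading
   it through the unsigned accessor would give 4294967295.0f.  */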
4501
4502 #define VEC_CMP(SOURCE, CMP) \
4503 do \
4504 { \
4505 switch (size) \
4506 { \
4507 case 0: \
4508 for (i = 0; i < (full ? 16 : 8); i++) \
4509 aarch64_set_vec_u8 (cpu, vd, i, \
4510 aarch64_get_vec_##SOURCE##8 (cpu, vn, i) \
4511 CMP \
4512 aarch64_get_vec_##SOURCE##8 (cpu, vm, i) \
4513 ? -1 : 0); \
4514 return; \
4515 case 1: \
4516 for (i = 0; i < (full ? 8 : 4); i++) \
4517 aarch64_set_vec_u16 (cpu, vd, i, \
4518 aarch64_get_vec_##SOURCE##16 (cpu, vn, i) \
4519 CMP \
4520 aarch64_get_vec_##SOURCE##16 (cpu, vm, i) \
4521 ? -1 : 0); \
4522 return; \
4523 case 2: \
4524 for (i = 0; i < (full ? 4 : 2); i++) \
4525 aarch64_set_vec_u32 (cpu, vd, i, \
4526 aarch64_get_vec_##SOURCE##32 (cpu, vn, i) \
4527 CMP \
4528 aarch64_get_vec_##SOURCE##32 (cpu, vm, i) \
4529 ? -1 : 0); \
4530 return; \
4531 case 3: \
4532 if (! full) \
4533 HALT_UNALLOC; \
4534 for (i = 0; i < 2; i++) \
4535 aarch64_set_vec_u64 (cpu, vd, i, \
4536 aarch64_get_vec_##SOURCE##64 (cpu, vn, i) \
4537 CMP \
4538 aarch64_get_vec_##SOURCE##64 (cpu, vm, i) \
4539 ? -1ULL : 0); \
4540 return; \
4541 } \
4542 } \
4543 while (0)
4544
4545 #define VEC_CMP0(SOURCE, CMP) \
4546 do \
4547 { \
4548 switch (size) \
4549 { \
4550 case 0: \
4551 for (i = 0; i < (full ? 16 : 8); i++) \
4552 aarch64_set_vec_u8 (cpu, vd, i, \
4553 aarch64_get_vec_##SOURCE##8 (cpu, vn, i) \
4554 CMP 0 ? -1 : 0); \
4555 return; \
4556 case 1: \
4557 for (i = 0; i < (full ? 8 : 4); i++) \
4558 aarch64_set_vec_u16 (cpu, vd, i, \
4559 aarch64_get_vec_##SOURCE##16 (cpu, vn, i) \
4560 CMP 0 ? -1 : 0); \
4561 return; \
4562 case 2: \
4563 for (i = 0; i < (full ? 4 : 2); i++) \
4564 aarch64_set_vec_u32 (cpu, vd, i, \
4565 aarch64_get_vec_##SOURCE##32 (cpu, vn, i) \
4566 CMP 0 ? -1 : 0); \
4567 return; \
4568 case 3: \
4569 if (! full) \
4570 HALT_UNALLOC; \
4571 for (i = 0; i < 2; i++) \
4572 aarch64_set_vec_u64 (cpu, vd, i, \
4573 aarch64_get_vec_##SOURCE##64 (cpu, vn, i) \
4574 CMP 0 ? -1ULL : 0); \
4575 return; \
4576 } \
4577 } \
4578 while (0)
4579
4580 #define VEC_FCMP0(CMP) \
4581 do \
4582 { \
4583 if (vm != 0) \
4584 HALT_NYI; \
4585 if (INSTR (22, 22)) \
4586 { \
4587 if (! full) \
4588 HALT_NYI; \
4589 for (i = 0; i < 2; i++) \
4590 aarch64_set_vec_u64 (cpu, vd, i, \
4591 aarch64_get_vec_double (cpu, vn, i) \
4592 CMP 0.0 ? -1 : 0); \
4593 } \
4594 else \
4595 { \
4596 for (i = 0; i < (full ? 4 : 2); i++) \
4597 aarch64_set_vec_u32 (cpu, vd, i, \
4598 aarch64_get_vec_float (cpu, vn, i) \
4599 CMP 0.0 ? -1 : 0); \
4600 } \
4601 return; \
4602 } \
4603 while (0)
4604
4605 #define VEC_FCMP(CMP) \
4606 do \
4607 { \
4608 if (INSTR (22, 22)) \
4609 { \
4610 if (! full) \
4611 HALT_NYI; \
4612 for (i = 0; i < 2; i++) \
4613 aarch64_set_vec_u64 (cpu, vd, i, \
4614 aarch64_get_vec_double (cpu, vn, i) \
4615 CMP \
4616 aarch64_get_vec_double (cpu, vm, i) \
4617 ? -1 : 0); \
4618 } \
4619 else \
4620 { \
4621 for (i = 0; i < (full ? 4 : 2); i++) \
4622 aarch64_set_vec_u32 (cpu, vd, i, \
4623 aarch64_get_vec_float (cpu, vn, i) \
4624 CMP \
4625 aarch64_get_vec_float (cpu, vm, i) \
4626 ? -1 : 0); \
4627 } \
4628 return; \
4629 } \
4630 while (0)
4631
4632 static void
4633 do_vec_compare (sim_cpu *cpu)
4634 {
4635 /* instr[31] = 0
4636 instr[30] = half(0)/full(1)
4637 instr[29] = part-of-comparison-type
4638 instr[28,24] = 0 1110
4639 instr[23,22] = size of integer compares: byte(00), half(01), word (10), long (11)
4640 type of float compares: single (-0) / double (-1)
4641 instr[21] = 1
4642 instr[20,16] = Vm or 00000 (compare vs 0)
4643 instr[15,10] = part-of-comparison-type
4644 instr[9,5] = Vn
4645 instr[4,0] = Vd. */
4646
4647 int full = INSTR (30, 30);
4648 int size = INSTR (23, 22);
4649 unsigned vm = INSTR (20, 16);
4650 unsigned vn = INSTR (9, 5);
4651 unsigned vd = INSTR (4, 0);
4652 unsigned i;
4653
4654 NYI_assert (28, 24, 0x0E);
4655 NYI_assert (21, 21, 1);
4656
4657 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
4658 if ((INSTR (11, 11)
4659 && INSTR (14, 14))
4660 || ((INSTR (11, 11) == 0
4661 && INSTR (10, 10) == 0)))
4662 {
4663 /* A compare vs 0. */
4664 if (vm != 0)
4665 {
4666 if (INSTR (15, 10) == 0x2A)
4667 do_vec_maxv (cpu);
4668 else if (INSTR (15, 10) == 0x32
4669 || INSTR (15, 10) == 0x3E)
4670 do_vec_fminmaxV (cpu);
4671 else if (INSTR (29, 23) == 0x1C
4672 && INSTR (21, 10) == 0x876)
4673 do_vec_SCVTF (cpu);
4674 else
4675 HALT_NYI;
4676 return;
4677 }
4678 }
4679
4680 if (INSTR (14, 14))
4681 {
4682 /* A floating point compare. */
4683 unsigned decode = (INSTR (29, 29) << 5) | (INSTR (23, 23) << 4)
4684 | INSTR (13, 10);
4685
4686 NYI_assert (15, 15, 1);
4687
4688 switch (decode)
4689 {
4690 case /* 0b010010: GT#0 */ 0x12: VEC_FCMP0 (>);
4691 case /* 0b110010: GE#0 */ 0x32: VEC_FCMP0 (>=);
4692 case /* 0b010110: EQ#0 */ 0x16: VEC_FCMP0 (==);
4693 case /* 0b110110: LE#0 */ 0x36: VEC_FCMP0 (<=);
4694 case /* 0b011010: LT#0 */ 0x1A: VEC_FCMP0 (<);
4695 case /* 0b111001: GT */ 0x39: VEC_FCMP (>);
4696 case /* 0b101001: GE */ 0x29: VEC_FCMP (>=);
4697 case /* 0b001001: EQ */ 0x09: VEC_FCMP (==);
4698
4699 default:
4700 HALT_NYI;
4701 }
4702 }
4703 else
4704 {
4705 unsigned decode = (INSTR (29, 29) << 6) | INSTR (15, 10);
4706
4707 switch (decode)
4708 {
4709 case 0x0D: /* 0001101 GT */ VEC_CMP (s, > );
4710 case 0x0F: /* 0001111 GE */ VEC_CMP (s, >= );
4711 case 0x22: /* 0100010 GT #0 */ VEC_CMP0 (s, > );
4712 case 0x23: /* 0100011 TST */ VEC_CMP (u, & );
4713 case 0x26: /* 0100110 EQ #0 */ VEC_CMP0 (s, == );
4714 case 0x2A: /* 0101010 LT #0 */ VEC_CMP0 (s, < );
4715 case 0x4D: /* 1001101 HI */ VEC_CMP (u, > );
4716 case 0x4F: /* 1001111 HS */ VEC_CMP (u, >= );
4717 case 0x62: /* 1100010 GE #0 */ VEC_CMP0 (s, >= );
4718 case 0x63: /* 1100011 EQ */ VEC_CMP (u, == );
4719 case 0x66: /* 1100110 LE #0 */ VEC_CMP0 (s, <= );
4720 default:
4721 if (vm == 0)
4722 HALT_NYI;
4723 do_vec_maxv (cpu);
4724 }
4725 }
4726 }
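/* Decode walkthrough (illustrative): a signed integer CMGT has
   bit 29 clear and bits [15,10] = 0b001101, so decode is 0x0D and
   VEC_CMP (s, >) runs: each destination lane becomes all-ones when
   vn > vm as a signed comparison, and zero otherwise.  */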
4727
4728 static void
4729 do_vec_SSHL (sim_cpu *cpu)
4730 {
4731 /* instr[31] = 0
4732 instr[30] = first part (0)/ second part (1)
4733 instr[29,24] = 00 1110
4734 instr[23,22] = size: byte(00), half(01), word (10), long (11)
4735 instr[21] = 1
4736 instr[20,16] = Vm
4737 instr[15,10] = 0100 01
4738 instr[9,5] = Vn
4739 instr[4,0] = Vd. */
4740
4741 unsigned full = INSTR (30, 30);
4742 unsigned vm = INSTR (20, 16);
4743 unsigned vn = INSTR (9, 5);
4744 unsigned vd = INSTR (4, 0);
4745 unsigned i;
4746 signed int shift;
4747
4748 NYI_assert (29, 24, 0x0E);
4749 NYI_assert (21, 21, 1);
4750 NYI_assert (15, 10, 0x11);
4751
4752 /* FIXME: What is a signed shift left in this context?  */
4753
4754 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
4755 switch (INSTR (23, 22))
4756 {
4757 case 0:
4758 for (i = 0; i < (full ? 16 : 8); i++)
4759 {
4760 shift = aarch64_get_vec_s8 (cpu, vm, i);
4761 if (shift >= 0)
4762 aarch64_set_vec_s8 (cpu, vd, i, aarch64_get_vec_s8 (cpu, vn, i)
4763 << shift);
4764 else
4765 aarch64_set_vec_s8 (cpu, vd, i, aarch64_get_vec_s8 (cpu, vn, i)
4766 >> - shift);
4767 }
4768 return;
4769
4770 case 1:
4771 for (i = 0; i < (full ? 8 : 4); i++)
4772 {
4773 shift = aarch64_get_vec_s8 (cpu, vm, i * 2);
4774 if (shift >= 0)
4775 aarch64_set_vec_s16 (cpu, vd, i, aarch64_get_vec_s16 (cpu, vn, i)
4776 << shift);
4777 else
4778 aarch64_set_vec_s16 (cpu, vd, i, aarch64_get_vec_s16 (cpu, vn, i)
4779 >> - shift);
4780 }
4781 return;
4782
4783 case 2:
4784 for (i = 0; i < (full ? 4 : 2); i++)
4785 {
4786 shift = aarch64_get_vec_s8 (cpu, vm, i * 4);
4787 if (shift >= 0)
4788 aarch64_set_vec_s32 (cpu, vd, i, aarch64_get_vec_s32 (cpu, vn, i)
4789 << shift);
4790 else
4791 aarch64_set_vec_s32 (cpu, vd, i, aarch64_get_vec_s32 (cpu, vn, i)
4792 >> - shift);
4793 }
4794 return;
4795
4796 case 3:
4797 if (! full)
4798 HALT_UNALLOC;
4799 for (i = 0; i < 2; i++)
4800 {
4801 shift = aarch64_get_vec_s8 (cpu, vm, i * 8);
4802 if (shift >= 0)
4803 aarch64_set_vec_s64 (cpu, vd, i, aarch64_get_vec_s64 (cpu, vn, i)
4804 << shift);
4805 else
4806 aarch64_set_vec_s64 (cpu, vd, i, aarch64_get_vec_s64 (cpu, vn, i)
4807 >> - shift);
4808 }
4809 return;
4810 }
4811 }
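/* Shift-by-register example (values illustrative): if vm.b[0]
   holds -2, lane 0 is shifted right by two, so vn.b[0] = 0x7C
   (124) produces 0x1F (31); a positive count in vm shifts left
   instead.  */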
4812
4813 static void
4814 do_vec_USHL (sim_cpu *cpu)
4815 {
4816 /* instr[31] = 0
4817 instr[30] = first part (0)/ second part (1)
4818 instr[29,24] = 10 1110
4819 instr[23,22] = size: byte(00), half(01), word (10), long (11)
4820 instr[21] = 1
4821 instr[20,16] = Vm
4822 instr[15,10] = 0100 01
4823 instr[9,5] = Vn
4824 instr[4,0] = Vd */
4825
4826 unsigned full = INSTR (30, 30);
4827 unsigned vm = INSTR (20, 16);
4828 unsigned vn = INSTR (9, 5);
4829 unsigned vd = INSTR (4, 0);
4830 unsigned i;
4831 signed int shift;
4832
4833 NYI_assert (29, 24, 0x2E);
4834 NYI_assert (15, 10, 0x11);
4835
4836 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
4837 switch (INSTR (23, 22))
4838 {
4839 case 0:
4840 for (i = 0; i < (full ? 16 : 8); i++)
4841 {
4842 shift = aarch64_get_vec_s8 (cpu, vm, i);
4843 if (shift >= 0)
4844 aarch64_set_vec_u8 (cpu, vd, i, aarch64_get_vec_u8 (cpu, vn, i)
4845 << shift);
4846 else
4847 aarch64_set_vec_u8 (cpu, vd, i, aarch64_get_vec_u8 (cpu, vn, i)
4848 >> - shift);
4849 }
4850 return;
4851
4852 case 1:
4853 for (i = 0; i < (full ? 8 : 4); i++)
4854 {
4855 shift = aarch64_get_vec_s8 (cpu, vm, i * 2);
4856 if (shift >= 0)
4857 aarch64_set_vec_u16 (cpu, vd, i, aarch64_get_vec_u16 (cpu, vn, i)
4858 << shift);
4859 else
4860 aarch64_set_vec_u16 (cpu, vd, i, aarch64_get_vec_u16 (cpu, vn, i)
4861 >> - shift);
4862 }
4863 return;
4864
4865 case 2:
4866 for (i = 0; i < (full ? 4 : 2); i++)
4867 {
4868 shift = aarch64_get_vec_s8 (cpu, vm, i * 4);
4869 if (shift >= 0)
4870 aarch64_set_vec_u32 (cpu, vd, i, aarch64_get_vec_u32 (cpu, vn, i)
4871 << shift);
4872 else
4873 aarch64_set_vec_u32 (cpu, vd, i, aarch64_get_vec_u32 (cpu, vn, i)
4874 >> - shift);
4875 }
4876 return;
4877
4878 case 3:
4879 if (! full)
4880 HALT_UNALLOC;
4881 for (i = 0; i < 2; i++)
4882 {
4883 shift = aarch64_get_vec_s8 (cpu, vm, i * 8);
4884 if (shift >= 0)
4885 aarch64_set_vec_u64 (cpu, vd, i, aarch64_get_vec_u64 (cpu, vn, i)
4886 << shift);
4887 else
4888 aarch64_set_vec_u64 (cpu, vd, i, aarch64_get_vec_u64 (cpu, vn, i)
4889 >> - shift);
4890 }
4891 return;
4892 }
4893 }
4894
4895 static void
4896 do_vec_FMLA (sim_cpu *cpu)
4897 {
4898 /* instr[31] = 0
4899 instr[30] = full/half selector
4900 instr[29,23] = 0011100
4901 instr[22] = size: 0=>float, 1=>double
4902 instr[21] = 1
4903 instr[20,16] = Vm
4904 instr[15,10] = 1100 11
4905 instr[9,5] = Vn
4906 instr[4,0] = Vd. */
4907
4908 unsigned vm = INSTR (20, 16);
4909 unsigned vn = INSTR (9, 5);
4910 unsigned vd = INSTR (4, 0);
4911 unsigned i;
4912 int full = INSTR (30, 30);
4913
4914 NYI_assert (29, 23, 0x1C);
4915 NYI_assert (21, 21, 1);
4916 NYI_assert (15, 10, 0x33);
4917
4918 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
4919 if (INSTR (22, 22))
4920 {
4921 if (! full)
4922 HALT_UNALLOC;
4923 for (i = 0; i < 2; i++)
4924 aarch64_set_vec_double (cpu, vd, i,
4925 aarch64_get_vec_double (cpu, vn, i) *
4926 aarch64_get_vec_double (cpu, vm, i) +
4927 aarch64_get_vec_double (cpu, vd, i));
4928 }
4929 else
4930 {
4931 for (i = 0; i < (full ? 4 : 2); i++)
4932 aarch64_set_vec_float (cpu, vd, i,
4933 aarch64_get_vec_float (cpu, vn, i) *
4934 aarch64_get_vec_float (cpu, vm, i) +
4935 aarch64_get_vec_float (cpu, vd, i));
4936 }
4937 }
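/* Note: the multiply and add above are separate C operations, so
   the result can differ in the last bit from the fused
   multiply-add the architecture specifies; using fma ()/fmaf ()
   would model FMLA rounding more faithfully.  */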
4938
4939 static void
4940 do_vec_max (sim_cpu *cpu)
4941 {
4942 /* instr[31] = 0
4943 instr[30] = full/half selector
4944 instr[29] = SMAX (0) / UMAX (1)
4945 instr[28,24] = 0 1110
4946 instr[23,22] = size: 00=> 8-bit, 01=> 16-bit, 10=> 32-bit
4947 instr[21] = 1
4948 instr[20,16] = Vm
4949 instr[15,10] = 0110 01
4950 instr[9,5] = Vn
4951 instr[4,0] = Vd. */
4952
4953 unsigned vm = INSTR (20, 16);
4954 unsigned vn = INSTR (9, 5);
4955 unsigned vd = INSTR (4, 0);
4956 unsigned i;
4957 int full = INSTR (30, 30);
4958
4959 NYI_assert (28, 24, 0x0E);
4960 NYI_assert (21, 21, 1);
4961 NYI_assert (15, 10, 0x19);
4962
4963 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
4964 if (INSTR (29, 29))
4965 {
4966 switch (INSTR (23, 22))
4967 {
4968 case 0:
4969 for (i = 0; i < (full ? 16 : 8); i++)
4970 aarch64_set_vec_u8 (cpu, vd, i,
4971 aarch64_get_vec_u8 (cpu, vn, i)
4972 > aarch64_get_vec_u8 (cpu, vm, i)
4973 ? aarch64_get_vec_u8 (cpu, vn, i)
4974 : aarch64_get_vec_u8 (cpu, vm, i));
4975 return;
4976
4977 case 1:
4978 for (i = 0; i < (full ? 8 : 4); i++)
4979 aarch64_set_vec_u16 (cpu, vd, i,
4980 aarch64_get_vec_u16 (cpu, vn, i)
4981 > aarch64_get_vec_u16 (cpu, vm, i)
4982 ? aarch64_get_vec_u16 (cpu, vn, i)
4983 : aarch64_get_vec_u16 (cpu, vm, i));
4984 return;
4985
4986 case 2:
4987 for (i = 0; i < (full ? 4 : 2); i++)
4988 aarch64_set_vec_u32 (cpu, vd, i,
4989 aarch64_get_vec_u32 (cpu, vn, i)
4990 > aarch64_get_vec_u32 (cpu, vm, i)
4991 ? aarch64_get_vec_u32 (cpu, vn, i)
4992 : aarch64_get_vec_u32 (cpu, vm, i));
4993 return;
4994
4995 case 3:
4996 HALT_UNALLOC;
4997 }
4998 }
4999 else
5000 {
5001 switch (INSTR (23, 22))
5002 {
5003 case 0:
5004 for (i = 0; i < (full ? 16 : 8); i++)
5005 aarch64_set_vec_s8 (cpu, vd, i,
5006 aarch64_get_vec_s8 (cpu, vn, i)
5007 > aarch64_get_vec_s8 (cpu, vm, i)
5008 ? aarch64_get_vec_s8 (cpu, vn, i)
5009 : aarch64_get_vec_s8 (cpu, vm, i));
5010 return;
5011
5012 case 1:
5013 for (i = 0; i < (full ? 8 : 4); i++)
5014 aarch64_set_vec_s16 (cpu, vd, i,
5015 aarch64_get_vec_s16 (cpu, vn, i)
5016 > aarch64_get_vec_s16 (cpu, vm, i)
5017 ? aarch64_get_vec_s16 (cpu, vn, i)
5018 : aarch64_get_vec_s16 (cpu, vm, i));
5019 return;
5020
5021 case 2:
5022 for (i = 0; i < (full ? 4 : 2); i++)
5023 aarch64_set_vec_s32 (cpu, vd, i,
5024 aarch64_get_vec_s32 (cpu, vn, i)
5025 > aarch64_get_vec_s32 (cpu, vm, i)
5026 ? aarch64_get_vec_s32 (cpu, vn, i)
5027 : aarch64_get_vec_s32 (cpu, vm, i));
5028 return;
5029
5030 case 3:
5031 HALT_UNALLOC;
5032 }
5033 }
5034 }
5035
5036 static void
5037 do_vec_min (sim_cpu *cpu)
5038 {
5039 /* instr[31] = 0
5040 instr[30] = full/half selector
5041 instr[29] = SMIN (0) / UMIN (1)
5042 instr[28,24] = 0 1110
5043 instr[23,22] = size: 00=> 8-bit, 01=> 16-bit, 10=> 32-bit
5044 instr[21] = 1
5045 instr[20,16] = Vm
5046 instr[15,10] = 0110 11
5047 instr[9,5] = Vn
5048 instr[4,0] = Vd. */
5049
5050 unsigned vm = INSTR (20, 16);
5051 unsigned vn = INSTR (9, 5);
5052 unsigned vd = INSTR (4, 0);
5053 unsigned i;
5054 int full = INSTR (30, 30);
5055
5056 NYI_assert (28, 24, 0x0E);
5057 NYI_assert (21, 21, 1);
5058 NYI_assert (15, 10, 0x1B);
5059
5060 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
5061 if (INSTR (29, 29))
5062 {
5063 switch (INSTR (23, 22))
5064 {
5065 case 0:
5066 for (i = 0; i < (full ? 16 : 8); i++)
5067 aarch64_set_vec_u8 (cpu, vd, i,
5068 aarch64_get_vec_u8 (cpu, vn, i)
5069 < aarch64_get_vec_u8 (cpu, vm, i)
5070 ? aarch64_get_vec_u8 (cpu, vn, i)
5071 : aarch64_get_vec_u8 (cpu, vm, i));
5072 return;
5073
5074 case 1:
5075 for (i = 0; i < (full ? 8 : 4); i++)
5076 aarch64_set_vec_u16 (cpu, vd, i,
5077 aarch64_get_vec_u16 (cpu, vn, i)
5078 < aarch64_get_vec_u16 (cpu, vm, i)
5079 ? aarch64_get_vec_u16 (cpu, vn, i)
5080 : aarch64_get_vec_u16 (cpu, vm, i));
5081 return;
5082
5083 case 2:
5084 for (i = 0; i < (full ? 4 : 2); i++)
5085 aarch64_set_vec_u32 (cpu, vd, i,
5086 aarch64_get_vec_u32 (cpu, vn, i)
5087 < aarch64_get_vec_u32 (cpu, vm, i)
5088 ? aarch64_get_vec_u32 (cpu, vn, i)
5089 : aarch64_get_vec_u32 (cpu, vm, i));
5090 return;
5091
5092 case 3:
5093 HALT_UNALLOC;
5094 }
5095 }
5096 else
5097 {
5098 switch (INSTR (23, 22))
5099 {
5100 case 0:
5101 for (i = 0; i < (full ? 16 : 8); i++)
5102 aarch64_set_vec_s8 (cpu, vd, i,
5103 aarch64_get_vec_s8 (cpu, vn, i)
5104 < aarch64_get_vec_s8 (cpu, vm, i)
5105 ? aarch64_get_vec_s8 (cpu, vn, i)
5106 : aarch64_get_vec_s8 (cpu, vm, i));
5107 return;
5108
5109 case 1:
5110 for (i = 0; i < (full ? 8 : 4); i++)
5111 aarch64_set_vec_s16 (cpu, vd, i,
5112 aarch64_get_vec_s16 (cpu, vn, i)
5113 < aarch64_get_vec_s16 (cpu, vm, i)
5114 ? aarch64_get_vec_s16 (cpu, vn, i)
5115 : aarch64_get_vec_s16 (cpu, vm, i));
5116 return;
5117
5118 case 2:
5119 for (i = 0; i < (full ? 4 : 2); i++)
5120 aarch64_set_vec_s32 (cpu, vd, i,
5121 aarch64_get_vec_s32 (cpu, vn, i)
5122 < aarch64_get_vec_s32 (cpu, vm, i)
5123 ? aarch64_get_vec_s32 (cpu, vn, i)
5124 : aarch64_get_vec_s32 (cpu, vm, i));
5125 return;
5126
5127 case 3:
5128 HALT_UNALLOC;
5129 }
5130 }
5131 }
5132
5133 static void
5134 do_vec_sub_long (sim_cpu *cpu)
5135 {
5136 /* instr[31] = 0
5137 instr[30] = lower (0) / upper (1)
5138 instr[29] = signed (0) / unsigned (1)
5139 instr[28,24] = 0 1110
5140 instr[23,22] = size: bytes (00), half (01), word (10)
5141 instr[21] = 1
5142 instr[20,16] = Vm
5143 instr[15,10] = 0010 00
5144 instr[9,5] = Vn
5145 instr[4,0] = V dest. */
5146
5147 unsigned size = INSTR (23, 22);
5148 unsigned vm = INSTR (20, 16);
5149 unsigned vn = INSTR (9, 5);
5150 unsigned vd = INSTR (4, 0);
5151 unsigned bias = 0;
5152 unsigned i;
5153
5154 NYI_assert (28, 24, 0x0E);
5155 NYI_assert (21, 21, 1);
5156 NYI_assert (15, 10, 0x08);
5157
5158 if (size == 3)
5159 HALT_UNALLOC;
5160
5161 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
5162 switch (INSTR (30, 29))
5163 {
5164 case 2: /* SSUBL2. */
5165 bias = 2; /* Fall through. */
5166 case 0: /* SSUBL. */
5167 switch (size)
5168 {
5169 case 0:
5170 bias *= 4; /* Upper half starts at byte 8. */
5171 for (i = 0; i < 8; i++)
5172 aarch64_set_vec_s16 (cpu, vd, i,
5173 aarch64_get_vec_s8 (cpu, vn, i + bias)
5174 - aarch64_get_vec_s8 (cpu, vm, i + bias));
5175 break;
5176
5177 case 1:
5178 bias *= 2;
5179 for (i = 0; i < 4; i++)
5180 aarch64_set_vec_s32 (cpu, vd, i,
5181 aarch64_get_vec_s16 (cpu, vn, i + bias)
5182 - aarch64_get_vec_s16 (cpu, vm, i + bias));
5183 break;
5184
5185 case 2:
5186 for (i = 0; i < 2; i++)
5187 aarch64_set_vec_s64 (cpu, vd, i,
5188 aarch64_get_vec_s32 (cpu, vn, i + bias)
5189 - aarch64_get_vec_s32 (cpu, vm, i + bias));
5190 break;
5191
5192 default:
5193 HALT_UNALLOC;
5194 }
5195 break;
5196
5197 case 3: /* USUBL2. */
5198 bias = 2; /* Fall through. */
5199 case 1: /* USUBL. */
5200 switch (size)
5201 {
5202 case 0:
5203 bias *= 4; /* Upper half starts at byte 8. */
5204 for (i = 0; i < 8; i++)
5205 aarch64_set_vec_u16 (cpu, vd, i,
5206 aarch64_get_vec_u8 (cpu, vn, i + bias)
5207 - aarch64_get_vec_u8 (cpu, vm, i + bias));
5208 break;
5209
5210 case 1:
5211 bias *= 2;
5212 for (i = 0; i < 4; i++)
5213 aarch64_set_vec_u32 (cpu, vd, i,
5214 aarch64_get_vec_u16 (cpu, vn, i + bias)
5215 - aarch64_get_vec_u16 (cpu, vm, i + bias));
5216 break;
5217
5218 case 2:
5219 for (i = 0; i < 2; i++)
5220 aarch64_set_vec_u64 (cpu, vd, i,
5221 aarch64_get_vec_u32 (cpu, vn, i + bias)
5222 - aarch64_get_vec_u32 (cpu, vm, i + bias));
5223 break;
5224
5225 default:
5226 HALT_UNALLOC;
5227 }
5228 break;
5229 }
5230 }
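/* Bias example for the widening subtracts above: SSUBL2 on byte
   lanes uses bias = 2 * 4 = 8, so the eight 16-bit results are
   computed from vn.b[8..15] and vm.b[8..15]; plain SSUBL keeps
   bias 0 and consumes the lower eight bytes.  */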
5231
5232 static void
5233 do_vec_ADDP (sim_cpu *cpu)
5234 {
5235 /* instr[31] = 0
5236 instr[30] = half(0)/full(1)
5237 instr[29,24] = 00 1110
5238 instr[23,22] = size: bytes (00), half (01), word (10), long (11)
5239 instr[21] = 1
5240 instr[20,16] = Vm
5241 instr[15,10] = 1011 11
5242 instr[9,5] = Vn
5243 instr[4,0] = V dest. */
5244
5245 FRegister copy_vn;
5246 FRegister copy_vm;
5247 unsigned full = INSTR (30, 30);
5248 unsigned size = INSTR (23, 22);
5249 unsigned vm = INSTR (20, 16);
5250 unsigned vn = INSTR (9, 5);
5251 unsigned vd = INSTR (4, 0);
5252 unsigned i, range;
5253
5254 NYI_assert (29, 24, 0x0E);
5255 NYI_assert (21, 21, 1);
5256 NYI_assert (15, 10, 0x2F);
5257
5258 /* Make copies of the source registers in case vd == vn/vm. */
5259 copy_vn = cpu->fr[vn];
5260 copy_vm = cpu->fr[vm];
5261
5262 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
5263 switch (size)
5264 {
5265 case 0:
5266 range = full ? 8 : 4;
5267 for (i = 0; i < range; i++)
5268 {
5269 aarch64_set_vec_u8 (cpu, vd, i,
5270 copy_vn.b[i * 2] + copy_vn.b[i * 2 + 1]);
5271 aarch64_set_vec_u8 (cpu, vd, i + range,
5272 copy_vm.b[i * 2] + copy_vm.b[i * 2 + 1]);
5273 }
5274 return;
5275
5276 case 1:
5277 range = full ? 4 : 2;
5278 for (i = 0; i < range; i++)
5279 {
5280 aarch64_set_vec_u16 (cpu, vd, i,
5281 copy_vn.h[i * 2] + copy_vn.h[i * 2 + 1]);
5282 aarch64_set_vec_u16 (cpu, vd, i + range,
5283 copy_vm.h[i * 2] + copy_vm.h[i * 2 + 1]);
5284 }
5285 return;
5286
5287 case 2:
5288 range = full ? 2 : 1;
5289 for (i = 0; i < range; i++)
5290 {
5291 aarch64_set_vec_u32 (cpu, vd, i,
5292 copy_vn.w[i * 2] + copy_vn.w[i * 2 + 1]);
5293 aarch64_set_vec_u32 (cpu, vd, i + range,
5294 copy_vm.w[i * 2] + copy_vm.w[i * 2 + 1]);
5295 }
5296 return;
5297
5298 case 3:
5299 if (! full)
5300 HALT_UNALLOC;
5301 aarch64_set_vec_u64 (cpu, vd, 0, copy_vn.v[0] + copy_vn.v[1]);
5302 aarch64_set_vec_u64 (cpu, vd, 1, copy_vm.v[0] + copy_vm.v[1]);
5303 return;
5304 }
5305 }
5306
5307 static void
5308 do_vec_UMOV (sim_cpu *cpu)
5309 {
5310 /* instr[31] = 0
5311 instr[30] = 32-bit(0)/64-bit(1)
5312 instr[29,21] = 00 1110 000
5313 instr[20,16] = size & index
5314 instr[15,10] = 0011 11
5315 instr[9,5] = V source
5316 instr[4,0] = R dest. */
5317
5318 unsigned vs = INSTR (9, 5);
5319 unsigned rd = INSTR (4, 0);
5320 unsigned index;
5321
5322 NYI_assert (29, 21, 0x070);
5323 NYI_assert (15, 10, 0x0F);
5324
5325 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
5326 if (INSTR (16, 16))
5327 {
5328 /* Byte transfer. */
5329 index = INSTR (20, 17);
5330 aarch64_set_reg_u64 (cpu, rd, NO_SP,
5331 aarch64_get_vec_u8 (cpu, vs, index));
5332 }
5333 else if (INSTR (17, 17))
5334 {
5335 index = INSTR (20, 18);
5336 aarch64_set_reg_u64 (cpu, rd, NO_SP,
5337 aarch64_get_vec_u16 (cpu, vs, index));
5338 }
5339 else if (INSTR (18, 18))
5340 {
5341 index = INSTR (20, 19);
5342 aarch64_set_reg_u64 (cpu, rd, NO_SP,
5343 aarch64_get_vec_u32 (cpu, vs, index));
5344 }
5345 else
5346 {
5347 if (INSTR (30, 30) != 1)
5348 HALT_UNALLOC;
5349
5350 index = INSTR (20, 20);
5351 aarch64_set_reg_u64 (cpu, rd, NO_SP,
5352 aarch64_get_vec_u64 (cpu, vs, index));
5353 }
5354 }
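/* imm5 decode example for the transfer above: with instr[20,16] =
   0b01110, bit 16 is clear and bit 17 is set, so a 16-bit element
   is moved and the index is instr[20,18] = 0b011 = 3, i.e.
   Rd = Vs.h[3] zero-extended.  */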
5355
5356 static void
5357 do_vec_FABS (sim_cpu *cpu)
5358 {
5359 /* instr[31] = 0
5360 instr[30] = half(0)/full(1)
5361 instr[29,23] = 00 1110 1
5362 instr[22] = float(0)/double(1)
5363 instr[21,16] = 10 0000
5364 instr[15,10] = 1111 10
5365 instr[9,5] = Vn
5366 instr[4,0] = Vd. */
5367
5368 unsigned vn = INSTR (9, 5);
5369 unsigned vd = INSTR (4, 0);
5370 unsigned full = INSTR (30, 30);
5371 unsigned i;
5372
5373 NYI_assert (29, 23, 0x1D);
5374 NYI_assert (21, 10, 0x83E);
5375
5376 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
5377 if (INSTR (22, 22))
5378 {
5379 if (! full)
5380 HALT_NYI;
5381
5382 for (i = 0; i < 2; i++)
5383 aarch64_set_vec_double (cpu, vd, i,
5384 fabs (aarch64_get_vec_double (cpu, vn, i)));
5385 }
5386 else
5387 {
5388 for (i = 0; i < (full ? 4 : 2); i++)
5389 aarch64_set_vec_float (cpu, vd, i,
5390 fabsf (aarch64_get_vec_float (cpu, vn, i)));
5391 }
5392 }
5393
5394 static void
5395 do_vec_FCVTZS (sim_cpu *cpu)
5396 {
5397 /* instr[31] = 0
5398 instr[30] = half (0) / all (1)
5399 instr[29,23] = 00 1110 1
5400 instr[22] = single (0) / double (1)
5401 instr[21,10] = 10 0001 1011 10
5402 instr[9,5] = Rn
5403 instr[4,0] = Rd. */
5404
5405 unsigned rn = INSTR (9, 5);
5406 unsigned rd = INSTR (4, 0);
5407 unsigned full = INSTR (30, 30);
5408 unsigned i;
5409
5410 NYI_assert (31, 31, 0);
5411 NYI_assert (29, 23, 0x1D);
5412 NYI_assert (21, 10, 0x86E);
5413
5414 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
5415 if (INSTR (22, 22))
5416 {
5417 if (! full)
5418 HALT_UNALLOC;
5419
5420 for (i = 0; i < 2; i++)
5421 aarch64_set_vec_s64 (cpu, rd, i,
5422 (int64_t) aarch64_get_vec_double (cpu, rn, i));
5423 }
5424 else
5425 for (i = 0; i < (full ? 4 : 2); i++)
5426 aarch64_set_vec_s32 (cpu, rd, i,
5427 (int32_t) aarch64_get_vec_float (cpu, rn, i));
5428 }
5429
5430 static void
5431 do_vec_REV64 (sim_cpu *cpu)
5432 {
5433 /* instr[31] = 0
5434 instr[30] = full/half
5435 instr[29,24] = 00 1110
5436 instr[23,22] = size
5437 instr[21,10] = 10 0000 0000 10
5438 instr[9,5] = Rn
5439 instr[4,0] = Rd. */
5440
5441 unsigned rn = INSTR (9, 5);
5442 unsigned rd = INSTR (4, 0);
5443 unsigned size = INSTR (23, 22);
5444 unsigned full = INSTR (30, 30);
5445 unsigned i;
5446 FRegister val;
5447
5448 NYI_assert (29, 24, 0x0E);
5449 NYI_assert (21, 10, 0x802);
5450
5451 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
5452 switch (size)
5453 {
5454 case 0:
5455 for (i = 0; i < (full ? 16 : 8); i++)
5456 val.b[i ^ 0x7] = aarch64_get_vec_u8 (cpu, rn, i);
5457 break;
5458
5459 case 1:
5460 for (i = 0; i < (full ? 8 : 4); i++)
5461 val.h[i ^ 0x3] = aarch64_get_vec_u16 (cpu, rn, i);
5462 break;
5463
5464 case 2:
5465 for (i = 0; i < (full ? 4 : 2); i++)
5466 val.w[i ^ 0x1] = aarch64_get_vec_u32 (cpu, rn, i);
5467 break;
5468
5469 case 3:
5470 HALT_UNALLOC;
5471 }
5472
5473 aarch64_set_vec_u64 (cpu, rd, 0, val.v[0]);
5474 if (full)
5475 aarch64_set_vec_u64 (cpu, rd, 1, val.v[1]);
5476 }
5477
5478 static void
5479 do_vec_REV16 (sim_cpu *cpu)
5480 {
5481 /* instr[31] = 0
5482 instr[30] = full/half
5483 instr[29,24] = 00 1110
5484 instr[23,22] = size
5485 instr[21,10] = 10 0000 0001 10
5486 instr[9,5] = Rn
5487 instr[4,0] = Rd. */
5488
5489 unsigned rn = INSTR (9, 5);
5490 unsigned rd = INSTR (4, 0);
5491 unsigned size = INSTR (23, 22);
5492 unsigned full = INSTR (30, 30);
5493 unsigned i;
5494 FRegister val;
5495
5496 NYI_assert (29, 24, 0x0E);
5497 NYI_assert (21, 10, 0x806);
5498
5499 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
5500 switch (size)
5501 {
5502 case 0:
5503 for (i = 0; i < (full ? 16 : 8); i++)
5504 val.b[i ^ 0x1] = aarch64_get_vec_u8 (cpu, rn, i);
5505 break;
5506
5507 default:
5508 HALT_UNALLOC;
5509 }
5510
5511 aarch64_set_vec_u64 (cpu, rd, 0, val.v[0]);
5512 if (full)
5513 aarch64_set_vec_u64 (cpu, rd, 1, val.v[1]);
5514 }
5515
5516 static void
5517 do_vec_op1 (sim_cpu *cpu)
5518 {
5519 /* instr[31] = 0
5520 instr[30] = half/full
5521 instr[29,24] = 00 1110
5522 instr[23,21] = ???
5523 instr[20,16] = Vm
5524 instr[15,10] = sub-opcode
5525 instr[9,5] = Vn
5526 instr[4,0] = Vd */
5527 NYI_assert (29, 24, 0x0E);
5528
5529 if (INSTR (21, 21) == 0)
5530 {
5531 if (INSTR (23, 22) == 0)
5532 {
5533 if (INSTR (30, 30) == 1
5534 && INSTR (17, 14) == 0
5535 && INSTR (12, 10) == 7)
5536 return do_vec_ins_2 (cpu);
5537
5538 switch (INSTR (15, 10))
5539 {
5540 case 0x01: do_vec_DUP_vector_into_vector (cpu); return;
5541 case 0x03: do_vec_DUP_scalar_into_vector (cpu); return;
5542 case 0x07: do_vec_INS (cpu); return;
5543 case 0x0A: do_vec_TRN (cpu); return;
5544
5545 case 0x0F:
5546 if (INSTR (17, 16) == 0)
5547 {
5548 do_vec_MOV_into_scalar (cpu);
5549 return;
5550 }
5551 break;
5552
5553 case 0x00:
5554 case 0x08:
5555 case 0x10:
5556 case 0x18:
5557 do_vec_TBL (cpu); return;
5558
5559 case 0x06:
5560 case 0x16:
5561 do_vec_UZP (cpu); return;
5562
5563 case 0x0E:
5564 case 0x1E:
5565 do_vec_ZIP (cpu); return;
5566
5567 default:
5568 HALT_NYI;
5569 }
5570 }
5571
5572 switch (INSTR (13, 10))
5573 {
5574 case 0x6: do_vec_UZP (cpu); return;
5575 case 0xE: do_vec_ZIP (cpu); return;
5576 case 0xA: do_vec_TRN (cpu); return;
5577 case 0xF: do_vec_UMOV (cpu); return;
5578 default: HALT_NYI;
5579 }
5580 }
5581
5582 switch (INSTR (15, 10))
5583 {
5584 case 0x02: do_vec_REV64 (cpu); return;
5585 case 0x06: do_vec_REV16 (cpu); return;
5586
5587 case 0x07:
5588 switch (INSTR (23, 21))
5589 {
5590 case 1: do_vec_AND (cpu); return;
5591 case 3: do_vec_BIC (cpu); return;
5592 case 5: do_vec_ORR (cpu); return;
5593 case 7: do_vec_ORN (cpu); return;
5594 default: HALT_NYI;
5595 }
5596
5597 case 0x08: do_vec_sub_long (cpu); return;
5598 case 0x0a: do_vec_XTN (cpu); return;
5599 case 0x11: do_vec_SSHL (cpu); return;
5600 case 0x19: do_vec_max (cpu); return;
5601 case 0x1B: do_vec_min (cpu); return;
5602 case 0x21: do_vec_add (cpu); return;
5603 case 0x25: do_vec_MLA (cpu); return;
5604 case 0x27: do_vec_mul (cpu); return;
5605 case 0x2F: do_vec_ADDP (cpu); return;
5606 case 0x30: do_vec_mull (cpu); return;
5607 case 0x33: do_vec_FMLA (cpu); return;
5608 case 0x35: do_vec_fadd (cpu); return;
5609
5610 case 0x2E:
5611 switch (INSTR (20, 16))
5612 {
5613 case 0x00: do_vec_ABS (cpu); return;
5614 case 0x01: do_vec_FCVTZS (cpu); return;
5615 case 0x11: do_vec_ADDV (cpu); return;
5616 default: HALT_NYI;
5617 }
5618
5619 case 0x31:
5620 case 0x3B:
5621 do_vec_Fminmax (cpu); return;
5622
5623 case 0x0D:
5624 case 0x0F:
5625 case 0x22:
5626 case 0x23:
5627 case 0x26:
5628 case 0x2A:
5629 case 0x32:
5630 case 0x36:
5631 case 0x39:
5632 case 0x3A:
5633 do_vec_compare (cpu); return;
5634
5635 case 0x3E:
5636 do_vec_FABS (cpu); return;
5637
5638 default:
5639 HALT_NYI;
5640 }
5641 }
5642
5643 static void
5644 do_vec_xtl (sim_cpu *cpu)
5645 {
5646 /* instr[31] = 0
5647 instr[30,29] = SXTL (00), UXTL (01), SXTL2 (10), UXTL2 (11)
5648 instr[28,22] = 0 1111 00
5649 instr[21,16] = size & shift (USHLL, SSHLL, USHLL2, SSHLL2)
5650 instr[15,10] = 1010 01
5651 instr[9,5] = V source
5652 instr[4,0] = V dest. */
5653
5654 unsigned vs = INSTR (9, 5);
5655 unsigned vd = INSTR (4, 0);
5656 unsigned i, shift, bias = 0;
5657
5658 NYI_assert (28, 22, 0x3C);
5659 NYI_assert (15, 10, 0x29);
5660
5661 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
5662 switch (INSTR (30, 29))
5663 {
5664 case 2: /* SXTL2, SSHLL2. */
5665 bias = 2; /* Fall through. */
5666 case 0: /* SXTL, SSHLL. */
5667 if (INSTR (21, 21))
5668 {
5669 int64_t val1, val2;
5670
5671 shift = INSTR (20, 16);
5672 /* Get the source values before setting the destination values
5673 in case the source and destination are the same. */
5674 val1 = (int64_t) aarch64_get_vec_s32 (cpu, vs, bias) << shift;
5675 val2 = (int64_t) aarch64_get_vec_s32 (cpu, vs, bias + 1) << shift;
5676 aarch64_set_vec_s64 (cpu, vd, 0, val1);
5677 aarch64_set_vec_s64 (cpu, vd, 1, val2);
5678 }
5679 else if (INSTR (20, 20))
5680 {
5681 int32_t v[4];
5683
5684 shift = INSTR (19, 16);
5685 bias *= 2;
5686 for (i = 0; i < 4; i++)
5687 v[i] = aarch64_get_vec_s16 (cpu, vs, bias + i) << shift;
5688 for (i = 0; i < 4; i++)
5689 aarch64_set_vec_s32 (cpu, vd, i, v[i]);
5690 }
5691 else
5692 {
5693 int16_t v[8];
5694 NYI_assert (19, 19, 1);
5695
5696 shift = INSTR (18, 16);
5697 bias *= 4; /* Upper half starts at byte 8. */
5698 for (i = 0; i < 8; i++)
5699 v[i] = aarch64_get_vec_s8 (cpu, vs, i + bias) << shift;
5700 for (i = 0; i < 8; i++)
5701 aarch64_set_vec_s16 (cpu, vd, i, v[i]);
5702 }
5703 return;
5704
5705 case 3: /* UXTL2, USHLL2. */
5706 bias = 2; /* Fall through. */
5707 case 1: /* UXTL, USHLL. */
5708 if (INSTR (21, 21))
5709 {
5710 uint64_t v1, v2;
5711 shift = INSTR (20, 16);
5712 v1 = (uint64_t) aarch64_get_vec_u32 (cpu, vs, bias) << shift;
5713 v2 = (uint64_t) aarch64_get_vec_u32 (cpu, vs, bias + 1) << shift;
5714 aarch64_set_vec_u64 (cpu, vd, 0, v1);
5715 aarch64_set_vec_u64 (cpu, vd, 1, v2);
5716 }
5717 else if (INSTR (20, 20))
5718 {
5719 uint32_t v[4];
5720 shift = INSTR (19, 16);
5721 bias *= 2;
5722 for (i = 0; i < 4; i++)
5723 v[i] = aarch64_get_vec_u16 (cpu, vs, i + bias) << shift;
5724 for (i = 0; i < 4; i++)
5725 aarch64_set_vec_u32 (cpu, vd, i, v[i]);
5726 }
5727 else
5728 {
5729 uint16_t v[8];
5730 NYI_assert (19, 19, 1);
5731
5732 shift = INSTR (18, 16);
5733 bias *= 4; /* Upper half starts at byte 8. */
5734 for (i = 0; i < 8; i++)
5735 v[i] = aarch64_get_vec_u8 (cpu, vs, i + bias) << shift;
5736 for (i = 0; i < 8; i++)
5737 aarch64_set_vec_u16 (cpu, vd, i, v[i]);
5738 }
5739 return;
5740 }
5741 }
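/* Field example for the size/shift decode above: SSHLL with
   instr[21,16] = 0b010011 has bit 21 clear and bit 20 set, so the
   source lanes are 16-bit with shift = instr[19,16] = 3; each
   vs.h element is sign-extended to 32 bits and shifted left by
   three.  */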
5742
5743 static void
5744 do_vec_SHL (sim_cpu *cpu)
5745 {
5746 /* instr [31] = 0
5747 instr [30] = half(0)/full(1)
5748 instr [29,23] = 001 1110
5749 instr [22,16] = size and shift amount
5750 instr [15,10] = 01 0101
5751 instr [9, 5] = Vs
5752 instr [4, 0] = Vd. */
5753
5754 int shift;
5755 int full = INSTR (30, 30);
5756 unsigned vs = INSTR (9, 5);
5757 unsigned vd = INSTR (4, 0);
5758 unsigned i;
5759
5760 NYI_assert (29, 23, 0x1E);
5761 NYI_assert (15, 10, 0x15);
5762
5763 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
5764 if (INSTR (22, 22))
5765 {
5766 shift = INSTR (21, 16);
5767
5768 if (full == 0)
5769 HALT_UNALLOC;
5770
5771 for (i = 0; i < 2; i++)
5772 {
5773 uint64_t val = aarch64_get_vec_u64 (cpu, vs, i);
5774 aarch64_set_vec_u64 (cpu, vd, i, val << shift);
5775 }
5776
5777 return;
5778 }
5779
5780 if (INSTR (21, 21))
5781 {
5782 shift = INSTR (20, 16);
5783
5784 for (i = 0; i < (full ? 4 : 2); i++)
5785 {
5786 uint32_t val = aarch64_get_vec_u32 (cpu, vs, i);
5787 aarch64_set_vec_u32 (cpu, vd, i, val << shift);
5788 }
5789
5790 return;
5791 }
5792
5793 if (INSTR (20, 20))
5794 {
5795 shift = INSTR (19, 16);
5796
5797 for (i = 0; i < (full ? 8 : 4); i++)
5798 {
5799 uint16_t val = aarch64_get_vec_u16 (cpu, vs, i);
5800 aarch64_set_vec_u16 (cpu, vd, i, val << shift);
5801 }
5802
5803 return;
5804 }
5805
5806 if (INSTR (19, 19) == 0)
5807 HALT_UNALLOC;
5808
5809 shift = INSTR (18, 16);
5810
5811 for (i = 0; i < (full ? 16 : 8); i++)
5812 {
5813 uint8_t val = aarch64_get_vec_u8 (cpu, vs, i);
5814 aarch64_set_vec_u8 (cpu, vd, i, val << shift);
5815 }
5816 }
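/* Shift decode example for SHL above: the leading one in the
   immediate field selects the lane size and the bits below it are
   the shift count, e.g. bits [22,16] = 0b0100011 select 32-bit
   lanes (bit 21 set) with shift = bits [20,16] = 3.  */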
5817
5818 static void
5819 do_vec_SSHR_USHR (sim_cpu *cpu)
5820 {
5821 /* instr [31] = 0
5822 instr [30] = half(0)/full(1)
5823 instr [29] = signed(0)/unsigned(1)
5824 instr [28,23] = 0 1111 0
5825 instr [22,16] = size and shift amount
5826 instr [15,10] = 0000 01
5827 instr [9, 5] = Vs
5828 instr [4, 0] = Vd. */
5829
5830 int full = INSTR (30, 30);
5831 int sign = ! INSTR (29, 29);
5832 unsigned shift = INSTR (22, 16);
5833 unsigned vs = INSTR (9, 5);
5834 unsigned vd = INSTR (4, 0);
5835 unsigned i;
5836
5837 NYI_assert (28, 23, 0x1E);
5838 NYI_assert (15, 10, 0x01);
5839
5840 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
5841 if (INSTR (22, 22))
5842 {
5843 shift = 128 - shift;
5844
5845 if (full == 0)
5846 HALT_UNALLOC;
5847
5848 if (sign)
5849 for (i = 0; i < 2; i++)
5850 {
5851 int64_t val = aarch64_get_vec_s64 (cpu, vs, i);
5852 aarch64_set_vec_s64 (cpu, vd, i, val >> shift);
5853 }
5854 else
5855 for (i = 0; i < 2; i++)
5856 {
5857 uint64_t val = aarch64_get_vec_u64 (cpu, vs, i);
5858 aarch64_set_vec_u64 (cpu, vd, i, val >> shift);
5859 }
5860
5861 return;
5862 }
5863
5864 if (INSTR (21, 21))
5865 {
5866 shift = 64 - shift;
5867
5868 if (sign)
5869 for (i = 0; i < (full ? 4 : 2); i++)
5870 {
5871 int32_t val = aarch64_get_vec_s32 (cpu, vs, i);
5872 aarch64_set_vec_s32 (cpu, vd, i, val >> shift);
5873 }
5874 else
5875 for (i = 0; i < (full ? 4 : 2); i++)
5876 {
5877 uint32_t val = aarch64_get_vec_u32 (cpu, vs, i);
5878 aarch64_set_vec_u32 (cpu, vd, i, val >> shift);
5879 }
5880
5881 return;
5882 }
5883
5884 if (INSTR (20, 20))
5885 {
5886 shift = 32 - shift;
5887
5888 if (sign)
5889 for (i = 0; i < (full ? 8 : 4); i++)
5890 {
5891 int16_t val = aarch64_get_vec_s16 (cpu, vs, i);
5892 aarch64_set_vec_s16 (cpu, vd, i, val >> shift);
5893 }
5894 else
5895 for (i = 0; i < (full ? 8 : 4); i++)
5896 {
5897 uint16_t val = aarch64_get_vec_u16 (cpu, vs, i);
5898 aarch64_set_vec_u16 (cpu, vd, i, val >> shift);
5899 }
5900
5901 return;
5902 }
5903
5904 if (INSTR (19, 19) == 0)
5905 HALT_UNALLOC;
5906
5907 shift = 16 - shift;
5908
5909 if (sign)
5910 for (i = 0; i < (full ? 16 : 8); i++)
5911 {
5912 int8_t val = aarch64_get_vec_s8 (cpu, vs, i);
5913 aarch64_set_vec_s8 (cpu, vd, i, val >> shift);
5914 }
5915 else
5916 for (i = 0; i < (full ? 16 : 8); i++)
5917 {
5918 uint8_t val = aarch64_get_vec_u8 (cpu, vs, i);
5919 aarch64_set_vec_u8 (cpu, vd, i, val >> shift);
5920 }
5921 }
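/* Right-shift decode example: here the field value is subtracted
   from twice the lane size, so bits [22,16] = 0b0100001 (33) with
   bit 21 set select 32-bit lanes and a shift of 64 - 33 = 31.  */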
5922
5923 static void
5924 do_vec_MUL_by_element (sim_cpu *cpu)
5925 {
5926 /* instr[31] = 0
5927 instr[30] = half/full
5928 instr[29,24] = 00 1111
5929 instr[23,22] = size
5930 instr[21] = L
5931 instr[20] = M
5932 instr[19,16] = m
5933 instr[15,12] = 1000
5934 instr[11] = H
5935 instr[10] = 0
5936 instr[9,5] = Vn
5937 instr[4,0] = Vd */
5938
5939 unsigned full = INSTR (30, 30);
5940 unsigned L = INSTR (21, 21);
5941 unsigned H = INSTR (11, 11);
5942 unsigned vn = INSTR (9, 5);
5943 unsigned vd = INSTR (4, 0);
5944 unsigned size = INSTR (23, 22);
5945 unsigned index;
5946 unsigned vm;
5947 unsigned e;
5948
5949 NYI_assert (29, 24, 0x0F);
5950 NYI_assert (15, 12, 0x8);
5951 NYI_assert (10, 10, 0);
5952
5953 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
5954 switch (size)
5955 {
5956 case 1:
5957 {
5958 /* 16 bit products. */
5959 uint16_t product;
5960 uint16_t element1;
5961 uint16_t element2;
5962
5963 index = (H << 2) | (L << 1) | INSTR (20, 20);
5964 vm = INSTR (19, 16);
5965 element2 = aarch64_get_vec_u16 (cpu, vm, index);
5966
5967 for (e = 0; e < (full ? 8 : 4); e ++)
5968 {
5969 element1 = aarch64_get_vec_u16 (cpu, vn, e);
5970 product = element1 * element2;
5971 aarch64_set_vec_u16 (cpu, vd, e, product);
5972 }
5973 }
5974 break;
5975
5976 case 2:
5977 {
5978 /* 32 bit products. */
5979 uint32_t product;
5980 uint32_t element1;
5981 uint32_t element2;
5982
5983 index = (H << 1) | L;
5984 vm = INSTR (20, 16);
5985 element2 = aarch64_get_vec_u32 (cpu, vm, index);
5986
5987 for (e = 0; e < (full ? 4 : 2); e ++)
5988 {
5989 element1 = aarch64_get_vec_u32 (cpu, vn, e);
5990 product = element1 * element2;
5991 aarch64_set_vec_u32 (cpu, vd, e, product);
5992 }
5993 }
5994 break;
5995
5996 default:
5997 HALT_UNALLOC;
5998 }
5999 }
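/* Index example for the 16-bit case above: the element number is
   H:L:M, so with H = 1, L = 0 and M = 1 every vn.h lane is
   multiplied by vm.h[5], where vm names one of the first sixteen
   vector registers.  */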
6000
6001 static void
6002 do_FMLA_by_element (sim_cpu *cpu)
6003 {
6004 /* instr[31] = 0
6005 instr[30] = half/full
6006 instr[29,23] = 00 1111 1
6007 instr[22] = size
6008 instr[21] = L
6009 instr[20,16] = m
6010 instr[15,12] = 0001
6011 instr[11] = H
6012 instr[10] = 0
6013 instr[9,5] = Vn
6014 instr[4,0] = Vd */
6015
6016 unsigned full = INSTR (30, 30);
6017 unsigned size = INSTR (22, 22);
6018 unsigned L = INSTR (21, 21);
6019 unsigned vm = INSTR (20, 16);
6020 unsigned H = INSTR (11, 11);
6021 unsigned vn = INSTR (9, 5);
6022 unsigned vd = INSTR (4, 0);
6023 unsigned e;
6024
6025 NYI_assert (29, 23, 0x1F);
6026 NYI_assert (15, 12, 0x1);
6027 NYI_assert (10, 10, 0);
6028
6029 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
6030 if (size)
6031 {
6032 double element1, element2;
6033
6034 if (! full || L)
6035 HALT_UNALLOC;
6036
6037 element2 = aarch64_get_vec_double (cpu, vm, H);
6038
6039 for (e = 0; e < 2; e++)
6040 {
6041 element1 = aarch64_get_vec_double (cpu, vn, e);
6042 element1 *= element2;
6043 element1 += aarch64_get_vec_double (cpu, vd, e);
6044 aarch64_set_vec_double (cpu, vd, e, element1);
6045 }
6046 }
6047 else
6048 {
6049 float element1;
6050 float element2 = aarch64_get_vec_float (cpu, vm, (H << 1) | L);
6051
6052 for (e = 0; e < (full ? 4 : 2); e++)
6053 {
6054 element1 = aarch64_get_vec_float (cpu, vn, e);
6055 element1 *= element2;
6056 element1 += aarch64_get_vec_float (cpu, vd, e);
6057 aarch64_set_vec_float (cpu, vd, e, element1);
6058 }
6059 }
6060 }
6061
6062 static void
6063 do_vec_op2 (sim_cpu *cpu)
6064 {
6065 /* instr[31] = 0
6066 instr[30] = half/full
6067 instr[29,24] = 00 1111
6068 instr[23] = ?
6069 instr[22,16] = element size & index
6070 instr[15,10] = sub-opcode
6071 instr[9,5] = Vm
6072 instr[4,0] = Vd */
6073
6074 NYI_assert (29, 24, 0x0F);
6075
6076 if (INSTR (23, 23) != 0)
6077 {
6078 switch (INSTR (15, 10))
6079 {
6080 case 0x04:
6081 case 0x06:
6082 do_FMLA_by_element (cpu);
6083 return;
6084
6085 case 0x20:
6086 case 0x22:
6087 do_vec_MUL_by_element (cpu);
6088 return;
6089
6090 default:
6091 HALT_NYI;
6092 }
6093 }
6094 else
6095 {
6096 switch (INSTR (15, 10))
6097 {
6098 case 0x01: do_vec_SSHR_USHR (cpu); return;
6099 case 0x15: do_vec_SHL (cpu); return;
6100 case 0x20:
6101 case 0x22: do_vec_MUL_by_element (cpu); return;
6102 case 0x29: do_vec_xtl (cpu); return;
6103 default: HALT_NYI;
6104 }
6105 }
6106 }
6107
6108 static void
6109 do_vec_neg (sim_cpu *cpu)
6110 {
6111 /* instr[31] = 0
6112 instr[30] = full(1)/half(0)
6113 instr[29,24] = 10 1110
6114 instr[23,22] = size: byte(00), half (01), word (10), long (11)
6115 instr[21,10] = 1000 0010 1110
6116 instr[9,5] = Vs
6117 instr[4,0] = Vd */
6118
6119 int full = INSTR (30, 30);
6120 unsigned vs = INSTR (9, 5);
6121 unsigned vd = INSTR (4, 0);
6122 unsigned i;
6123
6124 NYI_assert (29, 24, 0x2E);
6125 NYI_assert (21, 10, 0x82E);
6126
6127 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
6128 switch (INSTR (23, 22))
6129 {
6130 case 0:
6131 for (i = 0; i < (full ? 16 : 8); i++)
6132 aarch64_set_vec_s8 (cpu, vd, i, - aarch64_get_vec_s8 (cpu, vs, i));
6133 return;
6134
6135 case 1:
6136 for (i = 0; i < (full ? 8 : 4); i++)
6137 aarch64_set_vec_s16 (cpu, vd, i, - aarch64_get_vec_s16 (cpu, vs, i));
6138 return;
6139
6140 case 2:
6141 for (i = 0; i < (full ? 4 : 2); i++)
6142 aarch64_set_vec_s32 (cpu, vd, i, - aarch64_get_vec_s32 (cpu, vs, i));
6143 return;
6144
6145 case 3:
6146 if (! full)
6147 HALT_UNALLOC;
6148 for (i = 0; i < 2; i++)
6149 aarch64_set_vec_s64 (cpu, vd, i, - aarch64_get_vec_s64 (cpu, vs, i));
6150 return;
6151 }
6152 }
6153
6154 static void
6155 do_vec_sqrt (sim_cpu *cpu)
6156 {
6157 /* instr[31] = 0
6158 instr[30] = full(1)/half(0)
6159 instr[29,23] = 101 1101
6160 instr[22] = single(0)/double(1)
6161 instr[21,10] = 1000 0111 1110
6162 instr[9,5] = Vs
6163 instr[4,0] = Vd. */
6164
6165 int full = INSTR (30, 30);
6166 unsigned vs = INSTR (9, 5);
6167 unsigned vd = INSTR (4, 0);
6168 unsigned i;
6169
6170 NYI_assert (29, 23, 0x5B);
6171 NYI_assert (21, 10, 0x87E);
6172
6173 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
6174 if (INSTR (22, 22) == 0)
6175 for (i = 0; i < (full ? 4 : 2); i++)
6176 aarch64_set_vec_float (cpu, vd, i,
6177 sqrtf (aarch64_get_vec_float (cpu, vs, i)));
6178 else
6179 for (i = 0; i < 2; i++)
6180 aarch64_set_vec_double (cpu, vd, i,
6181 sqrt (aarch64_get_vec_double (cpu, vs, i)));
6182 }
6183
6184 static void
6185 do_vec_mls_indexed (sim_cpu *cpu)
6186 {
6187 /* instr[31] = 0
6188 instr[30] = half(0)/full(1)
6189 instr[29,24] = 10 1111
6190 instr[23,22] = 16-bit(01)/32-bit(10)
6191 instr[11,21,20] = index H:L:M (if 16-bit)
6192 instr[11,21] = index H:L (if 32-bit)
6193 instr[20,16] = Vm ([19,16] if 16-bit)
6194 instr[15,12] = 0100
6195 instr[11] = part of index
6196 instr[10] = 0
6197 instr[9,5] = Vs
6198 instr[4,0] = Vd. */
6199
6200 int full = INSTR (30, 30);
6201 unsigned vs = INSTR (9, 5);
6202 unsigned vd = INSTR (4, 0);
6203 unsigned vm = INSTR (20, 16);
6204 unsigned i;
6205
6206 NYI_assert (15, 12, 4);
6207 NYI_assert (10, 10, 0);
6208
6209 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
6210 switch (INSTR (23, 22))
6211 {
6212 case 1:
6213 {
6214 unsigned elem;
6215 uint16_t val;
6216
6217 /* Bit 20 is the M index bit here, so only the low four
6218 bits of the vm field name the register. */
6219 elem = (INSTR (11, 11) << 2) | (INSTR (21, 21) << 1)
6220 | INSTR (20, 20);
6221 val = aarch64_get_vec_u16 (cpu, vm & 0xF, elem);
6222
6223 for (i = 0; i < (full ? 8 : 4); i++)
6224 aarch64_set_vec_u16 (cpu, vd, i,
6225 aarch64_get_vec_u16 (cpu, vd, i) -
6226 (aarch64_get_vec_u16 (cpu, vs, i) * val));
6227 return;
6228 }
6229
6230 case 2:
6231 {
6232 unsigned elem = (INSTR (11, 11) << 1) | INSTR (21, 21);
6233 uint32_t val = aarch64_get_vec_u32 (cpu, vm, elem);
6234
6235 for (i = 0; i < (full ? 4 : 2); i++)
6236 aarch64_set_vec_u32 (cpu, vd, i,
6237 aarch64_get_vec_u32 (cpu, vd, i) -
6238 (aarch64_get_vec_u32 (cpu, vs, i) * val));
6239 return;
6240 }
6241
6242 case 0:
6243 case 3:
6244 default:
6245 HALT_NYI;
6246 }
6247 }
6248
6249 static void
6250 do_vec_SUB (sim_cpu *cpu)
6251 {
6252 /* instr [31] = 0
6253 instr [30] = half(0)/full(1)
6254 instr [29,24] = 10 1110
6255 instr [23,22] = size: byte(00), half(01), word (10), long (11)
6256 instr [21] = 1
6257 instr [20,16] = Vm
6258 instr [15,10] = 10 0001
6259 instr [9, 5] = Vn
6260 instr [4, 0] = Vd. */
6261
6262 unsigned full = INSTR (30, 30);
6263 unsigned vm = INSTR (20, 16);
6264 unsigned vn = INSTR (9, 5);
6265 unsigned vd = INSTR (4, 0);
6266 unsigned i;
6267
6268 NYI_assert (29, 24, 0x2E);
6269 NYI_assert (21, 21, 1);
6270 NYI_assert (15, 10, 0x21);
6271
6272 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
6273 switch (INSTR (23, 22))
6274 {
6275 case 0:
6276 for (i = 0; i < (full ? 16 : 8); i++)
6277 aarch64_set_vec_s8 (cpu, vd, i,
6278 aarch64_get_vec_s8 (cpu, vn, i)
6279 - aarch64_get_vec_s8 (cpu, vm, i));
6280 return;
6281
6282 case 1:
6283 for (i = 0; i < (full ? 8 : 4); i++)
6284 aarch64_set_vec_s16 (cpu, vd, i,
6285 aarch64_get_vec_s16 (cpu, vn, i)
6286 - aarch64_get_vec_s16 (cpu, vm, i));
6287 return;
6288
6289 case 2:
6290 for (i = 0; i < (full ? 4 : 2); i++)
6291 aarch64_set_vec_s32 (cpu, vd, i,
6292 aarch64_get_vec_s32 (cpu, vn, i)
6293 - aarch64_get_vec_s32 (cpu, vm, i));
6294 return;
6295
6296 case 3:
6297 if (full == 0)
6298 HALT_UNALLOC;
6299
6300 for (i = 0; i < 2; i++)
6301 aarch64_set_vec_s64 (cpu, vd, i,
6302 aarch64_get_vec_s64 (cpu, vn, i)
6303 - aarch64_get_vec_s64 (cpu, vm, i));
6304 return;
6305 }
6306 }
6307
6308 static void
6309 do_vec_MLS (sim_cpu *cpu)
6310 {
6311 /* instr [31] = 0
6312 instr [30] = half(0)/full(1)
6313 instr [29,24] = 10 1110
6314 instr [23,22] = size: byte(00), half(01), word (10)
6315 instr [21] = 1
6316 instr [20,16] = Vm
6317 instr [15,10] = 10 0101
6318 instr [9, 5] = Vn
6319 instr [4, 0] = Vd. */
6320
6321 unsigned full = INSTR (30, 30);
6322 unsigned vm = INSTR (20, 16);
6323 unsigned vn = INSTR (9, 5);
6324 unsigned vd = INSTR (4, 0);
6325 unsigned i;
6326
6327 NYI_assert (29, 24, 0x2E);
6328 NYI_assert (21, 21, 1);
6329 NYI_assert (15, 10, 0x25);
6330
6331 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
6332 switch (INSTR (23, 22))
6333 {
6334 case 0:
6335 for (i = 0; i < (full ? 16 : 8); i++)
6336 aarch64_set_vec_u8 (cpu, vd, i,
6337 aarch64_get_vec_u8 (cpu, vd, i)
6338 - (aarch64_get_vec_u8 (cpu, vn, i)
6339 * aarch64_get_vec_u8 (cpu, vm, i)));
6340 return;
6341
6342 case 1:
6343 for (i = 0; i < (full ? 8 : 4); i++)
6344 aarch64_set_vec_u16 (cpu, vd, i,
6345 aarch64_get_vec_u16 (cpu, vd, i)
6346 - (aarch64_get_vec_u16 (cpu, vn, i)
6347 * aarch64_get_vec_u16 (cpu, vm, i)));
6348 return;
6349
6350 case 2:
6351 for (i = 0; i < (full ? 4 : 2); i++)
6352 aarch64_set_vec_u32 (cpu, vd, i,
6353 aarch64_get_vec_u32 (cpu, vd, i)
6354 - (aarch64_get_vec_u32 (cpu, vn, i)
6355 * aarch64_get_vec_u32 (cpu, vm, i)));
6356 return;
6357
6358 default:
6359 HALT_UNALLOC;
6360 }
6361 }
6362
6363 static void
6364 do_vec_FDIV (sim_cpu *cpu)
6365 {
6366 /* instr [31] = 0
6367 instr [30] = half(0)/full(1)
6368 instr [29,23] = 10 1110 0
6369 instr [22] = float(0)/double(1)
6370 instr [21] = 1
6371 instr [20,16] = Vm
6372 instr [15,10] = 1111 11
6373 instr [9, 5] = Vn
6374 instr [4, 0] = Vd. */
6375
6376 unsigned full = INSTR (30, 30);
6377 unsigned vm = INSTR (20, 16);
6378 unsigned vn = INSTR (9, 5);
6379 unsigned vd = INSTR (4, 0);
6380 unsigned i;
6381
6382 NYI_assert (29, 23, 0x5C);
6383 NYI_assert (21, 21, 1);
6384 NYI_assert (15, 10, 0x3F);
6385
6386 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
6387 if (INSTR (22, 22))
6388 {
6389 if (! full)
6390 HALT_UNALLOC;
6391
6392 for (i = 0; i < 2; i++)
6393 aarch64_set_vec_double (cpu, vd, i,
6394 aarch64_get_vec_double (cpu, vn, i)
6395 / aarch64_get_vec_double (cpu, vm, i));
6396 }
6397 else
6398 for (i = 0; i < (full ? 4 : 2); i++)
6399 aarch64_set_vec_float (cpu, vd, i,
6400 aarch64_get_vec_float (cpu, vn, i)
6401 / aarch64_get_vec_float (cpu, vm, i));
6402 }
6403
6404 static void
6405 do_vec_FMUL (sim_cpu *cpu)
6406 {
6407 /* instr [31] = 0
6408 instr [30] = half(0)/full(1)
6409 instr [29,23] = 10 1110 0
6410 instr [22] = float(0)/double(1)
6411 instr [21] = 1
6412 instr [20,16] = Vm
6413 instr [15,10] = 1101 11
6414 instr [9, 5] = Vn
6415 instr [4, 0] = Vd. */
6416
6417 unsigned full = INSTR (30, 30);
6418 unsigned vm = INSTR (20, 16);
6419 unsigned vn = INSTR (9, 5);
6420 unsigned vd = INSTR (4, 0);
6421 unsigned i;
6422
6423 NYI_assert (29, 23, 0x5C);
6424 NYI_assert (21, 21, 1);
6425 NYI_assert (15, 10, 0x37);
6426
6427 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
6428 if (INSTR (22, 22))
6429 {
6430 if (! full)
6431 HALT_UNALLOC;
6432
6433 for (i = 0; i < 2; i++)
6434 aarch64_set_vec_double (cpu, vd, i,
6435 aarch64_get_vec_double (cpu, vn, i)
6436 * aarch64_get_vec_double (cpu, vm, i));
6437 }
6438 else
6439 for (i = 0; i < (full ? 4 : 2); i++)
6440 aarch64_set_vec_float (cpu, vd, i,
6441 aarch64_get_vec_float (cpu, vn, i)
6442 * aarch64_get_vec_float (cpu, vm, i));
6443 }
6444
6445 static void
6446 do_vec_FADDP (sim_cpu *cpu)
6447 {
6448 /* instr [31] = 0
6449 instr [30] = half(0)/full(1)
6450 instr [29,23] = 10 1110 0
6451 instr [22] = float(0)/double(1)
6452 instr [21] = 1
6453 instr [20,16] = Vm
6454 instr [15,10] = 1101 01
6455 instr [9, 5] = Vn
6456 instr [4, 0] = Vd. */
6457
6458 unsigned full = INSTR (30, 30);
6459 unsigned vm = INSTR (20, 16);
6460 unsigned vn = INSTR (9, 5);
6461 unsigned vd = INSTR (4, 0);
6462
6463 NYI_assert (29, 23, 0x5C);
6464 NYI_assert (21, 21, 1);
6465 NYI_assert (15, 10, 0x35);
6466
6467 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
6468 if (INSTR (22, 22))
6469 {
6470 /* Extract values before adding them in case vd == vn/vm. */
6471 double tmp1 = aarch64_get_vec_double (cpu, vn, 0);
6472 double tmp2 = aarch64_get_vec_double (cpu, vn, 1);
6473 double tmp3 = aarch64_get_vec_double (cpu, vm, 0);
6474 double tmp4 = aarch64_get_vec_double (cpu, vm, 1);
6475
6476 if (! full)
6477 HALT_UNALLOC;
6478
6479 aarch64_set_vec_double (cpu, vd, 0, tmp1 + tmp2);
6480 aarch64_set_vec_double (cpu, vd, 1, tmp3 + tmp4);
6481 }
6482 else
6483 {
6484 /* Extract values before adding them in case vd == vn/vm. */
6485 float tmp1 = aarch64_get_vec_float (cpu, vn, 0);
6486 float tmp2 = aarch64_get_vec_float (cpu, vn, 1);
6487 float tmp5 = aarch64_get_vec_float (cpu, vm, 0);
6488 float tmp6 = aarch64_get_vec_float (cpu, vm, 1);
6489
6490 if (full)
6491 {
6492 float tmp3 = aarch64_get_vec_float (cpu, vn, 2);
6493 float tmp4 = aarch64_get_vec_float (cpu, vn, 3);
6494 float tmp7 = aarch64_get_vec_float (cpu, vm, 2);
6495 float tmp8 = aarch64_get_vec_float (cpu, vm, 3);
6496
6497 aarch64_set_vec_float (cpu, vd, 0, tmp1 + tmp2);
6498 aarch64_set_vec_float (cpu, vd, 1, tmp3 + tmp4);
6499 aarch64_set_vec_float (cpu, vd, 2, tmp5 + tmp6);
6500 aarch64_set_vec_float (cpu, vd, 3, tmp7 + tmp8);
6501 }
6502 else
6503 {
6504 aarch64_set_vec_float (cpu, vd, 0, tmp1 + tmp2);
6505 aarch64_set_vec_float (cpu, vd, 1, tmp5 + tmp6);
6506 }
6507 }
6508 }
6509
6510 static void
6511 do_vec_FSQRT (sim_cpu *cpu)
6512 {
6513 /* instr[31] = 0
6514 instr[30] = half(0)/full(1)
6515 instr[29,23] = 10 1110 1
6516 instr[22] = single(0)/double(1)
6517 instr[21,10] = 10 0001 1111 10
6518 instr[9,5] = Vsrc
6519 instr[4,0] = Vdest. */
6520
6521 unsigned vn = INSTR (9, 5);
6522 unsigned vd = INSTR (4, 0);
6523 unsigned full = INSTR (30, 30);
6524 int i;
6525
6526 NYI_assert (29, 23, 0x5D);
6527 NYI_assert (21, 10, 0x87E);
6528
6529 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
6530 if (INSTR (22, 22))
6531 {
6532 if (! full)
6533 HALT_UNALLOC;
6534
6535 for (i = 0; i < 2; i++)
6536 aarch64_set_vec_double (cpu, vd, i,
6537 sqrt (aarch64_get_vec_double (cpu, vn, i)));
6538 }
6539 else
6540 {
6541 for (i = 0; i < (full ? 4 : 2); i++)
6542 aarch64_set_vec_float (cpu, vd, i,
6543 sqrtf (aarch64_get_vec_float (cpu, vn, i)));
6544 }
6545 }
6546
6547 static void
6548 do_vec_FNEG (sim_cpu *cpu)
6549 {
6550 /* instr[31] = 0
6551 instr[30] = half (0)/full (1)
6552 instr[29,23] = 10 1110 1
6553 instr[22] = single (0)/double (1)
6554 instr[21,10] = 10 0000 1111 10
6555 instr[9,5] = Vsrc
6556 instr[4,0] = Vdest. */
6557
6558 unsigned vn = INSTR (9, 5);
6559 unsigned vd = INSTR (4, 0);
6560 unsigned full = INSTR (30, 30);
6561 int i;
6562
6563 NYI_assert (29, 23, 0x5D);
6564 NYI_assert (21, 10, 0x83E);
6565
6566 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
6567 if (INSTR (22, 22))
6568 {
6569 if (! full)
6570 HALT_UNALLOC;
6571
6572 for (i = 0; i < 2; i++)
6573 aarch64_set_vec_double (cpu, vd, i,
6574 - aarch64_get_vec_double (cpu, vn, i));
6575 }
6576 else
6577 {
6578 for (i = 0; i < (full ? 4 : 2); i++)
6579 aarch64_set_vec_float (cpu, vd, i,
6580 - aarch64_get_vec_float (cpu, vn, i));
6581 }
6582 }
6583
6584 static void
6585 do_vec_NOT (sim_cpu *cpu)
6586 {
6587 /* instr[31] = 0
6588 instr[30] = half (0)/full (1)
6589 instr[29,10] = 10 1110 0010 0000 0101 10
6590 instr[9,5] = Vn
6591 instr[4,0] = Vd. */
6592
6593 unsigned vn = INSTR (9, 5);
6594 unsigned vd = INSTR (4, 0);
6595 unsigned i;
6596 int full = INSTR (30, 30);
6597
6598 NYI_assert (29, 10, 0xB8816);
6599
6600 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
6601 for (i = 0; i < (full ? 16 : 8); i++)
6602 aarch64_set_vec_u8 (cpu, vd, i, ~ aarch64_get_vec_u8 (cpu, vn, i));
6603 }
6604
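/* Count the leading zero bits in the low SIZE bits of VAL, e.g.
   clz (0x1, 8) == 7 and clz (0x80, 8) == 0.  If VAL is zero the
   mask is exhausted and SIZE is returned.  */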
6605 static unsigned int
6606 clz (uint64_t val, unsigned size)
6607 {
6608 uint64_t mask = 1;
6609 int count;
6610
6611 mask <<= (size - 1);
6612 count = 0;
6613 do
6614 {
6615 if (val & mask)
6616 break;
6617 mask >>= 1;
6618 count ++;
6619 }
6620 while (mask);
6621
6622 return count;
6623 }
6624
6625 static void
6626 do_vec_CLZ (sim_cpu *cpu)
6627 {
6628 /* instr[31] = 0
6629 instr[30] = half (0)/full (1)
6630 instr[29,24] = 10 1110
6631 instr[23,22] = size
6632 instr[21,10] = 10 0000 0100 10
6633 instr[9,5] = Vn
6634 instr[4,0] = Vd. */
6635
6636 unsigned vn = INSTR (9, 5);
6637 unsigned vd = INSTR (4, 0);
6638 unsigned i;
6639 int full = INSTR (30,30);
6640
6641 NYI_assert (29, 24, 0x2E);
6642 NYI_assert (21, 10, 0x812);
6643
6644 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
6645 switch (INSTR (23, 22))
6646 {
6647 case 0:
6648 for (i = 0; i < (full ? 16 : 8); i++)
6649 aarch64_set_vec_u8 (cpu, vd, i, clz (aarch64_get_vec_u8 (cpu, vn, i), 8));
6650 break;
6651 case 1:
6652 for (i = 0; i < (full ? 8 : 4); i++)
6653 aarch64_set_vec_u16 (cpu, vd, i, clz (aarch64_get_vec_u16 (cpu, vn, i), 16));
6654 break;
6655 case 2:
6656 for (i = 0; i < (full ? 4 : 2); i++)
6657 aarch64_set_vec_u32 (cpu, vd, i, clz (aarch64_get_vec_u32 (cpu, vn, i), 32));
6658 break;
6659 case 3:
6660 if (! full)
6661 HALT_UNALLOC;
6662 aarch64_set_vec_u64 (cpu, vd, 0, clz (aarch64_get_vec_u64 (cpu, vn, 0), 64));
6663 aarch64_set_vec_u64 (cpu, vd, 1, clz (aarch64_get_vec_u64 (cpu, vn, 1), 64));
6664 break;
6665 }
6666 }
6667
6668 static void
6669 do_vec_MOV_element (sim_cpu *cpu)
6670 {
6671 /* instr[31,21] = 0110 1110 000
6672 instr[20,16] = size & dest index
6673 instr[15] = 0
6674 instr[14,11] = source index
6675 instr[10] = 1
6676 instr[9,5] = Vs
6677 instr[4,0] = Vd. */
6678
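/* The lowest set bit of instr[20,16] selects the element size:
   xxxx1 ==> byte, xxx10 ==> half, xx100 ==> word, x1000 ==> double.  */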
6679 unsigned vs = INSTR (9, 5);
6680 unsigned vd = INSTR (4, 0);
6681 unsigned src_index;
6682 unsigned dst_index;
6683
6684 NYI_assert (31, 21, 0x370);
6685 NYI_assert (15, 15, 0);
6686 NYI_assert (10, 10, 1);
6687
6688 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
6689 if (INSTR (16, 16))
6690 {
6691 /* Move a byte. */
6692 src_index = INSTR (14, 11);
6693 dst_index = INSTR (20, 17);
6694 aarch64_set_vec_u8 (cpu, vd, dst_index,
6695 aarch64_get_vec_u8 (cpu, vs, src_index));
6696 }
6697 else if (INSTR (17, 17))
6698 {
6699 /* Move 16-bits. */
6700 NYI_assert (11, 11, 0);
6701 src_index = INSTR (14, 12);
6702 dst_index = INSTR (20, 18);
6703 aarch64_set_vec_u16 (cpu, vd, dst_index,
6704 aarch64_get_vec_u16 (cpu, vs, src_index));
6705 }
6706 else if (INSTR (18, 18))
6707 {
6708 /* Move 32-bits. */
6709 NYI_assert (12, 11, 0);
6710 src_index = INSTR (14, 13);
6711 dst_index = INSTR (20, 19);
6712 aarch64_set_vec_u32 (cpu, vd, dst_index,
6713 aarch64_get_vec_u32 (cpu, vs, src_index));
6714 }
6715 else
6716 {
6717 NYI_assert (19, 19, 1);
6718 NYI_assert (13, 11, 0);
6719 src_index = INSTR (14, 14);
6720 dst_index = INSTR (20, 20);
6721 aarch64_set_vec_u64 (cpu, vd, dst_index,
6722 aarch64_get_vec_u64 (cpu, vs, src_index));
6723 }
6724 }
6725
6726 static void
6727 do_vec_REV32 (sim_cpu *cpu)
6728 {
6729 /* instr[31] = 0
6730 instr[30] = full/half
6731 instr[29,24] = 10 1110
6732 instr[23,22] = size
6733 instr[21,10] = 10 0000 0000 10
6734 instr[9,5] = Rn
6735 instr[4,0] = Rd. */
6736
6737 unsigned rn = INSTR (9, 5);
6738 unsigned rd = INSTR (4, 0);
6739 unsigned size = INSTR (23, 22);
6740 unsigned full = INSTR (30, 30);
6741 unsigned i;
6742 FRegister val;
6743
6744 NYI_assert (29, 24, 0x2E);
6745 NYI_assert (21, 10, 0x802);
6746
6747 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
6748 switch (size)
6749 {
6750 case 0:
6751 for (i = 0; i < (full ? 16 : 8); i++)
6752 val.b[i ^ 0x3] = aarch64_get_vec_u8 (cpu, rn, i);
6753 break;
6754
6755 case 1:
6756 for (i = 0; i < (full ? 8 : 4); i++)
6757 val.h[i ^ 0x1] = aarch64_get_vec_u16 (cpu, rn, i);
6758 break;
6759
6760 default:
6761 HALT_UNALLOC;
6762 }
6763
6764 aarch64_set_vec_u64 (cpu, rd, 0, val.v[0]);
6765 if (full)
6766 aarch64_set_vec_u64 (cpu, rd, 1, val.v[1]);
6767 }
6768
6769 static void
6770 do_vec_EXT (sim_cpu *cpu)
6771 {
6772 /* instr[31] = 0
6773 instr[30] = full/half
6774 instr[29,21] = 10 1110 000
6775 instr[20,16] = Vm
6776 instr[15] = 0
6777 instr[14,11] = source index
6778 instr[10] = 0
6779 instr[9,5] = Vn
6780 instr[4,0] = Vd. */
6781
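/* For example, EXT Vd.16B, Vn.16B, Vm.16B, #3 places bytes 3..15 of
   Vn in Vd[0..12], followed by bytes 0..2 of Vm in Vd[13..15].  */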
6782 unsigned vm = INSTR (20, 16);
6783 unsigned vn = INSTR (9, 5);
6784 unsigned vd = INSTR (4, 0);
6785 unsigned src_index = INSTR (14, 11);
6786 unsigned full = INSTR (30, 30);
6787 unsigned i;
6788 unsigned j;
6789 FRegister val;
6790
6791 NYI_assert (31, 21, 0x370);
6792 NYI_assert (15, 15, 0);
6793 NYI_assert (10, 10, 0);
6794
6795 if (!full && (src_index & 0x8))
6796 HALT_UNALLOC;
6797
6798 j = 0;
6799
6800 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
6801 for (i = src_index; i < (full ? 16 : 8); i++)
6802 val.b[j ++] = aarch64_get_vec_u8 (cpu, vn, i);
6803 for (i = 0; i < src_index; i++)
6804 val.b[j ++] = aarch64_get_vec_u8 (cpu, vm, i);
6805
6806 aarch64_set_vec_u64 (cpu, vd, 0, val.v[0]);
6807 if (full)
6808 aarch64_set_vec_u64 (cpu, vd, 1, val.v[1]);
6809 }
6810
6811 static void
6812 dexAdvSIMD0 (sim_cpu *cpu)
6813 {
6814 /* instr [28,25] = 0 111. */
6815 if (INSTR (15, 10) == 0x07
6816 && INSTR (9, 5) == INSTR (20, 16))
6818 {
6819 if (INSTR (31, 21) == 0x075
6820 || INSTR (31, 21) == 0x275)
6821 {
6822 do_vec_MOV_whole_vector (cpu);
6823 return;
6824 }
6825 }
6826
6827 if (INSTR (29, 19) == 0x1E0)
6828 {
6829 do_vec_MOV_immediate (cpu);
6830 return;
6831 }
6832
6833 if (INSTR (29, 19) == 0x5E0)
6834 {
6835 do_vec_MVNI (cpu);
6836 return;
6837 }
6838
6839 if (INSTR (29, 19) == 0x1C0
6840 || INSTR (29, 19) == 0x1C1)
6841 {
6842 if (INSTR (15, 10) == 0x03)
6843 {
6844 do_vec_DUP_scalar_into_vector (cpu);
6845 return;
6846 }
6847 }
6848
6849 switch (INSTR (29, 24))
6850 {
6851 case 0x0E: do_vec_op1 (cpu); return;
6852 case 0x0F: do_vec_op2 (cpu); return;
6853
6854 case 0x2E:
6855 if (INSTR (21, 21) == 1)
6856 {
6857 switch (INSTR (15, 10))
6858 {
6859 case 0x02:
6860 do_vec_REV32 (cpu);
6861 return;
6862
6863 case 0x07:
6864 switch (INSTR (23, 22))
6865 {
6866 case 0: do_vec_EOR (cpu); return;
6867 case 1: do_vec_BSL (cpu); return;
6868 case 2:
6869 case 3: do_vec_bit (cpu); return;
6870 }
6871 break;
6872
6873 case 0x08: do_vec_sub_long (cpu); return;
6874 case 0x11: do_vec_USHL (cpu); return;
6875 case 0x12: do_vec_CLZ (cpu); return;
6876 case 0x16: do_vec_NOT (cpu); return;
6877 case 0x19: do_vec_max (cpu); return;
6878 case 0x1B: do_vec_min (cpu); return;
6879 case 0x21: do_vec_SUB (cpu); return;
6880 case 0x25: do_vec_MLS (cpu); return;
6881 case 0x31: do_vec_FminmaxNMP (cpu); return;
6882 case 0x35: do_vec_FADDP (cpu); return;
6883 case 0x37: do_vec_FMUL (cpu); return;
6884 case 0x3F: do_vec_FDIV (cpu); return;
6885
6886 case 0x3E:
6887 switch (INSTR (20, 16))
6888 {
6889 case 0x00: do_vec_FNEG (cpu); return;
6890 case 0x01: do_vec_FSQRT (cpu); return;
6891 default: HALT_NYI;
6892 }
6893
6894 case 0x0D:
6895 case 0x0F:
6896 case 0x22:
6897 case 0x23:
6898 case 0x26:
6899 case 0x2A:
6900 case 0x32:
6901 case 0x36:
6902 case 0x39:
6903 case 0x3A:
6904 do_vec_compare (cpu); return;
6905
6906 default:
6907 break;
6908 }
6909 }
6910
6911 if (INSTR (31, 21) == 0x370)
6912 {
6913 if (INSTR (10, 10))
6914 do_vec_MOV_element (cpu);
6915 else
6916 do_vec_EXT (cpu);
6917 return;
6918 }
6919
6920 switch (INSTR (21, 10))
6921 {
6922 case 0x82E: do_vec_neg (cpu); return;
6923 case 0x87E: do_vec_sqrt (cpu); return;
6924 default:
6925 if (INSTR (15, 10) == 0x30)
6926 {
6927 do_vec_mull (cpu);
6928 return;
6929 }
6930 break;
6931 }
6932 break;
6933
6934 case 0x2F:
6935 switch (INSTR (15, 10))
6936 {
6937 case 0x01: do_vec_SSHR_USHR (cpu); return;
6938 case 0x10:
6939 case 0x12: do_vec_mls_indexed (cpu); return;
6940 case 0x29: do_vec_xtl (cpu); return;
6941 default:
6942 HALT_NYI;
6943 }
6944
6945 default:
6946 break;
6947 }
6948
6949 HALT_NYI;
6950 }
6951
6952 /* 3 sources. */
6953
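/* Note: the FMADD family below is modelled as a separate host multiply
   followed by an add, so results can be double-rounded; a host compiler
   may contract the expression, but the C99 fma ()/fmaf () routines would
   be needed to guarantee fused semantics.  */
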
6954 /* Float multiply add. */
6955 static void
6956 fmadds (sim_cpu *cpu)
6957 {
6958 unsigned sa = INSTR (14, 10);
6959 unsigned sm = INSTR (20, 16);
6960 unsigned sn = INSTR ( 9, 5);
6961 unsigned sd = INSTR ( 4, 0);
6962
6963 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
6964 aarch64_set_FP_float (cpu, sd, aarch64_get_FP_float (cpu, sa)
6965 + aarch64_get_FP_float (cpu, sn)
6966 * aarch64_get_FP_float (cpu, sm));
6967 }
6968
6969 /* Double multiply add. */
6970 static void
6971 fmaddd (sim_cpu *cpu)
6972 {
6973 unsigned sa = INSTR (14, 10);
6974 unsigned sm = INSTR (20, 16);
6975 unsigned sn = INSTR ( 9, 5);
6976 unsigned sd = INSTR ( 4, 0);
6977
6978 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
6979 aarch64_set_FP_double (cpu, sd, aarch64_get_FP_double (cpu, sa)
6980 + aarch64_get_FP_double (cpu, sn)
6981 * aarch64_get_FP_double (cpu, sm));
6982 }
6983
6984 /* Float multiply subtract. */
6985 static void
6986 fmsubs (sim_cpu *cpu)
6987 {
6988 unsigned sa = INSTR (14, 10);
6989 unsigned sm = INSTR (20, 16);
6990 unsigned sn = INSTR ( 9, 5);
6991 unsigned sd = INSTR ( 4, 0);
6992
6993 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
6994 aarch64_set_FP_float (cpu, sd, aarch64_get_FP_float (cpu, sa)
6995 - aarch64_get_FP_float (cpu, sn)
6996 * aarch64_get_FP_float (cpu, sm));
6997 }
6998
6999 /* Double multiply subtract. */
7000 static void
7001 fmsubd (sim_cpu *cpu)
7002 {
7003 unsigned sa = INSTR (14, 10);
7004 unsigned sm = INSTR (20, 16);
7005 unsigned sn = INSTR ( 9, 5);
7006 unsigned sd = INSTR ( 4, 0);
7007
7008 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7009 aarch64_set_FP_double (cpu, sd, aarch64_get_FP_double (cpu, sa)
7010 - aarch64_get_FP_double (cpu, sn)
7011 * aarch64_get_FP_double (cpu, sm));
7012 }
7013
7014 /* Float negative multiply add. */
7015 static void
7016 fnmadds (sim_cpu *cpu)
7017 {
7018 unsigned sa = INSTR (14, 10);
7019 unsigned sm = INSTR (20, 16);
7020 unsigned sn = INSTR ( 9, 5);
7021 unsigned sd = INSTR ( 4, 0);
7022
7023 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7024 aarch64_set_FP_float (cpu, sd, - aarch64_get_FP_float (cpu, sa)
7025 + (- aarch64_get_FP_float (cpu, sn))
7026 * aarch64_get_FP_float (cpu, sm));
7027 }
7028
7029 /* Double negative multiply add. */
7030 static void
7031 fnmaddd (sim_cpu *cpu)
7032 {
7033 unsigned sa = INSTR (14, 10);
7034 unsigned sm = INSTR (20, 16);
7035 unsigned sn = INSTR ( 9, 5);
7036 unsigned sd = INSTR ( 4, 0);
7037
7038 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7039 aarch64_set_FP_double (cpu, sd, - aarch64_get_FP_double (cpu, sa)
7040 + (- aarch64_get_FP_double (cpu, sn))
7041 * aarch64_get_FP_double (cpu, sm));
7042 }
7043
7044 /* Float negative multiply subtract. */
7045 static void
7046 fnmsubs (sim_cpu *cpu)
7047 {
7048 unsigned sa = INSTR (14, 10);
7049 unsigned sm = INSTR (20, 16);
7050 unsigned sn = INSTR ( 9, 5);
7051 unsigned sd = INSTR ( 4, 0);
7052
7053 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7054 aarch64_set_FP_float (cpu, sd, - aarch64_get_FP_float (cpu, sa)
7055 + aarch64_get_FP_float (cpu, sn)
7056 * aarch64_get_FP_float (cpu, sm));
7057 }
7058
7059 /* Double negative multiply subtract. */
7060 static void
7061 fnmsubd (sim_cpu *cpu)
7062 {
7063 unsigned sa = INSTR (14, 10);
7064 unsigned sm = INSTR (20, 16);
7065 unsigned sn = INSTR ( 9, 5);
7066 unsigned sd = INSTR ( 4, 0);
7067
7068 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7069 aarch64_set_FP_double (cpu, sd, - aarch64_get_FP_double (cpu, sa)
7070 + aarch64_get_FP_double (cpu, sn)
7071 * aarch64_get_FP_double (cpu, sm));
7072 }
7073
7074 static void
7075 dexSimpleFPDataProc3Source (sim_cpu *cpu)
7076 {
7077 /* instr[31] ==> M : 0 ==> OK, 1 ==> UNALLOC
7078 instr[30] = 0
7079 instr[29] ==> S : 0 ==> OK, 1 ==> UNALLOC
7080 instr[28,25] = 1111
7081 instr[24] = 1
7082 instr[23,22] ==> type : 00 ==> single, 01 ==> double, 1x ==> UNALLOC
7083 instr[21] ==> o1 : 0 ==> unnegated, 1 ==> negated
7084 instr[15] ==> o2 : 0 ==> ADD, 1 ==> SUB */
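The combined type:o1:o2 dispatch below therefore selects, in order,
FMADD, FMSUB, FNMADD, FNMSUB on singles (0..3), then the same four
operations on doubles (4..7).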
7085
7086 uint32_t M_S = (INSTR (31, 31) << 1) | INSTR (29, 29);
7087 /* dispatch on combined type:o1:o2. */
7088 uint32_t dispatch = (INSTR (23, 21) << 1) | INSTR (15, 15);
7089
7090 if (M_S != 0)
7091 HALT_UNALLOC;
7092
7093 switch (dispatch)
7094 {
7095 case 0: fmadds (cpu); return;
7096 case 1: fmsubs (cpu); return;
7097 case 2: fnmadds (cpu); return;
7098 case 3: fnmsubs (cpu); return;
7099 case 4: fmaddd (cpu); return;
7100 case 5: fmsubd (cpu); return;
7101 case 6: fnmaddd (cpu); return;
7102 case 7: fnmsubd (cpu); return;
7103 default:
7104 /* type > 1 is currently unallocated. */
7105 HALT_UNALLOC;
7106 }
7107 }
7108
7109 static void
7110 dexSimpleFPFixedConvert (sim_cpu *cpu)
7111 {
7112 HALT_NYI;
7113 }
7114
7115 static void
7116 dexSimpleFPCondCompare (sim_cpu *cpu)
7117 {
7118 /* instr [31,23] = 0001 1110 0
7119 instr [22] = type
7120 instr [21] = 1
7121 instr [20,16] = Rm
7122 instr [15,12] = condition
7123 instr [11,10] = 01
7124 instr [9,5] = Rn
7125 instr [4] = 0
7126 instr [3,0] = nzcv */
7127
7128 unsigned rm = INSTR (20, 16);
7129 unsigned rn = INSTR (9, 5);
7130
7131 NYI_assert (31, 23, 0x3C);
7132 NYI_assert (11, 10, 0x1);
7133 NYI_assert (4, 4, 0);
7134
7135 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7136 if (! testConditionCode (cpu, INSTR (15, 12)))
7137 {
7138 aarch64_set_CPSR (cpu, INSTR (3, 0));
7139 return;
7140 }
7141
7142 if (INSTR (22, 22))
7143 {
7144 /* Double precision. */
7145 double val1 = aarch64_get_vec_double (cpu, rn, 0);
7146 double val2 = aarch64_get_vec_double (cpu, rm, 0);
7147
7148 /* FIXME: Check for NaNs. */
7149 if (val1 == val2)
7150 aarch64_set_CPSR (cpu, (Z | C));
7151 else if (val1 < val2)
7152 aarch64_set_CPSR (cpu, N);
7153 else /* val1 > val2 */
7154 aarch64_set_CPSR (cpu, C);
7155 }
7156 else
7157 {
7158 /* Single precision. */
7159 float val1 = aarch64_get_vec_float (cpu, rn, 0);
7160 float val2 = aarch64_get_vec_float (cpu, rm, 0);
7161
7162 /* FIXME: Check for NaNs. */
7163 if (val1 == val2)
7164 aarch64_set_CPSR (cpu, (Z | C));
7165 else if (val1 < val2)
7166 aarch64_set_CPSR (cpu, N);
7167 else /* val1 > val2 */
7168 aarch64_set_CPSR (cpu, C);
7169 }
7170 }
7171
7172 /* 2 sources. */
7173
7174 /* Float add. */
7175 static void
7176 fadds (sim_cpu *cpu)
7177 {
7178 unsigned sm = INSTR (20, 16);
7179 unsigned sn = INSTR ( 9, 5);
7180 unsigned sd = INSTR ( 4, 0);
7181
7182 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7183 aarch64_set_FP_float (cpu, sd, aarch64_get_FP_float (cpu, sn)
7184 + aarch64_get_FP_float (cpu, sm));
7185 }
7186
7187 /* Double add. */
7188 static void
7189 faddd (sim_cpu *cpu)
7190 {
7191 unsigned sm = INSTR (20, 16);
7192 unsigned sn = INSTR ( 9, 5);
7193 unsigned sd = INSTR ( 4, 0);
7194
7195 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7196 aarch64_set_FP_double (cpu, sd, aarch64_get_FP_double (cpu, sn)
7197 + aarch64_get_FP_double (cpu, sm));
7198 }
7199
7200 /* Float divide. */
7201 static void
7202 fdivs (sim_cpu *cpu)
7203 {
7204 unsigned sm = INSTR (20, 16);
7205 unsigned sn = INSTR ( 9, 5);
7206 unsigned sd = INSTR ( 4, 0);
7207
7208 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7209 aarch64_set_FP_float (cpu, sd, aarch64_get_FP_float (cpu, sn)
7210 / aarch64_get_FP_float (cpu, sm));
7211 }
7212
7213 /* Double divide. */
7214 static void
7215 fdivd (sim_cpu *cpu)
7216 {
7217 unsigned sm = INSTR (20, 16);
7218 unsigned sn = INSTR ( 9, 5);
7219 unsigned sd = INSTR ( 4, 0);
7220
7221 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7222 aarch64_set_FP_double (cpu, sd, aarch64_get_FP_double (cpu, sn)
7223 / aarch64_get_FP_double (cpu, sm));
7224 }
7225
7226 /* Float multiply. */
7227 static void
7228 fmuls (sim_cpu *cpu)
7229 {
7230 unsigned sm = INSTR (20, 16);
7231 unsigned sn = INSTR ( 9, 5);
7232 unsigned sd = INSTR ( 4, 0);
7233
7234 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7235 aarch64_set_FP_float (cpu, sd, aarch64_get_FP_float (cpu, sn)
7236 * aarch64_get_FP_float (cpu, sm));
7237 }
7238
7239 /* Double multiply. */
7240 static void
7241 fmuld (sim_cpu *cpu)
7242 {
7243 unsigned sm = INSTR (20, 16);
7244 unsigned sn = INSTR ( 9, 5);
7245 unsigned sd = INSTR ( 4, 0);
7246
7247 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7248 aarch64_set_FP_double (cpu, sd, aarch64_get_FP_double (cpu, sn)
7249 * aarch64_get_FP_double (cpu, sm));
7250 }
7251
7252 /* Float negate and multiply. */
7253 static void
7254 fnmuls (sim_cpu *cpu)
7255 {
7256 unsigned sm = INSTR (20, 16);
7257 unsigned sn = INSTR ( 9, 5);
7258 unsigned sd = INSTR ( 4, 0);
7259
7260 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7261 aarch64_set_FP_float (cpu, sd, - (aarch64_get_FP_float (cpu, sn)
7262 * aarch64_get_FP_float (cpu, sm)));
7263 }
7264
7265 /* Double negate and multiply. */
7266 static void
7267 fnmuld (sim_cpu *cpu)
7268 {
7269 unsigned sm = INSTR (20, 16);
7270 unsigned sn = INSTR ( 9, 5);
7271 unsigned sd = INSTR ( 4, 0);
7272
7273 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7274 aarch64_set_FP_double (cpu, sd, - (aarch64_get_FP_double (cpu, sn)
7275 * aarch64_get_FP_double (cpu, sm)));
7276 }
7277
7278 /* Float subtract. */
7279 static void
7280 fsubs (sim_cpu *cpu)
7281 {
7282 unsigned sm = INSTR (20, 16);
7283 unsigned sn = INSTR ( 9, 5);
7284 unsigned sd = INSTR ( 4, 0);
7285
7286 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7287 aarch64_set_FP_float (cpu, sd, aarch64_get_FP_float (cpu, sn)
7288 - aarch64_get_FP_float (cpu, sm));
7289 }
7290
7291 /* Double subtract. */
7292 static void
7293 fsubd (sim_cpu *cpu)
7294 {
7295 unsigned sm = INSTR (20, 16);
7296 unsigned sn = INSTR ( 9, 5);
7297 unsigned sd = INSTR ( 4, 0);
7298
7299 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7300 aarch64_set_FP_double (cpu, sd, aarch64_get_FP_double (cpu, sn)
7301 - aarch64_get_FP_double (cpu, sm));
7302 }
7303
7304 static void
7305 do_FMINNM (sim_cpu *cpu)
7306 {
7307 /* instr[31,23] = 0 0011 1100
7308 instr[22] = float(0)/double(1)
7309 instr[21] = 1
7310 instr[20,16] = Sm
7311 instr[15,10] = 01 1110
7312 instr[9,5] = Sn
7313 instr[4,0] = Sd */
7314
7315 unsigned sm = INSTR (20, 16);
7316 unsigned sn = INSTR ( 9, 5);
7317 unsigned sd = INSTR ( 4, 0);
7318
7319 NYI_assert (31, 23, 0x03C);
7320 NYI_assert (15, 10, 0x1E);
7321
7322 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7323 if (INSTR (22, 22))
7324 aarch64_set_FP_double (cpu, sd,
7325 dminnm (aarch64_get_FP_double (cpu, sn),
7326 aarch64_get_FP_double (cpu, sm)));
7327 else
7328 aarch64_set_FP_float (cpu, sd,
7329 fminnm (aarch64_get_FP_float (cpu, sn),
7330 aarch64_get_FP_float (cpu, sm)));
7331 }
7332
7333 static void
7334 do_FMAXNM (sim_cpu *cpu)
7335 {
7336 /* instr[31,23] = 0 0011 1100
7337 instr[22] = float(0)/double(1)
7338 instr[21] = 1
7339 instr[20,16] = Sm
7340 instr[15,10] = 01 1010
7341 instr[9,5] = Sn
7342 instr[4,0] = Sd */
7343
7344 unsigned sm = INSTR (20, 16);
7345 unsigned sn = INSTR ( 9, 5);
7346 unsigned sd = INSTR ( 4, 0);
7347
7348 NYI_assert (31, 23, 0x03C);
7349 NYI_assert (15, 10, 0x1A);
7350
7351 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7352 if (INSTR (22, 22))
7353 aarch64_set_FP_double (cpu, sd,
7354 dmaxnm (aarch64_get_FP_double (cpu, sn),
7355 aarch64_get_FP_double (cpu, sm)));
7356 else
7357 aarch64_set_FP_float (cpu, sd,
7358 fmaxnm (aarch64_get_FP_float (cpu, sn),
7359 aarch64_get_FP_float (cpu, sm)));
7360 }
7361
7362 static void
7363 dexSimpleFPDataProc2Source (sim_cpu *cpu)
7364 {
7365 /* instr[31] ==> M : 0 ==> OK, 1 ==> UNALLOC
7366 instr[30] = 0
7367 instr[29] ==> S : 0 ==> OK, 1 ==> UNALLOC
7368 instr[28,25] = 1111
7369 instr[24] = 0
7370 instr[23,22] ==> type : 00 ==> single, 01 ==> double, 1x ==> UNALLOC
7371 instr[21] = 1
7372 instr[20,16] = Vm
7373 instr[15,12] ==> opcode : 0000 ==> FMUL, 0001 ==> FDIV
7374 0010 ==> FADD, 0011 ==> FSUB,
7375 0100 ==> FMAX, 0101 ==> FMIN
7376 0110 ==> FMAXNM, 0111 ==> FMINNM
7377 1000 ==> FNMUL, ow ==> UNALLOC
7378 instr[11,10] = 10
7379 instr[9,5] = Vn
7380 instr[4,0] = Vd */
7381
7382 uint32_t M_S = (INSTR (31, 31) << 1) | INSTR (29, 29);
7383 uint32_t type = INSTR (23, 22);
7384 /* Dispatch on opcode. */
7385 uint32_t dispatch = INSTR (15, 12);
7386
7387 if (type > 1)
7388 HALT_UNALLOC;
7389
7390 if (M_S != 0)
7391 HALT_UNALLOC;
7392
7393 if (type)
7394 switch (dispatch)
7395 {
7396 case 0: fmuld (cpu); return;
7397 case 1: fdivd (cpu); return;
7398 case 2: faddd (cpu); return;
7399 case 3: fsubd (cpu); return;
7400 case 6: do_FMAXNM (cpu); return;
7401 case 7: do_FMINNM (cpu); return;
7402 case 8: fnmuld (cpu); return;
7403
7404 /* Have not yet implemented fmax and fmin. */
7405 case 4:
7406 case 5:
7407 HALT_NYI;
7408
7409 default:
7410 HALT_UNALLOC;
7411 }
7412 else /* type == 0 => floats. */
7413 switch (dispatch)
7414 {
7415 case 0: fmuls (cpu); return;
7416 case 1: fdivs (cpu); return;
7417 case 2: fadds (cpu); return;
7418 case 3: fsubs (cpu); return;
7419 case 6: do_FMAXNM (cpu); return;
7420 case 7: do_FMINNM (cpu); return;
7421 case 8: fnmuls (cpu); return;
7422
7423 case 4:
7424 case 5:
7425 HALT_NYI;
7426
7427 default:
7428 HALT_UNALLOC;
7429 }
7430 }
7431
7432 static void
7433 dexSimpleFPCondSelect (sim_cpu *cpu)
7434 {
7435 /* FCSEL
7436 instr[31,23] = 0 0011 1100
7437 instr[22] = 0=>single 1=>double
7438 instr[21] = 1
7439 instr[20,16] = Sm
7440 instr[15,12] = cond
7441 instr[11,10] = 11
7442 instr[9,5] = Sn
7443 instr[4,0] = Sd */
7444 unsigned sm = INSTR (20, 16);
7445 unsigned sn = INSTR ( 9, 5);
7446 unsigned sd = INSTR ( 4, 0);
7447 uint32_t set = testConditionCode (cpu, INSTR (15, 12));
7448
7449 NYI_assert (31, 23, 0x03C);
7450 NYI_assert (11, 10, 0x3);
7451
7452 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7453 if (INSTR (22, 22))
7454 aarch64_set_FP_double (cpu, sd, (set ? aarch64_get_FP_double (cpu, sn)
7455 : aarch64_get_FP_double (cpu, sm)));
7456 else
7457 aarch64_set_FP_float (cpu, sd, (set ? aarch64_get_FP_float (cpu, sn)
7458 : aarch64_get_FP_float (cpu, sm)));
7459 }
7460
7461 /* Store 32 bit unscaled signed 9 bit. */
7462 static void
7463 fsturs (sim_cpu *cpu, int32_t offset)
7464 {
7465 unsigned int rn = INSTR (9, 5);
7466 unsigned int st = INSTR (4, 0);
7467
7468 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7469 aarch64_set_mem_u32 (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK) + offset,
7470 aarch64_get_vec_u32 (cpu, st, 0));
7471 }
7472
7473 /* Store 64 bit unscaled signed 9 bit. */
7474 static void
7475 fsturd (sim_cpu *cpu, int32_t offset)
7476 {
7477 unsigned int rn = INSTR (9, 5);
7478 unsigned int st = INSTR (4, 0);
7479
7480 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7481 aarch64_set_mem_u64 (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK) + offset,
7482 aarch64_get_vec_u64 (cpu, st, 0));
7483 }
7484
7485 /* Store 128 bit unscaled signed 9 bit. */
7486 static void
7487 fsturq (sim_cpu *cpu, int32_t offset)
7488 {
7489 unsigned int rn = INSTR (9, 5);
7490 unsigned int st = INSTR (4, 0);
7491 FRegister a;
7492
7493 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7494 aarch64_get_FP_long_double (cpu, st, & a);
7495 aarch64_set_mem_long_double (cpu,
7496 aarch64_get_reg_u64 (cpu, rn, SP_OK)
7497 + offset, a);
7498 }
7499
7500 /* TODO FP move register. */
7501
7502 /* 32 bit fp to fp move register. */
7503 static void
7504 ffmovs (sim_cpu *cpu)
7505 {
7506 unsigned int rn = INSTR (9, 5);
7507 unsigned int st = INSTR (4, 0);
7508
7509 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7510 aarch64_set_FP_float (cpu, st, aarch64_get_FP_float (cpu, rn));
7511 }
7512
7513 /* 64 bit fp to fp move register. */
7514 static void
7515 ffmovd (sim_cpu *cpu)
7516 {
7517 unsigned int rn = INSTR (9, 5);
7518 unsigned int st = INSTR (4, 0);
7519
7520 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7521 aarch64_set_FP_double (cpu, st, aarch64_get_FP_double (cpu, rn));
7522 }
7523
7524 /* 32 bit GReg to Vec move register. */
7525 static void
7526 fgmovs (sim_cpu *cpu)
7527 {
7528 unsigned int rn = INSTR (9, 5);
7529 unsigned int st = INSTR (4, 0);
7530
7531 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7532 aarch64_set_vec_u32 (cpu, st, 0, aarch64_get_reg_u32 (cpu, rn, NO_SP));
7533 }
7534
7535 /* 64 bit g to fp move register. */
7536 static void
7537 fgmovd (sim_cpu *cpu)
7538 {
7539 unsigned int rn = INSTR (9, 5);
7540 unsigned int st = INSTR (4, 0);
7541
7542 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7543 aarch64_set_vec_u64 (cpu, st, 0, aarch64_get_reg_u64 (cpu, rn, NO_SP));
7544 }
7545
7546 /* 32 bit fp to g move register. */
7547 static void
7548 gfmovs (sim_cpu *cpu)
7549 {
7550 unsigned int rn = INSTR (9, 5);
7551 unsigned int st = INSTR (4, 0);
7552
7553 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7554 aarch64_set_reg_u64 (cpu, st, NO_SP, aarch64_get_vec_u32 (cpu, rn, 0));
7555 }
7556
7557 /* 64 bit fp to g move register. */
7558 static void
7559 gfmovd (sim_cpu *cpu)
7560 {
7561 unsigned int rn = INSTR (9, 5);
7562 unsigned int st = INSTR (4, 0);
7563
7564 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7565 aarch64_set_reg_u64 (cpu, st, NO_SP, aarch64_get_vec_u64 (cpu, rn, 0));
7566 }
7567
7568 /* FP move immediate
7569
7570 These install an immediate 8 bit value in the target register
7571 where the 8 bits comprise 1 sign bit, 4 bits of fraction and a 3
7572 bit exponent. */
7573
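/* Purely illustrative sketch (never called): how an 8 bit encoding of
   this shape expands to a single precision value under the AArch64
   VFPExpandImm scheme.  The simulator's real conversions go through
   fp_immediate_for_encoding_32/64, used by fmovs/fmovd below.  */

static inline float
fp_imm8_expand_example (uint32_t imm8)
{
  uint32_t sign = (imm8 >> 7) & 1;
  uint32_t b    = (imm8 >> 6) & 1;    /* Top exponent bit, stored inverted.  */
  uint32_t cd   = (imm8 >> 4) & 3;    /* Low two exponent bits.  */
  uint32_t frac = imm8 & 0xF;         /* Four fraction bits.  */
  /* exponent = NOT(b) : b b b b b : cd, fraction = frac : 19 zeros.  */
  uint32_t bits = (sign << 31)
    | ((b ^ 1) << 30)
    | ((b ? 0x1F : 0) << 25)
    | (cd << 23)
    | (frac << 19);
  float result;

  memcpy (&result, &bits, sizeof result);  /* E.g. imm8 == 0x70 ==> 1.0f.  */
  return result;
}
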
7574 static void
7575 fmovs (sim_cpu *cpu)
7576 {
7577 unsigned int sd = INSTR (4, 0);
7578 uint32_t imm = INSTR (20, 13);
7579 float f = fp_immediate_for_encoding_32 (imm);
7580
7581 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7582 aarch64_set_FP_float (cpu, sd, f);
7583 }
7584
7585 static void
7586 fmovd (sim_cpu *cpu)
7587 {
7588 unsigned int sd = INSTR (4, 0);
7589 uint32_t imm = INSTR (20, 13);
7590 double d = fp_immediate_for_encoding_64 (imm);
7591
7592 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7593 aarch64_set_FP_double (cpu, sd, d);
7594 }
7595
7596 static void
7597 dexSimpleFPImmediate (sim_cpu *cpu)
7598 {
7599 /* instr[31,23] == 0 0011 1100
7600 instr[22] == type : single(0)/double(1)
7601 instr[21] == 1
7602 instr[20,13] == imm8
7603 instr[12,10] == 100
7604 instr[9,5] == imm5 : 00000 ==> OK, ow ==> UNALLOC
7605 instr[4,0] == Rd */
7606 uint32_t imm5 = INSTR (9, 5);
7607
7608 NYI_assert (31, 23, 0x3C);
7609
7610 if (imm5 != 0)
7611 HALT_UNALLOC;
7612
7613 if (INSTR (22, 22))
7614 fmovd (cpu);
7615 else
7616 fmovs (cpu);
7617 }
7618
7619 /* TODO specific decode and execute for group Load Store. */
7620
7621 /* TODO FP load/store single register (unscaled offset). */
7622
7623 /* TODO load 8 bit unscaled signed 9 bit. */
7624 /* TODO load 16 bit unscaled signed 9 bit. */
7625
7626 /* Load 32 bit unscaled signed 9 bit. */
7627 static void
7628 fldurs (sim_cpu *cpu, int32_t offset)
7629 {
7630 unsigned int rn = INSTR (9, 5);
7631 unsigned int st = INSTR (4, 0);
7632
7633 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7634 aarch64_set_vec_u32 (cpu, st, 0, aarch64_get_mem_u32
7635 (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK) + offset));
7636 }
7637
7638 /* Load 64 bit unscaled signed 9 bit. */
7639 static void
7640 fldurd (sim_cpu *cpu, int32_t offset)
7641 {
7642 unsigned int rn = INSTR (9, 5);
7643 unsigned int st = INSTR (4, 0);
7644
7645 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7646 aarch64_set_vec_u64 (cpu, st, 0, aarch64_get_mem_u64
7647 (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK) + offset));
7648 }
7649
7650 /* Load 128 bit unscaled signed 9 bit. */
7651 static void
7652 fldurq (sim_cpu *cpu, int32_t offset)
7653 {
7654 unsigned int rn = INSTR (9, 5);
7655 unsigned int st = INSTR (4, 0);
7656 FRegister a;
7657 uint64_t addr = aarch64_get_reg_u64 (cpu, rn, SP_OK) + offset;
7658
7659 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7660 aarch64_get_mem_long_double (cpu, addr, & a);
7661 aarch64_set_FP_long_double (cpu, st, a);
7662 }
7663
7664 /* TODO store 8 bit unscaled signed 9 bit. */
7665 /* TODO store 16 bit unscaled signed 9 bit. */
7666
7667
7668 /* 1 source. */
7669
7670 /* Float absolute value. */
7671 static void
7672 fabss (sim_cpu *cpu)
7673 {
7674 unsigned sn = INSTR (9, 5);
7675 unsigned sd = INSTR (4, 0);
7676 float value = aarch64_get_FP_float (cpu, sn);
7677
7678 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7679 aarch64_set_FP_float (cpu, sd, fabsf (value));
7680 }
7681
7682 /* Double absolute value. */
7683 static void
7684 fabcpu (sim_cpu *cpu)
7685 {
7686 unsigned sn = INSTR (9, 5);
7687 unsigned sd = INSTR (4, 0);
7688 double value = aarch64_get_FP_double (cpu, sn);
7689
7690 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7691 aarch64_set_FP_double (cpu, sd, fabs (value));
7692 }
7693
7694 /* Float negative value. */
7695 static void
7696 fnegs (sim_cpu *cpu)
7697 {
7698 unsigned sn = INSTR (9, 5);
7699 unsigned sd = INSTR (4, 0);
7700
7701 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7702 aarch64_set_FP_float (cpu, sd, - aarch64_get_FP_float (cpu, sn));
7703 }
7704
7705 /* Double negative value. */
7706 static void
7707 fnegd (sim_cpu *cpu)
7708 {
7709 unsigned sn = INSTR (9, 5);
7710 unsigned sd = INSTR (4, 0);
7711
7712 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7713 aarch64_set_FP_double (cpu, sd, - aarch64_get_FP_double (cpu, sn));
7714 }
7715
7716 /* Float square root. */
7717 static void
7718 fsqrts (sim_cpu *cpu)
7719 {
7720 unsigned sn = INSTR (9, 5);
7721 unsigned sd = INSTR (4, 0);
7722
7723 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7724 aarch64_set_FP_float (cpu, sd, sqrtf (aarch64_get_FP_float (cpu, sn)));
7725 }
7726
7727 /* Double square root. */
7728 static void
7729 fsqrtd (sim_cpu *cpu)
7730 {
7731 unsigned sn = INSTR (9, 5);
7732 unsigned sd = INSTR (4, 0);
7733
7734 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7735 aarch64_set_FP_double (cpu, sd,
7736 sqrt (aarch64_get_FP_double (cpu, sn)));
7737 }
7738
7739 /* Convert double to float. */
7740 static void
7741 fcvtds (sim_cpu *cpu)
7742 {
7743 unsigned sn = INSTR (9, 5);
7744 unsigned sd = INSTR (4, 0);
7745
7746 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7747 aarch64_set_FP_float (cpu, sd, (float) aarch64_get_FP_double (cpu, sn));
7748 }
7749
7750 /* Convert float to double. */
7751 static void
7752 fcvtcpu (sim_cpu *cpu)
7753 {
7754 unsigned sn = INSTR (9, 5);
7755 unsigned sd = INSTR (4, 0);
7756
7757 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7758 aarch64_set_FP_double (cpu, sd, (double) aarch64_get_FP_float (cpu, sn));
7759 }
7760
7761 static void
7762 do_FRINT (sim_cpu *cpu)
7763 {
7764 /* instr[31,23] = 0001 1110 0
7765 instr[22] = single(0)/double(1)
7766 instr[21,18] = 1001
7767 instr[17,15] = rounding mode
7768 instr[14,10] = 10000
7769 instr[9,5] = source
7770 instr[4,0] = dest */
7771
7772 float val;
7773 unsigned rs = INSTR (9, 5);
7774 unsigned rd = INSTR (4, 0);
7775 unsigned int rmode = INSTR (17, 15);
7776
7777 NYI_assert (31, 23, 0x03C);
7778 NYI_assert (21, 18, 0x9);
7779 NYI_assert (14, 10, 0x10);
7780
7781 if (rmode == 6 || rmode == 7)
7782 /* FIXME: Add support for rmode == 6 exactness check. */
7783 rmode = uimm (aarch64_get_FPSR (cpu), 23, 22);
7784
7785 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7786 if (INSTR (22, 22))
7787 {
7788 double val = aarch64_get_FP_double (cpu, rs);
7789
7790 switch (rmode)
7791 {
7792 case 0: /* mode N: nearest or even. */
7793 {
7794 double rval = round (val);
7795
7796 /* round () rounds halfway cases away from zero, so
7797 steer any tie whose result is odd back to even. */
7798 if (fabs (val - rval) == 0.5
7799 && fmod (rval, 2.0) != 0.0)
7800 rval -= copysign (1.0, rval);
7801
7802 aarch64_set_FP_double (cpu, rd, rval);
7803 return;
7804 }
7805
7806 case 1: /* mode P: towards +inf. */
7807 if (val < 0.0)
7808 aarch64_set_FP_double (cpu, rd, trunc (val));
7809 else
7810 aarch64_set_FP_double (cpu, rd, round (val));
7811 return;
7812
7813 case 2: /* mode M: towards -inf. */
7814 if (val < 0.0)
7815 aarch64_set_FP_double (cpu, rd, round (val));
7816 else
7817 aarch64_set_FP_double (cpu, rd, trunc (val));
7818 return;
7819
7820 case 3: /* mode Z: towards 0. */
7821 aarch64_set_FP_double (cpu, rd, trunc (val));
7822 return;
7823
7824 case 4: /* mode A: away from 0. */
7825 aarch64_set_FP_double (cpu, rd, round (val));
7826 return;
7827
7828 case 6: /* mode X: use FPCR with exactness check. */
7829 case 7: /* mode I: use FPCR mode. */
7830 HALT_NYI;
7831
7832 default:
7833 HALT_UNALLOC;
7834 }
7835 }
7836
7837 val = aarch64_get_FP_float (cpu, rs);
7838
7839 switch (rmode)
7840 {
7841 case 0: /* mode N: nearest or even. */
7842 {
7843 float rval = roundf (val);
7844
7845 /* roundf () rounds halfway cases away from zero, so
7846 steer any tie whose result is odd back to even. */
7847 if (fabsf (val - rval) == 0.5f
7848 && fmodf (rval, 2.0f) != 0.0f)
7849 rval -= copysignf (1.0f, rval);
7850
7851 aarch64_set_FP_float (cpu, rd, rval);
7852 return;
7853 }
7854
7855 case 1: /* mode P: towards +inf. */
7856 if (val < 0.0)
7857 aarch64_set_FP_float (cpu, rd, truncf (val));
7858 else
7859 aarch64_set_FP_float (cpu, rd, roundf (val));
7860 return;
7861
7862 case 2: /* mode M: towards -inf. */
7863 if (val < 0.0)
7864 aarch64_set_FP_float (cpu, rd, truncf (val));
7865 else
7866 aarch64_set_FP_float (cpu, rd, roundf (val));
7867 return;
7868
7869 case 3: /* mode Z: towards 0. */
7870 aarch64_set_FP_float (cpu, rd, truncf (val));
7871 return;
7872
7873 case 4: /* mode A: away from 0. */
7874 aarch64_set_FP_float (cpu, rd, roundf (val));
7875 return;
7876
7877 case 6: /* mode X: use FPCR with exactness check. */
7878 case 7: /* mode I: use FPCR mode. */
7879 HALT_NYI;
7880
7881 default:
7882 HALT_UNALLOC;
7883 }
7884 }
7885
7886 /* Convert half to float. */
7887 static void
7888 do_FCVT_half_to_single (sim_cpu *cpu)
7889 {
7890 unsigned rn = INSTR (9, 5);
7891 unsigned rd = INSTR (4, 0);
7892
7893 NYI_assert (31, 10, 0x7B890);
7894
7895 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7896 aarch64_set_FP_float (cpu, rd, (float) aarch64_get_FP_half (cpu, rn));
7897 }
7898
7899 /* Convert half to double. */
7900 static void
7901 do_FCVT_half_to_double (sim_cpu *cpu)
7902 {
7903 unsigned rn = INSTR (9, 5);
7904 unsigned rd = INSTR (4, 0);
7905
7906 NYI_assert (31, 10, 0x7B8B0);
7907
7908 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7909 aarch64_set_FP_double (cpu, rd, (double) aarch64_get_FP_half (cpu, rn));
7910 }
7911
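/* Convert single to half.  */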
7912 static void
7913 do_FCVT_single_to_half (sim_cpu *cpu)
7914 {
7915 unsigned rn = INSTR (9, 5);
7916 unsigned rd = INSTR (4, 0);
7917
7918 NYI_assert (31, 10, 0x788F0);
7919
7920 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7921 aarch64_set_FP_half (cpu, rd, aarch64_get_FP_float (cpu, rn));
7922 }
7923
7924 /* Convert double to half. */
7925 static void
7926 do_FCVT_double_to_half (sim_cpu *cpu)
7927 {
7928 unsigned rn = INSTR (9, 5);
7929 unsigned rd = INSTR (4, 0);
7930
7931 NYI_assert (31, 10, 0x798F0);
7932
7933 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
7934 aarch64_set_FP_half (cpu, rd, (float) aarch64_get_FP_double (cpu, rn));
7935 }
7936
7937 static void
7938 dexSimpleFPDataProc1Source (sim_cpu *cpu)
7939 {
7940 /* instr[31] ==> M : 0 ==> OK, 1 ==> UNALLOC
7941 instr[30] = 0
7942 instr[29] ==> S : 0 ==> OK, 1 ==> UNALLOC
7943 instr[28,25] = 1111
7944 instr[24] = 0
7945 instr[23,22] ==> type : 00 ==> source is single,
7946 01 ==> source is double
7947 10 ==> UNALLOC
7948 11 ==> UNALLOC or source is half
7949 instr[21] = 1
7950 instr[20,15] ==> opcode : with type 00 or 01
7951 000000 ==> FMOV, 000001 ==> FABS,
7952 000010 ==> FNEG, 000011 ==> FSQRT,
7953 000100 ==> UNALLOC, 000101 ==> FCVT,(to single/double)
7954 000110 ==> UNALLOC, 000111 ==> FCVT (to half)
7955 001000 ==> FRINTN, 001001 ==> FRINTP,
7956 001010 ==> FRINTM, 001011 ==> FRINTZ,
7957 001100 ==> FRINTA, 001101 ==> UNALLOC
7958 001110 ==> FRINTX, 001111 ==> FRINTI
7959 with type 11
7960 000100 ==> FCVT (half-to-single)
7961 000101 ==> FCVT (half-to-double)
7962 instr[14,10] = 10000. */
7963
7964 uint32_t M_S = (INSTR (31, 31) << 1) | INSTR (29, 29);
7965 uint32_t type = INSTR (23, 22);
7966 uint32_t opcode = INSTR (20, 15);
7967
7968 if (M_S != 0)
7969 HALT_UNALLOC;
7970
7971 if (type == 3)
7972 {
7973 if (opcode == 4)
7974 do_FCVT_half_to_single (cpu);
7975 else if (opcode == 5)
7976 do_FCVT_half_to_double (cpu);
7977 else
7978 HALT_UNALLOC;
7979 return;
7980 }
7981
7982 if (type == 2)
7983 HALT_UNALLOC;
7984
7985 switch (opcode)
7986 {
7987 case 0:
7988 if (type)
7989 ffmovd (cpu);
7990 else
7991 ffmovs (cpu);
7992 return;
7993
7994 case 1:
7995 if (type)
7996 fabcpu (cpu);
7997 else
7998 fabss (cpu);
7999 return;
8000
8001 case 2:
8002 if (type)
8003 fnegd (cpu);
8004 else
8005 fnegs (cpu);
8006 return;
8007
8008 case 3:
8009 if (type)
8010 fsqrtd (cpu);
8011 else
8012 fsqrts (cpu);
8013 return;
8014
8015 case 4:
8016 if (type)
8017 fcvtds (cpu);
8018 else
8019 HALT_UNALLOC;
8020 return;
8021
8022 case 5:
8023 if (type)
8024 HALT_UNALLOC;
8025 fcvtcpu (cpu);
8026 return;
8027
8028 case 8: /* FRINTN etc. */
8029 case 9:
8030 case 10:
8031 case 11:
8032 case 12:
8033 case 14:
8034 case 15:
8035 do_FRINT (cpu);
8036 return;
8037
8038 case 7:
8039 if (INSTR (22, 22))
8040 do_FCVT_double_to_half (cpu);
8041 else
8042 do_FCVT_single_to_half (cpu);
8043 return;
8044
8045 case 13:
8046 HALT_NYI;
8047
8048 default:
8049 HALT_UNALLOC;
8050 }
8051 }
8052
8053 /* 32 bit signed int to float. */
8054 static void
8055 scvtf32 (sim_cpu *cpu)
8056 {
8057 unsigned rn = INSTR (9, 5);
8058 unsigned sd = INSTR (4, 0);
8059
8060 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
8061 aarch64_set_FP_float
8062 (cpu, sd, (float) aarch64_get_reg_s32 (cpu, rn, NO_SP));
8063 }
8064
8065 /* 64 bit signed int to float. */
8066 static void
8067 scvtf (sim_cpu *cpu)
8068 {
8069 unsigned rn = INSTR (9, 5);
8070 unsigned sd = INSTR (4, 0);
8071
8072 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
8073 aarch64_set_FP_float
8074 (cpu, sd, (float) aarch64_get_reg_s64 (cpu, rn, NO_SP));
8075 }
8076
8077 /* 32 bit signed int to double. */
8078 static void
8079 scvtd32 (sim_cpu *cpu)
8080 {
8081 unsigned rn = INSTR (9, 5);
8082 unsigned sd = INSTR (4, 0);
8083
8084 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
8085 aarch64_set_FP_double
8086 (cpu, sd, (double) aarch64_get_reg_s32 (cpu, rn, NO_SP));
8087 }
8088
8089 /* 64 bit signed int to double. */
8090 static void
8091 scvtd (sim_cpu *cpu)
8092 {
8093 unsigned rn = INSTR (9, 5);
8094 unsigned sd = INSTR (4, 0);
8095
8096 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
8097 aarch64_set_FP_double
8098 (cpu, sd, (double) aarch64_get_reg_s64 (cpu, rn, NO_SP));
8099 }
8100
8101 static const float FLOAT_INT_MAX = (float) INT_MAX;
8102 static const float FLOAT_INT_MIN = (float) INT_MIN;
8103 static const double DOUBLE_INT_MAX = (double) INT_MAX;
8104 static const double DOUBLE_INT_MIN = (double) INT_MIN;
8105 static const float FLOAT_LONG_MAX = (float) LONG_MAX;
8106 static const float FLOAT_LONG_MIN = (float) LONG_MIN;
8107 static const double DOUBLE_LONG_MAX = (double) LONG_MAX;
8108 static const double DOUBLE_LONG_MIN = (double) LONG_MIN;
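/* Note: the LONG/ULONG bounds above and below assume a host where long
   is 64 bits wide; on an ILP32 or LLP64 host the 64 bit saturation
   bounds would need int64_t/uint64_t limits instead.  */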
8109
8110 #define UINT_MIN 0
8111 #define ULONG_MIN 0
8112 static const float FLOAT_UINT_MAX = (float) UINT_MAX;
8113 static const float FLOAT_UINT_MIN = (float) UINT_MIN;
8114 static const double DOUBLE_UINT_MAX = (double) UINT_MAX;
8115 static const double DOUBLE_UINT_MIN = (double) UINT_MIN;
8116 static const float FLOAT_ULONG_MAX = (float) ULONG_MAX;
8117 static const float FLOAT_ULONG_MIN = (float) ULONG_MIN;
8118 static const double DOUBLE_ULONG_MAX = (double) ULONG_MAX;
8119 static const double DOUBLE_ULONG_MIN = (double) ULONG_MIN;
8120
8121 /* Check for FP exception conditions:
8122 NaN raises IO
8123 Infinity raises IO
8124 Out of Range raises IO and IX and saturates value
8125 Denormal raises ID and IX and sets to zero. */
8126 #define RAISE_EXCEPTIONS(F, VALUE, FTYPE, ITYPE) \
8127 do \
8128 { \
8129 switch (fpclassify (F)) \
8130 { \
8131 case FP_INFINITE: \
8132 case FP_NAN: \
8133 aarch64_set_FPSR (cpu, IO); \
8134 if (signbit (F)) \
8135 VALUE = ITYPE##_MIN; \
8136 else \
8137 VALUE = ITYPE##_MAX; \
8138 break; \
8139 \
8140 case FP_NORMAL: \
8141 if (F >= FTYPE##_##ITYPE##_MAX) \
8142 { \
8143 aarch64_set_FPSR_bits (cpu, IO | IX, IO | IX); \
8144 VALUE = ITYPE##_MAX; \
8145 } \
8146 else if (F <= FTYPE##_##ITYPE##_MIN) \
8147 { \
8148 aarch64_set_FPSR_bits (cpu, IO | IX, IO | IX); \
8149 VALUE = ITYPE##_MIN; \
8150 } \
8151 break; \
8152 \
8153 case FP_SUBNORMAL: \
8154 aarch64_set_FPSR_bits (cpu, IO | IX | ID, IX | ID); \
8155 VALUE = 0; \
8156 break; \
8157 \
8158 default: \
8159 case FP_ZERO: \
8160 VALUE = 0; \
8161 break; \
8162 } \
8163 } \
8164 while (0)
8165
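/* For example, RAISE_EXCEPTIONS (f, value, FLOAT, INT) pastes the
   FTYPE/ITYPE tokens into the FLOAT_INT_MAX/FLOAT_INT_MIN bounds and
   the INT_MAX/INT_MIN saturation values used by fcvtszs32 below.  */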
8166 /* 32 bit convert float to signed int truncate towards zero. */
8167 static void
8168 fcvtszs32 (sim_cpu *cpu)
8169 {
8170 unsigned sn = INSTR (9, 5);
8171 unsigned rd = INSTR (4, 0);
8172 /* TODO : check that this rounds toward zero. */
8173 float f = aarch64_get_FP_float (cpu, sn);
8174 int32_t value = (int32_t) f;
8175
8176 RAISE_EXCEPTIONS (f, value, FLOAT, INT);
8177
8178 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
8179 /* Avoid sign extension to 64 bit. */
8180 aarch64_set_reg_u64 (cpu, rd, NO_SP, (uint32_t) value);
8181 }
8182
8183 /* 64 bit convert float to signed int truncate towards zero. */
8184 static void
8185 fcvtszs (sim_cpu *cpu)
8186 {
8187 unsigned sn = INSTR (9, 5);
8188 unsigned rd = INSTR (4, 0);
8189 float f = aarch64_get_FP_float (cpu, sn);
8190 int64_t value = (int64_t) f;
8191
8192 RAISE_EXCEPTIONS (f, value, FLOAT, LONG);
8193
8194 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
8195 aarch64_set_reg_s64 (cpu, rd, NO_SP, value);
8196 }
8197
8198 /* 32 bit convert double to signed int truncate towards zero. */
8199 static void
8200 fcvtszd32 (sim_cpu *cpu)
8201 {
8202 unsigned sn = INSTR (9, 5);
8203 unsigned rd = INSTR (4, 0);
8204 /* TODO : check that this rounds toward zero. */
8205 double d = aarch64_get_FP_double (cpu, sn);
8206 int32_t value = (int32_t) d;
8207
8208 RAISE_EXCEPTIONS (d, value, DOUBLE, INT);
8209
8210 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
8211 /* Avoid sign extension to 64 bit. */
8212 aarch64_set_reg_u64 (cpu, rd, NO_SP, (uint32_t) value);
8213 }
8214
8215 /* 64 bit convert double to signed int truncate towards zero. */
8216 static void
8217 fcvtszd (sim_cpu *cpu)
8218 {
8219 unsigned sn = INSTR (9, 5);
8220 unsigned rd = INSTR (4, 0);
8221 /* TODO : check that this rounds toward zero. */
8222 double d = aarch64_get_FP_double (cpu, sn);
8223 int64_t value;
8224
8225 value = (int64_t) d;
8226
8227 RAISE_EXCEPTIONS (d, value, DOUBLE, LONG);
8228
8229 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
8230 aarch64_set_reg_s64 (cpu, rd, NO_SP, value);
8231 }
8232
8233 static void
8234 do_fcvtzu (sim_cpu *cpu)
8235 {
8236 /* instr[31] = size: 32-bit (0), 64-bit (1)
8237 instr[30,23] = 00111100
8238 instr[22] = type: single (0)/ double (1)
8239 instr[21] = 0 ==> fixed-point (scaled by instr[15,10]), 1 ==> integer
8240 instr[20,16] = 11001
8241 instr[15,10] = precision
8242 instr[9,5] = Rs
8243 instr[4,0] = Rd. */
8244
8245 unsigned rs = INSTR (9, 5);
8246 unsigned rd = INSTR (4, 0);
8247
8248 NYI_assert (30, 23, 0x3C);
8249 NYI_assert (20, 16, 0x19);
8250
8251 if (INSTR (21, 21) != 1)
8252 /* Convert to fixed point. */
8253 HALT_NYI;
8254
8255 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
8256 if (INSTR (31, 31))
8257 {
8258 /* Convert to unsigned 64-bit integer. */
8259 if (INSTR (22, 22))
8260 {
8261 double d = aarch64_get_FP_double (cpu, rs);
8262 uint64_t value = (uint64_t) d;
8263
8264 /* Do not raise an exception if we have reached ULONG_MAX. */
8265 if (value != (1ULL << 63))
8266 RAISE_EXCEPTIONS (d, value, DOUBLE, ULONG);
8267
8268 aarch64_set_reg_u64 (cpu, rd, NO_SP, value);
8269 }
8270 else
8271 {
8272 float f = aarch64_get_FP_float (cpu, rs);
8273 uint64_t value = (uint64_t) f;
8274
8275 /* Do not raise an exception if we have reached ULONG_MAX. */
8276 if (value != (1ULL << 63))
8277 RAISE_EXCEPTIONS (f, value, FLOAT, ULONG);
8278
8279 aarch64_set_reg_u64 (cpu, rd, NO_SP, value);
8280 }
8281 }
8282 else
8283 {
8284 uint32_t value;
8285
8286 /* Convert to unsigned 32-bit integer. */
8287 if (INSTR (22, 22))
8288 {
8289 double d = aarch64_get_FP_double (cpu, rs);
8290
8291 value = (uint32_t) d;
8292 /* Do not raise an exception if we have reached UINT_MAX. */
8293 if (value != (1UL << 31))
8294 RAISE_EXCEPTIONS (d, value, DOUBLE, UINT);
8295 }
8296 else
8297 {
8298 float f = aarch64_get_FP_float (cpu, rs);
8299
8300 value = (uint32_t) f;
8301 /* Do not raise an exception if we have reached UINT_MAX. */
8302 if (value != (1UL << 31))
8303 RAISE_EXCEPTIONS (f, value, FLOAT, UINT);
8304 }
8305
8306 aarch64_set_reg_u64 (cpu, rd, NO_SP, value);
8307 }
8308 }
8309
8310 static void
8311 do_UCVTF (sim_cpu *cpu)
8312 {
8313 /* instr[31] = size: 32-bit (0), 64-bit (1)
8314 instr[30,23] = 001 1110 0
8315 instr[22] = type: single (0)/ double (1)
8316 instr[21] = 0 ==> fixed-point (scaled by instr[15,10]), 1 ==> integer
8317 instr[20,16] = 0 0011
8318 instr[15,10] = precision
8319 instr[9,5] = Rs
8320 instr[4,0] = Rd. */
8321
8322 unsigned rs = INSTR (9, 5);
8323 unsigned rd = INSTR (4, 0);
8324
8325 NYI_assert (30, 23, 0x3C);
8326 NYI_assert (20, 16, 0x03);
8327
8328 if (INSTR (21, 21) != 1)
8329 HALT_NYI;
8330
8331 /* FIXME: Add exception raising. */
8332 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
8333 if (INSTR (31, 31))
8334 {
8335 uint64_t value = aarch64_get_reg_u64 (cpu, rs, NO_SP);
8336
8337 if (INSTR (22, 22))
8338 aarch64_set_FP_double (cpu, rd, (double) value);
8339 else
8340 aarch64_set_FP_float (cpu, rd, (float) value);
8341 }
8342 else
8343 {
8344 uint32_t value = aarch64_get_reg_u32 (cpu, rs, NO_SP);
8345
8346 if (INSTR (22, 22))
8347 aarch64_set_FP_double (cpu, rd, (double) value);
8348 else
8349 aarch64_set_FP_float (cpu, rd, (float) value);
8350 }
8351 }
8352
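/* FMOV Xd, Vn.D[1] / FMOV Vd.D[1], Xn: move a 64 bit value between the
   upper half of a vector register and a general register.  */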
8353 static void
8354 float_vector_move (sim_cpu *cpu)
8355 {
8356 /* instr[31,17] == 100 1111 0101 0111
8357 instr[16] ==> direction 0=> to GR, 1=> from GR
8358 instr[15,10] => 00 0000, ow ==> UNALLOC
8359 instr[9,5] ==> source
8360 instr[4,0] ==> dest. */
8361
8362 unsigned rn = INSTR (9, 5);
8363 unsigned rd = INSTR (4, 0);
8364
8365 NYI_assert (31, 17, 0x4F57);
8366
8367 if (INSTR (15, 10) != 0)
8368 HALT_UNALLOC;
8369
8370 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
8371 if (INSTR (16, 16))
8372 aarch64_set_vec_u64 (cpu, rd, 1, aarch64_get_reg_u64 (cpu, rn, NO_SP));
8373 else
8374 aarch64_set_reg_u64 (cpu, rd, NO_SP, aarch64_get_vec_u64 (cpu, rn, 1));
8375 }
8376
8377 static void
8378 dexSimpleFPIntegerConvert (sim_cpu *cpu)
8379 {
8380 /* instr[31] = size : 0 ==> 32 bit, 1 ==> 64 bit
8381 instr[30] = 0
8382 instr[29] = S : 0 ==> OK, 1 ==> UNALLOC
8383 instr[28,25] = 1111
8384 instr[24] = 0
8385 instr[23,22] = type : 00 ==> single, 01 ==> double, 1x ==> UNALLOC
8386 instr[21] = 1
8387 instr[20,19] = rmode
8388 instr[18,16] = opcode
8389 instr[15,10] = 10 0000 */
8390
8391 uint32_t rmode_opcode;
8392 uint32_t size_type;
8393 uint32_t type;
8394 uint32_t size;
8395 uint32_t S;
8396
8397 if (INSTR (31, 17) == 0x4F57)
8398 {
8399 float_vector_move (cpu);
8400 return;
8401 }
8402
8403 size = INSTR (31, 31);
8404 S = INSTR (29, 29);
8405 if (S != 0)
8406 HALT_UNALLOC;
8407
8408 type = INSTR (23, 22);
8409 if (type > 1)
8410 HALT_UNALLOC;
8411
8412 rmode_opcode = INSTR (20, 16);
8413 size_type = (size << 1) | type; /* 0==32f, 1==32d, 2==64f, 3==64d. */
8414
8415 switch (rmode_opcode)
8416 {
8417 case 2: /* SCVTF. */
8418 switch (size_type)
8419 {
8420 case 0: scvtf32 (cpu); return;
8421 case 1: scvtd32 (cpu); return;
8422 case 2: scvtf (cpu); return;
8423 case 3: scvtd (cpu); return;
8424 }
8425
8426 case 6: /* FMOV GR, Vec. */
8427 switch (size_type)
8428 {
8429 case 0: gfmovs (cpu); return;
8430 case 3: gfmovd (cpu); return;
8431 default: HALT_UNALLOC;
8432 }
8433
8434 case 7: /* FMOV vec, GR. */
8435 switch (size_type)
8436 {
8437 case 0: fgmovs (cpu); return;
8438 case 3: fgmovd (cpu); return;
8439 default: HALT_UNALLOC;
8440 }
8441
8442 case 24: /* FCVTZS. */
8443 switch (size_type)
8444 {
8445 case 0: fcvtszs32 (cpu); return;
8446 case 1: fcvtszd32 (cpu); return;
8447 case 2: fcvtszs (cpu); return;
8448 case 3: fcvtszd (cpu); return;
8449 }
8450
8451 case 25: do_fcvtzu (cpu); return;
8452 case 3: do_UCVTF (cpu); return;
8453
8454 case 0: /* FCVTNS. */
8455 case 1: /* FCVTNU. */
8456 case 4: /* FCVTAS. */
8457 case 5: /* FCVTAU. */
8458 case 8: /* FCVTPS. */
8459 case 9: /* FCVTPU. */
8460 case 16: /* FCVTMS. */
8461 case 17: /* FCVTMU. */
8462 default:
8463 HALT_NYI;
8464 }
8465 }
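
/* For example, FCVTZU X0, D1 has size = 1, type = 01 and
   rmode:opcode = 11001, so the dispatch above computes

       rmode_opcode = 25;
       size_type = (1 << 1) | 1;     -- 3 ==> 64-bit GReg, double

   and control reaches do_fcvtzu, which re-reads instr[31] and
   instr[22] to select the double-to-uint64 conversion.  */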
8466
8467 static void
8468 set_flags_for_float_compare (sim_cpu *cpu, float fvalue1, float fvalue2)
8469 {
8470 uint32_t flags;
8471
8472 /* FIXME: Add exception raising. */
8473 if (isnan (fvalue1) || isnan (fvalue2))
8474 flags = C|V;
8475 else if (isinf (fvalue1) && isinf (fvalue2))
8476 {
8477 /* Subtracting two infinities may give a NaN. We only need to compare
8478 the signs, which we can get from isinf. */
8479 int result = isinf (fvalue1) - isinf (fvalue2);
8480
8481 if (result == 0)
8482 flags = Z|C;
8483 else if (result < 0)
8484 flags = N;
8485 else /* (result > 0). */
8486 flags = C;
8487 }
8488 else
8489 {
8490 float result = fvalue1 - fvalue2;
8491
8492 if (result == 0.0)
8493 flags = Z|C;
8494 else if (result < 0)
8495 flags = N;
8496 else /* (result > 0). */
8497 flags = C;
8498 }
8499
8500 aarch64_set_CPSR (cpu, flags);
8501 }
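
/* The mapping above reproduces the AArch64 FCMP flag rules:

       fcmp 1.0, 2.0   ==>  N      (less than)
       fcmp 2.0, 2.0   ==>  Z|C    (equal)
       fcmp 3.0, 2.0   ==>  C      (greater than)
       fcmp NaN, x     ==>  C|V    (unordered)  */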
8502
8503 static void
8504 fcmps (sim_cpu *cpu)
8505 {
8506 unsigned sm = INSTR (20, 16);
8507 unsigned sn = INSTR ( 9, 5);
8508
8509 float fvalue1 = aarch64_get_FP_float (cpu, sn);
8510 float fvalue2 = aarch64_get_FP_float (cpu, sm);
8511
8512 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
8513 set_flags_for_float_compare (cpu, fvalue1, fvalue2);
8514 }
8515
8516 /* Float compare to zero -- Invalid Operation exception
8517 only on signaling NaNs. */
8518 static void
8519 fcmpzs (sim_cpu *cpu)
8520 {
8521 unsigned sn = INSTR ( 9, 5);
8522 float fvalue1 = aarch64_get_FP_float (cpu, sn);
8523
8524 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
8525 set_flags_for_float_compare (cpu, fvalue1, 0.0f);
8526 }
8527
8528 /* Float compare -- Invalid Operation exception on all NaNs. */
8529 static void
8530 fcmpes (sim_cpu *cpu)
8531 {
8532 unsigned sm = INSTR (20, 16);
8533 unsigned sn = INSTR ( 9, 5);
8534
8535 float fvalue1 = aarch64_get_FP_float (cpu, sn);
8536 float fvalue2 = aarch64_get_FP_float (cpu, sm);
8537
8538 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
8539 set_flags_for_float_compare (cpu, fvalue1, fvalue2);
8540 }
8541
8542 /* Float compare to zero -- Invalid Operation exception on all NaNs. */
8543 static void
8544 fcmpzes (sim_cpu *cpu)
8545 {
8546 unsigned sn = INSTR ( 9, 5);
8547 float fvalue1 = aarch64_get_FP_float (cpu, sn);
8548
8549 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
8550 set_flags_for_float_compare (cpu, fvalue1, 0.0f);
8551 }
8552
8553 static void
8554 set_flags_for_double_compare (sim_cpu *cpu, double dval1, double dval2)
8555 {
8556 uint32_t flags;
8557
8558 /* FIXME: Add exception raising. */
8559 if (isnan (dval1) || isnan (dval2))
8560 flags = C|V;
8561 else if (isinf (dval1) && isinf (dval2))
8562 {
8563 /* Subtracting two infinities may give a NaN. We only need to compare
8564 the signs, which we can get from isinf. */
8565 int result = isinf (dval1) - isinf (dval2);
8566
8567 if (result == 0)
8568 flags = Z|C;
8569 else if (result < 0)
8570 flags = N;
8571 else /* (result > 0). */
8572 flags = C;
8573 }
8574 else
8575 {
8576 double result = dval1 - dval2;
8577
8578 if (result == 0.0)
8579 flags = Z|C;
8580 else if (result < 0)
8581 flags = N;
8582 else /* (result > 0). */
8583 flags = C;
8584 }
8585
8586 aarch64_set_CPSR (cpu, flags);
8587 }
8588
8589 /* Double compare -- Invalid Operation exception only on signaling NaNs. */
8590 static void
8591 fcmpd (sim_cpu *cpu)
8592 {
8593 unsigned sm = INSTR (20, 16);
8594 unsigned sn = INSTR ( 9, 5);
8595
8596 double dvalue1 = aarch64_get_FP_double (cpu, sn);
8597 double dvalue2 = aarch64_get_FP_double (cpu, sm);
8598
8599 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
8600 set_flags_for_double_compare (cpu, dvalue1, dvalue2);
8601 }
8602
8603 /* Double compare to zero -- Invalid Operation exception
8604 only on signaling NaNs. */
8605 static void
8606 fcmpzd (sim_cpu *cpu)
8607 {
8608 unsigned sn = INSTR ( 9, 5);
8609 double dvalue1 = aarch64_get_FP_double (cpu, sn);
8610
8611 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
8612 set_flags_for_double_compare (cpu, dvalue1, 0.0);
8613 }
8614
8615 /* Double compare -- Invalid Operation exception on all NaNs. */
8616 static void
8617 fcmped (sim_cpu *cpu)
8618 {
8619 unsigned sm = INSTR (20, 16);
8620 unsigned sn = INSTR ( 9, 5);
8621
8622 double dvalue1 = aarch64_get_FP_double (cpu, sn);
8623 double dvalue2 = aarch64_get_FP_double (cpu, sm);
8624
8625 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
8626 set_flags_for_double_compare (cpu, dvalue1, dvalue2);
8627 }
8628
8629 /* Double compare to zero -- Invalid Operation exception on all NaNs. */
8630 static void
8631 fcmpzed (sim_cpu *cpu)
8632 {
8633 unsigned sn = INSTR ( 9, 5);
8634 double dvalue1 = aarch64_get_FP_double (cpu, sn);
8635
8636 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
8637 set_flags_for_double_compare (cpu, dvalue1, 0.0);
8638 }
8639
8640 static void
8641 dexSimpleFPCompare (sim_cpu *cpu)
8642 {
8643 /* assert instr[28,25] == 1111
8644 instr[30:24:21:13,10] = 0011000
8645 instr[31] = M : 0 ==> OK, 1 ==> UNALLOC
8646 instr[29] ==> S : 0 ==> OK, 1 ==> UNALLOC
8647 instr[23,22] ==> type : 00 ==> single, 01 ==> double, 1x ==> UNALLOC
8648 instr[15,14] ==> op : 00 ==> OK, ow ==> UNALLOC
8649 instr[4,0] ==> opcode2 : 00000 ==> FCMP, 10000 ==> FCMPE,
8650 01000 ==> FCMPZ, 11000 ==> FCMPEZ,
8651 ow ==> UNALLOC */
8652 uint32_t dispatch;
8653 uint32_t M_S = (INSTR (31, 31) << 1) | INSTR (29, 29);
8654 uint32_t type = INSTR (23, 22);
8655 uint32_t op = INSTR (15, 14);
8656 uint32_t op2_2_0 = INSTR (2, 0);
8657
8658 if (op2_2_0 != 0)
8659 HALT_UNALLOC;
8660
8661 if (M_S != 0)
8662 HALT_UNALLOC;
8663
8664 if (type > 1)
8665 HALT_UNALLOC;
8666
8667 if (op != 0)
8668 HALT_UNALLOC;
8669
8670 /* dispatch on type and top 2 bits of opcode. */
8671 dispatch = (type << 2) | INSTR (4, 3);
8672
8673 switch (dispatch)
8674 {
8675 case 0: fcmps (cpu); return;
8676 case 1: fcmpzs (cpu); return;
8677 case 2: fcmpes (cpu); return;
8678 case 3: fcmpzes (cpu); return;
8679 case 4: fcmpd (cpu); return;
8680 case 5: fcmpzd (cpu); return;
8681 case 6: fcmped (cpu); return;
8682 case 7: fcmpzed (cpu); return;
8683 }
8684 }
8685
8686 static void
8687 do_scalar_FADDP (sim_cpu *cpu)
8688 {
8689 /* instr [31,23] = 0111 1110 0
8690 instr [22] = single(0)/double(1)
8691 instr [21,10] = 11 0000 1101 10
8692 instr [9,5] = Fn
8693 instr [4,0] = Fd. */
8694
8695 unsigned Fn = INSTR (9, 5);
8696 unsigned Fd = INSTR (4, 0);
8697
8698 NYI_assert (31, 23, 0x0FC);
8699 NYI_assert (21, 10, 0xC36);
8700
8701 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
8702 if (INSTR (22, 22))
8703 {
8704 double val1 = aarch64_get_vec_double (cpu, Fn, 0);
8705 double val2 = aarch64_get_vec_double (cpu, Fn, 1);
8706
8707 aarch64_set_FP_double (cpu, Fd, val1 + val2);
8708 }
8709 else
8710 {
8711 float val1 = aarch64_get_vec_float (cpu, Fn, 0);
8712 float val2 = aarch64_get_vec_float (cpu, Fn, 1);
8713
8714 aarch64_set_FP_float (cpu, Fd, val1 + val2);
8715 }
8716 }
8717
8718 /* Floating point absolute difference. */
8719
8720 static void
8721 do_scalar_FABD (sim_cpu *cpu)
8722 {
8723 /* instr [31,23] = 0111 1110 1
8724 instr [22] = float(0)/double(1)
8725 instr [21] = 1
8726 instr [20,16] = Rm
8727 instr [15,10] = 1101 01
8728 instr [9, 5] = Rn
8729 instr [4, 0] = Rd. */
8730
8731 unsigned rm = INSTR (20, 16);
8732 unsigned rn = INSTR (9, 5);
8733 unsigned rd = INSTR (4, 0);
8734
8735 NYI_assert (31, 23, 0x0FD);
8736 NYI_assert (21, 21, 1);
8737 NYI_assert (15, 10, 0x35);
8738
8739 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
8740 if (INSTR (22, 22))
8741 aarch64_set_FP_double (cpu, rd,
8742 fabs (aarch64_get_FP_double (cpu, rn)
8743 - aarch64_get_FP_double (cpu, rm)));
8744 else
8745 aarch64_set_FP_float (cpu, rd,
8746 fabsf (aarch64_get_FP_float (cpu, rn)
8747 - aarch64_get_FP_float (cpu, rm)));
8748 }
8749
8750 static void
8751 do_scalar_CMGT (sim_cpu *cpu)
8752 {
8753 /* instr [31,21] = 0101 1110 111
8754 instr [20,16] = Rm
8755 instr [15,10] = 00 1101
8756 instr [9, 5] = Rn
8757 instr [4, 0] = Rd. */
8758
8759 unsigned rm = INSTR (20, 16);
8760 unsigned rn = INSTR (9, 5);
8761 unsigned rd = INSTR (4, 0);
8762
8763 NYI_assert (31, 21, 0x2F7);
8764 NYI_assert (15, 10, 0x0D);
8765
8766 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
8767 aarch64_set_vec_u64 (cpu, rd, 0,
8768 aarch64_get_vec_s64 (cpu, rn, 0) >
8769 aarch64_get_vec_s64 (cpu, rm, 0) ? -1L : 0L);
8770 }
8771
8772 static void
8773 do_scalar_USHR (sim_cpu *cpu)
8774 {
8775 /* instr [31,23] = 0111 1111 0
8776 instr [22,16] = shift amount
8777 instr [15,10] = 0000 01
8778 instr [9, 5] = Rn
8779 instr [4, 0] = Rd. */
8780
8781 unsigned amount = 128 - INSTR (22, 16);
8782 unsigned rn = INSTR (9, 5);
8783 unsigned rd = INSTR (4, 0);
8784
8785 NYI_assert (31, 23, 0x0FE);
8786 NYI_assert (15, 10, 0x01);
8787
8788 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
8789 aarch64_set_vec_u64 (cpu, rd, 0,
8790 aarch64_get_vec_u64 (cpu, rn, 0) >> amount);
8791 }
8792
8793 static void
8794 do_scalar_SSHL (sim_cpu *cpu)
8795 {
8796 /* instr [31,21] = 0101 1110 111
8797 instr [20,16] = Rm
8798 instr [15,10] = 0100 01
8799 instr [9, 5] = Rn
8800 instr [4, 0] = Rd. */
8801
8802 unsigned rm = INSTR (20, 16);
8803 unsigned rn = INSTR (9, 5);
8804 unsigned rd = INSTR (4, 0);
8805 signed int shift = aarch64_get_vec_s8 (cpu, rm, 0);
8806
8807 NYI_assert (31, 21, 0x2F7);
8808 NYI_assert (15, 10, 0x11);
8809
8810 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
8811 if (shift >= 0)
8812 aarch64_set_vec_s64 (cpu, rd, 0,
8813 aarch64_get_vec_s64 (cpu, rn, 0) << shift);
8814 else
8815 aarch64_set_vec_s64 (cpu, rd, 0,
8816 aarch64_get_vec_s64 (cpu, rn, 0) >> - shift);
8817 }
8818
8819 static void
8820 do_scalar_shift (sim_cpu *cpu)
8821 {
8822 /* instr [31,23] = 0101 1111 0
8823 instr [22,16] = shift amount
8824 instr [15,10] = 0101 01 [SHL]
8825 instr [15,10] = 0000 01 [SSHR]
8826 instr [9, 5] = Rn
8827 instr [4, 0] = Rd. */
8828
8829 unsigned rn = INSTR (9, 5);
8830 unsigned rd = INSTR (4, 0);
8831 unsigned amount;
8832
8833 NYI_assert (31, 23, 0x0BE);
8834
8835 if (INSTR (22, 22) == 0)
8836 HALT_UNALLOC;
8837
8838 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
8839 switch (INSTR (15, 10))
8840 {
8841 case 0x01: /* SSHR */
8842 amount = 128 - INSTR (22, 16);
8843 aarch64_set_vec_s64 (cpu, rd, 0,
8844 aarch64_get_vec_s64 (cpu, rn, 0) >> amount);
8845 return;
8846 case 0x15: /* SHL */
8847 amount = INSTR (22, 16) - 64;
8848 aarch64_set_vec_u64 (cpu, rd, 0,
8849 aarch64_get_vec_u64 (cpu, rn, 0) << amount);
8850 return;
8851 default:
8852 HALT_NYI;
8853 }
8854 }
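
/* The shift amount above is decoded from the immh:immb field in
   instr[22,16]; for these 64-bit scalar forms immh always has its
   top bit (instr[22]) set.  For example:

       INSTR (22, 16) = 0x78    -- SSHR amount = 128 - 0x78 = 8
       INSTR (22, 16) = 0x48    -- SHL  amount = 0x48 - 64  = 8

   so SSHR counts the amount down from 128 and SHL up from 64.  */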
8855
8856 /* FCMEQ FCMGT FCMGE. */
8857 static void
8858 do_scalar_FCM (sim_cpu *cpu)
8859 {
8860 /* instr [31,30] = 01
8861 instr [29] = U
8862 instr [28,24] = 1 1110
8863 instr [23] = E
8864 instr [22] = size
8865 instr [21] = 1
8866 instr [20,16] = Rm
8867 instr [15,12] = 1110
8868 instr [11] = AC
8869 instr [10] = 1
8870 instr [9, 5] = Rn
8871 instr [4, 0] = Rd. */
8872
8873 unsigned rm = INSTR (20, 16);
8874 unsigned rn = INSTR (9, 5);
8875 unsigned rd = INSTR (4, 0);
8876 unsigned EUac = (INSTR (23, 23) << 2) | (INSTR (29, 29) << 1) | INSTR (11, 11);
8877 unsigned result;
8878 float val1;
8879 float val2;
8880
8881 NYI_assert (31, 30, 1);
8882 NYI_assert (28, 24, 0x1E);
8883 NYI_assert (21, 21, 1);
8884 NYI_assert (15, 12, 0xE);
8885 NYI_assert (10, 10, 1);
8886
8887 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
8888 if (INSTR (22, 22))
8889 {
8890 double val1 = aarch64_get_FP_double (cpu, rn);
8891 double val2 = aarch64_get_FP_double (cpu, rm);
8892
8893 switch (EUac)
8894 {
8895 case 0: /* 000 */
8896 result = val1 == val2;
8897 break;
8898
8899 case 3: /* 011 */
8900 val1 = fabs (val1);
8901 val2 = fabs (val2);
8902 /* Fall through. */
8903 case 2: /* 010 */
8904 result = val1 >= val2;
8905 break;
8906
8907 case 7: /* 111 */
8908 val1 = fabs (val1);
8909 val2 = fabs (val2);
8910 /* Fall through. */
8911 case 6: /* 110 */
8912 result = val1 > val2;
8913 break;
8914
8915 default:
8916 HALT_UNALLOC;
8917 }
8918
8919 aarch64_set_vec_u64 (cpu, rd, 0, result ? -1 : 0);
8920 return;
8921 }
8922
8923 val1 = aarch64_get_FP_float (cpu, rn);
8924 val2 = aarch64_get_FP_float (cpu, rm);
8925
8926 switch (EUac)
8927 {
8928 case 0: /* 000 */
8929 result = val1 == val2;
8930 break;
8931
8932 case 3: /* 011 */
8933 val1 = fabsf (val1);
8934 val2 = fabsf (val2);
8935 /* Fall through. */
8936 case 2: /* 010 */
8937 result = val1 >= val2;
8938 break;
8939
8940 case 7: /* 111 */
8941 val1 = fabsf (val1);
8942 val2 = fabsf (val2);
8943 /* Fall through. */
8944 case 6: /* 110 */
8945 result = val1 > val2;
8946 break;
8947
8948 default:
8949 HALT_UNALLOC;
8950 }
8951
8952 aarch64_set_vec_u32 (cpu, rd, 0, result ? -1 : 0);
8953 }
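
/* For reference, the EUac value computed above selects the compare:

       EUac = 0  (E=0, U=0, ac=0)  ==>  FCMEQ  ==
       EUac = 2  (E=0, U=1, ac=0)  ==>  FCMGE  >=
       EUac = 3  (E=0, U=1, ac=1)  ==>  FACGE  abs >=
       EUac = 6  (E=1, U=1, ac=0)  ==>  FCMGT  >
       EUac = 7  (E=1, U=1, ac=1)  ==>  FACGT  abs >

   with all other encodings halting as UNALLOC.  */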
8954
8955 /* An alias of DUP. */
8956 static void
8957 do_scalar_MOV (sim_cpu *cpu)
8958 {
8959 /* instr [31,21] = 0101 1110 000
8960 instr [20,16] = imm5
8961 instr [15,10] = 0000 01
8962 instr [9, 5] = Rn
8963 instr [4, 0] = Rd. */
8964
8965 unsigned rn = INSTR (9, 5);
8966 unsigned rd = INSTR (4, 0);
8967 unsigned index;
8968
8969 NYI_assert (31, 21, 0x2F0);
8970 NYI_assert (15, 10, 0x01);
8971
8972 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
8973 if (INSTR (16, 16))
8974 {
8975 /* 8-bit. */
8976 index = INSTR (20, 17);
8977 aarch64_set_vec_u8
8978 (cpu, rd, 0, aarch64_get_vec_u8 (cpu, rn, index));
8979 }
8980 else if (INSTR (17, 17))
8981 {
8982 /* 16-bit. */
8983 index = INSTR (20, 18);
8984 aarch64_set_vec_u16
8985 (cpu, rd, 0, aarch64_get_vec_u16 (cpu, rn, index));
8986 }
8987 else if (INSTR (18, 18))
8988 {
8989 /* 32-bit. */
8990 index = INSTR (20, 19);
8991 aarch64_set_vec_u32
8992 (cpu, rd, 0, aarch64_get_vec_u32 (cpu, rn, index));
8993 }
8994 else if (INSTR (19, 19))
8995 {
8996 /* 64-bit. */
8997 index = INSTR (20, 20);
8998 aarch64_set_vec_u64
8999 (cpu, rd, 0, aarch64_get_vec_u64 (cpu, rn, index));
9000 }
9001 else
9002 HALT_UNALLOC;
9003 }
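
/* The imm5 decode above follows the usual AdvSIMD rule: the lowest
   set bit of instr[20,16] gives the element size and the bits above
   it give the element index.  For example:

       imm5 = 00101    -- bit 16 set ==> byte, index = 0010 = 2
       imm5 = 01000    -- bit 19 set ==> 64-bit, index = bit 20 = 0  */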
9004
9005 static void
9006 do_scalar_NEG (sim_cpu *cpu)
9007 {
9008 /* instr [31,10] = 0111 1110 1110 0000 1011 10
9009 instr [9, 5] = Rn
9010 instr [4, 0] = Rd. */
9011
9012 unsigned rn = INSTR (9, 5);
9013 unsigned rd = INSTR (4, 0);
9014
9015 NYI_assert (31, 10, 0x1FB82E);
9016
9017 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9018 aarch64_set_vec_u64 (cpu, rd, 0, - aarch64_get_vec_u64 (cpu, rn, 0));
9019 }
9020
9021 static void
9022 do_scalar_USHL (sim_cpu *cpu)
9023 {
9024 /* instr [31,21] = 0111 1110 111
9025 instr [20,16] = Rm
9026 instr [15,10] = 0100 01
9027 instr [9, 5] = Rn
9028 instr [4, 0] = Rd. */
9029
9030 unsigned rm = INSTR (20, 16);
9031 unsigned rn = INSTR (9, 5);
9032 unsigned rd = INSTR (4, 0);
9033 signed int shift = aarch64_get_vec_s8 (cpu, rm, 0);
9034
9035 NYI_assert (31, 21, 0x3F7);
9036 NYI_assert (15, 10, 0x11);
9037
9038 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9039 if (shift >= 0)
9040 aarch64_set_vec_u64 (cpu, rd, 0, aarch64_get_vec_u64 (cpu, rn, 0) << shift);
9041 else
9042 aarch64_set_vec_u64 (cpu, rd, 0, aarch64_get_vec_u64 (cpu, rn, 0) >> - shift);
9043 }
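
/* In both do_scalar_SSHL and do_scalar_USHL the shift count is read
   as a signed byte from the bottom of Rm, so one encoding covers
   both directions.  For example:

       int8_t shift = 3;     -- Rd = Rn << 3
       int8_t shift = -3;    -- Rd = Rn >> 3 (arithmetic for SSHL,
                                              logical for USHL)  */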
9044
9045 static void
9046 do_double_add (sim_cpu *cpu)
9047 {
9048 /* instr [31,21] = 0101 1110 111
9049 instr [20,16] = Fn
9050 instr [15,10] = 1000 01
9051 instr [9,5] = Fm
9052 instr [4,0] = Fd. */
9053 unsigned Fd;
9054 unsigned Fm;
9055 unsigned Fn;
9056 double val1;
9057 double val2;
9058
9059 NYI_assert (31, 21, 0x2F7);
9060 NYI_assert (15, 10, 0x21);
9061
9062 Fd = INSTR (4, 0);
9063 Fm = INSTR (9, 5);
9064 Fn = INSTR (20, 16);
9065
9066 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9067 val1 = aarch64_get_FP_double (cpu, Fm);
9068 val2 = aarch64_get_FP_double (cpu, Fn);
9069
9070 aarch64_set_FP_double (cpu, Fd, val1 + val2);
9071 }
9072
9073 static void
9074 do_scalar_UCVTF (sim_cpu *cpu)
9075 {
9076 /* instr [31,23] = 0111 1110 0
9077 instr [22] = single(0)/double(1)
9078 instr [21,10] = 10 0001 1101 10
9079 instr [9,5] = rn
9080 instr [4,0] = rd. */
9081
9082 unsigned rn = INSTR (9, 5);
9083 unsigned rd = INSTR (4, 0);
9084
9085 NYI_assert (31, 23, 0x0FC);
9086 NYI_assert (21, 10, 0x876);
9087
9088 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9089 if (INSTR (22, 22))
9090 {
9091 uint64_t val = aarch64_get_vec_u64 (cpu, rn, 0);
9092
9093 aarch64_set_vec_double (cpu, rd, 0, (double) val);
9094 }
9095 else
9096 {
9097 uint32_t val = aarch64_get_vec_u32 (cpu, rn, 0);
9098
9099 aarch64_set_vec_float (cpu, rd, 0, (float) val);
9100 }
9101 }
9102
9103 static void
9104 do_scalar_vec (sim_cpu *cpu)
9105 {
9106 /* instr [30] = 1. */
9107 /* instr [28,25] = 1111. */
9108 switch (INSTR (31, 23))
9109 {
9110 case 0xBC:
9111 switch (INSTR (15, 10))
9112 {
9113 case 0x01: do_scalar_MOV (cpu); return;
9114 case 0x39: do_scalar_FCM (cpu); return;
9115 case 0x3B: do_scalar_FCM (cpu); return;
9116 }
9117 break;
9118
9119 case 0xBE: do_scalar_shift (cpu); return;
9120
9121 case 0xFC:
9122 switch (INSTR (15, 10))
9123 {
9124 case 0x36:
9125 switch (INSTR (21, 16))
9126 {
9127 case 0x30: do_scalar_FADDP (cpu); return;
9128 case 0x21: do_scalar_UCVTF (cpu); return;
9129 }
9130 HALT_NYI;
9131 case 0x39: do_scalar_FCM (cpu); return;
9132 case 0x3B: do_scalar_FCM (cpu); return;
9133 }
9134 break;
9135
9136 case 0xFD:
9137 switch (INSTR (15, 10))
9138 {
9139 case 0x0D: do_scalar_CMGT (cpu); return;
9140 case 0x11: do_scalar_USHL (cpu); return;
9141 case 0x2E: do_scalar_NEG (cpu); return;
9142 case 0x35: do_scalar_FABD (cpu); return;
9143 case 0x39: do_scalar_FCM (cpu); return;
9144 case 0x3B: do_scalar_FCM (cpu); return;
9145 default:
9146 HALT_NYI;
9147 }
9148
9149 case 0xFE: do_scalar_USHR (cpu); return;
9150
9151 case 0xBD:
9152 switch (INSTR (15, 10))
9153 {
9154 case 0x21: do_double_add (cpu); return;
9155 case 0x11: do_scalar_SSHL (cpu); return;
9156 default:
9157 HALT_NYI;
9158 }
9159
9160 default:
9161 HALT_NYI;
9162 }
9163 }
9164
9165 static void
9166 dexAdvSIMD1 (sim_cpu *cpu)
9167 {
9168 /* instr [28,25] = 1 111. */
9169
9170 /* We are currently only interested in the basic
9171 scalar fp routines which all have bit 30 = 0. */
9172 if (INSTR (30, 30))
9173 do_scalar_vec (cpu);
9174
9175 /* instr[24] is set for FP data processing 3-source and clear for
9176 all other basic scalar fp instruction groups. */
9177 else if (INSTR (24, 24))
9178 dexSimpleFPDataProc3Source (cpu);
9179
9180 /* instr[21] is clear for floating <-> fixed conversions and set for
9181 all other basic scalar fp instruction groups. */
9182 else if (!INSTR (21, 21))
9183 dexSimpleFPFixedConvert (cpu);
9184
9185 /* instr[11,10] : 01 ==> cond compare, 10 ==> Data Proc 2 Source
9186 11 ==> cond select, 00 ==> other. */
9187 else
9188 switch (INSTR (11, 10))
9189 {
9190 case 1: dexSimpleFPCondCompare (cpu); return;
9191 case 2: dexSimpleFPDataProc2Source (cpu); return;
9192 case 3: dexSimpleFPCondSelect (cpu); return;
9193
9194 default:
9195 /* Now an ordered cascade of tests.
9196 FP immediate has instr [12] == 1.
9197 FP compare has instr [13] == 1.
9198 FP Data Proc 1 Source has instr [14] == 1.
9199 FP floating <--> integer conversions has instr [15] == 0. */
9200 if (INSTR (12, 12))
9201 dexSimpleFPImmediate (cpu);
9202
9203 else if (INSTR (13, 13))
9204 dexSimpleFPCompare (cpu);
9205
9206 else if (INSTR (14, 14))
9207 dexSimpleFPDataProc1Source (cpu);
9208
9209 else if (!INSTR (15, 15))
9210 dexSimpleFPIntegerConvert (cpu);
9211
9212 else
9213 /* If we get here then instr[15] == 1 which means UNALLOC. */
9214 HALT_UNALLOC;
9215 }
9216 }
9217
9218 /* PC relative addressing. */
9219
9220 static void
9221 pcadr (sim_cpu *cpu)
9222 {
9223 /* instr[31] = op : 0 ==> ADR, 1 ==> ADRP
9224 instr[30,29] = immlo
9225 instr[23,5] = immhi. */
9226 uint64_t address;
9227 unsigned rd = INSTR (4, 0);
9228 uint32_t isPage = INSTR (31, 31);
9229 union { uint64_t u64; int64_t s64; } imm;
9230 uint64_t offset;
9231
9232 imm.s64 = simm64 (aarch64_get_instr (cpu), 23, 5);
9233 offset = imm.u64;
9234 offset = (offset << 2) | INSTR (30, 29);
9235
9236 address = aarch64_get_PC (cpu);
9237
9238 if (isPage)
9239 {
9240 offset <<= 12;
9241 address &= ~0xfff;
9242 }
9243
9244 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9245 aarch64_set_reg_u64 (cpu, rd, NO_SP, address + offset);
9246 }
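
/* A worked ADRP example: with PC = 0x400123, instr[31] = 1 and a
   combined immhi:immlo of 1, the code above computes

       offset = 1 << 12;              -- 0x1000
       address = 0x400123 & ~0xfff;   -- 0x400000

   so Rd receives 0x401000, the base of the following 4KB page.  */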
9247
9248 /* Specific decode and execute for group Data Processing Immediate. */
9249
9250 static void
9251 dexPCRelAddressing (sim_cpu *cpu)
9252 {
9253 /* assert instr[28,24] = 10000. */
9254 pcadr (cpu);
9255 }
9256
9257 /* Immediate logical.
9258 The bimm32/64 argument is constructed by replicating a 2, 4, 8,
9259 16, 32 or 64 bit sequence pulled out at decode and possibly
9260 inverting it.
9261
9262 N.B. the output register (dest) can normally be Xn or SP;
9263 the exception occurs for flag setting instructions, which may
9264 only use Xn for the output (dest). The input register can
9265 never be SP. */
9266
9267 /* 32 bit and immediate. */
9268 static void
9269 and32 (sim_cpu *cpu, uint32_t bimm)
9270 {
9271 unsigned rn = INSTR (9, 5);
9272 unsigned rd = INSTR (4, 0);
9273
9274 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9275 aarch64_set_reg_u64 (cpu, rd, SP_OK,
9276 aarch64_get_reg_u32 (cpu, rn, NO_SP) & bimm);
9277 }
9278
9279 /* 64 bit and immediate. */
9280 static void
9281 and64 (sim_cpu *cpu, uint64_t bimm)
9282 {
9283 unsigned rn = INSTR (9, 5);
9284 unsigned rd = INSTR (4, 0);
9285
9286 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9287 aarch64_set_reg_u64 (cpu, rd, SP_OK,
9288 aarch64_get_reg_u64 (cpu, rn, NO_SP) & bimm);
9289 }
9290
9291 /* 32 bit and immediate set flags. */
9292 static void
9293 ands32 (sim_cpu *cpu, uint32_t bimm)
9294 {
9295 unsigned rn = INSTR (9, 5);
9296 unsigned rd = INSTR (4, 0);
9297
9298 uint32_t value1 = aarch64_get_reg_u32 (cpu, rn, NO_SP);
9299 uint32_t value2 = bimm;
9300
9301 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9302 aarch64_set_reg_u64 (cpu, rd, NO_SP, value1 & value2);
9303 set_flags_for_binop32 (cpu, value1 & value2);
9304 }
9305
9306 /* 64 bit and immediate set flags. */
9307 static void
9308 ands64 (sim_cpu *cpu, uint64_t bimm)
9309 {
9310 unsigned rn = INSTR (9, 5);
9311 unsigned rd = INSTR (4, 0);
9312
9313 uint64_t value1 = aarch64_get_reg_u64 (cpu, rn, NO_SP);
9314 uint64_t value2 = bimm;
9315
9316 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9317 aarch64_set_reg_u64 (cpu, rd, NO_SP, value1 & value2);
9318 set_flags_for_binop64 (cpu, value1 & value2);
9319 }
9320
9321 /* 32 bit exclusive or immediate. */
9322 static void
9323 eor32 (sim_cpu *cpu, uint32_t bimm)
9324 {
9325 unsigned rn = INSTR (9, 5);
9326 unsigned rd = INSTR (4, 0);
9327
9328 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9329 aarch64_set_reg_u64 (cpu, rd, SP_OK,
9330 aarch64_get_reg_u32 (cpu, rn, NO_SP) ^ bimm);
9331 }
9332
9333 /* 64 bit exclusive or immediate. */
9334 static void
9335 eor64 (sim_cpu *cpu, uint64_t bimm)
9336 {
9337 unsigned rn = INSTR (9, 5);
9338 unsigned rd = INSTR (4, 0);
9339
9340 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9341 aarch64_set_reg_u64 (cpu, rd, SP_OK,
9342 aarch64_get_reg_u64 (cpu, rn, NO_SP) ^ bimm);
9343 }
9344
9345 /* 32 bit or immediate. */
9346 static void
9347 orr32 (sim_cpu *cpu, uint32_t bimm)
9348 {
9349 unsigned rn = INSTR (9, 5);
9350 unsigned rd = INSTR (4, 0);
9351
9352 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9353 aarch64_set_reg_u64 (cpu, rd, SP_OK,
9354 aarch64_get_reg_u32 (cpu, rn, NO_SP) | bimm);
9355 }
9356
9357 /* 64 bit or immediate. */
9358 static void
9359 orr64 (sim_cpu *cpu, uint64_t bimm)
9360 {
9361 unsigned rn = INSTR (9, 5);
9362 unsigned rd = INSTR (4, 0);
9363
9364 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9365 aarch64_set_reg_u64 (cpu, rd, SP_OK,
9366 aarch64_get_reg_u64 (cpu, rn, NO_SP) | bimm);
9367 }
9368
9369 /* Logical shifted register.
9370 These allow an optional LSL, ASR, LSR or ROR to the second source
9371 register with a count up to the register bit count.
9372 N.B. register args may not be SP. */
9373
9374 /* 32 bit AND shifted register. */
9375 static void
9376 and32_shift (sim_cpu *cpu, Shift shift, uint32_t count)
9377 {
9378 unsigned rm = INSTR (20, 16);
9379 unsigned rn = INSTR (9, 5);
9380 unsigned rd = INSTR (4, 0);
9381
9382 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9383 aarch64_set_reg_u64
9384 (cpu, rd, NO_SP, aarch64_get_reg_u32 (cpu, rn, NO_SP)
9385 & shifted32 (aarch64_get_reg_u32 (cpu, rm, NO_SP), shift, count));
9386 }
9387
9388 /* 64 bit AND shifted register. */
9389 static void
9390 and64_shift (sim_cpu *cpu, Shift shift, uint32_t count)
9391 {
9392 unsigned rm = INSTR (20, 16);
9393 unsigned rn = INSTR (9, 5);
9394 unsigned rd = INSTR (4, 0);
9395
9396 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9397 aarch64_set_reg_u64
9398 (cpu, rd, NO_SP, aarch64_get_reg_u64 (cpu, rn, NO_SP)
9399 & shifted64 (aarch64_get_reg_u64 (cpu, rm, NO_SP), shift, count));
9400 }
9401
9402 /* 32 bit AND shifted register setting flags. */
9403 static void
9404 ands32_shift (sim_cpu *cpu, Shift shift, uint32_t count)
9405 {
9406 unsigned rm = INSTR (20, 16);
9407 unsigned rn = INSTR (9, 5);
9408 unsigned rd = INSTR (4, 0);
9409
9410 uint32_t value1 = aarch64_get_reg_u32 (cpu, rn, NO_SP);
9411 uint32_t value2 = shifted32 (aarch64_get_reg_u32 (cpu, rm, NO_SP),
9412 shift, count);
9413
9414 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9415 aarch64_set_reg_u64 (cpu, rd, NO_SP, value1 & value2);
9416 set_flags_for_binop32 (cpu, value1 & value2);
9417 }
9418
9419 /* 64 bit AND shifted register setting flags. */
9420 static void
9421 ands64_shift (sim_cpu *cpu, Shift shift, uint32_t count)
9422 {
9423 unsigned rm = INSTR (20, 16);
9424 unsigned rn = INSTR (9, 5);
9425 unsigned rd = INSTR (4, 0);
9426
9427 uint64_t value1 = aarch64_get_reg_u64 (cpu, rn, NO_SP);
9428 uint64_t value2 = shifted64 (aarch64_get_reg_u64 (cpu, rm, NO_SP),
9429 shift, count);
9430
9431 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9432 aarch64_set_reg_u64 (cpu, rd, NO_SP, value1 & value2);
9433 set_flags_for_binop64 (cpu, value1 & value2);
9434 }
9435
9436 /* 32 bit BIC shifted register. */
9437 static void
9438 bic32_shift (sim_cpu *cpu, Shift shift, uint32_t count)
9439 {
9440 unsigned rm = INSTR (20, 16);
9441 unsigned rn = INSTR (9, 5);
9442 unsigned rd = INSTR (4, 0);
9443
9444 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9445 aarch64_set_reg_u64
9446 (cpu, rd, NO_SP, aarch64_get_reg_u32 (cpu, rn, NO_SP)
9447 & ~ shifted32 (aarch64_get_reg_u32 (cpu, rm, NO_SP), shift, count));
9448 }
9449
9450 /* 64 bit BIC shifted register. */
9451 static void
9452 bic64_shift (sim_cpu *cpu, Shift shift, uint32_t count)
9453 {
9454 unsigned rm = INSTR (20, 16);
9455 unsigned rn = INSTR (9, 5);
9456 unsigned rd = INSTR (4, 0);
9457
9458 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9459 aarch64_set_reg_u64
9460 (cpu, rd, NO_SP, aarch64_get_reg_u64 (cpu, rn, NO_SP)
9461 & ~ shifted64 (aarch64_get_reg_u64 (cpu, rm, NO_SP), shift, count));
9462 }
9463
9464 /* 32 bit BIC shifted register setting flags. */
9465 static void
9466 bics32_shift (sim_cpu *cpu, Shift shift, uint32_t count)
9467 {
9468 unsigned rm = INSTR (20, 16);
9469 unsigned rn = INSTR (9, 5);
9470 unsigned rd = INSTR (4, 0);
9471
9472 uint32_t value1 = aarch64_get_reg_u32 (cpu, rn, NO_SP);
9473 uint32_t value2 = ~ shifted32 (aarch64_get_reg_u32 (cpu, rm, NO_SP),
9474 shift, count);
9475
9476 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9477 aarch64_set_reg_u64 (cpu, rd, NO_SP, value1 & value2);
9478 set_flags_for_binop32 (cpu, value1 & value2);
9479 }
9480
9481 /* 64 bit BIC shifted register setting flags. */
9482 static void
9483 bics64_shift (sim_cpu *cpu, Shift shift, uint32_t count)
9484 {
9485 unsigned rm = INSTR (20, 16);
9486 unsigned rn = INSTR (9, 5);
9487 unsigned rd = INSTR (4, 0);
9488
9489 uint64_t value1 = aarch64_get_reg_u64 (cpu, rn, NO_SP);
9490 uint64_t value2 = ~ shifted64 (aarch64_get_reg_u64 (cpu, rm, NO_SP),
9491 shift, count);
9492
9493 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9494 aarch64_set_reg_u64 (cpu, rd, NO_SP, value1 & value2);
9495 set_flags_for_binop64 (cpu, value1 & value2);
9496 }
9497
9498 /* 32 bit EON shifted register. */
9499 static void
9500 eon32_shift (sim_cpu *cpu, Shift shift, uint32_t count)
9501 {
9502 unsigned rm = INSTR (20, 16);
9503 unsigned rn = INSTR (9, 5);
9504 unsigned rd = INSTR (4, 0);
9505
9506 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9507 aarch64_set_reg_u64
9508 (cpu, rd, NO_SP, aarch64_get_reg_u32 (cpu, rn, NO_SP)
9509 ^ ~ shifted32 (aarch64_get_reg_u32 (cpu, rm, NO_SP), shift, count));
9510 }
9511
9512 /* 64 bit EON shifted register. */
9513 static void
9514 eon64_shift (sim_cpu *cpu, Shift shift, uint32_t count)
9515 {
9516 unsigned rm = INSTR (20, 16);
9517 unsigned rn = INSTR (9, 5);
9518 unsigned rd = INSTR (4, 0);
9519
9520 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9521 aarch64_set_reg_u64
9522 (cpu, rd, NO_SP, aarch64_get_reg_u64 (cpu, rn, NO_SP)
9523 ^ ~ shifted64 (aarch64_get_reg_u64 (cpu, rm, NO_SP), shift, count));
9524 }
9525
9526 /* 32 bit EOR shifted register. */
9527 static void
9528 eor32_shift (sim_cpu *cpu, Shift shift, uint32_t count)
9529 {
9530 unsigned rm = INSTR (20, 16);
9531 unsigned rn = INSTR (9, 5);
9532 unsigned rd = INSTR (4, 0);
9533
9534 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9535 aarch64_set_reg_u64
9536 (cpu, rd, NO_SP, aarch64_get_reg_u32 (cpu, rn, NO_SP)
9537 ^ shifted32 (aarch64_get_reg_u32 (cpu, rm, NO_SP), shift, count));
9538 }
9539
9540 /* 64 bit EOR shifted register. */
9541 static void
9542 eor64_shift (sim_cpu *cpu, Shift shift, uint32_t count)
9543 {
9544 unsigned rm = INSTR (20, 16);
9545 unsigned rn = INSTR (9, 5);
9546 unsigned rd = INSTR (4, 0);
9547
9548 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9549 aarch64_set_reg_u64
9550 (cpu, rd, NO_SP, aarch64_get_reg_u64 (cpu, rn, NO_SP)
9551 ^ shifted64 (aarch64_get_reg_u64 (cpu, rm, NO_SP), shift, count));
9552 }
9553
9554 /* 32 bit ORR shifted register. */
9555 static void
9556 orr32_shift (sim_cpu *cpu, Shift shift, uint32_t count)
9557 {
9558 unsigned rm = INSTR (20, 16);
9559 unsigned rn = INSTR (9, 5);
9560 unsigned rd = INSTR (4, 0);
9561
9562 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9563 aarch64_set_reg_u64
9564 (cpu, rd, NO_SP, aarch64_get_reg_u32 (cpu, rn, NO_SP)
9565 | shifted32 (aarch64_get_reg_u32 (cpu, rm, NO_SP), shift, count));
9566 }
9567
9568 /* 64 bit ORR shifted register. */
9569 static void
9570 orr64_shift (sim_cpu *cpu, Shift shift, uint32_t count)
9571 {
9572 unsigned rm = INSTR (20, 16);
9573 unsigned rn = INSTR (9, 5);
9574 unsigned rd = INSTR (4, 0);
9575
9576 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9577 aarch64_set_reg_u64
9578 (cpu, rd, NO_SP, aarch64_get_reg_u64 (cpu, rn, NO_SP)
9579 | shifted64 (aarch64_get_reg_u64 (cpu, rm, NO_SP), shift, count));
9580 }
9581
9582 /* 32 bit ORN shifted register. */
9583 static void
9584 orn32_shift (sim_cpu *cpu, Shift shift, uint32_t count)
9585 {
9586 unsigned rm = INSTR (20, 16);
9587 unsigned rn = INSTR (9, 5);
9588 unsigned rd = INSTR (4, 0);
9589
9590 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9591 aarch64_set_reg_u64
9592 (cpu, rd, NO_SP, aarch64_get_reg_u32 (cpu, rn, NO_SP)
9593 | ~ shifted32 (aarch64_get_reg_u32 (cpu, rm, NO_SP), shift, count));
9594 }
9595
9596 /* 64 bit ORN shifted register. */
9597 static void
9598 orn64_shift (sim_cpu *cpu, Shift shift, uint32_t count)
9599 {
9600 unsigned rm = INSTR (20, 16);
9601 unsigned rn = INSTR (9, 5);
9602 unsigned rd = INSTR (4, 0);
9603
9604 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9605 aarch64_set_reg_u64
9606 (cpu, rd, NO_SP, aarch64_get_reg_u64 (cpu, rn, NO_SP)
9607 | ~ shifted64 (aarch64_get_reg_u64 (cpu, rm, NO_SP), shift, count));
9608 }
9609
9610 static void
9611 dexLogicalImmediate (sim_cpu *cpu)
9612 {
9613 /* assert instr[28,23] = 100100
9614 instr[31] = size : 0 ==> 32 bit, 1 ==> 64 bit
9615 instr[30,29] = op : 0 ==> AND, 1 ==> ORR, 2 ==> EOR, 3 ==> ANDS
9616 instr[22] = N : used to construct immediate mask
9617 instr[21,16] = immr
9618 instr[15,10] = imms
9619 instr[9,5] = Rn
9620 instr[4,0] = Rd */
9621
9622 /* 32 bit operations must have N = 0 or else we have an UNALLOC. */
9623 uint32_t size = INSTR (31, 31);
9624 uint32_t N = INSTR (22, 22);
9625 /* uint32_t immr = INSTR (21, 16); */
9626 /* uint32_t imms = INSTR (15, 10); */
9627 uint32_t index = INSTR (22, 10);
9628 uint64_t bimm64 = LITable [index];
9629 uint32_t dispatch = INSTR (30, 29);
9630
9631 if (~size & N)
9632 HALT_UNALLOC;
9633
9634 if (!bimm64)
9635 HALT_UNALLOC;
9636
9637 if (size == 0)
9638 {
9639 uint32_t bimm = (uint32_t) bimm64;
9640
9641 switch (dispatch)
9642 {
9643 case 0: and32 (cpu, bimm); return;
9644 case 1: orr32 (cpu, bimm); return;
9645 case 2: eor32 (cpu, bimm); return;
9646 case 3: ands32 (cpu, bimm); return;
9647 }
9648 }
9649 else
9650 {
9651 switch (dispatch)
9652 {
9653 case 0: and64 (cpu, bimm64); return;
9654 case 1: orr64 (cpu, bimm64); return;
9655 case 2: eor64 (cpu, bimm64); return;
9656 case 3: ands64 (cpu, bimm64); return;
9657 }
9658 }
9659 HALT_UNALLOC;
9660 }
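
/* To illustrate the LITable lookup above: AND W0, W1, #0xff is
   encoded with N = 0, immr = 000000 and imms = 000111, giving

       index = INSTR (22, 10);      -- N:immr:imms = 7
       bimm64 = LITable [index];    -- eight ones, replicated to fill
                                       the register

   The 32-bit path then truncates bimm64 with a (uint32_t) cast.
   Field combinations that decode to no valid mask leave a zero
   table entry, which is why a zero bimm64 halts as UNALLOC.  */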
9661
9662 /* Immediate move.
9663 The uimm argument is a 16 bit value to be inserted into the
9664 target register; the pos argument locates the 16 bit word in the
9665 dest register, i.e. it is in {0, 1} for 32 bit and {0, 1, 2,
9666 3} for 64 bit.
9667 N.B. the register arg may not be SP, so it should be
9668 accessed using the setGZRegisterXXX accessors. */
9669
9670 /* 32 bit move 16 bit immediate zero remaining shorts. */
9671 static void
9672 movz32 (sim_cpu *cpu, uint32_t val, uint32_t pos)
9673 {
9674 unsigned rd = INSTR (4, 0);
9675
9676 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9677 aarch64_set_reg_u64 (cpu, rd, NO_SP, val << (pos * 16));
9678 }
9679
9680 /* 64 bit move 16 bit immediate zero remaining shorts. */
9681 static void
9682 movz64 (sim_cpu *cpu, uint32_t val, uint32_t pos)
9683 {
9684 unsigned rd = INSTR (4, 0);
9685
9686 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9687 aarch64_set_reg_u64 (cpu, rd, NO_SP, ((uint64_t) val) << (pos * 16));
9688 }
9689
9690 /* 32 bit move 16 bit immediate negated. */
9691 static void
9692 movn32 (sim_cpu *cpu, uint32_t val, uint32_t pos)
9693 {
9694 unsigned rd = INSTR (4, 0);
9695
9696 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9697 aarch64_set_reg_u64 (cpu, rd, NO_SP, ((val << (pos * 16)) ^ 0xffffffffU));
9698 }
9699
9700 /* 64 bit move 16 bit immediate negated. */
9701 static void
9702 movn64 (sim_cpu *cpu, uint32_t val, uint32_t pos)
9703 {
9704 unsigned rd = INSTR (4, 0);
9705
9706 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9707 aarch64_set_reg_u64
9708 (cpu, rd, NO_SP, ((((uint64_t) val) << (pos * 16))
9709 ^ 0xffffffffffffffffULL));
9710 }
9711
9712 /* 32 bit move 16 bit immediate keep remaining shorts. */
9713 static void
9714 movk32 (sim_cpu *cpu, uint32_t val, uint32_t pos)
9715 {
9716 unsigned rd = INSTR (4, 0);
9717 uint32_t current = aarch64_get_reg_u32 (cpu, rd, NO_SP);
9718 uint32_t value = val << (pos * 16);
9719 uint32_t mask = ~(0xffffU << (pos * 16));
9720
9721 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9722 aarch64_set_reg_u64 (cpu, rd, NO_SP, (value | (current & mask)));
9723 }
9724
9725 /* 64 bit move 16 bit immediate keep remaining shorts. */
9726 static void
9727 movk64 (sim_cpu *cpu, uint32_t val, uint32_t pos)
9728 {
9729 unsigned rd = INSTR (4, 0);
9730 uint64_t current = aarch64_get_reg_u64 (cpu, rd, NO_SP);
9731 uint64_t value = (uint64_t) val << (pos * 16);
9732 uint64_t mask = ~(0xffffULL << (pos * 16));
9733
9734 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9735 aarch64_set_reg_u64 (cpu, rd, NO_SP, (value | (current & mask)));
9736 }
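
/* Together these routines let a 64-bit constant be built 16 bits at
   a time.  For example, loading 0x1234567890ab:

       movz x0, #0x90ab              -- x0 = 0x00000000000090ab
       movk x0, #0x5678, lsl #16     -- x0 = 0x00000000567890ab
       movk x0, #0x1234, lsl #32     -- x0 = 0x00001234567890ab

   where each movk keeps the shorts already written by masking with
   ~(0xffffULL << (pos * 16)).  */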
9737
9738 static void
9739 dexMoveWideImmediate (sim_cpu *cpu)
9740 {
9741 /* assert instr[28:23] = 100101
9742 instr[31] = size : 0 ==> 32 bit, 1 ==> 64 bit
9743 instr[30,29] = op : 0 ==> MOVN, 1 ==> UNALLOC, 2 ==> MOVZ, 3 ==> MOVK
9744 instr[22,21] = shift : 00 == LSL#0, 01 = LSL#16, 10 = LSL#32, 11 = LSL#48
9745 instr[20,5] = uimm16
9746 instr[4,0] = Rd */
9747
9748 /* N.B. the (multiple of 16) shift is applied by the called routine;
9749 we just pass the multiplier. */
9750
9751 uint32_t imm;
9752 uint32_t size = INSTR (31, 31);
9753 uint32_t op = INSTR (30, 29);
9754 uint32_t shift = INSTR (22, 21);
9755
9756 /* 32 bit can only shift 0 or 1 lot of 16.
9757 Anything else is an unallocated instruction. */
9758 if (size == 0 && (shift > 1))
9759 HALT_UNALLOC;
9760
9761 if (op == 1)
9762 HALT_UNALLOC;
9763
9764 imm = INSTR (20, 5);
9765
9766 if (size == 0)
9767 {
9768 if (op == 0)
9769 movn32 (cpu, imm, shift);
9770 else if (op == 2)
9771 movz32 (cpu, imm, shift);
9772 else
9773 movk32 (cpu, imm, shift);
9774 }
9775 else
9776 {
9777 if (op == 0)
9778 movn64 (cpu, imm, shift);
9779 else if (op == 2)
9780 movz64 (cpu, imm, shift);
9781 else
9782 movk64 (cpu, imm, shift);
9783 }
9784 }
9785
9786 /* Bitfield operations.
9787 These take a pair of bit positions r and s which are in {0..31}
9788 or {0..63} depending on the instruction word size.
9789 N.B. register args may not be SP. */
9790
9791 /* OK, we start with ubfm, which just needs to pick
9792 some bits out of the source, zero the rest, and write
9793 the result to the dest; two logical shifts suffice. */
9794
9795 /* 32 bit bitfield move, left and right of affected zeroed
9796 if r <= s Wd<s-r:0> = Wn<s:r> else Wd<32+s-r,32-r> = Wn<s:0>. */
9797 static void
9798 ubfm32 (sim_cpu *cpu, uint32_t r, uint32_t s)
9799 {
9800 unsigned rd;
9801 unsigned rn = INSTR (9, 5);
9802 uint32_t value = aarch64_get_reg_u32 (cpu, rn, NO_SP);
9803
9804 /* Pick either s+1-r or s+1 consecutive bits out of the original word. */
9805 if (r <= s)
9806 {
9807 /* 31:...:s:xxx:r:...:0 ==> 31:...:s-r:xxx:0.
9808 We want only bits s:xxx:r at the bottom of the word
9809 so we LSL bit s up to bit 31 i.e. by 31 - s
9810 and then we LSR to bring bit 31 down to bit s - r
9811 i.e. by 31 + r - s. */
9812 value <<= 31 - s;
9813 value >>= 31 + r - s;
9814 }
9815 else
9816 {
9817 /* 31:...:s:xxx:0 ==> 31:...:31-(r-1)+s:xxx:31-(r-1):...:0
9818 We want only bits s:xxx:0 starting at bit 31-(r-1)
9819 so we LSL bit s up to bit 31 i.e. by 31 - s
9820 and then we LSL to bring bit 31 down to 31-(r-1)+s
9821 i.e. by r - (s + 1). */
9822 value <<= 31 - s;
9823 value >>= r - (s + 1);
9824 }
9825
9826 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9827 rd = INSTR (4, 0);
9828 aarch64_set_reg_u64 (cpu, rd, NO_SP, value);
9829 }
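
/* ubfm32 underlies the LSR, LSL and UBFX aliases.  For example
   LSR W0, W1, #5 is UBFM with r = 5, s = 31 and takes the r <= s
   branch:

       value <<= 31 - 31;       -- no-op
       value >>= 31 + 5 - 31;   -- logical right shift by 5

   while LSL W0, W1, #5 is UBFM with r = 27, s = 26 and takes the
   r > s branch, shifting left by 31 - 26 = 5.  */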
9830
9831 /* 64 bit bitfield move, left and right of affected zeroed
9832 if r <= s Wd<s-r:0> = Wn<s:r> else Wd<64+s-r,64-r> = Wn<s:0>. */
9833 static void
9834 ubfm (sim_cpu *cpu, uint32_t r, uint32_t s)
9835 {
9836 unsigned rd;
9837 unsigned rn = INSTR (9, 5);
9838 uint64_t value = aarch64_get_reg_u64 (cpu, rn, NO_SP);
9839
9840 if (r <= s)
9841 {
9842 /* 63:...:s:xxx:r:...:0 ==> 63:...:s-r:xxx:0.
9843 We want only bits s:xxx:r at the bottom of the word.
9844 So we LSL bit s up to bit 63 i.e. by 63 - s
9845 and then we LSR to bring bit 63 down to bit s - r
9846 i.e. by 63 + r - s. */
9847 value <<= 63 - s;
9848 value >>= 63 + r - s;
9849 }
9850 else
9851 {
9852 /* 63:...:s:xxx:0 ==> 63:...:63-(r-1)+s:xxx:63-(r-1):...:0.
9853 We want only bits s:xxx:0 starting at bit 63-(r-1).
9854 So we LSL bit s up to bit 63 i.e. by 63 - s
9855 and then we LSL to bring bit 63 down to 63-(r-1)+s
9856 i.e. by r - (s + 1). */
9857 value <<= 63 - s;
9858 value >>= r - (s + 1);
9859 }
9860
9861 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9862 rd = INSTR (4, 0);
9863 aarch64_set_reg_u64 (cpu, rd, NO_SP, value);
9864 }
9865
9866 /* The signed versions need to insert sign bits
9867 on the left of the inserted bit field, so we do
9868 much the same as the unsigned version except we
9869 use an arithmetic shift right -- this just means
9870 we need to operate on signed values. */
9871
9872 /* 32 bit bitfield move, left of affected sign-extended, right zeroed. */
9873 /* If r <= s Wd<s-r:0> = Wn<s:r> else Wd<32+s-r,32-r> = Wn<s:0>. */
9874 static void
9875 sbfm32 (sim_cpu *cpu, uint32_t r, uint32_t s)
9876 {
9877 unsigned rd;
9878 unsigned rn = INSTR (9, 5);
9879 /* as per ubfm32 but use an ASR instead of an LSR. */
9880 int32_t value = aarch64_get_reg_s32 (cpu, rn, NO_SP);
9881
9882 if (r <= s)
9883 {
9884 value <<= 31 - s;
9885 value >>= 31 + r - s;
9886 }
9887 else
9888 {
9889 value <<= 31 - s;
9890 value >>= r - (s + 1);
9891 }
9892
9893 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9894 rd = INSTR (4, 0);
9895 aarch64_set_reg_u64 (cpu, rd, NO_SP, (uint32_t) value);
9896 }
9897
9898 /* 64 bit bitfield move, left of affected sign-extended, right zeroed. */
9899 /* If r <= s Wd<s-r:0> = Wn<s:r> else Wd<64+s-r,64-r> = Wn<s:0>. */
9900 static void
9901 sbfm (sim_cpu *cpu, uint32_t r, uint32_t s)
9902 {
9903 unsigned rd;
9904 unsigned rn = INSTR (9, 5);
9905 /* as per ubfm but use an ASR instead of an LSR. */
9906 int64_t value = aarch64_get_reg_s64 (cpu, rn, NO_SP);
9907
9908 if (r <= s)
9909 {
9910 value <<= 63 - s;
9911 value >>= 63 + r - s;
9912 }
9913 else
9914 {
9915 value <<= 63 - s;
9916 value >>= r - (s + 1);
9917 }
9918
9919 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9920 rd = INSTR (4, 0);
9921 aarch64_set_reg_s64 (cpu, rd, NO_SP, value);
9922 }
9923
9924 /* Finally, these versions leave non-affected bits
9925 as is, so we need to generate the bits as per
9926 ubfm and also generate a mask to pick the
9927 bits from the original and computed values. */
9928
9929 /* 32 bit bitfield move, non-affected bits left as is.
9930 If r <= s Wd<s-r:0> = Wn<s:r> else Wd<32+s-r,32-r> = Wn<s:0>. */
9931 static void
9932 bfm32 (sim_cpu *cpu, uint32_t r, uint32_t s)
9933 {
9934 unsigned rn = INSTR (9, 5);
9935 uint32_t value = aarch64_get_reg_u32 (cpu, rn, NO_SP);
9936 uint32_t mask = -1;
9937 unsigned rd;
9938 uint32_t value2;
9939
9940 /* Pick either s+1-r or s+1 consecutive bits out of the original word. */
9941 if (r <= s)
9942 {
9943 /* 31:...:s:xxx:r:...:0 ==> 31:...:s-r:xxx:0.
9944 We want only bits s:xxx:r at the bottom of the word
9945 so we LSL bit s up to bit 31 i.e. by 31 - s
9946 and then we LSR to bring bit 31 down to bit s - r
9947 i.e. by 31 + r - s. */
9948 value <<= 31 - s;
9949 value >>= 31 + r - s;
9950 /* the mask must include the same bits. */
9951 mask <<= 31 - s;
9952 mask >>= 31 + r - s;
9953 }
9954 else
9955 {
9956 /* 31:...:s:xxx:0 ==> 31:...:31-(r-1)+s:xxx:31-(r-1):...:0.
9957 We want only bits s:xxx:0 starting at bit 31-(r-1)
9958 so we LSL bit s up to bit 31 i.e. by 31 - s
9959 and then we LSL to bring bit 31 down to 31-(r-1)+s
9960 i.e. by r - (s + 1). */
9961 value <<= 31 - s;
9962 value >>= r - (s + 1);
9963 /* The mask must include the same bits. */
9964 mask <<= 31 - s;
9965 mask >>= r - (s + 1);
9966 }
9967
9968 rd = INSTR (4, 0);
9969 value2 = aarch64_get_reg_u32 (cpu, rd, NO_SP);
9970
9971 value2 &= ~mask;
9972 value2 |= value;
9973
9974 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
9975 aarch64_set_reg_u64
9976 (cpu, rd, NO_SP, value2);
9977 }
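
/* bfm32 underlies the BFI and BFXIL aliases.  For example
   BFI W0, W1, #8, #4 is BFM with r = 24, s = 3 (so r > s), moving
   W1<3:0> up to W0<11:8>:

       value <<= 31 - 3;    -- bits 3:0 now at 31:28
       value >>= 24 - 4;    -- and back down to 11:8

   The mask is shifted the same way, so only those four destination
   bits are replaced.  */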
9978
9979 /* 64 bit bitfield move, non-affected bits left as is.
9980 If r <= s Wd<s-r:0> = Wn<s:r> else Wd<64+s-r,64-r> = Wn<s:0>. */
9981 static void
9982 bfm (sim_cpu *cpu, uint32_t r, uint32_t s)
9983 {
9984 unsigned rd;
9985 unsigned rn = INSTR (9, 5);
9986 uint64_t value = aarch64_get_reg_u64 (cpu, rn, NO_SP);
9987 uint64_t mask = 0xffffffffffffffffULL;
9988
9989 if (r <= s)
9990 {
9991 /* 63:...:s:xxx:r:...:0 ==> 63:...:s-r:xxx:0.
9992 We want only bits s:xxx:r at the bottom of the word
9993 so we LSL bit s up to bit 63 i.e. by 63 - s
9994 and then we LSR to bring bit 63 down to bit s - r
9995 i.e. by 63 + r - s. */
9996 value <<= 63 - s;
9997 value >>= 63 + r - s;
9998 /* The mask must include the same bits. */
9999 mask <<= 63 - s;
10000 mask >>= 63 + r - s;
10001 }
10002 else
10003 {
10004 /* 63:...:s:xxx:0 ==> 63:...:63-(r-1)+s:xxx:63-(r-1):...:0
10005 We want only bits s:xxx:0 starting at bit 63-(r-1)
10006 so we LSL bit s up to bit 63 i.e. by 63 - s
10007 and then we LSL to bring bit 63 down to 63-(r-1)+s
10008 i.e. by r - (s + 1). */
10009 value <<= 63 - s;
10010 value >>= r - (s + 1);
10011 /* The mask must include the same bits. */
10012 mask <<= 63 - s;
10013 mask >>= r - (s + 1);
10014 }
10015
10016 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
10017 rd = INSTR (4, 0);
10018 aarch64_set_reg_u64
10019 (cpu, rd, NO_SP, (aarch64_get_reg_u64 (cpu, rd, NO_SP) & ~mask) | value);
10020 }
10021
10022 static void
10023 dexBitfieldImmediate (sim_cpu *cpu)
10024 {
10025 /* assert instr[28:23] = 100110
10026 instr[31] = size : 0 ==> 32 bit, 1 ==> 64 bit
10027 instr[30,29] = op : 0 ==> SBFM, 1 ==> BFM, 2 ==> UBFM, 3 ==> UNALLOC
10028 instr[22] = N : must be 0 for 32 bit, 1 for 64 bit ow UNALLOC
10029 instr[21,16] = immr : 0xxxxx for 32 bit, xxxxxx for 64 bit
10030 instr[15,10] = imms : 0xxxxx for 32 bit, xxxxxx for 64 bit
10031 instr[9,5] = Rn
10032 instr[4,0] = Rd */
10033
10034 /* 32 bit operations must have N = 0 or else we have an UNALLOC. */
10035 uint32_t dispatch;
10036 uint32_t imms;
10037 uint32_t size = INSTR (31, 31);
10038 uint32_t N = INSTR (22, 22);
10039 /* 32 bit operations must have immr[5] = 0 and imms[5] = 0. */
10040 /* or else we have an UNALLOC. */
10041 uint32_t immr = INSTR (21, 16);
10042
10043 if (~size & N)
10044 HALT_UNALLOC;
10045
10046 if (!size && uimm (immr, 5, 5))
10047 HALT_UNALLOC;
10048
10049 imms = INSTR (15, 10);
10050 if (!size && uimm (imms, 5, 5))
10051 HALT_UNALLOC;
10052
10053 /* Switch on combined size and op. */
10054 dispatch = INSTR (31, 29);
10055 switch (dispatch)
10056 {
10057 case 0: sbfm32 (cpu, immr, imms); return;
10058 case 1: bfm32 (cpu, immr, imms); return;
10059 case 2: ubfm32 (cpu, immr, imms); return;
10060 case 4: sbfm (cpu, immr, imms); return;
10061 case 5: bfm (cpu, immr, imms); return;
10062 case 6: ubfm (cpu, immr, imms); return;
10063 default: HALT_UNALLOC;
10064 }
10065 }
10066
10067 static void
10068 do_EXTR_32 (sim_cpu *cpu)
10069 {
10070 /* instr[31:21] = 00010011100
10071 instr[20,16] = Rm
10072 instr[15,10] = imms : 0xxxxx for 32 bit
10073 instr[9,5] = Rn
10074 instr[4,0] = Rd */
10075 unsigned rm = INSTR (20, 16);
10076 unsigned imms = INSTR (15, 10) & 31;
10077 unsigned rn = INSTR ( 9, 5);
10078 unsigned rd = INSTR ( 4, 0);
10079 uint64_t val1;
10080 uint64_t val2;
10081
10082 val1 = aarch64_get_reg_u32 (cpu, rm, NO_SP);
10083 val1 >>= imms;
10084 val2 = aarch64_get_reg_u32 (cpu, rn, NO_SP);
10085 val2 <<= (32 - imms);
10086
10087 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
10088 aarch64_set_reg_u64 (cpu, rd, NO_SP, (uint32_t) (val1 | val2));
10089 }
10090
10091 static void
10092 do_EXTR_64 (sim_cpu *cpu)
10093 {
10094 /* instr[31:21] = 10010011100
10095 instr[20,16] = Rm
10096 instr[15,10] = imms
10097 instr[9,5] = Rn
10098 instr[4,0] = Rd */
10099 unsigned rm = INSTR (20, 16);
10100 unsigned imms = INSTR (15, 10) & 63;
10101 unsigned rn = INSTR ( 9, 5);
10102 unsigned rd = INSTR ( 4, 0);
10103 uint64_t val;
10104
10105 val = aarch64_get_reg_u64 (cpu, rm, NO_SP);
10106 val >>= imms;
10107 if (imms) val |= (aarch64_get_reg_u64 (cpu, rn, NO_SP) << (64 - imms));
10108
10109 aarch64_set_reg_u64 (cpu, rd, NO_SP, val);
10110 }
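
/* EXTR pulls 64 bits out of the Rn:Rm concatenation, starting at
   bit imms.  With Rn == Rm it is the ROR alias; for example with
   x1 = 0x00000000deadbeef:

       ror x0, x1, #16    -- EXTR x0, x1, x1, #16
                          -- x0 = 0xbeef00000000dead

   i.e. val = (x1 >> 16) | (x1 << 48) in the code above.  */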
10111
10112 static void
10113 dexExtractImmediate (sim_cpu *cpu)
10114 {
10115 /* assert instr[28:23] = 100111
10116 instr[31] = size : 0 ==> 32 bit, 1 ==> 64 bit
10117 instr[30,29] = op21 : 0 ==> EXTR, 1,2,3 ==> UNALLOC
10118 instr[22] = N : must be 0 for 32 bit, 1 for 64 bit or UNALLOC
10119 instr[21] = op0 : must be 0 or UNALLOC
10120 instr[20,16] = Rm
10121 instr[15,10] = imms : 0xxxxx for 32 bit, xxxxxx for 64 bit
10122 instr[9,5] = Rn
10123 instr[4,0] = Rd */
10124
10125 /* 32 bit operations must have N = 0 or else we have an UNALLOC. */
10126 /* 64 bit operations must have N = 1 or else we have an UNALLOC. */
10127 uint32_t dispatch;
10128 uint32_t size = INSTR (31, 31);
10129 uint32_t N = INSTR (22, 22);
10130 /* 32 bit operations must have imms[5] = 0
10131 or else we have an UNALLOC. */
10132 uint32_t imms = INSTR (15, 10);
10133
10134 if (size ^ N)
10135 HALT_UNALLOC;
10136
10137 if (!size && uimm (imms, 5, 5))
10138 HALT_UNALLOC;
10139
10140 /* Switch on combined size and op. */
10141 dispatch = INSTR (31, 29);
10142
10143 if (dispatch == 0)
10144 do_EXTR_32 (cpu);
10145
10146 else if (dispatch == 4)
10147 do_EXTR_64 (cpu);
10148
10149 else if (dispatch == 1)
10150 HALT_NYI;
10151 else
10152 HALT_UNALLOC;
10153 }
10154
10155 static void
10156 dexDPImm (sim_cpu *cpu)
10157 {
10158 /* uint32_t group = dispatchGroup (aarch64_get_instr (cpu));
10159 assert group == GROUP_DPIMM_1000 || group == GROUP_DPIMM_1001
10160 bits [25,23] of a DPImm are the secondary dispatch vector. */
10161 uint32_t group2 = dispatchDPImm (aarch64_get_instr (cpu));
10162
10163 switch (group2)
10164 {
10165 case DPIMM_PCADR_000:
10166 case DPIMM_PCADR_001:
10167 dexPCRelAddressing (cpu);
10168 return;
10169
10170 case DPIMM_ADDSUB_010:
10171 case DPIMM_ADDSUB_011:
10172 dexAddSubtractImmediate (cpu);
10173 return;
10174
10175 case DPIMM_LOG_100:
10176 dexLogicalImmediate (cpu);
10177 return;
10178
10179 case DPIMM_MOV_101:
10180 dexMoveWideImmediate (cpu);
10181 return;
10182
10183 case DPIMM_BITF_110:
10184 dexBitfieldImmediate (cpu);
10185 return;
10186
10187 case DPIMM_EXTR_111:
10188 dexExtractImmediate (cpu);
10189 return;
10190
10191 default:
10192 /* Should never reach here. */
10193 HALT_NYI;
10194 }
10195 }
10196
10197 static void
10198 dexLoadUnscaledImmediate (sim_cpu *cpu)
10199 {
10200 /* instr[29,24] == 111_00
10201 instr[21] == 0
10202 instr[11,10] == 00
10203 instr[31,30] = size
10204 instr[26] = V
10205 instr[23,22] = opc
10206 instr[20,12] = simm9
10207 instr[9,5] = rn may be SP. */
10208 /* unsigned rt = INSTR (4, 0); */
10209 uint32_t V = INSTR (26, 26);
10210 uint32_t dispatch = ((INSTR (31, 30) << 2) | INSTR (23, 22));
10211 int32_t imm = simm32 (aarch64_get_instr (cpu), 20, 12);
10212
10213 if (!V)
10214 {
10215 /* GReg operations. */
10216 switch (dispatch)
10217 {
10218 case 0: sturb (cpu, imm); return;
10219 case 1: ldurb32 (cpu, imm); return;
10220 case 2: ldursb64 (cpu, imm); return;
10221 case 3: ldursb32 (cpu, imm); return;
10222 case 4: sturh (cpu, imm); return;
10223 case 5: ldurh32 (cpu, imm); return;
10224 case 6: ldursh64 (cpu, imm); return;
10225 case 7: ldursh32 (cpu, imm); return;
10226 case 8: stur32 (cpu, imm); return;
10227 case 9: ldur32 (cpu, imm); return;
10228 case 10: ldursw (cpu, imm); return;
10229 case 12: stur64 (cpu, imm); return;
10230 case 13: ldur64 (cpu, imm); return;
10231
10232 case 14:
10233 /* PRFUM NYI. */
10234 HALT_NYI;
10235
10236 default:
10237 case 11:
10238 case 15:
10239 HALT_UNALLOC;
10240 }
10241 }
10242
10243 /* FReg operations. */
10244 switch (dispatch)
10245 {
10246 case 2: fsturq (cpu, imm); return;
10247 case 3: fldurq (cpu, imm); return;
10248 case 8: fsturs (cpu, imm); return;
10249 case 9: fldurs (cpu, imm); return;
10250 case 12: fsturd (cpu, imm); return;
10251 case 13: fldurd (cpu, imm); return;
10252
10253 case 0: /* STUR 8 bit FP. */
10254 case 1: /* LDUR 8 bit FP. */
10255 case 4: /* STUR 16 bit FP. */
10256 case 5: /* LDUR 16 bit FP. */
10257 HALT_NYI;
10258
10259 default:
10260 case 6:
10261 case 7:
10262 case 10:
10263 case 11:
10264 case 14:
10265 case 15:
10266 HALT_UNALLOC;
10267 }
10268 }
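
/* An example of the dispatch computation above: LDUR X1, [SP, #-8]
   has size = 11, V = 0 and opc = 01, so

       dispatch = (3 << 2) | 1;    -- 13 ==> ldur64

   with imm holding the sign-extended 9-bit offset -8 taken from
   instr[20,12].  */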
10269
10270 /* N.B. A preliminary note regarding all the ldrs<x>32
10271 instructions
10272
10273 The signed value loaded by these instructions is cast to unsigned
10274 before being assigned to aarch64_get_reg_u64 (cpu, N) i.e. to the
10275 64 bit element of the GReg union. This performs a 32 bit sign extension
10276 (as required) but avoids 64 bit sign extension, thus ensuring that the
10277 top half of the register word is zero. This is what the spec demands
10278 when a 32 bit load occurs. */
10279
10280 /* 32 bit load sign-extended byte scaled unsigned 12 bit. */
10281 static void
10282 ldrsb32_abs (sim_cpu *cpu, uint32_t offset)
10283 {
10284 unsigned int rn = INSTR (9, 5);
10285 unsigned int rt = INSTR (4, 0);
10286
10287 /* The target register may not be SP but the source may be;
10288 there is no scaling required for a byte load. */
10289 uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK) + offset;
10290 aarch64_set_reg_u64 (cpu, rt, NO_SP,
10291 (int64_t) aarch64_get_mem_s8 (cpu, address));
10292 }
10293
10294 /* 32 bit load sign-extended byte scaled or unscaled zero-
10295 or sign-extended 32-bit register offset. */
10296 static void
10297 ldrsb32_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
10298 {
10299 unsigned int rm = INSTR (20, 16);
10300 unsigned int rn = INSTR (9, 5);
10301 unsigned int rt = INSTR (4, 0);
10302
10303 /* rn may reference SP, rm and rt must reference ZR. */
10304
10305 uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
10306 int64_t displacement = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP),
10307 extension);
10308
10309 /* There is no scaling required for a byte load. */
10310 aarch64_set_reg_u64
10311 (cpu, rt, NO_SP, (int64_t) aarch64_get_mem_s8 (cpu, address
10312 + displacement));
10313 }
10314
10315 /* 32 bit load sign-extended byte unscaled signed 9 bit with
10316 pre- or post-writeback. */
10317 static void
10318 ldrsb32_wb (sim_cpu *cpu, int32_t offset, WriteBack wb)
10319 {
10320 uint64_t address;
10321 unsigned int rn = INSTR (9, 5);
10322 unsigned int rt = INSTR (4, 0);
10323
10324 if (rn == rt && wb != NoWriteBack)
10325 HALT_UNALLOC;
10326
10327 address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
10328
10329 if (wb == Pre)
10330 address += offset;
10331
10332 aarch64_set_reg_u64 (cpu, rt, NO_SP,
10333 (int64_t) aarch64_get_mem_s8 (cpu, address));
10334
10335 if (wb == Post)
10336 address += offset;
10337
10338 if (wb != NoWriteBack)
10339 aarch64_set_reg_u64 (cpu, rn, NO_SP, address);
10340 }
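
/* The writeback handling above distinguishes the two indexed forms.
   For example, with x2 = 0x1000:

       ldrsb w0, [x2, #4]!    -- Pre:  load from 0x1004, x2 = 0x1004
       ldrsb w0, [x2], #4     -- Post: load from 0x1000, x2 = 0x1004

   The rn == rt case with writeback is rejected up front, since the
   loaded value and the updated base would collide.  */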
10341
10342 /* 8 bit store scaled. */
10343 static void
10344 fstrb_abs (sim_cpu *cpu, uint32_t offset)
10345 {
10346 unsigned st = INSTR (4, 0);
10347 unsigned rn = INSTR (9, 5);
10348
10349 aarch64_set_mem_u8 (cpu,
10350 aarch64_get_reg_u64 (cpu, rn, SP_OK) + offset,
10351 aarch64_get_vec_u8 (cpu, st, 0));
10352 }
10353
10354 /* 8 bit store scaled or unscaled zero- or
10355 sign-extended 8-bit register offset. */
10356 static void
10357 fstrb_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
10358 {
10359 unsigned rm = INSTR (20, 16);
10360 unsigned rn = INSTR (9, 5);
10361 unsigned st = INSTR (4, 0);
10362
10363 uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
10364 int64_t extended = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP),
10365 extension);
10366 uint64_t displacement = extended; /* No scaling for a byte access. */
10367
10368 aarch64_set_mem_u8
10369 (cpu, address + displacement, aarch64_get_vec_u8 (cpu, st, 0));
10370 }
10371
10372 /* 16 bit store scaled. */
10373 static void
10374 fstrh_abs (sim_cpu *cpu, uint32_t offset)
10375 {
10376 unsigned st = INSTR (4, 0);
10377 unsigned rn = INSTR (9, 5);
10378
10379 aarch64_set_mem_u16
10380 (cpu,
10381 aarch64_get_reg_u64 (cpu, rn, SP_OK) + SCALE (offset, 16),
10382 aarch64_get_vec_u16 (cpu, st, 0));
10383 }
10384
10385 /* 16 bit store scaled or unscaled zero-
10386 or sign-extended 16-bit register offset. */
10387 static void
10388 fstrh_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
10389 {
10390 unsigned rm = INSTR (20, 16);
10391 unsigned rn = INSTR (9, 5);
10392 unsigned st = INSTR (4, 0);
10393
10394 uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
10395 int64_t extended = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP),
10396 extension);
10397 uint64_t displacement = OPT_SCALE (extended, 16, scaling);
10398
10399 aarch64_set_mem_u16
10400 (cpu, address + displacement, aarch64_get_vec_u16 (cpu, st, 0));
10401 }
10402
10403 /* 32 bit store scaled unsigned 12 bit. */
10404 static void
10405 fstrs_abs (sim_cpu *cpu, uint32_t offset)
10406 {
10407 unsigned st = INSTR (4, 0);
10408 unsigned rn = INSTR (9, 5);
10409
10410 aarch64_set_mem_u32
10411 (cpu,
10412 aarch64_get_reg_u64 (cpu, rn, SP_OK) + SCALE (offset, 32),
10413 aarch64_get_vec_u32 (cpu, st, 0));
10414 }
10415
10416 /* 32 bit store unscaled signed 9 bit with pre- or post-writeback. */
10417 static void
10418 fstrs_wb (sim_cpu *cpu, int32_t offset, WriteBack wb)
10419 {
10420 unsigned rn = INSTR (9, 5);
10421 unsigned st = INSTR (4, 0);
10422
10423 uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
10424
10425 if (wb != Post)
10426 address += offset;
10427
10428 aarch64_set_mem_u32 (cpu, address, aarch64_get_vec_u32 (cpu, st, 0));
10429
10430 if (wb == Post)
10431 address += offset;
10432
10433 if (wb != NoWriteBack)
10434 aarch64_set_reg_u64 (cpu, rn, SP_OK, address);
10435 }
10436
10437 /* 32 bit store scaled or unscaled zero-
10438 or sign-extended 32-bit register offset. */
10439 static void
10440 fstrs_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
10441 {
10442 unsigned rm = INSTR (20, 16);
10443 unsigned rn = INSTR (9, 5);
10444 unsigned st = INSTR (4, 0);
10445
10446 uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
10447 int64_t extended = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP),
10448 extension);
10449 uint64_t displacement = OPT_SCALE (extended, 32, scaling);
10450
10451 aarch64_set_mem_u32
10452 (cpu, address + displacement, aarch64_get_vec_u32 (cpu, st, 0));
10453 }
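/* For illustration (an editorial addition): with scaling == Scaled and a
   32-bit transfer, OPT_SCALE shifts the extended index left by two, so
   the store above matches, e.g., STR S0, [X1, X2, LSL #2]; with Unscaled
   the index is used unshifted.  */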
10454
10455 /* 64 bit store scaled unsigned 12 bit. */
10456 static void
10457 fstrd_abs (sim_cpu *cpu, uint32_t offset)
10458 {
10459 unsigned st = INSTR (4, 0);
10460 unsigned rn = INSTR (9, 5);
10461
10462 aarch64_set_mem_u64
10463 (cpu,
10464 aarch64_get_reg_u64 (cpu, rn, SP_OK) + SCALE (offset, 64),
10465 aarch64_get_vec_u64 (cpu, st, 0));
10466 }
10467
10468 /* 64 bit store unscaled signed 9 bit with pre- or post-writeback. */
10469 static void
10470 fstrd_wb (sim_cpu *cpu, int32_t offset, WriteBack wb)
10471 {
10472 unsigned rn = INSTR (9, 5);
10473 unsigned st = INSTR (4, 0);
10474
10475 uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
10476
10477 if (wb != Post)
10478 address += offset;
10479
10480 aarch64_set_mem_u64 (cpu, address, aarch64_get_vec_u64 (cpu, st, 0));
10481
10482 if (wb == Post)
10483 address += offset;
10484
10485 if (wb != NoWriteBack)
10486 aarch64_set_reg_u64 (cpu, rn, SP_OK, address);
10487 }
10488
10489 /* 64 bit store scaled or unscaled zero-
10490 or sign-extended 32-bit register offset. */
10491 static void
10492 fstrd_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
10493 {
10494 unsigned rm = INSTR (20, 16);
10495 unsigned rn = INSTR (9, 5);
10496 unsigned st = INSTR (4, 0);
10497
10498 uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
10499 int64_t extended = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP),
10500 extension);
10501 uint64_t displacement = OPT_SCALE (extended, 64, scaling);
10502
10503 aarch64_set_mem_u64
10504 (cpu, address + displacement, aarch64_get_vec_u64 (cpu, st, 0));
10505 }
10506
10507 /* 128 bit store scaled unsigned 12 bit. */
10508 static void
10509 fstrq_abs (sim_cpu *cpu, uint32_t offset)
10510 {
10511 FRegister a;
10512 unsigned st = INSTR (4, 0);
10513 unsigned rn = INSTR (9, 5);
10514 uint64_t addr;
10515
10516 aarch64_get_FP_long_double (cpu, st, & a);
10517
10518 addr = aarch64_get_reg_u64 (cpu, rn, SP_OK) + SCALE (offset, 128);
10519 aarch64_set_mem_long_double (cpu, addr, a);
10520 }
10521
10522 /* 128 bit store unscaled signed 9 bit with pre- or post-writeback. */
10523 static void
10524 fstrq_wb (sim_cpu *cpu, int32_t offset, WriteBack wb)
10525 {
10526 FRegister a;
10527 unsigned rn = INSTR (9, 5);
10528 unsigned st = INSTR (4, 0);
10529 uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
10530
10531 if (wb != Post)
10532 address += offset;
10533
10534 aarch64_get_FP_long_double (cpu, st, & a);
10535 aarch64_set_mem_long_double (cpu, address, a);
10536
10537 if (wb == Post)
10538 address += offset;
10539
10540 if (wb != NoWriteBack)
10541 aarch64_set_reg_u64 (cpu, rn, SP_OK, address);
10542 }
10543
10544 /* 128 bit store scaled or unscaled zero-
10545 or sign-extended 32-bit register offset. */
10546 static void
10547 fstrq_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
10548 {
10549 unsigned rm = INSTR (20, 16);
10550 unsigned rn = INSTR (9, 5);
10551 unsigned st = INSTR (4, 0);
10552
10553 uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK);
10554 int64_t extended = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP),
10555 extension);
10556 uint64_t displacement = OPT_SCALE (extended, 128, scaling);
10557
10558 FRegister a;
10559
10560 aarch64_get_FP_long_double (cpu, st, & a);
10561 aarch64_set_mem_long_double (cpu, address + displacement, a);
10562 }
10563
10564 static void
10565 dexLoadImmediatePrePost (sim_cpu *cpu)
10566 {
10567 /* instr[31,30] = size
10568 instr[29,27] = 111
10569 instr[26] = V
10570 instr[25,24] = 00
10571 instr[23,22] = opc
10572 instr[21] = 0
10573 instr[20,12] = simm9
10574 instr[11] = wb : 0 ==> Post, 1 ==> Pre
10575 instr[10] = 1
10576 instr[9,5] = Rn may be SP.
10577 instr[4,0] = Rt */
10578
10579 uint32_t V = INSTR (26, 26);
10580 uint32_t dispatch = ((INSTR (31, 30) << 2) | INSTR (23, 22));
10581 int32_t imm = simm32 (aarch64_get_instr (cpu), 20, 12);
10582 WriteBack wb = INSTR (11, 11);
10583
10584 if (!V)
10585 {
10586 /* GReg operations. */
10587 switch (dispatch)
10588 {
10589 case 0: strb_wb (cpu, imm, wb); return;
10590 case 1: ldrb32_wb (cpu, imm, wb); return;
10591 case 2: ldrsb_wb (cpu, imm, wb); return;
10592 case 3: ldrsb32_wb (cpu, imm, wb); return;
10593 case 4: strh_wb (cpu, imm, wb); return;
10594 case 5: ldrh32_wb (cpu, imm, wb); return;
10595 case 6: ldrsh64_wb (cpu, imm, wb); return;
10596 case 7: ldrsh32_wb (cpu, imm, wb); return;
10597 case 8: str32_wb (cpu, imm, wb); return;
10598 case 9: ldr32_wb (cpu, imm, wb); return;
10599 case 10: ldrsw_wb (cpu, imm, wb); return;
10600 case 12: str_wb (cpu, imm, wb); return;
10601 case 13: ldr_wb (cpu, imm, wb); return;
10602
10603 default:
10604 case 11:
10605 case 14:
10606 case 15:
10607 HALT_UNALLOC;
10608 }
10609 }
10610
10611 /* FReg operations. */
10612 switch (dispatch)
10613 {
10614 case 2: fstrq_wb (cpu, imm, wb); return;
10615 case 3: fldrq_wb (cpu, imm, wb); return;
10616 case 8: fstrs_wb (cpu, imm, wb); return;
10617 case 9: fldrs_wb (cpu, imm, wb); return;
10618 case 12: fstrd_wb (cpu, imm, wb); return;
10619 case 13: fldrd_wb (cpu, imm, wb); return;
10620
10621 case 0: /* STUR 8 bit FP. */
10622 case 1: /* LDUR 8 bit FP. */
10623 case 4: /* STUR 16 bit FP. */
10624 case 5: /* LDUR 16 bit FP. */
10625 HALT_NYI;
10626
10627 default:
10628 case 6:
10629 case 7:
10630 case 10:
10631 case 11:
10632 case 14:
10633 case 15:
10634 HALT_UNALLOC;
10635 }
10636 }
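/* A worked example (editorial addition, not from the original source):
   the word 0xf8410441 is LDR X1, [X2], #16.  size = 0b11 and opc = 0b01
   give dispatch = (3 << 2) | 1 = 13, selecting ldr_wb; instr[11] = 0
   selects Post and simm9 = 16, so X1 is loaded from [X2] and X2 is
   incremented by 16 after the access.  */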
10637
10638 static void
10639 dexLoadRegisterOffset (sim_cpu *cpu)
10640 {
10641 /* instr[31,30] = size
10642 instr[29,27] = 111
10643 instr[26] = V
10644 instr[25,24] = 00
10645 instr[23,22] = opc
10646 instr[21] = 1
10647 instr[20,16] = rm
10648 instr[15,13] = option : 010 ==> UXTW, 011 ==> UXTX/LSL,
10649 110 ==> SXTW, 111 ==> SXTX,
10650 ow ==> RESERVED
10651 instr[12] = scaled
10652 instr[11,10] = 10
10653 instr[9,5] = rn
10654 instr[4,0] = rt. */
10655
10656 uint32_t V = INSTR (26, 26);
10657 uint32_t dispatch = ((INSTR (31, 30) << 2) | INSTR (23, 22));
10658 Scaling scale = INSTR (12, 12);
10659 Extension extensionType = INSTR (15, 13);
10660
10661 /* Check for illegal extension types. */
10662 if (uimm (extensionType, 1, 1) == 0)
10663 HALT_UNALLOC;
10664
10665 if (extensionType == UXTX || extensionType == SXTX)
10666 extensionType = NoExtension;
10667
10668 if (!V)
10669 {
10670 /* GReg operations. */
10671 switch (dispatch)
10672 {
10673 case 0: strb_scale_ext (cpu, scale, extensionType); return;
10674 case 1: ldrb32_scale_ext (cpu, scale, extensionType); return;
10675 case 2: ldrsb_scale_ext (cpu, scale, extensionType); return;
10676 case 3: ldrsb32_scale_ext (cpu, scale, extensionType); return;
10677 case 4: strh_scale_ext (cpu, scale, extensionType); return;
10678 case 5: ldrh32_scale_ext (cpu, scale, extensionType); return;
10679 case 6: ldrsh_scale_ext (cpu, scale, extensionType); return;
10680 case 7: ldrsh32_scale_ext (cpu, scale, extensionType); return;
10681 case 8: str32_scale_ext (cpu, scale, extensionType); return;
10682 case 9: ldr32_scale_ext (cpu, scale, extensionType); return;
10683 case 10: ldrsw_scale_ext (cpu, scale, extensionType); return;
10684 case 12: str_scale_ext (cpu, scale, extensionType); return;
10685 case 13: ldr_scale_ext (cpu, scale, extensionType); return;
10686 case 14: prfm_scale_ext (cpu, scale, extensionType); return;
10687
10688 default:
10689 case 11:
10690 case 15:
10691 HALT_UNALLOC;
10692 }
10693 }
10694
10695 /* FReg operations. */
10696 switch (dispatch)
10697 {
10698 case 1: /* LDR 8 bit FP. */
10699 HALT_NYI;
10700 case 3: fldrq_scale_ext (cpu, scale, extensionType); return;
10701 case 5: /* LDR 16 bit FP. */
10702 HALT_NYI;
10703 case 9: fldrs_scale_ext (cpu, scale, extensionType); return;
10704 case 13: fldrd_scale_ext (cpu, scale, extensionType); return;
10705
10706 case 0: fstrb_scale_ext (cpu, scale, extensionType); return;
10707 case 2: fstrq_scale_ext (cpu, scale, extensionType); return;
10708 case 4: fstrh_scale_ext (cpu, scale, extensionType); return;
10709 case 8: fstrs_scale_ext (cpu, scale, extensionType); return;
10710 case 12: fstrd_scale_ext (cpu, scale, extensionType); return;
10711
10712 default:
10713 case 6:
10714 case 7:
10715 case 10:
10716 case 11:
10717 case 14:
10718 case 15:
10719 HALT_UNALLOC;
10720 }
10721 }
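/* A worked example (editorial addition): 0xf8627820 is
   LDR X0, [X1, X2, LSL #3].  size = 0b11 and opc = 0b01 give dispatch
   13 (ldr_scale_ext); option = 0b011 (UXTX/LSL) is mapped to
   NoExtension above, and instr[12] = 1 selects Scaled, so the
   displacement is X2 shifted left by three.  */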
10722
10723 static void
10724 dexLoadUnsignedImmediate (sim_cpu *cpu)
10725 {
10726 /* instr[29,24] == 111_01
10727 instr[31,30] = size
10728 instr[26] = V
10729 instr[23,22] = opc
10730 instr[21,10] = uimm12 : unsigned immediate offset
10731 instr[9,5] = rn may be SP.
10732 instr[4,0] = rt. */
10733
10734 uint32_t V = INSTR (26,26);
10735 uint32_t dispatch = ((INSTR (31, 30) << 2) | INSTR (23, 22));
10736 uint32_t imm = INSTR (21, 10);
10737
10738 if (!V)
10739 {
10740 /* GReg operations. */
10741 switch (dispatch)
10742 {
10743 case 0: strb_abs (cpu, imm); return;
10744 case 1: ldrb32_abs (cpu, imm); return;
10745 case 2: ldrsb_abs (cpu, imm); return;
10746 case 3: ldrsb32_abs (cpu, imm); return;
10747 case 4: strh_abs (cpu, imm); return;
10748 case 5: ldrh32_abs (cpu, imm); return;
10749 case 6: ldrsh_abs (cpu, imm); return;
10750 case 7: ldrsh32_abs (cpu, imm); return;
10751 case 8: str32_abs (cpu, imm); return;
10752 case 9: ldr32_abs (cpu, imm); return;
10753 case 10: ldrsw_abs (cpu, imm); return;
10754 case 12: str_abs (cpu, imm); return;
10755 case 13: ldr_abs (cpu, imm); return;
10756 case 14: prfm_abs (cpu, imm); return;
10757
10758 default:
10759 case 11:
10760 case 15:
10761 HALT_UNALLOC;
10762 }
10763 }
10764
10765 /* FReg operations. */
10766 switch (dispatch)
10767 {
10768 case 0: fstrb_abs (cpu, imm); return;
10769 case 4: fstrh_abs (cpu, imm); return;
10770 case 8: fstrs_abs (cpu, imm); return;
10771 case 12: fstrd_abs (cpu, imm); return;
10772 case 2: fstrq_abs (cpu, imm); return;
10773
10774 case 1: fldrb_abs (cpu, imm); return;
10775 case 5: fldrh_abs (cpu, imm); return;
10776 case 9: fldrs_abs (cpu, imm); return;
10777 case 13: fldrd_abs (cpu, imm); return;
10778 case 3: fldrq_abs (cpu, imm); return;
10779
10780 default:
10781 case 6:
10782 case 7:
10783 case 10:
10784 case 11:
10785 case 14:
10786 case 15:
10787 HALT_UNALLOC;
10788 }
10789 }
10790
10791 static void
10792 dexLoadExclusive (sim_cpu *cpu)
10793 {
10794 /* assert instr[29:24] = 001000;
10795 instr[31,30] = size
10796 instr[23] = 0 if exclusive
10797 instr[22] = L : 1 if load, 0 if store
10798 instr[21] = 1 if pair
10799 instr[20,16] = Rs
10800 instr[15] = o0 : 1 if ordered
10801 instr[14,10] = Rt2
10802 instr[9,5] = Rn
10803 instr[4,0] = Rt. */
10804
10805 switch (INSTR (22, 21))
10806 {
10807 case 2: ldxr (cpu); return;
10808 case 0: stxr (cpu); return;
10809 default: HALT_NYI;
10810 }
10811 }
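/* Editorial note on typical guest usage: an atomic increment spins on
   the exclusive pair, e.g.

     retry: ldxr  x0, [x1]
            add   x0, x0, #1
            stxr  w2, x0, [x1]
            cbnz  w2, retry

   where W2 receives 0 on success and 1 if the exclusive monitor was
   lost.  */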
10812
10813 static void
10814 dexLoadOther (sim_cpu *cpu)
10815 {
10816 uint32_t dispatch;
10817
10818 /* instr[29,25] = 111_0
10819 instr[24] == 0 ==> dispatch, 1 ==> ldst reg unsigned immediate
10820 instr[21:11,10] is the secondary dispatch. */
10821 if (INSTR (24, 24))
10822 {
10823 dexLoadUnsignedImmediate (cpu);
10824 return;
10825 }
10826
10827 dispatch = ((INSTR (21, 21) << 2) | INSTR (11, 10));
10828 switch (dispatch)
10829 {
10830 case 0: dexLoadUnscaledImmediate (cpu); return;
10831 case 1: dexLoadImmediatePrePost (cpu); return;
10832 case 3: dexLoadImmediatePrePost (cpu); return;
10833 case 6: dexLoadRegisterOffset (cpu); return;
10834
10835 default:
10836 case 2:
10837 case 4:
10838 case 5:
10839 case 7:
10840 HALT_NYI;
10841 }
10842 }
10843
10844 static void
10845 store_pair_u32 (sim_cpu *cpu, int32_t offset, WriteBack wb)
10846 {
10847 unsigned rn = INSTR (14, 10);
10848 unsigned rd = INSTR (9, 5);
10849 unsigned rm = INSTR (4, 0);
10850 uint64_t address = aarch64_get_reg_u64 (cpu, rd, SP_OK);
10851
10852 if ((rn == rd || rm == rd) && wb != NoWriteBack)
10853 HALT_UNALLOC; /* Writeback with the base equal to Rt or Rt2 is CONSTRAINED UNPREDICTABLE. */
10854
10855 offset <<= 2;
10856
10857 if (wb != Post)
10858 address += offset;
10859
10860 aarch64_set_mem_u32 (cpu, address,
10861 aarch64_get_reg_u32 (cpu, rm, NO_SP));
10862 aarch64_set_mem_u32 (cpu, address + 4,
10863 aarch64_get_reg_u32 (cpu, rn, NO_SP));
10864
10865 if (wb == Post)
10866 address += offset;
10867
10868 if (wb != NoWriteBack)
10869 aarch64_set_reg_u64 (cpu, rd, SP_OK, address);
10870 }
10871
10872 static void
10873 store_pair_u64 (sim_cpu *cpu, int32_t offset, WriteBack wb)
10874 {
10875 unsigned rn = INSTR (14, 10);
10876 unsigned rd = INSTR (9, 5);
10877 unsigned rm = INSTR (4, 0);
10878 uint64_t address = aarch64_get_reg_u64 (cpu, rd, SP_OK);
10879
10880 if ((rn == rd || rm == rd) && wb != NoWriteBack)
10881 HALT_UNALLOC; /* Writeback with the base equal to Rt or Rt2 is CONSTRAINED UNPREDICTABLE. */
10882
10883 offset <<= 3;
10884
10885 if (wb != Post)
10886 address += offset;
10887
10888 aarch64_set_mem_u64 (cpu, address,
10889 aarch64_get_reg_u64 (cpu, rm, NO_SP));
10890 aarch64_set_mem_u64 (cpu, address + 8,
10891 aarch64_get_reg_u64 (cpu, rn, NO_SP));
10892
10893 if (wb == Post)
10894 address += offset;
10895
10896 if (wb != NoWriteBack)
10897 aarch64_set_reg_u64 (cpu, rd, SP_OK, address);
10898 }
10899
10900 static void
10901 load_pair_u32 (sim_cpu *cpu, int32_t offset, WriteBack wb)
10902 {
10903 unsigned rn = INSTR (14, 10);
10904 unsigned rd = INSTR (9, 5);
10905 unsigned rm = INSTR (4, 0);
10906 uint64_t address = aarch64_get_reg_u64 (cpu, rd, SP_OK);
10907
10908 /* LDP with Rt == Rt2 is UNPREDICTABLE, so treat it as unallocated. */
10909 if (rn == rm)
10910 HALT_UNALLOC;
10911
10912 offset <<= 2;
10913
10914 if (wb != Post)
10915 address += offset;
10916
10917 aarch64_set_reg_u64 (cpu, rm, SP_OK, aarch64_get_mem_u32 (cpu, address));
10918 aarch64_set_reg_u64 (cpu, rn, SP_OK, aarch64_get_mem_u32 (cpu, address + 4));
10919
10920 if (wb == Post)
10921 address += offset;
10922
10923 if (wb != NoWriteBack)
10924 aarch64_set_reg_u64 (cpu, rd, SP_OK, address);
10925 }
10926
10927 static void
10928 load_pair_s32 (sim_cpu *cpu, int32_t offset, WriteBack wb)
10929 {
10930 unsigned rn = INSTR (14, 10);
10931 unsigned rd = INSTR (9, 5);
10932 unsigned rm = INSTR (4, 0);
10933 uint64_t address = aarch64_get_reg_u64 (cpu, rd, SP_OK);
10934
10935 /* LDP with Rt == Rt2 is UNPREDICTABLE, so treat it as unallocated. */
10936 if (rn == rm)
10937 HALT_UNALLOC;
10938
10939 offset <<= 2;
10940
10941 if (wb != Post)
10942 address += offset;
10943
10944 aarch64_set_reg_s64 (cpu, rm, SP_OK, aarch64_get_mem_s32 (cpu, address));
10945 aarch64_set_reg_s64 (cpu, rn, SP_OK, aarch64_get_mem_s32 (cpu, address + 4));
10946
10947 if (wb == Post)
10948 address += offset;
10949
10950 if (wb != NoWriteBack)
10951 aarch64_set_reg_u64 (cpu, rd, SP_OK, address);
10952 }
10953
10954 static void
10955 load_pair_u64 (sim_cpu *cpu, int32_t offset, WriteBack wb)
10956 {
10957 unsigned rn = INSTR (14, 10);
10958 unsigned rd = INSTR (9, 5);
10959 unsigned rm = INSTR (4, 0);
10960 uint64_t address = aarch64_get_reg_u64 (cpu, rd, SP_OK);
10961
10962 /* LDP with Rt == Rt2 is UNPREDICTABLE, so treat it as unallocated. */
10963 if (rn == rm)
10964 HALT_UNALLOC;
10965
10966 offset <<= 3;
10967
10968 if (wb != Post)
10969 address += offset;
10970
10971 aarch64_set_reg_u64 (cpu, rm, SP_OK, aarch64_get_mem_u64 (cpu, address));
10972 aarch64_set_reg_u64 (cpu, rn, SP_OK, aarch64_get_mem_u64 (cpu, address + 8));
10973
10974 if (wb == Post)
10975 address += offset;
10976
10977 if (wb != NoWriteBack)
10978 aarch64_set_reg_u64 (cpu, rd, SP_OK, address);
10979 }
10980
10981 static void
10982 dex_load_store_pair_gr (sim_cpu *cpu)
10983 {
10984 /* instr[31,30] = size (10=> 64-bit, 01=> signed 32-bit, 00=> 32-bit)
10985 instr[29,25] = instruction encoding: 101_0
10986 instr[26] = V : 1 if fp, 0 if gp
10987 instr[24,23] = addressing mode (10=> offset, 01=> post, 11=> pre)
10988 instr[22] = load/store (1=> load)
10989 instr[21,15] = signed, scaled, offset
10990 instr[14,10] = Rn
10991 instr[ 9, 5] = Rd
10992 instr[ 4, 0] = Rm. */
10993
10994 uint32_t dispatch = ((INSTR (31, 30) << 3) | INSTR (24, 22));
10995 int32_t offset = simm32 (aarch64_get_instr (cpu), 21, 15);
10996
10997 switch (dispatch)
10998 {
10999 case 2: store_pair_u32 (cpu, offset, Post); return;
11000 case 3: load_pair_u32 (cpu, offset, Post); return;
11001 case 4: store_pair_u32 (cpu, offset, NoWriteBack); return;
11002 case 5: load_pair_u32 (cpu, offset, NoWriteBack); return;
11003 case 6: store_pair_u32 (cpu, offset, Pre); return;
11004 case 7: load_pair_u32 (cpu, offset, Pre); return;
11005
11006 case 11: load_pair_s32 (cpu, offset, Post); return;
11007 case 13: load_pair_s32 (cpu, offset, NoWriteBack); return;
11008 case 15: load_pair_s32 (cpu, offset, Pre); return;
11009
11010 case 18: store_pair_u64 (cpu, offset, Post); return;
11011 case 19: load_pair_u64 (cpu, offset, Post); return;
11012 case 20: store_pair_u64 (cpu, offset, NoWriteBack); return;
11013 case 21: load_pair_u64 (cpu, offset, NoWriteBack); return;
11014 case 22: store_pair_u64 (cpu, offset, Pre); return;
11015 case 23: load_pair_u64 (cpu, offset, Pre); return;
11016
11017 default:
11018 HALT_UNALLOC;
11019 }
11020 }
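/* A worked example (editorial addition): the common prologue word
   0xa9bf07e0 is STP X0, X1, [SP, #-16]!.  size = 0b10 and mode:L = 0b110
   give dispatch = (2 << 3) | 6 = 22 (store_pair_u64 with Pre); the
   simm7 field holds -2, scaled by 8 to -16, so SP is dropped by 16
   before X0 and X1 are stored.  */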
11021
11022 static void
11023 store_pair_float (sim_cpu *cpu, int32_t offset, WriteBack wb)
11024 {
11025 unsigned rn = INSTR (14, 10);
11026 unsigned rd = INSTR (9, 5);
11027 unsigned rm = INSTR (4, 0);
11028 uint64_t address = aarch64_get_reg_u64 (cpu, rd, SP_OK);
11029
11030 offset <<= 2;
11031
11032 if (wb != Post)
11033 address += offset;
11034
11035 aarch64_set_mem_u32 (cpu, address, aarch64_get_vec_u32 (cpu, rm, 0));
11036 aarch64_set_mem_u32 (cpu, address + 4, aarch64_get_vec_u32 (cpu, rn, 0));
11037
11038 if (wb == Post)
11039 address += offset;
11040
11041 if (wb != NoWriteBack)
11042 aarch64_set_reg_u64 (cpu, rd, SP_OK, address);
11043 }
11044
11045 static void
11046 store_pair_double (sim_cpu *cpu, int32_t offset, WriteBack wb)
11047 {
11048 unsigned rn = INSTR (14, 10);
11049 unsigned rd = INSTR (9, 5);
11050 unsigned rm = INSTR (4, 0);
11051 uint64_t address = aarch64_get_reg_u64 (cpu, rd, SP_OK);
11052
11053 offset <<= 3;
11054
11055 if (wb != Post)
11056 address += offset;
11057
11058 aarch64_set_mem_u64 (cpu, address, aarch64_get_vec_u64 (cpu, rm, 0));
11059 aarch64_set_mem_u64 (cpu, address + 8, aarch64_get_vec_u64 (cpu, rn, 0));
11060
11061 if (wb == Post)
11062 address += offset;
11063
11064 if (wb != NoWriteBack)
11065 aarch64_set_reg_u64 (cpu, rd, SP_OK, address);
11066 }
11067
11068 static void
11069 store_pair_long_double (sim_cpu *cpu, int32_t offset, WriteBack wb)
11070 {
11071 FRegister a;
11072 unsigned rn = INSTR (14, 10);
11073 unsigned rd = INSTR (9, 5);
11074 unsigned rm = INSTR (4, 0);
11075 uint64_t address = aarch64_get_reg_u64 (cpu, rd, SP_OK);
11076
11077 offset <<= 4;
11078
11079 if (wb != Post)
11080 address += offset;
11081
11082 aarch64_get_FP_long_double (cpu, rm, & a);
11083 aarch64_set_mem_long_double (cpu, address, a);
11084 aarch64_get_FP_long_double (cpu, rn, & a);
11085 aarch64_set_mem_long_double (cpu, address + 16, a);
11086
11087 if (wb == Post)
11088 address += offset;
11089
11090 if (wb != NoWriteBack)
11091 aarch64_set_reg_u64 (cpu, rd, SP_OK, address);
11092 }
11093
11094 static void
11095 load_pair_float (sim_cpu *cpu, int32_t offset, WriteBack wb)
11096 {
11097 unsigned rn = INSTR (14, 10);
11098 unsigned rd = INSTR (9, 5);
11099 unsigned rm = INSTR (4, 0);
11100 uint64_t address = aarch64_get_reg_u64 (cpu, rd, SP_OK);
11101
11102 if (rm == rn)
11103 HALT_UNALLOC;
11104
11105 offset <<= 2;
11106
11107 if (wb != Post)
11108 address += offset;
11109
11110 aarch64_set_vec_u32 (cpu, rm, 0, aarch64_get_mem_u32 (cpu, address));
11111 aarch64_set_vec_u32 (cpu, rn, 0, aarch64_get_mem_u32 (cpu, address + 4));
11112
11113 if (wb == Post)
11114 address += offset;
11115
11116 if (wb != NoWriteBack)
11117 aarch64_set_reg_u64 (cpu, rd, SP_OK, address);
11118 }
11119
11120 static void
11121 load_pair_double (sim_cpu *cpu, int32_t offset, WriteBack wb)
11122 {
11123 unsigned rn = INSTR (14, 10);
11124 unsigned rd = INSTR (9, 5);
11125 unsigned rm = INSTR (4, 0);
11126 uint64_t address = aarch64_get_reg_u64 (cpu, rd, SP_OK);
11127
11128 if (rm == rn)
11129 HALT_UNALLOC;
11130
11131 offset <<= 3;
11132
11133 if (wb != Post)
11134 address += offset;
11135
11136 aarch64_set_vec_u64 (cpu, rm, 0, aarch64_get_mem_u64 (cpu, address));
11137 aarch64_set_vec_u64 (cpu, rn, 0, aarch64_get_mem_u64 (cpu, address + 8));
11138
11139 if (wb == Post)
11140 address += offset;
11141
11142 if (wb != NoWriteBack)
11143 aarch64_set_reg_u64 (cpu, rd, SP_OK, address);
11144 }
11145
11146 static void
11147 load_pair_long_double (sim_cpu *cpu, int32_t offset, WriteBack wb)
11148 {
11149 FRegister a;
11150 unsigned rn = INSTR (14, 10);
11151 unsigned rd = INSTR (9, 5);
11152 unsigned rm = INSTR (4, 0);
11153 uint64_t address = aarch64_get_reg_u64 (cpu, rd, SP_OK);
11154
11155 if (rm == rn)
11156 HALT_UNALLOC;
11157
11158 offset <<= 4;
11159
11160 if (wb != Post)
11161 address += offset;
11162
11163 aarch64_get_mem_long_double (cpu, address, & a);
11164 aarch64_set_FP_long_double (cpu, rm, a);
11165 aarch64_get_mem_long_double (cpu, address + 16, & a);
11166 aarch64_set_FP_long_double (cpu, rn, a);
11167
11168 if (wb == Post)
11169 address += offset;
11170
11171 if (wb != NoWriteBack)
11172 aarch64_set_reg_u64 (cpu, rd, SP_OK, address);
11173 }
11174
11175 static void
11176 dex_load_store_pair_fp (sim_cpu *cpu)
11177 {
11178 /* instr[31,30] = size (10=> 128-bit, 01=> 64-bit, 00=> 32-bit)
11179 instr[29,25] = instruction encoding
11180 instr[24,23] = addressing mode (10=> offset, 01=> post, 11=> pre)
11181 instr[22] = load/store (1=> load)
11182 instr[21,15] = signed, scaled, offset
11183 instr[14,10] = Rn
11184 instr[ 9, 5] = Rd
11185 instr[ 4, 0] = Rm */
11186
11187 uint32_t dispatch = ((INSTR (31, 30) << 3) | INSTR (24, 22));
11188 int32_t offset = simm32 (aarch64_get_instr (cpu), 21, 15);
11189
11190 switch (dispatch)
11191 {
11192 case 2: store_pair_float (cpu, offset, Post); return;
11193 case 3: load_pair_float (cpu, offset, Post); return;
11194 case 4: store_pair_float (cpu, offset, NoWriteBack); return;
11195 case 5: load_pair_float (cpu, offset, NoWriteBack); return;
11196 case 6: store_pair_float (cpu, offset, Pre); return;
11197 case 7: load_pair_float (cpu, offset, Pre); return;
11198
11199 case 10: store_pair_double (cpu, offset, Post); return;
11200 case 11: load_pair_double (cpu, offset, Post); return;
11201 case 12: store_pair_double (cpu, offset, NoWriteBack); return;
11202 case 13: load_pair_double (cpu, offset, NoWriteBack); return;
11203 case 14: store_pair_double (cpu, offset, Pre); return;
11204 case 15: load_pair_double (cpu, offset, Pre); return;
11205
11206 case 18: store_pair_long_double (cpu, offset, Post); return;
11207 case 19: load_pair_long_double (cpu, offset, Post); return;
11208 case 20: store_pair_long_double (cpu, offset, NoWriteBack); return;
11209 case 21: load_pair_long_double (cpu, offset, NoWriteBack); return;
11210 case 22: store_pair_long_double (cpu, offset, Pre); return;
11211 case 23: load_pair_long_double (cpu, offset, Pre); return;
11212
11213 default:
11214 HALT_UNALLOC;
11215 }
11216 }
11217
11218 static inline unsigned
11219 vec_reg (unsigned v, unsigned o)
11220 {
11221 return (v + o) & 0x1F; /* Register numbers wrap modulo 32. */
11222 }
11223
11224 /* Load multiple N-element structures to N consecutive registers. */
11225 static void
11226 vec_load (sim_cpu *cpu, uint64_t address, unsigned N)
11227 {
11228 int all = INSTR (30, 30);
11229 unsigned size = INSTR (11, 10);
11230 unsigned vd = INSTR (4, 0);
11231 unsigned i;
11232
11233 switch (size)
11234 {
11235 case 0: /* 8-bit operations. */
11236 if (all)
11237 for (i = 0; i < (16 * N); i++)
11238 aarch64_set_vec_u8 (cpu, vec_reg (vd, i >> 4), i & 15,
11239 aarch64_get_mem_u8 (cpu, address + i));
11240 else
11241 for (i = 0; i < (8 * N); i++)
11242 aarch64_set_vec_u8 (cpu, vec_reg (vd, i >> 3), i & 7,
11243 aarch64_get_mem_u8 (cpu, address + i));
11244 return;
11245
11246 case 1: /* 16-bit operations. */
11247 if (all)
11248 for (i = 0; i < (8 * N); i++)
11249 aarch64_set_vec_u16 (cpu, vec_reg (vd, i >> 3), i & 7,
11250 aarch64_get_mem_u16 (cpu, address + i * 2));
11251 else
11252 for (i = 0; i < (4 * N); i++)
11253 aarch64_set_vec_u16 (cpu, vec_reg (vd, i >> 2), i & 3,
11254 aarch64_get_mem_u16 (cpu, address + i * 2));
11255 return;
11256
11257 case 2: /* 32-bit operations. */
11258 if (all)
11259 for (i = 0; i < (4 * N); i++)
11260 aarch64_set_vec_u32 (cpu, vec_reg (vd, i >> 2), i & 3,
11261 aarch64_get_mem_u32 (cpu, address + i * 4));
11262 else
11263 for (i = 0; i < (2 * N); i++)
11264 aarch64_set_vec_u32 (cpu, vec_reg (vd, i >> 1), i & 1,
11265 aarch64_get_mem_u32 (cpu, address + i * 4));
11266 return;
11267
11268 case 3: /* 64-bit operations. */
11269 if (all)
11270 for (i = 0; i < (2 * N); i++)
11271 aarch64_set_vec_u64 (cpu, vec_reg (vd, i >> 1), i & 1,
11272 aarch64_get_mem_u64 (cpu, address + i * 8));
11273 else
11274 for (i = 0; i < N; i++)
11275 aarch64_set_vec_u64 (cpu, vec_reg (vd, i), 0,
11276 aarch64_get_mem_u64 (cpu, address + i * 8));
11277 return;
11278 }
11279 }
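/* Editorial note: the loops above perform a plain consecutive fill,
   which matches LD1 (multiple structures).  A faithful LD2 would
   de-interleave instead; for bytes that would look something like this
   sketch (an assumption about the fix, not current behaviour):

     for (i = 0; i < (all ? 16 : 8); i++)
       for (n = 0; n < 2; n++)
         aarch64_set_vec_u8 (cpu, vec_reg (vd, n), i,
                             aarch64_get_mem_u8 (cpu, address + i * 2 + n));
*/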
11280
11281 /* LD4: load multiple 4-element to four consecutive registers. */
11282 static void
11283 LD4 (sim_cpu *cpu, uint64_t address)
11284 {
11285 vec_load (cpu, address, 4);
11286 }
11287
11288 /* LD3: load multiple 3-element structures to three consecutive registers. */
11289 static void
11290 LD3 (sim_cpu *cpu, uint64_t address)
11291 {
11292 vec_load (cpu, address, 3);
11293 }
11294
11295 /* LD2: load multiple 2-element structures to two consecutive registers. */
11296 static void
11297 LD2 (sim_cpu *cpu, uint64_t address)
11298 {
11299 vec_load (cpu, address, 2);
11300 }
11301
11302 /* Load multiple 1-element structures into one register. */
11303 static void
11304 LD1_1 (sim_cpu *cpu, uint64_t address)
11305 {
11306 int all = INSTR (30, 30);
11307 unsigned size = INSTR (11, 10);
11308 unsigned vd = INSTR (4, 0);
11309 unsigned i;
11310
11311 switch (size)
11312 {
11313 case 0:
11314 /* LD1 {Vd.16b}, addr, #16 */
11315 /* LD1 {Vd.8b}, addr, #8 */
11316 for (i = 0; i < (all ? 16 : 8); i++)
11317 aarch64_set_vec_u8 (cpu, vd, i,
11318 aarch64_get_mem_u8 (cpu, address + i));
11319 return;
11320
11321 case 1:
11322 /* LD1 {Vd.8h}, addr, #16 */
11323 /* LD1 {Vd.4h}, addr, #8 */
11324 for (i = 0; i < (all ? 8 : 4); i++)
11325 aarch64_set_vec_u16 (cpu, vd, i,
11326 aarch64_get_mem_u16 (cpu, address + i * 2));
11327 return;
11328
11329 case 2:
11330 /* LD1 {Vd.4s}, addr, #16 */
11331 /* LD1 {Vd.2s}, addr, #8 */
11332 for (i = 0; i < (all ? 4 : 2); i++)
11333 aarch64_set_vec_u32 (cpu, vd, i,
11334 aarch64_get_mem_u32 (cpu, address + i * 4));
11335 return;
11336
11337 case 3:
11338 /* LD1 {Vd.2d}, addr, #16 */
11339 /* LD1 {Vd.1d}, addr, #8 */
11340 for (i = 0; i < (all ? 2 : 1); i++)
11341 aarch64_set_vec_u64 (cpu, vd, i,
11342 aarch64_get_mem_u64 (cpu, address + i * 8));
11343 return;
11344 }
11345 }
11346
11347 /* Load multiple 1-element structures into two registers. */
11348 static void
11349 LD1_2 (sim_cpu *cpu, uint64_t address)
11350 {
11351 /* FIXME: This uses the same consecutive-fill algorithm as the LD2
11352 version, but the architecture requires LD2 to de-interleave its
11353 elements across the two registers; only the LD1 semantics are correct here. */
11354 vec_load (cpu, address, 2);
11355 }
11356
11357 /* Load multiple 1-element structures into three registers. */
11358 static void
11359 LD1_3 (sim_cpu *cpu, uint64_t address)
11360 {
11361 /* FIXME: This uses the same consecutive-fill algorithm as the LD3
11362 version, but the architecture requires LD3 to de-interleave its
11363 elements across the three registers; only the LD1 semantics are correct here. */
11364 vec_load (cpu, address, 3);
11365 }
11366
11367 /* Load multiple 1-element structures into four registers. */
11368 static void
11369 LD1_4 (sim_cpu *cpu, uint64_t address)
11370 {
11371 /* FIXME: This uses the same consecutive-fill algorithm as the LD4
11372 version, but the architecture requires LD4 to de-interleave its
11373 elements across the four registers; only the LD1 semantics are correct here. */
11374 vec_load (cpu, address, 4);
11375 }
11376
11377 /* Store multiple N-element structures to N consecutive registers. */
11378 static void
11379 vec_store (sim_cpu *cpu, uint64_t address, unsigned N)
11380 {
11381 int all = INSTR (30, 30);
11382 unsigned size = INSTR (11, 10);
11383 unsigned vd = INSTR (4, 0);
11384 unsigned i;
11385
11386 switch (size)
11387 {
11388 case 0: /* 8-bit operations. */
11389 if (all)
11390 for (i = 0; i < (16 * N); i++)
11391 aarch64_set_mem_u8
11392 (cpu, address + i,
11393 aarch64_get_vec_u8 (cpu, vec_reg (vd, i >> 4), i & 15));
11394 else
11395 for (i = 0; i < (8 * N); i++)
11396 aarch64_set_mem_u8
11397 (cpu, address + i,
11398 aarch64_get_vec_u8 (cpu, vec_reg (vd, i >> 3), i & 7));
11399 return;
11400
11401 case 1: /* 16-bit operations. */
11402 if (all)
11403 for (i = 0; i < (8 * N); i++)
11404 aarch64_set_mem_u16
11405 (cpu, address + i * 2,
11406 aarch64_get_vec_u16 (cpu, vec_reg (vd, i >> 3), i & 7));
11407 else
11408 for (i = 0; i < (4 * N); i++)
11409 aarch64_set_mem_u16
11410 (cpu, address + i * 2,
11411 aarch64_get_vec_u16 (cpu, vec_reg (vd, i >> 2), i & 3));
11412 return;
11413
11414 case 2: /* 32-bit operations. */
11415 if (all)
11416 for (i = 0; i < (4 * N); i++)
11417 aarch64_set_mem_u32
11418 (cpu, address + i * 4,
11419 aarch64_get_vec_u32 (cpu, vec_reg (vd, i >> 2), i & 3));
11420 else
11421 for (i = 0; i < (2 * N); i++)
11422 aarch64_set_mem_u32
11423 (cpu, address + i * 4,
11424 aarch64_get_vec_u32 (cpu, vec_reg (vd, i >> 1), i & 1));
11425 return;
11426
11427 case 3: /* 64-bit operations. */
11428 if (all)
11429 for (i = 0; i < (2 * N); i++)
11430 aarch64_set_mem_u64
11431 (cpu, address + i * 8,
11432 aarch64_get_vec_u64 (cpu, vec_reg (vd, i >> 1), i & 1));
11433 else
11434 for (i = 0; i < N; i++)
11435 aarch64_set_mem_u64
11436 (cpu, address + i * 8,
11437 aarch64_get_vec_u64 (cpu, vec_reg (vd, i), 0));
11438 return;
11439 }
11440 }
11441
11442 /* Store multiple 4-element structure to four consecutive registers. */
11443 static void
11444 ST4 (sim_cpu *cpu, uint64_t address)
11445 {
11446 vec_store (cpu, address, 4);
11447 }
11448
11449 /* Store multiple 3-element structures to three consecutive registers. */
11450 static void
11451 ST3 (sim_cpu *cpu, uint64_t address)
11452 {
11453 vec_store (cpu, address, 3);
11454 }
11455
11456 /* Store multiple 2-element structures to two consecutive registers. */
11457 static void
11458 ST2 (sim_cpu *cpu, uint64_t address)
11459 {
11460 vec_store (cpu, address, 2);
11461 }
11462
11463 /* Store multiple 1-element structures into one register. */
11464 static void
11465 ST1_1 (sim_cpu *cpu, uint64_t address)
11466 {
11467 int all = INSTR (30, 30);
11468 unsigned size = INSTR (11, 10);
11469 unsigned vd = INSTR (4, 0);
11470 unsigned i;
11471
11472 switch (size)
11473 {
11474 case 0:
11475 for (i = 0; i < (all ? 16 : 8); i++)
11476 aarch64_set_mem_u8 (cpu, address + i,
11477 aarch64_get_vec_u8 (cpu, vd, i));
11478 return;
11479
11480 case 1:
11481 for (i = 0; i < (all ? 8 : 4); i++)
11482 aarch64_set_mem_u16 (cpu, address + i * 2,
11483 aarch64_get_vec_u16 (cpu, vd, i));
11484 return;
11485
11486 case 2:
11487 for (i = 0; i < (all ? 4 : 2); i++)
11488 aarch64_set_mem_u32 (cpu, address + i * 4,
11489 aarch64_get_vec_u32 (cpu, vd, i));
11490 return;
11491
11492 case 3:
11493 for (i = 0; i < (all ? 2 : 1); i++)
11494 aarch64_set_mem_u64 (cpu, address + i * 8,
11495 aarch64_get_vec_u64 (cpu, vd, i));
11496 return;
11497 }
11498 }
11499
11500 /* Store multiple 1-element structures into two registers. */
11501 static void
11502 ST1_2 (sim_cpu *cpu, uint64_t address)
11503 {
11504 /* FIXME: This uses the same consecutive-fill algorithm as the ST2
11505 version, but the architecture requires ST2 to interleave the
11506 elements of the two registers; only the ST1 semantics are correct here. */
11507 vec_store (cpu, address, 2);
11508 }
11509
11510 /* Store multiple 1-element structures into three registers. */
11511 static void
11512 ST1_3 (sim_cpu *cpu, uint64_t address)
11513 {
11514 /* FIXME: This uses the same consecutive-fill algorithm as the ST3
11515 version, but the architecture requires ST3 to interleave the
11516 elements of the three registers; only the ST1 semantics are correct here. */
11517 vec_store (cpu, address, 3);
11518 }
11519
11520 /* Store multiple 1-element structures into four registers. */
11521 static void
11522 ST1_4 (sim_cpu *cpu, uint64_t address)
11523 {
11524 /* FIXME: This uses the same consecutive-fill algorithm as the ST4
11525 version, but the architecture requires ST4 to interleave the
11526 elements of the four registers; only the ST1 semantics are correct here. */
11527 vec_store (cpu, address, 4);
11528 }
11529
11530 #define LDn_STn_SINGLE_LANE_AND_SIZE() \
11531 do \
11532 { \
11533 switch (INSTR (15, 14)) \
11534 { \
11535 case 0: \
11536 lane = (full << 3) | (s << 2) | size; \
11537 size = 0; \
11538 break; \
11539 \
11540 case 1: \
11541 if ((size & 1) == 1) \
11542 HALT_UNALLOC; \
11543 lane = (full << 2) | (s << 1) | (size >> 1); \
11544 size = 1; \
11545 break; \
11546 \
11547 case 2: \
11548 if ((size & 2) == 2) \
11549 HALT_UNALLOC; \
11550 \
11551 if ((size & 1) == 0) \
11552 { \
11553 lane = (full << 1) | s; \
11554 size = 2; \
11555 } \
11556 else \
11557 { \
11558 if (s) \
11559 HALT_UNALLOC; \
11560 lane = full; \
11561 size = 3; \
11562 } \
11563 break; \
11564 \
11565 default: \
11566 HALT_UNALLOC; \
11567 } \
11568 } \
11569 while (0)
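/* A worked example (editorial addition): for instr[15,14] = 2 with
   size = 0b00, s = 1 and full = 1 the macro yields
   lane = (1 << 1) | 1 = 3 and size = 2, i.e. the access targets 32-bit
   element 3, as in LD1 {V0.S}[3], [X0].  */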
11570
11571 /* Load single structure into one lane of N registers. */
11572 static void
11573 do_vec_LDn_single (sim_cpu *cpu, uint64_t address)
11574 {
11575 /* instr[31] = 0
11576 instr[30] = element selector 0=>half, 1=>all elements
11577 instr[29,24] = 00 1101
11578 instr[23] = 0=>simple, 1=>post
11579 instr[22] = 1
11580 instr[21] = width: LD1-or-LD3 (0) / LD2-or-LD4 (1)
11581 instr[20,16] = 0 0000 (simple), Vinc (reg-post-inc, no SP),
11582 11111 (immediate post inc)
11583 instr[15,13] = opcode
11584 instr[12] = S, used for lane number
11585 instr[11,10] = size, also used for lane number
11586 instr[9,5] = address
11587 instr[4,0] = Vd */
11588
11589 unsigned full = INSTR (30, 30);
11590 unsigned vd = INSTR (4, 0);
11591 unsigned size = INSTR (11, 10);
11592 unsigned s = INSTR (12, 12);
11593 int nregs = ((INSTR (13, 13) << 1) | INSTR (21, 21)) + 1;
11594 int lane = 0;
11595 int i;
11596
11597 NYI_assert (29, 24, 0x0D);
11598 NYI_assert (22, 22, 1);
11599
11600 /* Compute the lane number first (using size), and then compute size. */
11601 LDn_STn_SINGLE_LANE_AND_SIZE ();
11602
11603 for (i = 0; i < nregs; i++)
11604 switch (size)
11605 {
11606 case 0:
11607 {
11608 uint8_t val = aarch64_get_mem_u8 (cpu, address + i);
11609 aarch64_set_vec_u8 (cpu, vec_reg (vd, i), lane, val);
11610 break;
11611 }
11612
11613 case 1:
11614 {
11615 uint16_t val = aarch64_get_mem_u16 (cpu, address + (i * 2));
11616 aarch64_set_vec_u16 (cpu, vec_reg (vd, i), lane, val);
11617 break;
11618 }
11619
11620 case 2:
11621 {
11622 uint32_t val = aarch64_get_mem_u32 (cpu, address + (i * 4));
11623 aarch64_set_vec_u32 (cpu, vec_reg (vd, i), lane, val);
11624 break;
11625 }
11626
11627 case 3:
11628 {
11629 uint64_t val = aarch64_get_mem_u64 (cpu, address + (i * 8));
11630 aarch64_set_vec_u64 (cpu, vec_reg (vd, i), lane, val);
11631 break;
11632 }
11633 }
11634 }
11635
11636 /* Store single structure from one lane from N registers. */
11637 static void
11638 do_vec_STn_single (sim_cpu *cpu, uint64_t address)
11639 {
11640 /* instr[31] = 0
11641 instr[30] = element selector 0=>half, 1=>all elements
11642 instr[29,24] = 00 1101
11643 instr[23] = 0=>simple, 1=>post
11644 instr[22] = 0
11645 instr[21] = width: ST1-or-ST3 (0) / ST2-or-ST4 (1)
11646 instr[20,16] = 0 0000 (simple), Vinc (reg-post-inc, no SP),
11647 11111 (immediate post inc)
11648 instr[15,13] = opcode
11649 instr[12] = S, used for lane number
11650 instr[11,10] = size, also used for lane number
11651 instr[9,5] = address
11652 instr[4,0] = Vd */
11653
11654 unsigned full = INSTR (30, 30);
11655 unsigned vd = INSTR (4, 0);
11656 unsigned size = INSTR (11, 10);
11657 unsigned s = INSTR (12, 12);
11658 int nregs = ((INSTR (13, 13) << 1) | INSTR (21, 21)) + 1;
11659 int lane = 0;
11660 int i;
11661
11662 NYI_assert (29, 24, 0x0D);
11663 NYI_assert (22, 22, 0);
11664
11665 /* Compute the lane number first (using size), and then compute size. */
11666 LDn_STn_SINGLE_LANE_AND_SIZE ();
11667
11668 for (i = 0; i < nregs; i++)
11669 switch (size)
11670 {
11671 case 0:
11672 {
11673 uint8_t val = aarch64_get_vec_u8 (cpu, vec_reg (vd, i), lane);
11674 aarch64_set_mem_u8 (cpu, address + i, val);
11675 break;
11676 }
11677
11678 case 1:
11679 {
11680 uint16_t val = aarch64_get_vec_u16 (cpu, vec_reg (vd, i), lane);
11681 aarch64_set_mem_u16 (cpu, address + (i * 2), val);
11682 break;
11683 }
11684
11685 case 2:
11686 {
11687 uint32_t val = aarch64_get_vec_u32 (cpu, vec_reg (vd, i), lane);
11688 aarch64_set_mem_u32 (cpu, address + (i * 4), val);
11689 break;
11690 }
11691
11692 case 3:
11693 {
11694 uint64_t val = aarch64_get_vec_u64 (cpu, vec_reg (vd, i), lane);
11695 aarch64_set_mem_u64 (cpu, address + (i * 8), val);
11696 break;
11697 }
11698 }
11699 }
11700
11701 /* Load single structure into all lanes of N registers. */
11702 static void
11703 do_vec_LDnR (sim_cpu *cpu, uint64_t address)
11704 {
11705 /* instr[31] = 0
11706 instr[30] = element selector 0=>half, 1=>all elements
11707 instr[29,24] = 00 1101
11708 instr[23] = 0=>simple, 1=>post
11709 instr[22] = 1
11710 instr[21] = width: LD1R-or-LD3R (0) / LD2R-or-LD4R (1)
11711 instr[20,16] = 0 0000 (simple), Vinc (reg-post-inc, no SP),
11712 11111 (immediate post inc)
11713 instr[15,14] = 11
11714 instr[13] = width: LD1R-or-LD2R (0) / LD3R-or-LD4R (1)
11715 instr[12] = 0
11716 instr[11,10] = element size 00=> byte(b), 01=> half(h),
11717 10=> word(s), 11=> double(d)
11718 instr[9,5] = address
11719 instr[4,0] = Vd */
11720
11721 unsigned full = INSTR (30, 30);
11722 unsigned vd = INSTR (4, 0);
11723 unsigned size = INSTR (11, 10);
11724 int nregs = ((INSTR (13, 13) << 1) | INSTR (21, 21)) + 1;
11725 int i, n;
11726
11727 NYI_assert (29, 24, 0x0D);
11728 NYI_assert (22, 22, 1);
11729 NYI_assert (15, 14, 3);
11730 NYI_assert (12, 12, 0);
11731
11732 for (n = 0; n < nregs; n++)
11733 switch (size)
11734 {
11735 case 0:
11736 {
11737 uint8_t val = aarch64_get_mem_u8 (cpu, address + n);
11738 for (i = 0; i < (full ? 16 : 8); i++)
11739 aarch64_set_vec_u8 (cpu, vec_reg (vd, n), i, val);
11740 break;
11741 }
11742
11743 case 1:
11744 {
11745 uint16_t val = aarch64_get_mem_u16 (cpu, address + (n * 2));
11746 for (i = 0; i < (full ? 8 : 4); i++)
11747 aarch64_set_vec_u16 (cpu, vec_reg (vd, n), i, val);
11748 break;
11749 }
11750
11751 case 2:
11752 {
11753 uint32_t val = aarch64_get_mem_u32 (cpu, address + (n * 4));
11754 for (i = 0; i < (full ? 4 : 2); i++)
11755 aarch64_set_vec_u32 (cpu, vec_reg (vd, n), i, val);
11756 break;
11757 }
11758
11759 case 3:
11760 {
11761 uint64_t val = aarch64_get_mem_u64 (cpu, address + (n * 8));
11762 for (i = 0; i < (full ? 2 : 1); i++)
11763 aarch64_set_vec_u64 (cpu, vec_reg (vd, n), i, val);
11764 break;
11765 }
11766
11767 default:
11768 HALT_UNALLOC;
11769 }
11770 }
11771
11772 static void
11773 do_vec_load_store (sim_cpu *cpu)
11774 {
11775 /* {LD|ST}<N> {Vd..Vd+N}, vaddr
11776
11777 instr[31] = 0
11778 instr[30] = element selector 0=>half, 1=>all elements
11779 instr[29,25] = 00110
11780 instr[24] = 0=>multiple struct, 1=>single struct
11781 instr[23] = 0=>simple, 1=>post
11782 instr[22] = 0=>store, 1=>load
11783 instr[21] = 0 (LDn) / small(0)-large(1) selector (LDnR)
11784 instr[20,16] = 00000 (simple), Vinc (reg-post-inc, no SP),
11785 11111 (immediate post inc)
11786 instr[15,12] = elements and destinations. eg for load:
11787 0000=>LD4 => load multiple 4-element to
11788 four consecutive registers
11789 0100=>LD3 => load multiple 3-element to
11790 three consecutive registers
11791 1000=>LD2 => load multiple 2-element to
11792 two consecutive registers
11793 0010=>LD1 => load multiple 1-element to
11794 four consecutive registers
11795 0110=>LD1 => load multiple 1-element to
11796 three consecutive registers
11797 1010=>LD1 => load multiple 1-element to
11798 two consecutive registers
11799 0111=>LD1 => load multiple 1-element to
11800 one register
11801 1100=>LD1R,LD2R
11802 1110=>LD3R,LD4R
11803 instr[11,10] = element size 00=> byte(b), 01=> half(h),
11804 10=> word(s), 11=> double(d)
11805 instr[9,5] = Vn, can be SP
11806 instr[4,0] = Vd */
11807
11808 int single;
11809 int post;
11810 int load;
11811 unsigned vn;
11812 uint64_t address;
11813 int type;
11814
11815 if (INSTR (31, 31) != 0 || INSTR (29, 25) != 0x06)
11816 HALT_NYI;
11817
11818 single = INSTR (24, 24);
11819 post = INSTR (23, 23);
11820 load = INSTR (22, 22);
11821 type = INSTR (15, 12);
11822 vn = INSTR (9, 5);
11823 address = aarch64_get_reg_u64 (cpu, vn, SP_OK);
11824
11825 if (! single && INSTR (21, 21) != 0)
11826 HALT_UNALLOC;
11827
11828 if (post)
11829 {
11830 unsigned vm = INSTR (20, 16);
11831
11832 if (vm == R31)
11833 {
11834 unsigned sizeof_operation;
11835
11836 if (single)
11837 {
11838 if ((type >= 0) && (type <= 11))
11839 {
11840 int nregs = ((INSTR (13, 13) << 1) | INSTR (21, 21)) + 1;
11841 switch (INSTR (15, 14))
11842 {
11843 case 0:
11844 sizeof_operation = nregs * 1;
11845 break;
11846 case 1:
11847 sizeof_operation = nregs * 2;
11848 break;
11849 case 2:
11850 if (INSTR (10, 10) == 0)
11851 sizeof_operation = nregs * 4;
11852 else
11853 sizeof_operation = nregs * 8;
11854 break;
11855 default:
11856 HALT_UNALLOC;
11857 }
11858 }
11859 else if (type == 0xC)
11860 {
11861 sizeof_operation = INSTR (21, 21) ? 2 : 1;
11862 sizeof_operation <<= INSTR (11, 10);
11863 }
11864 else if (type == 0xE)
11865 {
11866 sizeof_operation = INSTR (21, 21) ? 4 : 3;
11867 sizeof_operation <<= INSTR (11, 10);
11868 }
11869 else
11870 HALT_UNALLOC;
11871 }
11872 else
11873 {
11874 switch (type)
11875 {
11876 case 0: sizeof_operation = 32; break;
11877 case 4: sizeof_operation = 24; break;
11878 case 8: sizeof_operation = 16; break;
11879
11880 case 7:
11881 /* One register, immediate offset variant. */
11882 sizeof_operation = 8;
11883 break;
11884
11885 case 10:
11886 /* Two registers, immediate offset variant. */
11887 sizeof_operation = 16;
11888 break;
11889
11890 case 6:
11891 /* Three registers, immediate offset variant. */
11892 sizeof_operation = 24;
11893 break;
11894
11895 case 2:
11896 /* Four registers, immediate offset variant. */
11897 sizeof_operation = 32;
11898 break;
11899
11900 default:
11901 HALT_UNALLOC;
11902 }
11903
11904 if (INSTR (30, 30))
11905 sizeof_operation *= 2;
11906 }
11907
11908 aarch64_set_reg_u64 (cpu, vn, SP_OK, address + sizeof_operation);
11909 }
11910 else
11911 aarch64_set_reg_u64 (cpu, vn, SP_OK,
11912 address + aarch64_get_reg_u64 (cpu, vm, NO_SP));
11913 }
11914 else
11915 {
11916 NYI_assert (20, 16, 0);
11917 }
11918
11919 if (single)
11920 {
11921 if (load)
11922 {
11923 if ((type >= 0) && (type <= 11))
11924 do_vec_LDn_single (cpu, address);
11925 else if ((type == 0xC) || (type == 0xE))
11926 do_vec_LDnR (cpu, address);
11927 else
11928 HALT_UNALLOC;
11929 return;
11930 }
11931
11932 /* Stores. */
11933 if ((type >= 0) && (type <= 11))
11934 {
11935 do_vec_STn_single (cpu, address);
11936 return;
11937 }
11938
11939 HALT_UNALLOC;
11940 }
11941
11942 if (load)
11943 {
11944 switch (type)
11945 {
11946 case 0: LD4 (cpu, address); return;
11947 case 4: LD3 (cpu, address); return;
11948 case 8: LD2 (cpu, address); return;
11949 case 2: LD1_4 (cpu, address); return;
11950 case 6: LD1_3 (cpu, address); return;
11951 case 10: LD1_2 (cpu, address); return;
11952 case 7: LD1_1 (cpu, address); return;
11953
11954 default:
11955 HALT_UNALLOC;
11956 }
11957 }
11958
11959 /* Stores. */
11960 switch (type)
11961 {
11962 case 0: ST4 (cpu, address); return;
11963 case 4: ST3 (cpu, address); return;
11964 case 8: ST2 (cpu, address); return;
11965 case 2: ST1_4 (cpu, address); return;
11966 case 6: ST1_3 (cpu, address); return;
11967 case 10: ST1_2 (cpu, address); return;
11968 case 7: ST1_1 (cpu, address); return;
11969 default:
11970 HALT_UNALLOC;
11971 }
11972 }
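/* A worked example (editorial addition): 0x4c407000 is
   LD1 {V0.16B}, [X0].  Here single = 0, post = 0, load = 1 and
   type = 0b0111, so the switch above dispatches to LD1_1, and
   instr[30] = 1 selects the full 16-byte form.  */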
11973
11974 static void
11975 dexLdSt (sim_cpu *cpu)
11976 {
11977 /* uint32_t group = dispatchGroup (aarch64_get_instr (cpu));
11978 assert group == GROUP_LDST_0100 || group == GROUP_LDST_0110 ||
11979 group == GROUP_LDST_1100 || group == GROUP_LDST_1110
11980 bits [29,28:26] of a LS are the secondary dispatch vector. */
11981 uint32_t group2 = dispatchLS (aarch64_get_instr (cpu));
11982
11983 switch (group2)
11984 {
11985 case LS_EXCL_000:
11986 dexLoadExclusive (cpu); return;
11987
11988 case LS_LIT_010:
11989 case LS_LIT_011:
11990 dexLoadLiteral (cpu); return;
11991
11992 case LS_OTHER_110:
11993 case LS_OTHER_111:
11994 dexLoadOther (cpu); return;
11995
11996 case LS_ADVSIMD_001:
11997 do_vec_load_store (cpu); return;
11998
11999 case LS_PAIR_100:
12000 dex_load_store_pair_gr (cpu); return;
12001
12002 case LS_PAIR_101:
12003 dex_load_store_pair_fp (cpu); return;
12004
12005 default:
12006 /* Should never reach here. */
12007 HALT_NYI;
12008 }
12009 }
12010
12011 /* Specific decode and execute for group Data Processing Register. */
12012
12013 static void
12014 dexLogicalShiftedRegister (sim_cpu *cpu)
12015 {
12016 /* instr[31] = size : 0 ==> 32 bit, 1 ==> 64 bit
12017 instr[30,29] = op
12018 instr[28,24] = 01010
12019 instr[23,22] = shift : 0 ==> LSL, 1 ==> LSR, 2 ==> ASR, 3 ==> ROR
12020 instr[21] = N
12021 instr[20,16] = Rm
12022 instr[15,10] = count : must be 0xxxxx for 32 bit
12023 instr[9,5] = Rn
12024 instr[4,0] = Rd */
12025
12026 uint32_t size = INSTR (31, 31);
12027 Shift shiftType = INSTR (23, 22);
12028 uint32_t count = INSTR (15, 10);
12029
12030 /* 32 bit operations must have count[5] = 0,
12031 otherwise the instruction is UNALLOC. */
12032 if (size == 0 && uimm (count, 5, 5))
12033 HALT_UNALLOC;
12034
12035 /* Dispatch on size:op:N. */
12036 switch ((INSTR (31, 29) << 1) | INSTR (21, 21))
12037 {
12038 case 0: and32_shift (cpu, shiftType, count); return;
12039 case 1: bic32_shift (cpu, shiftType, count); return;
12040 case 2: orr32_shift (cpu, shiftType, count); return;
12041 case 3: orn32_shift (cpu, shiftType, count); return;
12042 case 4: eor32_shift (cpu, shiftType, count); return;
12043 case 5: eon32_shift (cpu, shiftType, count); return;
12044 case 6: ands32_shift (cpu, shiftType, count); return;
12045 case 7: bics32_shift (cpu, shiftType, count); return;
12046 case 8: and64_shift (cpu, shiftType, count); return;
12047 case 9: bic64_shift (cpu, shiftType, count); return;
12048 case 10: orr64_shift (cpu, shiftType, count); return;
12049 case 11: orn64_shift (cpu, shiftType, count); return;
12050 case 12: eor64_shift (cpu, shiftType, count); return;
12051 case 13: eon64_shift (cpu, shiftType, count); return;
12052 case 14: ands64_shift (cpu, shiftType, count); return;
12053 case 15: bics64_shift (cpu, shiftType, count); return;
12054 }
12055 }
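/* A worked example (editorial addition): 0xaa0103e0 is ORR X0, XZR, X1,
   the canonical MOV X0, X1.  size:op:N = (0b101 << 1) | 0 = 10, so
   orr64_shift runs with shiftType = LSL and count = 0.  */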
12056
12057 /* 32 bit conditional select. */
12058 static void
12059 csel32 (sim_cpu *cpu, CondCode cc)
12060 {
12061 unsigned rm = INSTR (20, 16);
12062 unsigned rn = INSTR (9, 5);
12063 unsigned rd = INSTR (4, 0);
12064
12065 aarch64_set_reg_u64 (cpu, rd, NO_SP,
12066 testConditionCode (cpu, cc)
12067 ? aarch64_get_reg_u32 (cpu, rn, NO_SP)
12068 : aarch64_get_reg_u32 (cpu, rm, NO_SP));
12069 }
12070
12071 /* 64 bit conditional select. */
12072 static void
12073 csel64 (sim_cpu *cpu, CondCode cc)
12074 {
12075 unsigned rm = INSTR (20, 16);
12076 unsigned rn = INSTR (9, 5);
12077 unsigned rd = INSTR (4, 0);
12078
12079 aarch64_set_reg_u64 (cpu, rd, NO_SP,
12080 testConditionCode (cpu, cc)
12081 ? aarch64_get_reg_u64 (cpu, rn, NO_SP)
12082 : aarch64_get_reg_u64 (cpu, rm, NO_SP));
12083 }
12084
12085 /* 32 bit conditional increment. */
12086 static void
12087 csinc32 (sim_cpu *cpu, CondCode cc)
12088 {
12089 unsigned rm = INSTR (20, 16);
12090 unsigned rn = INSTR (9, 5);
12091 unsigned rd = INSTR (4, 0);
12092
12093 aarch64_set_reg_u64 (cpu, rd, NO_SP,
12094 testConditionCode (cpu, cc)
12095 ? aarch64_get_reg_u32 (cpu, rn, NO_SP)
12096 : aarch64_get_reg_u32 (cpu, rm, NO_SP) + 1);
12097 }
12098
12099 /* 64 bit conditional increment. */
12100 static void
12101 csinc64 (sim_cpu *cpu, CondCode cc)
12102 {
12103 unsigned rm = INSTR (20, 16);
12104 unsigned rn = INSTR (9, 5);
12105 unsigned rd = INSTR (4, 0);
12106
12107 aarch64_set_reg_u64 (cpu, rd, NO_SP,
12108 testConditionCode (cpu, cc)
12109 ? aarch64_get_reg_u64 (cpu, rn, NO_SP)
12110 : aarch64_get_reg_u64 (cpu, rm, NO_SP) + 1);
12111 }
12112
12113 /* 32 bit conditional invert. */
12114 static void
12115 csinv32 (sim_cpu *cpu, CondCode cc)
12116 {
12117 unsigned rm = INSTR (20, 16);
12118 unsigned rn = INSTR (9, 5);
12119 unsigned rd = INSTR (4, 0);
12120
12121 aarch64_set_reg_u64 (cpu, rd, NO_SP,
12122 testConditionCode (cpu, cc)
12123 ? aarch64_get_reg_u32 (cpu, rn, NO_SP)
12124 : ~ aarch64_get_reg_u32 (cpu, rm, NO_SP));
12125 }
12126
12127 /* 64 bit conditional invert. */
12128 static void
12129 csinv64 (sim_cpu *cpu, CondCode cc)
12130 {
12131 unsigned rm = INSTR (20, 16);
12132 unsigned rn = INSTR (9, 5);
12133 unsigned rd = INSTR (4, 0);
12134
12135 aarch64_set_reg_u64 (cpu, rd, NO_SP,
12136 testConditionCode (cpu, cc)
12137 ? aarch64_get_reg_u64 (cpu, rn, NO_SP)
12138 : ~ aarch64_get_reg_u64 (cpu, rm, NO_SP));
12139 }
12140
12141 /* 32 bit conditional negate. */
12142 static void
12143 csneg32 (sim_cpu *cpu, CondCode cc)
12144 {
12145 unsigned rm = INSTR (20, 16);
12146 unsigned rn = INSTR (9, 5);
12147 unsigned rd = INSTR (4, 0);
12148
12149 aarch64_set_reg_u64 (cpu, rd, NO_SP,
12150 testConditionCode (cpu, cc)
12151 ? aarch64_get_reg_u32 (cpu, rn, NO_SP)
12152 : - aarch64_get_reg_u32 (cpu, rm, NO_SP));
12153 }
12154
12155 /* 64 bit conditional negate. */
12156 static void
12157 csneg64 (sim_cpu *cpu, CondCode cc)
12158 {
12159 unsigned rm = INSTR (20, 16);
12160 unsigned rn = INSTR (9, 5);
12161 unsigned rd = INSTR (4, 0);
12162
12163 aarch64_set_reg_u64 (cpu, rd, NO_SP,
12164 testConditionCode (cpu, cc)
12165 ? aarch64_get_reg_u64 (cpu, rn, NO_SP)
12166 : - aarch64_get_reg_u64 (cpu, rm, NO_SP));
12167 }
12168
12169 static void
12170 dexCondSelect (sim_cpu *cpu)
12171 {
12172 /* instr[28,21] = 11010100
12173 instr[31] = size : 0 ==> 32 bit, 1 ==> 64 bit
12174 instr[30:11,10] = op : 000 ==> CSEL, 001 ==> CSINC,
12175 100 ==> CSINV, 101 ==> CSNEG,
12176 _1_ ==> UNALLOC
12177 instr[29] = S : 0 ==> ok, 1 ==> UNALLOC
12178 instr[20,16] = Rm
12179 instr[15,12] = cond */
12180
12181 CondCode cc = INSTR (15, 12);
12182 uint32_t S = INSTR (29, 29);
12183 uint32_t op2 = INSTR (11, 10);
12184
12185 if (S == 1)
12186 HALT_UNALLOC;
12187
12188 if (op2 & 0x2)
12189 HALT_UNALLOC;
12190
12191 switch ((INSTR (31, 30) << 1) | op2)
12192 {
12193 case 0: csel32 (cpu, cc); return;
12194 case 1: csinc32 (cpu, cc); return;
12195 case 2: csinv32 (cpu, cc); return;
12196 case 3: csneg32 (cpu, cc); return;
12197 case 4: csel64 (cpu, cc); return;
12198 case 5: csinc64 (cpu, cc); return;
12199 case 6: csinv64 (cpu, cc); return;
12200 case 7: csneg64 (cpu, cc); return;
12201 }
12202 }
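/* A worked example (editorial addition): 0x1a9f17e0 is
   CSINC W0, WZR, WZR, NE, the canonical CSET W0, EQ.  The dispatch
   value (size:op << 1) | op2 = 1 selects csinc32: W0 gets WZR if NE
   holds and WZR + 1 otherwise, i.e. 1 exactly when the Z flag is
   set.  */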
12203
12204 /* Some helpers for counting leading 1 or 0 bits. */
12205
12206 /* Counts the number of leading bits which are the same
12207 in a 32 bit value; the result is in the range 1 to 32. */
12208 static uint32_t
12209 leading32 (uint32_t value)
12210 {
12211 int32_t mask = 0xffff0000;
12212 uint32_t count = 16; /* Counts number of bits set in mask. */
12213 uint32_t lo = 1; /* Lower bound for number of sign bits. */
12214 uint32_t hi = 32; /* Upper bound for number of sign bits. */
12215
12216 while (lo + 1 < hi)
12217 {
12218 int32_t test = (value & mask);
12219
12220 if (test == 0 || test == mask)
12221 {
12222 lo = count;
12223 count = (lo + hi) / 2;
12224 mask >>= (count - lo);
12225 }
12226 else
12227 {
12228 hi = count;
12229 count = (lo + hi) / 2;
12230 mask <<= hi - count;
12231 }
12232 }
12233
12234 if (lo != hi)
12235 {
12236 int32_t test;
12237
12238 mask >>= 1;
12239 test = (value & mask);
12240
12241 if (test == 0 || test == mask)
12242 count = hi;
12243 else
12244 count = lo;
12245 }
12246
12247 return count;
12248 }
12249
12250 /* Counts the number of leading bits which are the same
12251 in a 64 bit value; the result is in the range 1 to 64. */
12252 static uint64_t
12253 leading64 (uint64_t value)
12254 {
12255 int64_t mask = 0xffffffff00000000LL;
12256 uint64_t count = 32; /* Counts number of bits set in mask. */
12257 uint64_t lo = 1; /* Lower bound for number of sign bits. */
12258 uint64_t hi = 64; /* Upper bound for number of sign bits. */
12259
12260 while (lo + 1 < hi)
12261 {
12262 int64_t test = (value & mask);
12263
12264 if (test == 0 || test == mask)
12265 {
12266 lo = count;
12267 count = (lo + hi) / 2;
12268 mask >>= (count - lo);
12269 }
12270 else
12271 {
12272 hi = count;
12273 count = (lo + hi) / 2;
12274 mask <<= hi - count;
12275 }
12276 }
12277
12278 if (lo != hi)
12279 {
12280 int64_t test;
12281
12282 mask >>= 1;
12283 test = (value & mask);
12284
12285 if (test == 0 || test == mask)
12286 count = hi;
12287 else
12288 count = lo;
12289 }
12290
12291 return count;
12292 }
12293
12294 /* Bit operations. */
12295 /* N.B register args may not be SP. */
12296
12297 /* 32 bit count leading sign bits. */
12298 static void
12299 cls32 (sim_cpu *cpu)
12300 {
12301 unsigned rn = INSTR (9, 5);
12302 unsigned rd = INSTR (4, 0);
12303
12304 /* N.B. the result needs to exclude the leading bit. */
12305 aarch64_set_reg_u64
12306 (cpu, rd, NO_SP, leading32 (aarch64_get_reg_u32 (cpu, rn, NO_SP)) - 1);
12307 }
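/* A worked example (editorial addition): for Wn = 0xffff0000 there are
   sixteen identical leading bits, so leading32 returns 16 and CLS
   reports 15 sign bits beyond the top bit.  */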
12308
12309 /* 64 bit count leading sign bits. */
12310 static void
12311 cls64 (sim_cpu *cpu)
12312 {
12313 unsigned rn = INSTR (9, 5);
12314 unsigned rd = INSTR (4, 0);
12315
12316 /* N.B. the result needs to exclude the leading bit. */
12317 aarch64_set_reg_u64
12318 (cpu, rd, NO_SP, leading64 (aarch64_get_reg_u64 (cpu, rn, NO_SP)) - 1);
12319 }
12320
12321 /* 32 bit count leading zero bits. */
12322 static void
12323 clz32 (sim_cpu *cpu)
12324 {
12325 unsigned rn = INSTR (9, 5);
12326 unsigned rd = INSTR (4, 0);
12327 uint32_t value = aarch64_get_reg_u32 (cpu, rn, NO_SP);
12328
12329 /* If the sign (top) bit is set then the count is 0. */
12330 if (pick32 (value, 31, 31))
12331 aarch64_set_reg_u64 (cpu, rd, NO_SP, 0L);
12332 else
12333 aarch64_set_reg_u64 (cpu, rd, NO_SP, leading32 (value));
12334 }
12335
12336 /* 64 bit count leading zero bits. */
12337 static void
12338 clz64 (sim_cpu *cpu)
12339 {
12340 unsigned rn = INSTR (9, 5);
12341 unsigned rd = INSTR (4, 0);
12342 uint64_t value = aarch64_get_reg_u64 (cpu, rn, NO_SP);
12343
12344 /* If the sign (top) bit is set then the count is 0. */
12345 if (pick64 (value, 63, 63))
12346 aarch64_set_reg_u64 (cpu, rd, NO_SP, 0L);
12347 else
12348 aarch64_set_reg_u64 (cpu, rd, NO_SP, leading64 (value));
12349 }
12350
12351 /* 32 bit reverse bits. */
12352 static void
12353 rbit32 (sim_cpu *cpu)
12354 {
12355 unsigned rn = INSTR (9, 5);
12356 unsigned rd = INSTR (4, 0);
12357 uint32_t value = aarch64_get_reg_u32 (cpu, rn, NO_SP);
12358 uint32_t result = 0;
12359 int i;
12360
12361 for (i = 0; i < 32; i++)
12362 {
12363 result <<= 1;
12364 result |= (value & 1);
12365 value >>= 1;
12366 }
12367 aarch64_set_reg_u64 (cpu, rd, NO_SP, result);
12368 }
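
/* For example, rbit32 maps 0x00000001 to 0x80000000 and 0x0000ffff
   to 0xffff0000: bit i of the source becomes bit 31 - i of the
   result. */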
12369
12370 /* 64 bit reverse bits. */
12371 static void
12372 rbit64 (sim_cpu *cpu)
12373 {
12374 unsigned rn = INSTR (9, 5);
12375 unsigned rd = INSTR (4, 0);
12376 uint64_t value = aarch64_get_reg_u64 (cpu, rn, NO_SP);
12377 uint64_t result = 0;
12378 int i;
12379
12380 for (i = 0; i < 64; i++)
12381 {
12382 result <<= 1;
12383 result |= (value & 1UL);
12384 value >>= 1;
12385 }
12386 aarch64_set_reg_u64 (cpu, rd, NO_SP, result);
12387 }
12388
12389 /* 32 bit reverse bytes. */
12390 static void
12391 rev32 (sim_cpu *cpu)
12392 {
12393 unsigned rn = INSTR (9, 5);
12394 unsigned rd = INSTR (4, 0);
12395 uint32_t value = aarch64_get_reg_u32 (cpu, rn, NO_SP);
12396 uint32_t result = 0;
12397 int i;
12398
12399 for (i = 0; i < 4; i++)
12400 {
12401 result <<= 8;
12402 result |= (value & 0xff);
12403 value >>= 8;
12404 }
12405 aarch64_set_reg_u64 (cpu, rd, NO_SP, result);
12406 }
12407
12408 /* 64 bit reverse bytes. */
12409 static void
12410 rev64 (sim_cpu *cpu)
12411 {
12412 unsigned rn = INSTR (9, 5);
12413 unsigned rd = INSTR (4, 0);
12414 uint64_t value = aarch64_get_reg_u64 (cpu, rn, NO_SP);
12415 uint64_t result = 0;
12416 int i;
12417
12418 for (i = 0; i < 8; i++)
12419 {
12420 result <<= 8;
12421 result |= (value & 0xffULL);
12422 value >>= 8;
12423 }
12424 aarch64_set_reg_u64 (cpu, rd, NO_SP, result);
12425 }
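
/* For example, rev64 maps 0x0102030405060708ULL to
   0x0807060504030201ULL -- a full 8 byte swap, as used for endian
   conversion. */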
12426
12427 /* 32 bit reverse shorts. */
12428 /* N.B. this reverses the order of the bytes in each half word. */
12429 static void
12430 revh32 (sim_cpu *cpu)
12431 {
12432 unsigned rn = INSTR (9, 5);
12433 unsigned rd = INSTR (4, 0);
12434 uint32_t value = aarch64_get_reg_u32 (cpu, rn, NO_SP);
12435 uint32_t result = 0;
12436 int i;
12437
12438 for (i = 0; i < 2; i++)
12439 {
12440 result <<= 8;
12441 result |= (value & 0x00ff00ff);
12442 value >>= 8;
12443 }
12444 aarch64_set_reg_u64 (cpu, rd, NO_SP, result);
12445 }
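
/* For example, revh32 maps 0x11223344 to 0x22114433: the bytes of
   each 16 bit half are swapped but the two halves stay in place. */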
12446
12447 /* 64 bit reverse shorts. */
12448 /* N.B. this reverses the order of the bytes in each half word. */
12449 static void
12450 revh64 (sim_cpu *cpu)
12451 {
12452 unsigned rn = INSTR (9, 5);
12453 unsigned rd = INSTR (4, 0);
12454 uint64_t value = aarch64_get_reg_u64 (cpu, rn, NO_SP);
12455 uint64_t result = 0;
12456 int i;
12457
12458 for (i = 0; i < 2; i++)
12459 {
12460 result <<= 8;
12461 result |= (value & 0x00ff00ff00ff00ffULL);
12462 value >>= 8;
12463 }
12464 aarch64_set_reg_u64 (cpu, rd, NO_SP, result);
12465 }
12466
12467 static void
12468 dexDataProc1Source (sim_cpu *cpu)
12469 {
12470 /* instr[30] = 1
12471 instr[28,21] = 11010110
12472 instr[31] = size : 0 ==> 32 bit, 1 ==> 64 bit
12473 instr[29] = S : 0 ==> ok, 1 ==> UNALLOC
12474 instr[20,16] = opcode2 : 00000 ==> ok, ow ==> UNALLOC
12475 instr[15,10] = opcode : 000000 ==> RBIT, 000001 ==> REV16,
12476 000010 ==> REV, 000011 ==> UNALLOC
12477 000100 ==> CLZ, 000101 ==> CLS
12478 ow ==> UNALLOC
12479 instr[9,5] = rn : may not be SP
12480 instr[4,0] = rd : may not be SP. */
12481
12482 uint32_t S = INSTR (29, 29);
12483 uint32_t opcode2 = INSTR (20, 16);
12484 uint32_t opcode = INSTR (15, 10);
12485 uint32_t dispatch = ((INSTR (31, 31) << 3) | opcode);
12486
12487 if (S == 1)
12488 HALT_UNALLOC;
12489
12490 if (opcode2 != 0)
12491 HALT_UNALLOC;
12492
12493 if (opcode & 0x38)
12494 HALT_UNALLOC;
12495
12496 switch (dispatch)
12497 {
12498 case 0: rbit32 (cpu); return;
12499 case 1: revh32 (cpu); return;
12500 case 2: rev32 (cpu); return;
12501 case 4: clz32 (cpu); return;
12502 case 5: cls32 (cpu); return;
12503 case 8: rbit64 (cpu); return;
12504 case 9: revh64 (cpu); return;
12505 case 10: rev32 (cpu); return;
12506 case 11: rev64 (cpu); return;
12507 case 12: clz64 (cpu); return;
12508 case 13: cls64 (cpu); return;
12509 default: HALT_UNALLOC;
12510 }
12511 }
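
/* For example, CLZ Xd, Xn has size == 1 and opcode == 000100, so
   the dispatch value above is (1 << 3) | 4 == 12, selecting clz64. */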
12512
12513 /* Variable shift.
12514 Shifts by count supplied in register.
12515 N.B. register args may not be SP.
12516 These all use the shifted auxiliary function for
12517 simplicity and clarity. Writing the actual shift
12518 inline would avoid a branch and so be faster but
12519 would also necessitate getting signs right. */
12520
12521 /* 32 bit arithmetic shift right. */
12522 static void
12523 asrv32 (sim_cpu *cpu)
12524 {
12525 unsigned rm = INSTR (20, 16);
12526 unsigned rn = INSTR (9, 5);
12527 unsigned rd = INSTR (4, 0);
12528
12529 aarch64_set_reg_u64
12530 (cpu, rd, NO_SP,
12531 shifted32 (aarch64_get_reg_u32 (cpu, rn, NO_SP), ASR,
12532 (aarch64_get_reg_u32 (cpu, rm, NO_SP) & 0x1f)));
12533 }
12534
12535 /* 64 bit arithmetic shift right. */
12536 static void
12537 asrv64 (sim_cpu *cpu)
12538 {
12539 unsigned rm = INSTR (20, 16);
12540 unsigned rn = INSTR (9, 5);
12541 unsigned rd = INSTR (4, 0);
12542
12543 aarch64_set_reg_u64
12544 (cpu, rd, NO_SP,
12545 shifted64 (aarch64_get_reg_u64 (cpu, rn, NO_SP), ASR,
12546 (aarch64_get_reg_u64 (cpu, rm, NO_SP) & 0x3f)));
12547 }
12548
12549 /* 32 bit logical shift left. */
12550 static void
12551 lslv32 (sim_cpu *cpu)
12552 {
12553 unsigned rm = INSTR (20, 16);
12554 unsigned rn = INSTR (9, 5);
12555 unsigned rd = INSTR (4, 0);
12556
12557 aarch64_set_reg_u64
12558 (cpu, rd, NO_SP,
12559 shifted32 (aarch64_get_reg_u32 (cpu, rn, NO_SP), LSL,
12560 (aarch64_get_reg_u32 (cpu, rm, NO_SP) & 0x1f)));
12561 }
12562
12563 /* 64 bit logical shift left. */
12564 static void
12565 lslv64 (sim_cpu *cpu)
12566 {
12567 unsigned rm = INSTR (20, 16);
12568 unsigned rn = INSTR (9, 5);
12569 unsigned rd = INSTR (4, 0);
12570
12571 aarch64_set_reg_u64
12572 (cpu, rd, NO_SP,
12573 shifted64 (aarch64_get_reg_u64 (cpu, rn, NO_SP), LSL,
12574 (aarch64_get_reg_u64 (cpu, rm, NO_SP) & 0x3f)));
12575 }
12576
12577 /* 32 bit logical shift right. */
12578 static void
12579 lsrv32 (sim_cpu *cpu)
12580 {
12581 unsigned rm = INSTR (20, 16);
12582 unsigned rn = INSTR (9, 5);
12583 unsigned rd = INSTR (4, 0);
12584
12585 aarch64_set_reg_u64
12586 (cpu, rd, NO_SP,
12587 shifted32 (aarch64_get_reg_u32 (cpu, rn, NO_SP), LSR,
12588 (aarch64_get_reg_u32 (cpu, rm, NO_SP) & 0x1f)));
12589 }
12590
12591 /* 64 bit logical shift right. */
12592 static void
12593 lsrv64 (sim_cpu *cpu)
12594 {
12595 unsigned rm = INSTR (20, 16);
12596 unsigned rn = INSTR (9, 5);
12597 unsigned rd = INSTR (4, 0);
12598
12599 aarch64_set_reg_u64
12600 (cpu, rd, NO_SP,
12601 shifted64 (aarch64_get_reg_u64 (cpu, rn, NO_SP), LSR,
12602 (aarch64_get_reg_u64 (cpu, rm, NO_SP) & 0x3f)));
12603 }
12604
12605 /* 32 bit rotate right. */
12606 static void
12607 rorv32 (sim_cpu *cpu)
12608 {
12609 unsigned rm = INSTR (20, 16);
12610 unsigned rn = INSTR (9, 5);
12611 unsigned rd = INSTR (4, 0);
12612
12613 aarch64_set_reg_u64
12614 (cpu, rd, NO_SP,
12615 shifted32 (aarch64_get_reg_u32 (cpu, rn, NO_SP), ROR,
12616 (aarch64_get_reg_u32 (cpu, rm, NO_SP) & 0x1f)));
12617 }
12618
12619 /* 64 bit rotate right. */
12620 static void
12621 rorv64 (sim_cpu *cpu)
12622 {
12623 unsigned rm = INSTR (20, 16);
12624 unsigned rn = INSTR (9, 5);
12625 unsigned rd = INSTR (4, 0);
12626
12627 aarch64_set_reg_u64
12628 (cpu, rd, NO_SP,
12629 shifted64 (aarch64_get_reg_u64 (cpu, rn, NO_SP), ROR,
12630 (aarch64_get_reg_u64 (cpu, rm, NO_SP) & 0x3f)));
12631 }
12632
12633
12634 /* divide. */
12635
12636 /* 32 bit signed divide. */
12637 static void
12638 sdiv32 (sim_cpu *cpu)
12639 {
12640 unsigned rm = INSTR (20, 16);
12641 unsigned rn = INSTR (9, 5);
12642 unsigned rd = INSTR (4, 0);
12643 /* N.B. the pseudo-code does the divide using 64 bit data. */
12644 /* TODO : check that this rounds towards zero as required. */
12645 int64_t dividend = aarch64_get_reg_s32 (cpu, rn, NO_SP);
12646 int64_t divisor = aarch64_get_reg_s32 (cpu, rm, NO_SP);
12647
12648 aarch64_set_reg_s64 (cpu, rd, NO_SP,
12649 divisor ? ((int32_t) (dividend / divisor)) : 0);
12650 }
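
/* A note on the TODO above: C integer division truncates towards
   zero (C99 6.5.5), so the expression above gives e.g. -7 / 2 == -3,
   matching the SDIV round-towards-zero requirement, and a zero
   divisor yields 0 as the architecture requires. */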
12651
12652 /* 64 bit signed divide. */
12653 static void
12654 sdiv64 (sim_cpu *cpu)
12655 {
12656 unsigned rm = INSTR (20, 16);
12657 unsigned rn = INSTR (9, 5);
12658 unsigned rd = INSTR (4, 0);
12659
12660 /* TODO : check that this rounds towards zero as required. */
12661 int64_t divisor = aarch64_get_reg_s64 (cpu, rm, NO_SP);
12662
12663 aarch64_set_reg_s64
12664 (cpu, rd, NO_SP,
12665 divisor ? (aarch64_get_reg_s64 (cpu, rn, NO_SP) / divisor) : 0);
12666 }
12667
12668 /* 32 bit unsigned divide. */
12669 static void
12670 udiv32 (sim_cpu *cpu)
12671 {
12672 unsigned rm = INSTR (20, 16);
12673 unsigned rn = INSTR (9, 5);
12674 unsigned rd = INSTR (4, 0);
12675
12676 /* N.B. the pseudo-code does the divide using 64 bit data. */
12677 uint64_t dividend = aarch64_get_reg_u32 (cpu, rn, NO_SP);
12678 uint64_t divisor = aarch64_get_reg_u32 (cpu, rm, NO_SP);
12679
12680 aarch64_set_reg_u64 (cpu, rd, NO_SP,
12681 divisor ? (uint32_t) (dividend / divisor) : 0);
12682 }
12683
12684 /* 64 bit unsigned divide. */
12685 static void
12686 udiv64 (sim_cpu *cpu)
12687 {
12688 unsigned rm = INSTR (20, 16);
12689 unsigned rn = INSTR (9, 5);
12690 unsigned rd = INSTR (4, 0);
12691
12692 /* TODO : check that this rounds towards zero as required. */
12693 uint64_t divisor = aarch64_get_reg_u64 (cpu, rm, NO_SP);
12694
12695 aarch64_set_reg_u64
12696 (cpu, rd, NO_SP,
12697 divisor ? (aarch64_get_reg_u64 (cpu, rn, NO_SP) / divisor) : 0);
12698 }
12699
12700 static void
12701 dexDataProc2Source (sim_cpu *cpu)
12702 {
12703 /* assert instr[30] == 0
12704 instr[28,21] == 11010110
12705 instr[31] = size : 0 ==> 32 bit, 1 ==> 64 bit
12706 instr[29] = S : 0 ==> ok, 1 ==> UNALLOC
12707 instr[15,10] = opcode : 000010 ==> UDIV, 000011 ==> SDIV,
12708 001000 ==> LSLV, 001001 ==> LSRV
12709 001010 ==> ASRV, 001011 ==> RORV
12710 ow ==> UNALLOC. */
12711
12712 uint32_t dispatch;
12713 uint32_t S = INSTR (29, 29);
12714 uint32_t opcode = INSTR (15, 10);
12715
12716 if (S == 1)
12717 HALT_UNALLOC;
12718
12719 if (opcode & 0x34)
12720 HALT_UNALLOC;
12721
12722 dispatch = ( (INSTR (31, 31) << 3)
12723 | (uimm (opcode, 3, 3) << 2)
12724 | uimm (opcode, 1, 0));
12725 switch (dispatch)
12726 {
12727 case 2: udiv32 (cpu); return;
12728 case 3: sdiv32 (cpu); return;
12729 case 4: lslv32 (cpu); return;
12730 case 5: lsrv32 (cpu); return;
12731 case 6: asrv32 (cpu); return;
12732 case 7: rorv32 (cpu); return;
12733 case 10: udiv64 (cpu); return;
12734 case 11: sdiv64 (cpu); return;
12735 case 12: lslv64 (cpu); return;
12736 case 13: lsrv64 (cpu); return;
12737 case 14: asrv64 (cpu); return;
12738 case 15: rorv64 (cpu); return;
12739 default: HALT_UNALLOC;
12740 }
12741 }
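
/* For example, UDIV Wd, Wn, Wm has size == 0 and opcode == 000010,
   giving dispatch == (0 << 3) | (0 << 2) | 2 == 2, which selects
   udiv32 above. */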
12742
12743
12744 /* Multiply. */
12745
12746 /* 32 bit multiply and add. */
12747 static void
12748 madd32 (sim_cpu *cpu)
12749 {
12750 unsigned rm = INSTR (20, 16);
12751 unsigned ra = INSTR (14, 10);
12752 unsigned rn = INSTR (9, 5);
12753 unsigned rd = INSTR (4, 0);
12754
12755 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
12756 aarch64_set_reg_u64 (cpu, rd, NO_SP,
12757 aarch64_get_reg_u32 (cpu, ra, NO_SP)
12758 + aarch64_get_reg_u32 (cpu, rn, NO_SP)
12759 * aarch64_get_reg_u32 (cpu, rm, NO_SP));
12760 }
12761
12762 /* 64 bit multiply and add. */
12763 static void
12764 madd64 (sim_cpu *cpu)
12765 {
12766 unsigned rm = INSTR (20, 16);
12767 unsigned ra = INSTR (14, 10);
12768 unsigned rn = INSTR (9, 5);
12769 unsigned rd = INSTR (4, 0);
12770
12771 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
12772 aarch64_set_reg_u64 (cpu, rd, NO_SP,
12773 aarch64_get_reg_u64 (cpu, ra, NO_SP)
12774 + (aarch64_get_reg_u64 (cpu, rn, NO_SP)
12775 * aarch64_get_reg_u64 (cpu, rm, NO_SP)));
12776 }
12777
12778 /* 32 bit multiply and sub. */
12779 static void
12780 msub32 (sim_cpu *cpu)
12781 {
12782 unsigned rm = INSTR (20, 16);
12783 unsigned ra = INSTR (14, 10);
12784 unsigned rn = INSTR (9, 5);
12785 unsigned rd = INSTR (4, 0);
12786
12787 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
12788 aarch64_set_reg_u64 (cpu, rd, NO_SP,
12789 aarch64_get_reg_u32 (cpu, ra, NO_SP)
12790 - aarch64_get_reg_u32 (cpu, rn, NO_SP)
12791 * aarch64_get_reg_u32 (cpu, rm, NO_SP));
12792 }
12793
12794 /* 64 bit multiply and sub. */
12795 static void
12796 msub64 (sim_cpu *cpu)
12797 {
12798 unsigned rm = INSTR (20, 16);
12799 unsigned ra = INSTR (14, 10);
12800 unsigned rn = INSTR (9, 5);
12801 unsigned rd = INSTR (4, 0);
12802
12803 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
12804 aarch64_set_reg_u64 (cpu, rd, NO_SP,
12805 aarch64_get_reg_u64 (cpu, ra, NO_SP)
12806 - aarch64_get_reg_u64 (cpu, rn, NO_SP)
12807 * aarch64_get_reg_u64 (cpu, rm, NO_SP));
12808 }
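
/* N.B. the multiply-add forms subsume the plain multiplies: MUL is
   the alias MADD rd, rn, rm, xzr and MNEG is MSUB rd, rn, rm, xzr,
   since reading register 31 with NO_SP yields the zero register,
   leaving rd == rn * rm (or its negation) above. */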
12809
12810 /* Signed multiply add long -- source, source2 : 32 bit, source3 : 64 bit. */
12811 static void
12812 smaddl (sim_cpu *cpu)
12813 {
12814 unsigned rm = INSTR (20, 16);
12815 unsigned ra = INSTR (14, 10);
12816 unsigned rn = INSTR (9, 5);
12817 unsigned rd = INSTR (4, 0);
12818
12819 /* N.B. we need to multiply the signed 32 bit values in rn, rm to
12820 obtain a 64 bit product. */
12821 aarch64_set_reg_s64
12822 (cpu, rd, NO_SP,
12823 aarch64_get_reg_s64 (cpu, ra, NO_SP)
12824 + ((int64_t) aarch64_get_reg_s32 (cpu, rn, NO_SP))
12825 * ((int64_t) aarch64_get_reg_s32 (cpu, rm, NO_SP)));
12826 }
12827
12828 /* Signed multiply sub long -- source, source2 : 32 bit, source3 : 64 bit. */
12829 static void
12830 smsubl (sim_cpu *cpu)
12831 {
12832 unsigned rm = INSTR (20, 16);
12833 unsigned ra = INSTR (14, 10);
12834 unsigned rn = INSTR (9, 5);
12835 unsigned rd = INSTR (4, 0);
12836
12837 /* N.B. we need to multiply the signed 32 bit values in rn, rm to
12838 obtain a 64 bit product. */
12839 aarch64_set_reg_s64
12840 (cpu, rd, NO_SP,
12841 aarch64_get_reg_s64 (cpu, ra, NO_SP)
12842 - ((int64_t) aarch64_get_reg_s32 (cpu, rn, NO_SP))
12843 * ((int64_t) aarch64_get_reg_s32 (cpu, rm, NO_SP)));
12844 }
12845
12846 /* Integer Multiply/Divide. */
12847
12848 /* First some macros and a helper function. */
12849 /* Macros to test or access elements of 64 bit words. */
12850
12851 /* Mask used to access lo 32 bits of 64 bit unsigned int. */
12852 #define LOW_WORD_MASK ((1ULL << 32) - 1)
12853 /* Return the lo 32 bit word of a 64 bit unsigned int as a 64 bit unsigned int. */
12854 #define lowWordToU64(_value_u64) ((_value_u64) & LOW_WORD_MASK)
12855 /* Return the hi 32 bit word of a 64 bit unsigned int as a 64 bit unsigned int. */
12856 #define highWordToU64(_value_u64) ((_value_u64) >> 32)
12857
12858 /* Offset of sign bit in 64 bit signed integer. */
12859 #define SIGN_SHIFT_U64 63
12860 /* The sign bit itself -- also identifies the minimum negative int value. */
12861 #define SIGN_BIT_U64 (1UL << SIGN_SHIFT_U64)
12862 /* Return true if a 64 bit signed int presented as an unsigned int is the
12863 most negative value. */
12864 #define isMinimumU64(_value_u64) ((_value_u64) == SIGN_BIT_U64)
12865 /* Return true (non-zero) if a 64 bit signed int presented as an unsigned
12866    int has its sign bit set. */
12867 #define isSignSetU64(_value_u64) ((_value_u64) & SIGN_BIT_U64)
12868 /* Return 1L or -1L according to whether a 64 bit signed int presented as
12869 an unsigned int has its sign bit set or not. */
12870 #define signOfU64(_value_u64) (1L + (((_value_u64) >> SIGN_SHIFT_U64) * -2L))
12871 /* Clear the sign bit of a 64 bit signed int presented as an unsigned int. */
12872 #define clearSignU64(_value_u64) ((_value_u64) &= ~SIGN_BIT_U64)
12873
12874 /* Multiply two 64 bit ints and return
12875    the hi 64 bits of the 128 bit product. */
12876
12877 static uint64_t
12878 mul64hi (uint64_t value1, uint64_t value2)
12879 {
12880 uint64_t resultmid1;
12881 uint64_t result;
12882 uint64_t value1_lo = lowWordToU64 (value1);
12883 uint64_t value1_hi = highWordToU64 (value1) ;
12884 uint64_t value2_lo = lowWordToU64 (value2);
12885 uint64_t value2_hi = highWordToU64 (value2);
12886
12887 /* Cross-multiply and collect results. */
12888 uint64_t xproductlo = value1_lo * value2_lo;
12889 uint64_t xproductmid1 = value1_lo * value2_hi;
12890 uint64_t xproductmid2 = value1_hi * value2_lo;
12891 uint64_t xproducthi = value1_hi * value2_hi;
12892 uint64_t carry = 0;
12893 /* Start accumulating 64 bit results. */
12894 /* Drop bottom half of lowest cross-product. */
12895 uint64_t resultmid = xproductlo >> 32;
12896 /* Add in middle products. */
12897 resultmid = resultmid + xproductmid1;
12898
12899 /* Check for overflow. */
12900 if (resultmid < xproductmid1)
12901 /* Carry over 1 into top cross-product. */
12902 carry++;
12903
12904 resultmid1 = resultmid + xproductmid2;
12905
12906 /* Check for overflow. */
12907 if (resultmid1 < xproductmid2)
12908 /* Carry over 1 into top cross-product. */
12909 carry++;
12910
12911 /* Drop lowest 32 bits of middle cross-product. */
12912 result = resultmid1 >> 32;
12913
12914 /* Add the top cross-product and any carry. */
12915 result += xproducthi + carry;
12916
12917 return result;
12918 }
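
/* A quick check of the cross-multiplication above:
   mul64hi (1ULL << 32, 1ULL << 32) has xproductlo, xproductmid1 and
   xproductmid2 all zero and xproducthi == 1, returning 1 -- the
   high half of the 128 bit product 2^64. */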
12919
12920 /* Signed multiply high, source, source2 :
12921 64 bit, dest <-- high 64-bit of result. */
12922 static void
12923 smulh (sim_cpu *cpu)
12924 {
12925 uint64_t result;
12926 unsigned rm = INSTR (20, 16);
12927 unsigned rn = INSTR (9, 5);
12928 unsigned rd = INSTR (4, 0);
12929 GReg ra = INSTR (14, 10);
12930 uint64_t value1 = aarch64_get_reg_u64 (cpu, rn, NO_SP);
12931 uint64_t value2 = aarch64_get_reg_u64 (cpu, rm, NO_SP);
12932
12933 if (ra != R31)
12934 HALT_UNALLOC;
12935
12936 /* Compute the high half of the unsigned 128 bit product with
12937 mul64hi and then correct it for the operand signs: viewing a
12938 negative operand as unsigned adds 2^64 to it, which adds
12939 value2 * 2^64 (resp. value1 * 2^64) to the product, so the
12940 signed high half is the unsigned high half minus value2
12941 and/or value1.
12942
12943 N.B. negating the operands, multiplying and negating the
12944 result would be off by one whenever the low half of the
12945 magnitude product is non-zero; e.g. SMULH (-1, 1) must yield
12946 -1, not -(mul64hi (1, 1)) == 0. */
12947 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
12948 result = mul64hi (value1, value2);
12949
12950 if ((int64_t) value1 < 0)
12951 result -= value2;
12952
12953 if ((int64_t) value2 < 0)
12954 result -= value1;
12955
12956 aarch64_set_reg_s64 (cpu, rd, NO_SP, (int64_t) result);
12957 }
12969
12970 /* Unsigned multiply add long -- source, source2 :
12971 32 bit, source3 : 64 bit. */
12972 static void
12973 umaddl (sim_cpu *cpu)
12974 {
12975 unsigned rm = INSTR (20, 16);
12976 unsigned ra = INSTR (14, 10);
12977 unsigned rn = INSTR (9, 5);
12978 unsigned rd = INSTR (4, 0);
12979
12980 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
12981 /* N.B. we need to multiply the unsigned 32 bit values in rn, rm to
12982 obtain a 64 bit product. */
12983 aarch64_set_reg_u64
12984 (cpu, rd, NO_SP,
12985 aarch64_get_reg_u64 (cpu, ra, NO_SP)
12986 + ((uint64_t) aarch64_get_reg_u32 (cpu, rn, NO_SP))
12987 * ((uint64_t) aarch64_get_reg_u32 (cpu, rm, NO_SP)));
12988 }
12989
12990 /* Unsigned multiply sub long -- source, source2 : 32 bit, source3 : 64 bit. */
12991 static void
12992 umsubl (sim_cpu *cpu)
12993 {
12994 unsigned rm = INSTR (20, 16);
12995 unsigned ra = INSTR (14, 10);
12996 unsigned rn = INSTR (9, 5);
12997 unsigned rd = INSTR (4, 0);
12998
12999 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
13000 /* N.B. we need to multiply the unsigned 32 bit values in rn, rm to
13001 obtain a 64 bit product. */
13002 aarch64_set_reg_u64
13003 (cpu, rd, NO_SP,
13004 aarch64_get_reg_u64 (cpu, ra, NO_SP)
13005 - ((uint64_t) aarch64_get_reg_u32 (cpu, rn, NO_SP))
13006 * ((uint64_t) aarch64_get_reg_u32 (cpu, rm, NO_SP)));
13007 }
13008
13009 /* Unsigned multiply high, source, source2 :
13010 64 bit, dest <-- high 64-bit of result. */
13011 static void
13012 umulh (sim_cpu *cpu)
13013 {
13014 unsigned rm = INSTR (20, 16);
13015 unsigned rn = INSTR (9, 5);
13016 unsigned rd = INSTR (4, 0);
13017 GReg ra = INSTR (14, 10);
13018
13019 if (ra != R31)
13020 HALT_UNALLOC;
13021
13022 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
13023 aarch64_set_reg_u64 (cpu, rd, NO_SP,
13024 mul64hi (aarch64_get_reg_u64 (cpu, rn, NO_SP),
13025 aarch64_get_reg_u64 (cpu, rm, NO_SP)));
13026 }
13027
13028 static void
13029 dexDataProc3Source (sim_cpu *cpu)
13030 {
13031 /* assert instr[28,24] == 11011. */
13032 /* instr[31] = size : 0 ==> 32 bit, 1 ==> 64 bit (for rd at least)
13033 instr[30,29] = op54 : 00 ==> ok, ow ==> UNALLOC
13034 instr[23,21] = op31 : 111 ==> UNALLOC, ow ==> ok
13035 instr[15] = o0 : 0/1 ==> ok
13036 instr[23,21:15] ==> op : 0000 ==> MADD, 0001 ==> MSUB, (32/64 bit)
13037 0010 ==> SMADDL, 0011 ==> SMSUBL, (64 bit only)
13038 0100 ==> SMULH, (64 bit only)
13039 1010 ==> UMADDL, 1011 ==> UMSUBL, (64 bit only)
13040 1100 ==> UMULH (64 bit only)
13041 ow ==> UNALLOC. */
13042
13043 uint32_t dispatch;
13044 uint32_t size = INSTR (31, 31);
13045 uint32_t op54 = INSTR (30, 29);
13046 uint32_t op31 = INSTR (23, 21);
13047 uint32_t o0 = INSTR (15, 15);
13048
13049 if (op54 != 0)
13050 HALT_UNALLOC;
13051
13052 if (size == 0)
13053 {
13054 if (op31 != 0)
13055 HALT_UNALLOC;
13056
13057 if (o0 == 0)
13058 madd32 (cpu);
13059 else
13060 msub32 (cpu);
13061 return;
13062 }
13063
13064 dispatch = (op31 << 1) | o0;
13065
13066 switch (dispatch)
13067 {
13068 case 0: madd64 (cpu); return;
13069 case 1: msub64 (cpu); return;
13070 case 2: smaddl (cpu); return;
13071 case 3: smsubl (cpu); return;
13072 case 4: smulh (cpu); return;
13073 case 10: umaddl (cpu); return;
13074 case 11: umsubl (cpu); return;
13075 case 12: umulh (cpu); return;
13076 default: HALT_UNALLOC;
13077 }
13078 }
13079
13080 static void
13081 dexDPReg (sim_cpu *cpu)
13082 {
13083 /* uint32_t group = dispatchGroup (aarch64_get_instr (cpu));
13084 assert group == GROUP_DPREG_0101 || group == GROUP_DPREG_1101
13085 bits [28:24:21] of a DPReg are the secondary dispatch vector. */
13086 uint32_t group2 = dispatchDPReg (aarch64_get_instr (cpu));
13087
13088 switch (group2)
13089 {
13090 case DPREG_LOG_000:
13091 case DPREG_LOG_001:
13092 dexLogicalShiftedRegister (cpu); return;
13093
13094 case DPREG_ADDSHF_010:
13095 dexAddSubtractShiftedRegister (cpu); return;
13096
13097 case DPREG_ADDEXT_011:
13098 dexAddSubtractExtendedRegister (cpu); return;
13099
13100 case DPREG_ADDCOND_100:
13101 {
13102 /* This set bundles a variety of different operations. */
13103 /* Check for: */
13104 /* 1) add/sub w carry. */
13105 uint32_t mask1 = 0x1FE00000U;
13106 uint32_t val1 = 0x1A000000U;
13107 /* 2) cond compare register/immediate. */
13108 uint32_t mask2 = 0x1FE00000U;
13109 uint32_t val2 = 0x1A400000U;
13110 /* 3) cond select. */
13111 uint32_t mask3 = 0x1FE00000U;
13112 uint32_t val3 = 0x1A800000U;
13113 /* 4) data proc 1/2 source. */
13114 uint32_t mask4 = 0x1FE00000U;
13115 uint32_t val4 = 0x1AC00000U;
13116
13117 if ((aarch64_get_instr (cpu) & mask1) == val1)
13118 dexAddSubtractWithCarry (cpu);
13119
13120 else if ((aarch64_get_instr (cpu) & mask2) == val2)
13121 CondCompare (cpu);
13122
13123 else if ((aarch64_get_instr (cpu) & mask3) == val3)
13124 dexCondSelect (cpu);
13125
13126 else if ((aarch64_get_instr (cpu) & mask4) == val4)
13127 {
13128 /* Bit 30 is clear for data proc 2 source
13129 and set for data proc 1 source. */
13130 if (aarch64_get_instr (cpu) & (1U << 30))
13131 dexDataProc1Source (cpu);
13132 else
13133 dexDataProc2Source (cpu);
13134 }
13135
13136 else
13137 /* Should not reach here. */
13138 HALT_NYI;
13139
13140 return;
13141 }
13142
13143 case DPREG_3SRC_110:
13144 dexDataProc3Source (cpu); return;
13145
13146 case DPREG_UNALLOC_101:
13147 HALT_UNALLOC;
13148
13149 case DPREG_3SRC_111:
13150 dexDataProc3Source (cpu); return;
13151
13152 default:
13153 /* Should never reach here. */
13154 HALT_NYI;
13155 }
13156 }
13157
13158 /* Unconditional Branch immediate.
13159 Offset is a PC-relative byte offset in the range +/- 128MiB.
13160 The instruction holds a word offset which the decode routine is
13161 expected to scale to the byte offset passed in here. */
13162
13163 /* Unconditional branch. */
13164 static void
13165 buc (sim_cpu *cpu, int32_t offset)
13166 {
13167 aarch64_set_next_PC_by_offset (cpu, offset);
13168 }
13169
13170 static unsigned stack_depth = 0;
13171
13172 /* Unconditional branch and link -- writes return PC to LR. */
13173 static void
13174 bl (sim_cpu *cpu, int32_t offset)
13175 {
13176 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
13177 aarch64_save_LR (cpu);
13178 aarch64_set_next_PC_by_offset (cpu, offset);
13179
13180 if (TRACE_BRANCH_P (cpu))
13181 {
13182 ++ stack_depth;
13183 TRACE_BRANCH (cpu,
13184 " %*scall %" PRIx64 " [%s]"
13185 " [args: %" PRIx64 " %" PRIx64 " %" PRIx64 "]",
13186 stack_depth, " ", aarch64_get_next_PC (cpu),
13187 aarch64_get_func (CPU_STATE (cpu),
13188 aarch64_get_next_PC (cpu)),
13189 aarch64_get_reg_u64 (cpu, 0, NO_SP),
13190 aarch64_get_reg_u64 (cpu, 1, NO_SP),
13191 aarch64_get_reg_u64 (cpu, 2, NO_SP)
13192 );
13193 }
13194 }
13195
13196 /* Unconditional Branch register.
13197 Branch/return address is in source register. */
13198
13199 /* Unconditional branch. */
13200 static void
13201 br (sim_cpu *cpu)
13202 {
13203 unsigned rn = INSTR (9, 5);
13204 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
13205 aarch64_set_next_PC (cpu, aarch64_get_reg_u64 (cpu, rn, NO_SP));
13206 }
13207
13208 /* Unconditional branch and link -- writes return PC to LR. */
13209 static void
13210 blr (sim_cpu *cpu)
13211 {
13212 unsigned rn = INSTR (9, 5);
13213
13214 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
13215 /* The pseudo code in the spec says we update LR before fetching
13216 the value from rn. */
13217 aarch64_save_LR (cpu);
13218 aarch64_set_next_PC (cpu, aarch64_get_reg_u64 (cpu, rn, NO_SP));
13219
13220 if (TRACE_BRANCH_P (cpu))
13221 {
13222 ++ stack_depth;
13223 TRACE_BRANCH (cpu,
13224 " %*scall %" PRIx64 " [%s]"
13225 " [args: %" PRIx64 " %" PRIx64 " %" PRIx64 "]",
13226 stack_depth, " ", aarch64_get_next_PC (cpu),
13227 aarch64_get_func (CPU_STATE (cpu),
13228 aarch64_get_next_PC (cpu)),
13229 aarch64_get_reg_u64 (cpu, 0, NO_SP),
13230 aarch64_get_reg_u64 (cpu, 1, NO_SP),
13231 aarch64_get_reg_u64 (cpu, 2, NO_SP)
13232 );
13233 }
13234 }
13235
13236 /* Return -- the assembler will default the source to LR.  This is
13237 functionally equivalent to br but, presumably, unlike br it side
13238 effects the branch predictor. */
13239 static void
13240 ret (sim_cpu *cpu)
13241 {
13242 unsigned rn = INSTR (9, 5);
13243 aarch64_set_next_PC (cpu, aarch64_get_reg_u64 (cpu, rn, NO_SP));
13244
13245 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
13246 if (TRACE_BRANCH_P (cpu))
13247 {
13248 TRACE_BRANCH (cpu,
13249 " %*sreturn [result: %" PRIx64 "]",
13250 stack_depth, " ", aarch64_get_reg_u64 (cpu, 0, NO_SP));
13251 -- stack_depth;
13252 }
13253 }
13254
13255 /* NOP -- we implement this and call it from the decode in case we
13256 want to intercept it later. */
13257
13258 static void
13259 nop (sim_cpu *cpu)
13260 {
13261 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
13262 }
13263
13264 /* Data synchronization barrier. */
13265
13266 static void
13267 dsb (sim_cpu *cpu)
13268 {
13269 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
13270 }
13271
13272 /* Data memory barrier. */
13273
13274 static void
13275 dmb (sim_cpu *cpu)
13276 {
13277 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
13278 }
13279
13280 /* Instruction synchronization barrier. */
13281
13282 static void
13283 isb (sim_cpu *cpu)
13284 {
13285 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
13286 }
13287
13288 static void
13289 dexBranchImmediate (sim_cpu *cpu)
13290 {
13291 /* assert instr[30,26] == 00101
13292 instr[31] ==> 0 == B, 1 == BL
13293 instr[25,0] == imm26 branch offset counted in words. */
13294
13295 uint32_t top = INSTR (31, 31);
13296 /* We have a 26 bit signed word offset which we need to pass to the
13297 execute routine as a signed byte offset. */
13298 int32_t offset = simm32 (aarch64_get_instr (cpu), 25, 0) << 2;
13299
13300 if (top)
13301 bl (cpu, offset);
13302 else
13303 buc (cpu, offset);
13304 }
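
/* For example, a branch back to the previous instruction (B .-4)
   encodes imm26 == 0x3ffffff; simm32 sign-extends this to -1 and
   the shift above scales it to the byte offset -4. */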
13305
13306 /* Control Flow. */
13307
13308 /* Conditional branch
13309
13310 Offset is a PC-relative byte offset in the range +/- 1MiB.  pos is
13311 a bit position in the range 0 .. 63.
13312
13313 cc is a CondCode enum value as pulled out of the decode
13314
13315 N.B. any offset register (source) can only be Xn or Wn. */
13316
13317 static void
13318 bcc (sim_cpu *cpu, int32_t offset, CondCode cc)
13319 {
13320 /* The test returns TRUE if CC is met. */
13321 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
13322 if (testConditionCode (cpu, cc))
13323 aarch64_set_next_PC_by_offset (cpu, offset);
13324 }
13325
13326 /* 32 bit branch on register non-zero. */
13327 static void
13328 cbnz32 (sim_cpu *cpu, int32_t offset)
13329 {
13330 unsigned rt = INSTR (4, 0);
13331
13332 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
13333 if (aarch64_get_reg_u32 (cpu, rt, NO_SP) != 0)
13334 aarch64_set_next_PC_by_offset (cpu, offset);
13335 }
13336
13337 /* 64 bit branch on register non-zero. */
13338 static void
13339 cbnz (sim_cpu *cpu, int32_t offset)
13340 {
13341 unsigned rt = INSTR (4, 0);
13342
13343 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
13344 if (aarch64_get_reg_u64 (cpu, rt, NO_SP) != 0)
13345 aarch64_set_next_PC_by_offset (cpu, offset);
13346 }
13347
13348 /* 32 bit branch on register zero. */
13349 static void
13350 cbz32 (sim_cpu *cpu, int32_t offset)
13351 {
13352 unsigned rt = INSTR (4, 0);
13353
13354 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
13355 if (aarch64_get_reg_u32 (cpu, rt, NO_SP) == 0)
13356 aarch64_set_next_PC_by_offset (cpu, offset);
13357 }
13358
13359 /* 64 bit branch on register zero. */
13360 static void
13361 cbz (sim_cpu *cpu, int32_t offset)
13362 {
13363 unsigned rt = INSTR (4, 0);
13364
13365 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
13366 if (aarch64_get_reg_u64 (cpu, rt, NO_SP) == 0)
13367 aarch64_set_next_PC_by_offset (cpu, offset);
13368 }
13369
13370 /* Branch on register bit test non-zero -- one size fits all. */
13371 static void
13372 tbnz (sim_cpu *cpu, uint32_t pos, int32_t offset)
13373 {
13374 unsigned rt = INSTR (4, 0);
13375
13376 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
13377 if (aarch64_get_reg_u64 (cpu, rt, NO_SP) & (((uint64_t) 1) << pos))
13378 aarch64_set_next_PC_by_offset (cpu, offset);
13379 }
13380
13381 /* Branch on register bit test zero -- one size fits all. */
13382 static void
13383 tbz (sim_cpu *cpu, uint32_t pos, int32_t offset)
13384 {
13385 unsigned rt = INSTR (4, 0);
13386
13387 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
13388 if (!(aarch64_get_reg_u64 (cpu, rt, NO_SP) & (((uint64_t) 1) << pos)))
13389 aarch64_set_next_PC_by_offset (cpu, offset);
13390 }
13391
13392 static void
13393 dexCompareBranchImmediate (sim_cpu *cpu)
13394 {
13395 /* instr[30,25] = 01 1010
13396 instr[31] = size : 0 ==> 32, 1 ==> 64
13397 instr[24] = op : 0 ==> CBZ, 1 ==> CBNZ
13398 instr[23,5] = simm19 branch offset counted in words
13399 instr[4,0] = rt */
13400
13401 uint32_t size = INSTR (31, 31);
13402 uint32_t op = INSTR (24, 24);
13403 int32_t offset = simm32 (aarch64_get_instr (cpu), 23, 5) << 2;
13404
13405 if (size == 0)
13406 {
13407 if (op == 0)
13408 cbz32 (cpu, offset);
13409 else
13410 cbnz32 (cpu, offset);
13411 }
13412 else
13413 {
13414 if (op == 0)
13415 cbz (cpu, offset);
13416 else
13417 cbnz (cpu, offset);
13418 }
13419 }
13420
13421 static void
13422 dexTestBranchImmediate (sim_cpu *cpu)
13423 {
13424 /* instr[31] = b5 : bit 5 of test bit idx
13425 instr[30,25] = 01 1011
13426 instr[24] = op : 0 ==> TBZ, 1 == TBNZ
13427 instr[23,19] = b40 : bits 4 to 0 of test bit idx
13428 instr[18,5] = simm14 : signed offset counted in words
13429 instr[4,0] = uimm5 */
13430
13431 uint32_t pos = ((INSTR (31, 31) << 5) | INSTR (23, 19));
13432 int32_t offset = simm32 (aarch64_get_instr (cpu), 18, 5) << 2;
13433
13434 NYI_assert (30, 25, 0x1b);
13435
13436 if (INSTR (24, 24) == 0)
13437 tbz (cpu, pos, offset);
13438 else
13439 tbnz (cpu, pos, offset);
13440 }
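
/* For example, TBZ Xt, #40, label splits the bit index 40
   (0b101000) into b5 == 1 and b40 == 0b01000, which the composition
   above reassembles as pos == 40. */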
13441
13442 static void
13443 dexCondBranchImmediate (sim_cpu *cpu)
13444 {
13445 /* instr[31,25] = 010 1010
13446 instr[24] = op1 : op1:op0 == 00 ==> B.cond, ow ==> UNALLOC
13447 instr[23,5] = simm19 : signed offset counted in words
13448 instr[4] = op0
13449 instr[3,0] = cond */
13450
13451 int32_t offset;
13452 uint32_t op = ((INSTR (24, 24) << 1) | INSTR (4, 4));
13453
13454 NYI_assert (31, 25, 0x2a);
13455
13456 if (op != 0)
13457 HALT_UNALLOC;
13458
13459 offset = simm32 (aarch64_get_instr (cpu), 23, 5) << 2;
13460
13461 bcc (cpu, offset, INSTR (3, 0));
13462 }
13463
13464 static void
13465 dexBranchRegister (sim_cpu *cpu)
13466 {
13467 /* instr[31,25] = 110 1011
13468 instr[24,21] = op : 0 ==> BR, 1 => BLR, 2 => RET, 4 => ERET, 5 => DRPS
13469 instr[20,16] = op2 : must be 11111
13470 instr[15,10] = op3 : must be 000000
13471 instr[4,0] = op4 : must be 00000. */
13472
13473 uint32_t op = INSTR (24, 21);
13474 uint32_t op2 = INSTR (20, 16);
13475 uint32_t op3 = INSTR (15, 10);
13476 uint32_t op4 = INSTR (4, 0);
13477
13478 NYI_assert (31, 25, 0x6b);
13479
13480 if (op2 != 0x1F || op3 != 0 || op4 != 0)
13481 HALT_UNALLOC;
13482
13483 if (op == 0)
13484 br (cpu);
13485
13486 else if (op == 1)
13487 blr (cpu);
13488
13489 else if (op == 2)
13490 ret (cpu);
13491
13492 else
13493 {
13494 /* ERET and DRPS accept 0b11111 for rn = instr [9,5];
13495 anything else is unallocated. */
13496 uint32_t rn = INSTR (9, 5);
13497
13498 if (rn != 0x1f)
13499 HALT_UNALLOC;
13500
13501 if (op == 4 || op == 5)
13502 HALT_NYI;
13503
13504 HALT_UNALLOC;
13505 }
13506 }
13507
13508 /* FIXME: We should get the Angel SWI values from ../../libgloss/aarch64/svc.h
13509 but this may not be available. So instead we define the values we need
13510 here. */
13511 #define AngelSVC_Reason_Open 0x01
13512 #define AngelSVC_Reason_Close 0x02
13513 #define AngelSVC_Reason_Write 0x05
13514 #define AngelSVC_Reason_Read 0x06
13515 #define AngelSVC_Reason_IsTTY 0x09
13516 #define AngelSVC_Reason_Seek 0x0A
13517 #define AngelSVC_Reason_FLen 0x0C
13518 #define AngelSVC_Reason_Remove 0x0E
13519 #define AngelSVC_Reason_Rename 0x0F
13520 #define AngelSVC_Reason_Clock 0x10
13521 #define AngelSVC_Reason_Time 0x11
13522 #define AngelSVC_Reason_System 0x12
13523 #define AngelSVC_Reason_Errno 0x13
13524 #define AngelSVC_Reason_GetCmdLine 0x15
13525 #define AngelSVC_Reason_HeapInfo 0x16
13526 #define AngelSVC_Reason_ReportException 0x18
13527 #define AngelSVC_Reason_Elapsed 0x30
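
/* An Angel call is made by executing HLT #0xf000 with the reason
   code in w0 and, for most reasons, a pointer to a parameter block
   in x1; the result is returned in x0.  A minimal sketch of a
   caller (not part of the simulator):

       mov   w0, #0x18           // AngelSVC_Reason_ReportException
       adr   x1, parms           // parms holds type, state
       hlt   #0xf000  */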
13528
13529
13530 static void
13531 handle_halt (sim_cpu *cpu, uint32_t val)
13532 {
13533 uint64_t result = 0;
13534
13535 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
13536 if (val != 0xf000)
13537 {
13538 TRACE_SYSCALL (cpu, " HLT [0x%x]", val);
13539 sim_engine_halt (CPU_STATE (cpu), cpu, NULL, aarch64_get_PC (cpu),
13540 sim_stopped, SIM_SIGTRAP);
13541 }
13542
13543 /* We have encountered an Angel SVC call. See if we can process it. */
13544 switch (aarch64_get_reg_u32 (cpu, 0, NO_SP))
13545 {
13546 case AngelSVC_Reason_HeapInfo:
13547 {
13548 /* Get the values. */
13549 uint64_t stack_top = aarch64_get_stack_start (cpu);
13550 uint64_t heap_base = aarch64_get_heap_start (cpu);
13551
13552 /* Get the pointer. */
13553 uint64_t ptr = aarch64_get_reg_u64 (cpu, 1, SP_OK);
13554 ptr = aarch64_get_mem_u64 (cpu, ptr);
13555
13556 /* Fill in the memory block. */
13557 /* Start addr of heap. */
13558 aarch64_set_mem_u64 (cpu, ptr + 0, heap_base);
13559 /* End addr of heap. */
13560 aarch64_set_mem_u64 (cpu, ptr + 8, stack_top);
13561 /* Lowest stack addr. */
13562 aarch64_set_mem_u64 (cpu, ptr + 16, heap_base);
13563 /* Initial stack addr. */
13564 aarch64_set_mem_u64 (cpu, ptr + 24, stack_top);
13565
13566 TRACE_SYSCALL (cpu, " AngelSVC: Get Heap Info");
13567 }
13568 break;
13569
13570 case AngelSVC_Reason_Open:
13571 {
13572 /* Get the pointer. */
13573 /* uint64_t ptr = aarch64_get_reg_u64 (cpu, 1, SP_OK); */
13574 /* FIXME: For now we just assume that we will only be asked
13575 to open the standard file descriptors. */
13576 static int fd = 0;
13577 result = fd ++;
13578
13579 TRACE_SYSCALL (cpu, " AngelSVC: Open file %d", fd - 1);
13580 }
13581 break;
13582
13583 case AngelSVC_Reason_Close:
13584 {
13585 uint64_t fh = aarch64_get_reg_u64 (cpu, 1, SP_OK);
13586 TRACE_SYSCALL (cpu, " AngelSVC: Close file %d", (int) fh);
13587 result = 0;
13588 }
13589 break;
13590
13591 case AngelSVC_Reason_Errno:
13592 result = 0;
13593 TRACE_SYSCALL (cpu, " AngelSVC: Get Errno");
13594 break;
13595
13596 case AngelSVC_Reason_Clock:
13597 result =
13598 #ifdef CLOCKS_PER_SEC
13599 (CLOCKS_PER_SEC >= 100)
13600 ? (clock () / (CLOCKS_PER_SEC / 100))
13601 : ((clock () * 100) / CLOCKS_PER_SEC)
13602 #else
13603 /* Presume unix... clock() returns microseconds. */
13604 (clock () / 10000)
13605 #endif
13606 ;
13607 TRACE_SYSCALL (cpu, " AngelSVC: Get Clock");
13608 break;
13609
13610 case AngelSVC_Reason_GetCmdLine:
13611 {
13612 /* Get the pointer. */
13613 uint64_t ptr = aarch64_get_reg_u64 (cpu, 1, SP_OK);
13614 ptr = aarch64_get_mem_u64 (cpu, ptr);
13615
13616 /* FIXME: No command line for now. */
13617 aarch64_set_mem_u64 (cpu, ptr, 0);
13618 TRACE_SYSCALL (cpu, " AngelSVC: Get Command Line");
13619 }
13620 break;
13621
13622 case AngelSVC_Reason_IsTTY:
13623 result = 1;
13624 TRACE_SYSCALL (cpu, " AngelSVC: IsTTY ?");
13625 break;
13626
13627 case AngelSVC_Reason_Write:
13628 {
13629 /* Get the pointer. */
13630 uint64_t ptr = aarch64_get_reg_u64 (cpu, 1, SP_OK);
13631 /* Get the write control block. */
13632 uint64_t fd = aarch64_get_mem_u64 (cpu, ptr);
13633 uint64_t buf = aarch64_get_mem_u64 (cpu, ptr + 8);
13634 uint64_t len = aarch64_get_mem_u64 (cpu, ptr + 16);
13635
13636 TRACE_SYSCALL (cpu, "write of %" PRIx64 " bytes from %"
13637 PRIx64 " on descriptor %" PRIx64,
13638 len, buf, fd);
13639
13640 if (len > 1280)
13641 {
13642 TRACE_SYSCALL (cpu,
13643 " AngelSVC: Write: Suspiciously long write: %ld",
13644 (long) len);
13645 sim_engine_halt (CPU_STATE (cpu), cpu, NULL, aarch64_get_PC (cpu),
13646 sim_stopped, SIM_SIGBUS);
13647 }
13648 else if (fd == 1)
13649 {
13650 printf ("%.*s", (int) len, aarch64_get_mem_ptr (cpu, buf));
13651 }
13652 else if (fd == 2)
13653 {
13654 TRACE (cpu, 0, "\n");
13655 sim_io_eprintf (CPU_STATE (cpu), "%.*s",
13656 (int) len, aarch64_get_mem_ptr (cpu, buf));
13657 TRACE (cpu, 0, "\n");
13658 }
13659 else
13660 {
13661 TRACE_SYSCALL (cpu,
13662 " AngelSVC: Write: Unexpected file handle: %d",
13663 (int) fd);
13664 sim_engine_halt (CPU_STATE (cpu), cpu, NULL, aarch64_get_PC (cpu),
13665 sim_stopped, SIM_SIGABRT);
13666 }
13667 }
13668 break;
13669
13670 case AngelSVC_Reason_ReportException:
13671 {
13672 /* Get the pointer. */
13673 uint64_t ptr = aarch64_get_reg_u64 (cpu, 1, SP_OK);
13674 /* ptr = aarch64_get_mem_u64 (cpu, ptr); */
13675 uint64_t type = aarch64_get_mem_u64 (cpu, ptr);
13676 uint64_t state = aarch64_get_mem_u64 (cpu, ptr + 8);
13677
13678 TRACE_SYSCALL (cpu,
13679 "Angel Exception: type 0x%" PRIx64 " state %" PRIx64,
13680 type, state);
13681
13682 if (type == 0x20026)
13683 sim_engine_halt (CPU_STATE (cpu), cpu, NULL, aarch64_get_PC (cpu),
13684 sim_exited, state);
13685 else
13686 sim_engine_halt (CPU_STATE (cpu), cpu, NULL, aarch64_get_PC (cpu),
13687 sim_stopped, SIM_SIGINT);
13688 }
13689 break;
13690
13691 case AngelSVC_Reason_Read:
13692 case AngelSVC_Reason_FLen:
13693 case AngelSVC_Reason_Seek:
13694 case AngelSVC_Reason_Remove:
13695 case AngelSVC_Reason_Time:
13696 case AngelSVC_Reason_System:
13697 case AngelSVC_Reason_Rename:
13698 case AngelSVC_Reason_Elapsed:
13699 default:
13700 TRACE_SYSCALL (cpu, " HLT [Unknown angel %x]",
13701 aarch64_get_reg_u32 (cpu, 0, NO_SP));
13702 sim_engine_halt (CPU_STATE (cpu), cpu, NULL, aarch64_get_PC (cpu),
13703 sim_stopped, SIM_SIGTRAP);
13704 }
13705
13706 aarch64_set_reg_u64 (cpu, 0, NO_SP, result);
13707 }
13708
13709 static void
13710 dexExcpnGen (sim_cpu *cpu)
13711 {
13712 /* instr[31:24] = 11010100
13713 instr[23,21] = opc : 000 ==> GEN EXCPN, 001 ==> BRK
13714 010 ==> HLT, 101 ==> DBG GEN EXCPN
13715 instr[20,5] = imm16
13716 instr[4,2] = opc2 000 ==> OK, ow ==> UNALLOC
13717 instr[1,0] = LL : discriminates opc */
13718
13719 uint32_t opc = INSTR (23, 21);
13720 uint32_t imm16 = INSTR (20, 5);
13721 uint32_t opc2 = INSTR (4, 2);
13722 uint32_t LL;
13723
13724 NYI_assert (31, 24, 0xd4);
13725
13726 if (opc2 != 0)
13727 HALT_UNALLOC;
13728
13729 LL = INSTR (1, 0);
13730
13731 /* We only implement HLT and BRK for now. */
13732 if (opc == 1 && LL == 0)
13733 {
13734 TRACE_EVENTS (cpu, " BRK [0x%x]", imm16);
13735 sim_engine_halt (CPU_STATE (cpu), cpu, NULL, aarch64_get_PC (cpu),
13736 sim_exited, aarch64_get_reg_s32 (cpu, R0, SP_OK));
13737 }
13738
13739 if (opc == 2 && LL == 0)
13740 handle_halt (cpu, imm16);
13741
13742 else if (opc == 0 || opc == 5)
13743 HALT_NYI;
13744
13745 else
13746 HALT_UNALLOC;
13747 }
13748
13749 /* Stub for accessing system registers. */
13750
13751 static uint64_t
13752 system_get (sim_cpu *cpu, unsigned op0, unsigned op1, unsigned crn,
13753 unsigned crm, unsigned op2)
13754 {
13755 if (crn == 0 && op1 == 3 && crm == 0 && op2 == 7)
13756 /* DCZID_EL0 - the Data Cache Zero ID register.
13757 We do not support DC ZVA at the moment, so
13758 we return a value with the disable bit set.
13759 We implement support for the DCZID register since
13760 it is used by the C library's memset function. */
13761 return ((uint64_t) 1) << 4;
13762
13763 if (crn == 0 && op1 == 3 && crm == 0 && op2 == 1)
13764 /* Cache Type Register. */
13765 return 0x80008000UL;
13766
13767 if (crn == 13 && op1 == 3 && crm == 0 && op2 == 2)
13768 /* TPIDR_EL0 - thread pointer id. */
13769 return aarch64_get_thread_id (cpu);
13770
13771 if (op1 == 3 && crm == 4 && op2 == 0)
13772 return aarch64_get_FPCR (cpu);
13773
13774 if (op1 == 3 && crm == 4 && op2 == 1)
13775 return aarch64_get_FPSR (cpu);
13776
13777 else if (op1 == 3 && crm == 2 && op2 == 0)
13778 return aarch64_get_CPSR (cpu);
13779
13780 HALT_NYI;
13781 }
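
/* For example, MRS x0, DCZID_EL0 encodes op0 == 3, op1 == 3,
   CRn == 0, CRm == 0, op2 == 7, so system_get returns 0x10 --
   bit 4 is the DZP flag, which reports DC ZVA as disabled. */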
13782
13783 static void
13784 system_set (sim_cpu *cpu, unsigned op0, unsigned op1, unsigned crn,
13785 unsigned crm, unsigned op2, uint64_t val)
13786 {
13787 if (op1 == 3 && crm == 4 && op2 == 0)
13788 aarch64_set_FPCR (cpu, val);
13789
13790 else if (op1 == 3 && crm == 4 && op2 == 1)
13791 aarch64_set_FPSR (cpu, val);
13792
13793 else if (op1 == 3 && crm == 2 && op2 == 0)
13794 aarch64_set_CPSR (cpu, val);
13795
13796 else
13797 HALT_NYI;
13798 }
13799
13800 static void
13801 do_mrs (sim_cpu *cpu)
13802 {
13803 /* instr[31:20] = 1101 0101 0001 1
13804 instr[19] = op0
13805 instr[18,16] = op1
13806 instr[15,12] = CRn
13807 instr[11,8] = CRm
13808 instr[7,5] = op2
13809 instr[4,0] = Rt */
13810 unsigned sys_op0 = INSTR (19, 19) + 2;
13811 unsigned sys_op1 = INSTR (18, 16);
13812 unsigned sys_crn = INSTR (15, 12);
13813 unsigned sys_crm = INSTR (11, 8);
13814 unsigned sys_op2 = INSTR (7, 5);
13815 unsigned rt = INSTR (4, 0);
13816
13817 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
13818 aarch64_set_reg_u64 (cpu, rt, NO_SP,
13819 system_get (cpu, sys_op0, sys_op1, sys_crn, sys_crm, sys_op2));
13820 }
13821
13822 static void
13823 do_MSR_immediate (sim_cpu *cpu)
13824 {
13825 /* instr[31:19] = 1101 0101 0000 0
13826 instr[18,16] = op1
13827 instr[15,12] = 0100
13828 instr[11,8] = CRm
13829 instr[7,5] = op2
13830 instr[4,0] = 1 1111 */
13831
13832 unsigned op1 = INSTR (18, 16);
13833 /*unsigned crm = INSTR (11, 8);*/
13834 unsigned op2 = INSTR (7, 5);
13835
13836 NYI_assert (31, 19, 0x1AA0);
13837 NYI_assert (15, 12, 0x4);
13838 NYI_assert (4, 0, 0x1F);
13839
13840 if (op1 == 0)
13841 {
13842 if (op2 == 5)
13843 HALT_NYI; /* set SPSel. */
13844 else
13845 HALT_UNALLOC;
13846 }
13847 else if (op1 == 3)
13848 {
13849 if (op2 == 6)
13850 HALT_NYI; /* set DAIFset. */
13851 else if (op2 == 7)
13852 HALT_NYI; /* set DAIFclr. */
13853 else
13854 HALT_UNALLOC;
13855 }
13856 else
13857 HALT_UNALLOC;
13858 }
13859
13860 static void
13861 do_MSR_reg (sim_cpu *cpu)
13862 {
13863 /* instr[31:20] = 1101 0101 0001
13864 instr[19] = op0
13865 instr[18,16] = op1
13866 instr[15,12] = CRn
13867 instr[11,8] = CRm
13868 instr[7,5] = op2
13869 instr[4,0] = Rt */
13870
13871 unsigned sys_op0 = INSTR (19, 19) + 2;
13872 unsigned sys_op1 = INSTR (18, 16);
13873 unsigned sys_crn = INSTR (15, 12);
13874 unsigned sys_crm = INSTR (11, 8);
13875 unsigned sys_op2 = INSTR (7, 5);
13876 unsigned rt = INSTR (4, 0);
13877
13878 NYI_assert (31, 20, 0xD51);
13879
13880 TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
13881 system_set (cpu, sys_op0, sys_op1, sys_crn, sys_crm, sys_op2,
13882 aarch64_get_reg_u64 (cpu, rt, NO_SP));
13883 }
13884
13885 static void
13886 do_SYS (sim_cpu *cpu)
13887 {
13888 /* instr[31,19] = 1101 0101 0000 1
13889 instr[18,16] = op1
13890 instr[15,12] = CRn
13891 instr[11,8] = CRm
13892 instr[7,5] = op2
13893 instr[4,0] = Rt */
13894 NYI_assert (31, 19, 0x1AA1);
13895
13896 /* FIXME: For now we just silently accept system ops. */
13897 }
13898
13899 static void
13900 dexSystem (sim_cpu *cpu)
13901 {
13902 /* instr[31:22] = 1101 01010 0
13903 instr[21] = L
13904 instr[20,19] = op0
13905 instr[18,16] = op1
13906 instr[15,12] = CRn
13907 instr[11,8] = CRm
13908 instr[7,5] = op2
13909 instr[4,0] = uimm5 */
13910
13911 /* We are interested in HINT, DSB, DMB and ISB
13912
13913 Hint #0 encodes NOOP (this is the only hint we care about)
13914 L == 0, op0 == 0, op1 = 011, CRn = 0010, Rt = 11111,
13915 CRm:op2 == 0000 000 (NOP itself) OR CRm:op2 > 0000 101 (hints treated as NOP)
13916
13917 DSB, DMB, ISB are data synchronization barrier, data memory
13918 barrier and instruction synchronization barrier, respectively, where
13919
13920 L == 0, op0 == 0, op1 = 011, CRn = 0011, Rt = 11111,
13921 op2 : DSB ==> 100, DMB ==> 101, ISB ==> 110
13922 CRm<3:2> ==> domain, CRm<1:0> ==> types,
13923 domain : 00 ==> OuterShareable, 01 ==> Nonshareable,
13924 10 ==> InnerShareable, 11 ==> FullSystem
13925 types : 01 ==> Reads, 10 ==> Writes,
13926 11 ==> All, 00 ==> All (domain == FullSystem). */
13927
13928 unsigned rt = INSTR (4, 0);
13929
13930 NYI_assert (31, 22, 0x354);
13931
13932 switch (INSTR (21, 12))
13933 {
13934 case 0x032:
13935 if (rt == 0x1F)
13936 {
13937 /* NOP has CRm != 0000 OR
13938 (CRm == 0000 AND (op2 == 000 OR op2 > 101)). */
13939 uint32_t crm = INSTR (11, 8);
13940 uint32_t op2 = INSTR (7, 5);
13941
13942 if (crm != 0 || (op2 == 0 || op2 > 5))
13943 {
13944 /* Actually call nop method so we can reimplement it later. */
13945 nop (cpu);
13946 return;
13947 }
13948 }
13949 HALT_NYI;
13950
13951 case 0x033:
13952 {
13953 uint32_t op2 = INSTR (7, 5);
13954
13955 switch (op2)
13956 {
13957 case 2: HALT_NYI;
13958 case 4: dsb (cpu); return;
13959 case 5: dmb (cpu); return;
13960 case 6: isb (cpu); return;
13961 default: HALT_UNALLOC;
13962 }
13963 }
13964
13965 case 0x3B0:
13966 case 0x3B4:
13967 case 0x3BD:
13968 do_mrs (cpu);
13969 return;
13970
13971 case 0x0B7:
13972 do_SYS (cpu); /* DC is an alias of SYS. */
13973 return;
13974
13975 default:
13976 if (INSTR (21, 20) == 0x1)
13977 do_MSR_reg (cpu);
13978 else if (INSTR (21, 19) == 0 && INSTR (15, 12) == 0x4)
13979 do_MSR_immediate (cpu);
13980 else
13981 HALT_NYI;
13982 return;
13983 }
13984 }
13985
13986 static void
13987 dexBr (sim_cpu *cpu)
13988 {
13989 /* uint32_t group = dispatchGroup (aarch64_get_instr (cpu));
13990 assert group == GROUP_BREXSYS_1010 || group == GROUP_BREXSYS_1011
13991 bits [31,29] of a BrExSys are the secondary dispatch vector. */
13992 uint32_t group2 = dispatchBrExSys (aarch64_get_instr (cpu));
13993
13994 switch (group2)
13995 {
13996 case BR_IMM_000:
13997 return dexBranchImmediate (cpu);
13998
13999 case BR_IMMCMP_001:
14000 /* Compare has bit 25 clear while test has it set. */
14001 if (!INSTR (25, 25))
14002 dexCompareBranchImmediate (cpu);
14003 else
14004 dexTestBranchImmediate (cpu);
14005 return;
14006
14007 case BR_IMMCOND_010:
14008 /* This is a conditional branch if bit 25 is clear otherwise
14009 unallocated. */
14010 if (!INSTR (25, 25))
14011 dexCondBranchImmediate (cpu);
14012 else
14013 HALT_UNALLOC;
14014 return;
14015
14016 case BR_UNALLOC_011:
14017 HALT_UNALLOC;
14018
14019 case BR_IMM_100:
14020 dexBranchImmediate (cpu);
14021 return;
14022
14023 case BR_IMMCMP_101:
14024 /* Compare has bit 25 clear while test has it set. */
14025 if (!INSTR (25, 25))
14026 dexCompareBranchImmediate (cpu);
14027 else
14028 dexTestBranchImmediate (cpu);
14029 return;
14030
14031 case BR_REG_110:
14032 /* Unconditional branch reg has bit 25 set. */
14033 if (INSTR (25, 25))
14034 dexBranchRegister (cpu);
14035
14036 /* This includes Excpn Gen, System and unalloc operations.
14037 We need to decode the Excpn Gen operation BRK so we can plant
14038 debugger entry points.
14039 Excpn Gen operations have instr [24] = 0.
14040 We need to decode at least one of the System operations, NOP,
14041 which is an alias for HINT #0.
14042 System operations have instr [24,22] = 100. */
14043 else if (INSTR (24, 24) == 0)
14044 dexExcpnGen (cpu);
14045
14046 else if (INSTR (24, 22) == 4)
14047 dexSystem (cpu);
14048
14049 else
14050 HALT_UNALLOC;
14051
14052 return;
14053
14054 case BR_UNALLOC_111:
14055 HALT_UNALLOC;
14056
14057 default:
14058 /* Should never reach here. */
14059 HALT_NYI;
14060 }
14061 }
14062
14063 static void
14064 aarch64_decode_and_execute (sim_cpu *cpu, uint64_t pc)
14065 {
14066 /* We need to check if gdb wants a break in here. */
14067 /* checkBreak (cpu);. */
14068
14069 uint64_t group = dispatchGroup (aarch64_get_instr (cpu));
14070
14071 switch (group)
14072 {
14073 case GROUP_PSEUDO_0000: dexPseudo (cpu); break;
14074 case GROUP_LDST_0100: dexLdSt (cpu); break;
14075 case GROUP_DPREG_0101: dexDPReg (cpu); break;
14076 case GROUP_LDST_0110: dexLdSt (cpu); break;
14077 case GROUP_ADVSIMD_0111: dexAdvSIMD0 (cpu); break;
14078 case GROUP_DPIMM_1000: dexDPImm (cpu); break;
14079 case GROUP_DPIMM_1001: dexDPImm (cpu); break;
14080 case GROUP_BREXSYS_1010: dexBr (cpu); break;
14081 case GROUP_BREXSYS_1011: dexBr (cpu); break;
14082 case GROUP_LDST_1100: dexLdSt (cpu); break;
14083 case GROUP_DPREG_1101: dexDPReg (cpu); break;
14084 case GROUP_LDST_1110: dexLdSt (cpu); break;
14085 case GROUP_ADVSIMD_1111: dexAdvSIMD1 (cpu); break;
14086
14087 case GROUP_UNALLOC_0001:
14088 case GROUP_UNALLOC_0010:
14089 case GROUP_UNALLOC_0011:
14090 HALT_UNALLOC;
14091
14092 default:
14093 /* Should never reach here. */
14094 HALT_NYI;
14095 }
14096 }
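
/* For example, ADD X0, X0, #0 assembles to 0x91000000; its bits
   [28,25] are 1000, matching GROUP_DPIMM_1000, so dispatchGroup
   routes the instruction to dexDPImm. */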
14097
14098 static bfd_boolean
14099 aarch64_step (sim_cpu *cpu)
14100 {
14101 uint64_t pc = aarch64_get_PC (cpu);
14102
14103 if (pc == TOP_LEVEL_RETURN_PC)
14104 return FALSE;
14105
14106 aarch64_set_next_PC (cpu, pc + 4);
14107
14108 /* Code is always little-endian. */
14109 sim_core_read_buffer (CPU_STATE (cpu), cpu, read_map,
14110 & aarch64_get_instr (cpu), pc, 4);
14111 aarch64_get_instr (cpu) = endian_le2h_4 (aarch64_get_instr (cpu));
14112
14113 TRACE_INSN (cpu, " pc = %" PRIx64 " instr = %08x", pc,
14114 aarch64_get_instr (cpu));
14115 TRACE_DISASM (cpu, pc);
14116
14117 aarch64_decode_and_execute (cpu, pc);
14118
14119 return TRUE;
14120 }
14121
14122 void
14123 aarch64_run (SIM_DESC sd)
14124 {
14125 sim_cpu *cpu = STATE_CPU (sd, 0);
14126
14127 while (aarch64_step (cpu))
14128 {
14129 aarch64_update_PC (cpu);
14130
14131 if (sim_events_tick (sd))
14132 sim_events_process (sd);
14133 }
14134
14135 sim_engine_halt (sd, cpu, NULL, aarch64_get_PC (cpu),
14136 sim_exited, aarch64_get_reg_s32 (cpu, R0, NO_SP));
14137 }
14138
14139 void
14140 aarch64_init (sim_cpu *cpu, uint64_t pc)
14141 {
14142 uint64_t sp = aarch64_get_stack_start (cpu);
14143
14144 /* Install SP, FP and PC and set LR to -20
14145 so we can detect a top-level return. */
14146 aarch64_set_reg_u64 (cpu, SP, SP_OK, sp);
14147 aarch64_set_reg_u64 (cpu, FP, SP_OK, sp);
14148 aarch64_set_reg_u64 (cpu, LR, SP_OK, TOP_LEVEL_RETURN_PC);
14149 aarch64_set_next_PC (cpu, pc);
14150 aarch64_update_PC (cpu);
14151 aarch64_init_LIT_table ();
14152 }