diff --git a/sim/aarch64/simulator.c b/sim/aarch64/simulator.c index 59579468da..5f16a69478 100644 --- a/sim/aarch64/simulator.c +++ b/sim/aarch64/simulator.c @@ -1,6 +1,6 @@ /* simulator.c -- Interface for the AArch64 simulator. - Copyright (C) 2015 Free Software Foundation, Inc. + Copyright (C) 2015-2020 Free Software Foundation, Inc. Contributed by Red Hat. @@ -24,13 +24,10 @@ #include #include #include -#include #include #include #include -#include "dis-asm.h" - #include "simulator.h" #include "cpustate.h" #include "memory.h" @@ -38,23 +35,21 @@ #define NO_SP 0 #define SP_OK 1 -bfd_boolean disas = FALSE; - #define TST(_flag) (aarch64_test_CPSR_bit (cpu, _flag)) -#define IS_SET(_X) ( TST (( _X ))) -#define IS_CLEAR(_X) (!TST (( _X ))) +#define IS_SET(_X) (TST (( _X )) ? 1 : 0) +#define IS_CLEAR(_X) (TST (( _X )) ? 0 : 1) + +/* Space saver macro. */ +#define INSTR(HIGH, LOW) uimm (aarch64_get_instr (cpu), (HIGH), (LOW)) #define HALT_UNALLOC \ do \ { \ - if (TRACE_INSN_P (cpu)) \ - { \ - aarch64_print_insn (CPU_STATE (cpu), aarch64_get_PC (cpu)); \ - TRACE_INSN (cpu, \ - "Unallocated instruction detected at sim line %d,"\ - " exe addr %" PRIx64, \ - __LINE__, aarch64_get_PC (cpu)); \ - } \ + TRACE_DISASM (cpu, aarch64_get_PC (cpu)); \ + TRACE_INSN (cpu, \ + "Unallocated instruction detected at sim line %d," \ + " exe addr %" PRIx64, \ + __LINE__, aarch64_get_PC (cpu)); \ sim_engine_halt (CPU_STATE (cpu), cpu, NULL, aarch64_get_PC (cpu),\ sim_stopped, SIM_SIGILL); \ } \ @@ -63,14 +58,14 @@ bfd_boolean disas = FALSE; #define HALT_NYI \ do \ { \ - if (TRACE_INSN_P (cpu)) \ - { \ - aarch64_print_insn (CPU_STATE (cpu), aarch64_get_PC (cpu)); \ - TRACE_INSN (cpu, \ - "Unimplemented instruction detected at sim line %d,"\ - " exe addr %" PRIx64, \ - __LINE__, aarch64_get_PC (cpu)); \ - } \ + TRACE_DISASM (cpu, aarch64_get_PC (cpu)); \ + TRACE_INSN (cpu, \ + "Unimplemented instruction detected at sim line %d," \ + " exe addr %" PRIx64, \ + __LINE__, aarch64_get_PC (cpu)); \ + if (! TRACE_ANY_P (cpu)) \ + sim_io_eprintf (CPU_STATE (cpu), "SIM Error: Unimplemented instruction: %#08x\n", \ + aarch64_get_instr (cpu)); \ sim_engine_halt (CPU_STATE (cpu), cpu, NULL, aarch64_get_PC (cpu),\ sim_stopped, SIM_SIGABRT); \ } \ @@ -79,19 +74,11 @@ #define NYI_assert (HI, LO, EXPECTED) \ do \ { \ - if (uimm (aarch64_get_instr (cpu), (HI), (LO)) != (EXPECTED)) \ + if (INSTR ((HI), (LO)) != (EXPECTED)) \ HALT_NYI; \ } \ while (0) -#define HALT_UNREACHABLE \ - do \ - { \ - TRACE_EVENTS (cpu, "ISE: unreachable code point"); \ - sim_engine_abort (NULL, cpu, aarch64_get_PC (cpu), "Internal Error"); \ - } \ - while (0) - /* Helper functions used by expandLogicalImmediate. */ /* for i = 1, ... N result = 1 other bits are zero */ @@ -190,7 +177,7 @@ dexNotify (sim_cpu *cpu) { /* instr[14,0] == type : 0 ==> method entry, 1 ==> method reentry 2 ==> exit Java, 3 ==> start next bytecode.
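A note on the new INSTR space-saver macro introduced above, since every operand decode that follows leans on it: INSTR (HIGH, LOW) expands to uimm (aarch64_get_instr (cpu), HIGH, LOW), an inclusive bit-range extract from the current instruction word. The helper below is a minimal illustrative stand-in for that extraction, not the simulator's own uimm:

#include <stdint.h>

/* Extract bits HI..LO of VAL (inclusive, HI >= LO), right-justified.  */
static uint32_t
extract_field (uint32_t val, unsigned hi, unsigned lo)
{
  unsigned width = hi - lo + 1;
  uint32_t mask = width < 32 ? (1u << width) - 1 : ~0u;
  return (val >> lo) & mask;
}

With that reading, the INSTR (9, 5), INSTR (4, 0) and INSTR (20, 16) calls used throughout the patch pick out the Rn, Rt/Rd and Rm register fields respectively.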
*/ - uint32_t type = uimm (aarch64_get_instr (cpu), 14, 0); + uint32_t type = INSTR (14, 0); TRACE_EVENTS (cpu, "Notify Insn encountered, type = 0x%x", type); @@ -245,7 +232,7 @@ dexPseudo (sim_cpu *cpu) sim_stopped, SIM_SIGTRAP); } - dispatch = uimm (aarch64_get_instr (cpu), 31, 15); + dispatch = INSTR (31, 15); /* We do not handle callouts at the moment. */ if (dispatch == PSEUDO_CALLOUT || dispatch == PSEUDO_CALLOUTR) @@ -273,9 +260,10 @@ dexPseudo (sim_cpu *cpu) static void ldur32 (sim_cpu *cpu, int32_t offset) { - unsigned rn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned rt = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned rn = INSTR (9, 5); + unsigned rt = INSTR (4, 0); + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); aarch64_set_reg_u64 (cpu, rt, NO_SP, aarch64_get_mem_u32 (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK) + offset)); @@ -285,9 +273,10 @@ ldur32 (sim_cpu *cpu, int32_t offset) static void ldur64 (sim_cpu *cpu, int32_t offset) { - unsigned rn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned rt = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned rn = INSTR (9, 5); + unsigned rt = INSTR (4, 0); + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); aarch64_set_reg_u64 (cpu, rt, NO_SP, aarch64_get_mem_u64 (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK) + offset)); @@ -297,9 +286,10 @@ ldur64 (sim_cpu *cpu, int32_t offset) static void ldurb32 (sim_cpu *cpu, int32_t offset) { - unsigned rn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned rt = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned rn = INSTR (9, 5); + unsigned rt = INSTR (4, 0); + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); aarch64_set_reg_u64 (cpu, rt, NO_SP, aarch64_get_mem_u8 (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK) + offset)); @@ -309,9 +299,10 @@ ldurb32 (sim_cpu *cpu, int32_t offset) static void ldursb32 (sim_cpu *cpu, int32_t offset) { - unsigned rn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned rt = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned rn = INSTR (9, 5); + unsigned rt = INSTR (4, 0); + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); aarch64_set_reg_u64 (cpu, rt, NO_SP, (uint32_t) aarch64_get_mem_s8 (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK) + offset)); @@ -321,9 +312,10 @@ ldursb32 (sim_cpu *cpu, int32_t offset) static void ldursb64 (sim_cpu *cpu, int32_t offset) { - unsigned rn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned rt = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned rn = INSTR (9, 5); + unsigned rt = INSTR (4, 0); + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); aarch64_set_reg_s64 (cpu, rt, NO_SP, aarch64_get_mem_s8 (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK) + offset)); @@ -333,9 +325,10 @@ ldursb64 (sim_cpu *cpu, int32_t offset) static void ldurh32 (sim_cpu *cpu, int32_t offset) { - unsigned rn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned rd = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned rn = INSTR (9, 5); + unsigned rd = INSTR (4, 0); + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); aarch64_set_reg_u64 (cpu, rd, NO_SP, aarch64_get_mem_u16 (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK) + offset)); @@ -345,9 +338,10 @@ ldurh32 (sim_cpu *cpu, int32_t offset) static void ldursh32 (sim_cpu *cpu, int32_t offset) { - unsigned rn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned rd = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned rn = INSTR (9, 5); + unsigned rd = INSTR (4, 0); + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); aarch64_set_reg_u64 (cpu, rd, NO_SP, (uint32_t) aarch64_get_mem_s16 (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK) 
+ offset)); @@ -357,9 +351,10 @@ ldursh32 (sim_cpu *cpu, int32_t offset) static void ldursh64 (sim_cpu *cpu, int32_t offset) { - unsigned rn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned rt = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned rn = INSTR (9, 5); + unsigned rt = INSTR (4, 0); + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); aarch64_set_reg_s64 (cpu, rt, NO_SP, aarch64_get_mem_s16 (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK) + offset)); @@ -369,9 +364,10 @@ ldursh64 (sim_cpu *cpu, int32_t offset) static void ldursw (sim_cpu *cpu, int32_t offset) { - unsigned rn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned rd = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned rn = INSTR (9, 5); + unsigned rd = INSTR (4, 0); + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); aarch64_set_reg_u64 (cpu, rd, NO_SP, (uint32_t) aarch64_get_mem_s32 (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK) + offset)); @@ -384,9 +380,10 @@ ldursw (sim_cpu *cpu, int32_t offset) static void stur32 (sim_cpu *cpu, int32_t offset) { - unsigned rn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned rd = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned rn = INSTR (9, 5); + unsigned rd = INSTR (4, 0); + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); aarch64_set_mem_u32 (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK) + offset, aarch64_get_reg_u32 (cpu, rd, NO_SP)); @@ -396,9 +393,10 @@ stur32 (sim_cpu *cpu, int32_t offset) static void stur64 (sim_cpu *cpu, int32_t offset) { - unsigned rn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned rd = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned rn = INSTR (9, 5); + unsigned rd = INSTR (4, 0); + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); aarch64_set_mem_u64 (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK) + offset, aarch64_get_reg_u64 (cpu, rd, NO_SP)); @@ -408,9 +406,10 @@ stur64 (sim_cpu *cpu, int32_t offset) static void sturb (sim_cpu *cpu, int32_t offset) { - unsigned rn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned rd = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned rn = INSTR (9, 5); + unsigned rd = INSTR (4, 0); + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); aarch64_set_mem_u8 (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK) + offset, aarch64_get_reg_u8 (cpu, rd, NO_SP)); @@ -420,9 +419,10 @@ sturb (sim_cpu *cpu, int32_t offset) static void sturh (sim_cpu *cpu, int32_t offset) { - unsigned rn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned rd = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned rn = INSTR (9, 5); + unsigned rd = INSTR (4, 0); + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); aarch64_set_mem_u16 (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK) + offset, aarch64_get_reg_u16 (cpu, rd, NO_SP)); @@ -436,8 +436,9 @@ sturh (sim_cpu *cpu, int32_t offset) static void ldr32_pcrel (sim_cpu *cpu, int32_t offset) { - unsigned rd = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned rd = INSTR (4, 0); + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); aarch64_set_reg_u64 (cpu, rd, NO_SP, aarch64_get_mem_u32 (cpu, aarch64_get_PC (cpu) + offset * 4)); @@ -447,8 +448,9 @@ ldr32_pcrel (sim_cpu *cpu, int32_t offset) static void ldr_pcrel (sim_cpu *cpu, int32_t offset) { - unsigned rd = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned rd = INSTR (4, 0); + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); aarch64_set_reg_u64 (cpu, rd, NO_SP, aarch64_get_mem_u64 (cpu, aarch64_get_PC (cpu) + offset * 4)); @@ -458,8 +460,9 @@ ldr_pcrel (sim_cpu *cpu, int32_t offset) static void ldrsw_pcrel (sim_cpu *cpu, int32_t offset) { - unsigned rd = uimm 
(aarch64_get_instr (cpu), 4, 0); + unsigned rd = INSTR (4, 0); + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); aarch64_set_reg_u64 (cpu, rd, NO_SP, aarch64_get_mem_s32 (cpu, aarch64_get_PC (cpu) + offset * 4)); @@ -469,32 +472,35 @@ ldrsw_pcrel (sim_cpu *cpu, int32_t offset) static void fldrs_pcrel (sim_cpu *cpu, int32_t offset) { - unsigned int rd = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned int rd = INSTR (4, 0); - aarch64_set_FP_float (cpu, rd, - aarch64_get_mem_float - (cpu, aarch64_get_PC (cpu) + offset * 4)); + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); + aarch64_set_vec_u32 (cpu, rd, 0, + aarch64_get_mem_u32 + (cpu, aarch64_get_PC (cpu) + offset * 4)); } /* double pc-relative load */ static void fldrd_pcrel (sim_cpu *cpu, int32_t offset) { - unsigned int st = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned int st = INSTR (4, 0); - aarch64_set_FP_double (cpu, st, - aarch64_get_mem_double - (cpu, aarch64_get_PC (cpu) + offset * 4)); + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); + aarch64_set_vec_u64 (cpu, st, 0, + aarch64_get_mem_u64 + (cpu, aarch64_get_PC (cpu) + offset * 4)); } /* long double pc-relative load. */ static void fldrq_pcrel (sim_cpu *cpu, int32_t offset) { - unsigned int st = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned int st = INSTR (4, 0); uint64_t addr = aarch64_get_PC (cpu) + offset * 4; FRegister a; + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); aarch64_get_mem_long_double (cpu, addr, & a); aarch64_set_FP_long_double (cpu, st, a); } @@ -508,7 +514,7 @@ fldrq_pcrel (sim_cpu *cpu, int32_t offset) /* This can be used to optionally scale a register derived offset by applying the requisite shift as indicated by the Scaling - argument. the second argument is either Byte, Short, Word + argument. The second argument is either Byte, Short, Word or Long. The third argument is either Scaled or Unscaled. N.B. when _Scaling is Scaled the shift gets ANDed with all 1s while when it is Unscaled it gets ANDed with 0. */ @@ -549,14 +555,15 @@ extend (uint32_t value, Extension extension) static void fldrs_wb (sim_cpu *cpu, int32_t offset, WriteBack wb) { - unsigned rn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned st = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned rn = INSTR (9, 5); + unsigned st = INSTR (4, 0); uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK); if (wb != Post) address += offset; - aarch64_set_FP_float (cpu, st, aarch64_get_mem_float (cpu, address)); + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); + aarch64_set_vec_u32 (cpu, st, 0, aarch64_get_mem_u32 (cpu, address)); if (wb == Post) address += offset; @@ -564,17 +571,65 @@ fldrs_wb (sim_cpu *cpu, int32_t offset, WriteBack wb) aarch64_set_reg_u64 (cpu, rn, SP_OK, address); } +/* Load 8 bit with unsigned 12 bit offset. */ +static void +fldrb_abs (sim_cpu *cpu, uint32_t offset) +{ + unsigned rd = INSTR (4, 0); + unsigned rn = INSTR (9, 5); + uint64_t addr = aarch64_get_reg_u64 (cpu, rn, SP_OK) + offset; + + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); + aarch64_set_vec_u8 (cpu, rd, 0, aarch64_get_mem_u32 (cpu, addr)); +} + +/* Load 16 bit scaled unsigned 12 bit. */ +static void +fldrh_abs (sim_cpu *cpu, uint32_t offset) +{ + unsigned rd = INSTR (4, 0); + unsigned rn = INSTR (9, 5); + uint64_t addr = aarch64_get_reg_u64 (cpu, rn, SP_OK) + SCALE (offset, 16); + + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); + aarch64_set_vec_u16 (cpu, rd, 0, aarch64_get_mem_u16 (cpu, addr)); +} + /* Load 32 bit scaled unsigned 12 bit. 
*/ static void fldrs_abs (sim_cpu *cpu, uint32_t offset) { - unsigned st = uimm (aarch64_get_instr (cpu), 4, 0); - unsigned rn = uimm (aarch64_get_instr (cpu), 9, 5); + unsigned rd = INSTR (4, 0); + unsigned rn = INSTR (9, 5); + uint64_t addr = aarch64_get_reg_u64 (cpu, rn, SP_OK) + SCALE (offset, 32); + + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); + aarch64_set_vec_u32 (cpu, rd, 0, aarch64_get_mem_u32 (cpu, addr)); +} + +/* Load 64 bit scaled unsigned 12 bit. */ +static void +fldrd_abs (sim_cpu *cpu, uint32_t offset) +{ + unsigned rd = INSTR (4, 0); + unsigned rn = INSTR (9, 5); + uint64_t addr = aarch64_get_reg_u64 (cpu, rn, SP_OK) + SCALE (offset, 64); + + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); + aarch64_set_vec_u64 (cpu, rd, 0, aarch64_get_mem_u64 (cpu, addr)); +} + +/* Load 128 bit scaled unsigned 12 bit. */ +static void +fldrq_abs (sim_cpu *cpu, uint32_t offset) +{ + unsigned rd = INSTR (4, 0); + unsigned rn = INSTR (9, 5); + uint64_t addr = aarch64_get_reg_u64 (cpu, rn, SP_OK) + SCALE (offset, 128); - aarch64_set_FP_float (cpu, st, - aarch64_get_mem_float - (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK) - + SCALE (offset, 32))); + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); + aarch64_set_vec_u64 (cpu, rd, 0, aarch64_get_mem_u64 (cpu, addr)); + aarch64_set_vec_u64 (cpu, rd, 1, aarch64_get_mem_u64 (cpu, addr + 8)); } /* Load 32 bit scaled or unscaled zero- or sign-extended @@ -582,30 +637,31 @@ fldrs_abs (sim_cpu *cpu, uint32_t offset) static void fldrs_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension) { - unsigned rm = uimm (aarch64_get_instr (cpu), 20, 16); - unsigned rn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned st = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned rm = INSTR (20, 16); + unsigned rn = INSTR (9, 5); + unsigned st = INSTR (4, 0); uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK); int64_t extended = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP), extension); uint64_t displacement = OPT_SCALE (extended, 32, scaling); - aarch64_set_FP_float (cpu, st, - aarch64_get_mem_float - (cpu, address + displacement)); + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); + aarch64_set_vec_u32 (cpu, st, 0, aarch64_get_mem_u32 + (cpu, address + displacement)); } /* Load 64 bit unscaled signed 9 bit with pre- or post-writeback. */ static void fldrd_wb (sim_cpu *cpu, int32_t offset, WriteBack wb) { - unsigned rn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned st = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned rn = INSTR (9, 5); + unsigned st = INSTR (4, 0); uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK); if (wb != Post) address += offset; - aarch64_set_FP_double (cpu, st, aarch64_get_mem_double (cpu, address)); + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); + aarch64_set_vec_u64 (cpu, st, 0, aarch64_get_mem_u64 (cpu, address)); if (wb == Post) address += offset; @@ -614,22 +670,11 @@ fldrd_wb (sim_cpu *cpu, int32_t offset, WriteBack wb) aarch64_set_reg_u64 (cpu, rn, SP_OK, address); } -/* Load 64 bit scaled unsigned 12 bit. */ -static void -fldrd_abs (sim_cpu *cpu, uint32_t offset) -{ - unsigned rn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned st = uimm (aarch64_get_instr (cpu), 4, 0); - uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK) + SCALE (offset, 64); - - aarch64_set_FP_double (cpu, st, aarch64_get_mem_double (cpu, address)); -} - /* Load 64 bit scaled or unscaled zero- or sign-extended 32-bit register offset. 
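One pattern worth calling out in the rewritten floating-point loads above: where the old code went through host FP types (aarch64_get_mem_float feeding aarch64_set_FP_float), the new code copies the raw 32-, 64- or 128-bit pattern into lane 0 (and lane 1 for the Q form) of the vector register. Copying bits rather than typed values preserves the exact IEEE encoding, NaN payloads included, which a round trip through a host float need not. A sketch of the bit-copy idea, with mem standing in for simulated memory (names are illustrative, not the sim's API):

#include <stdint.h>
#include <string.h>

/* Fetch a 32-bit FP value as its raw bit pattern; no host FP
   operation happens, so nothing is renormalised or quieted.  */
static uint32_t
load_fp32_bits (const uint8_t *mem)
{
  uint32_t bits;
  memcpy (&bits, mem, sizeof bits);
  return bits;
}

/* Reinterpret the bits only when a caller actually wants a float.  */
static float
bits_to_float (uint32_t bits)
{
  float f;
  memcpy (&f, &bits, sizeof f);
  return f;
}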
*/ static void fldrd_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension) { - unsigned rm = uimm (aarch64_get_instr (cpu), 20, 16); + unsigned rm = INSTR (20, 16); int64_t extended = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP), extension); uint64_t displacement = OPT_SCALE (extended, 64, scaling); @@ -641,13 +686,14 @@ static void fldrq_wb (sim_cpu *cpu, int32_t offset, WriteBack wb) { FRegister a; - unsigned rn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned st = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned rn = INSTR (9, 5); + unsigned st = INSTR (4, 0); uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK); if (wb != Post) address += offset; + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); aarch64_get_mem_long_double (cpu, address, & a); aarch64_set_FP_long_double (cpu, st, a); @@ -658,24 +704,11 @@ fldrq_wb (sim_cpu *cpu, int32_t offset, WriteBack wb) aarch64_set_reg_u64 (cpu, rn, SP_OK, address); } -/* Load 128 bit scaled unsigned 12 bit. */ -static void -fldrq_abs (sim_cpu *cpu, uint32_t offset) -{ - FRegister a; - unsigned rn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned st = uimm (aarch64_get_instr (cpu), 4, 0); - uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK) + SCALE (offset, 128); - - aarch64_get_mem_long_double (cpu, address, & a); - aarch64_set_FP_long_double (cpu, st, a); -} - /* Load 128 bit scaled or unscaled zero- or sign-extended 32-bit register offset */ static void fldrq_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension) { - unsigned rm = uimm (aarch64_get_instr (cpu), 20, 16); + unsigned rm = INSTR (20, 16); int64_t extended = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP), extension); uint64_t displacement = OPT_SCALE (extended, 128, scaling); @@ -710,9 +743,10 @@ fldrq_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension) static void ldr32_abs (sim_cpu *cpu, uint32_t offset) { - unsigned rn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned rt = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned rn = INSTR (9, 5); + unsigned rt = INSTR (4, 0); + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); /* The target register may not be SP but the source may be. 
*/ aarch64_set_reg_u64 (cpu, rt, NO_SP, aarch64_get_mem_u32 (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK) @@ -723,8 +757,8 @@ ldr32_abs (sim_cpu *cpu, uint32_t offset) static void ldr32_wb (sim_cpu *cpu, int32_t offset, WriteBack wb) { - unsigned rn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned rt = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned rn = INSTR (9, 5); + unsigned rt = INSTR (4, 0); uint64_t address; if (rn == rt && wb != NoWriteBack) @@ -735,6 +769,7 @@ ldr32_wb (sim_cpu *cpu, int32_t offset, WriteBack wb) if (wb != Post) address += offset; + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); aarch64_set_reg_u64 (cpu, rt, NO_SP, aarch64_get_mem_u32 (cpu, address)); if (wb == Post) @@ -749,15 +784,16 @@ ldr32_wb (sim_cpu *cpu, int32_t offset, WriteBack wb) static void ldr32_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension) { - unsigned rm = uimm (aarch64_get_instr (cpu), 20, 16); - unsigned rn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned rt = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned rm = INSTR (20, 16); + unsigned rn = INSTR (9, 5); + unsigned rt = INSTR (4, 0); /* rn may reference SP, rm and rt must reference ZR */ uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK); int64_t extended = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP), extension); uint64_t displacement = OPT_SCALE (extended, 32, scaling); + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); aarch64_set_reg_u64 (cpu, rt, NO_SP, aarch64_get_mem_u32 (cpu, address + displacement)); } @@ -766,9 +802,10 @@ ldr32_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension) static void ldr_abs (sim_cpu *cpu, uint32_t offset) { - unsigned rn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned rt = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned rn = INSTR (9, 5); + unsigned rt = INSTR (4, 0); + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); /* The target register may not be SP but the source may be. 
*/ aarch64_set_reg_u64 (cpu, rt, NO_SP, aarch64_get_mem_u64 (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK) @@ -779,8 +816,8 @@ ldr_abs (sim_cpu *cpu, uint32_t offset) static void ldr_wb (sim_cpu *cpu, int32_t offset, WriteBack wb) { - unsigned rn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned rt = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned rn = INSTR (9, 5); + unsigned rt = INSTR (4, 0); uint64_t address; if (rn == rt && wb != NoWriteBack) @@ -791,6 +828,7 @@ ldr_wb (sim_cpu *cpu, int32_t offset, WriteBack wb) if (wb != Post) address += offset; + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); aarch64_set_reg_u64 (cpu, rt, NO_SP, aarch64_get_mem_u64 (cpu, address)); if (wb == Post) @@ -805,15 +843,16 @@ ldr_wb (sim_cpu *cpu, int32_t offset, WriteBack wb) static void ldr_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension) { - unsigned rm = uimm (aarch64_get_instr (cpu), 20, 16); - unsigned rn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned rt = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned rm = INSTR (20, 16); + unsigned rn = INSTR (9, 5); + unsigned rt = INSTR (4, 0); /* rn may reference SP, rm and rt must reference ZR */ uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK); int64_t extended = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP), extension); uint64_t displacement = OPT_SCALE (extended, 64, scaling); + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); aarch64_set_reg_u64 (cpu, rt, NO_SP, aarch64_get_mem_u64 (cpu, address + displacement)); } @@ -822,9 +861,10 @@ ldr_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension) static void ldrb32_abs (sim_cpu *cpu, uint32_t offset) { - unsigned rn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned rt = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned rn = INSTR (9, 5); + unsigned rt = INSTR (4, 0); + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); /* The target register may not be SP but the source may be there is no scaling required for a byte load. */ aarch64_set_reg_u64 (cpu, rt, NO_SP, @@ -836,8 +876,8 @@ ldrb32_abs (sim_cpu *cpu, uint32_t offset) static void ldrb32_wb (sim_cpu *cpu, int32_t offset, WriteBack wb) { - unsigned rn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned rt = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned rn = INSTR (9, 5); + unsigned rt = INSTR (4, 0); uint64_t address; if (rn == rt && wb != NoWriteBack) @@ -848,6 +888,7 @@ ldrb32_wb (sim_cpu *cpu, int32_t offset, WriteBack wb) if (wb != Post) address += offset; + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); aarch64_set_reg_u64 (cpu, rt, NO_SP, aarch64_get_mem_u8 (cpu, address)); if (wb == Post) @@ -862,15 +903,16 @@ ldrb32_wb (sim_cpu *cpu, int32_t offset, WriteBack wb) static void ldrb32_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension) { - unsigned rm = uimm (aarch64_get_instr (cpu), 20, 16); - unsigned rn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned rt = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned rm = INSTR (20, 16); + unsigned rn = INSTR (9, 5); + unsigned rt = INSTR (4, 0); /* rn may reference SP, rm and rt must reference ZR */ uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK); int64_t displacement = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP), extension); + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); /* There is no scaling required for a byte load. 
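The *_scale_ext handlers above all compute the same effective address: take the low 32 bits of Rm, zero- or sign-extend them per the instruction's extension option, optionally shift left by log2 of the transfer size, and add the result to the Rn base. A compact sketch of that recipe, under the assumption (stated in the SCALE/OPT_SCALE comment earlier in the file) that Unscaled masks the shift down to zero; the helper and parameter names here are mine:

#include <stdint.h>

/* EA = base + (extend (Rm.lo32) << (scaled ? log2_size : 0)).  */
static uint64_t
ea_scale_ext (uint64_t base, uint32_t rm_lo,
              int sign_extend, int scaled, unsigned log2_size)
{
  int64_t extended = sign_extend
    ? (int64_t) (int32_t) rm_lo   /* SXTW-style: sign-extend 32 -> 64 */
    : (int64_t) rm_lo;            /* UXTW-style: zero-extend 32 -> 64 */
  return base + ((uint64_t) extended << (scaled ? log2_size : 0));
}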
*/ aarch64_set_reg_u64 (cpu, rt, NO_SP, aarch64_get_mem_u8 (cpu, address + displacement)); @@ -881,9 +923,10 @@ ldrb32_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension) static void ldrsb_wb (sim_cpu *cpu, int32_t offset, WriteBack wb) { - unsigned rn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned rt = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned rn = INSTR (9, 5); + unsigned rt = INSTR (4, 0); uint64_t address; + int64_t val; if (rn == rt && wb != NoWriteBack) HALT_UNALLOC; @@ -893,7 +936,9 @@ ldrsb_wb (sim_cpu *cpu, int32_t offset, WriteBack wb) if (wb != Post) address += offset; - aarch64_set_reg_u64 (cpu, rt, NO_SP, aarch64_get_mem_s8 (cpu, address)); + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); + val = aarch64_get_mem_s8 (cpu, address); + aarch64_set_reg_s64 (cpu, rt, NO_SP, val); if (wb == Post) address += offset; @@ -914,16 +959,17 @@ ldrsb_abs (sim_cpu *cpu, uint32_t offset) static void ldrsb_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension) { - unsigned rm = uimm (aarch64_get_instr (cpu), 20, 16); - unsigned rn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned rt = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned rm = INSTR (20, 16); + unsigned rn = INSTR (9, 5); + unsigned rt = INSTR (4, 0); /* rn may reference SP, rm and rt must reference ZR */ uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK); int64_t displacement = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP), extension); + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); /* There is no scaling required for a byte load. */ - aarch64_set_reg_u64 (cpu, rt, NO_SP, + aarch64_set_reg_s64 (cpu, rt, NO_SP, aarch64_get_mem_s8 (cpu, address + displacement)); } @@ -931,13 +977,15 @@ ldrsb_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension) static void ldrh32_abs (sim_cpu *cpu, uint32_t offset) { - unsigned rn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned rt = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned rn = INSTR (9, 5); + unsigned rt = INSTR (4, 0); + uint32_t val; + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); /* The target register may not be SP but the source may be. 
*/ - aarch64_set_reg_u64 (cpu, rt, NO_SP, aarch64_get_mem_u16 - (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK) - + SCALE (offset, 16))); + val = aarch64_get_mem_u16 (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK) + + SCALE (offset, 16)); + aarch64_set_reg_u32 (cpu, rt, NO_SP, val); } /* 32 bit load zero-extended short unscaled signed 9 bit @@ -945,8 +993,8 @@ ldrh32_abs (sim_cpu *cpu, uint32_t offset) static void ldrh32_wb (sim_cpu *cpu, int32_t offset, WriteBack wb) { - unsigned rn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned rt = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned rn = INSTR (9, 5); + unsigned rt = INSTR (4, 0); uint64_t address; if (rn == rt && wb != NoWriteBack) @@ -957,7 +1005,8 @@ ldrh32_wb (sim_cpu *cpu, int32_t offset, WriteBack wb) if (wb != Post) address += offset; - aarch64_set_reg_u64 (cpu, rt, NO_SP, aarch64_get_mem_u16 (cpu, address)); + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); + aarch64_set_reg_u32 (cpu, rt, NO_SP, aarch64_get_mem_u16 (cpu, address)); if (wb == Post) address += offset; @@ -971,16 +1020,17 @@ ldrh32_wb (sim_cpu *cpu, int32_t offset, WriteBack wb) static void ldrh32_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension) { - unsigned rm = uimm (aarch64_get_instr (cpu), 20, 16); - unsigned rn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned rt = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned rm = INSTR (20, 16); + unsigned rn = INSTR (9, 5); + unsigned rt = INSTR (4, 0); /* rn may reference SP, rm and rt must reference ZR */ uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK); int64_t extended = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP), extension); uint64_t displacement = OPT_SCALE (extended, 16, scaling); - aarch64_set_reg_u64 (cpu, rt, NO_SP, + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); + aarch64_set_reg_u32 (cpu, rt, NO_SP, aarch64_get_mem_u16 (cpu, address + displacement)); } @@ -988,14 +1038,15 @@ ldrh32_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension) static void ldrsh32_abs (sim_cpu *cpu, uint32_t offset) { - unsigned rn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned rt = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned rn = INSTR (9, 5); + unsigned rt = INSTR (4, 0); + int32_t val; + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); /* The target register may not be SP but the source may be. 
*/ - aarch64_set_reg_u64 (cpu, rt, NO_SP, (uint32_t) aarch64_get_mem_s16 - (cpu, - aarch64_get_reg_u64 (cpu, rn, SP_OK) - + SCALE (offset, 16))); + val = aarch64_get_mem_s16 (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK) + + SCALE (offset, 16)); + aarch64_set_reg_s32 (cpu, rt, NO_SP, val); } /* 32 bit load sign-extended short unscaled signed 9 bit @@ -1003,8 +1054,8 @@ ldrsh32_abs (sim_cpu *cpu, uint32_t offset) static void ldrsh32_wb (sim_cpu *cpu, int32_t offset, WriteBack wb) { - unsigned rn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned rt = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned rn = INSTR (9, 5); + unsigned rt = INSTR (4, 0); uint64_t address; if (rn == rt && wb != NoWriteBack) @@ -1015,8 +1066,9 @@ ldrsh32_wb (sim_cpu *cpu, int32_t offset, WriteBack wb) if (wb != Post) address += offset; - aarch64_set_reg_u64 (cpu, rt, NO_SP, - (uint32_t) aarch64_get_mem_s16 (cpu, address)); + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); + aarch64_set_reg_s32 (cpu, rt, NO_SP, + (int32_t) aarch64_get_mem_s16 (cpu, address)); if (wb == Post) address += offset; @@ -1030,17 +1082,18 @@ ldrsh32_wb (sim_cpu *cpu, int32_t offset, WriteBack wb) static void ldrsh32_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension) { - unsigned rm = uimm (aarch64_get_instr (cpu), 20, 16); - unsigned rn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned rt = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned rm = INSTR (20, 16); + unsigned rn = INSTR (9, 5); + unsigned rt = INSTR (4, 0); /* rn may reference SP, rm and rt must reference ZR */ uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK); int64_t extended = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP), extension); uint64_t displacement = OPT_SCALE (extended, 16, scaling); - aarch64_set_reg_u64 (cpu, rt, NO_SP, - (uint32_t) aarch64_get_mem_s16 + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); + aarch64_set_reg_s32 (cpu, rt, NO_SP, + (int32_t) aarch64_get_mem_s16 (cpu, address + displacement)); } @@ -1048,13 +1101,15 @@ ldrsh32_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension) static void ldrsh_abs (sim_cpu *cpu, uint32_t offset) { - unsigned rn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned rt = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned rn = INSTR (9, 5); + unsigned rt = INSTR (4, 0); + int64_t val; + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); /* The target register may not be SP but the source may be. 
*/ - aarch64_set_reg_u64 (cpu, rt, NO_SP, aarch64_get_mem_s16 - (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK) - + SCALE (offset, 16))); + val = aarch64_get_mem_s16 (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK) + + SCALE (offset, 16)); + aarch64_set_reg_s64 (cpu, rt, NO_SP, val); } /* 64 bit load sign-extended short unscaled signed 9 bit @@ -1062,19 +1117,22 @@ ldrsh_abs (sim_cpu *cpu, uint32_t offset) static void ldrsh64_wb (sim_cpu *cpu, int32_t offset, WriteBack wb) { - unsigned rn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned rt = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned rn = INSTR (9, 5); + unsigned rt = INSTR (4, 0); uint64_t address; + int64_t val; if (rn == rt && wb != NoWriteBack) HALT_UNALLOC; + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); address = aarch64_get_reg_u64 (cpu, rn, SP_OK); if (wb != Post) address += offset; - aarch64_set_reg_u64 (cpu, rt, NO_SP, aarch64_get_mem_s16 (cpu, address)); + val = aarch64_get_mem_s16 (cpu, address); + aarch64_set_reg_s64 (cpu, rt, NO_SP, val); if (wb == Post) address += offset; @@ -1088,30 +1146,35 @@ ldrsh64_wb (sim_cpu *cpu, int32_t offset, WriteBack wb) static void ldrsh_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension) { - unsigned rm = uimm (aarch64_get_instr (cpu), 20, 16); - unsigned rn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned rt = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned rm = INSTR (20, 16); + unsigned rn = INSTR (9, 5); + unsigned rt = INSTR (4, 0); + /* rn may reference SP, rm and rt must reference ZR */ uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK); int64_t extended = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP), extension); uint64_t displacement = OPT_SCALE (extended, 16, scaling); + int64_t val; - aarch64_set_reg_u64 (cpu, rt, NO_SP, - aarch64_get_mem_s16 (cpu, address + displacement)); + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); + val = aarch64_get_mem_s16 (cpu, address + displacement); + aarch64_set_reg_s64 (cpu, rt, NO_SP, val); } /* 64 bit load sign-extended 32 bit scaled unsigned 12 bit. */ static void ldrsw_abs (sim_cpu *cpu, uint32_t offset) { - unsigned rn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned rt = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned rn = INSTR (9, 5); + unsigned rt = INSTR (4, 0); + int64_t val; + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); + val = aarch64_get_mem_s32 (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK) + + SCALE (offset, 32)); /* The target register may not be SP but the source may be. 
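All of the sign-extending loads above get the same treatment: the old code handed aarch64_get_mem_s16's result straight to aarch64_set_reg_u64, while the new code stages it in a signed local (int32_t or int64_t) and writes it with aarch64_set_reg_s32 or aarch64_set_reg_s64, so the widening is spelled out in the types rather than left to implicit conversions. A tiny illustration of why the signedness of the intermediate matters (values are made up):

#include <stdint.h>
#include <stdio.h>

int
main (void)
{
  int16_t half = -2;               /* stored as 0xfffe */
  int64_t sext = half;             /* sign-extends: 0xfffffffffffffffe */
  uint64_t zext = (uint16_t) half; /* zero-extends: 0x000000000000fffe */

  printf ("sext=%016llx zext=%016llx\n",
          (unsigned long long) sext, (unsigned long long) zext);
  return 0;
}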
*/ - return aarch64_set_reg_s64 (cpu, rt, NO_SP, aarch64_get_mem_s32 - (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK) - + SCALE (offset, 32))); + return aarch64_set_reg_s64 (cpu, rt, NO_SP, val); } /* 64 bit load sign-extended 32 bit unscaled signed 9 bit @@ -1119,8 +1182,8 @@ ldrsw_abs (sim_cpu *cpu, uint32_t offset) static void ldrsw_wb (sim_cpu *cpu, int32_t offset, WriteBack wb) { - unsigned rn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned rt = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned rn = INSTR (9, 5); + unsigned rt = INSTR (4, 0); uint64_t address; if (rn == rt && wb != NoWriteBack) @@ -1131,6 +1194,7 @@ ldrsw_wb (sim_cpu *cpu, int32_t offset, WriteBack wb) if (wb != Post) address += offset; + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); aarch64_set_reg_s64 (cpu, rt, NO_SP, aarch64_get_mem_s32 (cpu, address)); if (wb == Post) @@ -1145,15 +1209,16 @@ ldrsw_wb (sim_cpu *cpu, int32_t offset, WriteBack wb) static void ldrsw_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension) { - unsigned rm = uimm (aarch64_get_instr (cpu), 20, 16); - unsigned rn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned rt = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned rm = INSTR (20, 16); + unsigned rn = INSTR (9, 5); + unsigned rt = INSTR (4, 0); /* rn may reference SP, rm and rt must reference ZR */ uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK); int64_t extended = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP), extension); uint64_t displacement = OPT_SCALE (extended, 32, scaling); + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); aarch64_set_reg_s64 (cpu, rt, NO_SP, aarch64_get_mem_s32 (cpu, address + displacement)); } @@ -1165,9 +1230,10 @@ ldrsw_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension) static void str32_abs (sim_cpu *cpu, uint32_t offset) { - unsigned rn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned rt = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned rn = INSTR (9, 5); + unsigned rt = INSTR (4, 0); + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); /* The target register may not be SP but the source may be. 
*/ aarch64_set_mem_u32 (cpu, (aarch64_get_reg_u64 (cpu, rn, SP_OK) + SCALE (offset, 32)), @@ -1178,8 +1244,8 @@ str32_abs (sim_cpu *cpu, uint32_t offset) static void str32_wb (sim_cpu *cpu, int32_t offset, WriteBack wb) { - unsigned rn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned rt = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned rn = INSTR (9, 5); + unsigned rt = INSTR (4, 0); uint64_t address; if (rn == rt && wb != NoWriteBack) @@ -1189,6 +1255,7 @@ str32_wb (sim_cpu *cpu, int32_t offset, WriteBack wb) if (wb != Post) address += offset; + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); aarch64_set_mem_u32 (cpu, address, aarch64_get_reg_u32 (cpu, rt, NO_SP)); if (wb == Post) @@ -1203,14 +1270,15 @@ str32_wb (sim_cpu *cpu, int32_t offset, WriteBack wb) static void str32_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension) { - unsigned rm = uimm (aarch64_get_instr (cpu), 20, 16); - unsigned rn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned rt = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned rm = INSTR (20, 16); + unsigned rn = INSTR (9, 5); + unsigned rt = INSTR (4, 0); uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK); int64_t extended = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP), extension); uint64_t displacement = OPT_SCALE (extended, 32, scaling); + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); aarch64_set_mem_u32 (cpu, address + displacement, aarch64_get_reg_u64 (cpu, rt, NO_SP)); } @@ -1219,9 +1287,10 @@ str32_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension) static void str_abs (sim_cpu *cpu, uint32_t offset) { - unsigned rn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned rt = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned rn = INSTR (9, 5); + unsigned rt = INSTR (4, 0); + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); aarch64_set_mem_u64 (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK) + SCALE (offset, 64), @@ -1232,8 +1301,8 @@ str_abs (sim_cpu *cpu, uint32_t offset) static void str_wb (sim_cpu *cpu, int32_t offset, WriteBack wb) { - unsigned rn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned rt = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned rn = INSTR (9, 5); + unsigned rt = INSTR (4, 0); uint64_t address; if (rn == rt && wb != NoWriteBack) @@ -1244,6 +1313,7 @@ str_wb (sim_cpu *cpu, int32_t offset, WriteBack wb) if (wb != Post) address += offset; + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); aarch64_set_mem_u64 (cpu, address, aarch64_get_reg_u64 (cpu, rt, NO_SP)); if (wb == Post) @@ -1258,9 +1328,9 @@ str_wb (sim_cpu *cpu, int32_t offset, WriteBack wb) static void str_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension) { - unsigned rm = uimm (aarch64_get_instr (cpu), 20, 16); - unsigned rn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned rt = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned rm = INSTR (20, 16); + unsigned rn = INSTR (9, 5); + unsigned rt = INSTR (4, 0); /* rn may reference SP, rm and rt must reference ZR */ uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK); @@ -1268,6 +1338,7 @@ str_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension) extension); uint64_t displacement = OPT_SCALE (extended, 64, scaling); + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); aarch64_set_mem_u64 (cpu, address + displacement, aarch64_get_reg_u64 (cpu, rt, NO_SP)); } @@ -1276,9 +1347,10 @@ str_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension) static void strb_abs (sim_cpu *cpu, uint32_t offset) { - unsigned rn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned rt 
= uimm (aarch64_get_instr (cpu), 4, 0); + unsigned rn = INSTR (9, 5); + unsigned rt = INSTR (4, 0); + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); /* The target register may not be SP but the source may be. There is no scaling required for a byte load. */ aarch64_set_mem_u8 (cpu, @@ -1290,8 +1362,8 @@ strb_abs (sim_cpu *cpu, uint32_t offset) static void strb_wb (sim_cpu *cpu, int32_t offset, WriteBack wb) { - unsigned rn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned rt = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned rn = INSTR (9, 5); + unsigned rt = INSTR (4, 0); uint64_t address; if (rn == rt && wb != NoWriteBack) @@ -1302,6 +1374,7 @@ strb_wb (sim_cpu *cpu, int32_t offset, WriteBack wb) if (wb != Post) address += offset; + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); aarch64_set_mem_u8 (cpu, address, aarch64_get_reg_u8 (cpu, rt, NO_SP)); if (wb == Post) @@ -1316,15 +1389,16 @@ strb_wb (sim_cpu *cpu, int32_t offset, WriteBack wb) static void strb_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension) { - unsigned rm = uimm (aarch64_get_instr (cpu), 20, 16); - unsigned rn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned rt = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned rm = INSTR (20, 16); + unsigned rn = INSTR (9, 5); + unsigned rt = INSTR (4, 0); /* rn may reference SP, rm and rt must reference ZR */ uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK); int64_t displacement = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP), extension); + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); /* There is no scaling required for a byte load. */ aarch64_set_mem_u8 (cpu, address + displacement, aarch64_get_reg_u8 (cpu, rt, NO_SP)); @@ -1334,9 +1408,10 @@ strb_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension) static void strh_abs (sim_cpu *cpu, uint32_t offset) { - unsigned rn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned rt = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned rn = INSTR (9, 5); + unsigned rt = INSTR (4, 0); + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); /* The target register may not be SP but the source may be. 
*/ aarch64_set_mem_u16 (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK) + SCALE (offset, 16), @@ -1347,8 +1422,8 @@ strh_abs (sim_cpu *cpu, uint32_t offset) static void strh_wb (sim_cpu *cpu, int32_t offset, WriteBack wb) { - unsigned rn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned rt = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned rn = INSTR (9, 5); + unsigned rt = INSTR (4, 0); uint64_t address; if (rn == rt && wb != NoWriteBack) @@ -1359,6 +1434,7 @@ strh_wb (sim_cpu *cpu, int32_t offset, WriteBack wb) if (wb != Post) address += offset; + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); aarch64_set_mem_u16 (cpu, address, aarch64_get_reg_u16 (cpu, rt, NO_SP)); if (wb == Post) @@ -1373,15 +1449,16 @@ strh_wb (sim_cpu *cpu, int32_t offset, WriteBack wb) static void strh_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension) { - unsigned rm = uimm (aarch64_get_instr (cpu), 20, 16); - unsigned rn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned rt = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned rm = INSTR (20, 16); + unsigned rn = INSTR (9, 5); + unsigned rt = INSTR (4, 0); /* rn may reference SP, rm and rt must reference ZR */ uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK); int64_t extended = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP), extension); uint64_t displacement = OPT_SCALE (extended, 16, scaling); + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); aarch64_set_mem_u16 (cpu, address + displacement, aarch64_get_reg_u16 (cpu, rt, NO_SP)); } @@ -1397,7 +1474,7 @@ prfm_abs (sim_cpu *cpu, uint32_t offset) 10010 ==> PSTL2KEEP, 10001 ==> PSTL2STRM, 10100 ==> PSTL3KEEP, 10101 ==> PSTL3STRM, ow ==> UNALLOC - PrfOp prfop = prfop (aarch64_get_instr (cpu), 4, 0); + PrfOp prfop = prfop (instr, 4, 0); uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK) + SCALE (offset, 64). */ @@ -1416,7 +1493,7 @@ prfm_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension) 10100 ==> PSTL3KEEP, 10101 ==> PSTL3STRM, ow ==> UNALLOC rn may reference SP, rm may only reference ZR - PrfOp prfop = prfop (aarch64_get_instr (cpu), 4, 0); + PrfOp prfop = prfop (instr, 4, 0); uint64_t base = aarch64_get_reg_u64 (cpu, rn, SP_OK); int64_t extended = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP), extension); @@ -1437,7 +1514,7 @@ prfm_pcrel (sim_cpu *cpu, int32_t offset) 10010 ==> PSTL2KEEP, 10001 ==> PSTL2STRM, 10100 ==> PSTL3KEEP, 10101 ==> PSTL3STRM, ow ==> UNALLOC - PrfOp prfop = prfop (aarch64_get_instr (cpu), 4, 0); + PrfOp prfop = prfop (instr, 4, 0); uint64_t address = aarch64_get_PC (cpu) + offset. */ /* TODO : implement this */ @@ -1448,13 +1525,14 @@ prfm_pcrel (sim_cpu *cpu, int32_t offset) static void ldxr (sim_cpu *cpu) { - unsigned rn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned rt = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned rn = INSTR (9, 5); + unsigned rt = INSTR (4, 0); uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK); - int size = uimm (aarch64_get_instr (cpu), 31, 30); - /* int ordered = uimm (aarch64_get_instr (cpu), 15, 15); */ - /* int exclusive = ! uimm (aarch64_get_instr (cpu), 23, 23); */ + int size = INSTR (31, 30); + /* int ordered = INSTR (15, 15); */ + /* int exclusive = ! 
INSTR (23, 23); */ + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); switch (size) { case 0: @@ -1469,19 +1547,17 @@ ldxr (sim_cpu *cpu) case 3: aarch64_set_reg_u64 (cpu, rt, NO_SP, aarch64_get_mem_u64 (cpu, address)); break; - default: - HALT_UNALLOC; } } static void stxr (sim_cpu *cpu) { - unsigned rn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned rt = uimm (aarch64_get_instr (cpu), 4, 0); - unsigned rs = uimm (aarch64_get_instr (cpu), 20, 16); + unsigned rn = INSTR (9, 5); + unsigned rt = INSTR (4, 0); + unsigned rs = INSTR (20, 16); uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK); - int size = uimm (aarch64_get_instr (cpu), 31, 30); + int size = INSTR (31, 30); uint64_t data = aarch64_get_reg_u64 (cpu, rt, NO_SP); switch (size) @@ -1490,9 +1566,9 @@ stxr (sim_cpu *cpu) case 1: aarch64_set_mem_u16 (cpu, address, data); break; case 2: aarch64_set_mem_u32 (cpu, address, data); break; case 3: aarch64_set_mem_u64 (cpu, address, data); break; - default: HALT_UNALLOC; } + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); aarch64_set_reg_u64 (cpu, rs, NO_SP, 0); /* Always exclusive... */ } @@ -1508,9 +1584,8 @@ dexLoadLiteral (sim_cpu *cpu) instr[26] ==> V : 0 ==> GReg, 1 ==> FReg instr[23, 5] == simm19 */ - /* unsigned rt = uimm (aarch64_get_instr (cpu), 4, 0); */ - uint32_t dispatch = ( (uimm (aarch64_get_instr (cpu), 31, 30) << 1) - | uimm (aarch64_get_instr (cpu), 26, 26)); + /* unsigned rt = INSTR (4, 0); */ + uint32_t dispatch = (INSTR (31, 30) << 1) | INSTR (26, 26); int32_t imm = simm32 (aarch64_get_instr (cpu), 23, 5); switch (dispatch) @@ -1540,9 +1615,10 @@ dexLoadLiteral (sim_cpu *cpu) static void add32 (sim_cpu *cpu, uint32_t aimm) { - unsigned rn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned rd = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned rn = INSTR (9, 5); + unsigned rd = INSTR (4, 0); + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); aarch64_set_reg_u64 (cpu, rd, SP_OK, aarch64_get_reg_u32 (cpu, rn, SP_OK) + aimm); } @@ -1551,9 +1627,10 @@ add32 (sim_cpu *cpu, uint32_t aimm) static void add64 (sim_cpu *cpu, uint32_t aimm) { - unsigned rn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned rd = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned rn = INSTR (9, 5); + unsigned rd = INSTR (4, 0); + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); aarch64_set_reg_u64 (cpu, rd, SP_OK, aarch64_get_reg_u64 (cpu, rn, SP_OK) + aimm); } @@ -1573,7 +1650,7 @@ set_flags_for_add32 (sim_cpu *cpu, int32_t value1, int32_t value2) if (result & (1 << 31)) flags |= N; - if (uresult != result) + if (uresult != (uint32_t)result) flags |= C; if (sresult != result) @@ -1582,61 +1659,40 @@ set_flags_for_add32 (sim_cpu *cpu, int32_t value1, int32_t value2) aarch64_set_CPSR (cpu, flags); } +#define NEG(a) (((a) & signbit) == signbit) +#define POS(a) (((a) & signbit) == 0) + static void set_flags_for_add64 (sim_cpu *cpu, uint64_t value1, uint64_t value2) { - int64_t sval1 = value1; - int64_t sval2 = value2; - uint64_t result = value1 + value2; - int64_t sresult = sval1 + sval2; - uint32_t flags = 0; + uint64_t result = value1 + value2; + uint32_t flags = 0; + uint64_t signbit = 1ULL << 63; if (result == 0) flags |= Z; - if (result & (1ULL << 63)) + if (NEG (result)) flags |= N; - if (sval1 < 0) - { - if (sval2 < 0) - { - /* Negative plus a negative. Overflow happens if - the result is greater than either of the operands. */ - if (sresult > sval1 || sresult > sval2) - flags |= V; - } - /* else Negative plus a positive. Overflow cannot happen. 
*/ - } - else /* value1 is +ve. */ - { - if (sval2 < 0) - { - /* Overflow can only occur if we computed "0 - MININT". */ - if (sval1 == 0 && sval2 == (1LL << 63)) - flags |= V; - } - else - { - /* Postive plus positive - overflow has happened if the - result is smaller than either of the operands. */ - if (result < value1 || result < value2) - flags |= V | C; - } - } + if ( (NEG (value1) && NEG (value2)) + || (NEG (value1) && POS (result)) + || (NEG (value2) && POS (result))) + flags |= C; + + if ( (NEG (value1) && NEG (value2) && POS (result)) + || (POS (value1) && POS (value2) && NEG (result))) + flags |= V; aarch64_set_CPSR (cpu, flags); } -#define NEG(a) (((a) & signbit) == signbit) -#define POS(a) (((a) & signbit) == 0) - static void set_flags_for_sub32 (sim_cpu *cpu, uint32_t value1, uint32_t value2) { uint32_t result = value1 - value2; uint32_t flags = 0; - uint32_t signbit = 1ULL << 31; + uint32_t signbit = 1U << 31; if (result == 0) flags |= Z; @@ -1721,11 +1777,12 @@ set_flags_for_binop64 (sim_cpu *cpu, uint64_t result) static void adds32 (sim_cpu *cpu, uint32_t aimm) { - unsigned rn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned rd = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned rn = INSTR (9, 5); + unsigned rd = INSTR (4, 0); /* TODO : do we need to worry about signs here? */ int32_t value1 = aarch64_get_reg_s32 (cpu, rn, SP_OK); + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); aarch64_set_reg_u64 (cpu, rd, NO_SP, value1 + aimm); set_flags_for_add32 (cpu, value1, aimm); } @@ -1734,11 +1791,12 @@ adds32 (sim_cpu *cpu, uint32_t aimm) static void adds64 (sim_cpu *cpu, uint32_t aimm) { - unsigned rn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned rd = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned rn = INSTR (9, 5); + unsigned rd = INSTR (4, 0); uint64_t value1 = aarch64_get_reg_u64 (cpu, rn, SP_OK); uint64_t value2 = aimm; + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); aarch64_set_reg_u64 (cpu, rd, NO_SP, value1 + value2); set_flags_for_add64 (cpu, value1, value2); } @@ -1747,9 +1805,10 @@ adds64 (sim_cpu *cpu, uint32_t aimm) static void sub32 (sim_cpu *cpu, uint32_t aimm) { - unsigned rn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned rd = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned rn = INSTR (9, 5); + unsigned rd = INSTR (4, 0); + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); aarch64_set_reg_u64 (cpu, rd, SP_OK, aarch64_get_reg_u32 (cpu, rn, SP_OK) - aimm); } @@ -1758,9 +1817,10 @@ sub32 (sim_cpu *cpu, uint32_t aimm) static void sub64 (sim_cpu *cpu, uint32_t aimm) { - unsigned rn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned rd = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned rn = INSTR (9, 5); + unsigned rd = INSTR (4, 0); + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); aarch64_set_reg_u64 (cpu, rd, SP_OK, aarch64_get_reg_u64 (cpu, rn, SP_OK) - aimm); } @@ -1769,11 +1829,12 @@ sub64 (sim_cpu *cpu, uint32_t aimm) static void subs32 (sim_cpu *cpu, uint32_t aimm) { - unsigned rn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned rd = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned rn = INSTR (9, 5); + unsigned rd = INSTR (4, 0); uint32_t value1 = aarch64_get_reg_u64 (cpu, rn, SP_OK); uint32_t value2 = aimm; + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); aarch64_set_reg_u64 (cpu, rd, NO_SP, value1 - value2); set_flags_for_sub32 (cpu, value1, value2); } @@ -1782,11 +1843,12 @@ subs32 (sim_cpu *cpu, uint32_t aimm) static void subs64 (sim_cpu *cpu, uint32_t aimm) { - unsigned rn = uimm (aarch64_get_instr (cpu), 9, 5); - 
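The rewritten set_flags_for_add64 above trades the old sign-based case analysis for direct sign-bit identities: C is set exactly when the unsigned add carries out of bit 63, and V exactly when two like-signed operands produce an oppositely-signed sum. (Note from the removed lines that the old version could only ever set C on the positive-plus-positive path, so this is a behavioural fix as well as a simplification.) A small self-check of the new identities against wide arithmetic; it assumes a compiler that provides __int128, such as GCC or Clang:

#include <stdint.h>
#include <assert.h>

#define NEG64(a) (((a) >> 63) & 1)   /* mirrors NEG with signbit = 1ULL << 63 */
#define POS64(a) (!NEG64 (a))

static void
check_add64_flags (uint64_t v1, uint64_t v2)
{
  uint64_t r = v1 + v2;
  int carry = (NEG64 (v1) && NEG64 (v2))
              || (NEG64 (v1) && POS64 (r))
              || (NEG64 (v2) && POS64 (r));
  int overflow = (NEG64 (v1) && NEG64 (v2) && POS64 (r))
                 || (POS64 (v1) && POS64 (v2) && NEG64 (r));

  /* C is the carry out of bit 63 of the unsigned sum...  */
  assert (carry == (int) (((unsigned __int128) v1 + v2) >> 64));

  /* ...and V says the exact signed sum does not fit in int64_t.  */
  __int128 exact = (__int128) (int64_t) v1 + (int64_t) v2;
  assert (overflow == (exact != (int64_t) exact));
}

int
main (void)
{
  check_add64_flags (UINT64_MAX, 1);          /* carry, no overflow */
  check_add64_flags (INT64_MAX, 1);           /* overflow, no carry */
  check_add64_flags (1ULL << 63, 1ULL << 63); /* carry and overflow */
  return 0;
}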
unsigned rd = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned rn = INSTR (9, 5); + unsigned rd = INSTR (4, 0); uint64_t value1 = aarch64_get_reg_u64 (cpu, rn, SP_OK); uint32_t value2 = aimm; + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); aarch64_set_reg_u64 (cpu, rd, NO_SP, value1 - value2); set_flags_for_sub64 (cpu, value1, value2); } @@ -1853,10 +1915,11 @@ shifted64 (uint64_t value, Shift shift, uint32_t count) static void add32_shift (sim_cpu *cpu, Shift shift, uint32_t count) { - unsigned rm = uimm (aarch64_get_instr (cpu), 20, 16); - unsigned rn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned rd = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned rm = INSTR (20, 16); + unsigned rn = INSTR (9, 5); + unsigned rd = INSTR (4, 0); + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); aarch64_set_reg_u64 (cpu, rd, NO_SP, aarch64_get_reg_u32 (cpu, rn, NO_SP) + shifted32 (aarch64_get_reg_u32 (cpu, rm, NO_SP), @@ -1867,10 +1930,11 @@ add32_shift (sim_cpu *cpu, Shift shift, uint32_t count) static void add64_shift (sim_cpu *cpu, Shift shift, uint32_t count) { - unsigned rm = uimm (aarch64_get_instr (cpu), 20, 16); - unsigned rn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned rd = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned rm = INSTR (20, 16); + unsigned rn = INSTR (9, 5); + unsigned rd = INSTR (4, 0); + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); aarch64_set_reg_u64 (cpu, rd, NO_SP, aarch64_get_reg_u64 (cpu, rn, NO_SP) + shifted64 (aarch64_get_reg_u64 (cpu, rm, NO_SP), @@ -1881,14 +1945,15 @@ add64_shift (sim_cpu *cpu, Shift shift, uint32_t count) static void adds32_shift (sim_cpu *cpu, Shift shift, uint32_t count) { - unsigned rm = uimm (aarch64_get_instr (cpu), 20, 16); - unsigned rn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned rd = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned rm = INSTR (20, 16); + unsigned rn = INSTR (9, 5); + unsigned rd = INSTR (4, 0); uint32_t value1 = aarch64_get_reg_u32 (cpu, rn, NO_SP); uint32_t value2 = shifted32 (aarch64_get_reg_u32 (cpu, rm, NO_SP), shift, count); + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); aarch64_set_reg_u64 (cpu, rd, NO_SP, value1 + value2); set_flags_for_add32 (cpu, value1, value2); } @@ -1897,14 +1962,15 @@ adds32_shift (sim_cpu *cpu, Shift shift, uint32_t count) static void adds64_shift (sim_cpu *cpu, Shift shift, uint32_t count) { - unsigned rm = uimm (aarch64_get_instr (cpu), 20, 16); - unsigned rn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned rd = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned rm = INSTR (20, 16); + unsigned rn = INSTR (9, 5); + unsigned rd = INSTR (4, 0); uint64_t value1 = aarch64_get_reg_u64 (cpu, rn, NO_SP); uint64_t value2 = shifted64 (aarch64_get_reg_u64 (cpu, rm, NO_SP), shift, count); + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); aarch64_set_reg_u64 (cpu, rd, NO_SP, value1 + value2); set_flags_for_add64 (cpu, value1, value2); } @@ -1913,10 +1979,11 @@ adds64_shift (sim_cpu *cpu, Shift shift, uint32_t count) static void sub32_shift (sim_cpu *cpu, Shift shift, uint32_t count) { - unsigned rm = uimm (aarch64_get_instr (cpu), 20, 16); - unsigned rn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned rd = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned rm = INSTR (20, 16); + unsigned rn = INSTR (9, 5); + unsigned rd = INSTR (4, 0); + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); aarch64_set_reg_u64 (cpu, rd, NO_SP, aarch64_get_reg_u32 (cpu, rn, NO_SP) - shifted32 (aarch64_get_reg_u32 (cpu, rm, NO_SP), @@ -1927,10 +1994,11 @@ sub32_shift (sim_cpu 
*cpu, Shift shift, uint32_t count) static void sub64_shift (sim_cpu *cpu, Shift shift, uint32_t count) { - unsigned rm = uimm (aarch64_get_instr (cpu), 20, 16); - unsigned rn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned rd = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned rm = INSTR (20, 16); + unsigned rn = INSTR (9, 5); + unsigned rd = INSTR (4, 0); + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); aarch64_set_reg_u64 (cpu, rd, NO_SP, aarch64_get_reg_u64 (cpu, rn, NO_SP) - shifted64 (aarch64_get_reg_u64 (cpu, rm, NO_SP), @@ -1941,14 +2009,15 @@ sub64_shift (sim_cpu *cpu, Shift shift, uint32_t count) static void subs32_shift (sim_cpu *cpu, Shift shift, uint32_t count) { - unsigned rm = uimm (aarch64_get_instr (cpu), 20, 16); - unsigned rn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned rd = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned rm = INSTR (20, 16); + unsigned rn = INSTR (9, 5); + unsigned rd = INSTR (4, 0); uint32_t value1 = aarch64_get_reg_u32 (cpu, rn, NO_SP); uint32_t value2 = shifted32 (aarch64_get_reg_u32 (cpu, rm, NO_SP), shift, count); + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); aarch64_set_reg_u64 (cpu, rd, NO_SP, value1 - value2); set_flags_for_sub32 (cpu, value1, value2); } @@ -1957,14 +2026,15 @@ subs32_shift (sim_cpu *cpu, Shift shift, uint32_t count) static void subs64_shift (sim_cpu *cpu, Shift shift, uint32_t count) { - unsigned rm = uimm (aarch64_get_instr (cpu), 20, 16); - unsigned rn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned rd = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned rm = INSTR (20, 16); + unsigned rn = INSTR (9, 5); + unsigned rd = INSTR (4, 0); uint64_t value1 = aarch64_get_reg_u64 (cpu, rn, NO_SP); uint64_t value2 = shifted64 (aarch64_get_reg_u64 (cpu, rm, NO_SP), shift, count); + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); aarch64_set_reg_u64 (cpu, rd, NO_SP, value1 - value2); set_flags_for_sub64 (cpu, value1, value2); } @@ -2021,10 +2091,11 @@ extreg64 (sim_cpu *cpu, unsigned int lo, Extension extension) static void add32_ext (sim_cpu *cpu, Extension extension, uint32_t shift) { - unsigned rm = uimm (aarch64_get_instr (cpu), 20, 16); - unsigned rn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned rd = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned rm = INSTR (20, 16); + unsigned rn = INSTR (9, 5); + unsigned rd = INSTR (4, 0); + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); aarch64_set_reg_u64 (cpu, rd, SP_OK, aarch64_get_reg_u32 (cpu, rn, SP_OK) + (extreg32 (cpu, rm, extension) << shift)); @@ -2035,10 +2106,11 @@ add32_ext (sim_cpu *cpu, Extension extension, uint32_t shift) static void add64_ext (sim_cpu *cpu, Extension extension, uint32_t shift) { - unsigned rm = uimm (aarch64_get_instr (cpu), 20, 16); - unsigned rn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned rd = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned rm = INSTR (20, 16); + unsigned rn = INSTR (9, 5); + unsigned rd = INSTR (4, 0); + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); aarch64_set_reg_u64 (cpu, rd, SP_OK, aarch64_get_reg_u64 (cpu, rn, SP_OK) + (extreg64 (cpu, rm, extension) << shift)); @@ -2048,13 +2120,14 @@ add64_ext (sim_cpu *cpu, Extension extension, uint32_t shift) static void adds32_ext (sim_cpu *cpu, Extension extension, uint32_t shift) { - unsigned rm = uimm (aarch64_get_instr (cpu), 20, 16); - unsigned rn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned rd = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned rm = INSTR (20, 16); + unsigned rn = INSTR (9, 5); + unsigned rd = INSTR (4, 0); uint32_t 
value1 = aarch64_get_reg_u32 (cpu, rn, SP_OK); uint32_t value2 = extreg32 (cpu, rm, extension) << shift; + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); aarch64_set_reg_u64 (cpu, rd, NO_SP, value1 + value2); set_flags_for_add32 (cpu, value1, value2); } @@ -2064,13 +2137,14 @@ adds32_ext (sim_cpu *cpu, Extension extension, uint32_t shift) static void adds64_ext (sim_cpu *cpu, Extension extension, uint32_t shift) { - unsigned rm = uimm (aarch64_get_instr (cpu), 20, 16); - unsigned rn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned rd = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned rm = INSTR (20, 16); + unsigned rn = INSTR (9, 5); + unsigned rd = INSTR (4, 0); uint64_t value1 = aarch64_get_reg_u64 (cpu, rn, SP_OK); uint64_t value2 = extreg64 (cpu, rm, extension) << shift; + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); aarch64_set_reg_u64 (cpu, rd, NO_SP, value1 + value2); set_flags_for_add64 (cpu, value1, value2); } @@ -2079,10 +2153,11 @@ adds64_ext (sim_cpu *cpu, Extension extension, uint32_t shift) static void sub32_ext (sim_cpu *cpu, Extension extension, uint32_t shift) { - unsigned rm = uimm (aarch64_get_instr (cpu), 20, 16); - unsigned rn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned rd = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned rm = INSTR (20, 16); + unsigned rn = INSTR (9, 5); + unsigned rd = INSTR (4, 0); + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); aarch64_set_reg_u64 (cpu, rd, SP_OK, aarch64_get_reg_u32 (cpu, rn, SP_OK) - (extreg32 (cpu, rm, extension) << shift)); @@ -2093,10 +2168,11 @@ sub32_ext (sim_cpu *cpu, Extension extension, uint32_t shift) static void sub64_ext (sim_cpu *cpu, Extension extension, uint32_t shift) { - unsigned rm = uimm (aarch64_get_instr (cpu), 20, 16); - unsigned rn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned rd = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned rm = INSTR (20, 16); + unsigned rn = INSTR (9, 5); + unsigned rd = INSTR (4, 0); + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); aarch64_set_reg_u64 (cpu, rd, SP_OK, aarch64_get_reg_u64 (cpu, rn, SP_OK) - (extreg64 (cpu, rm, extension) << shift)); @@ -2106,13 +2182,14 @@ sub64_ext (sim_cpu *cpu, Extension extension, uint32_t shift) static void subs32_ext (sim_cpu *cpu, Extension extension, uint32_t shift) { - unsigned rm = uimm (aarch64_get_instr (cpu), 20, 16); - unsigned rn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned rd = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned rm = INSTR (20, 16); + unsigned rn = INSTR (9, 5); + unsigned rd = INSTR (4, 0); uint32_t value1 = aarch64_get_reg_u32 (cpu, rn, SP_OK); uint32_t value2 = extreg32 (cpu, rm, extension) << shift; + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); aarch64_set_reg_u64 (cpu, rd, NO_SP, value1 - value2); set_flags_for_sub32 (cpu, value1, value2); } @@ -2122,13 +2199,14 @@ subs32_ext (sim_cpu *cpu, Extension extension, uint32_t shift) static void subs64_ext (sim_cpu *cpu, Extension extension, uint32_t shift) { - unsigned rm = uimm (aarch64_get_instr (cpu), 20, 16); - unsigned rn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned rd = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned rm = INSTR (20, 16); + unsigned rn = INSTR (9, 5); + unsigned rd = INSTR (4, 0); uint64_t value1 = aarch64_get_reg_u64 (cpu, rn, SP_OK); uint64_t value2 = extreg64 (cpu, rm, extension) << shift; + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); aarch64_set_reg_u64 (cpu, rd, NO_SP, value1 - value2); set_flags_for_sub64 (cpu, value1, value2); } @@ -2146,9 +2224,9 @@ 
dexAddSubtractImmediate (sim_cpu *cpu) instr[4,0] = Rd */ /* N.B. the shift is applied at decode before calling the add/sub routine. */ - uint32_t shift = uimm (aarch64_get_instr (cpu), 23, 22); - uint32_t imm = uimm (aarch64_get_instr (cpu), 21, 10); - uint32_t dispatch = uimm (aarch64_get_instr (cpu), 31, 29); + uint32_t shift = INSTR (23, 22); + uint32_t imm = INSTR (21, 10); + uint32_t dispatch = INSTR (31, 29); NYI_assert (28, 24, 0x11); @@ -2168,8 +2246,6 @@ dexAddSubtractImmediate (sim_cpu *cpu) case 5: adds64 (cpu, imm); break; case 6: sub64 (cpu, imm); break; case 7: subs64 (cpu, imm); break; - default: - HALT_UNALLOC; } } @@ -2186,25 +2262,24 @@ dexAddSubtractShiftedRegister (sim_cpu *cpu) instr[9,5] = Rn instr[4,0] = Rd */ - uint32_t size = uimm (aarch64_get_instr (cpu), 31, 31); - /* 32 bit operations must have count[5] = 0 - or else we have an UNALLOC. */ - uint32_t count = uimm (aarch64_get_instr (cpu), 15, 10); - /* Shift encoded as ROR is unallocated. */ - Shift shiftType = shift (aarch64_get_instr (cpu), 22); - /* Dispatch on size:op i.e aarch64_get_instr (cpu)[31,29]. */ - uint32_t dispatch = uimm (aarch64_get_instr (cpu), 31, 29); + uint32_t size = INSTR (31, 31); + uint32_t count = INSTR (15, 10); + Shift shiftType = INSTR (23, 22); NYI_assert (28, 24, 0x0B); NYI_assert (21, 21, 0); + /* Shift encoded as ROR is unallocated. */ if (shiftType == ROR) HALT_UNALLOC; - if (!size && uimm (count, 5, 5)) + /* 32 bit operations must have count[5] = 0 + or else we have an UNALLOC. */ + if (size == 0 && uimm (count, 5, 5)) HALT_UNALLOC; - switch (dispatch) + /* Dispatch on size:op i.e instr [31,29]. */ + switch (INSTR (31, 29)) { case 0: add32_shift (cpu, shiftType, count); break; case 1: adds32_shift (cpu, shiftType, count); break; @@ -2214,8 +2289,6 @@ dexAddSubtractShiftedRegister (sim_cpu *cpu) case 5: adds64_shift (cpu, shiftType, count); break; case 6: sub64_shift (cpu, shiftType, count); break; case 7: subs64_shift (cpu, shiftType, count); break; - default: - HALT_UNALLOC; } } @@ -2237,10 +2310,8 @@ dexAddSubtractExtendedRegister (sim_cpu *cpu) instr[9,5] = Rn instr[4,0] = Rd */ - Extension extensionType = extension (aarch64_get_instr (cpu), 13); - uint32_t shift = uimm (aarch64_get_instr (cpu), 12, 10); - /* dispatch on size:op:set? i.e aarch64_get_instr (cpu)[31,29] */ - uint32_t dispatch = uimm (aarch64_get_instr (cpu), 31, 29); + Extension extensionType = INSTR (15, 13); + uint32_t shift = INSTR (12, 10); NYI_assert (28, 24, 0x0B); NYI_assert (21, 21, 1); @@ -2249,7 +2320,8 @@ dexAddSubtractExtendedRegister (sim_cpu *cpu) if (shift > 4) HALT_UNALLOC; - switch (dispatch) + /* Dispatch on size:op:set?. 
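The dispatch switches in these add/sub decoders all key on instr[31,29], which packs sf (operand size), op (add vs. subtract) and S (set flags) into one three-bit value. A minimal sketch of that mapping, using a hypothetical helper that is not part of the simulator:

/* Illustrative only: bit 2 = sf (64-bit), bit 1 = op (subtract),
   bit 0 = S (set flags), so case 0 is add32 ... case 7 is subs64.  */
struct addsub_form { int is64, is_sub, sets_flags; };

static struct addsub_form
decode_addsub (unsigned dispatch)   /* dispatch = instr[31,29].  */
{
  struct addsub_form f;
  f.is64       = (dispatch >> 2) & 1;
  f.is_sub     = (dispatch >> 1) & 1;
  f.sets_flags = dispatch & 1;
  return f;
}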
*/ + switch (INSTR (31, 29)) { case 0: add32_ext (cpu, extensionType, shift); break; case 1: adds32_ext (cpu, extensionType, shift); break; @@ -2259,7 +2331,6 @@ dexAddSubtractExtendedRegister (sim_cpu *cpu) case 5: adds64_ext (cpu, extensionType, shift); break; case 6: sub64_ext (cpu, extensionType, shift); break; case 7: subs64_ext (cpu, extensionType, shift); break; - default: HALT_UNALLOC; } } @@ -2272,10 +2343,11 @@ dexAddSubtractExtendedRegister (sim_cpu *cpu) static void adc32 (sim_cpu *cpu) { - unsigned rm = uimm (aarch64_get_instr (cpu), 20, 16); - unsigned rn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned rd = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned rm = INSTR (20, 16); + unsigned rn = INSTR (9, 5); + unsigned rd = INSTR (4, 0); + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); aarch64_set_reg_u64 (cpu, rd, NO_SP, aarch64_get_reg_u32 (cpu, rn, NO_SP) + aarch64_get_reg_u32 (cpu, rm, NO_SP) @@ -2286,10 +2358,11 @@ adc32 (sim_cpu *cpu) static void adc64 (sim_cpu *cpu) { - unsigned rm = uimm (aarch64_get_instr (cpu), 20, 16); - unsigned rn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned rd = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned rm = INSTR (20, 16); + unsigned rn = INSTR (9, 5); + unsigned rd = INSTR (4, 0); + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); aarch64_set_reg_u64 (cpu, rd, NO_SP, aarch64_get_reg_u64 (cpu, rn, NO_SP) + aarch64_get_reg_u64 (cpu, rm, NO_SP) @@ -2300,14 +2373,15 @@ adc64 (sim_cpu *cpu) static void adcs32 (sim_cpu *cpu) { - unsigned rm = uimm (aarch64_get_instr (cpu), 20, 16); - unsigned rn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned rd = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned rm = INSTR (20, 16); + unsigned rn = INSTR (9, 5); + unsigned rd = INSTR (4, 0); uint32_t value1 = aarch64_get_reg_u32 (cpu, rn, NO_SP); uint32_t value2 = aarch64_get_reg_u32 (cpu, rm, NO_SP); uint32_t carry = IS_SET (C); + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); aarch64_set_reg_u64 (cpu, rd, NO_SP, value1 + value2 + carry); set_flags_for_add32 (cpu, value1, value2 + carry); } @@ -2316,14 +2390,15 @@ adcs32 (sim_cpu *cpu) static void adcs64 (sim_cpu *cpu) { - unsigned rm = uimm (aarch64_get_instr (cpu), 20, 16); - unsigned rn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned rd = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned rm = INSTR (20, 16); + unsigned rn = INSTR (9, 5); + unsigned rd = INSTR (4, 0); uint64_t value1 = aarch64_get_reg_u64 (cpu, rn, NO_SP); uint64_t value2 = aarch64_get_reg_u64 (cpu, rm, NO_SP); uint64_t carry = IS_SET (C); + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); aarch64_set_reg_u64 (cpu, rd, NO_SP, value1 + value2 + carry); set_flags_for_add64 (cpu, value1, value2 + carry); } @@ -2332,10 +2407,11 @@ adcs64 (sim_cpu *cpu) static void sbc32 (sim_cpu *cpu) { - unsigned rm = uimm (aarch64_get_instr (cpu), 20, 16); - unsigned rn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned rd = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned rm = INSTR (20, 16); + unsigned rn = INSTR (9, 5); /* ngc iff rn == 31. 
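The "ngc iff rn == 31" note works because register number 31 reads as the zero register under NO_SP, so SBC collapses into the NGC alias. Architecturally SBC computes Rn - Rm - NOT(C); the degenerate case, as a hypothetical helper not taken from the simulator:

/* Illustrative only: NGC Wd, Wm is SBC Wd, WZR, Wm.  */
static uint32_t
ngc32_value (uint32_t rm_val, uint32_t c_flag /* 0 or 1 */)
{
  return 0u - rm_val - (1u - c_flag);
}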
*/ + unsigned rd = INSTR (4, 0); + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); aarch64_set_reg_u64 (cpu, rd, NO_SP, aarch64_get_reg_u32 (cpu, rn, NO_SP) - aarch64_get_reg_u32 (cpu, rm, NO_SP) @@ -2346,10 +2422,11 @@ sbc32 (sim_cpu *cpu) static void sbc64 (sim_cpu *cpu) { - unsigned rm = uimm (aarch64_get_instr (cpu), 20, 16); - unsigned rn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned rd = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned rm = INSTR (20, 16); + unsigned rn = INSTR (9, 5); + unsigned rd = INSTR (4, 0); + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); aarch64_set_reg_u64 (cpu, rd, NO_SP, aarch64_get_reg_u64 (cpu, rn, NO_SP) - aarch64_get_reg_u64 (cpu, rm, NO_SP) @@ -2360,15 +2437,16 @@ sbc64 (sim_cpu *cpu) static void sbcs32 (sim_cpu *cpu) { - unsigned rm = uimm (aarch64_get_instr (cpu), 20, 16); - unsigned rn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned rd = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned rm = INSTR (20, 16); + unsigned rn = INSTR (9, 5); + unsigned rd = INSTR (4, 0); uint32_t value1 = aarch64_get_reg_u32 (cpu, rn, NO_SP); uint32_t value2 = aarch64_get_reg_u32 (cpu, rm, NO_SP); uint32_t carry = IS_SET (C); uint32_t result = value1 - value2 + 1 - carry; + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); aarch64_set_reg_u64 (cpu, rd, NO_SP, result); set_flags_for_sub32 (cpu, value1, value2 + 1 - carry); } @@ -2377,15 +2455,16 @@ sbcs32 (sim_cpu *cpu) static void sbcs64 (sim_cpu *cpu) { - unsigned rm = uimm (aarch64_get_instr (cpu), 20, 16); - unsigned rn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned rd = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned rm = INSTR (20, 16); + unsigned rn = INSTR (9, 5); + unsigned rd = INSTR (4, 0); uint64_t value1 = aarch64_get_reg_u64 (cpu, rn, NO_SP); uint64_t value2 = aarch64_get_reg_u64 (cpu, rm, NO_SP); uint64_t carry = IS_SET (C); uint64_t result = value1 - value2 + 1 - carry; + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); aarch64_set_reg_u64 (cpu, rd, NO_SP, result); set_flags_for_sub64 (cpu, value1, value2 + 1 - carry); } @@ -2402,16 +2481,15 @@ dexAddSubtractWithCarry (sim_cpu *cpu) instr[9,5] = Rn instr[4,0] = Rd */ - uint32_t op2 = uimm (aarch64_get_instr (cpu), 15, 10); - /* Dispatch on size:op:set? i.e aarch64_get_instr (cpu)[31,29] */ - uint32_t dispatch = uimm (aarch64_get_instr (cpu), 31, 29); + uint32_t op2 = INSTR (15, 10); NYI_assert (28, 21, 0xD0); if (op2 != 0) HALT_UNALLOC; - switch (dispatch) + /* Dispatch on size:op:set?. */ + switch (INSTR (31, 29)) { case 0: adc32 (cpu); break; case 1: adcs32 (cpu); break; @@ -2421,7 +2499,6 @@ dexAddSubtractWithCarry (sim_cpu *cpu) case 5: adcs64 (cpu); break; case 6: sbc64 (cpu); break; case 7: sbcs64 (cpu); break; - default: HALT_UNALLOC; } } @@ -2465,7 +2542,7 @@ static void CondCompare (sim_cpu *cpu) /* aka: ccmp and ccmn */ { /* instr[31] = size : 0 ==> 32 bit, 1 ==> 64 bit - instr[30] = compare with positive (0) or negative value (1) + instr[30] = compare with positive (1) or negative value (0) instr[29,21] = 1 1101 0010 instr[20,16] = Rm or const instr[15,12] = cond @@ -2482,19 +2559,20 @@ CondCompare (sim_cpu *cpu) /* aka: ccmp and ccmn */ NYI_assert (10, 10, 0); NYI_assert (4, 4, 0); - if (! testConditionCode (cpu, uimm (aarch64_get_instr (cpu), 15, 12))) + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); + if (! 
testConditionCode (cpu, INSTR (15, 12))) { - aarch64_set_CPSR (cpu, uimm (aarch64_get_instr (cpu), 3, 0)); + aarch64_set_CPSR (cpu, INSTR (3, 0)); return; } - negate = uimm (aarch64_get_instr (cpu), 30, 30) ? -1 : 1; - rm = uimm (aarch64_get_instr (cpu), 20, 16); - rn = uimm (aarch64_get_instr (cpu), 9, 5); + negate = INSTR (30, 30) ? 1 : -1; + rm = INSTR (20, 16); + rn = INSTR ( 9, 5); - if (uimm (aarch64_get_instr (cpu), 31, 31)) + if (INSTR (31, 31)) { - if (uimm (aarch64_get_instr (cpu), 11, 11)) + if (INSTR (11, 11)) set_flags_for_sub64 (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK), negate * (uint64_t) rm); else @@ -2503,7 +2581,7 @@ CondCompare (sim_cpu *cpu) /* aka: ccmp and ccmn */ } else { - if (uimm (aarch64_get_instr (cpu), 11, 11)) + if (INSTR (11, 11)) set_flags_for_sub32 (cpu, aarch64_get_reg_u32 (cpu, rn, SP_OK), negate * rm); else @@ -2525,58 +2603,166 @@ do_vec_MOV_whole_vector (sim_cpu *cpu) instr[9,5] = Vs instr[4,0] = Vd */ - unsigned vs = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned vd = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned vs = INSTR (9, 5); + unsigned vd = INSTR (4, 0); NYI_assert (29, 21, 0x075); NYI_assert (15, 10, 0x07); - if (uimm (aarch64_get_instr (cpu), 20, 16) != vs) + if (INSTR (20, 16) != vs) HALT_NYI; - if (uimm (aarch64_get_instr (cpu), 30, 30)) + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); + if (INSTR (30, 30)) aarch64_set_vec_u64 (cpu, vd, 1, aarch64_get_vec_u64 (cpu, vs, 1)); aarch64_set_vec_u64 (cpu, vd, 0, aarch64_get_vec_u64 (cpu, vs, 0)); } static void -do_vec_MOV_into_scalar (sim_cpu *cpu) +do_vec_SMOV_into_scalar (sim_cpu *cpu) +{ + /* instr[31] = 0 + instr[30] = word(0)/long(1) + instr[29,21] = 00 1110 000 + instr[20,16] = element size and index + instr[15,10] = 00 0010 11 + instr[9,5] = V source + instr[4,0] = R dest */ + + unsigned vs = INSTR (9, 5); + unsigned rd = INSTR (4, 0); + unsigned imm5 = INSTR (20, 16); + unsigned full = INSTR (30, 30); + int size, index; + + NYI_assert (29, 21, 0x070); + NYI_assert (15, 10, 0x0B); + + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); + + if (imm5 & 0x1) + { + size = 0; + index = (imm5 >> 1) & 0xF; + } + else if (imm5 & 0x2) + { + size = 1; + index = (imm5 >> 2) & 0x7; + } + else if (full && (imm5 & 0x4)) + { + size = 2; + index = (imm5 >> 3) & 0x3; + } + else + HALT_UNALLOC; + + switch (size) + { + case 0: + if (full) + aarch64_set_reg_s64 (cpu, rd, NO_SP, + aarch64_get_vec_s8 (cpu, vs, index)); + else + aarch64_set_reg_s32 (cpu, rd, NO_SP, + aarch64_get_vec_s8 (cpu, vs, index)); + break; + + case 1: + if (full) + aarch64_set_reg_s64 (cpu, rd, NO_SP, + aarch64_get_vec_s16 (cpu, vs, index)); + else + aarch64_set_reg_s32 (cpu, rd, NO_SP, + aarch64_get_vec_s16 (cpu, vs, index)); + break; + + case 2: + aarch64_set_reg_s64 (cpu, rd, NO_SP, + aarch64_get_vec_s32 (cpu, vs, index)); + break; + + default: + HALT_UNALLOC; + } +} + +static void +do_vec_UMOV_into_scalar (sim_cpu *cpu) { /* instr[31] = 0 instr[30] = word(0)/long(1) instr[29,21] = 00 1110 000 - instr[20,18] = element size and index - instr[17,10] = 00 0011 11 + instr[20,16] = element size and index + instr[15,10] = 00 0011 11 instr[9,5] = V source instr[4,0] = R dest */ - unsigned vs = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned rd = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned vs = INSTR (9, 5); + unsigned rd = INSTR (4, 0); + unsigned imm5 = INSTR (20, 16); + unsigned full = INSTR (30, 30); + int size, index; NYI_assert (29, 21, 0x070); - NYI_assert (17, 10, 0x0F); + NYI_assert (15, 10, 0x0F); + + TRACE_DECODE 
(cpu, "emulated at line %d", __LINE__); + + if (!full) + { + if (imm5 & 0x1) + { + size = 0; + index = (imm5 >> 1) & 0xF; + } + else if (imm5 & 0x2) + { + size = 1; + index = (imm5 >> 2) & 0x7; + } + else if (imm5 & 0x4) + { + size = 2; + index = (imm5 >> 3) & 0x3; + } + else + HALT_UNALLOC; + } + else if (imm5 & 0x8) + { + size = 3; + index = (imm5 >> 4) & 0x1; + } + else + HALT_UNALLOC; - switch (uimm (aarch64_get_instr (cpu), 20, 18)) + switch (size) { - case 0x2: - aarch64_set_reg_u64 (cpu, rd, NO_SP, aarch64_get_vec_u64 (cpu, vs, 0)); + case 0: + aarch64_set_reg_u32 (cpu, rd, NO_SP, + aarch64_get_vec_u8 (cpu, vs, index)); + break; + + case 1: + aarch64_set_reg_u32 (cpu, rd, NO_SP, + aarch64_get_vec_u16 (cpu, vs, index)); break; - case 0x6: - aarch64_set_reg_u64 (cpu, rd, NO_SP, aarch64_get_vec_u64 (cpu, vs, 1)); + case 2: + aarch64_set_reg_u32 (cpu, rd, NO_SP, + aarch64_get_vec_u32 (cpu, vs, index)); break; - case 0x1: - case 0x3: - case 0x5: - case 0x7: - aarch64_set_reg_u64 (cpu, rd, NO_SP, aarch64_get_vec_u32 - (cpu, vs, uimm (aarch64_get_instr (cpu), 20, 19))); + case 3: + aarch64_set_reg_u64 (cpu, rd, NO_SP, + aarch64_get_vec_u64 (cpu, vs, index)); break; default: - HALT_NYI; + HALT_UNALLOC; } } @@ -2590,33 +2776,34 @@ do_vec_INS (sim_cpu *cpu) instr[4,0] = V dest */ int index; - unsigned rs = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned vd = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned rs = INSTR (9, 5); + unsigned vd = INSTR (4, 0); NYI_assert (31, 21, 0x270); NYI_assert (15, 10, 0x07); - if (uimm (aarch64_get_instr (cpu), 16, 16)) + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); + if (INSTR (16, 16)) { - index = uimm (aarch64_get_instr (cpu), 20, 17); + index = INSTR (20, 17); aarch64_set_vec_u8 (cpu, vd, index, aarch64_get_reg_u8 (cpu, rs, NO_SP)); } - else if (uimm (aarch64_get_instr (cpu), 17, 17)) + else if (INSTR (17, 17)) { - index = uimm (aarch64_get_instr (cpu), 20, 18); + index = INSTR (20, 18); aarch64_set_vec_u16 (cpu, vd, index, aarch64_get_reg_u16 (cpu, rs, NO_SP)); } - else if (uimm (aarch64_get_instr (cpu), 18, 18)) + else if (INSTR (18, 18)) { - index = uimm (aarch64_get_instr (cpu), 20, 19); + index = INSTR (20, 19); aarch64_set_vec_u32 (cpu, vd, index, aarch64_get_reg_u32 (cpu, rs, NO_SP)); } - else if (uimm (aarch64_get_instr (cpu), 19, 19)) + else if (INSTR (19, 19)) { - index = uimm (aarch64_get_instr (cpu), 20, 20); + index = INSTR (20, 20); aarch64_set_vec_u64 (cpu, vd, index, aarch64_get_reg_u64 (cpu, rs, NO_SP)); } @@ -2635,44 +2822,45 @@ do_vec_DUP_vector_into_vector (sim_cpu *cpu) instr[9,5] = V source instr[4,0] = V dest. */ - unsigned full = uimm (aarch64_get_instr (cpu), 30, 30); - unsigned vs = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned vd = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned full = INSTR (30, 30); + unsigned vs = INSTR (9, 5); + unsigned vd = INSTR (4, 0); int i, index; NYI_assert (29, 21, 0x070); NYI_assert (15, 10, 0x01); - if (uimm (aarch64_get_instr (cpu), 16, 16)) + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); + if (INSTR (16, 16)) { - index = uimm (aarch64_get_instr (cpu), 20, 17); + index = INSTR (20, 17); for (i = 0; i < (full ? 16 : 8); i++) aarch64_set_vec_u8 (cpu, vd, i, aarch64_get_vec_u8 (cpu, vs, index)); } - else if (uimm (aarch64_get_instr (cpu), 17, 17)) + else if (INSTR (17, 17)) { - index = uimm (aarch64_get_instr (cpu), 20, 18); + index = INSTR (20, 18); for (i = 0; i < (full ? 
8 : 4); i++) aarch64_set_vec_u16 (cpu, vd, i, aarch64_get_vec_u16 (cpu, vs, index)); } - else if (uimm (aarch64_get_instr (cpu), 18, 18)) + else if (INSTR (18, 18)) { - index = uimm (aarch64_get_instr (cpu), 20, 19); + index = INSTR (20, 19); for (i = 0; i < (full ? 4 : 2); i++) aarch64_set_vec_u32 (cpu, vd, i, aarch64_get_vec_u32 (cpu, vs, index)); } else { - if (uimm (aarch64_get_instr (cpu), 19, 19) == 0) + if (INSTR (19, 19) == 0) HALT_UNALLOC; if (! full) HALT_UNALLOC; - index = uimm (aarch64_get_instr (cpu), 20, 20); + index = INSTR (20, 20); for (i = 0; i < 2; i++) aarch64_set_vec_u64 (cpu, vd, i, aarch64_get_vec_u64 (cpu, vs, index)); @@ -2692,16 +2880,17 @@ do_vec_TBL (sim_cpu *cpu) instr[9,5] = V start instr[4,0] = V dest */ - int full = uimm (aarch64_get_instr (cpu), 30, 30); - int len = uimm (aarch64_get_instr (cpu), 14, 13) + 1; - unsigned vm = uimm (aarch64_get_instr (cpu), 20, 16); - unsigned vn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned vd = uimm (aarch64_get_instr (cpu), 4, 0); + int full = INSTR (30, 30); + int len = INSTR (14, 13) + 1; + unsigned vm = INSTR (20, 16); + unsigned vn = INSTR (9, 5); + unsigned vd = INSTR (4, 0); unsigned i; NYI_assert (29, 21, 0x070); NYI_assert (12, 10, 0); + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); for (i = 0; i < (full ? 16 : 8); i++) { unsigned int selector = aarch64_get_vec_u8 (cpu, vm, i); @@ -2737,17 +2926,18 @@ do_vec_TRN (sim_cpu *cpu) instr[9,5] = V source instr[4,0] = V dest. */ - int full = uimm (aarch64_get_instr (cpu), 30, 30); - int second = uimm (aarch64_get_instr (cpu), 14, 14); - unsigned vm = uimm (aarch64_get_instr (cpu), 20, 16); - unsigned vn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned vd = uimm (aarch64_get_instr (cpu), 4, 0); + int full = INSTR (30, 30); + int second = INSTR (14, 14); + unsigned vm = INSTR (20, 16); + unsigned vn = INSTR (9, 5); + unsigned vd = INSTR (4, 0); unsigned i; NYI_assert (29, 24, 0x0E); NYI_assert (13, 10, 0xA); - switch (uimm (aarch64_get_instr (cpu), 23, 22)) + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); + switch (INSTR (23, 22)) { case 0: for (i = 0; i < (full ? 8 : 4); i++) @@ -2793,9 +2983,6 @@ do_vec_TRN (sim_cpu *cpu) aarch64_set_vec_u64 (cpu, vd, 1, aarch64_get_vec_u64 (cpu, second ? vn : vm, 1)); break; - - default: - HALT_UNALLOC; } } @@ -2813,14 +3000,15 @@ do_vec_DUP_scalar_into_vector (sim_cpu *cpu) instr[4,0] = V dest. */ unsigned i; - unsigned Vd = uimm (aarch64_get_instr (cpu), 4, 0); - unsigned Rs = uimm (aarch64_get_instr (cpu), 9, 5); - int both = uimm (aarch64_get_instr (cpu), 30, 30); + unsigned Vd = INSTR (4, 0); + unsigned Rs = INSTR (9, 5); + int both = INSTR (30, 30); NYI_assert (29, 20, 0x0E0); NYI_assert (15, 10, 0x03); - switch (uimm (aarch64_get_instr (cpu), 19, 16)) + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); + switch (INSTR (19, 16)) { case 1: for (i = 0; i < (both ? 16 : 8); i++) @@ -2864,56 +3052,91 @@ do_vec_UZP (sim_cpu *cpu) instr[9,5] = Vn instr[4,0] = Vd. 
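The do_vec_UZP rewrite below expresses UZP1/UZP2 as shifts and masks over the four 64-bit register halves; semantically it de-interleaves the even-numbered (UZP1) or odd-numbered (UZP2) elements of the Vn:Vm concatenation. The same operation as a plain per-element loop, byte case only, using a hypothetical helper:

/* Illustrative only: UZP over bytes; 'upper' = 1 selects UZP2.  */
static void
uzp_bytes (uint8_t *vd, const uint8_t *vn, const uint8_t *vm,
           int nelem, int upper)
{
  int i;
  for (i = 0; i < nelem; i++)
    {
      int src = 2 * i + upper;   /* Even or odd element of Vn:Vm.  */
      vd[i] = src < nelem ? vn[src] : vm[src - nelem];
    }
}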
*/ - int full = uimm (aarch64_get_instr (cpu), 30, 30); - int upper = uimm (aarch64_get_instr (cpu), 14, 14); + int full = INSTR (30, 30); + int upper = INSTR (14, 14); - unsigned vm = uimm (aarch64_get_instr (cpu), 20, 16); - unsigned vn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned vd = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned vm = INSTR (20, 16); + unsigned vn = INSTR (9, 5); + unsigned vd = INSTR (4, 0); uint64_t val_m1 = aarch64_get_vec_u64 (cpu, vm, 0); uint64_t val_m2 = aarch64_get_vec_u64 (cpu, vm, 1); uint64_t val_n1 = aarch64_get_vec_u64 (cpu, vn, 0); uint64_t val_n2 = aarch64_get_vec_u64 (cpu, vn, 1); - uint64_t val1 = 0; - uint64_t val2 = 0; + uint64_t val1; + uint64_t val2; - uint64_t input1 = upper ? val_n1 : val_m1; - uint64_t input2 = upper ? val_n2 : val_m2; - unsigned i; + uint64_t input2 = full ? val_n2 : val_m1; NYI_assert (29, 24, 0x0E); NYI_assert (21, 21, 0); NYI_assert (15, 15, 0); NYI_assert (13, 10, 6); - switch (uimm (aarch64_get_instr (cpu), 23, 23)) + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); + switch (INSTR (23, 22)) { case 0: - for (i = 0; i < 8; i++) + val1 = (val_n1 >> (upper * 8)) & 0xFFULL; + val1 |= (val_n1 >> ((upper * 8) + 8)) & 0xFF00ULL; + val1 |= (val_n1 >> ((upper * 8) + 16)) & 0xFF0000ULL; + val1 |= (val_n1 >> ((upper * 8) + 24)) & 0xFF000000ULL; + + val1 |= (input2 << (32 - (upper * 8))) & 0xFF00000000ULL; + val1 |= (input2 << (24 - (upper * 8))) & 0xFF0000000000ULL; + val1 |= (input2 << (16 - (upper * 8))) & 0xFF000000000000ULL; + val1 |= (input2 << (8 - (upper * 8))) & 0xFF00000000000000ULL; + + if (full) { - val1 |= (input1 >> (i * 8)) & (0xFFULL << (i * 8)); - val2 |= (input2 >> (i * 8)) & (0xFFULL << (i * 8)); + val2 = (val_m1 >> (upper * 8)) & 0xFFULL; + val2 |= (val_m1 >> ((upper * 8) + 8)) & 0xFF00ULL; + val2 |= (val_m1 >> ((upper * 8) + 16)) & 0xFF0000ULL; + val2 |= (val_m1 >> ((upper * 8) + 24)) & 0xFF000000ULL; + + val2 |= (val_m2 << (32 - (upper * 8))) & 0xFF00000000ULL; + val2 |= (val_m2 << (24 - (upper * 8))) & 0xFF0000000000ULL; + val2 |= (val_m2 << (16 - (upper * 8))) & 0xFF000000000000ULL; + val2 |= (val_m2 << (8 - (upper * 8))) & 0xFF00000000000000ULL; } break; case 1: - for (i = 0; i < 4; i++) + val1 = (val_n1 >> (upper * 16)) & 0xFFFFULL; + val1 |= (val_n1 >> ((upper * 16) + 16)) & 0xFFFF0000ULL; + + val1 |= (input2 << (32 - (upper * 16))) & 0xFFFF00000000ULL;; + val1 |= (input2 << (16 - (upper * 16))) & 0xFFFF000000000000ULL; + + if (full) { - val1 |= (input1 >> (i * 16)) & (0xFFFFULL << (i * 16)); - val2 |= (input2 >> (i * 16)) & (0xFFFFULL << (i * 16)); + val2 = (val_m1 >> (upper * 16)) & 0xFFFFULL; + val2 |= (val_m1 >> ((upper * 16) + 16)) & 0xFFFF0000ULL; + + val2 |= (val_m2 << (32 - (upper * 16))) & 0xFFFF00000000ULL; + val2 |= (val_m2 << (16 - (upper * 16))) & 0xFFFF000000000000ULL; } break; case 2: - val1 = ((input1 & 0xFFFFFFFF) | ((input1 >> 32) & 0xFFFFFFFF00000000ULL)); - val2 = ((input2 & 0xFFFFFFFF) | ((input2 >> 32) & 0xFFFFFFFF00000000ULL)); + val1 = (val_n1 >> (upper * 32)) & 0xFFFFFFFF; + val1 |= (input2 << (32 - (upper * 32))) & 0xFFFFFFFF00000000ULL; + + if (full) + { + val2 = (val_m1 >> (upper * 32)) & 0xFFFFFFFF; + val2 |= (val_m2 << (32 - (upper * 32))) & 0xFFFFFFFF00000000ULL; + } + break; case 3: - val1 = input1; - val2 = input2; - break; + if (! full) + HALT_UNALLOC; + + val1 = upper ? val_n2 : val_n1; + val2 = upper ? val_m2 : val_m1; + break; } aarch64_set_vec_u64 (cpu, vd, 0, val1); @@ -2936,12 +3159,12 @@ do_vec_ZIP (sim_cpu *cpu) instr[9,5] = Vn instr[4,0] = Vd. 
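do_vec_ZIP below is the inverse transform: ZIP1 (upper = 0) interleaves the low halves of Vn and Vm, ZIP2 the high halves. As a loop, byte case, with a hypothetical helper name:

/* Illustrative only: ZIP over bytes; 'upper' = 1 selects ZIP2.  */
static void
zip_bytes (uint8_t *vd, const uint8_t *vn, const uint8_t *vm,
           int nelem, int upper)
{
  int i;
  int base = upper ? nelem / 2 : 0;
  for (i = 0; i < nelem / 2; i++)
    {
      vd[2 * i]     = vn[base + i];
      vd[2 * i + 1] = vm[base + i];
    }
}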
*/ - int full = uimm (aarch64_get_instr (cpu), 30, 30); - int upper = uimm (aarch64_get_instr (cpu), 14, 14); + int full = INSTR (30, 30); + int upper = INSTR (14, 14); - unsigned vm = uimm (aarch64_get_instr (cpu), 20, 16); - unsigned vn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned vd = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned vm = INSTR (20, 16); + unsigned vn = INSTR (9, 5); + unsigned vd = INSTR (4, 0); uint64_t val_m1 = aarch64_get_vec_u64 (cpu, vm, 0); uint64_t val_m2 = aarch64_get_vec_u64 (cpu, vm, 1); @@ -2959,7 +3182,8 @@ do_vec_ZIP (sim_cpu *cpu) NYI_assert (15, 15, 0); NYI_assert (13, 10, 0xE); - switch (uimm (aarch64_get_instr (cpu), 23, 23)) + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); + switch (INSTR (23, 23)) { case 0: val1 = @@ -3111,22 +3335,22 @@ do_vec_MOV_immediate (sim_cpu *cpu) instr[9,5] = low 5-bits of uimm8 instr[4,0] = Vd. */ - int full = uimm (aarch64_get_instr (cpu), 30, 30); - unsigned vd = uimm (aarch64_get_instr (cpu), 4, 0); - unsigned val = uimm (aarch64_get_instr (cpu), 18, 16) << 5 - | uimm (aarch64_get_instr (cpu), 9, 5); + int full = INSTR (30, 30); + unsigned vd = INSTR (4, 0); + unsigned val = (INSTR (18, 16) << 5) | INSTR (9, 5); unsigned i; NYI_assert (29, 19, 0x1E0); NYI_assert (11, 10, 1); - switch (uimm (aarch64_get_instr (cpu), 15, 12)) + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); + switch (INSTR (15, 12)) { case 0x0: /* 32-bit, no shift. */ case 0x2: /* 32-bit, shift by 8. */ case 0x4: /* 32-bit, shift by 16. */ case 0x6: /* 32-bit, shift by 24. */ - val <<= (8 * uimm (aarch64_get_instr (cpu), 14, 13)); + val <<= (8 * INSTR (14, 13)); for (i = 0; i < (full ? 4 : 2); i++) aarch64_set_vec_u32 (cpu, vd, i, val); break; @@ -3137,7 +3361,8 @@ do_vec_MOV_immediate (sim_cpu *cpu) case 0x8: /* 16-bit, no shift. */ for (i = 0; i < (full ? 8 : 4); i++) aarch64_set_vec_u16 (cpu, vd, i, val); - /* Fall through. */ + break; + case 0xd: /* 32-bit, mask shift by 16. */ val <<= 8; val |= 0xFF; @@ -3179,22 +3404,22 @@ do_vec_MVNI (sim_cpu *cpu) instr[9,5] = low 5-bits of uimm8 instr[4,0] = Vd. */ - int full = uimm (aarch64_get_instr (cpu), 30, 30); - unsigned vd = uimm (aarch64_get_instr (cpu), 4, 0); - unsigned val = uimm (aarch64_get_instr (cpu), 18, 16) << 5 - | uimm (aarch64_get_instr (cpu), 9, 5); + int full = INSTR (30, 30); + unsigned vd = INSTR (4, 0); + unsigned val = (INSTR (18, 16) << 5) | INSTR (9, 5); unsigned i; NYI_assert (29, 19, 0x5E0); NYI_assert (11, 10, 1); - switch (uimm (aarch64_get_instr (cpu), 15, 12)) + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); + switch (INSTR (15, 12)) { case 0x0: /* 32-bit, no shift. */ case 0x2: /* 32-bit, shift by 8. */ case 0x4: /* 32-bit, shift by 16. */ case 0x6: /* 32-bit, shift by 24. */ - val <<= (8 * uimm (aarch64_get_instr (cpu), 14, 13)); + val <<= (8 * INSTR (14, 13)); val = ~ val; for (i = 0; i < (full ? 4 : 2); i++) aarch64_set_vec_u32 (cpu, vd, i, val); @@ -3225,9 +3450,9 @@ do_vec_MVNI (sim_cpu *cpu) for (i = 0; i < 8; i++) if (val & (1 << i)) - mask |= (0xF << (i * 4)); + mask |= (0xFFUL << (i * 8)); aarch64_set_vec_u64 (cpu, vd, 0, mask); - aarch64_set_vec_u64 (cpu, vd, 1, 0); + aarch64_set_vec_u64 (cpu, vd, 1, mask); return; } @@ -3261,15 +3486,16 @@ do_vec_ABS (sim_cpu *cpu) instr[9,5] = Vn instr[4.0] = Vd. 
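The corrected mask expansion in do_vec_MVNI above replicates each bit of the 8-bit immediate into a full byte of the 64-bit pattern; the old code widened each bit to only a nibble. In isolation, as a hypothetical helper:

/* Illustrative only: bit i of 'val' becomes byte i of the result,
   as the 64-bit MOVI/MVNI immediate forms require.  */
static uint64_t
expand_byte_mask (unsigned val)
{
  uint64_t mask = 0;
  int i;
  for (i = 0; i < 8; i++)
    if (val & (1u << i))
      mask |= 0xFFULL << (i * 8);
  return mask;
}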
*/ - unsigned vn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned vd = uimm (aarch64_get_instr (cpu), 4, 0); - unsigned full = uimm (aarch64_get_instr (cpu), 30, 30); + unsigned vn = INSTR (9, 5); + unsigned vd = INSTR (4, 0); + unsigned full = INSTR (30, 30); unsigned i; NYI_assert (29, 24, 0x0E); NYI_assert (21, 10, 0x82E); - switch (uimm (aarch64_get_instr (cpu), 23, 22)) + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); + switch (INSTR (23, 22)) { case 0: for (i = 0; i < (full ? 16 : 8); i++) @@ -3310,45 +3536,48 @@ do_vec_ADDV (sim_cpu *cpu) instr[9,5] = Vm instr[4.0] = Rd. */ - unsigned vm = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned rd = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned vm = INSTR (9, 5); + unsigned rd = INSTR (4, 0); unsigned i; - uint64_t val = 0; - int full = uimm (aarch64_get_instr (cpu), 30, 30); + int full = INSTR (30, 30); NYI_assert (29, 24, 0x0E); NYI_assert (21, 10, 0xC6E); - switch (uimm (aarch64_get_instr (cpu), 23, 22)) + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); + switch (INSTR (23, 22)) { case 0: - for (i = 0; i < (full ? 16 : 8); i++) - val += aarch64_get_vec_u8 (cpu, vm, i); - aarch64_set_reg_u64 (cpu, rd, NO_SP, val); - return; + { + uint8_t val = 0; + for (i = 0; i < (full ? 16 : 8); i++) + val += aarch64_get_vec_u8 (cpu, vm, i); + aarch64_set_vec_u64 (cpu, rd, 0, val); + return; + } case 1: - for (i = 0; i < (full ? 8 : 4); i++) - val += aarch64_get_vec_u16 (cpu, vm, i); - aarch64_set_reg_u64 (cpu, rd, NO_SP, val); - return; + { + uint16_t val = 0; + for (i = 0; i < (full ? 8 : 4); i++) + val += aarch64_get_vec_u16 (cpu, vm, i); + aarch64_set_vec_u64 (cpu, rd, 0, val); + return; + } case 2: - for (i = 0; i < (full ? 4 : 2); i++) - val += aarch64_get_vec_u32 (cpu, vm, i); - aarch64_set_reg_u64 (cpu, rd, NO_SP, val); - return; + { + uint32_t val = 0; + if (! full) + HALT_UNALLOC; + for (i = 0; i < 4; i++) + val += aarch64_get_vec_u32 (cpu, vm, i); + aarch64_set_vec_u64 (cpu, rd, 0, val); + return; + } case 3: - if (! full) - HALT_UNALLOC; - val = aarch64_get_vec_u64 (cpu, vm, 0); - val += aarch64_get_vec_u64 (cpu, vm, 1); - aarch64_set_reg_u64 (cpu, rd, NO_SP, val); - return; - - default: - HALT_UNREACHABLE; + HALT_UNALLOC; } } @@ -3364,55 +3593,71 @@ do_vec_ins_2 (sim_cpu *cpu) instr[4,0] = Vd. */ unsigned elem; - unsigned vm = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned vd = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned vm = INSTR (9, 5); + unsigned vd = INSTR (4, 0); NYI_assert (31, 21, 0x270); NYI_assert (17, 14, 0); NYI_assert (12, 10, 7); - if (uimm (aarch64_get_instr (cpu), 13, 13) == 1) + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); + if (INSTR (13, 13) == 1) { - if (uimm (aarch64_get_instr (cpu), 18, 18) == 1) + if (INSTR (18, 18) == 1) { /* 32-bit moves. */ - elem = uimm (aarch64_get_instr (cpu), 20, 19); + elem = INSTR (20, 19); aarch64_set_reg_u64 (cpu, vd, NO_SP, aarch64_get_vec_u32 (cpu, vm, elem)); } else { /* 64-bit moves. */ - if (uimm (aarch64_get_instr (cpu), 19, 19) != 1) + if (INSTR (19, 19) != 1) HALT_NYI; - elem = uimm (aarch64_get_instr (cpu), 20, 20); + elem = INSTR (20, 20); aarch64_set_reg_u64 (cpu, vd, NO_SP, aarch64_get_vec_u64 (cpu, vm, elem)); } } else { - if (uimm (aarch64_get_instr (cpu), 18, 18) == 1) + if (INSTR (18, 18) == 1) { /* 32-bit moves. */ - elem = uimm (aarch64_get_instr (cpu), 20, 19); + elem = INSTR (20, 19); aarch64_set_vec_u32 (cpu, vd, elem, aarch64_get_reg_u32 (cpu, vm, NO_SP)); } else { /* 64-bit moves. 
*/ - if (uimm (aarch64_get_instr (cpu), 19, 19) != 1) + if (INSTR (19, 19) != 1) HALT_NYI; - elem = uimm (aarch64_get_instr (cpu), 20, 20); + elem = INSTR (20, 20); aarch64_set_vec_u64 (cpu, vd, elem, aarch64_get_reg_u64 (cpu, vm, NO_SP)); } } } +#define DO_VEC_WIDENING_MUL(N, DST_TYPE, READ_TYPE, WRITE_TYPE) \ + do \ + { \ + DST_TYPE a[N], b[N]; \ + \ + for (i = 0; i < (N); i++) \ + { \ + a[i] = aarch64_get_vec_##READ_TYPE (cpu, vn, i + bias); \ + b[i] = aarch64_get_vec_##READ_TYPE (cpu, vm, i + bias); \ + } \ + for (i = 0; i < (N); i++) \ + aarch64_set_vec_##WRITE_TYPE (cpu, vd, i, a[i] * b[i]); \ + } \ + while (0) + static void do_vec_mull (sim_cpu *cpu) { @@ -3427,67 +3672,49 @@ do_vec_mull (sim_cpu *cpu) instr[9,5] = Vn instr[4.0] = Vd. */ - int unsign = uimm (aarch64_get_instr (cpu), 29, 29); - int bias = uimm (aarch64_get_instr (cpu), 30, 30); - unsigned vm = uimm (aarch64_get_instr (cpu), 20, 16); - unsigned vn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned vd = uimm (aarch64_get_instr (cpu), 4, 0); + int unsign = INSTR (29, 29); + int bias = INSTR (30, 30); + unsigned vm = INSTR (20, 16); + unsigned vn = INSTR ( 9, 5); + unsigned vd = INSTR ( 4, 0); unsigned i; NYI_assert (28, 24, 0x0E); NYI_assert (15, 10, 0x30); - switch (uimm (aarch64_get_instr (cpu), 23, 22)) + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); + /* NB: Read source values before writing results, in case + the source and destination vectors are the same. */ + switch (INSTR (23, 22)) { case 0: if (bias) bias = 8; if (unsign) - for (i = 0; i < 8; i++) - aarch64_set_vec_u16 (cpu, vd, i, - aarch64_get_vec_u8 (cpu, vn, i + bias) - * aarch64_get_vec_u8 (cpu, vm, i + bias)); + DO_VEC_WIDENING_MUL (8, uint16_t, u8, u16); else - for (i = 0; i < 8; i++) - aarch64_set_vec_s16 (cpu, vd, i, - aarch64_get_vec_s8 (cpu, vn, i + bias) - * aarch64_get_vec_s8 (cpu, vm, i + bias)); + DO_VEC_WIDENING_MUL (8, int16_t, s8, s16); return; case 1: if (bias) bias = 4; if (unsign) - for (i = 0; i < 4; i++) - aarch64_set_vec_u32 (cpu, vd, i, - aarch64_get_vec_u16 (cpu, vn, i + bias) - * aarch64_get_vec_u16 (cpu, vm, i + bias)); + DO_VEC_WIDENING_MUL (4, uint32_t, u16, u32); else - for (i = 0; i < 4; i++) - aarch64_set_vec_s32 (cpu, vd, i, - aarch64_get_vec_s16 (cpu, vn, i + bias) - * aarch64_get_vec_s16 (cpu, vm, i + bias)); + DO_VEC_WIDENING_MUL (4, int32_t, s16, s32); return; case 2: if (bias) bias = 2; if (unsign) - for (i = 0; i < 2; i++) - aarch64_set_vec_u64 (cpu, vd, i, - (uint64_t) aarch64_get_vec_u32 (cpu, vn, - i + bias) - * (uint64_t) aarch64_get_vec_u32 (cpu, vm, - i + bias)); + DO_VEC_WIDENING_MUL (2, uint64_t, u32, u64); else - for (i = 0; i < 2; i++) - aarch64_set_vec_s64 (cpu, vd, i, - aarch64_get_vec_s32 (cpu, vn, i + bias) - * aarch64_get_vec_s32 (cpu, vm, i + bias)); + DO_VEC_WIDENING_MUL (2, int64_t, s32, s64); return; case 3: - default: HALT_NYI; } } @@ -3506,19 +3733,20 @@ do_vec_fadd (sim_cpu *cpu) instr[9,5] = Vn instr[4.0] = Vd. 
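The DO_VEC_WIDENING_MUL macro above buffers both source vectors in locals before writing any result lane: a widening multiply may name the same vector register as source and destination, and writing lane 0 at double width would clobber source lanes not yet read. The hazard in miniature, as a hypothetical helper:

/* Illustrative only: buffering inputs first keeps the loop correct
   even when the destination overlays a source.  */
static void
widen_mul_u8 (uint16_t *dst, const uint8_t *a_src, const uint8_t *b_src)
{
  uint16_t a[8], b[8];
  int i;
  for (i = 0; i < 8; i++)
    {
      a[i] = a_src[i];
      b[i] = b_src[i];
    }
  for (i = 0; i < 8; i++)
    dst[i] = (uint16_t) (a[i] * b[i]);
}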
*/ - unsigned vm = uimm (aarch64_get_instr (cpu), 20, 16); - unsigned vn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned vd = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned vm = INSTR (20, 16); + unsigned vn = INSTR (9, 5); + unsigned vd = INSTR (4, 0); unsigned i; - int full = uimm (aarch64_get_instr (cpu), 30, 30); + int full = INSTR (30, 30); NYI_assert (29, 24, 0x0E); NYI_assert (21, 21, 1); NYI_assert (15, 10, 0x35); - if (uimm (aarch64_get_instr (cpu), 23, 23)) + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); + if (INSTR (23, 23)) { - if (uimm (aarch64_get_instr (cpu), 22, 22)) + if (INSTR (22, 22)) { if (! full) HALT_NYI; @@ -3538,7 +3766,7 @@ do_vec_fadd (sim_cpu *cpu) } else { - if (uimm (aarch64_get_instr (cpu), 22, 22)) + if (INSTR (22, 22)) { if (! full) HALT_NYI; @@ -3571,17 +3799,18 @@ do_vec_add (sim_cpu *cpu) instr[9,5] = Vm instr[4.0] = Vd. */ - unsigned vm = uimm (aarch64_get_instr (cpu), 20, 16); - unsigned vn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned vd = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned vm = INSTR (20, 16); + unsigned vn = INSTR (9, 5); + unsigned vd = INSTR (4, 0); unsigned i; - int full = uimm (aarch64_get_instr (cpu), 30, 30); + int full = INSTR (30, 30); NYI_assert (29, 24, 0x0E); NYI_assert (21, 21, 1); NYI_assert (15, 10, 0x21); - switch (uimm (aarch64_get_instr (cpu), 23, 22)) + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); + switch (INSTR (23, 22)) { case 0: for (i = 0; i < (full ? 16 : 8); i++) @@ -3610,9 +3839,6 @@ do_vec_add (sim_cpu *cpu) aarch64_get_vec_u64 (cpu, vn, 1) + aarch64_get_vec_u64 (cpu, vm, 1)); return; - - default: - HALT_UNREACHABLE; } } @@ -3629,49 +3855,32 @@ do_vec_mul (sim_cpu *cpu) instr[9,5] = Vm instr[4.0] = Vd. */ - unsigned vm = uimm (aarch64_get_instr (cpu), 20, 16); - unsigned vn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned vd = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned vm = INSTR (20, 16); + unsigned vn = INSTR (9, 5); + unsigned vd = INSTR (4, 0); unsigned i; - int full = uimm (aarch64_get_instr (cpu), 30, 30); + int full = INSTR (30, 30); + int bias = 0; NYI_assert (29, 24, 0x0E); NYI_assert (21, 21, 1); NYI_assert (15, 10, 0x27); - switch (uimm (aarch64_get_instr (cpu), 23, 22)) + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); + switch (INSTR (23, 22)) { case 0: - for (i = 0; i < (full ? 16 : 8); i++) - { - uint16_t val = aarch64_get_vec_u8 (cpu, vn, i); - val *= aarch64_get_vec_u8 (cpu, vm, i); - - aarch64_set_vec_u16 (cpu, vd, i, val); - } + DO_VEC_WIDENING_MUL (full ? 16 : 8, uint8_t, u8, u8); return; case 1: - for (i = 0; i < (full ? 8 : 4); i++) - { - uint32_t val = aarch64_get_vec_u16 (cpu, vn, i); - val *= aarch64_get_vec_u16 (cpu, vm, i); - - aarch64_set_vec_u32 (cpu, vd, i, val); - } + DO_VEC_WIDENING_MUL (full ? 8 : 4, uint16_t, u16, u16); return; case 2: - for (i = 0; i < (full ? 4 : 2); i++) - { - uint64_t val = aarch64_get_vec_u32 (cpu, vn, i); - val *= aarch64_get_vec_u32 (cpu, vm, i); - - aarch64_set_vec_u64 (cpu, vd, i, val); - } + DO_VEC_WIDENING_MUL (full ? 4 : 2, uint32_t, u32, u32); return; - default: case 3: HALT_UNALLOC; } @@ -3690,53 +3899,44 @@ do_vec_MLA (sim_cpu *cpu) instr[9,5] = Vm instr[4.0] = Vd. 
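The reworked do_vec_MLA below is a per-lane multiply-accumulate at the element width selected by instr[23,22]: Vd[i] = Vd[i] + Vn[i] * Vm[i], with the product truncated to the lane width rather than widened as the old code did. The byte case as a loop, hypothetical helper:

/* Illustrative only: MLA over bytes; arithmetic wraps modulo 256.  */
static void
mla_u8 (uint8_t *vd, const uint8_t *vn, const uint8_t *vm, int nelem)
{
  int i;
  for (i = 0; i < nelem; i++)
    vd[i] = (uint8_t) (vd[i] + vn[i] * vm[i]);
}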
*/ - unsigned vm = uimm (aarch64_get_instr (cpu), 20, 16); - unsigned vn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned vd = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned vm = INSTR (20, 16); + unsigned vn = INSTR (9, 5); + unsigned vd = INSTR (4, 0); unsigned i; - int full = uimm (aarch64_get_instr (cpu), 30, 30); + int full = INSTR (30, 30); NYI_assert (29, 24, 0x0E); NYI_assert (21, 21, 1); NYI_assert (15, 10, 0x25); - switch (uimm (aarch64_get_instr (cpu), 23, 22)) + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); + switch (INSTR (23, 22)) { case 0: for (i = 0; i < (full ? 16 : 8); i++) - { - uint16_t val = aarch64_get_vec_u8 (cpu, vn, i); - val *= aarch64_get_vec_u8 (cpu, vm, i); - val += aarch64_get_vec_u8 (cpu, vd, i); - - aarch64_set_vec_u16 (cpu, vd, i, val); - } + aarch64_set_vec_u8 (cpu, vd, i, + aarch64_get_vec_u8 (cpu, vd, i) + + (aarch64_get_vec_u8 (cpu, vn, i) + * aarch64_get_vec_u8 (cpu, vm, i))); return; case 1: for (i = 0; i < (full ? 8 : 4); i++) - { - uint32_t val = aarch64_get_vec_u16 (cpu, vn, i); - val *= aarch64_get_vec_u16 (cpu, vm, i); - val += aarch64_get_vec_u16 (cpu, vd, i); - - aarch64_set_vec_u32 (cpu, vd, i, val); - } + aarch64_set_vec_u16 (cpu, vd, i, + aarch64_get_vec_u16 (cpu, vd, i) + + (aarch64_get_vec_u16 (cpu, vn, i) + * aarch64_get_vec_u16 (cpu, vm, i))); return; case 2: for (i = 0; i < (full ? 4 : 2); i++) - { - uint64_t val = aarch64_get_vec_u32 (cpu, vn, i); - val *= aarch64_get_vec_u32 (cpu, vm, i); - val += aarch64_get_vec_u32 (cpu, vd, i); - - aarch64_set_vec_u64 (cpu, vd, i, val); - } + aarch64_set_vec_u32 (cpu, vd, i, + aarch64_get_vec_u32 (cpu, vd, i) + + (aarch64_get_vec_u32 (cpu, vn, i) + * aarch64_get_vec_u32 (cpu, vm, i))); return; default: - case 3: HALT_UNALLOC; } } @@ -3744,13 +3944,13 @@ do_vec_MLA (sim_cpu *cpu) static float fmaxnm (float a, float b) { - if (fpclassify (a) == FP_NORMAL) + if (! isnan (a)) { - if (fpclassify (b) == FP_NORMAL) + if (! isnan (b)) return a > b ? a : b; return a; } - else if (fpclassify (b) == FP_NORMAL) + else if (! isnan (b)) return b; return a; } @@ -3758,13 +3958,13 @@ fmaxnm (float a, float b) static float fminnm (float a, float b) { - if (fpclassify (a) == FP_NORMAL) + if (! isnan (a)) { - if (fpclassify (b) == FP_NORMAL) + if (! isnan (b)) return a < b ? a : b; return a; } - else if (fpclassify (b) == FP_NORMAL) + else if (! isnan (b)) return b; return a; } @@ -3772,13 +3972,13 @@ fminnm (float a, float b) static double dmaxnm (double a, double b) { - if (fpclassify (a) == FP_NORMAL) + if (! isnan (a)) { - if (fpclassify (b) == FP_NORMAL) + if (! isnan (b)) return a > b ? a : b; return a; } - else if (fpclassify (b) == FP_NORMAL) + else if (! isnan (b)) return b; return a; } @@ -3786,13 +3986,13 @@ dmaxnm (double a, double b) static double dminnm (double a, double b) { - if (fpclassify (a) == FP_NORMAL) + if (! isnan (a)) { - if (fpclassify (b) == FP_NORMAL) + if (! isnan (b)) return a < b ? a : b; return a; } - else if (fpclassify (b) == FP_NORMAL) + else if (! 
isnan (b)) return b; return a; } @@ -3800,29 +4000,30 @@ dminnm (double a, double b) static void do_vec_FminmaxNMP (sim_cpu *cpu) { - /* aarch64_get_instr (cpu)[31] = 0 - aarch64_get_instr (cpu)[30] = half (0)/full (1) - aarch64_get_instr (cpu)[29,24] = 10 1110 - aarch64_get_instr (cpu)[23] = max(0)/min(1) - aarch64_get_instr (cpu)[22] = float (0)/double (1) - aarch64_get_instr (cpu)[21] = 1 - aarch64_get_instr (cpu)[20,16] = Vn - aarch64_get_instr (cpu)[15,10] = 1100 01 - aarch64_get_instr (cpu)[9,5] = Vm - aarch64_get_instr (cpu)[4.0] = Vd. */ - - unsigned vm = uimm (aarch64_get_instr (cpu), 20, 16); - unsigned vn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned vd = uimm (aarch64_get_instr (cpu), 4, 0); - int full = uimm (aarch64_get_instr (cpu), 30, 30); + /* instr [31] = 0 + instr [30] = half (0)/full (1) + instr [29,24] = 10 1110 + instr [23] = max(0)/min(1) + instr [22] = float (0)/double (1) + instr [21] = 1 + instr [20,16] = Vn + instr [15,10] = 1100 01 + instr [9,5] = Vm + instr [4.0] = Vd. */ + + unsigned vm = INSTR (20, 16); + unsigned vn = INSTR (9, 5); + unsigned vd = INSTR (4, 0); + int full = INSTR (30, 30); NYI_assert (29, 24, 0x2E); NYI_assert (21, 21, 1); NYI_assert (15, 10, 0x31); - if (uimm (aarch64_get_instr (cpu), 22, 22)) + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); + if (INSTR (22, 22)) { - double (* fn)(double, double) = uimm (aarch64_get_instr (cpu), 23, 23) + double (* fn)(double, double) = INSTR (23, 23) ? dminnm : dmaxnm; if (! full) @@ -3836,7 +4037,7 @@ do_vec_FminmaxNMP (sim_cpu *cpu) } else { - float (* fn)(float, float) = uimm (aarch64_get_instr (cpu), 23, 23) + float (* fn)(float, float) = INSTR (23, 23) ? fminnm : fmaxnm; aarch64_set_vec_float (cpu, vd, 0, @@ -3868,15 +4069,16 @@ do_vec_AND (sim_cpu *cpu) instr[9,5] = Vn instr[4.0] = Vd. */ - unsigned vm = uimm (aarch64_get_instr (cpu), 20, 16); - unsigned vn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned vd = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned vm = INSTR (20, 16); + unsigned vn = INSTR (9, 5); + unsigned vd = INSTR (4, 0); unsigned i; - int full = uimm (aarch64_get_instr (cpu), 30, 30); + int full = INSTR (30, 30); NYI_assert (29, 21, 0x071); NYI_assert (15, 10, 0x07); + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); for (i = 0; i < (full ? 4 : 2); i++) aarch64_set_vec_u32 (cpu, vd, i, aarch64_get_vec_u32 (cpu, vn, i) @@ -3894,15 +4096,16 @@ do_vec_BSL (sim_cpu *cpu) instr[9,5] = Vn instr[4.0] = Vd. */ - unsigned vm = uimm (aarch64_get_instr (cpu), 20, 16); - unsigned vn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned vd = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned vm = INSTR (20, 16); + unsigned vn = INSTR (9, 5); + unsigned vd = INSTR (4, 0); unsigned i; - int full = uimm (aarch64_get_instr (cpu), 30, 30); + int full = INSTR (30, 30); NYI_assert (29, 21, 0x173); NYI_assert (15, 10, 0x07); + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); for (i = 0; i < (full ? 16 : 8); i++) aarch64_set_vec_u8 (cpu, vd, i, ( aarch64_get_vec_u8 (cpu, vd, i) @@ -3922,15 +4125,16 @@ do_vec_EOR (sim_cpu *cpu) instr[9,5] = Vn instr[4.0] = Vd. 
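The switch from "fpclassify (x) == FP_NORMAL" to "! isnan (x)" in the fmaxnm/fminnm helpers above matters because FP_NORMAL also excludes zeros, subnormals and infinities, all of which FMAXNM/FMINNM must compare normally; only NaN operands get the "return the other value" treatment. The difference at a glance:

#include <math.h>
/* Illustrative only: 0.0f fails the old test but is a perfectly
   ordinary operand for FMAXNM.  */
static int old_is_number (float x) { return fpclassify (x) == FP_NORMAL; }
static int new_is_number (float x) { return ! isnan (x); }
/* old_is_number (0.0f) == 0, but new_is_number (0.0f) == 1.  */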
*/ - unsigned vm = uimm (aarch64_get_instr (cpu), 20, 16); - unsigned vn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned vd = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned vm = INSTR (20, 16); + unsigned vn = INSTR (9, 5); + unsigned vd = INSTR (4, 0); unsigned i; - int full = uimm (aarch64_get_instr (cpu), 30, 30); + int full = INSTR (30, 30); NYI_assert (29, 21, 0x171); NYI_assert (15, 10, 0x07); + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); for (i = 0; i < (full ? 4 : 2); i++) aarch64_set_vec_u32 (cpu, vd, i, aarch64_get_vec_u32 (cpu, vn, i) @@ -3950,28 +4154,29 @@ do_vec_bit (sim_cpu *cpu) instr[9,5] = Vn instr[4.0] = Vd. */ - unsigned vm = uimm (aarch64_get_instr (cpu), 20, 16); - unsigned vn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned vd = uimm (aarch64_get_instr (cpu), 4, 0); - unsigned full = uimm (aarch64_get_instr (cpu), 30, 30); - unsigned test_false = uimm (aarch64_get_instr (cpu), 22, 22); + unsigned vm = INSTR (20, 16); + unsigned vn = INSTR (9, 5); + unsigned vd = INSTR (4, 0); + unsigned full = INSTR (30, 30); + unsigned test_false = INSTR (22, 22); unsigned i; NYI_assert (29, 23, 0x5D); NYI_assert (21, 21, 1); NYI_assert (15, 10, 0x07); - if (test_false) - { - for (i = 0; i < (full ? 16 : 8); i++) - if (aarch64_get_vec_u32 (cpu, vn, i) == 0) - aarch64_set_vec_u32 (cpu, vd, i, aarch64_get_vec_u32 (cpu, vm, i)); - } - else + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); + for (i = 0; i < (full ? 4 : 2); i++) { - for (i = 0; i < (full ? 16 : 8); i++) - if (aarch64_get_vec_u32 (cpu, vn, i) != 0) - aarch64_set_vec_u32 (cpu, vd, i, aarch64_get_vec_u32 (cpu, vm, i)); + uint32_t vd_val = aarch64_get_vec_u32 (cpu, vd, i); + uint32_t vn_val = aarch64_get_vec_u32 (cpu, vn, i); + uint32_t vm_val = aarch64_get_vec_u32 (cpu, vm, i); + if (test_false) + aarch64_set_vec_u32 (cpu, vd, i, + (vd_val & vm_val) | (vn_val & ~vm_val)); + else + aarch64_set_vec_u32 (cpu, vd, i, + (vd_val & ~vm_val) | (vn_val & vm_val)); } } @@ -3986,15 +4191,16 @@ do_vec_ORN (sim_cpu *cpu) instr[9,5] = Vn instr[4.0] = Vd. */ - unsigned vm = uimm (aarch64_get_instr (cpu), 20, 16); - unsigned vn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned vd = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned vm = INSTR (20, 16); + unsigned vn = INSTR (9, 5); + unsigned vd = INSTR (4, 0); unsigned i; - int full = uimm (aarch64_get_instr (cpu), 30, 30); + int full = INSTR (30, 30); NYI_assert (29, 21, 0x077); NYI_assert (15, 10, 0x07); + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); for (i = 0; i < (full ? 16 : 8); i++) aarch64_set_vec_u8 (cpu, vd, i, aarch64_get_vec_u8 (cpu, vn, i) @@ -4012,15 +4218,16 @@ do_vec_ORR (sim_cpu *cpu) instr[9,5] = Vn instr[4.0] = Vd. */ - unsigned vm = uimm (aarch64_get_instr (cpu), 20, 16); - unsigned vn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned vd = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned vm = INSTR (20, 16); + unsigned vn = INSTR (9, 5); + unsigned vd = INSTR (4, 0); unsigned i; - int full = uimm (aarch64_get_instr (cpu), 30, 30); + int full = INSTR (30, 30); NYI_assert (29, 21, 0x075); NYI_assert (15, 10, 0x07); + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); for (i = 0; i < (full ? 16 : 8); i++) aarch64_set_vec_u8 (cpu, vd, i, aarch64_get_vec_u8 (cpu, vn, i) @@ -4038,15 +4245,16 @@ do_vec_BIC (sim_cpu *cpu) instr[9,5] = Vn instr[4.0] = Vd. 
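The rewritten do_vec_bit above implements BIT and BIF as genuine bitwise selects rather than the old whole-element zero tests: BIT copies Vn bits where the corresponding Vm bit is set, BIF where it is clear, and untouched bits keep their Vd value. One 32-bit word of it, as a hypothetical helper:

/* Illustrative only: the per-word select used by the patched loop.  */
static uint32_t
bit_select (uint32_t vd, uint32_t vn, uint32_t vm, int test_false)
{
  if (test_false)
    return (vd & vm) | (vn & ~vm);    /* BIF.  */
  return (vd & ~vm) | (vn & vm);      /* BIT.  */
}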
*/ - unsigned vm = uimm (aarch64_get_instr (cpu), 20, 16); - unsigned vn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned vd = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned vm = INSTR (20, 16); + unsigned vn = INSTR (9, 5); + unsigned vd = INSTR (4, 0); unsigned i; - int full = uimm (aarch64_get_instr (cpu), 30, 30); + int full = INSTR (30, 30); NYI_assert (29, 21, 0x073); NYI_assert (15, 10, 0x07); + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); for (i = 0; i < (full ? 16 : 8); i++) aarch64_set_vec_u8 (cpu, vd, i, aarch64_get_vec_u8 (cpu, vn, i) @@ -4064,53 +4272,86 @@ do_vec_XTN (sim_cpu *cpu) instr[9,5] = Vs instr[4,0] = Vd. */ - unsigned vs = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned vd = uimm (aarch64_get_instr (cpu), 4, 0); - unsigned bias = uimm (aarch64_get_instr (cpu), 30, 30); + unsigned vs = INSTR (9, 5); + unsigned vd = INSTR (4, 0); + unsigned bias = INSTR (30, 30); unsigned i; NYI_assert (29, 24, 0x0E); NYI_assert (21, 10, 0x84A); - switch (uimm (aarch64_get_instr (cpu), 23, 22)) + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); + switch (INSTR (23, 22)) { case 0: - if (bias) - for (i = 0; i < 8; i++) - aarch64_set_vec_u8 (cpu, vd, i + 8, - aarch64_get_vec_u16 (cpu, vs, i) >> 8); - else - for (i = 0; i < 8; i++) - aarch64_set_vec_u8 (cpu, vd, i, aarch64_get_vec_u16 (cpu, vs, i)); + for (i = 0; i < 8; i++) + aarch64_set_vec_u8 (cpu, vd, i + (bias * 8), + aarch64_get_vec_u16 (cpu, vs, i)); return; case 1: - if (bias) - for (i = 0; i < 4; i++) - aarch64_set_vec_u16 (cpu, vd, i + 4, - aarch64_get_vec_u32 (cpu, vs, i) >> 16); - else - for (i = 0; i < 4; i++) - aarch64_set_vec_u16 (cpu, vd, i, aarch64_get_vec_u32 (cpu, vs, i)); + for (i = 0; i < 4; i++) + aarch64_set_vec_u16 (cpu, vd, i + (bias * 4), + aarch64_get_vec_u32 (cpu, vs, i)); return; case 2: - if (bias) - for (i = 0; i < 2; i++) - aarch64_set_vec_u32 (cpu, vd, i + 4, - aarch64_get_vec_u64 (cpu, vs, i) >> 32); - else - for (i = 0; i < 2; i++) - aarch64_set_vec_u32 (cpu, vd, i, aarch64_get_vec_u64 (cpu, vs, i)); + for (i = 0; i < 2; i++) + aarch64_set_vec_u32 (cpu, vd, i + (bias * 2), + aarch64_get_vec_u64 (cpu, vs, i)); return; - - default: - HALT_UNALLOC; } } -#define MAX(A,B) ((A) > (B) ? (A) : (B)) -#define MIN(A,B) ((A) < (B) ? (A) : (B)) +/* Return the number of bits set in the input value. */ +#if __GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 4) +# define popcount __builtin_popcount +#else +static int +popcount (unsigned char x) +{ + static const unsigned char popcnt[16] = + { + 0, 1, 1, 2, + 1, 2, 2, 3, + 1, 2, 2, 3, + 2, 3, 3, 4 + }; + + /* Only counts the low 8 bits of the input as that is all we need. */ + return popcnt[x % 16] + popcnt[x / 16]; +} +#endif + +static void +do_vec_CNT (sim_cpu *cpu) +{ + /* instr[31] = 0 + instr[30] = half (0)/ full (1) + instr[29,24] = 00 1110 + instr[23,22] = size: byte(00) + instr[21,10] = 1000 0001 0110 + instr[9,5] = Vs + instr[4,0] = Vd. */ + + unsigned vs = INSTR (9, 5); + unsigned vd = INSTR (4, 0); + int full = INSTR (30, 30); + int size = INSTR (23, 22); + int i; + + NYI_assert (29, 24, 0x0E); + NYI_assert (21, 10, 0x816); + + if (size != 0) + HALT_UNALLOC; + + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); + + for (i = 0; i < (full ? 16 : 8); i++) + aarch64_set_vec_u8 (cpu, vd, i, + popcount (aarch64_get_vec_u8 (cpu, vs, i))); +} static void do_vec_maxv (sim_cpu *cpu) @@ -4127,9 +4368,9 @@ do_vec_maxv (sim_cpu *cpu) instr[9,5] = V source instr[4.0] = R dest. 
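The popcount fallback above only ever sees byte operands: it splits the value into two nibbles and sums the 16-entry table lookups, and do_vec_CNT applies it to every byte lane. A few sanity values, as a hypothetical self-check:

#include <assert.h>
/* Illustrative only: expected results from the nibble-table fallback.  */
static void
check_popcount (void)
{
  assert (popcount (0x00) == 0);
  assert (popcount (0x0F) == 4);
  assert (popcount (0xB5) == 5);   /* 1011 0101 -> 3 + 2 bits set.  */
  assert (popcount (0xFF) == 8);
}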
*/ - unsigned vs = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned rd = uimm (aarch64_get_instr (cpu), 4, 0); - unsigned full = uimm (aarch64_get_instr (cpu), 30, 30); + unsigned vs = INSTR (9, 5); + unsigned rd = INSTR (4, 0); + unsigned full = INSTR (30, 30); unsigned i; NYI_assert (28, 24, 0x0E); @@ -4137,30 +4378,29 @@ do_vec_maxv (sim_cpu *cpu) NYI_assert (20, 17, 8); NYI_assert (15, 10, 0x2A); - switch ((uimm (aarch64_get_instr (cpu), 29, 29) << 1) - | uimm (aarch64_get_instr (cpu), 16, 16)) + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); + switch ((INSTR (29, 29) << 1) | INSTR (16, 16)) { case 0: /* SMAXV. */ { int64_t smax; - switch (uimm (aarch64_get_instr (cpu), 23, 22)) + switch (INSTR (23, 22)) { case 0: smax = aarch64_get_vec_s8 (cpu, vs, 0); for (i = 1; i < (full ? 16 : 8); i++) - smax = MAX (smax, aarch64_get_vec_s8 (cpu, vs, i)); + smax = max (smax, aarch64_get_vec_s8 (cpu, vs, i)); break; case 1: smax = aarch64_get_vec_s16 (cpu, vs, 0); for (i = 1; i < (full ? 8 : 4); i++) - smax = MAX (smax, aarch64_get_vec_s16 (cpu, vs, i)); + smax = max (smax, aarch64_get_vec_s16 (cpu, vs, i)); break; case 2: smax = aarch64_get_vec_s32 (cpu, vs, 0); for (i = 1; i < (full ? 4 : 2); i++) - smax = MAX (smax, aarch64_get_vec_s32 (cpu, vs, i)); + smax = max (smax, aarch64_get_vec_s32 (cpu, vs, i)); break; - default: case 3: HALT_UNALLOC; } @@ -4171,24 +4411,24 @@ do_vec_maxv (sim_cpu *cpu) case 1: /* SMINV. */ { int64_t smin; - switch (uimm (aarch64_get_instr (cpu), 23, 22)) + switch (INSTR (23, 22)) { case 0: smin = aarch64_get_vec_s8 (cpu, vs, 0); for (i = 1; i < (full ? 16 : 8); i++) - smin = MIN (smin, aarch64_get_vec_s8 (cpu, vs, i)); + smin = min (smin, aarch64_get_vec_s8 (cpu, vs, i)); break; case 1: smin = aarch64_get_vec_s16 (cpu, vs, 0); for (i = 1; i < (full ? 8 : 4); i++) - smin = MIN (smin, aarch64_get_vec_s16 (cpu, vs, i)); + smin = min (smin, aarch64_get_vec_s16 (cpu, vs, i)); break; case 2: smin = aarch64_get_vec_s32 (cpu, vs, 0); for (i = 1; i < (full ? 4 : 2); i++) - smin = MIN (smin, aarch64_get_vec_s32 (cpu, vs, i)); + smin = min (smin, aarch64_get_vec_s32 (cpu, vs, i)); break; - default: + case 3: HALT_UNALLOC; } @@ -4199,24 +4439,24 @@ do_vec_maxv (sim_cpu *cpu) case 2: /* UMAXV. */ { uint64_t umax; - switch (uimm (aarch64_get_instr (cpu), 23, 22)) + switch (INSTR (23, 22)) { case 0: umax = aarch64_get_vec_u8 (cpu, vs, 0); for (i = 1; i < (full ? 16 : 8); i++) - umax = MAX (umax, aarch64_get_vec_u8 (cpu, vs, i)); + umax = max (umax, aarch64_get_vec_u8 (cpu, vs, i)); break; case 1: umax = aarch64_get_vec_u16 (cpu, vs, 0); for (i = 1; i < (full ? 8 : 4); i++) - umax = MAX (umax, aarch64_get_vec_u16 (cpu, vs, i)); + umax = max (umax, aarch64_get_vec_u16 (cpu, vs, i)); break; case 2: umax = aarch64_get_vec_u32 (cpu, vs, 0); for (i = 1; i < (full ? 4 : 2); i++) - umax = MAX (umax, aarch64_get_vec_u32 (cpu, vs, i)); + umax = max (umax, aarch64_get_vec_u32 (cpu, vs, i)); break; - default: + case 3: HALT_UNALLOC; } @@ -4227,33 +4467,30 @@ do_vec_maxv (sim_cpu *cpu) case 3: /* UMINV. */ { uint64_t umin; - switch (uimm (aarch64_get_instr (cpu), 23, 22)) + switch (INSTR (23, 22)) { case 0: umin = aarch64_get_vec_u8 (cpu, vs, 0); for (i = 1; i < (full ? 16 : 8); i++) - umin = MIN (umin, aarch64_get_vec_u8 (cpu, vs, i)); + umin = min (umin, aarch64_get_vec_u8 (cpu, vs, i)); break; case 1: umin = aarch64_get_vec_u16 (cpu, vs, 0); for (i = 1; i < (full ? 
8 : 4); i++) - umin = MIN (umin, aarch64_get_vec_u16 (cpu, vs, i)); + umin = min (umin, aarch64_get_vec_u16 (cpu, vs, i)); break; case 2: umin = aarch64_get_vec_u32 (cpu, vs, 0); for (i = 1; i < (full ? 4 : 2); i++) - umin = MIN (umin, aarch64_get_vec_u32 (cpu, vs, i)); + umin = min (umin, aarch64_get_vec_u32 (cpu, vs, i)); break; - default: + case 3: HALT_UNALLOC; } aarch64_set_reg_u64 (cpu, rd, NO_SP, umin); return; } - - default: - HALT_UNALLOC; } } @@ -4268,8 +4505,8 @@ do_vec_fminmaxV (sim_cpu *cpu) instr[9,5] = V source instr[4.0] = R dest. */ - unsigned vs = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned rd = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned vs = INSTR (9, 5); + unsigned rd = INSTR (4, 0); unsigned i; float res = aarch64_get_vec_float (cpu, vs, 0); @@ -4277,9 +4514,10 @@ do_vec_fminmaxV (sim_cpu *cpu) NYI_assert (22, 14, 0x0C3); NYI_assert (11, 10, 2); - if (uimm (aarch64_get_instr (cpu), 23, 23)) + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); + if (INSTR (23, 23)) { - switch (uimm (aarch64_get_instr (cpu), 13, 12)) + switch (INSTR (13, 12)) { case 0: /* FMNINNMV. */ for (i = 1; i < 4; i++) @@ -4288,7 +4526,7 @@ do_vec_fminmaxV (sim_cpu *cpu) case 3: /* FMINV. */ for (i = 1; i < 4; i++) - res = MIN (res, aarch64_get_vec_float (cpu, vs, i)); + res = min (res, aarch64_get_vec_float (cpu, vs, i)); break; default: @@ -4297,7 +4535,7 @@ do_vec_fminmaxV (sim_cpu *cpu) } else { - switch (uimm (aarch64_get_instr (cpu), 13, 12)) + switch (INSTR (13, 12)) { case 0: /* FMNAXNMV. */ for (i = 1; i < 4; i++) @@ -4306,7 +4544,7 @@ do_vec_fminmaxV (sim_cpu *cpu) case 3: /* FMAXV. */ for (i = 1; i < 4; i++) - res = MAX (res, aarch64_get_vec_float (cpu, vs, i)); + res = max (res, aarch64_get_vec_float (cpu, vs, i)); break; default: @@ -4333,11 +4571,11 @@ do_vec_Fminmax (sim_cpu *cpu) instr[9,5] = Vn instr[4,0] = Vd. */ - unsigned vm = uimm (aarch64_get_instr (cpu), 20, 16); - unsigned vn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned vd = uimm (aarch64_get_instr (cpu), 4, 0); - unsigned full = uimm (aarch64_get_instr (cpu), 30, 30); - unsigned min = uimm (aarch64_get_instr (cpu), 23, 23); + unsigned vm = INSTR (20, 16); + unsigned vn = INSTR (9, 5); + unsigned vd = INSTR (4, 0); + unsigned full = INSTR (30, 30); + unsigned min = INSTR (23, 23); unsigned i; NYI_assert (29, 24, 0x0E); @@ -4345,16 +4583,17 @@ do_vec_Fminmax (sim_cpu *cpu) NYI_assert (15, 14, 3); NYI_assert (11, 10, 1); - if (uimm (aarch64_get_instr (cpu), 22, 22)) + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); + if (INSTR (22, 22)) { double (* func)(double, double); if (! full) HALT_NYI; - if (uimm (aarch64_get_instr (cpu), 13, 12) == 0) + if (INSTR (13, 12) == 0) func = min ? dminnm : dmaxnm; - else if (uimm (aarch64_get_instr (cpu), 13, 12) == 3) + else if (INSTR (13, 12) == 3) func = min ? fmin : fmax; else HALT_NYI; @@ -4368,9 +4607,9 @@ do_vec_Fminmax (sim_cpu *cpu) { float (* func)(float, float); - if (uimm (aarch64_get_instr (cpu), 13, 12) == 0) + if (INSTR (13, 12) == 0) func = min ? fminnm : fmaxnm; - else if (uimm (aarch64_get_instr (cpu), 13, 12) == 3) + else if (INSTR (13, 12) == 3) func = min ? fminf : fmaxf; else HALT_NYI; @@ -4393,15 +4632,16 @@ do_vec_SCVTF (sim_cpu *cpu) instr[9,5] = Vn instr[4,0] = Vd. 
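do_vec_SCVTF below converts each signed integer lane to the floating-point value of the same width, with instr[22] selecting 32- or 64-bit lanes and instr[30] the vector length. One lane size as a loop, hypothetical helper:

/* Illustrative only: SCVTF over four 32-bit lanes.  */
static void
scvtf_4s (float *vd, const int32_t *vn)
{
  int i;
  for (i = 0; i < 4; i++)
    vd[i] = (float) vn[i];
}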
*/ - unsigned vn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned vd = uimm (aarch64_get_instr (cpu), 4, 0); - unsigned full = uimm (aarch64_get_instr (cpu), 30, 30); - unsigned size = uimm (aarch64_get_instr (cpu), 22, 22); + unsigned vn = INSTR (9, 5); + unsigned vd = INSTR (4, 0); + unsigned full = INSTR (30, 30); + unsigned size = INSTR (22, 22); unsigned i; NYI_assert (29, 23, 0x1C); NYI_assert (21, 10, 0x876); + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); if (size) { if (! full) @@ -4462,8 +4702,6 @@ do_vec_SCVTF (sim_cpu *cpu) aarch64_get_vec_##SOURCE##64 (cpu, vm, i) \ ? -1ULL : 0); \ return; \ - default: \ - HALT_UNALLOC; \ } \ } \ while (0) @@ -4499,8 +4737,6 @@ do_vec_SCVTF (sim_cpu *cpu) aarch64_get_vec_##SOURCE##64 (cpu, vn, i) \ CMP 0 ? -1ULL : 0); \ return; \ - default: \ - HALT_UNALLOC; \ } \ } \ while (0) @@ -4510,7 +4746,7 @@ do_vec_SCVTF (sim_cpu *cpu) { \ if (vm != 0) \ HALT_NYI; \ - if (uimm (aarch64_get_instr (cpu), 22, 22)) \ + if (INSTR (22, 22)) \ { \ if (! full) \ HALT_NYI; \ @@ -4533,7 +4769,7 @@ do_vec_SCVTF (sim_cpu *cpu) #define VEC_FCMP(CMP) \ do \ { \ - if (uimm (aarch64_get_instr (cpu), 22, 22)) \ + if (INSTR (22, 22)) \ { \ if (! full) \ HALT_NYI; \ @@ -4572,31 +4808,32 @@ do_vec_compare (sim_cpu *cpu) instr[9,5] = Vn instr[4.0] = Vd. */ - int full = uimm (aarch64_get_instr (cpu), 30, 30); - int size = uimm (aarch64_get_instr (cpu), 23, 22); - unsigned vm = uimm (aarch64_get_instr (cpu), 20, 16); - unsigned vn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned vd = uimm (aarch64_get_instr (cpu), 4, 0); + int full = INSTR (30, 30); + int size = INSTR (23, 22); + unsigned vm = INSTR (20, 16); + unsigned vn = INSTR (9, 5); + unsigned vd = INSTR (4, 0); unsigned i; NYI_assert (28, 24, 0x0E); NYI_assert (21, 21, 1); - if ((uimm (aarch64_get_instr (cpu), 11, 11) - && uimm (aarch64_get_instr (cpu), 14, 14)) - || ((uimm (aarch64_get_instr (cpu), 11, 11) == 0 - && uimm (aarch64_get_instr (cpu), 10, 10) == 0))) + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); + if ((INSTR (11, 11) + && INSTR (14, 14)) + || ((INSTR (11, 11) == 0 + && INSTR (10, 10) == 0))) { /* A compare vs 0. */ if (vm != 0) { - if (uimm (aarch64_get_instr (cpu), 15, 10) == 0x2A) + if (INSTR (15, 10) == 0x2A) do_vec_maxv (cpu); - else if (uimm (aarch64_get_instr (cpu), 15, 10) == 0x32 - || uimm (aarch64_get_instr (cpu), 15, 10) == 0x3E) + else if (INSTR (15, 10) == 0x32 + || INSTR (15, 10) == 0x3E) do_vec_fminmaxV (cpu); - else if (uimm (aarch64_get_instr (cpu), 29, 23) == 0x1C - && uimm (aarch64_get_instr (cpu), 21, 10) == 0x876) + else if (INSTR (29, 23) == 0x1C + && INSTR (21, 10) == 0x876) do_vec_SCVTF (cpu); else HALT_NYI; @@ -4604,12 +4841,11 @@ do_vec_compare (sim_cpu *cpu) } } - if (uimm (aarch64_get_instr (cpu), 14, 14)) + if (INSTR (14, 14)) { /* A floating point compare. 
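/* The VEC_CMP/VEC_CMP0 macros trimmed above write an all-ones or all-zero
   mask into each destination lane according to the per-lane test.  A
   standalone sketch of a CMGT-style signed 32-bit compare (hypothetical
   helper):  */

#include <stdint.h>

static void
cmgt_s32 (uint32_t *vd, const int32_t *vn, const int32_t *vm, unsigned nlanes)
{
  unsigned i;

  /* True lanes become 0xFFFFFFFF, false lanes become 0.  */
  for (i = 0; i < nlanes; i++)
    vd[i] = vn[i] > vm[i] ? 0xFFFFFFFFu : 0;
}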
*/ - unsigned decode = (uimm (aarch64_get_instr (cpu), 29, 29) << 5) - | (uimm (aarch64_get_instr (cpu), 23, 23) << 4) - | uimm (aarch64_get_instr (cpu), 13, 10); + unsigned decode = (INSTR (29, 29) << 5) | (INSTR (23, 23) << 4) + | INSTR (13, 10); NYI_assert (15, 15, 1); @@ -4630,14 +4866,14 @@ do_vec_compare (sim_cpu *cpu) } else { - unsigned decode = (uimm (aarch64_get_instr (cpu), 29, 29) << 6) - | uimm (aarch64_get_instr (cpu), 15, 10); + unsigned decode = (INSTR (29, 29) << 6) | INSTR (15, 10); switch (decode) { case 0x0D: /* 0001101 GT */ VEC_CMP (s, > ); case 0x0F: /* 0001111 GE */ VEC_CMP (s, >= ); case 0x22: /* 0100010 GT #0 */ VEC_CMP0 (s, > ); + case 0x23: /* 0100011 TST */ VEC_CMP (u, & ); case 0x26: /* 0100110 EQ #0 */ VEC_CMP0 (s, == ); case 0x2A: /* 0101010 LT #0 */ VEC_CMP0 (s, < ); case 0x4D: /* 1001101 HI */ VEC_CMP (u, > ); @@ -4666,11 +4902,12 @@ do_vec_SSHL (sim_cpu *cpu) instr[9,5] = Vn instr[4,0] = Vd. */ - unsigned full = uimm (aarch64_get_instr (cpu), 30, 30); - unsigned vm = uimm (aarch64_get_instr (cpu), 20, 16); - unsigned vn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned vd = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned full = INSTR (30, 30); + unsigned vm = INSTR (20, 16); + unsigned vn = INSTR (9, 5); + unsigned vd = INSTR (4, 0); unsigned i; + signed int shift; NYI_assert (29, 24, 0x0E); NYI_assert (21, 21, 1); @@ -4678,36 +4915,62 @@ do_vec_SSHL (sim_cpu *cpu) /* FIXME: What is a signed shift left in this context ?. */ - switch (uimm (aarch64_get_instr (cpu), 23, 22)) + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); + switch (INSTR (23, 22)) { case 0: for (i = 0; i < (full ? 16 : 8); i++) - aarch64_set_vec_s8 (cpu, vd, i, aarch64_get_vec_s8 (cpu, vn, i) - << aarch64_get_vec_s8 (cpu, vm, i)); + { + shift = aarch64_get_vec_s8 (cpu, vm, i); + if (shift >= 0) + aarch64_set_vec_s8 (cpu, vd, i, aarch64_get_vec_s8 (cpu, vn, i) + << shift); + else + aarch64_set_vec_s8 (cpu, vd, i, aarch64_get_vec_s8 (cpu, vn, i) + >> - shift); + } return; case 1: for (i = 0; i < (full ? 8 : 4); i++) - aarch64_set_vec_s16 (cpu, vd, i, aarch64_get_vec_s16 (cpu, vn, i) - << aarch64_get_vec_s16 (cpu, vm, i)); + { + shift = aarch64_get_vec_s8 (cpu, vm, i * 2); + if (shift >= 0) + aarch64_set_vec_s16 (cpu, vd, i, aarch64_get_vec_s16 (cpu, vn, i) + << shift); + else + aarch64_set_vec_s16 (cpu, vd, i, aarch64_get_vec_s16 (cpu, vn, i) + >> - shift); + } return; case 2: for (i = 0; i < (full ? 4 : 2); i++) - aarch64_set_vec_s32 (cpu, vd, i, aarch64_get_vec_s32 (cpu, vn, i) - << aarch64_get_vec_s32 (cpu, vm, i)); + { + shift = aarch64_get_vec_s8 (cpu, vm, i * 4); + if (shift >= 0) + aarch64_set_vec_s32 (cpu, vd, i, aarch64_get_vec_s32 (cpu, vn, i) + << shift); + else + aarch64_set_vec_s32 (cpu, vd, i, aarch64_get_vec_s32 (cpu, vn, i) + >> - shift); + } return; case 3: if (! 
full) HALT_UNALLOC; for (i = 0; i < 2; i++) - aarch64_set_vec_s64 (cpu, vd, i, aarch64_get_vec_s64 (cpu, vn, i) - << aarch64_get_vec_s64 (cpu, vm, i)); + { + shift = aarch64_get_vec_s8 (cpu, vm, i * 8); + if (shift >= 0) + aarch64_set_vec_s64 (cpu, vd, i, aarch64_get_vec_s64 (cpu, vn, i) + << shift); + else + aarch64_set_vec_s64 (cpu, vd, i, aarch64_get_vec_s64 (cpu, vn, i) + >> - shift); + } return; - - default: - HALT_NYI; } } @@ -4724,45 +4987,72 @@ do_vec_USHL (sim_cpu *cpu) instr[9,5] = Vn instr[4,0] = Vd */ - unsigned full = uimm (aarch64_get_instr (cpu), 30, 30); - unsigned vm = uimm (aarch64_get_instr (cpu), 20, 16); - unsigned vn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned vd = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned full = INSTR (30, 30); + unsigned vm = INSTR (20, 16); + unsigned vn = INSTR (9, 5); + unsigned vd = INSTR (4, 0); unsigned i; + signed int shift; NYI_assert (29, 24, 0x2E); NYI_assert (15, 10, 0x11); - switch (uimm (aarch64_get_instr (cpu), 23, 22)) + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); + switch (INSTR (23, 22)) { case 0: - for (i = 0; i < (full ? 16 : 8); i++) - aarch64_set_vec_u8 (cpu, vd, i, aarch64_get_vec_u8 (cpu, vn, i) - << aarch64_get_vec_u8 (cpu, vm, i)); + for (i = 0; i < (full ? 16 : 8); i++) + { + shift = aarch64_get_vec_s8 (cpu, vm, i); + if (shift >= 0) + aarch64_set_vec_u8 (cpu, vd, i, aarch64_get_vec_u8 (cpu, vn, i) + << shift); + else + aarch64_set_vec_u8 (cpu, vd, i, aarch64_get_vec_u8 (cpu, vn, i) + >> - shift); + } return; case 1: for (i = 0; i < (full ? 8 : 4); i++) - aarch64_set_vec_u16 (cpu, vd, i, aarch64_get_vec_u16 (cpu, vn, i) - << aarch64_get_vec_u16 (cpu, vm, i)); + { + shift = aarch64_get_vec_s8 (cpu, vm, i * 2); + if (shift >= 0) + aarch64_set_vec_u16 (cpu, vd, i, aarch64_get_vec_u16 (cpu, vn, i) + << shift); + else + aarch64_set_vec_u16 (cpu, vd, i, aarch64_get_vec_u16 (cpu, vn, i) + >> - shift); + } return; case 2: for (i = 0; i < (full ? 4 : 2); i++) - aarch64_set_vec_u32 (cpu, vd, i, aarch64_get_vec_u32 (cpu, vn, i) - << aarch64_get_vec_u32 (cpu, vm, i)); + { + shift = aarch64_get_vec_s8 (cpu, vm, i * 4); + if (shift >= 0) + aarch64_set_vec_u32 (cpu, vd, i, aarch64_get_vec_u32 (cpu, vn, i) + << shift); + else + aarch64_set_vec_u32 (cpu, vd, i, aarch64_get_vec_u32 (cpu, vn, i) + >> - shift); + } return; case 3: if (! full) HALT_UNALLOC; for (i = 0; i < 2; i++) - aarch64_set_vec_u64 (cpu, vd, i, aarch64_get_vec_u64 (cpu, vn, i) - << aarch64_get_vec_u64 (cpu, vm, i)); + { + shift = aarch64_get_vec_s8 (cpu, vm, i * 8); + if (shift >= 0) + aarch64_set_vec_u64 (cpu, vd, i, aarch64_get_vec_u64 (cpu, vn, i) + << shift); + else + aarch64_set_vec_u64 (cpu, vd, i, aarch64_get_vec_u64 (cpu, vn, i) + >> - shift); + } return; - - default: - HALT_NYI; } } @@ -4779,17 +5069,18 @@ do_vec_FMLA (sim_cpu *cpu) instr[9,5] = Vm instr[4.0] = Vd. */ - unsigned vm = uimm (aarch64_get_instr (cpu), 20, 16); - unsigned vn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned vd = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned vm = INSTR (20, 16); + unsigned vn = INSTR (9, 5); + unsigned vd = INSTR (4, 0); unsigned i; - int full = uimm (aarch64_get_instr (cpu), 30, 30); + int full = INSTR (30, 30); NYI_assert (29, 23, 0x1C); NYI_assert (21, 21, 1); NYI_assert (15, 10, 0x33); - if (uimm (aarch64_get_instr (cpu), 22, 22)) + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); + if (INSTR (22, 22)) { if (! full) HALT_UNALLOC; @@ -4823,19 +5114,20 @@ do_vec_max (sim_cpu *cpu) instr[9,5] = Vm instr[4.0] = Vd. 
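/* Both SSHL and USHL, as rewritten above, take the shift count from the
   *signed* low byte of the corresponding Vm element: a non-negative count
   shifts left, a negative count shifts right by its magnitude.  One
   unsigned 32-bit lane of the same idea (like the hunk, this assumes the
   count magnitude stays below the lane width):  */

#include <stdint.h>

static uint32_t
ushl_lane_u32 (uint32_t val, int8_t shift)
{
  /* A negative count encodes a right shift by -shift.  */
  return shift >= 0 ? val << shift : val >> - shift;
}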
*/ - unsigned vm = uimm (aarch64_get_instr (cpu), 20, 16); - unsigned vn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned vd = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned vm = INSTR (20, 16); + unsigned vn = INSTR (9, 5); + unsigned vd = INSTR (4, 0); unsigned i; - int full = uimm (aarch64_get_instr (cpu), 30, 30); + int full = INSTR (30, 30); NYI_assert (28, 24, 0x0E); NYI_assert (21, 21, 1); NYI_assert (15, 10, 0x19); - if (uimm (aarch64_get_instr (cpu), 29, 29)) + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); + if (INSTR (29, 29)) { - switch (uimm (aarch64_get_instr (cpu), 23, 22)) + switch (INSTR (23, 22)) { case 0: for (i = 0; i < (full ? 16 : 8); i++) @@ -4864,14 +5156,13 @@ do_vec_max (sim_cpu *cpu) : aarch64_get_vec_u32 (cpu, vm, i)); return; - default: case 3: HALT_UNALLOC; } } else { - switch (uimm (aarch64_get_instr (cpu), 23, 22)) + switch (INSTR (23, 22)) { case 0: for (i = 0; i < (full ? 16 : 8); i++) @@ -4900,7 +5191,6 @@ do_vec_max (sim_cpu *cpu) : aarch64_get_vec_s32 (cpu, vm, i)); return; - default: case 3: HALT_UNALLOC; } @@ -4921,19 +5211,20 @@ do_vec_min (sim_cpu *cpu) instr[9,5] = Vm instr[4.0] = Vd. */ - unsigned vm = uimm (aarch64_get_instr (cpu), 20, 16); - unsigned vn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned vd = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned vm = INSTR (20, 16); + unsigned vn = INSTR (9, 5); + unsigned vd = INSTR (4, 0); unsigned i; - int full = uimm (aarch64_get_instr (cpu), 30, 30); + int full = INSTR (30, 30); NYI_assert (28, 24, 0x0E); NYI_assert (21, 21, 1); NYI_assert (15, 10, 0x1B); - if (uimm (aarch64_get_instr (cpu), 29, 29)) + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); + if (INSTR (29, 29)) { - switch (uimm (aarch64_get_instr (cpu), 23, 22)) + switch (INSTR (23, 22)) { case 0: for (i = 0; i < (full ? 16 : 8); i++) @@ -4962,14 +5253,13 @@ do_vec_min (sim_cpu *cpu) : aarch64_get_vec_u32 (cpu, vm, i)); return; - default: case 3: HALT_UNALLOC; } } else { - switch (uimm (aarch64_get_instr (cpu), 23, 22)) + switch (INSTR (23, 22)) { case 0: for (i = 0; i < (full ? 16 : 8); i++) @@ -4998,7 +5288,6 @@ do_vec_min (sim_cpu *cpu) : aarch64_get_vec_s32 (cpu, vm, i)); return; - default: case 3: HALT_UNALLOC; } @@ -5019,10 +5308,10 @@ do_vec_sub_long (sim_cpu *cpu) instr[9,5] = Vn instr[4,0] = V dest. */ - unsigned size = uimm (aarch64_get_instr (cpu), 23, 22); - unsigned vm = uimm (aarch64_get_instr (cpu), 20, 16); - unsigned vn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned vd = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned size = INSTR (23, 22); + unsigned vm = INSTR (20, 16); + unsigned vn = INSTR (9, 5); + unsigned vd = INSTR (4, 0); unsigned bias = 0; unsigned i; @@ -5033,7 +5322,8 @@ do_vec_sub_long (sim_cpu *cpu) if (size == 3) HALT_UNALLOC; - switch (uimm (aarch64_get_instr (cpu), 30, 29)) + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); + switch (INSTR (30, 29)) { case 2: /* SSUBL2. */ bias = 2; @@ -5103,21 +5393,6 @@ do_vec_sub_long (sim_cpu *cpu) } } -#define DO_ADDP(FN) \ - do \ - { \ - for (i = 0; i < range; i++) \ - { \ - aarch64_set_vec_##FN (cpu, vd, i, \ - aarch64_get_vec_##FN (cpu, vn, i * 2) \ - + aarch64_get_vec_##FN (cpu, vn, i * 2 + 1)); \ - aarch64_set_vec_##FN (cpu, vd, i + range, \ - aarch64_get_vec_##FN (cpu, vm, i * 2) \ - + aarch64_get_vec_##FN (cpu, vm, i * 2 + 1)); \ - } \ - } \ - while (0) - static void do_vec_ADDP (sim_cpu *cpu) { @@ -5131,91 +5406,106 @@ do_vec_ADDP (sim_cpu *cpu) instr[9,5] = Vn instr[4,0] = V dest. 
*/ - unsigned full = uimm (aarch64_get_instr (cpu), 30, 30); - unsigned size = uimm (aarch64_get_instr (cpu), 23, 22); - unsigned vm = uimm (aarch64_get_instr (cpu), 20, 16); - unsigned vn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned vd = uimm (aarch64_get_instr (cpu), 4, 0); + FRegister copy_vn; + FRegister copy_vm; + unsigned full = INSTR (30, 30); + unsigned size = INSTR (23, 22); + unsigned vm = INSTR (20, 16); + unsigned vn = INSTR (9, 5); + unsigned vd = INSTR (4, 0); unsigned i, range; NYI_assert (29, 24, 0x0E); NYI_assert (21, 21, 1); NYI_assert (15, 10, 0x2F); + /* Make copies of the source registers in case vd == vn/vm. */ + copy_vn = cpu->fr[vn]; + copy_vm = cpu->fr[vm]; + + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); switch (size) { case 0: range = full ? 8 : 4; - DO_ADDP (u8); + for (i = 0; i < range; i++) + { + aarch64_set_vec_u8 (cpu, vd, i, + copy_vn.b[i * 2] + copy_vn.b[i * 2 + 1]); + aarch64_set_vec_u8 (cpu, vd, i + range, + copy_vm.b[i * 2] + copy_vm.b[i * 2 + 1]); + } return; case 1: range = full ? 4 : 2; - DO_ADDP (u16); + for (i = 0; i < range; i++) + { + aarch64_set_vec_u16 (cpu, vd, i, + copy_vn.h[i * 2] + copy_vn.h[i * 2 + 1]); + aarch64_set_vec_u16 (cpu, vd, i + range, + copy_vm.h[i * 2] + copy_vm.h[i * 2 + 1]); + } return; case 2: range = full ? 2 : 1; - DO_ADDP (u32); + for (i = 0; i < range; i++) + { + aarch64_set_vec_u32 (cpu, vd, i, + copy_vn.w[i * 2] + copy_vn.w[i * 2 + 1]); + aarch64_set_vec_u32 (cpu, vd, i + range, + copy_vm.w[i * 2] + copy_vm.w[i * 2 + 1]); + } return; case 3: if (! full) HALT_UNALLOC; - range = 1; - DO_ADDP (u64); + aarch64_set_vec_u64 (cpu, vd, 0, copy_vn.v[0] + copy_vn.v[1]); + aarch64_set_vec_u64 (cpu, vd, 1, copy_vm.v[0] + copy_vm.v[1]); return; - - default: - HALT_NYI; } } +/* Float point vector convert to longer (precision). */ static void -do_vec_UMOV (sim_cpu *cpu) +do_vec_FCVTL (sim_cpu *cpu) { /* instr[31] = 0 - instr[30] = 32-bit(0)/64-bit(1) - instr[29,21] = 00 1110 000 - insrt[20,16] = size & index - instr[15,10] = 0011 11 - instr[9,5] = V source - instr[4,0] = R dest. */ + instr[30] = half (0) / all (1) + instr[29,23] = 00 1110 0 + instr[22] = single (0) / double (1) + instr[21,10] = 10 0001 0111 10 + instr[9,5] = Rn + instr[4,0] = Rd. */ - unsigned vs = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned rd = uimm (aarch64_get_instr (cpu), 4, 0); - unsigned index; + unsigned rn = INSTR (9, 5); + unsigned rd = INSTR (4, 0); + unsigned full = INSTR (30, 30); + unsigned i; - NYI_assert (29, 21, 0x070); - NYI_assert (15, 10, 0x0F); + NYI_assert (31, 31, 0); + NYI_assert (29, 23, 0x1C); + NYI_assert (21, 10, 0x85E); - if (uimm (aarch64_get_instr (cpu), 16, 16)) - { - /* Byte transfer. 
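/* ADDP adds adjacent element pairs from Vn into the low half of Vd and
   from Vm into the high half, so a naive in-place loop reads lanes it has
   already overwritten whenever Vd aliases Vn or Vm.  The hunk snapshots
   both sources first; the same pattern in miniature for the full-width
   32-bit case (hypothetical helper):  */

#include <stdint.h>
#include <string.h>

static void
addp_u32 (uint32_t *vd, const uint32_t *vn, const uint32_t *vm)
{
  uint32_t cn[4], cm[4];
  unsigned i;

  /* Snapshot the sources in case vd == vn or vd == vm.  */
  memcpy (cn, vn, sizeof cn);
  memcpy (cm, vm, sizeof cm);

  for (i = 0; i < 2; i++)
    {
      vd[i]     = cn[i * 2] + cn[i * 2 + 1];
      vd[i + 2] = cm[i * 2] + cm[i * 2 + 1];
    }
}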
*/ - index = uimm (aarch64_get_instr (cpu), 20, 17); - aarch64_set_reg_u64 (cpu, rd, NO_SP, - aarch64_get_vec_u8 (cpu, vs, index)); - } - else if (uimm (aarch64_get_instr (cpu), 17, 17)) - { - index = uimm (aarch64_get_instr (cpu), 20, 18); - aarch64_set_reg_u64 (cpu, rd, NO_SP, - aarch64_get_vec_u16 (cpu, vs, index)); - } - else if (uimm (aarch64_get_instr (cpu), 18, 18)) + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); + if (INSTR (22, 22)) { - index = uimm (aarch64_get_instr (cpu), 20, 19); - aarch64_set_reg_u64 (cpu, rd, NO_SP, - aarch64_get_vec_u32 (cpu, vs, index)); + for (i = 0; i < 2; i++) + aarch64_set_vec_double (cpu, rd, i, + aarch64_get_vec_float (cpu, rn, i + 2*full)); } else { - if (uimm (aarch64_get_instr (cpu), 30, 30) != 1) - HALT_UNALLOC; + HALT_NYI; - index = uimm (aarch64_get_instr (cpu), 20, 20); - aarch64_set_reg_u64 (cpu, rd, NO_SP, - aarch64_get_vec_u64 (cpu, vs, index)); +#if 0 + /* TODO: Implement missing half-float support. */ + for (i = 0; i < 4; i++) + aarch64_set_vec_float (cpu, rd, i, + aarch64_get_vec_halffloat (cpu, rn, i + 4*full)); +#endif } } @@ -5231,15 +5521,16 @@ do_vec_FABS (sim_cpu *cpu) instr[9,5] = Vn instr[4,0] = Vd. */ - unsigned vn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned vd = uimm (aarch64_get_instr (cpu), 4, 0); - unsigned full = uimm (aarch64_get_instr (cpu), 30, 30); + unsigned vn = INSTR (9, 5); + unsigned vd = INSTR (4, 0); + unsigned full = INSTR (30, 30); unsigned i; NYI_assert (29, 23, 0x1D); NYI_assert (21, 10, 0x83E); - if (uimm (aarch64_get_instr (cpu), 22, 22)) + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); + if (INSTR (22, 22)) { if (! full) HALT_NYI; @@ -5267,16 +5558,17 @@ do_vec_FCVTZS (sim_cpu *cpu) instr[9,5] = Rn instr[4,0] = Rd. */ - unsigned rn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned rd = uimm (aarch64_get_instr (cpu), 4, 0); - unsigned full = uimm (aarch64_get_instr (cpu), 30, 30); + unsigned rn = INSTR (9, 5); + unsigned rd = INSTR (4, 0); + unsigned full = INSTR (30, 30); unsigned i; NYI_assert (31, 31, 0); NYI_assert (29, 23, 0x1D); NYI_assert (21, 10, 0x86E); - if (uimm (aarch64_get_instr (cpu), 22, 22)) + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); + if (INSTR (22, 22)) { if (! full) HALT_UNALLOC; @@ -5291,6 +5583,92 @@ do_vec_FCVTZS (sim_cpu *cpu) (int32_t) aarch64_get_vec_float (cpu, rn, i)); } +static void +do_vec_REV64 (sim_cpu *cpu) +{ + /* instr[31] = 0 + instr[30] = full/half + instr[29,24] = 00 1110 + instr[23,22] = size + instr[21,10] = 10 0000 0000 10 + instr[9,5] = Rn + instr[4,0] = Rd. */ + + unsigned rn = INSTR (9, 5); + unsigned rd = INSTR (4, 0); + unsigned size = INSTR (23, 22); + unsigned full = INSTR (30, 30); + unsigned i; + FRegister val; + + NYI_assert (29, 24, 0x0E); + NYI_assert (21, 10, 0x802); + + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); + switch (size) + { + case 0: + for (i = 0; i < (full ? 16 : 8); i++) + val.b[i ^ 0x7] = aarch64_get_vec_u8 (cpu, rn, i); + break; + + case 1: + for (i = 0; i < (full ? 8 : 4); i++) + val.h[i ^ 0x3] = aarch64_get_vec_u16 (cpu, rn, i); + break; + + case 2: + for (i = 0; i < (full ? 4 : 2); i++) + val.w[i ^ 0x1] = aarch64_get_vec_u32 (cpu, rn, i); + break; + + case 3: + HALT_UNALLOC; + } + + aarch64_set_vec_u64 (cpu, rd, 0, val.v[0]); + if (full) + aarch64_set_vec_u64 (cpu, rd, 1, val.v[1]); +} + +static void +do_vec_REV16 (sim_cpu *cpu) +{ + /* instr[31] = 0 + instr[30] = full/half + instr[29,24] = 00 1110 + instr[23,22] = size + instr[21,10] = 10 0000 0001 10 + instr[9,5] = Rn + instr[4,0] = Rd. 
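/* FCVTL/FCVTL2 widen the low (full == 0) or high (full == 1) half of the
   source from single to double precision, hence the "i + 2*full" lane
   selection above.  A sketch that reads both source lanes before writing:
   note the per-lane loop in the hunk would clobber source lane 1 when
   rd == rn and the low half is selected, so the snapshot here is a
   deliberate extra safeguard, not the hunk's own behaviour:  */

static void
fcvtl_sketch (double *vd, const float *vn, int high_half)
{
  /* Read first: vd may overlay vn when the instruction names the same
     register for source and destination.  */
  float lo = vn[0 + 2 * high_half];
  float hi = vn[1 + 2 * high_half];

  vd[0] = lo;
  vd[1] = hi;
}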
*/ + + unsigned rn = INSTR (9, 5); + unsigned rd = INSTR (4, 0); + unsigned size = INSTR (23, 22); + unsigned full = INSTR (30, 30); + unsigned i; + FRegister val; + + NYI_assert (29, 24, 0x0E); + NYI_assert (21, 10, 0x806); + + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); + switch (size) + { + case 0: + for (i = 0; i < (full ? 16 : 8); i++) + val.b[i ^ 0x1] = aarch64_get_vec_u8 (cpu, rn, i); + break; + + default: + HALT_UNALLOC; + } + + aarch64_set_vec_u64 (cpu, rd, 0, val.v[0]); + if (full) + aarch64_set_vec_u64 (cpu, rd, 1, val.v[1]); +} + static void do_vec_op1 (sim_cpu *cpu) { @@ -5304,29 +5682,22 @@ do_vec_op1 (sim_cpu *cpu) instr[4,0] = Vd */ NYI_assert (29, 24, 0x0E); - if (uimm (aarch64_get_instr (cpu), 21, 21) == 0) + if (INSTR (21, 21) == 0) { - if (uimm (aarch64_get_instr (cpu), 23, 22) == 0) + if (INSTR (23, 22) == 0) { - if (uimm (aarch64_get_instr (cpu), 30, 30) == 1 - && uimm (aarch64_get_instr (cpu), 17, 14) == 0 - && uimm (aarch64_get_instr (cpu), 12, 10) == 7) + if (INSTR (30, 30) == 1 + && INSTR (17, 14) == 0 + && INSTR (12, 10) == 7) return do_vec_ins_2 (cpu); - switch (uimm (aarch64_get_instr (cpu), 15, 10)) + switch (INSTR (15, 10)) { case 0x01: do_vec_DUP_vector_into_vector (cpu); return; case 0x03: do_vec_DUP_scalar_into_vector (cpu); return; case 0x07: do_vec_INS (cpu); return; - case 0x0A: do_vec_TRN (cpu); return; - - case 0x0F: - if (uimm (aarch64_get_instr (cpu), 17, 16) == 0) - { - do_vec_MOV_into_scalar (cpu); - return; - } - break; + case 0x0B: do_vec_SMOV_into_scalar (cpu); return; + case 0x0F: do_vec_UMOV_into_scalar (cpu); return; case 0x00: case 0x08: @@ -5338,6 +5709,8 @@ do_vec_op1 (sim_cpu *cpu) case 0x16: do_vec_UZP (cpu); return; + case 0x0A: do_vec_TRN (cpu); return; + case 0x0E: case 0x1E: do_vec_ZIP (cpu); return; @@ -5347,20 +5720,22 @@ do_vec_op1 (sim_cpu *cpu) } } - switch (uimm (aarch64_get_instr (cpu), 13, 10)) + switch (INSTR (13, 10)) { case 0x6: do_vec_UZP (cpu); return; case 0xE: do_vec_ZIP (cpu); return; case 0xA: do_vec_TRN (cpu); return; - case 0xF: do_vec_UMOV (cpu); return; default: HALT_NYI; } } - switch (uimm (aarch64_get_instr (cpu), 15, 10)) + switch (INSTR (15, 10)) { + case 0x02: do_vec_REV64 (cpu); return; + case 0x06: do_vec_REV16 (cpu); return; + case 0x07: - switch (uimm (aarch64_get_instr (cpu), 23, 21)) + switch (INSTR (23, 21)) { case 1: do_vec_AND (cpu); return; case 3: do_vec_BIC (cpu); return; @@ -5372,6 +5747,7 @@ do_vec_op1 (sim_cpu *cpu) case 0x08: do_vec_sub_long (cpu); return; case 0x0a: do_vec_XTN (cpu); return; case 0x11: do_vec_SSHL (cpu); return; + case 0x16: do_vec_CNT (cpu); return; case 0x19: do_vec_max (cpu); return; case 0x1B: do_vec_min (cpu); return; case 0x21: do_vec_add (cpu); return; @@ -5382,8 +5758,15 @@ do_vec_op1 (sim_cpu *cpu) case 0x33: do_vec_FMLA (cpu); return; case 0x35: do_vec_fadd (cpu); return; + case 0x1E: + switch (INSTR (20, 16)) + { + case 0x01: do_vec_FCVTL (cpu); return; + default: HALT_NYI; + } + case 0x2E: - switch (uimm (aarch64_get_instr (cpu), 20, 16)) + switch (INSTR (20, 16)) { case 0x00: do_vec_ABS (cpu); return; case 0x01: do_vec_FCVTZS (cpu); return; @@ -5426,79 +5809,92 @@ do_vec_xtl (sim_cpu *cpu) instr[9,5] = V source instr[4,0] = V dest. 
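/* REV64 reverses the elements within each 64-bit chunk and REV16 within
   each 16-bit chunk, which for byte lanes is simply "index XOR 7" and
   "index XOR 1" respectively.  Why the XOR trick works, in a standalone
   byte loop (dst must not alias src, which is why the hunks build the
   result in a temporary FRegister):  */

#include <stdint.h>

static void
rev64_u8 (uint8_t *dst, const uint8_t *src, unsigned nbytes)
{
  unsigned i;

  /* i ^ 7 maps 0..7 onto 7..0 inside every aligned 8-byte group.  */
  for (i = 0; i < nbytes; i++)
    dst[i ^ 7] = src[i];
}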
*/ - unsigned vs = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned vd = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned vs = INSTR (9, 5); + unsigned vd = INSTR (4, 0); unsigned i, shift, bias = 0; NYI_assert (28, 22, 0x3C); NYI_assert (15, 10, 0x29); - switch (uimm (aarch64_get_instr (cpu), 30, 29)) + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); + switch (INSTR (30, 29)) { case 2: /* SXTL2, SSHLL2. */ bias = 2; case 0: /* SXTL, SSHLL. */ - if (uimm (aarch64_get_instr (cpu), 21, 21)) + if (INSTR (21, 21)) { - shift = uimm (aarch64_get_instr (cpu), 20, 16); - aarch64_set_vec_s64 - (cpu, vd, 0, aarch64_get_vec_s32 (cpu, vs, bias) << shift); - aarch64_set_vec_s64 - (cpu, vd, 1, aarch64_get_vec_s32 (cpu, vs, bias + 1) << shift); + int64_t val1, val2; + + shift = INSTR (20, 16); + /* Get the source values before setting the destination values + in case the source and destination are the same. */ + val1 = aarch64_get_vec_s32 (cpu, vs, bias) << shift; + val2 = aarch64_get_vec_s32 (cpu, vs, bias + 1) << shift; + aarch64_set_vec_s64 (cpu, vd, 0, val1); + aarch64_set_vec_s64 (cpu, vd, 1, val2); } - else if (uimm (aarch64_get_instr (cpu), 20, 20)) + else if (INSTR (20, 20)) { - shift = uimm (aarch64_get_instr (cpu), 19, 16); + int32_t v[4]; + int32_t v1,v2,v3,v4; + + shift = INSTR (19, 16); bias *= 2; for (i = 0; i < 4; i++) - aarch64_set_vec_s32 - (cpu, vd, i, aarch64_get_vec_s16 (cpu, vs, i + bias) << shift); + v[i] = aarch64_get_vec_s16 (cpu, vs, bias + i) << shift; + for (i = 0; i < 4; i++) + aarch64_set_vec_s32 (cpu, vd, i, v[i]); } else { + int16_t v[8]; NYI_assert (19, 19, 1); - shift = uimm (aarch64_get_instr (cpu), 18, 16); - bias *= 3; + shift = INSTR (18, 16); + bias *= 4; + for (i = 0; i < 8; i++) + v[i] = aarch64_get_vec_s8 (cpu, vs, i + bias) << shift; for (i = 0; i < 8; i++) - aarch64_set_vec_s16 - (cpu, vd, i, aarch64_get_vec_s8 (cpu, vs, i + bias) << shift); + aarch64_set_vec_s16 (cpu, vd, i, v[i]); } return; case 3: /* UXTL2, USHLL2. */ bias = 2; case 1: /* UXTL, USHLL. */ - if (uimm (aarch64_get_instr (cpu), 21, 21)) + if (INSTR (21, 21)) { - shift = uimm (aarch64_get_instr (cpu), 20, 16); - aarch64_set_vec_u64 - (cpu, vd, 0, aarch64_get_vec_u32 (cpu, vs, bias) << shift); - aarch64_set_vec_u64 - (cpu, vd, 1, aarch64_get_vec_u32 (cpu, vs, bias + 1) << shift); + uint64_t v1, v2; + shift = INSTR (20, 16); + v1 = aarch64_get_vec_u32 (cpu, vs, bias) << shift; + v2 = aarch64_get_vec_u32 (cpu, vs, bias + 1) << shift; + aarch64_set_vec_u64 (cpu, vd, 0, v1); + aarch64_set_vec_u64 (cpu, vd, 1, v2); } - else if (uimm (aarch64_get_instr (cpu), 20, 20)) + else if (INSTR (20, 20)) { - shift = uimm (aarch64_get_instr (cpu), 19, 16); + uint32_t v[4]; + shift = INSTR (19, 16); bias *= 2; for (i = 0; i < 4; i++) - aarch64_set_vec_u32 - (cpu, vd, i, aarch64_get_vec_u16 (cpu, vs, i + bias) << shift); + v[i] = aarch64_get_vec_u16 (cpu, vs, i + bias) << shift; + for (i = 0; i < 4; i++) + aarch64_set_vec_u32 (cpu, vd, i, v[i]); } else { + uint16_t v[8]; NYI_assert (19, 19, 1); - shift = uimm (aarch64_get_instr (cpu), 18, 16); - bias *= 3; + shift = INSTR (18, 16); + bias *= 4; for (i = 0; i < 8; i++) - aarch64_set_vec_u16 - (cpu, vd, i, aarch64_get_vec_u8 (cpu, vs, i + bias) << shift); + v[i] = aarch64_get_vec_u8 (cpu, vs, i + bias) << shift; + for (i = 0; i < 8; i++) + aarch64_set_vec_u16 (cpu, vd, i, v[i]); } return; - - default: - HALT_NYI; } } @@ -5514,17 +5910,18 @@ do_vec_SHL (sim_cpu *cpu) instr [4, 0] = Vd. 
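/* SXTL/SSHLL (and their unsigned twins) sign- or zero-extend each source
   lane to double width and then shift left; the "2" forms start from the
   upper half of the source, which is what the bias arithmetic selects.
   The rewrite also snapshots the lanes so vd == vs works, and corrects
   the byte-variant bias from *3 to *4 (SXTL2 on bytes starts at element
   8, i.e. the initial bias of 2 scaled by 4).  In miniature for 16-bit
   to 32-bit lanes:  */

#include <stdint.h>

static void
sshll_s16 (int32_t *vd, const int16_t *vs, unsigned shift, int high_half)
{
  int32_t v[4];
  unsigned i;
  unsigned bias = high_half ? 4 : 0;	/* SSHLL2 reads lanes 4..7.  */

  /* Read every source lane before writing: vd may alias vs.  */
  for (i = 0; i < 4; i++)
    v[i] = (int32_t) vs[i + bias] << shift;
  for (i = 0; i < 4; i++)
    vd[i] = v[i];
}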
*/ int shift; - int full = uimm (aarch64_get_instr (cpu), 30, 30); - unsigned vs = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned vd = uimm (aarch64_get_instr (cpu), 4, 0); + int full = INSTR (30, 30); + unsigned vs = INSTR (9, 5); + unsigned vd = INSTR (4, 0); unsigned i; NYI_assert (29, 23, 0x1E); NYI_assert (15, 10, 0x15); - if (uimm (aarch64_get_instr (cpu), 22, 22)) + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); + if (INSTR (22, 22)) { - shift = uimm (aarch64_get_instr (cpu), 21, 16) - 1; + shift = INSTR (21, 16); if (full == 0) HALT_UNALLOC; @@ -5538,9 +5935,9 @@ do_vec_SHL (sim_cpu *cpu) return; } - if (uimm (aarch64_get_instr (cpu), 21, 21)) + if (INSTR (21, 21)) { - shift = uimm (aarch64_get_instr (cpu), 20, 16) - 1; + shift = INSTR (20, 16); for (i = 0; i < (full ? 4 : 2); i++) { @@ -5551,9 +5948,9 @@ do_vec_SHL (sim_cpu *cpu) return; } - if (uimm (aarch64_get_instr (cpu), 20, 20)) + if (INSTR (20, 20)) { - shift = uimm (aarch64_get_instr (cpu), 19, 16) - 1; + shift = INSTR (19, 16); for (i = 0; i < (full ? 8 : 4); i++) { @@ -5564,10 +5961,10 @@ do_vec_SHL (sim_cpu *cpu) return; } - if (uimm (aarch64_get_instr (cpu), 19, 19) == 0) + if (INSTR (19, 19) == 0) HALT_UNALLOC; - shift = uimm (aarch64_get_instr (cpu), 18, 16) - 1; + shift = INSTR (18, 16); for (i = 0; i < (full ? 16 : 8); i++) { @@ -5582,25 +5979,26 @@ do_vec_SSHR_USHR (sim_cpu *cpu) /* instr [31] = 0 instr [30] = half(0)/full(1) instr [29] = signed(0)/unsigned(1) - instr [28,23] = 01 1110 + instr [28,23] = 0 1111 0 instr [22,16] = size and shift amount instr [15,10] = 0000 01 instr [9, 5] = Vs instr [4, 0] = Vd. */ - int shift; - int full = uimm (aarch64_get_instr (cpu), 30, 30); - int sign = uimm (aarch64_get_instr (cpu), 29, 29); - unsigned vs = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned vd = uimm (aarch64_get_instr (cpu), 4, 0); + int full = INSTR (30, 30); + int sign = ! INSTR (29, 29); + unsigned shift = INSTR (22, 16); + unsigned vs = INSTR (9, 5); + unsigned vd = INSTR (4, 0); unsigned i; NYI_assert (28, 23, 0x1E); NYI_assert (15, 10, 0x01); - if (uimm (aarch64_get_instr (cpu), 22, 22)) + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); + if (INSTR (22, 22)) { - shift = uimm (aarch64_get_instr (cpu), 21, 16); + shift = 128 - shift; if (full == 0) HALT_UNALLOC; @@ -5621,9 +6019,9 @@ do_vec_SSHR_USHR (sim_cpu *cpu) return; } - if (uimm (aarch64_get_instr (cpu), 21, 21)) + if (INSTR (21, 21)) { - shift = uimm (aarch64_get_instr (cpu), 20, 16); + shift = 64 - shift; if (sign) for (i = 0; i < (full ? 4 : 2); i++) @@ -5641,9 +6039,9 @@ do_vec_SSHR_USHR (sim_cpu *cpu) return; } - if (uimm (aarch64_get_instr (cpu), 20, 20)) + if (INSTR (20, 20)) { - shift = uimm (aarch64_get_instr (cpu), 19, 16); + shift = 32 - shift; if (sign) for (i = 0; i < (full ? 8 : 4); i++) @@ -5661,10 +6059,10 @@ do_vec_SSHR_USHR (sim_cpu *cpu) return; } - if (uimm (aarch64_get_instr (cpu), 19, 19) == 0) + if (INSTR (19, 19) == 0) HALT_UNALLOC; - shift = uimm (aarch64_get_instr (cpu), 18, 16); + shift = 16 - shift; if (sign) for (i = 0; i < (full ? 16 : 8); i++) @@ -5681,28 +6079,187 @@ do_vec_SSHR_USHR (sim_cpu *cpu) } static void -do_vec_op2 (sim_cpu *cpu) +do_vec_MUL_by_element (sim_cpu *cpu) { /* instr[31] = 0 instr[30] = half/full instr[29,24] = 00 1111 - instr[23] = ? 
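/* The SSHR/USHR rewrite decodes the shift amount the way the architecture
   defines it: bits [22,16] hold immh:immb, and the real shift is
   2 * esize - immh:immb, which is exactly what the "128 - shift",
   "64 - shift", "32 - shift" and "16 - shift" lines compute for 64-, 32-,
   16- and 8-bit elements.  As a tiny decode helper (hypothetical):  */

static unsigned
ushr_shift_amount (unsigned imm7, unsigned esize)
{
  /* imm7 lies in [esize, 2*esize - 1], so the result is in [1, esize];
     e.g. for esize == 64, imm7 == 120 decodes to a shift of 8.  */
  return 2 * esize - imm7;
}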
- instr[22,16] = element size & index + instr[23,22] = size + instr[21] = L + instr[20] = M + instr[19,16] = m + instr[15,12] = 1000 + instr[11] = H + instr[10] = 0 + instr[9,5] = Vn + instr[4,0] = Vd */ + + unsigned full = INSTR (30, 30); + unsigned L = INSTR (21, 21); + unsigned H = INSTR (11, 11); + unsigned vn = INSTR (9, 5); + unsigned vd = INSTR (4, 0); + unsigned size = INSTR (23, 22); + unsigned index; + unsigned vm; + unsigned e; + + NYI_assert (29, 24, 0x0F); + NYI_assert (15, 12, 0x8); + NYI_assert (10, 10, 0); + + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); + switch (size) + { + case 1: + { + /* 16 bit products. */ + uint16_t product; + uint16_t element1; + uint16_t element2; + + index = (H << 2) | (L << 1) | INSTR (20, 20); + vm = INSTR (19, 16); + element2 = aarch64_get_vec_u16 (cpu, vm, index); + + for (e = 0; e < (full ? 8 : 4); e ++) + { + element1 = aarch64_get_vec_u16 (cpu, vn, e); + product = element1 * element2; + aarch64_set_vec_u16 (cpu, vd, e, product); + } + } + break; + + case 2: + { + /* 32 bit products. */ + uint32_t product; + uint32_t element1; + uint32_t element2; + + index = (H << 1) | L; + vm = INSTR (20, 16); + element2 = aarch64_get_vec_u32 (cpu, vm, index); + + for (e = 0; e < (full ? 4 : 2); e ++) + { + element1 = aarch64_get_vec_u32 (cpu, vn, e); + product = element1 * element2; + aarch64_set_vec_u32 (cpu, vd, e, product); + } + } + break; + + default: + HALT_UNALLOC; + } +} + +static void +do_FMLA_by_element (sim_cpu *cpu) +{ + /* instr[31] = 0 + instr[30] = half/full + instr[29,23] = 00 1111 1 + instr[22] = size + instr[21] = L + instr[20,16] = m + instr[15,12] = 0001 + instr[11] = H + instr[10] = 0 + instr[9,5] = Vn + instr[4,0] = Vd */ + + unsigned full = INSTR (30, 30); + unsigned size = INSTR (22, 22); + unsigned L = INSTR (21, 21); + unsigned vm = INSTR (20, 16); + unsigned H = INSTR (11, 11); + unsigned vn = INSTR (9, 5); + unsigned vd = INSTR (4, 0); + unsigned e; + + NYI_assert (29, 23, 0x1F); + NYI_assert (15, 12, 0x1); + NYI_assert (10, 10, 0); + + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); + if (size) + { + double element1, element2; + + if (! full || L) + HALT_UNALLOC; + + element2 = aarch64_get_vec_double (cpu, vm, H); + + for (e = 0; e < 2; e++) + { + element1 = aarch64_get_vec_double (cpu, vn, e); + element1 *= element2; + element1 += aarch64_get_vec_double (cpu, vd, e); + aarch64_set_vec_double (cpu, vd, e, element1); + } + } + else + { + float element1; + float element2 = aarch64_get_vec_float (cpu, vm, (H << 1) | L); + + for (e = 0; e < (full ? 4 : 2); e++) + { + element1 = aarch64_get_vec_float (cpu, vn, e); + element1 *= element2; + element1 += aarch64_get_vec_float (cpu, vd, e); + aarch64_set_vec_float (cpu, vd, e, element1); + } + } +} + +static void +do_vec_op2 (sim_cpu *cpu) +{ + /* instr[31] = 0 + instr[30] = half/full + instr[29,24] = 00 1111 + instr[23] = ? 
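/* MUL (by element) multiplies every lane of Vn by one selected lane of
   Vm, with the lane index assembled from the scattered H, L and M bits;
   FMLA (by element) does the same with an accumulate folded in.  The core
   broadcast-multiply, for the 32-bit case with the index already
   assembled as (H << 1) | L (hypothetical helper):  */

#include <stdint.h>

static void
mul_by_element_u32 (uint32_t *vd, const uint32_t *vn, const uint32_t *vm,
		    unsigned index, unsigned nlanes)
{
  uint32_t element2 = vm[index];	/* The single broadcast operand.  */
  unsigned e;

  for (e = 0; e < nlanes; e++)
    vd[e] = vn[e] * element2;
}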
+ instr[22,16] = element size & index instr[15,10] = sub-opcode instr[9,5] = Vm - instr[4.0] = Vd */ + instr[4,0] = Vd */ NYI_assert (29, 24, 0x0F); - if (uimm (aarch64_get_instr (cpu), 23, 23) != 0) - HALT_NYI; + if (INSTR (23, 23) != 0) + { + switch (INSTR (15, 10)) + { + case 0x04: + case 0x06: + do_FMLA_by_element (cpu); + return; + + case 0x20: + case 0x22: + do_vec_MUL_by_element (cpu); + return; - switch (uimm (aarch64_get_instr (cpu), 15, 10)) + default: + HALT_NYI; + } + } + else { - case 0x01: do_vec_SSHR_USHR (cpu); return; - case 0x15: do_vec_SHL (cpu); return; - case 0x29: do_vec_xtl (cpu); return; - default: HALT_NYI; + switch (INSTR (15, 10)) + { + case 0x01: do_vec_SSHR_USHR (cpu); return; + case 0x15: do_vec_SHL (cpu); return; + case 0x20: + case 0x22: do_vec_MUL_by_element (cpu); return; + case 0x29: do_vec_xtl (cpu); return; + default: HALT_NYI; + } } } @@ -5717,15 +6274,16 @@ do_vec_neg (sim_cpu *cpu) instr[9,5] = Vs instr[4,0] = Vd */ - int full = uimm (aarch64_get_instr (cpu), 30, 30); - unsigned vs = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned vd = uimm (aarch64_get_instr (cpu), 4, 0); + int full = INSTR (30, 30); + unsigned vs = INSTR (9, 5); + unsigned vd = INSTR (4, 0); unsigned i; NYI_assert (29, 24, 0x2E); NYI_assert (21, 10, 0x82E); - switch (uimm (aarch64_get_instr (cpu), 23, 22)) + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); + switch (INSTR (23, 22)) { case 0: for (i = 0; i < (full ? 16 : 8); i++) @@ -5748,9 +6306,6 @@ do_vec_neg (sim_cpu *cpu) for (i = 0; i < 2; i++) aarch64_set_vec_s64 (cpu, vd, i, - aarch64_get_vec_s64 (cpu, vs, i)); return; - - default: - HALT_UNREACHABLE; } } @@ -5765,15 +6320,16 @@ do_vec_sqrt (sim_cpu *cpu) instr[9,5] = Vs instr[4,0] = Vd. */ - int full = uimm (aarch64_get_instr (cpu), 30, 30); - unsigned vs = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned vd = uimm (aarch64_get_instr (cpu), 4, 0); + int full = INSTR (30, 30); + unsigned vs = INSTR (9, 5); + unsigned vd = INSTR (4, 0); unsigned i; NYI_assert (29, 23, 0x5B); NYI_assert (21, 10, 0x87E); - if (uimm (aarch64_get_instr (cpu), 22, 22) == 0) + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); + if (INSTR (22, 22) == 0) for (i = 0; i < (full ? 4 : 2); i++) aarch64_set_vec_float (cpu, vd, i, sqrtf (aarch64_get_vec_float (cpu, vs, i))); @@ -5799,16 +6355,17 @@ do_vec_mls_indexed (sim_cpu *cpu) instr[9,5] = Vs instr[4,0] = Vd. */ - int full = uimm (aarch64_get_instr (cpu), 30, 30); - unsigned vs = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned vd = uimm (aarch64_get_instr (cpu), 4, 0); - unsigned vm = uimm (aarch64_get_instr (cpu), 20, 16); + int full = INSTR (30, 30); + unsigned vs = INSTR (9, 5); + unsigned vd = INSTR (4, 0); + unsigned vm = INSTR (20, 16); unsigned i; NYI_assert (15, 12, 4); NYI_assert (10, 10, 0); - switch (uimm (aarch64_get_instr (cpu), 23, 22)) + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); + switch (INSTR (23, 22)) { case 1: { @@ -5818,8 +6375,7 @@ do_vec_mls_indexed (sim_cpu *cpu) if (vm > 15) HALT_NYI; - elem = (uimm (aarch64_get_instr (cpu), 21, 20) << 1) - | uimm (aarch64_get_instr (cpu), 11, 11); + elem = (INSTR (21, 20) << 1) | INSTR (11, 11); val = aarch64_get_vec_u16 (cpu, vm, elem); for (i = 0; i < (full ? 
8 : 4); i++) @@ -5831,8 +6387,7 @@ do_vec_mls_indexed (sim_cpu *cpu) case 2: { - unsigned elem = (uimm (aarch64_get_instr (cpu), 21, 21) << 1) - | uimm (aarch64_get_instr (cpu), 11, 11); + unsigned elem = (INSTR (21, 21) << 1) | INSTR (11, 11); uint64_t val = aarch64_get_vec_u32 (cpu, vm, elem); for (i = 0; i < (full ? 4 : 2); i++) @@ -5862,17 +6417,18 @@ do_vec_SUB (sim_cpu *cpu) instr [9, 5] = Vn instr [4, 0] = Vd. */ - unsigned full = uimm (aarch64_get_instr (cpu), 30, 30); - unsigned vm = uimm (aarch64_get_instr (cpu), 20, 16); - unsigned vn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned vd = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned full = INSTR (30, 30); + unsigned vm = INSTR (20, 16); + unsigned vn = INSTR (9, 5); + unsigned vd = INSTR (4, 0); unsigned i; NYI_assert (29, 24, 0x2E); NYI_assert (21, 21, 1); NYI_assert (15, 10, 0x21); - switch (uimm (aarch64_get_instr (cpu), 23, 22)) + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); + switch (INSTR (23, 22)) { case 0: for (i = 0; i < (full ? 16 : 8); i++) @@ -5904,9 +6460,6 @@ do_vec_SUB (sim_cpu *cpu) aarch64_get_vec_s64 (cpu, vn, i) - aarch64_get_vec_s64 (cpu, vm, i)); return; - - default: - HALT_UNREACHABLE; } } @@ -5923,40 +6476,41 @@ do_vec_MLS (sim_cpu *cpu) instr [9, 5] = Vn instr [4, 0] = Vd. */ - unsigned full = uimm (aarch64_get_instr (cpu), 30, 30); - unsigned vm = uimm (aarch64_get_instr (cpu), 20, 16); - unsigned vn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned vd = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned full = INSTR (30, 30); + unsigned vm = INSTR (20, 16); + unsigned vn = INSTR (9, 5); + unsigned vd = INSTR (4, 0); unsigned i; NYI_assert (29, 24, 0x2E); NYI_assert (21, 21, 1); NYI_assert (15, 10, 0x25); - switch (uimm (aarch64_get_instr (cpu), 23, 22)) + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); + switch (INSTR (23, 22)) { case 0: for (i = 0; i < (full ? 16 : 8); i++) aarch64_set_vec_u8 (cpu, vd, i, - (aarch64_get_vec_u8 (cpu, vn, i) - * aarch64_get_vec_u8 (cpu, vm, i)) - - aarch64_get_vec_u8 (cpu, vd, i)); + aarch64_get_vec_u8 (cpu, vd, i) + - (aarch64_get_vec_u8 (cpu, vn, i) + * aarch64_get_vec_u8 (cpu, vm, i))); return; case 1: for (i = 0; i < (full ? 8 : 4); i++) aarch64_set_vec_u16 (cpu, vd, i, - (aarch64_get_vec_u16 (cpu, vn, i) - * aarch64_get_vec_u16 (cpu, vm, i)) - - aarch64_get_vec_u16 (cpu, vd, i)); + aarch64_get_vec_u16 (cpu, vd, i) + - (aarch64_get_vec_u16 (cpu, vn, i) + * aarch64_get_vec_u16 (cpu, vm, i))); return; case 2: for (i = 0; i < (full ? 4 : 2); i++) aarch64_set_vec_u32 (cpu, vd, i, - (aarch64_get_vec_u32 (cpu, vn, i) - * aarch64_get_vec_u32 (cpu, vm, i)) - - aarch64_get_vec_u32 (cpu, vd, i)); + aarch64_get_vec_u32 (cpu, vd, i) + - (aarch64_get_vec_u32 (cpu, vn, i) + * aarch64_get_vec_u32 (cpu, vm, i))); return; default: @@ -5977,17 +6531,18 @@ do_vec_FDIV (sim_cpu *cpu) instr [9, 5] = Vn instr [4, 0] = Vd. */ - unsigned full = uimm (aarch64_get_instr (cpu), 30, 30); - unsigned vm = uimm (aarch64_get_instr (cpu), 20, 16); - unsigned vn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned vd = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned full = INSTR (30, 30); + unsigned vm = INSTR (20, 16); + unsigned vn = INSTR (9, 5); + unsigned vd = INSTR (4, 0); unsigned i; NYI_assert (29, 23, 0x5C); NYI_assert (21, 21, 1); NYI_assert (15, 10, 0x3F); - if (uimm (aarch64_get_instr (cpu), 22, 22)) + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); + if (INSTR (22, 22)) { if (! 
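/* The MLS rewrite above is a genuine fix, not just a cleanup: the
   instruction computes Vd = Vd - Vn * Vm (the product subtracted from the
   accumulator), whereas the old code computed Vn * Vm - Vd.  One lane of
   the corrected operation:  */

#include <stdint.h>

static uint32_t
mls_lane_u32 (uint32_t acc, uint32_t n, uint32_t m)
{
  /* Accumulator minus product, wrapping modulo 2^32.  */
  return acc - n * m;
}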
full) HALT_UNALLOC; @@ -6017,17 +6572,18 @@ do_vec_FMUL (sim_cpu *cpu) instr [9, 5] = Vn instr [4, 0] = Vd. */ - unsigned full = uimm (aarch64_get_instr (cpu), 30, 30); - unsigned vm = uimm (aarch64_get_instr (cpu), 20, 16); - unsigned vn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned vd = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned full = INSTR (30, 30); + unsigned vm = INSTR (20, 16); + unsigned vn = INSTR (9, 5); + unsigned vd = INSTR (4, 0); unsigned i; NYI_assert (29, 23, 0x5C); NYI_assert (21, 21, 1); NYI_assert (15, 10, 0x37); - if (uimm (aarch64_get_instr (cpu), 22, 22)) + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); + if (INSTR (22, 22)) { if (! full) HALT_UNALLOC; @@ -6057,39 +6613,55 @@ do_vec_FADDP (sim_cpu *cpu) instr [9, 5] = Vn instr [4, 0] = Vd. */ - unsigned full = uimm (aarch64_get_instr (cpu), 30, 30); - unsigned vm = uimm (aarch64_get_instr (cpu), 20, 16); - unsigned vn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned vd = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned full = INSTR (30, 30); + unsigned vm = INSTR (20, 16); + unsigned vn = INSTR (9, 5); + unsigned vd = INSTR (4, 0); NYI_assert (29, 23, 0x5C); NYI_assert (21, 21, 1); NYI_assert (15, 10, 0x35); - if (uimm (aarch64_get_instr (cpu), 22, 22)) + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); + if (INSTR (22, 22)) { + /* Extract values before adding them incase vd == vn/vm. */ + double tmp1 = aarch64_get_vec_double (cpu, vn, 0); + double tmp2 = aarch64_get_vec_double (cpu, vn, 1); + double tmp3 = aarch64_get_vec_double (cpu, vm, 0); + double tmp4 = aarch64_get_vec_double (cpu, vm, 1); + if (! full) HALT_UNALLOC; - aarch64_set_vec_double (cpu, vd, 0, aarch64_get_vec_double (cpu, vn, 0) - + aarch64_get_vec_double (cpu, vn, 1)); - aarch64_set_vec_double (cpu, vd, 1, aarch64_get_vec_double (cpu, vm, 0) - + aarch64_get_vec_double (cpu, vm, 1)); + aarch64_set_vec_double (cpu, vd, 0, tmp1 + tmp2); + aarch64_set_vec_double (cpu, vd, 1, tmp3 + tmp4); } else { - aarch64_set_vec_float (cpu, vd, 0, aarch64_get_vec_float (cpu, vn, 0) - + aarch64_get_vec_float (cpu, vn, 1)); - if (full) - aarch64_set_vec_float (cpu, vd, 1, aarch64_get_vec_float (cpu, vn, 2) - + aarch64_get_vec_float (cpu, vn, 3)); - aarch64_set_vec_float (cpu, vd, full ? 2 : 1, - aarch64_get_vec_float (cpu, vm, 0) - + aarch64_get_vec_float (cpu, vm, 1)); + /* Extract values before adding them incase vd == vn/vm. */ + float tmp1 = aarch64_get_vec_float (cpu, vn, 0); + float tmp2 = aarch64_get_vec_float (cpu, vn, 1); + float tmp5 = aarch64_get_vec_float (cpu, vm, 0); + float tmp6 = aarch64_get_vec_float (cpu, vm, 1); + if (full) - aarch64_set_vec_float (cpu, vd, 3, - aarch64_get_vec_float (cpu, vm, 2) - + aarch64_get_vec_float (cpu, vm, 3)); + { + float tmp3 = aarch64_get_vec_float (cpu, vn, 2); + float tmp4 = aarch64_get_vec_float (cpu, vn, 3); + float tmp7 = aarch64_get_vec_float (cpu, vm, 2); + float tmp8 = aarch64_get_vec_float (cpu, vm, 3); + + aarch64_set_vec_float (cpu, vd, 0, tmp1 + tmp2); + aarch64_set_vec_float (cpu, vd, 1, tmp3 + tmp4); + aarch64_set_vec_float (cpu, vd, 2, tmp5 + tmp6); + aarch64_set_vec_float (cpu, vd, 3, tmp7 + tmp8); + } + else + { + aarch64_set_vec_float (cpu, vd, 0, tmp1 + tmp2); + aarch64_set_vec_float (cpu, vd, 1, tmp5 + tmp6); + } } } @@ -6104,15 +6676,16 @@ do_vec_FSQRT (sim_cpu *cpu) instr[9,5] = Vsrc instr[4,0] = Vdest. 
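/* FADDP has the same aliasing hazard as ADDP: pairwise sums of Vn fill
   the low half of Vd and pairwise sums of Vm the high half, so every
   input must be read before any output lane is written.  The hunk pulls
   all the reads into temporaries; the four-lane single-precision case in
   miniature:  */

static void
faddp_f32 (float *vd, const float *vn, const float *vm)
{
  /* Read everything first: vd may alias vn or vm.  */
  float a = vn[0] + vn[1];
  float b = vn[2] + vn[3];
  float c = vm[0] + vm[1];
  float d = vm[2] + vm[3];

  vd[0] = a;
  vd[1] = b;
  vd[2] = c;
  vd[3] = d;
}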
*/ - unsigned vn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned vd = uimm (aarch64_get_instr (cpu), 4, 0); - unsigned full = uimm (aarch64_get_instr (cpu), 30, 30); + unsigned vn = INSTR (9, 5); + unsigned vd = INSTR (4, 0); + unsigned full = INSTR (30, 30); int i; NYI_assert (29, 23, 0x5D); NYI_assert (21, 10, 0x87E); - if (uimm (aarch64_get_instr (cpu), 22, 22)) + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); + if (INSTR (22, 22)) { if (! full) HALT_UNALLOC; @@ -6140,15 +6713,16 @@ do_vec_FNEG (sim_cpu *cpu) instr[9,5] = Vsrc instr[4,0] = Vdest. */ - unsigned vn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned vd = uimm (aarch64_get_instr (cpu), 4, 0); - unsigned full = uimm (aarch64_get_instr (cpu), 30, 30); + unsigned vn = INSTR (9, 5); + unsigned vd = INSTR (4, 0); + unsigned full = INSTR (30, 30); int i; NYI_assert (29, 23, 0x5D); NYI_assert (21, 10, 0x83E); - if (uimm (aarch64_get_instr (cpu), 22, 22)) + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); + if (INSTR (22, 22)) { if (! full) HALT_UNALLOC; @@ -6170,23 +6744,85 @@ do_vec_NOT (sim_cpu *cpu) { /* instr[31] = 0 instr[30] = half (0)/full (1) - instr[29,21] = 10 1110 001 - instr[20,16] = 0 0000 - instr[15,10] = 0101 10 + instr[29,10] = 10 1110 0010 0000 0101 10 instr[9,5] = Vn instr[4.0] = Vd. */ - unsigned vn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned vd = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned vn = INSTR (9, 5); + unsigned vd = INSTR (4, 0); unsigned i; - int full = uimm (aarch64_get_instr (cpu), 30, 30); + int full = INSTR (30, 30); NYI_assert (29, 10, 0xB8816); + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); for (i = 0; i < (full ? 16 : 8); i++) aarch64_set_vec_u8 (cpu, vd, i, ~ aarch64_get_vec_u8 (cpu, vn, i)); } +static unsigned int +clz (uint64_t val, unsigned size) +{ + uint64_t mask = 1; + int count; + + mask <<= (size - 1); + count = 0; + do + { + if (val & mask) + break; + mask >>= 1; + count ++; + } + while (mask); + + return count; +} + +static void +do_vec_CLZ (sim_cpu *cpu) +{ + /* instr[31] = 0 + instr[30] = half (0)/full (1) + instr[29,24] = 10 1110 + instr[23,22] = size + instr[21,10] = 10 0000 0100 10 + instr[9,5] = Vn + instr[4.0] = Vd. */ + + unsigned vn = INSTR (9, 5); + unsigned vd = INSTR (4, 0); + unsigned i; + int full = INSTR (30,30); + + NYI_assert (29, 24, 0x2E); + NYI_assert (21, 10, 0x812); + + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); + switch (INSTR (23, 22)) + { + case 0: + for (i = 0; i < (full ? 16 : 8); i++) + aarch64_set_vec_u8 (cpu, vd, i, clz (aarch64_get_vec_u8 (cpu, vn, i), 8)); + break; + case 1: + for (i = 0; i < (full ? 8 : 4); i++) + aarch64_set_vec_u16 (cpu, vd, i, clz (aarch64_get_vec_u16 (cpu, vn, i), 16)); + break; + case 2: + for (i = 0; i < (full ? 4 : 2); i++) + aarch64_set_vec_u32 (cpu, vd, i, clz (aarch64_get_vec_u32 (cpu, vn, i), 32)); + break; + case 3: + if (! full) + HALT_UNALLOC; + aarch64_set_vec_u64 (cpu, vd, 0, clz (aarch64_get_vec_u64 (cpu, vn, 0), 64)); + aarch64_set_vec_u64 (cpu, vd, 1, clz (aarch64_get_vec_u64 (cpu, vn, 1), 64)); + break; + } +} + static void do_vec_MOV_element (sim_cpu *cpu) { @@ -6198,8 +6834,8 @@ do_vec_MOV_element (sim_cpu *cpu) instr[9,5] = Vs instr[4.0] = Vd. 
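/* The new clz helper scans from the top bit of the given width downwards,
   so clz (0, size) returns size and a set top bit returns 0, matching the
   architectural CLZ result.  A few spot checks of that contract
   (hypothetical test harness, not part of the simulator):  */

#include <assert.h>
#include <stdint.h>

static void
clz_spot_checks (unsigned int (*clz) (uint64_t, unsigned))
{
  assert (clz (0x80, 8) == 0);		/* Top bit set.  */
  assert (clz (0x01, 8) == 7);
  assert (clz (0x00F0, 16) == 8);
  assert (clz (0, 32) == 32);		/* All-zero input counts every bit.  */
}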
*/ - unsigned vs = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned vd = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned vs = INSTR (9, 5); + unsigned vd = INSTR (4, 0); unsigned src_index; unsigned dst_index; @@ -6207,29 +6843,30 @@ do_vec_MOV_element (sim_cpu *cpu) NYI_assert (15, 15, 0); NYI_assert (10, 10, 1); - if (uimm (aarch64_get_instr (cpu), 16, 16)) + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); + if (INSTR (16, 16)) { /* Move a byte. */ - src_index = uimm (aarch64_get_instr (cpu), 14, 11); - dst_index = uimm (aarch64_get_instr (cpu), 20, 17); + src_index = INSTR (14, 11); + dst_index = INSTR (20, 17); aarch64_set_vec_u8 (cpu, vd, dst_index, aarch64_get_vec_u8 (cpu, vs, src_index)); } - else if (uimm (aarch64_get_instr (cpu), 17, 17)) + else if (INSTR (17, 17)) { /* Move 16-bits. */ NYI_assert (11, 11, 0); - src_index = uimm (aarch64_get_instr (cpu), 14, 12); - dst_index = uimm (aarch64_get_instr (cpu), 20, 18); + src_index = INSTR (14, 12); + dst_index = INSTR (20, 18); aarch64_set_vec_u16 (cpu, vd, dst_index, aarch64_get_vec_u16 (cpu, vs, src_index)); } - else if (uimm (aarch64_get_instr (cpu), 18, 18)) + else if (INSTR (18, 18)) { /* Move 32-bits. */ NYI_assert (12, 11, 0); - src_index = uimm (aarch64_get_instr (cpu), 14, 13); - dst_index = uimm (aarch64_get_instr (cpu), 20, 19); + src_index = INSTR (14, 13); + dst_index = INSTR (20, 19); aarch64_set_vec_u32 (cpu, vd, dst_index, aarch64_get_vec_u32 (cpu, vs, src_index)); } @@ -6237,74 +6874,152 @@ do_vec_MOV_element (sim_cpu *cpu) { NYI_assert (19, 19, 1); NYI_assert (13, 11, 0); - src_index = uimm (aarch64_get_instr (cpu), 14, 14); - dst_index = uimm (aarch64_get_instr (cpu), 20, 20); + src_index = INSTR (14, 14); + dst_index = INSTR (20, 20); aarch64_set_vec_u64 (cpu, vd, dst_index, aarch64_get_vec_u64 (cpu, vs, src_index)); } } +static void +do_vec_REV32 (sim_cpu *cpu) +{ + /* instr[31] = 0 + instr[30] = full/half + instr[29,24] = 10 1110 + instr[23,22] = size + instr[21,10] = 10 0000 0000 10 + instr[9,5] = Rn + instr[4,0] = Rd. */ + + unsigned rn = INSTR (9, 5); + unsigned rd = INSTR (4, 0); + unsigned size = INSTR (23, 22); + unsigned full = INSTR (30, 30); + unsigned i; + FRegister val; + + NYI_assert (29, 24, 0x2E); + NYI_assert (21, 10, 0x802); + + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); + switch (size) + { + case 0: + for (i = 0; i < (full ? 16 : 8); i++) + val.b[i ^ 0x3] = aarch64_get_vec_u8 (cpu, rn, i); + break; + + case 1: + for (i = 0; i < (full ? 8 : 4); i++) + val.h[i ^ 0x1] = aarch64_get_vec_u16 (cpu, rn, i); + break; + + default: + HALT_UNALLOC; + } + + aarch64_set_vec_u64 (cpu, rd, 0, val.v[0]); + if (full) + aarch64_set_vec_u64 (cpu, rd, 1, val.v[1]); +} + +static void +do_vec_EXT (sim_cpu *cpu) +{ + /* instr[31] = 0 + instr[30] = full/half + instr[29,21] = 10 1110 000 + instr[20,16] = Vm + instr[15] = 0 + instr[14,11] = source index + instr[10] = 0 + instr[9,5] = Vn + instr[4.0] = Vd. */ + + unsigned vm = INSTR (20, 16); + unsigned vn = INSTR (9, 5); + unsigned vd = INSTR (4, 0); + unsigned src_index = INSTR (14, 11); + unsigned full = INSTR (30, 30); + unsigned i; + unsigned j; + FRegister val; + + NYI_assert (31, 21, 0x370); + NYI_assert (15, 15, 0); + NYI_assert (10, 10, 0); + + if (!full && (src_index & 0x8)) + HALT_UNALLOC; + + j = 0; + + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); + for (i = src_index; i < (full ? 
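/* INS/MOV (element) encodes the element size in the position of the
   lowest set bit of imm5 (instr[20,16]): bit 16 set means bytes, bit 17
   halfwords, bit 18 words, bit 19 doublewords, and the remaining high
   bits of imm5 and imm4 supply the two indices.  The chain of INSTR
   tests above walks those bits in order; the equivalent size decode in
   one loop (hypothetical helper):  */

static unsigned
ins_element_size (unsigned imm5)
{
  unsigned lg2 = 0;	/* log2 of the element size in bytes.  */

  while (lg2 < 4 && !(imm5 & (1u << lg2)))
    lg2++;
  /* 0: byte, 1: half, 2: word, 3: doubleword; 4 would be the
     unallocated all-clear encoding.  */
  return lg2;
}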
16 : 8); i++) + val.b[j ++] = aarch64_get_vec_u8 (cpu, vn, i); + for (i = 0; i < src_index; i++) + val.b[j ++] = aarch64_get_vec_u8 (cpu, vm, i); + + aarch64_set_vec_u64 (cpu, vd, 0, val.v[0]); + if (full) + aarch64_set_vec_u64 (cpu, vd, 1, val.v[1]); +} + static void dexAdvSIMD0 (sim_cpu *cpu) { /* instr [28,25] = 0 111. */ - if ( uimm (aarch64_get_instr (cpu), 15, 10) == 0x07 - && (uimm (aarch64_get_instr (cpu), 9, 5) == - uimm (aarch64_get_instr (cpu), 20, 16))) + if ( INSTR (15, 10) == 0x07 + && (INSTR (9, 5) == + INSTR (20, 16))) { - if (uimm (aarch64_get_instr (cpu), 31, 21) == 0x075 - || uimm (aarch64_get_instr (cpu), 31, 21) == 0x275) + if (INSTR (31, 21) == 0x075 + || INSTR (31, 21) == 0x275) { do_vec_MOV_whole_vector (cpu); return; } } - if (uimm (aarch64_get_instr (cpu), 29, 19) == 0x1E0) + if (INSTR (29, 19) == 0x1E0) { do_vec_MOV_immediate (cpu); return; } - if (uimm (aarch64_get_instr (cpu), 29, 19) == 0x5E0) + if (INSTR (29, 19) == 0x5E0) { do_vec_MVNI (cpu); return; } - if (uimm (aarch64_get_instr (cpu), 29, 19) == 0x1C0 - || uimm (aarch64_get_instr (cpu), 29, 19) == 0x1C1) + if (INSTR (29, 19) == 0x1C0 + || INSTR (29, 19) == 0x1C1) { - if (uimm (aarch64_get_instr (cpu), 15, 10) == 0x03) + if (INSTR (15, 10) == 0x03) { do_vec_DUP_scalar_into_vector (cpu); return; } } - switch (uimm (aarch64_get_instr (cpu), 29, 24)) + switch (INSTR (29, 24)) { case 0x0E: do_vec_op1 (cpu); return; case 0x0F: do_vec_op2 (cpu); return; - case 0x2f: - switch (uimm (aarch64_get_instr (cpu), 15, 10)) - { - case 0x01: do_vec_SSHR_USHR (cpu); return; - case 0x10: - case 0x12: do_vec_mls_indexed (cpu); return; - case 0x29: do_vec_xtl (cpu); return; - default: - HALT_NYI; - } - case 0x2E: - if (uimm (aarch64_get_instr (cpu), 21, 21) == 1) + if (INSTR (21, 21) == 1) { - switch (uimm (aarch64_get_instr (cpu), 15, 10)) + switch (INSTR (15, 10)) { + case 0x02: + do_vec_REV32 (cpu); + return; + case 0x07: - switch (uimm (aarch64_get_instr (cpu), 23, 22)) + switch (INSTR (23, 22)) { case 0: do_vec_EOR (cpu); return; case 1: do_vec_BSL (cpu); return; @@ -6315,6 +7030,7 @@ dexAdvSIMD0 (sim_cpu *cpu) case 0x08: do_vec_sub_long (cpu); return; case 0x11: do_vec_USHL (cpu); return; + case 0x12: do_vec_CLZ (cpu); return; case 0x16: do_vec_NOT (cpu); return; case 0x19: do_vec_max (cpu); return; case 0x1B: do_vec_min (cpu); return; @@ -6326,7 +7042,7 @@ dexAdvSIMD0 (sim_cpu *cpu) case 0x3F: do_vec_FDIV (cpu); return; case 0x3E: - switch (uimm (aarch64_get_instr (cpu), 20, 16)) + switch (INSTR (20, 16)) { case 0x00: do_vec_FNEG (cpu); return; case 0x01: do_vec_FSQRT (cpu); return; @@ -6345,22 +7061,26 @@ dexAdvSIMD0 (sim_cpu *cpu) case 0x3A: do_vec_compare (cpu); return; - default: break; + default: + break; } } - if (uimm (aarch64_get_instr (cpu), 31, 21) == 0x370) + if (INSTR (31, 21) == 0x370) { - do_vec_MOV_element (cpu); + if (INSTR (10, 10)) + do_vec_MOV_element (cpu); + else + do_vec_EXT (cpu); return; } - switch (uimm (aarch64_get_instr (cpu), 21, 10)) + switch (INSTR (21, 10)) { case 0x82E: do_vec_neg (cpu); return; case 0x87E: do_vec_sqrt (cpu); return; default: - if (uimm (aarch64_get_instr (cpu), 15, 10) == 0x30) + if (INSTR (15, 10) == 0x30) { do_vec_mull (cpu); return; @@ -6369,6 +7089,17 @@ dexAdvSIMD0 (sim_cpu *cpu) } break; + case 0x2f: + switch (INSTR (15, 10)) + { + case 0x01: do_vec_SSHR_USHR (cpu); return; + case 0x10: + case 0x12: do_vec_mls_indexed (cpu); return; + case 0x29: do_vec_xtl (cpu); return; + default: + HALT_NYI; + } + default: break; } @@ -6382,11 +7113,12 @@ dexAdvSIMD0 (sim_cpu 
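/* EXT forms a sliding byte window over the concatenated register pair:
   the result is vn[index..nbytes-1] followed by vm[0..index-1], built in
   a temporary so that Vd may alias either source.  As a plain byte loop
   (nbytes is 16 for the full form, 8 for the half form, and index must
   stay below nbytes):  */

#include <stdint.h>

static void
ext_bytes (uint8_t *vd, const uint8_t *vn, const uint8_t *vm,
	   unsigned index, unsigned nbytes)
{
  uint8_t val[16];
  unsigned i, j = 0;

  for (i = index; i < nbytes; i++)
    val[j++] = vn[i];		/* Tail of Vn first ...  */
  for (i = 0; i < index; i++)
    val[j++] = vm[i];		/* ... then the head of Vm.  */
  for (i = 0; i < nbytes; i++)
    vd[i] = val[i];
}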
*cpu) static void fmadds (sim_cpu *cpu) { - unsigned sa = uimm (aarch64_get_instr (cpu), 14, 10); - unsigned sm = uimm (aarch64_get_instr (cpu), 20, 16); - unsigned sn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned sd = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned sa = INSTR (14, 10); + unsigned sm = INSTR (20, 16); + unsigned sn = INSTR ( 9, 5); + unsigned sd = INSTR ( 4, 0); + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); aarch64_set_FP_float (cpu, sd, aarch64_get_FP_float (cpu, sa) + aarch64_get_FP_float (cpu, sn) * aarch64_get_FP_float (cpu, sm)); @@ -6396,11 +7128,12 @@ fmadds (sim_cpu *cpu) static void fmaddd (sim_cpu *cpu) { - unsigned sa = uimm (aarch64_get_instr (cpu), 14, 10); - unsigned sm = uimm (aarch64_get_instr (cpu), 20, 16); - unsigned sn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned sd = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned sa = INSTR (14, 10); + unsigned sm = INSTR (20, 16); + unsigned sn = INSTR ( 9, 5); + unsigned sd = INSTR ( 4, 0); + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); aarch64_set_FP_double (cpu, sd, aarch64_get_FP_double (cpu, sa) + aarch64_get_FP_double (cpu, sn) * aarch64_get_FP_double (cpu, sm)); @@ -6410,11 +7143,12 @@ fmaddd (sim_cpu *cpu) static void fmsubs (sim_cpu *cpu) { - unsigned sa = uimm (aarch64_get_instr (cpu), 14, 10); - unsigned sm = uimm (aarch64_get_instr (cpu), 20, 16); - unsigned sn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned sd = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned sa = INSTR (14, 10); + unsigned sm = INSTR (20, 16); + unsigned sn = INSTR ( 9, 5); + unsigned sd = INSTR ( 4, 0); + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); aarch64_set_FP_float (cpu, sd, aarch64_get_FP_float (cpu, sa) - aarch64_get_FP_float (cpu, sn) * aarch64_get_FP_float (cpu, sm)); @@ -6424,11 +7158,12 @@ fmsubs (sim_cpu *cpu) static void fmsubd (sim_cpu *cpu) { - unsigned sa = uimm (aarch64_get_instr (cpu), 14, 10); - unsigned sm = uimm (aarch64_get_instr (cpu), 20, 16); - unsigned sn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned sd = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned sa = INSTR (14, 10); + unsigned sm = INSTR (20, 16); + unsigned sn = INSTR ( 9, 5); + unsigned sd = INSTR ( 4, 0); + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); aarch64_set_FP_double (cpu, sd, aarch64_get_FP_double (cpu, sa) - aarch64_get_FP_double (cpu, sn) * aarch64_get_FP_double (cpu, sm)); @@ -6438,11 +7173,12 @@ fmsubd (sim_cpu *cpu) static void fnmadds (sim_cpu *cpu) { - unsigned sa = uimm (aarch64_get_instr (cpu), 14, 10); - unsigned sm = uimm (aarch64_get_instr (cpu), 20, 16); - unsigned sn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned sd = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned sa = INSTR (14, 10); + unsigned sm = INSTR (20, 16); + unsigned sn = INSTR ( 9, 5); + unsigned sd = INSTR ( 4, 0); + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); aarch64_set_FP_float (cpu, sd, - aarch64_get_FP_float (cpu, sa) + (- aarch64_get_FP_float (cpu, sn)) * aarch64_get_FP_float (cpu, sm)); @@ -6452,11 +7188,12 @@ fnmadds (sim_cpu *cpu) static void fnmaddd (sim_cpu *cpu) { - unsigned sa = uimm (aarch64_get_instr (cpu), 14, 10); - unsigned sm = uimm (aarch64_get_instr (cpu), 20, 16); - unsigned sn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned sd = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned sa = INSTR (14, 10); + unsigned sm = INSTR (20, 16); + unsigned sn = INSTR ( 9, 5); + unsigned sd = INSTR ( 4, 0); + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); aarch64_set_FP_double (cpu, 
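/* The scalar fused forms being converted here differ only in sign
   placement: FMADD d = a + n*m, FMSUB d = a - n*m, FNMADD d = -a - n*m,
   FNMSUB d = -a + n*m, each in single and double precision.  Note the
   simulator evaluates them as a separate multiply and add, i.e. with two
   roundings; a genuinely fused variant would go through fma, shown here
   for FNMADD as a sketch of the alternative, not what the simulator
   does:  */

#include <math.h>

static double
fnmaddd_fused (double a, double n, double m)
{
  /* fma (x, y, z) computes x*y + z with a single rounding;
     FNMADD is (-n)*m + (-a).  */
  return fma (- n, m, - a);
}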
sd, - aarch64_get_FP_double (cpu, sa) + (- aarch64_get_FP_double (cpu, sn)) * aarch64_get_FP_double (cpu, sm)); @@ -6466,11 +7203,12 @@ fnmaddd (sim_cpu *cpu) static void fnmsubs (sim_cpu *cpu) { - unsigned sa = uimm (aarch64_get_instr (cpu), 14, 10); - unsigned sm = uimm (aarch64_get_instr (cpu), 20, 16); - unsigned sn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned sd = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned sa = INSTR (14, 10); + unsigned sm = INSTR (20, 16); + unsigned sn = INSTR ( 9, 5); + unsigned sd = INSTR ( 4, 0); + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); aarch64_set_FP_float (cpu, sd, - aarch64_get_FP_float (cpu, sa) + aarch64_get_FP_float (cpu, sn) * aarch64_get_FP_float (cpu, sm)); @@ -6480,11 +7218,12 @@ fnmsubs (sim_cpu *cpu) static void fnmsubd (sim_cpu *cpu) { - unsigned sa = uimm (aarch64_get_instr (cpu), 14, 10); - unsigned sm = uimm (aarch64_get_instr (cpu), 20, 16); - unsigned sn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned sd = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned sa = INSTR (14, 10); + unsigned sm = INSTR (20, 16); + unsigned sn = INSTR ( 9, 5); + unsigned sd = INSTR ( 4, 0); + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); aarch64_set_FP_double (cpu, sd, - aarch64_get_FP_double (cpu, sa) + aarch64_get_FP_double (cpu, sn) * aarch64_get_FP_double (cpu, sm)); @@ -6502,11 +7241,9 @@ dexSimpleFPDataProc3Source (sim_cpu *cpu) instr[21] ==> o1 : 0 ==> unnegated, 1 ==> negated instr[15] ==> o2 : 0 ==> ADD, 1 ==> SUB */ - uint32_t M_S = (uimm (aarch64_get_instr (cpu), 31, 31) << 1) - | uimm (aarch64_get_instr (cpu), 29, 29); + uint32_t M_S = (INSTR (31, 31) << 1) | INSTR (29, 29); /* dispatch on combined type:o1:o2. */ - uint32_t dispatch = (uimm (aarch64_get_instr (cpu), 23, 21) << 1) - | uimm (aarch64_get_instr (cpu), 15, 15); + uint32_t dispatch = (INSTR (23, 21) << 1) | INSTR (15, 15); if (M_S != 0) HALT_UNALLOC; @@ -6536,7 +7273,58 @@ dexSimpleFPFixedConvert (sim_cpu *cpu) static void dexSimpleFPCondCompare (sim_cpu *cpu) { - HALT_NYI; + /* instr [31,23] = 0001 1110 0 + instr [22] = type + instr [21] = 1 + instr [20,16] = Rm + instr [15,12] = condition + instr [11,10] = 01 + instr [9,5] = Rn + instr [4] = 0 + instr [3,0] = nzcv */ + + unsigned rm = INSTR (20, 16); + unsigned rn = INSTR (9, 5); + + NYI_assert (31, 23, 0x3C); + NYI_assert (11, 10, 0x1); + NYI_assert (4, 4, 0); + + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); + if (! testConditionCode (cpu, INSTR (15, 12))) + { + aarch64_set_CPSR (cpu, INSTR (3, 0)); + return; + } + + if (INSTR (22, 22)) + { + /* Double precision. */ + double val1 = aarch64_get_vec_double (cpu, rn, 0); + double val2 = aarch64_get_vec_double (cpu, rm, 0); + + /* FIXME: Check for NaNs. */ + if (val1 == val2) + aarch64_set_CPSR (cpu, (Z | C)); + else if (val1 < val2) + aarch64_set_CPSR (cpu, N); + else /* val1 > val2 */ + aarch64_set_CPSR (cpu, C); + } + else + { + /* Single precision. */ + float val1 = aarch64_get_vec_float (cpu, rn, 0); + float val2 = aarch64_get_vec_float (cpu, rm, 0); + + /* FIXME: Check for NaNs. */ + if (val1 == val2) + aarch64_set_CPSR (cpu, (Z | C)); + else if (val1 < val2) + aarch64_set_CPSR (cpu, N); + else /* val1 > val2 */ + aarch64_set_CPSR (cpu, C); + } } /* 2 sources. 
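/* The new dexSimpleFPCondCompare body implements FCCMP: when the
   condition holds it performs an FCMP-style compare, otherwise it loads
   the literal nzcv field from instr[3,0].  The flag pattern matches FCMP:
   equal sets Z|C, less sets N, greater sets C; unordered inputs should
   set C|V, which the FIXMEs above leave unimplemented.  A sketch of the
   compare with hypothetical flag constants standing in for the
   simulator's N/Z/C/V:  */

#define FLAG_N 0x8
#define FLAG_Z 0x4
#define FLAG_C 0x2
#define FLAG_V 0x1

static unsigned
fcmp_flags (double val1, double val2)
{
  if (val1 != val1 || val2 != val2)
    return FLAG_C | FLAG_V;	/* Unordered: at least one NaN.  */
  if (val1 == val2)
    return FLAG_Z | FLAG_C;
  if (val1 < val2)
    return FLAG_N;
  return FLAG_C;		/* val1 > val2.  */
}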
*/ @@ -6545,10 +7333,11 @@ dexSimpleFPCondCompare (sim_cpu *cpu) static void fadds (sim_cpu *cpu) { - unsigned sm = uimm (aarch64_get_instr (cpu), 20, 16); - unsigned sn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned sd = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned sm = INSTR (20, 16); + unsigned sn = INSTR ( 9, 5); + unsigned sd = INSTR ( 4, 0); + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); aarch64_set_FP_float (cpu, sd, aarch64_get_FP_float (cpu, sn) + aarch64_get_FP_float (cpu, sm)); } @@ -6557,10 +7346,11 @@ fadds (sim_cpu *cpu) static void faddd (sim_cpu *cpu) { - unsigned sm = uimm (aarch64_get_instr (cpu), 20, 16); - unsigned sn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned sd = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned sm = INSTR (20, 16); + unsigned sn = INSTR ( 9, 5); + unsigned sd = INSTR ( 4, 0); + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); aarch64_set_FP_double (cpu, sd, aarch64_get_FP_double (cpu, sn) + aarch64_get_FP_double (cpu, sm)); } @@ -6569,10 +7359,11 @@ faddd (sim_cpu *cpu) static void fdivs (sim_cpu *cpu) { - unsigned sm = uimm (aarch64_get_instr (cpu), 20, 16); - unsigned sn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned sd = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned sm = INSTR (20, 16); + unsigned sn = INSTR ( 9, 5); + unsigned sd = INSTR ( 4, 0); + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); aarch64_set_FP_float (cpu, sd, aarch64_get_FP_float (cpu, sn) / aarch64_get_FP_float (cpu, sm)); } @@ -6581,10 +7372,11 @@ fdivs (sim_cpu *cpu) static void fdivd (sim_cpu *cpu) { - unsigned sm = uimm (aarch64_get_instr (cpu), 20, 16); - unsigned sn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned sd = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned sm = INSTR (20, 16); + unsigned sn = INSTR ( 9, 5); + unsigned sd = INSTR ( 4, 0); + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); aarch64_set_FP_double (cpu, sd, aarch64_get_FP_double (cpu, sn) / aarch64_get_FP_double (cpu, sm)); } @@ -6593,10 +7385,11 @@ fdivd (sim_cpu *cpu) static void fmuls (sim_cpu *cpu) { - unsigned sm = uimm (aarch64_get_instr (cpu), 20, 16); - unsigned sn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned sd = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned sm = INSTR (20, 16); + unsigned sn = INSTR ( 9, 5); + unsigned sd = INSTR ( 4, 0); + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); aarch64_set_FP_float (cpu, sd, aarch64_get_FP_float (cpu, sn) * aarch64_get_FP_float (cpu, sm)); } @@ -6605,10 +7398,11 @@ fmuls (sim_cpu *cpu) static void fmuld (sim_cpu *cpu) { - unsigned sm = uimm (aarch64_get_instr (cpu), 20, 16); - unsigned sn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned sd = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned sm = INSTR (20, 16); + unsigned sn = INSTR ( 9, 5); + unsigned sd = INSTR ( 4, 0); + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); aarch64_set_FP_double (cpu, sd, aarch64_get_FP_double (cpu, sn) * aarch64_get_FP_double (cpu, sm)); } @@ -6617,10 +7411,11 @@ fmuld (sim_cpu *cpu) static void fnmuls (sim_cpu *cpu) { - unsigned sm = uimm (aarch64_get_instr (cpu), 20, 16); - unsigned sn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned sd = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned sm = INSTR (20, 16); + unsigned sn = INSTR ( 9, 5); + unsigned sd = INSTR ( 4, 0); + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); aarch64_set_FP_float (cpu, sd, - (aarch64_get_FP_float (cpu, sn) * aarch64_get_FP_float (cpu, sm))); } @@ -6629,10 +7424,11 @@ fnmuls (sim_cpu *cpu) static void fnmuld (sim_cpu 
*cpu) { - unsigned sm = uimm (aarch64_get_instr (cpu), 20, 16); - unsigned sn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned sd = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned sm = INSTR (20, 16); + unsigned sn = INSTR ( 9, 5); + unsigned sd = INSTR ( 4, 0); + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); aarch64_set_FP_double (cpu, sd, - (aarch64_get_FP_double (cpu, sn) * aarch64_get_FP_double (cpu, sm))); } @@ -6641,10 +7437,11 @@ fnmuld (sim_cpu *cpu) static void fsubs (sim_cpu *cpu) { - unsigned sm = uimm (aarch64_get_instr (cpu), 20, 16); - unsigned sn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned sd = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned sm = INSTR (20, 16); + unsigned sn = INSTR ( 9, 5); + unsigned sd = INSTR ( 4, 0); + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); aarch64_set_FP_float (cpu, sd, aarch64_get_FP_float (cpu, sn) - aarch64_get_FP_float (cpu, sm)); } @@ -6653,10 +7450,11 @@ fsubs (sim_cpu *cpu) static void fsubd (sim_cpu *cpu) { - unsigned sm = uimm (aarch64_get_instr (cpu), 20, 16); - unsigned sn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned sd = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned sm = INSTR (20, 16); + unsigned sn = INSTR ( 9, 5); + unsigned sd = INSTR ( 4, 0); + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); aarch64_set_FP_double (cpu, sd, aarch64_get_FP_double (cpu, sn) - aarch64_get_FP_double (cpu, sm)); } @@ -6672,14 +7470,15 @@ do_FMINNM (sim_cpu *cpu) instr[9,5] = Sn instr[4,0] = Cpu */ - unsigned sm = uimm (aarch64_get_instr (cpu), 20, 16); - unsigned sn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned sd = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned sm = INSTR (20, 16); + unsigned sn = INSTR ( 9, 5); + unsigned sd = INSTR ( 4, 0); NYI_assert (31, 23, 0x03C); NYI_assert (15, 10, 0x1E); - if (uimm (aarch64_get_instr (cpu), 22, 22)) + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); + if (INSTR (22, 22)) aarch64_set_FP_double (cpu, sd, dminnm (aarch64_get_FP_double (cpu, sn), aarch64_get_FP_double (cpu, sm))); @@ -6700,14 +7499,15 @@ do_FMAXNM (sim_cpu *cpu) instr[9,5] = Sn instr[4,0] = Cpu */ - unsigned sm = uimm (aarch64_get_instr (cpu), 20, 16); - unsigned sn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned sd = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned sm = INSTR (20, 16); + unsigned sn = INSTR ( 9, 5); + unsigned sd = INSTR ( 4, 0); NYI_assert (31, 23, 0x03C); NYI_assert (15, 10, 0x1A); - if (uimm (aarch64_get_instr (cpu), 22, 22)) + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); + if (INSTR (22, 22)) aarch64_set_FP_double (cpu, sd, dmaxnm (aarch64_get_FP_double (cpu, sn), aarch64_get_FP_double (cpu, sm))); @@ -6737,11 +7537,10 @@ dexSimpleFPDataProc2Source (sim_cpu *cpu) instr[9,5] = Vn instr[4,0] = Vd */ - uint32_t M_S = (uimm (aarch64_get_instr (cpu), 31, 31) << 1) - | uimm (aarch64_get_instr (cpu), 29, 29); - uint32_t type = uimm (aarch64_get_instr (cpu), 23, 22); + uint32_t M_S = (INSTR (31, 31) << 1) | INSTR (29, 29); + uint32_t type = INSTR (23, 22); /* Dispatch on opcode. 
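   Per the AArch64 ISA the opcode in bits [15,12] selects:
     0000 ==> FMUL,   0001 ==> FDIV,   0010 ==> FADD,   0011 ==> FSUB,
     0100 ==> FMAX,   0101 ==> FMIN,   0110 ==> FMAXNM, 0111 ==> FMINNM,
     1000 ==> FNMUL,  other values ==> UNALLOC.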
*/ - uint32_t dispatch = uimm (aarch64_get_instr (cpu), 15, 12); + uint32_t dispatch = INSTR (15, 12); if (type > 1) HALT_UNALLOC; @@ -6800,53 +7599,59 @@ dexSimpleFPCondSelect (sim_cpu *cpu) instr[11,10] = 11 instr[9,5] = Sn instr[4,0] = Cpu */ - unsigned sm = uimm (aarch64_get_instr (cpu), 20, 16); - unsigned sn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned sd = uimm (aarch64_get_instr (cpu), 4, 0); - uint32_t set = testConditionCode (cpu, uimm (aarch64_get_instr (cpu), 15, 12)); + unsigned sm = INSTR (20, 16); + unsigned sn = INSTR ( 9, 5); + unsigned sd = INSTR ( 4, 0); + uint32_t set = testConditionCode (cpu, INSTR (15, 12)); NYI_assert (31, 23, 0x03C); NYI_assert (11, 10, 0x3); - if (uimm (aarch64_get_instr (cpu), 22, 22)) - aarch64_set_FP_double (cpu, sd, set ? sn : sm); + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); + if (INSTR (22, 22)) + aarch64_set_FP_double (cpu, sd, (set ? aarch64_get_FP_double (cpu, sn) + : aarch64_get_FP_double (cpu, sm))); else - aarch64_set_FP_float (cpu, sd, set ? sn : sm); + aarch64_set_FP_float (cpu, sd, (set ? aarch64_get_FP_float (cpu, sn) + : aarch64_get_FP_float (cpu, sm))); } /* Store 32 bit unscaled signed 9 bit. */ static void fsturs (sim_cpu *cpu, int32_t offset) { - unsigned int rn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned int st = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned int rn = INSTR (9, 5); + unsigned int st = INSTR (4, 0); - aarch64_set_mem_float (cpu, aarch64_get_reg_u64 (cpu, st, 1) + offset, - aarch64_get_FP_float (cpu, rn)); + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); + aarch64_set_mem_u32 (cpu, aarch64_get_reg_u64 (cpu, rn, 1) + offset, + aarch64_get_vec_u32 (cpu, st, 0)); } /* Store 64 bit unscaled signed 9 bit. */ static void fsturd (sim_cpu *cpu, int32_t offset) { - unsigned int rn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned int st = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned int rn = INSTR (9, 5); + unsigned int st = INSTR (4, 0); - aarch64_set_mem_double (cpu, aarch64_get_reg_u64 (cpu, st, 1) + offset, - aarch64_get_FP_double (cpu, rn)); + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); + aarch64_set_mem_u64 (cpu, aarch64_get_reg_u64 (cpu, rn, 1) + offset, + aarch64_get_vec_u64 (cpu, st, 0)); } /* Store 128 bit unscaled signed 9 bit. 
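   As in fsturs/fsturd above, the base register is Rn (SP allowed) and
   the data register is St - the old code had the two swapped.  Going
   through the raw bits (aarch64_get_vec_u32/u64) instead of
   aarch64_get_FP_float/double also avoids any host float conversion,
   so NaN payloads are stored unchanged.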
*/ static void fsturq (sim_cpu *cpu, int32_t offset) { - unsigned int rn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned int st = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned int rn = INSTR (9, 5); + unsigned int st = INSTR (4, 0); FRegister a; - aarch64_get_FP_long_double (cpu, rn, & a); + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); + aarch64_get_FP_long_double (cpu, st, & a); aarch64_set_mem_long_double (cpu, - aarch64_get_reg_u64 (cpu, st, 1) + aarch64_get_reg_u64 (cpu, rn, 1) + offset, a); } @@ -6856,9 +7661,10 @@ fsturq (sim_cpu *cpu, int32_t offset) static void ffmovs (sim_cpu *cpu) { - unsigned int rn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned int st = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned int rn = INSTR (9, 5); + unsigned int st = INSTR (4, 0); + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); aarch64_set_FP_float (cpu, st, aarch64_get_FP_float (cpu, rn)); } @@ -6866,9 +7672,10 @@ ffmovs (sim_cpu *cpu) static void ffmovd (sim_cpu *cpu) { - unsigned int rn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned int st = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned int rn = INSTR (9, 5); + unsigned int st = INSTR (4, 0); + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); aarch64_set_FP_double (cpu, st, aarch64_get_FP_double (cpu, rn)); } @@ -6876,9 +7683,10 @@ ffmovd (sim_cpu *cpu) static void fgmovs (sim_cpu *cpu) { - unsigned int rn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned int st = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned int rn = INSTR (9, 5); + unsigned int st = INSTR (4, 0); + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); aarch64_set_vec_u32 (cpu, st, 0, aarch64_get_reg_u32 (cpu, rn, NO_SP)); } @@ -6886,9 +7694,10 @@ fgmovs (sim_cpu *cpu) static void fgmovd (sim_cpu *cpu) { - unsigned int rn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned int st = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned int rn = INSTR (9, 5); + unsigned int st = INSTR (4, 0); + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); aarch64_set_vec_u64 (cpu, st, 0, aarch64_get_reg_u64 (cpu, rn, NO_SP)); } @@ -6896,9 +7705,10 @@ fgmovd (sim_cpu *cpu) static void gfmovs (sim_cpu *cpu) { - unsigned int rn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned int st = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned int rn = INSTR (9, 5); + unsigned int st = INSTR (4, 0); + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); aarch64_set_reg_u64 (cpu, st, NO_SP, aarch64_get_vec_u32 (cpu, rn, 0)); } @@ -6906,9 +7716,10 @@ gfmovs (sim_cpu *cpu) static void gfmovd (sim_cpu *cpu) { - unsigned int rn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned int st = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned int rn = INSTR (9, 5); + unsigned int st = INSTR (4, 0); + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); aarch64_set_reg_u64 (cpu, st, NO_SP, aarch64_get_vec_u64 (cpu, rn, 0)); } @@ -6921,20 +7732,22 @@ gfmovd (sim_cpu *cpu) static void fmovs (sim_cpu *cpu) { - unsigned int sd = uimm (aarch64_get_instr (cpu), 4, 0); - uint32_t imm = uimm (aarch64_get_instr (cpu), 20, 13); + unsigned int sd = INSTR (4, 0); + uint32_t imm = INSTR (20, 13); float f = fp_immediate_for_encoding_32 (imm); + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); aarch64_set_FP_float (cpu, sd, f); } static void fmovd (sim_cpu *cpu) { - unsigned int sd = uimm (aarch64_get_instr (cpu), 4, 0); - uint32_t imm = uimm (aarch64_get_instr (cpu), 20, 13); + unsigned int sd = INSTR (4, 0); + uint32_t imm = INSTR (20, 13); double d = fp_immediate_for_encoding_64 (imm); + 
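  /* A sketch of the expected expansion, assuming the
     fp_immediate_for_encoding_32/64 helpers follow the architected
     VFPExpandImm rule (imm8 = abcdefgh encodes +/- (16 + efgh)/16 * 2^r
     with r in [-3,4]); for the single precision case used by fmovs:

       uint32_t a = (imm >> 7) & 1, b = (imm >> 6) & 1;
       uint32_t exp = (!b << 7) | (b ? 0x7C : 0) | ((imm >> 4) & 3);
       uint32_t bits = (a << 31) | (exp << 23) | ((imm & 0xF) << 19);

     so that, e.g., imm8 == 0x70 comes out as 1.0f.  */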
TRACE_DECODE (cpu, "emulated at line %d", __LINE__); aarch64_set_FP_double (cpu, sd, d); } @@ -6948,14 +7761,14 @@ dexSimpleFPImmediate (sim_cpu *cpu) instr[12,10] == 100 instr[9,5] == imm5 : 00000 ==> PK, ow ==> UNALLOC instr[4,0] == Rd */ - uint32_t imm5 = uimm (aarch64_get_instr (cpu), 9, 5); + uint32_t imm5 = INSTR (9, 5); NYI_assert (31, 23, 0x3C); if (imm5 != 0) HALT_UNALLOC; - if (uimm (aarch64_get_instr (cpu), 22, 22)) + if (INSTR (22, 22)) fmovd (cpu); else fmovs (cpu); @@ -6972,33 +7785,36 @@ dexSimpleFPImmediate (sim_cpu *cpu) static void fldurs (sim_cpu *cpu, int32_t offset) { - unsigned int rn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned int st = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned int rn = INSTR (9, 5); + unsigned int st = INSTR (4, 0); - aarch64_set_FP_float (cpu, st, aarch64_get_mem_float - (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK) + offset)); + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); + aarch64_set_vec_u32 (cpu, st, 0, aarch64_get_mem_u32 + (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK) + offset)); } /* Load 64 bit unscaled signed 9 bit. */ static void fldurd (sim_cpu *cpu, int32_t offset) { - unsigned int rn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned int st = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned int rn = INSTR (9, 5); + unsigned int st = INSTR (4, 0); - aarch64_set_FP_double (cpu, st, aarch64_get_mem_double - (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK) + offset)); + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); + aarch64_set_vec_u64 (cpu, st, 0, aarch64_get_mem_u64 + (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK) + offset)); } /* Load 128 bit unscaled signed 9 bit. */ static void fldurq (sim_cpu *cpu, int32_t offset) { - unsigned int rn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned int st = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned int rn = INSTR (9, 5); + unsigned int st = INSTR (4, 0); FRegister a; uint64_t addr = aarch64_get_reg_u64 (cpu, rn, SP_OK) + offset; + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); aarch64_get_mem_long_double (cpu, addr, & a); aarch64_set_FP_long_double (cpu, st, a); } @@ -7013,10 +7829,11 @@ fldurq (sim_cpu *cpu, int32_t offset) static void fabss (sim_cpu *cpu) { - unsigned sn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned sd = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned sn = INSTR (9, 5); + unsigned sd = INSTR (4, 0); float value = aarch64_get_FP_float (cpu, sn); + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); aarch64_set_FP_float (cpu, sd, fabsf (value)); } @@ -7024,10 +7841,11 @@ fabss (sim_cpu *cpu) static void fabcpu (sim_cpu *cpu) { - unsigned sn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned sd = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned sn = INSTR (9, 5); + unsigned sd = INSTR (4, 0); double value = aarch64_get_FP_double (cpu, sn); + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); aarch64_set_FP_double (cpu, sd, fabs (value)); } @@ -7035,9 +7853,10 @@ fabcpu (sim_cpu *cpu) static void fnegs (sim_cpu *cpu) { - unsigned sn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned sd = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned sn = INSTR (9, 5); + unsigned sd = INSTR (4, 0); + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); aarch64_set_FP_float (cpu, sd, - aarch64_get_FP_float (cpu, sn)); } @@ -7045,9 +7864,10 @@ fnegs (sim_cpu *cpu) static void fnegd (sim_cpu *cpu) { - unsigned sn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned sd = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned sn = INSTR (9, 5); + unsigned sd = INSTR (4, 0); + 
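  /* FNEG just inverts the sign bit; on an IEEE-754 host the unary
     minus below behaves identically, including for zeros, infinities
     and NaNs.  */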
TRACE_DECODE (cpu, "emulated at line %d", __LINE__); aarch64_set_FP_double (cpu, sd, - aarch64_get_FP_double (cpu, sn)); } @@ -7055,19 +7875,21 @@ fnegd (sim_cpu *cpu) static void fsqrts (sim_cpu *cpu) { - unsigned sn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned sd = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned sn = INSTR (9, 5); + unsigned sd = INSTR (4, 0); - aarch64_set_FP_float (cpu, sd, sqrt (aarch64_get_FP_float (cpu, sn))); + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); + aarch64_set_FP_float (cpu, sd, sqrtf (aarch64_get_FP_float (cpu, sn))); } /* Double square root. */ static void fsqrtd (sim_cpu *cpu) { - unsigned sn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned sd = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned sn = INSTR (9, 5); + unsigned sd = INSTR (4, 0); + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); aarch64_set_FP_double (cpu, sd, sqrt (aarch64_get_FP_double (cpu, sn))); } @@ -7076,9 +7898,10 @@ fsqrtd (sim_cpu *cpu) static void fcvtds (sim_cpu *cpu) { - unsigned sn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned sd = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned sn = INSTR (9, 5); + unsigned sd = INSTR (4, 0); + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); aarch64_set_FP_float (cpu, sd, (float) aarch64_get_FP_double (cpu, sn)); } @@ -7086,9 +7909,10 @@ fcvtds (sim_cpu *cpu) static void fcvtcpu (sim_cpu *cpu) { - unsigned sn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned sd = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned sn = INSTR (9, 5); + unsigned sd = INSTR (4, 0); + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); aarch64_set_FP_double (cpu, sd, (double) aarch64_get_FP_float (cpu, sn)); } @@ -7104,9 +7928,9 @@ do_FRINT (sim_cpu *cpu) instr[4,0] = dest */ float val; - unsigned rs = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned rd = uimm (aarch64_get_instr (cpu), 4, 0); - unsigned int rmode = uimm (aarch64_get_instr (cpu), 17, 15); + unsigned rs = INSTR (9, 5); + unsigned rd = INSTR (4, 0); + unsigned int rmode = INSTR (17, 15); NYI_assert (31, 23, 0x03C); NYI_assert (21, 18, 0x9); @@ -7116,7 +7940,8 @@ do_FRINT (sim_cpu *cpu) /* FIXME: Add support for rmode == 6 exactness check. */ rmode = uimm (aarch64_get_FPSR (cpu), 23, 22); - if (uimm (aarch64_get_instr (cpu), 22, 22)) + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); + if (INSTR (22, 22)) { double val = aarch64_get_FP_double (cpu, rs); @@ -7216,6 +8041,57 @@ do_FRINT (sim_cpu *cpu) } } +/* Convert half to float. */ +static void +do_FCVT_half_to_single (sim_cpu *cpu) +{ + unsigned rn = INSTR (9, 5); + unsigned rd = INSTR (4, 0); + + NYI_assert (31, 10, 0x7B890); + + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); + aarch64_set_FP_float (cpu, rd, (float) aarch64_get_FP_half (cpu, rn)); +} + +/* Convert half to double. */ +static void +do_FCVT_half_to_double (sim_cpu *cpu) +{ + unsigned rn = INSTR (9, 5); + unsigned rd = INSTR (4, 0); + + NYI_assert (31, 10, 0x7B8B0); + + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); + aarch64_set_FP_double (cpu, rd, (double) aarch64_get_FP_half (cpu, rn)); +} + +static void +do_FCVT_single_to_half (sim_cpu *cpu) +{ + unsigned rn = INSTR (9, 5); + unsigned rd = INSTR (4, 0); + + NYI_assert (31, 10, 0x788F0); + + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); + aarch64_set_FP_half (cpu, rd, aarch64_get_FP_float (cpu, rn)); +} + +/* Convert double to half. 
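   Note that this narrows via an intermediate float (the "(float)" cast
   below), so the value is rounded twice; a direct double-to-half
   conversion could round differently in corner cases.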
*/ +static void +do_FCVT_double_to_half (sim_cpu *cpu) +{ + unsigned rn = INSTR (9, 5); + unsigned rd = INSTR (4, 0); + + NYI_assert (31, 10, 0x798F0); + + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); + aarch64_set_FP_half (cpu, rd, (float) aarch64_get_FP_double (cpu, rn)); +} + static void dexSimpleFPDataProc1Source (sim_cpu *cpu) { @@ -7243,20 +8119,22 @@ dexSimpleFPDataProc1Source (sim_cpu *cpu) 000101 ==> FCVT (half-to-double) instr[14,10] = 10000. */ - uint32_t M_S = (uimm (aarch64_get_instr (cpu), 31, 31) << 1) - | uimm (aarch64_get_instr (cpu), 29, 29); - uint32_t type = uimm (aarch64_get_instr (cpu), 23, 22); - uint32_t opcode = uimm (aarch64_get_instr (cpu), 20, 15); + uint32_t M_S = (INSTR (31, 31) << 1) | INSTR (29, 29); + uint32_t type = INSTR (23, 22); + uint32_t opcode = INSTR (20, 15); if (M_S != 0) HALT_UNALLOC; if (type == 3) { - if (opcode == 4 || opcode == 5) - HALT_NYI; + if (opcode == 4) + do_FCVT_half_to_single (cpu); + else if (opcode == 5) + do_FCVT_half_to_double (cpu); else HALT_UNALLOC; + return; } if (type == 2) @@ -7315,7 +8193,13 @@ dexSimpleFPDataProc1Source (sim_cpu *cpu) do_FRINT (cpu); return; - case 7: /* FCVT double/single to half precision. */ + case 7: + if (INSTR (22, 22)) + do_FCVT_double_to_half (cpu); + else + do_FCVT_single_to_half (cpu); + return; + case 13: HALT_NYI; @@ -7328,9 +8212,10 @@ dexSimpleFPDataProc1Source (sim_cpu *cpu) static void scvtf32 (sim_cpu *cpu) { - unsigned rn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned sd = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned rn = INSTR (9, 5); + unsigned sd = INSTR (4, 0); + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); aarch64_set_FP_float (cpu, sd, (float) aarch64_get_reg_s32 (cpu, rn, NO_SP)); } @@ -7339,9 +8224,10 @@ scvtf32 (sim_cpu *cpu) static void scvtf (sim_cpu *cpu) { - unsigned rn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned sd = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned rn = INSTR (9, 5); + unsigned sd = INSTR (4, 0); + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); aarch64_set_FP_float (cpu, sd, (float) aarch64_get_reg_s64 (cpu, rn, NO_SP)); } @@ -7350,9 +8236,10 @@ scvtf (sim_cpu *cpu) static void scvtd32 (sim_cpu *cpu) { - unsigned rn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned sd = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned rn = INSTR (9, 5); + unsigned sd = INSTR (4, 0); + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); aarch64_set_FP_double (cpu, sd, (double) aarch64_get_reg_s32 (cpu, rn, NO_SP)); } @@ -7361,9 +8248,10 @@ scvtd32 (sim_cpu *cpu) static void scvtd (sim_cpu *cpu) { - unsigned rn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned sd = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned rn = INSTR (9, 5); + unsigned sd = INSTR (4, 0); + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); aarch64_set_FP_double (cpu, sd, (double) aarch64_get_reg_s64 (cpu, rn, NO_SP)); } @@ -7377,6 +8265,17 @@ static const float FLOAT_LONG_MIN = (float) LONG_MIN; static const double DOUBLE_LONG_MAX = (double) LONG_MAX; static const double DOUBLE_LONG_MIN = (double) LONG_MIN; +#define UINT_MIN 0 +#define ULONG_MIN 0 +static const float FLOAT_UINT_MAX = (float) UINT_MAX; +static const float FLOAT_UINT_MIN = (float) UINT_MIN; +static const double DOUBLE_UINT_MAX = (double) UINT_MAX; +static const double DOUBLE_UINT_MIN = (double) UINT_MIN; +static const float FLOAT_ULONG_MAX = (float) ULONG_MAX; +static const float FLOAT_ULONG_MIN = (float) ULONG_MIN; +static const double DOUBLE_ULONG_MAX = (double) ULONG_MAX; +static const double 
DOUBLE_ULONG_MIN = (double) ULONG_MIN; + /* Check for FP exception conditions: NaN raises IO Infinity raises IO @@ -7426,14 +8325,15 @@ static const double DOUBLE_LONG_MIN = (double) LONG_MIN; static void fcvtszs32 (sim_cpu *cpu) { - unsigned sn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned rd = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned sn = INSTR (9, 5); + unsigned rd = INSTR (4, 0); /* TODO : check that this rounds toward zero. */ float f = aarch64_get_FP_float (cpu, sn); int32_t value = (int32_t) f; RAISE_EXCEPTIONS (f, value, FLOAT, INT); + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); /* Avoid sign extension to 64 bit. */ aarch64_set_reg_u64 (cpu, rd, NO_SP, (uint32_t) value); } @@ -7442,13 +8342,14 @@ fcvtszs32 (sim_cpu *cpu) static void fcvtszs (sim_cpu *cpu) { - unsigned sn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned rd = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned sn = INSTR (9, 5); + unsigned rd = INSTR (4, 0); float f = aarch64_get_FP_float (cpu, sn); int64_t value = (int64_t) f; RAISE_EXCEPTIONS (f, value, FLOAT, LONG); + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); aarch64_set_reg_s64 (cpu, rd, NO_SP, value); } @@ -7456,14 +8357,15 @@ fcvtszs (sim_cpu *cpu) static void fcvtszd32 (sim_cpu *cpu) { - unsigned sn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned rd = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned sn = INSTR (9, 5); + unsigned rd = INSTR (4, 0); /* TODO : check that this rounds toward zero. */ double d = aarch64_get_FP_double (cpu, sn); int32_t value = (int32_t) d; RAISE_EXCEPTIONS (d, value, DOUBLE, INT); + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); /* Avoid sign extension to 64 bit. */ aarch64_set_reg_u64 (cpu, rd, NO_SP, (uint32_t) value); } @@ -7472,8 +8374,8 @@ fcvtszd32 (sim_cpu *cpu) static void fcvtszd (sim_cpu *cpu) { - unsigned sn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned rd = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned sn = INSTR (9, 5); + unsigned rd = INSTR (4, 0); /* TODO : check that this rounds toward zero. */ double d = aarch64_get_FP_double (cpu, sn); int64_t value; @@ -7482,6 +8384,7 @@ fcvtszd (sim_cpu *cpu) RAISE_EXCEPTIONS (d, value, DOUBLE, LONG); + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); aarch64_set_reg_s64 (cpu, rd, NO_SP, value); } @@ -7497,27 +8400,28 @@ do_fcvtzu (sim_cpu *cpu) instr[9,5] = Rs instr[4,0] = Rd. */ - unsigned rs = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned rd = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned rs = INSTR (9, 5); + unsigned rd = INSTR (4, 0); NYI_assert (30, 23, 0x3C); NYI_assert (20, 16, 0x19); - if (uimm (aarch64_get_instr (cpu), 21, 21) != 1) + if (INSTR (21, 21) != 1) /* Convert to fixed point. */ HALT_NYI; - if (uimm (aarch64_get_instr (cpu), 31, 31)) + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); + if (INSTR (31, 31)) { /* Convert to unsigned 64-bit integer. */ - if (uimm (aarch64_get_instr (cpu), 22, 22)) + if (INSTR (22, 22)) { double d = aarch64_get_FP_double (cpu, rs); uint64_t value = (uint64_t) d; /* Do not raise an exception if we have reached ULONG_MAX. */ if (value != (1UL << 63)) - RAISE_EXCEPTIONS (d, value, DOUBLE, LONG); + RAISE_EXCEPTIONS (d, value, DOUBLE, ULONG); aarch64_set_reg_u64 (cpu, rd, NO_SP, value); } @@ -7528,7 +8432,7 @@ do_fcvtzu (sim_cpu *cpu) /* Do not raise an exception if we have reached ULONG_MAX. 
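   (Strictly, converting an out-of-range float to an unsigned integer
   is undefined behaviour in C; the 1UL << 63 sentinel tested below is
   simply what common x86-64 hosts produce for such conversions, so
   this check is host-specific rather than architectural.)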
*/ if (value != (1UL << 63)) - RAISE_EXCEPTIONS (f, value, FLOAT, LONG); + RAISE_EXCEPTIONS (f, value, FLOAT, ULONG); aarch64_set_reg_u64 (cpu, rd, NO_SP, value); } @@ -7538,14 +8442,14 @@ do_fcvtzu (sim_cpu *cpu) uint32_t value; /* Convert to unsigned 32-bit integer. */ - if (uimm (aarch64_get_instr (cpu), 22, 22)) + if (INSTR (22, 22)) { double d = aarch64_get_FP_double (cpu, rs); value = (uint32_t) d; /* Do not raise an exception if we have reached UINT_MAX. */ if (value != (1UL << 31)) - RAISE_EXCEPTIONS (d, value, DOUBLE, INT); + RAISE_EXCEPTIONS (d, value, DOUBLE, UINT); } else { @@ -7554,7 +8458,7 @@ do_fcvtzu (sim_cpu *cpu) value = (uint32_t) f; /* Do not raise an exception if we have reached UINT_MAX. */ if (value != (1UL << 31)) - RAISE_EXCEPTIONS (f, value, FLOAT, INT); + RAISE_EXCEPTIONS (f, value, FLOAT, UINT); } aarch64_set_reg_u64 (cpu, rd, NO_SP, value); @@ -7573,21 +8477,22 @@ do_UCVTF (sim_cpu *cpu) instr[9,5] = Rs instr[4,0] = Rd. */ - unsigned rs = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned rd = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned rs = INSTR (9, 5); + unsigned rd = INSTR (4, 0); NYI_assert (30, 23, 0x3C); NYI_assert (20, 16, 0x03); - if (uimm (aarch64_get_instr (cpu), 21, 21) != 1) + if (INSTR (21, 21) != 1) HALT_NYI; /* FIXME: Add exception raising. */ - if (uimm (aarch64_get_instr (cpu), 31, 31)) + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); + if (INSTR (31, 31)) { uint64_t value = aarch64_get_reg_u64 (cpu, rs, NO_SP); - if (uimm (aarch64_get_instr (cpu), 22, 22)) + if (INSTR (22, 22)) aarch64_set_FP_double (cpu, rd, (double) value); else aarch64_set_FP_float (cpu, rd, (float) value); @@ -7596,7 +8501,7 @@ do_UCVTF (sim_cpu *cpu) { uint32_t value = aarch64_get_reg_u32 (cpu, rs, NO_SP); - if (uimm (aarch64_get_instr (cpu), 22, 22)) + if (INSTR (22, 22)) aarch64_set_FP_double (cpu, rd, (double) value); else aarch64_set_FP_float (cpu, rd, (float) value); @@ -7612,15 +8517,16 @@ float_vector_move (sim_cpu *cpu) instr[9,5] ==> source instr[4,0] ==> dest. */ - unsigned rn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned rd = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned rn = INSTR (9, 5); + unsigned rd = INSTR (4, 0); NYI_assert (31, 17, 0x4F57); - if (uimm (aarch64_get_instr (cpu), 15, 10) != 0) + if (INSTR (15, 10) != 0) HALT_UNALLOC; - if (uimm (aarch64_get_instr (cpu), 16, 16)) + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); + if (INSTR (16, 16)) aarch64_set_vec_u64 (cpu, rd, 1, aarch64_get_reg_u64 (cpu, rn, NO_SP)); else aarch64_set_reg_u64 (cpu, rd, NO_SP, aarch64_get_vec_u64 (cpu, rn, 1)); @@ -7646,22 +8552,22 @@ dexSimpleFPIntegerConvert (sim_cpu *cpu) uint32_t size; uint32_t S; - if (uimm (aarch64_get_instr (cpu), 31, 17) == 0x4F57) + if (INSTR (31, 17) == 0x4F57) { float_vector_move (cpu); return; } - size = uimm (aarch64_get_instr (cpu), 31, 31); - S = uimm (aarch64_get_instr (cpu), 29, 29); + size = INSTR (31, 31); + S = INSTR (29, 29); if (S != 0) HALT_UNALLOC; - type = uimm (aarch64_get_instr (cpu), 23, 22); + type = INSTR (23, 22); if (type > 1) HALT_UNALLOC; - rmode_opcode = uimm (aarch64_get_instr (cpu), 20, 16); + rmode_opcode = INSTR (20, 16); size_type = (size << 1) | type; /* 0==32f, 1==32d, 2==64f, 3==64d. */ switch (rmode_opcode) @@ -7673,8 +8579,6 @@ dexSimpleFPIntegerConvert (sim_cpu *cpu) case 1: scvtd32 (cpu); return; case 2: scvtf (cpu); return; case 3: scvtd (cpu); return; - default: - HALT_UNREACHABLE; } case 6: /* FMOV GR, Vec. 
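   These FMOV forms copy raw bits between a general register and
   element 0 of a vector register with no value conversion; see
   fgmovs/fgmovd and gfmovs/gfmovd above.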
*/ @@ -7700,7 +8604,6 @@ dexSimpleFPIntegerConvert (sim_cpu *cpu) case 1: fcvtszd32 (cpu); return; case 2: fcvtszs (cpu); return; case 3: fcvtszd (cpu); return; - default: HALT_UNREACHABLE; } case 25: do_fcvtzu (cpu); return; @@ -7724,8 +8627,22 @@ set_flags_for_float_compare (sim_cpu *cpu, float fvalue1, float fvalue2) { uint32_t flags; + /* FIXME: Add exception raising. */ if (isnan (fvalue1) || isnan (fvalue2)) flags = C|V; + else if (isinf (fvalue1) && isinf (fvalue2)) + { + /* Subtracting two infinities may give a NaN. We only need to compare + the signs, which we can get from isinf. */ + int result = isinf (fvalue1) - isinf (fvalue2); + + if (result == 0) + flags = Z|C; + else if (result < 0) + flags = N; + else /* (result > 0). */ + flags = C; + } else { float result = fvalue1 - fvalue2; @@ -7744,12 +8661,13 @@ set_flags_for_float_compare (sim_cpu *cpu, float fvalue1, float fvalue2) static void fcmps (sim_cpu *cpu) { - unsigned sm = uimm (aarch64_get_instr (cpu), 20, 16); - unsigned sn = uimm (aarch64_get_instr (cpu), 9, 5); + unsigned sm = INSTR (20, 16); + unsigned sn = INSTR ( 9, 5); float fvalue1 = aarch64_get_FP_float (cpu, sn); float fvalue2 = aarch64_get_FP_float (cpu, sm); + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); set_flags_for_float_compare (cpu, fvalue1, fvalue2); } @@ -7758,9 +8676,10 @@ fcmps (sim_cpu *cpu) static void fcmpzs (sim_cpu *cpu) { - unsigned sn = uimm (aarch64_get_instr (cpu), 9, 5); + unsigned sn = INSTR ( 9, 5); float fvalue1 = aarch64_get_FP_float (cpu, sn); + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); set_flags_for_float_compare (cpu, fvalue1, 0.0f); } @@ -7768,12 +8687,13 @@ fcmpzs (sim_cpu *cpu) static void fcmpes (sim_cpu *cpu) { - unsigned sm = uimm (aarch64_get_instr (cpu), 20, 16); - unsigned sn = uimm (aarch64_get_instr (cpu), 9, 5); + unsigned sm = INSTR (20, 16); + unsigned sn = INSTR ( 9, 5); float fvalue1 = aarch64_get_FP_float (cpu, sn); float fvalue2 = aarch64_get_FP_float (cpu, sm); + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); set_flags_for_float_compare (cpu, fvalue1, fvalue2); } @@ -7781,9 +8701,10 @@ fcmpes (sim_cpu *cpu) static void fcmpzes (sim_cpu *cpu) { - unsigned sn = uimm (aarch64_get_instr (cpu), 9, 5); + unsigned sn = INSTR ( 9, 5); float fvalue1 = aarch64_get_FP_float (cpu, sn); + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); set_flags_for_float_compare (cpu, fvalue1, 0.0f); } @@ -7792,8 +8713,22 @@ set_flags_for_double_compare (sim_cpu *cpu, double dval1, double dval2) { uint32_t flags; + /* FIXME: Add exception raising. */ if (isnan (dval1) || isnan (dval2)) flags = C|V; + else if (isinf (dval1) && isinf (dval2)) + { + /* Subtracting two infinities may give a NaN. We only need to compare + the signs, which we can get from isinf. */ + int result = isinf (dval1) - isinf (dval2); + + if (result == 0) + flags = Z|C; + else if (result < 0) + flags = N; + else /* (result > 0). 
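             Portability note: the isinf subtraction above relies on
             the glibc behaviour of returning -1 for -Inf and +1 for
             +Inf; C99 only guarantees a nonzero result.  A portable
             variant could compare sign bits instead:

               int s1 = signbit (dval1) ? -1 : 1;
               int s2 = signbit (dval2) ? -1 : 1;
               int result = s1 - s2;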
*/ + flags = C; + } else { double result = dval1 - dval2; @@ -7813,12 +8748,13 @@ set_flags_for_double_compare (sim_cpu *cpu, double dval1, double dval2) static void fcmpd (sim_cpu *cpu) { - unsigned sm = uimm (aarch64_get_instr (cpu), 20, 16); - unsigned sn = uimm (aarch64_get_instr (cpu), 9, 5); + unsigned sm = INSTR (20, 16); + unsigned sn = INSTR ( 9, 5); double dvalue1 = aarch64_get_FP_double (cpu, sn); double dvalue2 = aarch64_get_FP_double (cpu, sm); + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); set_flags_for_double_compare (cpu, dvalue1, dvalue2); } @@ -7827,9 +8763,10 @@ fcmpd (sim_cpu *cpu) static void fcmpzd (sim_cpu *cpu) { - unsigned sn = uimm (aarch64_get_instr (cpu), 9, 5); + unsigned sn = INSTR ( 9, 5); double dvalue1 = aarch64_get_FP_double (cpu, sn); + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); set_flags_for_double_compare (cpu, dvalue1, 0.0); } @@ -7837,12 +8774,13 @@ fcmpzd (sim_cpu *cpu) static void fcmped (sim_cpu *cpu) { - unsigned sm = uimm (aarch64_get_instr (cpu), 20, 16); - unsigned sn = uimm (aarch64_get_instr (cpu), 9, 5); + unsigned sm = INSTR (20, 16); + unsigned sn = INSTR ( 9, 5); double dvalue1 = aarch64_get_FP_double (cpu, sn); double dvalue2 = aarch64_get_FP_double (cpu, sm); + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); set_flags_for_double_compare (cpu, dvalue1, dvalue2); } @@ -7850,9 +8788,10 @@ fcmped (sim_cpu *cpu) static void fcmpzed (sim_cpu *cpu) { - unsigned sn = uimm (aarch64_get_instr (cpu), 9, 5); + unsigned sn = INSTR ( 9, 5); double dvalue1 = aarch64_get_FP_double (cpu, sn); + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); set_flags_for_double_compare (cpu, dvalue1, 0.0); } @@ -7869,11 +8808,10 @@ dexSimpleFPCompare (sim_cpu *cpu) 01000 ==> FCMPZ, 11000 ==> FCMPEZ, ow ==> UNALLOC */ uint32_t dispatch; - uint32_t M_S = (uimm (aarch64_get_instr (cpu), 31, 31) << 1) - | uimm (aarch64_get_instr (cpu), 29, 29); - uint32_t type = uimm (aarch64_get_instr (cpu), 23, 22); - uint32_t op = uimm (aarch64_get_instr (cpu), 15, 14); - uint32_t op2_2_0 = uimm (aarch64_get_instr (cpu), 2, 0); + uint32_t M_S = (INSTR (31, 31) << 1) | INSTR (29, 29); + uint32_t type = INSTR (23, 22); + uint32_t op = INSTR (15, 14); + uint32_t op2_2_0 = INSTR (2, 0); if (op2_2_0 != 0) HALT_UNALLOC; @@ -7888,7 +8826,7 @@ dexSimpleFPCompare (sim_cpu *cpu) HALT_UNALLOC; /* dispatch on type and top 2 bits of opcode. */ - dispatch = (type << 2) | uimm (aarch64_get_instr (cpu), 4, 3); + dispatch = (type << 2) | INSTR (4, 3); switch (dispatch) { @@ -7900,26 +8838,26 @@ dexSimpleFPCompare (sim_cpu *cpu) case 5: fcmpzd (cpu); return; case 6: fcmped (cpu); return; case 7: fcmpzed (cpu); return; - default: HALT_UNREACHABLE; } } static void do_scalar_FADDP (sim_cpu *cpu) { - /* instr [31,23] = 011111100 + /* instr [31,23] = 0111 1110 0 instr [22] = single(0)/double(1) - instr [21,10] = 1100 0011 0110 + instr [21,10] = 11 0000 1101 10 instr [9,5] = Fn instr [4,0] = Fd. */ - unsigned Fn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned Fd = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned Fn = INSTR (9, 5); + unsigned Fd = INSTR (4, 0); NYI_assert (31, 23, 0x0FC); NYI_assert (21, 10, 0xC36); - if (uimm (aarch64_get_instr (cpu), 22, 22)) + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); + if (INSTR (22, 22)) { double val1 = aarch64_get_vec_double (cpu, Fn, 0); double val2 = aarch64_get_vec_double (cpu, Fn, 1); @@ -7948,15 +8886,16 @@ do_scalar_FABD (sim_cpu *cpu) instr [9, 5] = Rn instr [4, 0] = Rd. 
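     FABD is the absolute difference |Fn - Fm| in the selected
     precision; the body below computes it with a host subtraction
     followed by fabs/fabsf.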
*/ - unsigned rm = uimm (aarch64_get_instr (cpu), 20, 16); - unsigned rn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned rd = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned rm = INSTR (20, 16); + unsigned rn = INSTR (9, 5); + unsigned rd = INSTR (4, 0); NYI_assert (31, 23, 0x0FD); NYI_assert (21, 21, 1); NYI_assert (15, 10, 0x35); - if (uimm (aarch64_get_instr (cpu), 22, 22)) + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); + if (INSTR (22, 22)) aarch64_set_FP_double (cpu, rd, fabs (aarch64_get_FP_double (cpu, rn) - aarch64_get_FP_double (cpu, rm))); @@ -7975,13 +8914,14 @@ do_scalar_CMGT (sim_cpu *cpu) instr [9, 5] = Rn instr [4, 0] = Rd. */ - unsigned rm = uimm (aarch64_get_instr (cpu), 20, 16); - unsigned rn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned rd = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned rm = INSTR (20, 16); + unsigned rn = INSTR (9, 5); + unsigned rd = INSTR (4, 0); NYI_assert (31, 21, 0x2F7); NYI_assert (15, 10, 0x0D); + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); aarch64_set_vec_u64 (cpu, rd, 0, aarch64_get_vec_u64 (cpu, rn, 0) > aarch64_get_vec_u64 (cpu, rm, 0) ? -1L : 0L); @@ -7996,38 +8936,219 @@ do_scalar_USHR (sim_cpu *cpu) instr [9, 5] = Rn instr [4, 0] = Rd. */ - unsigned amount = 128 - uimm (aarch64_get_instr (cpu), 22, 16); - unsigned rn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned rd = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned amount = 128 - INSTR (22, 16); + unsigned rn = INSTR (9, 5); + unsigned rd = INSTR (4, 0); NYI_assert (31, 23, 0x0FE); NYI_assert (15, 10, 0x01); + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); aarch64_set_vec_u64 (cpu, rd, 0, aarch64_get_vec_u64 (cpu, rn, 0) >> amount); } static void -do_scalar_SHL (sim_cpu *cpu) +do_scalar_SSHL (sim_cpu *cpu) +{ + /* instr [31,21] = 0101 1110 111 + instr [20,16] = Rm + instr [15,10] = 0100 01 + instr [9, 5] = Rn + instr [4, 0] = Rd. */ + + unsigned rm = INSTR (20, 16); + unsigned rn = INSTR (9, 5); + unsigned rd = INSTR (4, 0); + signed int shift = aarch64_get_vec_s8 (cpu, rm, 0); + + NYI_assert (31, 21, 0x2F7); + NYI_assert (15, 10, 0x11); + + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); + if (shift >= 0) + aarch64_set_vec_s64 (cpu, rd, 0, + aarch64_get_vec_s64 (cpu, rn, 0) << shift); + else + aarch64_set_vec_s64 (cpu, rd, 0, + aarch64_get_vec_s64 (cpu, rn, 0) >> - shift); +} + +/* Floating point scalar compare greater than or equal to 0. */ +static void +do_scalar_FCMGE_zero (sim_cpu *cpu) +{ + /* instr [31,23] = 0111 1110 1 + instr [22,22] = size + instr [21,16] = 1000 00 + instr [15,10] = 1100 10 + instr [9, 5] = Rn + instr [4, 0] = Rd. */ + + unsigned size = INSTR (22, 22); + unsigned rn = INSTR (9, 5); + unsigned rd = INSTR (4, 0); + + NYI_assert (31, 23, 0x0FD); + NYI_assert (21, 16, 0x20); + NYI_assert (15, 10, 0x32); + + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); + if (size) + aarch64_set_vec_u64 (cpu, rd, 0, + aarch64_get_vec_double (cpu, rn, 0) >= 0.0 ? -1 : 0); + else + aarch64_set_vec_u32 (cpu, rd, 0, + aarch64_get_vec_float (cpu, rn, 0) >= 0.0 ? -1 : 0); +} + +/* Floating point scalar compare less than or equal to 0. */ +static void +do_scalar_FCMLE_zero (sim_cpu *cpu) +{ + /* instr [31,23] = 0111 1110 1 + instr [22,22] = size + instr [21,16] = 1000 00 + instr [15,10] = 1101 10 + instr [9, 5] = Rn + instr [4, 0] = Rd. 
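     As with the other zero-compare helpers here, a true comparison
     writes an all-ones mask (-1) to element 0 of Rd and a false one
     writes zero.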
*/ + + unsigned size = INSTR (22, 22); + unsigned rn = INSTR (9, 5); + unsigned rd = INSTR (4, 0); + + NYI_assert (31, 23, 0x0FD); + NYI_assert (21, 16, 0x20); + NYI_assert (15, 10, 0x36); + + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); + if (size) + aarch64_set_vec_u64 (cpu, rd, 0, + aarch64_get_vec_double (cpu, rn, 0) <= 0.0 ? -1 : 0); + else + aarch64_set_vec_u32 (cpu, rd, 0, + aarch64_get_vec_float (cpu, rn, 0) <= 0.0 ? -1 : 0); +} + +/* Floating point scalar compare greater than 0. */ +static void +do_scalar_FCMGT_zero (sim_cpu *cpu) +{ + /* instr [31,23] = 0101 1110 1 + instr [22,22] = size + instr [21,16] = 1000 00 + instr [15,10] = 1100 10 + instr [9, 5] = Rn + instr [4, 0] = Rd. */ + + unsigned size = INSTR (22, 22); + unsigned rn = INSTR (9, 5); + unsigned rd = INSTR (4, 0); + + NYI_assert (31, 23, 0x0BD); + NYI_assert (21, 16, 0x20); + NYI_assert (15, 10, 0x32); + + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); + if (size) + aarch64_set_vec_u64 (cpu, rd, 0, + aarch64_get_vec_double (cpu, rn, 0) > 0.0 ? -1 : 0); + else + aarch64_set_vec_u32 (cpu, rd, 0, + aarch64_get_vec_float (cpu, rn, 0) > 0.0 ? -1 : 0); +} + +/* Floating point scalar compare equal to 0. */ +static void +do_scalar_FCMEQ_zero (sim_cpu *cpu) { - /* instr [31,23] = 0111 1101 0 + /* instr [31,23] = 0101 1110 1 + instr [22,22] = size + instr [21,16] = 1000 00 + instr [15,10] = 1101 10 + instr [9, 5] = Rn + instr [4, 0] = Rd. */ + + unsigned size = INSTR (22, 22); + unsigned rn = INSTR (9, 5); + unsigned rd = INSTR (4, 0); + + NYI_assert (31, 23, 0x0BD); + NYI_assert (21, 16, 0x20); + NYI_assert (15, 10, 0x36); + + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); + if (size) + aarch64_set_vec_u64 (cpu, rd, 0, + aarch64_get_vec_double (cpu, rn, 0) == 0.0 ? -1 : 0); + else + aarch64_set_vec_u32 (cpu, rd, 0, + aarch64_get_vec_float (cpu, rn, 0) == 0.0 ? -1 : 0); +} + +/* Floating point scalar compare less than 0. */ +static void +do_scalar_FCMLT_zero (sim_cpu *cpu) +{ + /* instr [31,23] = 0101 1110 1 + instr [22,22] = size + instr [21,16] = 1000 00 + instr [15,10] = 1110 10 + instr [9, 5] = Rn + instr [4, 0] = Rd. */ + + unsigned size = INSTR (22, 22); + unsigned rn = INSTR (9, 5); + unsigned rd = INSTR (4, 0); + + NYI_assert (31, 23, 0x0BD); + NYI_assert (21, 16, 0x20); + NYI_assert (15, 10, 0x3A); + + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); + if (size) + aarch64_set_vec_u64 (cpu, rd, 0, + aarch64_get_vec_double (cpu, rn, 0) < 0.0 ? -1 : 0); + else + aarch64_set_vec_u32 (cpu, rd, 0, + aarch64_get_vec_float (cpu, rn, 0) < 0.0 ? -1 : 0); +} + +static void +do_scalar_shift (sim_cpu *cpu) +{ + /* instr [31,23] = 0101 1111 0 instr [22,16] = shift amount - instr [15,10] = 0101 01 + instr [15,10] = 0101 01 [SHL] + instr [15,10] = 0000 01 [SSHR] instr [9, 5] = Rn instr [4, 0] = Rd. 
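     For these 64-bit scalar shifts instr[22] must be set (otherwise
     UNALLOC); a right shift amount decodes as 128 - instr[22,16] and a
     left shift amount as instr[22,16] - 64, so e.g. instr[22,16] == 0x41
     means SHL #1 or SSHR #63.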
*/ - unsigned amount = uimm (aarch64_get_instr (cpu), 22, 16) - 64; - unsigned rn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned rd = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned rn = INSTR (9, 5); + unsigned rd = INSTR (4, 0); + unsigned amount; NYI_assert (31, 23, 0x0BE); - NYI_assert (15, 10, 0x15); - if (uimm (aarch64_get_instr (cpu), 22, 22) == 0) + if (INSTR (22, 22) == 0) HALT_UNALLOC; - aarch64_set_vec_u64 (cpu, rd, 0, - aarch64_get_vec_u64 (cpu, rn, 0) << amount); + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); + switch (INSTR (15, 10)) + { + case 0x01: /* SSHR */ + amount = 128 - INSTR (22, 16); + aarch64_set_vec_s64 (cpu, rd, 0, + aarch64_get_vec_s64 (cpu, rn, 0) >> amount); + return; + case 0x15: /* SHL */ + amount = INSTR (22, 16) - 64; + aarch64_set_vec_u64 (cpu, rd, 0, + aarch64_get_vec_u64 (cpu, rn, 0) << amount); + return; + default: + HALT_NYI; + } } /* FCMEQ FCMGT FCMGE. */ @@ -8047,12 +9168,10 @@ do_scalar_FCM (sim_cpu *cpu) instr [9, 5] = Rn instr [4, 0] = Rd. */ - unsigned rm = uimm (aarch64_get_instr (cpu), 20, 16); - unsigned rn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned rd = uimm (aarch64_get_instr (cpu), 4, 0); - unsigned EUac = (uimm (aarch64_get_instr (cpu), 23, 23) << 2) - | (uimm (aarch64_get_instr (cpu), 29, 29) << 1) - | uimm (aarch64_get_instr (cpu), 11, 11); + unsigned rm = INSTR (20, 16); + unsigned rn = INSTR (9, 5); + unsigned rd = INSTR (4, 0); + unsigned EUac = (INSTR (23, 23) << 2) | (INSTR (29, 29) << 1) | INSTR (11, 11); unsigned result; float val1; float val2; @@ -8063,7 +9182,8 @@ do_scalar_FCM (sim_cpu *cpu) NYI_assert (15, 12, 0xE); NYI_assert (10, 10, 1); - if (uimm (aarch64_get_instr (cpu), 22, 22)) + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); + if (INSTR (22, 22)) { double val1 = aarch64_get_FP_double (cpu, rn); double val2 = aarch64_get_FP_double (cpu, rm); @@ -8127,72 +9247,166 @@ do_scalar_FCM (sim_cpu *cpu) HALT_UNALLOC; } - aarch64_set_vec_u32 (cpu, rd, 0, result ? -1 : 0); + aarch64_set_vec_u32 (cpu, rd, 0, result ? -1 : 0); +} + +/* An alias of DUP. */ +static void +do_scalar_MOV (sim_cpu *cpu) +{ + /* instr [31,21] = 0101 1110 000 + instr [20,16] = imm5 + instr [15,10] = 0000 01 + instr [9, 5] = Rn + instr [4, 0] = Rd. */ + + unsigned rn = INSTR (9, 5); + unsigned rd = INSTR (4, 0); + unsigned index; + + NYI_assert (31, 21, 0x2F0); + NYI_assert (15, 10, 0x01); + + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); + if (INSTR (16, 16)) + { + /* 8-bit. */ + index = INSTR (20, 17); + aarch64_set_vec_u8 + (cpu, rd, 0, aarch64_get_vec_u8 (cpu, rn, index)); + } + else if (INSTR (17, 17)) + { + /* 16-bit. */ + index = INSTR (20, 18); + aarch64_set_vec_u16 + (cpu, rd, 0, aarch64_get_vec_u16 (cpu, rn, index)); + } + else if (INSTR (18, 18)) + { + /* 32-bit. */ + index = INSTR (20, 19); + aarch64_set_vec_u32 + (cpu, rd, 0, aarch64_get_vec_u32 (cpu, rn, index)); + } + else if (INSTR (19, 19)) + { + /* 64-bit. */ + index = INSTR (20, 20); + aarch64_set_vec_u64 + (cpu, rd, 0, aarch64_get_vec_u64 (cpu, rn, index)); + } + else + HALT_UNALLOC; +} + +static void +do_scalar_NEG (sim_cpu *cpu) +{ + /* instr [31,10] = 0111 1110 1110 0000 1011 10 + instr [9, 5] = Rn + instr [4, 0] = Rd. 
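     The lowest set bit of imm5 picks the element size and the bits
     above it the element index: imm5<0> ==> byte, index = imm5<4:1>;
     imm5<1> ==> half, index = imm5<4:2>; imm5<2> ==> word, index =
     imm5<4:3>; imm5<3> ==> doubleword, index = imm5<4>.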
*/ + + unsigned rn = INSTR (9, 5); + unsigned rd = INSTR (4, 0); + + NYI_assert (31, 10, 0x1FB82E); + + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); + aarch64_set_vec_u64 (cpu, rd, 0, - aarch64_get_vec_u64 (cpu, rn, 0)); +} + +static void +do_scalar_USHL (sim_cpu *cpu) +{ + /* instr [31,21] = 0111 1110 111 + instr [20,16] = Rm + instr [15,10] = 0100 01 + instr [9, 5] = Rn + instr [4, 0] = Rd. */ + + unsigned rm = INSTR (20, 16); + unsigned rn = INSTR (9, 5); + unsigned rd = INSTR (4, 0); + signed int shift = aarch64_get_vec_s8 (cpu, rm, 0); + + NYI_assert (31, 21, 0x3F7); + NYI_assert (15, 10, 0x11); + + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); + if (shift >= 0) + aarch64_set_vec_u64 (cpu, rd, 0, aarch64_get_vec_u64 (cpu, rn, 0) << shift); + else + aarch64_set_vec_u64 (cpu, rd, 0, aarch64_get_vec_u64 (cpu, rn, 0) >> - shift); +} + +static void +do_double_add (sim_cpu *cpu) +{ + /* instr [31,21] = 0101 1110 111 + instr [20,16] = Fn + instr [15,10] = 1000 01 + instr [9,5] = Fm + instr [4,0] = Fd. */ + unsigned Fd; + unsigned Fm; + unsigned Fn; + double val1; + double val2; + + NYI_assert (31, 21, 0x2F7); + NYI_assert (15, 10, 0x21); + + Fd = INSTR (4, 0); + Fm = INSTR (9, 5); + Fn = INSTR (20, 16); + + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); + val1 = aarch64_get_FP_double (cpu, Fm); + val2 = aarch64_get_FP_double (cpu, Fn); + + aarch64_set_FP_double (cpu, Fd, val1 + val2); } -/* An alias of DUP. */ static void -do_scalar_MOV (sim_cpu *cpu) +do_scalar_UCVTF (sim_cpu *cpu) { - /* instr [31,21] = 0101 1110 000 - instr [20,16] = imm5 - instr [15,10] = 0000 01 - instr [9, 5] = Rn - instr [4, 0] = Rd. */ + /* instr [31,23] = 0111 1110 0 + instr [22] = single(0)/double(1) + instr [21,10] = 10 0001 1101 10 + instr [9,5] = rn + instr [4,0] = rd. */ - unsigned rn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned rd = uimm (aarch64_get_instr (cpu), 4, 0); - unsigned index; + unsigned rn = INSTR (9, 5); + unsigned rd = INSTR (4, 0); - NYI_assert (31, 21, 0x2F0); - NYI_assert (15, 10, 0x01); + NYI_assert (31, 23, 0x0FC); + NYI_assert (21, 10, 0x876); - if (uimm (aarch64_get_instr (cpu), 16, 16)) - { - /* 8-bit. */ - index = uimm (aarch64_get_instr (cpu), 20, 17); - aarch64_set_vec_u8 - (cpu, rd, 0, aarch64_get_vec_u8 (cpu, rn, index)); - } - else if (uimm (aarch64_get_instr (cpu), 17, 17)) - { - /* 16-bit. */ - index = uimm (aarch64_get_instr (cpu), 20, 18); - aarch64_set_vec_u16 - (cpu, rd, 0, aarch64_get_vec_u16 (cpu, rn, index)); - } - else if (uimm (aarch64_get_instr (cpu), 18, 18)) + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); + if (INSTR (22, 22)) { - /* 32-bit. */ - index = uimm (aarch64_get_instr (cpu), 20, 19); - aarch64_set_vec_u32 - (cpu, rd, 0, aarch64_get_vec_u32 (cpu, rn, index)); + uint64_t val = aarch64_get_vec_u64 (cpu, rn, 0); + + aarch64_set_vec_double (cpu, rd, 0, (double) val); } - else if (uimm (aarch64_get_instr (cpu), 19, 19)) + else { - /* 64-bit. */ - index = uimm (aarch64_get_instr (cpu), 20, 20); - aarch64_set_vec_u64 - (cpu, rd, 0, aarch64_get_vec_u64 (cpu, rn, index)); + uint32_t val = aarch64_get_vec_u32 (cpu, rn, 0); + + aarch64_set_vec_float (cpu, rd, 0, (float) val); } - else - HALT_UNALLOC; } static void -do_double_add (sim_cpu *cpu) +do_scalar_vec (sim_cpu *cpu) { + /* instr [30] = 1. */ /* instr [28,25] = 1111. 
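     Dispatch is on instr[31,23] first and then instr[15,10], with a
     further look at instr[21,16] where one slot (0xFC/0x36) is shared
     by FADDP and UCVTF.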
*/ - unsigned Fd; - unsigned Fm; - unsigned Fn; - double val1; - double val2; - - switch (uimm (aarch64_get_instr (cpu), 31, 23)) + switch (INSTR (31, 23)) { case 0xBC: - switch (uimm (aarch64_get_instr (cpu), 15, 10)) + switch (INSTR (15, 10)) { case 0x01: do_scalar_MOV (cpu); return; case 0x39: do_scalar_FCM (cpu); return; @@ -8200,22 +9414,32 @@ do_double_add (sim_cpu *cpu) } break; - case 0xBE: do_scalar_SHL (cpu); return; + case 0xBE: do_scalar_shift (cpu); return; case 0xFC: - switch (uimm (aarch64_get_instr (cpu), 15, 10)) + switch (INSTR (15, 10)) { - case 0x36: do_scalar_FADDP (cpu); return; + case 0x36: + switch (INSTR (21, 16)) + { + case 0x30: do_scalar_FADDP (cpu); return; + case 0x21: do_scalar_UCVTF (cpu); return; + } + HALT_NYI; case 0x39: do_scalar_FCM (cpu); return; case 0x3B: do_scalar_FCM (cpu); return; } break; case 0xFD: - switch (uimm (aarch64_get_instr (cpu), 15, 10)) + switch (INSTR (15, 10)) { case 0x0D: do_scalar_CMGT (cpu); return; + case 0x11: do_scalar_USHL (cpu); return; + case 0x2E: do_scalar_NEG (cpu); return; + case 0x32: do_scalar_FCMGE_zero (cpu); return; case 0x35: do_scalar_FABD (cpu); return; + case 0x36: do_scalar_FCMLE_zero (cpu); return; case 0x39: do_scalar_FCM (cpu); return; case 0x3B: do_scalar_FCM (cpu); return; default: @@ -8223,27 +9447,22 @@ do_double_add (sim_cpu *cpu) } case 0xFE: do_scalar_USHR (cpu); return; - default: - break; - } - - /* instr [31,21] = 0101 1110 111 - instr [20,16] = Fn - instr [15,10] = 1000 01 - instr [9,5] = Fm - instr [4,0] = Fd. */ - if (uimm (aarch64_get_instr (cpu), 31, 21) != 0x2F7 - || uimm (aarch64_get_instr (cpu), 15, 10) != 0x21) - HALT_NYI; - Fd = uimm (aarch64_get_instr (cpu), 4, 0); - Fm = uimm (aarch64_get_instr (cpu), 9, 5); - Fn = uimm (aarch64_get_instr (cpu), 20, 16); - - val1 = aarch64_get_FP_double (cpu, Fm); - val2 = aarch64_get_FP_double (cpu, Fn); + case 0xBD: + switch (INSTR (15, 10)) + { + case 0x21: do_double_add (cpu); return; + case 0x11: do_scalar_SSHL (cpu); return; + case 0x32: do_scalar_FCMGT_zero (cpu); return; + case 0x36: do_scalar_FCMEQ_zero (cpu); return; + case 0x3A: do_scalar_FCMLT_zero (cpu); return; + default: + HALT_NYI; + } - aarch64_set_FP_double (cpu, Fd, val1 + val2); + default: + HALT_NYI; + } } static void @@ -8251,25 +9470,25 @@ dexAdvSIMD1 (sim_cpu *cpu) { /* instr [28,25] = 1 111. */ - /* we are currently only interested in the basic + /* We are currently only interested in the basic scalar fp routines which all have bit 30 = 0. */ - if (uimm (aarch64_get_instr (cpu), 30, 30)) - do_double_add (cpu); + if (INSTR (30, 30)) + do_scalar_vec (cpu); /* instr[24] is set for FP data processing 3-source and clear for all other basic scalar fp instruction groups. */ - else if (uimm (aarch64_get_instr (cpu), 24, 24)) + else if (INSTR (24, 24)) dexSimpleFPDataProc3Source (cpu); /* instr[21] is clear for floating <-> fixed conversions and set for all other basic scalar fp instruction groups. */ - else if (!uimm (aarch64_get_instr (cpu), 21, 21)) + else if (!INSTR (21, 21)) dexSimpleFPFixedConvert (cpu); /* instr[11,10] : 01 ==> cond compare, 10 ==> Data Proc 2 Source 11 ==> cond select, 00 ==> other. */ else - switch (uimm (aarch64_get_instr (cpu), 11, 10)) + switch (INSTR (11, 10)) { case 1: dexSimpleFPCondCompare (cpu); return; case 2: dexSimpleFPDataProc2Source (cpu); return; @@ -8277,20 +9496,20 @@ dexAdvSIMD1 (sim_cpu *cpu) default: /* Now an ordered cascade of tests. - FP immediate has aarch64_get_instr (cpu)[12] == 1. - FP compare has aarch64_get_instr (cpu)[13] == 1. 
- FP Data Proc 1 Source has aarch64_get_instr (cpu)[14] == 1. - FP floating <--> integer conversions has aarch64_get_instr (cpu)[15] == 0. */ - if (uimm (aarch64_get_instr (cpu), 12, 12)) + FP immediate has instr [12] == 1. + FP compare has instr [13] == 1. + FP Data Proc 1 Source has instr [14] == 1. + FP floating <--> integer conversions has instr [15] == 0. */ + if (INSTR (12, 12)) dexSimpleFPImmediate (cpu); - else if (uimm (aarch64_get_instr (cpu), 13, 13)) + else if (INSTR (13, 13)) dexSimpleFPCompare (cpu); - else if (uimm (aarch64_get_instr (cpu), 14, 14)) + else if (INSTR (14, 14)) dexSimpleFPDataProc1Source (cpu); - else if (!uimm (aarch64_get_instr (cpu), 15, 15)) + else if (!INSTR (15, 15)) dexSimpleFPIntegerConvert (cpu); else @@ -8308,14 +9527,14 @@ pcadr (sim_cpu *cpu) instr[30,29] = immlo instr[23,5] = immhi. */ uint64_t address; - unsigned rd = uimm (aarch64_get_instr (cpu), 4, 0); - uint32_t isPage = uimm (aarch64_get_instr (cpu), 31, 31); + unsigned rd = INSTR (4, 0); + uint32_t isPage = INSTR (31, 31); union { int64_t u64; uint64_t s64; } imm; uint64_t offset; imm.s64 = simm64 (aarch64_get_instr (cpu), 23, 5); offset = imm.u64; - offset = (offset << 2) | uimm (aarch64_get_instr (cpu), 30, 29); + offset = (offset << 2) | INSTR (30, 29); address = aarch64_get_PC (cpu); @@ -8325,6 +9544,7 @@ pcadr (sim_cpu *cpu) address &= ~0xfff; } + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); aarch64_set_reg_u64 (cpu, rd, NO_SP, address + offset); } @@ -8351,9 +9571,10 @@ dexPCRelAddressing (sim_cpu *cpu) static void and32 (sim_cpu *cpu, uint32_t bimm) { - unsigned rn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned rd = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned rn = INSTR (9, 5); + unsigned rd = INSTR (4, 0); + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); aarch64_set_reg_u64 (cpu, rd, SP_OK, aarch64_get_reg_u32 (cpu, rn, NO_SP) & bimm); } @@ -8362,9 +9583,10 @@ and32 (sim_cpu *cpu, uint32_t bimm) static void and64 (sim_cpu *cpu, uint64_t bimm) { - unsigned rn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned rd = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned rn = INSTR (9, 5); + unsigned rd = INSTR (4, 0); + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); aarch64_set_reg_u64 (cpu, rd, SP_OK, aarch64_get_reg_u64 (cpu, rn, NO_SP) & bimm); } @@ -8373,12 +9595,13 @@ and64 (sim_cpu *cpu, uint64_t bimm) static void ands32 (sim_cpu *cpu, uint32_t bimm) { - unsigned rn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned rd = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned rn = INSTR (9, 5); + unsigned rd = INSTR (4, 0); uint32_t value1 = aarch64_get_reg_u32 (cpu, rn, NO_SP); uint32_t value2 = bimm; + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); aarch64_set_reg_u64 (cpu, rd, NO_SP, value1 & value2); set_flags_for_binop32 (cpu, value1 & value2); } @@ -8387,12 +9610,13 @@ ands32 (sim_cpu *cpu, uint32_t bimm) static void ands64 (sim_cpu *cpu, uint64_t bimm) { - unsigned rn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned rd = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned rn = INSTR (9, 5); + unsigned rd = INSTR (4, 0); uint64_t value1 = aarch64_get_reg_u64 (cpu, rn, NO_SP); uint64_t value2 = bimm; + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); aarch64_set_reg_u64 (cpu, rd, NO_SP, value1 & value2); set_flags_for_binop64 (cpu, value1 & value2); } @@ -8401,9 +9625,10 @@ ands64 (sim_cpu *cpu, uint64_t bimm) static void eor32 (sim_cpu *cpu, uint32_t bimm) { - unsigned rn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned rd = uimm 
(aarch64_get_instr (cpu), 4, 0); + unsigned rn = INSTR (9, 5); + unsigned rd = INSTR (4, 0); + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); aarch64_set_reg_u64 (cpu, rd, SP_OK, aarch64_get_reg_u32 (cpu, rn, NO_SP) ^ bimm); } @@ -8412,9 +9637,10 @@ eor32 (sim_cpu *cpu, uint32_t bimm) static void eor64 (sim_cpu *cpu, uint64_t bimm) { - unsigned rn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned rd = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned rn = INSTR (9, 5); + unsigned rd = INSTR (4, 0); + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); aarch64_set_reg_u64 (cpu, rd, SP_OK, aarch64_get_reg_u64 (cpu, rn, NO_SP) ^ bimm); } @@ -8423,9 +9649,10 @@ eor64 (sim_cpu *cpu, uint64_t bimm) static void orr32 (sim_cpu *cpu, uint32_t bimm) { - unsigned rn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned rd = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned rn = INSTR (9, 5); + unsigned rd = INSTR (4, 0); + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); aarch64_set_reg_u64 (cpu, rd, SP_OK, aarch64_get_reg_u32 (cpu, rn, NO_SP) | bimm); } @@ -8434,9 +9661,10 @@ orr32 (sim_cpu *cpu, uint32_t bimm) static void orr64 (sim_cpu *cpu, uint64_t bimm) { - unsigned rn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned rd = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned rn = INSTR (9, 5); + unsigned rd = INSTR (4, 0); + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); aarch64_set_reg_u64 (cpu, rd, SP_OK, aarch64_get_reg_u64 (cpu, rn, NO_SP) | bimm); } @@ -8450,10 +9678,11 @@ orr64 (sim_cpu *cpu, uint64_t bimm) static void and32_shift (sim_cpu *cpu, Shift shift, uint32_t count) { - unsigned rm = uimm (aarch64_get_instr (cpu), 20, 16); - unsigned rn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned rd = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned rm = INSTR (20, 16); + unsigned rn = INSTR (9, 5); + unsigned rd = INSTR (4, 0); + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); aarch64_set_reg_u64 (cpu, rd, NO_SP, aarch64_get_reg_u32 (cpu, rn, NO_SP) & shifted32 (aarch64_get_reg_u32 (cpu, rm, NO_SP), shift, count)); @@ -8463,10 +9692,11 @@ and32_shift (sim_cpu *cpu, Shift shift, uint32_t count) static void and64_shift (sim_cpu *cpu, Shift shift, uint32_t count) { - unsigned rm = uimm (aarch64_get_instr (cpu), 20, 16); - unsigned rn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned rd = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned rm = INSTR (20, 16); + unsigned rn = INSTR (9, 5); + unsigned rd = INSTR (4, 0); + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); aarch64_set_reg_u64 (cpu, rd, NO_SP, aarch64_get_reg_u64 (cpu, rn, NO_SP) & shifted64 (aarch64_get_reg_u64 (cpu, rm, NO_SP), shift, count)); @@ -8476,14 +9706,15 @@ and64_shift (sim_cpu *cpu, Shift shift, uint32_t count) static void ands32_shift (sim_cpu *cpu, Shift shift, uint32_t count) { - unsigned rm = uimm (aarch64_get_instr (cpu), 20, 16); - unsigned rn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned rd = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned rm = INSTR (20, 16); + unsigned rn = INSTR (9, 5); + unsigned rd = INSTR (4, 0); uint32_t value1 = aarch64_get_reg_u32 (cpu, rn, NO_SP); uint32_t value2 = shifted32 (aarch64_get_reg_u32 (cpu, rm, NO_SP), shift, count); + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); aarch64_set_reg_u64 (cpu, rd, NO_SP, value1 & value2); set_flags_for_binop32 (cpu, value1 & value2); } @@ -8492,14 +9723,15 @@ ands32_shift (sim_cpu *cpu, Shift shift, uint32_t count) static void ands64_shift (sim_cpu *cpu, Shift shift, uint32_t count) { - unsigned rm = uimm 
(aarch64_get_instr (cpu), 20, 16); - unsigned rn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned rd = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned rm = INSTR (20, 16); + unsigned rn = INSTR (9, 5); + unsigned rd = INSTR (4, 0); uint64_t value1 = aarch64_get_reg_u64 (cpu, rn, NO_SP); uint64_t value2 = shifted64 (aarch64_get_reg_u64 (cpu, rm, NO_SP), shift, count); + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); aarch64_set_reg_u64 (cpu, rd, NO_SP, value1 & value2); set_flags_for_binop64 (cpu, value1 & value2); } @@ -8508,10 +9740,11 @@ ands64_shift (sim_cpu *cpu, Shift shift, uint32_t count) static void bic32_shift (sim_cpu *cpu, Shift shift, uint32_t count) { - unsigned rm = uimm (aarch64_get_instr (cpu), 20, 16); - unsigned rn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned rd = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned rm = INSTR (20, 16); + unsigned rn = INSTR (9, 5); + unsigned rd = INSTR (4, 0); + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); aarch64_set_reg_u64 (cpu, rd, NO_SP, aarch64_get_reg_u32 (cpu, rn, NO_SP) & ~ shifted32 (aarch64_get_reg_u32 (cpu, rm, NO_SP), shift, count)); @@ -8521,10 +9754,11 @@ bic32_shift (sim_cpu *cpu, Shift shift, uint32_t count) static void bic64_shift (sim_cpu *cpu, Shift shift, uint32_t count) { - unsigned rm = uimm (aarch64_get_instr (cpu), 20, 16); - unsigned rn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned rd = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned rm = INSTR (20, 16); + unsigned rn = INSTR (9, 5); + unsigned rd = INSTR (4, 0); + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); aarch64_set_reg_u64 (cpu, rd, NO_SP, aarch64_get_reg_u64 (cpu, rn, NO_SP) & ~ shifted64 (aarch64_get_reg_u64 (cpu, rm, NO_SP), shift, count)); @@ -8534,14 +9768,15 @@ bic64_shift (sim_cpu *cpu, Shift shift, uint32_t count) static void bics32_shift (sim_cpu *cpu, Shift shift, uint32_t count) { - unsigned rm = uimm (aarch64_get_instr (cpu), 20, 16); - unsigned rn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned rd = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned rm = INSTR (20, 16); + unsigned rn = INSTR (9, 5); + unsigned rd = INSTR (4, 0); uint32_t value1 = aarch64_get_reg_u32 (cpu, rn, NO_SP); uint32_t value2 = ~ shifted32 (aarch64_get_reg_u32 (cpu, rm, NO_SP), shift, count); + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); aarch64_set_reg_u64 (cpu, rd, NO_SP, value1 & value2); set_flags_for_binop32 (cpu, value1 & value2); } @@ -8550,14 +9785,15 @@ bics32_shift (sim_cpu *cpu, Shift shift, uint32_t count) static void bics64_shift (sim_cpu *cpu, Shift shift, uint32_t count) { - unsigned rm = uimm (aarch64_get_instr (cpu), 20, 16); - unsigned rn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned rd = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned rm = INSTR (20, 16); + unsigned rn = INSTR (9, 5); + unsigned rd = INSTR (4, 0); uint64_t value1 = aarch64_get_reg_u64 (cpu, rn, NO_SP); uint64_t value2 = ~ shifted64 (aarch64_get_reg_u64 (cpu, rm, NO_SP), shift, count); + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); aarch64_set_reg_u64 (cpu, rd, NO_SP, value1 & value2); set_flags_for_binop64 (cpu, value1 & value2); } @@ -8566,10 +9802,11 @@ bics64_shift (sim_cpu *cpu, Shift shift, uint32_t count) static void eon32_shift (sim_cpu *cpu, Shift shift, uint32_t count) { - unsigned rm = uimm (aarch64_get_instr (cpu), 20, 16); - unsigned rn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned rd = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned rm = INSTR (20, 16); + unsigned rn = INSTR (9, 5); + unsigned rd = INSTR (4, 
0); + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); aarch64_set_reg_u64 (cpu, rd, NO_SP, aarch64_get_reg_u32 (cpu, rn, NO_SP) ^ ~ shifted32 (aarch64_get_reg_u32 (cpu, rm, NO_SP), shift, count)); @@ -8579,10 +9816,11 @@ eon32_shift (sim_cpu *cpu, Shift shift, uint32_t count) static void eon64_shift (sim_cpu *cpu, Shift shift, uint32_t count) { - unsigned rm = uimm (aarch64_get_instr (cpu), 20, 16); - unsigned rn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned rd = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned rm = INSTR (20, 16); + unsigned rn = INSTR (9, 5); + unsigned rd = INSTR (4, 0); + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); aarch64_set_reg_u64 (cpu, rd, NO_SP, aarch64_get_reg_u64 (cpu, rn, NO_SP) ^ ~ shifted64 (aarch64_get_reg_u64 (cpu, rm, NO_SP), shift, count)); @@ -8592,10 +9830,11 @@ eon64_shift (sim_cpu *cpu, Shift shift, uint32_t count) static void eor32_shift (sim_cpu *cpu, Shift shift, uint32_t count) { - unsigned rm = uimm (aarch64_get_instr (cpu), 20, 16); - unsigned rn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned rd = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned rm = INSTR (20, 16); + unsigned rn = INSTR (9, 5); + unsigned rd = INSTR (4, 0); + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); aarch64_set_reg_u64 (cpu, rd, NO_SP, aarch64_get_reg_u32 (cpu, rn, NO_SP) ^ shifted32 (aarch64_get_reg_u32 (cpu, rm, NO_SP), shift, count)); @@ -8605,10 +9844,11 @@ eor32_shift (sim_cpu *cpu, Shift shift, uint32_t count) static void eor64_shift (sim_cpu *cpu, Shift shift, uint32_t count) { - unsigned rm = uimm (aarch64_get_instr (cpu), 20, 16); - unsigned rn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned rd = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned rm = INSTR (20, 16); + unsigned rn = INSTR (9, 5); + unsigned rd = INSTR (4, 0); + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); aarch64_set_reg_u64 (cpu, rd, NO_SP, aarch64_get_reg_u64 (cpu, rn, NO_SP) ^ shifted64 (aarch64_get_reg_u64 (cpu, rm, NO_SP), shift, count)); @@ -8618,10 +9858,11 @@ eor64_shift (sim_cpu *cpu, Shift shift, uint32_t count) static void orr32_shift (sim_cpu *cpu, Shift shift, uint32_t count) { - unsigned rm = uimm (aarch64_get_instr (cpu), 20, 16); - unsigned rn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned rd = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned rm = INSTR (20, 16); + unsigned rn = INSTR (9, 5); + unsigned rd = INSTR (4, 0); + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); aarch64_set_reg_u64 (cpu, rd, NO_SP, aarch64_get_reg_u32 (cpu, rn, NO_SP) | shifted32 (aarch64_get_reg_u32 (cpu, rm, NO_SP), shift, count)); @@ -8631,10 +9872,11 @@ orr32_shift (sim_cpu *cpu, Shift shift, uint32_t count) static void orr64_shift (sim_cpu *cpu, Shift shift, uint32_t count) { - unsigned rm = uimm (aarch64_get_instr (cpu), 20, 16); - unsigned rn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned rd = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned rm = INSTR (20, 16); + unsigned rn = INSTR (9, 5); + unsigned rd = INSTR (4, 0); + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); aarch64_set_reg_u64 (cpu, rd, NO_SP, aarch64_get_reg_u64 (cpu, rn, NO_SP) | shifted64 (aarch64_get_reg_u64 (cpu, rm, NO_SP), shift, count)); @@ -8644,10 +9886,11 @@ orr64_shift (sim_cpu *cpu, Shift shift, uint32_t count) static void orn32_shift (sim_cpu *cpu, Shift shift, uint32_t count) { - unsigned rm = uimm (aarch64_get_instr (cpu), 20, 16); - unsigned rn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned rd = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned rm = INSTR (20, 
16); + unsigned rn = INSTR (9, 5); + unsigned rd = INSTR (4, 0); + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); aarch64_set_reg_u64 (cpu, rd, NO_SP, aarch64_get_reg_u32 (cpu, rn, NO_SP) | ~ shifted32 (aarch64_get_reg_u32 (cpu, rm, NO_SP), shift, count)); @@ -8657,10 +9900,11 @@ orn32_shift (sim_cpu *cpu, Shift shift, uint32_t count) static void orn64_shift (sim_cpu *cpu, Shift shift, uint32_t count) { - unsigned rm = uimm (aarch64_get_instr (cpu), 20, 16); - unsigned rn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned rd = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned rm = INSTR (20, 16); + unsigned rn = INSTR (9, 5); + unsigned rd = INSTR (4, 0); + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); aarch64_set_reg_u64 (cpu, rd, NO_SP, aarch64_get_reg_u64 (cpu, rn, NO_SP) | ~ shifted64 (aarch64_get_reg_u64 (cpu, rm, NO_SP), shift, count)); @@ -8679,13 +9923,13 @@ dexLogicalImmediate (sim_cpu *cpu) instr[4,0] = Rd */ /* 32 bit operations must have N = 0 or else we have an UNALLOC. */ - uint32_t size = uimm (aarch64_get_instr (cpu), 31, 31); - uint32_t N = uimm (aarch64_get_instr (cpu), 22, 22); - /* uint32_t immr = uimm (aarch64_get_instr (cpu), 21, 16);. */ - /* uint32_t imms = uimm (aarch64_get_instr (cpu), 15, 10);. */ - uint32_t index = uimm (aarch64_get_instr (cpu), 22, 10); + uint32_t size = INSTR (31, 31); + uint32_t N = INSTR (22, 22); + /* uint32_t immr = INSTR (21, 16);. */ + /* uint32_t imms = INSTR (15, 10);. */ + uint32_t index = INSTR (22, 10); uint64_t bimm64 = LITable [index]; - uint32_t dispatch = uimm (aarch64_get_instr (cpu), 30, 29); + uint32_t dispatch = INSTR (30, 29); if (~size & N) HALT_UNALLOC; @@ -8730,8 +9974,9 @@ dexLogicalImmediate (sim_cpu *cpu) static void movz32 (sim_cpu *cpu, uint32_t val, uint32_t pos) { - unsigned rd = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned rd = INSTR (4, 0); + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); aarch64_set_reg_u64 (cpu, rd, NO_SP, val << (pos * 16)); } @@ -8739,8 +9984,9 @@ movz32 (sim_cpu *cpu, uint32_t val, uint32_t pos) static void movz64 (sim_cpu *cpu, uint32_t val, uint32_t pos) { - unsigned rd = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned rd = INSTR (4, 0); + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); aarch64_set_reg_u64 (cpu, rd, NO_SP, ((uint64_t) val) << (pos * 16)); } @@ -8748,8 +9994,9 @@ movz64 (sim_cpu *cpu, uint32_t val, uint32_t pos) static void movn32 (sim_cpu *cpu, uint32_t val, uint32_t pos) { - unsigned rd = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned rd = INSTR (4, 0); + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); aarch64_set_reg_u64 (cpu, rd, NO_SP, ((val << (pos * 16)) ^ 0xffffffffU)); } @@ -8757,8 +10004,9 @@ movn32 (sim_cpu *cpu, uint32_t val, uint32_t pos) static void movn64 (sim_cpu *cpu, uint32_t val, uint32_t pos) { - unsigned rd = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned rd = INSTR (4, 0); + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); aarch64_set_reg_u64 (cpu, rd, NO_SP, ((((uint64_t) val) << (pos * 16)) ^ 0xffffffffffffffffULL)); @@ -8768,11 +10016,12 @@ movn64 (sim_cpu *cpu, uint32_t val, uint32_t pos) static void movk32 (sim_cpu *cpu, uint32_t val, uint32_t pos) { - unsigned rd = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned rd = INSTR (4, 0); uint32_t current = aarch64_get_reg_u32 (cpu, rd, NO_SP); uint32_t value = val << (pos * 16); uint32_t mask = ~(0xffffU << (pos * 16)); + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); aarch64_set_reg_u64 (cpu, rd, NO_SP, (value | (current & mask))); } @@ -8780,11 
+10029,12 @@ movk32 (sim_cpu *cpu, uint32_t val, uint32_t pos) static void movk64 (sim_cpu *cpu, uint32_t val, uint32_t pos) { - unsigned rd = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned rd = INSTR (4, 0); uint64_t current = aarch64_get_reg_u64 (cpu, rd, NO_SP); uint64_t value = (uint64_t) val << (pos * 16); uint64_t mask = ~(0xffffULL << (pos * 16)); + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); aarch64_set_reg_u64 (cpu, rd, NO_SP, (value | (current & mask))); } @@ -8802,9 +10052,9 @@ dexMoveWideImmediate (sim_cpu *cpu) we just pass the multiplier. */ uint32_t imm; - uint32_t size = uimm (aarch64_get_instr (cpu), 31, 31); - uint32_t op = uimm (aarch64_get_instr (cpu), 30, 29); - uint32_t shift = uimm (aarch64_get_instr (cpu), 22, 21); + uint32_t size = INSTR (31, 31); + uint32_t op = INSTR (30, 29); + uint32_t shift = INSTR (22, 21); /* 32 bit can only shift 0 or 1 lot of 16. anything else is an unallocated instruction. */ if (size == 0 && shift > 1) HALT_UNALLOC; @@ -8814,7 +10064,7 @@ dexMoveWideImmediate (sim_cpu *cpu) if (op == 1) HALT_UNALLOC; - imm = uimm (aarch64_get_instr (cpu), 20, 5); + imm = INSTR (20, 5); if (size == 0) { @@ -8851,7 +10101,7 @@ static void ubfm32 (sim_cpu *cpu, uint32_t r, uint32_t s) { unsigned rd; - unsigned rn = uimm (aarch64_get_instr (cpu), 9, 5); + unsigned rn = INSTR (9, 5); uint32_t value = aarch64_get_reg_u32 (cpu, rn, NO_SP); /* Pick either s+1-r or s+1 consecutive bits out of the original word. */ @@ -8876,7 +10126,8 @@ ubfm32 (sim_cpu *cpu, uint32_t r, uint32_t s) value >>= r - (s + 1); } - rd = uimm (aarch64_get_instr (cpu), 4, 0); + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); + rd = INSTR (4, 0); aarch64_set_reg_u64 (cpu, rd, NO_SP, value); } @@ -8886,7 +10137,7 @@ static void ubfm (sim_cpu *cpu, uint32_t r, uint32_t s) { unsigned rd; - unsigned rn = uimm (aarch64_get_instr (cpu), 9, 5); + unsigned rn = INSTR (9, 5); uint64_t value = aarch64_get_reg_u64 (cpu, rn, NO_SP); if (r <= s) @@ -8910,7 +10161,8 @@ ubfm (sim_cpu *cpu, uint32_t r, uint32_t s) value >>= r - (s + 1); } - rd = uimm (aarch64_get_instr (cpu), 4, 0); + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); + rd = INSTR (4, 0); aarch64_set_reg_u64 (cpu, rd, NO_SP, value); } @@ -8926,7 +10178,7 @@ static void sbfm32 (sim_cpu *cpu, uint32_t r, uint32_t s) { unsigned rd; - unsigned rn = uimm (aarch64_get_instr (cpu), 9, 5); + unsigned rn = INSTR (9, 5); /* as per ubfm32 but use an ASR instead of an LSR. */ int32_t value = aarch64_get_reg_s32 (cpu, rn, NO_SP); @@ -8941,7 +10193,8 @@ sbfm32 (sim_cpu *cpu, uint32_t r, uint32_t s) value >>= r - (s + 1); } - rd = uimm (aarch64_get_instr (cpu), 4, 0); + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); + rd = INSTR (4, 0); aarch64_set_reg_u64 (cpu, rd, NO_SP, (uint32_t) value); } @@ -8951,7 +10204,7 @@ static void sbfm (sim_cpu *cpu, uint32_t r, uint32_t s) { unsigned rd; - unsigned rn = uimm (aarch64_get_instr (cpu), 9, 5); + unsigned rn = INSTR (9, 5); /* as per ubfm but use an ASR instead of an LSR. 
*/ int64_t value = aarch64_get_reg_s64 (cpu, rn, NO_SP); @@ -8966,7 +10219,8 @@ sbfm (sim_cpu *cpu, uint32_t r, uint32_t s) value >>= r - (s + 1); } - rd = uimm (aarch64_get_instr (cpu), 4, 0); + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); + rd = INSTR (4, 0); aarch64_set_reg_s64 (cpu, rd, NO_SP, value); } @@ -8980,7 +10234,7 @@ sbfm (sim_cpu *cpu, uint32_t r, uint32_t s) static void bfm32 (sim_cpu *cpu, uint32_t r, uint32_t s) { - unsigned rn = uimm (aarch64_get_instr (cpu), 9, 5); + unsigned rn = INSTR (9, 5); uint32_t value = aarch64_get_reg_u32 (cpu, rn, NO_SP); uint32_t mask = -1; unsigned rd; @@ -9014,12 +10268,13 @@ bfm32 (sim_cpu *cpu, uint32_t r, uint32_t s) mask >>= r - (s + 1); } - rd = uimm (aarch64_get_instr (cpu), 4, 0); + rd = INSTR (4, 0); value2 = aarch64_get_reg_u32 (cpu, rd, NO_SP); value2 &= ~mask; value2 |= value; + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); aarch64_set_reg_u64 (cpu, rd, NO_SP, (aarch64_get_reg_u32 (cpu, rd, NO_SP) & ~mask) | value); } @@ -9030,7 +10285,7 @@ static void bfm (sim_cpu *cpu, uint32_t r, uint32_t s) { unsigned rd; - unsigned rn = uimm (aarch64_get_instr (cpu), 9, 5); + unsigned rn = INSTR (9, 5); uint64_t value = aarch64_get_reg_u64 (cpu, rn, NO_SP); uint64_t mask = 0xffffffffffffffffULL; @@ -9061,7 +10316,8 @@ bfm (sim_cpu *cpu, uint32_t r, uint32_t s) mask >>= r - (s + 1); } - rd = uimm (aarch64_get_instr (cpu), 4, 0); + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); + rd = INSTR (4, 0); aarch64_set_reg_u64 (cpu, rd, NO_SP, (aarch64_get_reg_u64 (cpu, rd, NO_SP) & ~mask) | value); } @@ -9081,11 +10337,11 @@ dexBitfieldImmediate (sim_cpu *cpu) /* 32 bit operations must have N = 0 or else we have an UNALLOC. */ uint32_t dispatch; uint32_t imms; - uint32_t size = uimm (aarch64_get_instr (cpu), 31, 31); - uint32_t N = uimm (aarch64_get_instr (cpu), 22, 22); + uint32_t size = INSTR (31, 31); + uint32_t N = INSTR (22, 22); /* 32 bit operations must have immr[5] = 0 and imms[5] = 0. */ /* or else we have an UNALLOC. */ - uint32_t immr = uimm (aarch64_get_instr (cpu), 21, 16); + uint32_t immr = INSTR (21, 16); if (~size & N) HALT_UNALLOC; @@ -9093,12 +10349,12 @@ dexBitfieldImmediate (sim_cpu *cpu) if (!size && uimm (immr, 5, 5)) HALT_UNALLOC; - imms = uimm (aarch64_get_instr (cpu), 15, 10); + imms = INSTR (15, 10); if (!size && uimm (imms, 5, 5)) HALT_UNALLOC; /* Switch on combined size and op. 
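
The move-wide and bitfield-move helpers above reduce to a handful of bit tricks. A standalone sketch of what movz64/movn64/movk64 and the r <= s (UBFX) arm of the bitfield moves compute; the helper names here are hypothetical, not the simulator's API:

#include <stdint.h>

/* MOVZ: place a 16-bit chunk; MOVN: place then invert; MOVK: replace
   one 16-bit lane and keep the rest, exactly as movk64 above does.  */
static uint64_t
movz (uint32_t imm16, unsigned hw)
{
  return (uint64_t) imm16 << (hw * 16);
}

static uint64_t
movn (uint32_t imm16, unsigned hw)
{
  return movz (imm16, hw) ^ 0xffffffffffffffffULL;
}

static uint64_t
movk (uint64_t old, uint32_t imm16, unsigned hw)
{
  uint64_t mask = 0xffffULL << (hw * 16);

  return (old & ~mask) | movz (imm16, hw);
}

/* UBFM with r <= s, i.e. UBFX: move bits [s,r] down to bit 0 with
   zero fill, via the same shift-up/shift-down pair used above.  */
static uint32_t
ubfx32 (uint32_t value, unsigned r, unsigned s)
{
  value <<= 31 - s;      /* discard the bits above s...           */
  value >>= 31 - s + r;  /* ...then the bits below r, zero-filled */
  return value;
}
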
*/ - dispatch = uimm (aarch64_get_instr (cpu), 31, 29); + dispatch = INSTR (31, 29); switch (dispatch) { case 0: sbfm32 (cpu, immr, imms); return; @@ -9119,10 +10375,10 @@ do_EXTR_32 (sim_cpu *cpu) instr[15,10] = imms : 0xxxxx for 32 bit instr[9,5] = Rn instr[4,0] = Rd */ - unsigned rm = uimm (aarch64_get_instr (cpu), 20, 16); - unsigned imms = uimm (aarch64_get_instr (cpu), 15, 10) & 31; - unsigned rn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned rd = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned rm = INSTR (20, 16); + unsigned imms = INSTR (15, 10) & 31; + unsigned rn = INSTR ( 9, 5); + unsigned rd = INSTR ( 4, 0); uint64_t val1; uint64_t val2; @@ -9131,6 +10387,7 @@ do_EXTR_32 (sim_cpu *cpu) val2 = aarch64_get_reg_u32 (cpu, rn, NO_SP); val2 <<= (32 - imms); + TRACE_DECODE (cpu, "emulated at line %d", __LINE__); aarch64_set_reg_u64 (cpu, rd, NO_SP, val1 | val2); } @@ -9142,10 +10399,10 @@ do_EXTR_64 (sim_cpu *cpu) instr[15,10] = imms instr[9,5] = Rn instr[4,0] = Rd */ - unsigned rm = uimm (aarch64_get_instr (cpu), 20, 16); - unsigned imms = uimm (aarch64_get_instr (cpu), 15, 10) & 63; - unsigned rn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned rd = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned rm = INSTR (20, 16); + unsigned imms = INSTR (15, 10) & 63; + unsigned rn = INSTR ( 9, 5); + unsigned rd = INSTR ( 4, 0); uint64_t val; val = aarch64_get_reg_u64 (cpu, rm, NO_SP); @@ -9171,11 +10428,11 @@ dexExtractImmediate (sim_cpu *cpu) /* 32 bit operations must have N = 0 or else we have an UNALLOC. */ /* 64 bit operations must have N = 1 or else we have an UNALLOC. */ uint32_t dispatch; - uint32_t size = uimm (aarch64_get_instr (cpu), 31, 31); - uint32_t N = uimm (aarch64_get_instr (cpu), 22, 22); + uint32_t size = INSTR (31, 31); + uint32_t N = INSTR (22, 22); /* 32 bit operations must have imms[5] = 0 or else we have an UNALLOC. */ - uint32_t imms = uimm (aarch64_get_instr (cpu), 15, 10); + uint32_t imms = INSTR (15, 10); if (size ^ N) HALT_UNALLOC; @@ -9184,7 +10441,7 @@ dexExtractImmediate (sim_cpu *cpu) HALT_UNALLOC; /* Switch on combined size and op. */ - dispatch = uimm (aarch64_get_instr (cpu), 31, 29); + dispatch = INSTR (31, 29); if (dispatch == 0) do_EXTR_32 (cpu); @@ -9251,10 +10508,9 @@ dexLoadUnscaledImmediate (sim_cpu *cpu) instr[23,22] = opc instr[20,12] = simm9 instr[9,5] = rn may be SP. */ - /* unsigned rt = uimm (aarch64_get_instr (cpu), 4, 0); */ - uint32_t V = uimm (aarch64_get_instr (cpu), 26, 26); - uint32_t dispatch = ( (uimm (aarch64_get_instr (cpu), 31, 30) << 2) - | uimm (aarch64_get_instr (cpu), 23, 22)); + /* unsigned rt = INSTR (4, 0); */ + uint32_t V = INSTR (26, 26); + uint32_t dispatch = ((INSTR (31, 30) << 2) | INSTR (23, 22)); int32_t imm = simm32 (aarch64_get_instr (cpu), 20, 12); if (!V) @@ -9328,8 +10584,8 @@ dexLoadUnscaledImmediate (sim_cpu *cpu) static void ldrsb32_abs (sim_cpu *cpu, uint32_t offset) { - unsigned int rn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned int rt = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned int rn = INSTR (9, 5); + unsigned int rt = INSTR (4, 0); /* The target register may not be SP but the source may be there is no scaling required for a byte load. 
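
dexLoadUnscaledImmediate above relies on simm32 to pull the signed 9-bit offset out of bits [20,12]. One way such a helper can be written (a sketch; sext_field is a hypothetical name, and the arithmetic right shift of a negative value is implementation-defined in C, though universal on the targets that matter here):

#include <stdint.h>

static int32_t
sext_field (uint32_t insn, unsigned hi, unsigned lo)
{
  int32_t v = (int32_t) (insn << (31 - hi));  /* field sign bit -> bit 31 */

  return v >> (31 - hi + lo);                 /* shift back, sign-extending */
}

/* e.g. sext_field (insn, 20, 12) recovers the simm9 offset.  */
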
*/ @@ -9343,9 +10599,9 @@ ldrsb32_abs (sim_cpu *cpu, uint32_t offset) static void ldrsb32_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension) { - unsigned int rm = uimm (aarch64_get_instr (cpu), 20, 16); - unsigned int rn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned int rt = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned int rm = INSTR (20, 16); + unsigned int rn = INSTR (9, 5); + unsigned int rt = INSTR (4, 0); /* rn may reference SP, rm and rt must reference ZR. */ @@ -9365,8 +10621,8 @@ static void ldrsb32_wb (sim_cpu *cpu, int32_t offset, WriteBack wb) { uint64_t address; - unsigned int rn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned int rt = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned int rn = INSTR (9, 5); + unsigned int rt = INSTR (4, 0); if (rn == rt && wb != NoWriteBack) HALT_UNALLOC; @@ -9390,8 +10646,8 @@ ldrsb32_wb (sim_cpu *cpu, int32_t offset, WriteBack wb) static void fstrb_abs (sim_cpu *cpu, uint32_t offset) { - unsigned st = uimm (aarch64_get_instr (cpu), 4, 0); - unsigned rn = uimm (aarch64_get_instr (cpu), 9, 5); + unsigned st = INSTR (4, 0); + unsigned rn = INSTR (9, 5); aarch64_set_mem_u8 (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK) + offset, @@ -9403,14 +10659,14 @@ fstrb_abs (sim_cpu *cpu, uint32_t offset) static void fstrb_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension) { - unsigned rm = uimm (aarch64_get_instr (cpu), 20, 16); - unsigned rn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned st = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned rm = INSTR (20, 16); + unsigned rn = INSTR (9, 5); + unsigned st = INSTR (4, 0); uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK); int64_t extended = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP), extension); - uint64_t displacement = OPT_SCALE (extended, 32, scaling); + uint64_t displacement = scaling == Scaled ? 
extended : 0; aarch64_set_mem_u8 (cpu, address + displacement, aarch64_get_vec_u8 (cpu, st, 0)); @@ -9420,8 +10676,8 @@ fstrb_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension) static void fstrh_abs (sim_cpu *cpu, uint32_t offset) { - unsigned st = uimm (aarch64_get_instr (cpu), 4, 0); - unsigned rn = uimm (aarch64_get_instr (cpu), 9, 5); + unsigned st = INSTR (4, 0); + unsigned rn = INSTR (9, 5); aarch64_set_mem_u16 (cpu, @@ -9434,14 +10690,14 @@ fstrh_abs (sim_cpu *cpu, uint32_t offset) static void fstrh_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension) { - unsigned rm = uimm (aarch64_get_instr (cpu), 20, 16); - unsigned rn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned st = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned rm = INSTR (20, 16); + unsigned rn = INSTR (9, 5); + unsigned st = INSTR (4, 0); uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK); int64_t extended = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP), extension); - uint64_t displacement = OPT_SCALE (extended, 32, scaling); + uint64_t displacement = OPT_SCALE (extended, 16, scaling); aarch64_set_mem_u16 (cpu, address + displacement, aarch64_get_vec_u16 (cpu, st, 0)); @@ -9451,28 +10707,28 @@ fstrh_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension) static void fstrs_abs (sim_cpu *cpu, uint32_t offset) { - unsigned st = uimm (aarch64_get_instr (cpu), 4, 0); - unsigned rn = uimm (aarch64_get_instr (cpu), 9, 5); + unsigned st = INSTR (4, 0); + unsigned rn = INSTR (9, 5); - aarch64_set_mem_float + aarch64_set_mem_u32 (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK) + SCALE (offset, 32), - aarch64_get_FP_float (cpu, st)); + aarch64_get_vec_u32 (cpu, st, 0)); } /* 32 bit store unscaled signed 9 bit with pre- or post-writeback. */ static void fstrs_wb (sim_cpu *cpu, int32_t offset, WriteBack wb) { - unsigned rn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned st = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned rn = INSTR (9, 5); + unsigned st = INSTR (4, 0); uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK); if (wb != Post) address += offset; - aarch64_set_mem_float (cpu, address, aarch64_get_FP_float (cpu, st)); + aarch64_set_mem_u32 (cpu, address, aarch64_get_vec_u32 (cpu, st, 0)); if (wb == Post) address += offset; @@ -9486,45 +10742,45 @@ fstrs_wb (sim_cpu *cpu, int32_t offset, WriteBack wb) static void fstrs_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension) { - unsigned rm = uimm (aarch64_get_instr (cpu), 20, 16); - unsigned rn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned st = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned rm = INSTR (20, 16); + unsigned rn = INSTR (9, 5); + unsigned st = INSTR (4, 0); uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK); int64_t extended = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP), extension); uint64_t displacement = OPT_SCALE (extended, 32, scaling); - aarch64_set_mem_float - (cpu, address + displacement, aarch64_get_FP_float (cpu, st)); + aarch64_set_mem_u32 + (cpu, address + displacement, aarch64_get_vec_u32 (cpu, st, 0)); } /* 64 bit store scaled unsigned 12 bit. 
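
The switch from aarch64_set_mem_float to aarch64_set_mem_u32 (and from aarch64_get_FP_float to aarch64_get_vec_u32) stores the raw lane bits rather than round-tripping through a host float; presumably the point is bit-exactness, since a type-punned copy preserves every payload, including NaN bits that an FP conversion could disturb. A minimal standalone illustration of the idiom:

#include <stdint.h>
#include <string.h>

/* Copy a 32-bit FP value to memory without interpreting it.  */
static void
store_f32_bits (uint8_t *mem, float f)
{
  uint32_t bits;

  memcpy (&bits, &f, sizeof bits);  /* bit-exact, no conversion */
  memcpy (mem, &bits, sizeof bits);
}
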
*/ static void fstrd_abs (sim_cpu *cpu, uint32_t offset) { - unsigned st = uimm (aarch64_get_instr (cpu), 4, 0); - unsigned rn = uimm (aarch64_get_instr (cpu), 9, 5); + unsigned st = INSTR (4, 0); + unsigned rn = INSTR (9, 5); - aarch64_set_mem_double + aarch64_set_mem_u64 (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK) + SCALE (offset, 64), - aarch64_get_FP_double (cpu, st)); + aarch64_get_vec_u64 (cpu, st, 0)); } /* 64 bit store unscaled signed 9 bit with pre- or post-writeback. */ static void fstrd_wb (sim_cpu *cpu, int32_t offset, WriteBack wb) { - unsigned rn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned st = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned rn = INSTR (9, 5); + unsigned st = INSTR (4, 0); uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK); if (wb != Post) address += offset; - aarch64_set_mem_double (cpu, address, aarch64_get_FP_double (cpu, st)); + aarch64_set_mem_u64 (cpu, address, aarch64_get_vec_u64 (cpu, st, 0)); if (wb == Post) address += offset; @@ -9538,17 +10794,17 @@ fstrd_wb (sim_cpu *cpu, int32_t offset, WriteBack wb) static void fstrd_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension) { - unsigned rm = uimm (aarch64_get_instr (cpu), 20, 16); - unsigned rn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned st = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned rm = INSTR (20, 16); + unsigned rn = INSTR (9, 5); + unsigned st = INSTR (4, 0); uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK); int64_t extended = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP), extension); uint64_t displacement = OPT_SCALE (extended, 64, scaling); - aarch64_set_mem_double - (cpu, address + displacement, aarch64_get_FP_double (cpu, st)); + aarch64_set_mem_u64 + (cpu, address + displacement, aarch64_get_vec_u64 (cpu, st, 0)); } /* 128 bit store scaled unsigned 12 bit. */ @@ -9556,8 +10812,8 @@ static void fstrq_abs (sim_cpu *cpu, uint32_t offset) { FRegister a; - unsigned st = uimm (aarch64_get_instr (cpu), 4, 0); - unsigned rn = uimm (aarch64_get_instr (cpu), 9, 5); + unsigned st = INSTR (4, 0); + unsigned rn = INSTR (9, 5); uint64_t addr; aarch64_get_FP_long_double (cpu, st, & a); @@ -9571,8 +10827,8 @@ static void fstrq_wb (sim_cpu *cpu, int32_t offset, WriteBack wb) { FRegister a; - unsigned rn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned st = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned rn = INSTR (9, 5); + unsigned st = INSTR (4, 0); uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK); if (wb != Post) @@ -9593,9 +10849,9 @@ fstrq_wb (sim_cpu *cpu, int32_t offset, WriteBack wb) static void fstrq_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension) { - unsigned rm = uimm (aarch64_get_instr (cpu), 20, 16); - unsigned rn = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned st = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned rm = INSTR (20, 16); + unsigned rn = INSTR (9, 5); + unsigned st = INSTR (4, 0); uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK); int64_t extended = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP), @@ -9611,21 +10867,22 @@ fstrq_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension) static void dexLoadImmediatePrePost (sim_cpu *cpu) { - /* instr[29,24] == 111_00 - instr[21] == 0 - instr[11,10] == 00 - instr[31,30] = size - instr[26] = V + /* instr[31,30] = size + instr[29,27] = 111 + instr[26] = V + instr[25,24] = 00 instr[23,22] = opc + instr[21] = 0 instr[20,12] = simm9 - instr[11] = wb : 0 ==> Post, 1 ==> Pre - instr[9,5] = rn may be SP. 
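
All of the _wb helpers above share one addressing pattern: pre-indexed forms add the offset before the access, post-indexed forms after it, and both write the updated address back to the base register. In compact form (a sketch with hypothetical names; the real code performs the update through aarch64_set_reg_u64 on Rn):

#include <stdint.h>

typedef enum { Post, Pre, NoWriteBack } WriteBack;

static uint64_t
effective_address (uint64_t base, int64_t offset, WriteBack wb,
                   uint64_t *new_base)
{
  uint64_t ea = (wb != Post) ? base + offset : base;

  *new_base = base + offset;  /* written to Rn unless wb == NoWriteBack */
  return ea;
}
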
*/ - /* unsigned rt = uimm (aarch64_get_instr (cpu), 4, 0); */ - uint32_t V = uimm (aarch64_get_instr (cpu), 26, 26); - uint32_t dispatch = ( (uimm (aarch64_get_instr (cpu), 31, 30) << 2) - | uimm (aarch64_get_instr (cpu), 23, 22)); - int32_t imm = simm32 (aarch64_get_instr (cpu), 20, 12); - WriteBack wb = writeback (aarch64_get_instr (cpu), 11); + instr[11] = wb : 0 ==> Post, 1 ==> Pre + instr[10] = 0 + instr[9,5] = Rn may be SP. + instr[4,0] = Rt */ + + uint32_t V = INSTR (26, 26); + uint32_t dispatch = ((INSTR (31, 30) << 2) | INSTR (23, 22)); + int32_t imm = simm32 (aarch64_get_instr (cpu), 20, 12); + WriteBack wb = INSTR (11, 11); if (!V) { @@ -9699,11 +10956,10 @@ dexLoadRegisterOffset (sim_cpu *cpu) instr[9,5] = rn instr[4,0] = rt. */ - uint32_t V = uimm (aarch64_get_instr (cpu), 26,26); - uint32_t dispatch = ( (uimm (aarch64_get_instr (cpu), 31, 30) << 2) - | uimm (aarch64_get_instr (cpu), 23, 22)); - Scaling scale = scaling (aarch64_get_instr (cpu), 12); - Extension extensionType = extension (aarch64_get_instr (cpu), 13); + uint32_t V = INSTR (26, 26); + uint32_t dispatch = ((INSTR (31, 30) << 2) | INSTR (23, 22)); + Scaling scale = INSTR (12, 12); + Extension extensionType = INSTR (15, 13); /* Check for illegal extension types. */ if (uimm (extensionType, 1, 1) == 0) HALT_UNALLOC; @@ -9770,17 +11026,17 @@ dexLoadRegisterOffset (sim_cpu *cpu) static void dexLoadUnsignedImmediate (sim_cpu *cpu) { - /* assert instr[29,24] == 111_01 + /* instr[29,24] == 111_01 instr[31,30] = size - instr[26] = V + instr[26] = V instr[23,22] = opc instr[21,10] = uimm12 : unsigned immediate offset - instr[9,5] = rn may be SP. */ - /* unsigned rt = uimm (aarch64_get_instr (cpu), 4, 0); */ - uint32_t V = uimm (aarch64_get_instr (cpu), 26,26); - uint32_t dispatch = ( (uimm (aarch64_get_instr (cpu), 31, 30) << 2) - | uimm (aarch64_get_instr (cpu), 23, 22)); - uint32_t imm = uimm (aarch64_get_instr (cpu), 21, 10); + instr[9,5] = rn may be SP. + instr[4,0] = rt. */ + + uint32_t V = INSTR (26,26); + uint32_t dispatch = ((INSTR (31, 30) << 2) | INSTR (23, 22)); + uint32_t imm = INSTR (21, 10); if (!V) { @@ -9812,19 +11068,17 @@ dexLoadUnsignedImmediate (sim_cpu *cpu) /* FReg operations. */ switch (dispatch) { - case 3: fldrq_abs (cpu, imm); return; - case 9: fldrs_abs (cpu, imm); return; - case 13: fldrd_abs (cpu, imm); return; - case 0: fstrb_abs (cpu, imm); return; - case 2: fstrq_abs (cpu, imm); return; case 4: fstrh_abs (cpu, imm); return; case 8: fstrs_abs (cpu, imm); return; case 12: fstrd_abs (cpu, imm); return; + case 2: fstrq_abs (cpu, imm); return; - case 1: /* LDR 8 bit FP. */ - case 5: /* LDR 8 bit FP. */ - HALT_NYI; + case 1: fldrb_abs (cpu, imm); return; + case 5: fldrh_abs (cpu, imm); return; + case 9: fldrs_abs (cpu, imm); return; + case 13: fldrd_abs (cpu, imm); return; + case 3: fldrq_abs (cpu, imm); return; default: case 6: @@ -9851,7 +11105,7 @@ dexLoadExclusive (sim_cpu *cpu) instr[9,5] = Rn instr[4,0] = Rt. */ - switch (uimm (aarch64_get_instr (cpu), 22, 21)) + switch (INSTR (22, 21)) { case 2: ldxr (cpu); return; case 0: stxr (cpu); return; @@ -9867,14 +11121,13 @@ dexLoadOther (sim_cpu *cpu) /* instr[29,25] = 111_0 instr[24] == 0 ==> dispatch, 1 ==> ldst reg unsigned immediate instr[21:11,10] is the secondary dispatch. 
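
Two recurring decode idioms above, in isolation (the values mirror the code; the helper names are hypothetical): the register-offset forms reject extension encodings whose bit 1 is clear, and each loader dispatches on a size:opc key.

/* Legal index extensions are UXTW (010), UXTX (011), SXTW (110) and
   SXTX (111): bit 1 of the 3-bit field must be set, which is what
   the uimm (extensionType, 1, 1) check above enforces.  */
static int
extension_is_legal (unsigned ext3)
{
  return (ext3 >> 1) & 1;
}

/* The load/store dispatch key used above: instr[31,30]:instr[23,22].  */
static unsigned
ldst_dispatch (unsigned size, unsigned opc)
{
  return (size << 2) | opc;   /* e.g. size 3, opc 1 -> 13 -> fldrd_abs */
}
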
*/ - if (uimm (aarch64_get_instr (cpu), 24, 24)) + if (INSTR (24, 24)) { dexLoadUnsignedImmediate (cpu); return; } - dispatch = ( (uimm (aarch64_get_instr (cpu), 21, 21) << 2) - | uimm (aarch64_get_instr (cpu), 11, 10)); + dispatch = ((INSTR (21, 21) << 2) | INSTR (11, 10)); switch (dispatch) { case 0: dexLoadUnscaledImmediate (cpu); return; @@ -9894,9 +11147,9 @@ dexLoadOther (sim_cpu *cpu) static void store_pair_u32 (sim_cpu *cpu, int32_t offset, WriteBack wb) { - unsigned rn = uimm (aarch64_get_instr (cpu), 14, 10); - unsigned rd = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned rm = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned rn = INSTR (14, 10); + unsigned rd = INSTR (9, 5); + unsigned rm = INSTR (4, 0); uint64_t address = aarch64_get_reg_u64 (cpu, rd, SP_OK); if ((rn == rd || rm == rd) && wb != NoWriteBack) @@ -9922,9 +11175,9 @@ store_pair_u32 (sim_cpu *cpu, int32_t offset, WriteBack wb) static void store_pair_u64 (sim_cpu *cpu, int32_t offset, WriteBack wb) { - unsigned rn = uimm (aarch64_get_instr (cpu), 14, 10); - unsigned rd = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned rm = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned rn = INSTR (14, 10); + unsigned rd = INSTR (9, 5); + unsigned rm = INSTR (4, 0); uint64_t address = aarch64_get_reg_u64 (cpu, rd, SP_OK); if ((rn == rd || rm == rd) && wb != NoWriteBack) @@ -9936,9 +11189,9 @@ store_pair_u64 (sim_cpu *cpu, int32_t offset, WriteBack wb) address += offset; aarch64_set_mem_u64 (cpu, address, - aarch64_get_reg_u64 (cpu, rm, SP_OK)); + aarch64_get_reg_u64 (cpu, rm, NO_SP)); aarch64_set_mem_u64 (cpu, address + 8, - aarch64_get_reg_u64 (cpu, rn, SP_OK)); + aarch64_get_reg_u64 (cpu, rn, NO_SP)); if (wb == Post) address += offset; @@ -9950,12 +11203,12 @@ store_pair_u64 (sim_cpu *cpu, int32_t offset, WriteBack wb) static void load_pair_u32 (sim_cpu *cpu, int32_t offset, WriteBack wb) { - unsigned rn = uimm (aarch64_get_instr (cpu), 14, 10); - unsigned rd = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned rm = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned rn = INSTR (14, 10); + unsigned rd = INSTR (9, 5); + unsigned rm = INSTR (4, 0); uint64_t address = aarch64_get_reg_u64 (cpu, rd, SP_OK); - /* treat this as unalloc to make sure we don't do it. */ + /* Treat this as unalloc to make sure we don't do it. */ if (rn == rm) HALT_UNALLOC; @@ -9977,9 +11230,9 @@ load_pair_u32 (sim_cpu *cpu, int32_t offset, WriteBack wb) static void load_pair_s32 (sim_cpu *cpu, int32_t offset, WriteBack wb) { - unsigned rn = uimm (aarch64_get_instr (cpu), 14, 10); - unsigned rd = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned rm = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned rn = INSTR (14, 10); + unsigned rd = INSTR (9, 5); + unsigned rm = INSTR (4, 0); uint64_t address = aarch64_get_reg_u64 (cpu, rd, SP_OK); /* Treat this as unalloc to make sure we don't do it. */ @@ -10004,9 +11257,9 @@ load_pair_s32 (sim_cpu *cpu, int32_t offset, WriteBack wb) static void load_pair_u64 (sim_cpu *cpu, int32_t offset, WriteBack wb) { - unsigned rn = uimm (aarch64_get_instr (cpu), 14, 10); - unsigned rd = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned rm = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned rn = INSTR (14, 10); + unsigned rd = INSTR (9, 5); + unsigned rm = INSTR (4, 0); uint64_t address = aarch64_get_reg_u64 (cpu, rd, SP_OK); /* Treat this as unalloc to make sure we don't do it. */ @@ -10041,8 +11294,7 @@ dex_load_store_pair_gr (sim_cpu *cpu) instr[ 9, 5] = Rd instr[ 4, 0] = Rm. 
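
The pair helpers above put the first data register at the base address and the second one access-size higher, and reject the UNPREDICTABLE encodings: writeback when the base equals a data register, and loads where both destinations are the same register. (Note the sim's local naming: rd here is the base register, rm and rn are the two data registers.) A store sketch, assuming a flat byte-array memory:

#include <stdint.h>
#include <string.h>

/* STP Xt, Xt2, [addr] in miniature: Xt at [addr], Xt2 at [addr + 8].  */
static void
store_pair_u64_sketch (uint8_t *mem, uint64_t addr,
                       uint64_t xt, uint64_t xt2)
{
  memcpy (mem + addr, &xt, sizeof xt);
  memcpy (mem + addr + 8, &xt2, sizeof xt2);
}
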
*/ - uint32_t dispatch = ((uimm (aarch64_get_instr (cpu), 31, 30) << 3) - | uimm (aarch64_get_instr (cpu), 24, 22)); + uint32_t dispatch = ((INSTR (31, 30) << 3) | INSTR (24, 22)); int32_t offset = simm32 (aarch64_get_instr (cpu), 21, 15); switch (dispatch) @@ -10073,9 +11325,9 @@ dex_load_store_pair_gr (sim_cpu *cpu) static void store_pair_float (sim_cpu *cpu, int32_t offset, WriteBack wb) { - unsigned rn = uimm (aarch64_get_instr (cpu), 14, 10); - unsigned rd = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned rm = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned rn = INSTR (14, 10); + unsigned rd = INSTR (9, 5); + unsigned rm = INSTR (4, 0); uint64_t address = aarch64_get_reg_u64 (cpu, rd, SP_OK); offset <<= 2; @@ -10083,8 +11335,8 @@ store_pair_float (sim_cpu *cpu, int32_t offset, WriteBack wb) if (wb != Post) address += offset; - aarch64_set_mem_float (cpu, address, aarch64_get_FP_float (cpu, rm)); - aarch64_set_mem_float (cpu, address + 4, aarch64_get_FP_float (cpu, rn)); + aarch64_set_mem_u32 (cpu, address, aarch64_get_vec_u32 (cpu, rm, 0)); + aarch64_set_mem_u32 (cpu, address + 4, aarch64_get_vec_u32 (cpu, rn, 0)); if (wb == Post) address += offset; @@ -10096,9 +11348,9 @@ store_pair_float (sim_cpu *cpu, int32_t offset, WriteBack wb) static void store_pair_double (sim_cpu *cpu, int32_t offset, WriteBack wb) { - unsigned rn = uimm (aarch64_get_instr (cpu), 14, 10); - unsigned rd = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned rm = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned rn = INSTR (14, 10); + unsigned rd = INSTR (9, 5); + unsigned rm = INSTR (4, 0); uint64_t address = aarch64_get_reg_u64 (cpu, rd, SP_OK); offset <<= 3; @@ -10106,8 +11358,8 @@ store_pair_double (sim_cpu *cpu, int32_t offset, WriteBack wb) if (wb != Post) address += offset; - aarch64_set_mem_double (cpu, address, aarch64_get_FP_double (cpu, rm)); - aarch64_set_mem_double (cpu, address + 8, aarch64_get_FP_double (cpu, rn)); + aarch64_set_mem_u64 (cpu, address, aarch64_get_vec_u64 (cpu, rm, 0)); + aarch64_set_mem_u64 (cpu, address + 8, aarch64_get_vec_u64 (cpu, rn, 0)); if (wb == Post) address += offset; @@ -10120,9 +11372,9 @@ static void store_pair_long_double (sim_cpu *cpu, int32_t offset, WriteBack wb) { FRegister a; - unsigned rn = uimm (aarch64_get_instr (cpu), 14, 10); - unsigned rd = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned rm = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned rn = INSTR (14, 10); + unsigned rd = INSTR (9, 5); + unsigned rm = INSTR (4, 0); uint64_t address = aarch64_get_reg_u64 (cpu, rd, SP_OK); offset <<= 4; @@ -10145,9 +11397,9 @@ store_pair_long_double (sim_cpu *cpu, int32_t offset, WriteBack wb) static void load_pair_float (sim_cpu *cpu, int32_t offset, WriteBack wb) { - unsigned rn = uimm (aarch64_get_instr (cpu), 14, 10); - unsigned rd = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned rm = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned rn = INSTR (14, 10); + unsigned rd = INSTR (9, 5); + unsigned rm = INSTR (4, 0); uint64_t address = aarch64_get_reg_u64 (cpu, rd, SP_OK); if (rm == rn) @@ -10158,8 +11410,8 @@ load_pair_float (sim_cpu *cpu, int32_t offset, WriteBack wb) if (wb != Post) address += offset; - aarch64_set_FP_float (cpu, rm, aarch64_get_mem_float (cpu, address)); - aarch64_set_FP_float (cpu, rn, aarch64_get_mem_float (cpu, address + 4)); + aarch64_set_vec_u32 (cpu, rm, 0, aarch64_get_mem_u32 (cpu, address)); + aarch64_set_vec_u32 (cpu, rn, 0, aarch64_get_mem_u32 (cpu, address + 4)); if (wb == Post) address += offset; @@ -10171,9 +11423,9 @@ load_pair_float 
(sim_cpu *cpu, int32_t offset, WriteBack wb) static void load_pair_double (sim_cpu *cpu, int32_t offset, WriteBack wb) { - unsigned rn = uimm (aarch64_get_instr (cpu), 14, 10); - unsigned rd = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned rm = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned rn = INSTR (14, 10); + unsigned rd = INSTR (9, 5); + unsigned rm = INSTR (4, 0); uint64_t address = aarch64_get_reg_u64 (cpu, rd, SP_OK); if (rm == rn) @@ -10184,8 +11436,8 @@ load_pair_double (sim_cpu *cpu, int32_t offset, WriteBack wb) if (wb != Post) address += offset; - aarch64_set_FP_double (cpu, rm, aarch64_get_mem_double (cpu, address)); - aarch64_set_FP_double (cpu, rn, aarch64_get_mem_double (cpu, address + 8)); + aarch64_set_vec_u64 (cpu, rm, 0, aarch64_get_mem_u64 (cpu, address)); + aarch64_set_vec_u64 (cpu, rn, 0, aarch64_get_mem_u64 (cpu, address + 8)); if (wb == Post) address += offset; @@ -10198,9 +11450,9 @@ static void load_pair_long_double (sim_cpu *cpu, int32_t offset, WriteBack wb) { FRegister a; - unsigned rn = uimm (aarch64_get_instr (cpu), 14, 10); - unsigned rd = uimm (aarch64_get_instr (cpu), 9, 5); - unsigned rm = uimm (aarch64_get_instr (cpu), 4, 0); + unsigned rn = INSTR (14, 10); + unsigned rd = INSTR (9, 5); + unsigned rm = INSTR (4, 0); uint64_t address = aarch64_get_reg_u64 (cpu, rd, SP_OK); if (rm == rn) @@ -10235,8 +11487,7 @@ dex_load_store_pair_fp (sim_cpu *cpu) instr[ 9, 5] = Rd instr[ 4, 0] = Rm */ - uint32_t dispatch = ((uimm (aarch64_get_instr (cpu), 31, 30) << 3) - | uimm (aarch64_get_instr (cpu), 24, 22)); + uint32_t dispatch = ((INSTR (31, 30) << 3) | INSTR (24, 22)); int32_t offset = simm32 (aarch64_get_instr (cpu), 21, 15); switch (dispatch) @@ -10273,603 +11524,466 @@ vec_reg (unsigned v, unsigned o) return (v + o) & 0x3F; } -/* Load multiple N-element structures to N consecutive registers. */ +/* Load multiple N-element structures to M consecutive registers. */ static void -vec_load (sim_cpu *cpu, uint64_t address, unsigned N) +vec_load (sim_cpu *cpu, uint64_t address, unsigned N, unsigned M) { - int all = uimm (aarch64_get_instr (cpu), 30, 30); - unsigned size = uimm (aarch64_get_instr (cpu), 11, 10); - unsigned vd = uimm (aarch64_get_instr (cpu), 4, 0); - unsigned i; + int all = INSTR (30, 30); + unsigned size = INSTR (11, 10); + unsigned vd = INSTR (4, 0); + unsigned rpt = (N == M) ? 1 : M; + unsigned selem = N; + unsigned i, j, k; switch (size) { case 0: /* 8-bit operations. */ - if (all) - for (i = 0; i < (16 * N); i++) - aarch64_set_vec_u8 (cpu, vec_reg (vd, i >> 4), i & 15, - aarch64_get_mem_u8 (cpu, address + i)); - else - for (i = 0; i < (8 * N); i++) - aarch64_set_vec_u8 (cpu, vec_reg (vd, i >> 3), i & 7, - aarch64_get_mem_u8 (cpu, address + i)); + for (i = 0; i < rpt; i++) + for (j = 0; j < (8 + (8 * all)); j++) + for (k = 0; k < selem; k++) + { + aarch64_set_vec_u8 (cpu, vec_reg (vd, i + k), j, + aarch64_get_mem_u8 (cpu, address)); + address += 1; + } return; case 1: /* 16-bit operations. */ - if (all) - for (i = 0; i < (8 * N); i++) - aarch64_set_vec_u16 (cpu, vec_reg (vd, i >> 3), i & 7, - aarch64_get_mem_u16 (cpu, address + i * 2)); - else - for (i = 0; i < (4 * N); i++) - aarch64_set_vec_u16 (cpu, vec_reg (vd, i >> 2), i & 3, - aarch64_get_mem_u16 (cpu, address + i * 2)); + for (i = 0; i < rpt; i++) + for (j = 0; j < (4 + (4 * all)); j++) + for (k = 0; k < selem; k++) + { + aarch64_set_vec_u16 (cpu, vec_reg (vd, i + k), j, + aarch64_get_mem_u16 (cpu, address)); + address += 2; + } return; case 2: /* 32-bit operations. 
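
The rewritten vec_load walks memory strictly in ascending element order: for each repetition, for each lane, one element per register of the group. That single loop nest is what lets the same routine serve both the interleaving LDn forms and the multi-register LD1 forms. A standalone trace of the fill order (byte case, hypothetical harness):

#include <stdio.h>

static void
ldn_order (int rpt, int lanes, int selem)
{
  int addr = 0;
  int i, j, k;

  for (i = 0; i < rpt; i++)
    for (j = 0; j < lanes; j++)
      for (k = 0; k < selem; k++)
        printf ("V%d.B[%d] <- mem[%d]\n", i + k, j, addr++);
}

/* ldn_order (1, 8, 2) shows LD2's de-interleave across two registers;
   ldn_order (2, 8, 1) shows LD1 {Va.8B, Vb.8B} filling back to back.  */
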
*/ - if (all) - for (i = 0; i < (4 * N); i++) - aarch64_set_vec_u32 (cpu, vec_reg (vd, i >> 2), i & 3, - aarch64_get_mem_u32 (cpu, address + i * 4)); - else - for (i = 0; i < (2 * N); i++) - aarch64_set_vec_u32 (cpu, vec_reg (vd, i >> 1), i & 1, - aarch64_get_mem_u32 (cpu, address + i * 4)); + for (i = 0; i < rpt; i++) + for (j = 0; j < (2 + (2 * all)); j++) + for (k = 0; k < selem; k++) + { + aarch64_set_vec_u32 (cpu, vec_reg (vd, i + k), j, + aarch64_get_mem_u32 (cpu, address)); + address += 4; + } return; case 3: /* 64-bit operations. */ - if (all) - for (i = 0; i < (2 * N); i++) - aarch64_set_vec_u64 (cpu, vec_reg (vd, i >> 1), i & 1, - aarch64_get_mem_u64 (cpu, address + i * 8)); - else - for (i = 0; i < N; i++) - aarch64_set_vec_u64 (cpu, vec_reg (vd, i), 0, - aarch64_get_mem_u64 (cpu, address + i * 8)); + for (i = 0; i < rpt; i++) + for (j = 0; j < (1 + all); j++) + for (k = 0; k < selem; k++) + { + aarch64_set_vec_u64 (cpu, vec_reg (vd, i + k), j, + aarch64_get_mem_u64 (cpu, address)); + address += 8; + } return; - - default: - HALT_UNREACHABLE; } } -/* LD4: load multiple 4-element to four consecutive registers. */ +/* Load multiple 4-element structures into four consecutive registers. */ static void LD4 (sim_cpu *cpu, uint64_t address) { - vec_load (cpu, address, 4); + vec_load (cpu, address, 4, 4); } -/* LD3: load multiple 3-element structures to three consecutive registers. */ +/* Load multiple 3-element structures into three consecutive registers. */ static void LD3 (sim_cpu *cpu, uint64_t address) { - vec_load (cpu, address, 3); + vec_load (cpu, address, 3, 3); } -/* LD2: load multiple 2-element structures to two consecutive registers. */ +/* Load multiple 2-element structures into two consecutive registers. */ static void LD2 (sim_cpu *cpu, uint64_t address) { - vec_load (cpu, address, 2); + vec_load (cpu, address, 2, 2); } /* Load multiple 1-element structures into one register. */ static void LD1_1 (sim_cpu *cpu, uint64_t address) { - int all = uimm (aarch64_get_instr (cpu), 30, 30); - unsigned size = uimm (aarch64_get_instr (cpu), 11, 10); - unsigned vd = uimm (aarch64_get_instr (cpu), 4, 0); - unsigned i; - - switch (size) - { - case 0: - /* LD1 {Vd.16b}, addr, #16 */ - /* LD1 {Vd.8b}, addr, #8 */ - for (i = 0; i < (all ? 16 : 8); i++) - aarch64_set_vec_u8 (cpu, vd, i, - aarch64_get_mem_u8 (cpu, address + i)); - return; - - case 1: - /* LD1 {Vd.8h}, addr, #16 */ - /* LD1 {Vd.4h}, addr, #8 */ - for (i = 0; i < (all ? 8 : 4); i++) - aarch64_set_vec_u16 (cpu, vd, i, - aarch64_get_mem_u16 (cpu, address + i * 2)); - return; - - case 2: - /* LD1 {Vd.4s}, addr, #16 */ - /* LD1 {Vd.2s}, addr, #8 */ - for (i = 0; i < (all ? 4 : 2); i++) - aarch64_set_vec_u32 (cpu, vd, i, - aarch64_get_mem_u32 (cpu, address + i * 4)); - return; - - case 3: - /* LD1 {Vd.2d}, addr, #16 */ - /* LD1 {Vd.1d}, addr, #8 */ - for (i = 0; i < (all ? 2 : 1); i++) - aarch64_set_vec_u64 (cpu, vd, i, - aarch64_get_mem_u64 (cpu, address + i * 8)); - return; - - default: - HALT_UNREACHABLE; - } + vec_load (cpu, address, 1, 1); } /* Load multiple 1-element structures into two registers. */ static void LD1_2 (sim_cpu *cpu, uint64_t address) { - /* FIXME: This algorithm is *exactly* the same as the LD2 version. - So why have two different instructions ? There must be something - wrong somewhere. */ - vec_load (cpu, address, 2); + vec_load (cpu, address, 1, 2); } /* Load multiple 1-element structures into three registers. 
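
The new (N, M) parameters are what resolve the old FIXMEs: LD2 and LD1-with-two-registers reach vec_load with the same register count but different shapes. The mapping, mirrored from the code above into a standalone sketch:

struct ldn_shape { unsigned rpt, selem; };

static struct ldn_shape
ldn_shape (unsigned N, unsigned M)
{
  struct ldn_shape s;

  s.rpt = (N == M) ? 1 : M;  /* LD2 -> one pass; LD1 {Va,Vb} -> two */
  s.selem = N;               /* structure members read per element  */
  return s;
}
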
*/ static void LD1_3 (sim_cpu *cpu, uint64_t address) { - /* FIXME: This algorithm is *exactly* the same as the LD3 version. - So why have two different instructions ? There must be something - wrong somewhere. */ - vec_load (cpu, address, 3); + vec_load (cpu, address, 1, 3); } /* Load multiple 1-element structures into four registers. */ static void LD1_4 (sim_cpu *cpu, uint64_t address) { - /* FIXME: This algorithm is *exactly* the same as the LD4 version. - So why have two different instructions ? There must be something - wrong somewhere. */ - vec_load (cpu, address, 4); + vec_load (cpu, address, 1, 4); } -/* Store multiple N-element structures to N consecutive registers. */ +/* Store multiple N-element structures from M consecutive registers. */ static void -vec_store (sim_cpu *cpu, uint64_t address, unsigned N) +vec_store (sim_cpu *cpu, uint64_t address, unsigned N, unsigned M) { - int all = uimm (aarch64_get_instr (cpu), 30, 30); - unsigned size = uimm (aarch64_get_instr (cpu), 11, 10); - unsigned vd = uimm (aarch64_get_instr (cpu), 4, 0); - unsigned i; + int all = INSTR (30, 30); + unsigned size = INSTR (11, 10); + unsigned vd = INSTR (4, 0); + unsigned rpt = (N == M) ? 1 : M; + unsigned selem = N; + unsigned i, j, k; switch (size) { case 0: /* 8-bit operations. */ - if (all) - for (i = 0; i < (16 * N); i++) - aarch64_set_mem_u8 - (cpu, address + i, - aarch64_get_vec_u8 (cpu, vec_reg (vd, i >> 4), i & 15)); - else - for (i = 0; i < (8 * N); i++) - aarch64_set_mem_u8 - (cpu, address + i, - aarch64_get_vec_u8 (cpu, vec_reg (vd, i >> 3), i & 7)); + for (i = 0; i < rpt; i++) + for (j = 0; j < (8 + (8 * all)); j++) + for (k = 0; k < selem; k++) + { + aarch64_set_mem_u8 + (cpu, address, + aarch64_get_vec_u8 (cpu, vec_reg (vd, i + k), j)); + address += 1; + } return; case 1: /* 16-bit operations. */ - if (all) - for (i = 0; i < (8 * N); i++) - aarch64_set_mem_u16 - (cpu, address + i * 2, - aarch64_get_vec_u16 (cpu, vec_reg (vd, i >> 3), i & 7)); - else - for (i = 0; i < (4 * N); i++) - aarch64_set_mem_u16 - (cpu, address + i * 2, - aarch64_get_vec_u16 (cpu, vec_reg (vd, i >> 2), i & 3)); + for (i = 0; i < rpt; i++) + for (j = 0; j < (4 + (4 * all)); j++) + for (k = 0; k < selem; k++) + { + aarch64_set_mem_u16 + (cpu, address, + aarch64_get_vec_u16 (cpu, vec_reg (vd, i + k), j)); + address += 2; + } return; case 2: /* 32-bit operations. */ - if (all) - for (i = 0; i < (4 * N); i++) - aarch64_set_mem_u32 - (cpu, address + i * 4, - aarch64_get_vec_u32 (cpu, vec_reg (vd, i >> 2), i & 3)); - else - for (i = 0; i < (2 * N); i++) - aarch64_set_mem_u32 - (cpu, address + i * 4, - aarch64_get_vec_u32 (cpu, vec_reg (vd, i >> 1), i & 1)); + for (i = 0; i < rpt; i++) + for (j = 0; j < (2 + (2 * all)); j++) + for (k = 0; k < selem; k++) + { + aarch64_set_mem_u32 + (cpu, address, + aarch64_get_vec_u32 (cpu, vec_reg (vd, i + k), j)); + address += 4; + } return; case 3: /* 64-bit operations. */ - if (all) - for (i = 0; i < (2 * N); i++) - aarch64_set_mem_u64 - (cpu, address + i * 8, - aarch64_get_vec_u64 (cpu, vec_reg (vd, i >> 1), i & 1)); - else - for (i = 0; i < N; i++) - aarch64_set_mem_u64 - (cpu, address + i * 8, - aarch64_get_vec_u64 (cpu, vec_reg (vd, i), 0)); + for (i = 0; i < rpt; i++) + for (j = 0; j < (1 + all); j++) + for (k = 0; k < selem; k++) + { + aarch64_set_mem_u64 + (cpu, address, + aarch64_get_vec_u64 (cpu, vec_reg (vd, i + k), j)); + address += 8; + } return; - - default: - HALT_UNREACHABLE; } } -/* Store multiple 4-element structure to four consecutive registers. 
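
The deleted default: HALT_UNREACHABLE arms really were dead code: size comes from the two-bit field instr[11,10], so cases 0..3 are exhaustive. The shape the dispatch now takes, in miniature:

static void
dispatch_size (unsigned size)
{
  switch (size & 3)   /* two bits: no other value can occur */
    {
    case 0: /* bytes */       break;
    case 1: /* halfwords */   break;
    case 2: /* words */       break;
    case 3: /* doublewords */ break;
    }
}
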
*/ +/* Store multiple 4-element structure from four consecutive registers. */ static void ST4 (sim_cpu *cpu, uint64_t address) { - vec_store (cpu, address, 4); + vec_store (cpu, address, 4, 4); } -/* Store multiple 3-element structures to three consecutive registers. */ +/* Store multiple 3-element structures from three consecutive registers. */ static void ST3 (sim_cpu *cpu, uint64_t address) { - vec_store (cpu, address, 3); + vec_store (cpu, address, 3, 3); } -/* Store multiple 2-element structures to two consecutive registers. */ +/* Store multiple 2-element structures from two consecutive registers. */ static void ST2 (sim_cpu *cpu, uint64_t address) { - vec_store (cpu, address, 2); + vec_store (cpu, address, 2, 2); } -/* Store multiple 1-element structures into one register. */ +/* Store multiple 1-element structures from one register. */ static void ST1_1 (sim_cpu *cpu, uint64_t address) { - int all = uimm (aarch64_get_instr (cpu), 30, 30); - unsigned size = uimm (aarch64_get_instr (cpu), 11, 10); - unsigned vd = uimm (aarch64_get_instr (cpu), 4, 0); - unsigned i; - - switch (size) - { - case 0: - for (i = 0; i < (all ? 16 : 8); i++) - aarch64_set_mem_u8 (cpu, address + i, - aarch64_get_vec_u8 (cpu, vd, i)); - return; - - case 1: - for (i = 0; i < (all ? 8 : 4); i++) - aarch64_set_mem_u16 (cpu, address + i * 2, - aarch64_get_vec_u16 (cpu, vd, i)); - return; - - case 2: - for (i = 0; i < (all ? 4 : 2); i++) - aarch64_set_mem_u32 (cpu, address + i * 4, - aarch64_get_vec_u32 (cpu, vd, i)); - return; - - case 3: - for (i = 0; i < (all ? 2 : 1); i++) - aarch64_set_mem_u64 (cpu, address + i * 8, - aarch64_get_vec_u64 (cpu, vd, i)); - return; - - default: - HALT_UNREACHABLE; - } + vec_store (cpu, address, 1, 1); } -/* Store multiple 1-element structures into two registers. */ +/* Store multiple 1-element structures from two registers. */ static void ST1_2 (sim_cpu *cpu, uint64_t address) { - /* FIXME: This algorithm is *exactly* the same as the ST2 version. - So why have two different instructions ? There must be - something wrong somewhere. */ - vec_store (cpu, address, 2); + vec_store (cpu, address, 1, 2); } -/* Store multiple 1-element structures into three registers. */ +/* Store multiple 1-element structures from three registers. */ static void ST1_3 (sim_cpu *cpu, uint64_t address) { - /* FIXME: This algorithm is *exactly* the same as the ST3 version. - So why have two different instructions ? There must be - something wrong somewhere. */ - vec_store (cpu, address, 3); + vec_store (cpu, address, 1, 3); } -/* Store multiple 1-element structures into four registers. */ +/* Store multiple 1-element structures from four registers. */ static void ST1_4 (sim_cpu *cpu, uint64_t address) { - /* FIXME: This algorithm is *exactly* the same as the ST4 version. - So why have two different instructions ? There must be - something wrong somewhere. 
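
The LDn_STn_SINGLE_LANE_AND_SIZE macro defined just below recovers the lane number by re-packing Q:S:size; how many of those bits belong to the lane index depends on the element width encoded in instr[15,14]. The byte case in isolation (a sketch of the macro's case 0):

/* Byte-wide single-structure forms: Q, S and both size bits all
   select the lane, giving lanes 0..15 on the full-width vector.  */
static unsigned
lane_of_byte_form (unsigned q, unsigned s, unsigned size2)
{
  return (q << 3) | (s << 2) | size2;
}
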
*/ - vec_store (cpu, address, 4); -} + vec_store (cpu, address, 1, 4); +} + +#define LDn_STn_SINGLE_LANE_AND_SIZE() \ + do \ + { \ + switch (INSTR (15, 14)) \ + { \ + case 0: \ + lane = (full << 3) | (s << 2) | size; \ + size = 0; \ + break; \ + \ + case 1: \ + if ((size & 1) == 1) \ + HALT_UNALLOC; \ + lane = (full << 2) | (s << 1) | (size >> 1); \ + size = 1; \ + break; \ + \ + case 2: \ + if ((size & 2) == 2) \ + HALT_UNALLOC; \ + \ + if ((size & 1) == 0) \ + { \ + lane = (full << 1) | s; \ + size = 2; \ + } \ + else \ + { \ + if (s) \ + HALT_UNALLOC; \ + lane = full; \ + size = 3; \ + } \ + break; \ + \ + default: \ + HALT_UNALLOC; \ + } \ + } \ + while (0) +/* Load single structure into one lane of N registers. */ static void -do_vec_LDnR (sim_cpu *cpu, uint64_t address) +do_vec_LDn_single (sim_cpu *cpu, uint64_t address) { /* instr[31] = 0 instr[30] = element selector 0=>half, 1=>all elements instr[29,24] = 00 1101 instr[23] = 0=>simple, 1=>post instr[22] = 1 - instr[21] = width: LD1R-or-LD3R (0) / LD2R-or-LD4R (1) + instr[21] = width: LD1-or-LD3 (0) / LD2-or-LD4 (1) instr[20,16] = 0 0000 (simple), Vinc (reg-post-inc, no SP), 11111 (immediate post inc) - instr[15,14] = 11 - instr[13] = width: LD1R-or-LD2R (0) / LD3R-or-LD4R (1) - instr[12] = 0 - instr[11,10] = element size 00=> byte(b), 01=> half(h), - 10=> word(s), 11=> double(d) + instr[15,13] = opcode + instr[12] = S, used for lane number + instr[11,10] = size, also used for lane number instr[9,5] = address instr[4,0] = Vd */ - unsigned full = uimm (aarch64_get_instr (cpu), 30, 30); - unsigned vd = uimm (aarch64_get_instr (cpu), 4, 0); - unsigned size = uimm (aarch64_get_instr (cpu), 11, 10); + unsigned full = INSTR (30, 30); + unsigned vd = INSTR (4, 0); + unsigned size = INSTR (11, 10); + unsigned s = INSTR (12, 12); + int nregs = ((INSTR (13, 13) << 1) | INSTR (21, 21)) + 1; + int lane = 0; int i; NYI_assert (29, 24, 0x0D); NYI_assert (22, 22, 1); - NYI_assert (15, 14, 3); - NYI_assert (12, 12, 0); - - switch ((uimm (aarch64_get_instr (cpu), 13, 13) << 1) - | uimm (aarch64_get_instr (cpu), 21, 21)) - { - case 0: /* LD1R. */ - switch (size) - { - case 0: - { - uint8_t val = aarch64_get_mem_u8 (cpu, address); - for (i = 0; i < (full ? 16 : 8); i++) - aarch64_set_vec_u8 (cpu, vd, i, val); - break; - } - - case 1: - { - uint16_t val = aarch64_get_mem_u16 (cpu, address); - for (i = 0; i < (full ? 8 : 4); i++) - aarch64_set_vec_u16 (cpu, vd, i, val); - break; - } - - case 2: - { - uint32_t val = aarch64_get_mem_u32 (cpu, address); - for (i = 0; i < (full ? 4 : 2); i++) - aarch64_set_vec_u32 (cpu, vd, i, val); - break; - } - case 3: - { - uint64_t val = aarch64_get_mem_u64 (cpu, address); - for (i = 0; i < (full ? 2 : 1); i++) - aarch64_set_vec_u64 (cpu, vd, i, val); - break; - } + /* Compute the lane number first (using size), and then compute size. */ + LDn_STn_SINGLE_LANE_AND_SIZE (); - default: - HALT_UNALLOC; + for (i = 0; i < nregs; i++) + switch (size) + { + case 0: + { + uint8_t val = aarch64_get_mem_u8 (cpu, address + i); + aarch64_set_vec_u8 (cpu, vd + i, lane, val); + break; } - break; - case 1: /* LD2R. */ - switch (size) + case 1: { - case 0: - { - uint8_t val1 = aarch64_get_mem_u8 (cpu, address); - uint8_t val2 = aarch64_get_mem_u8 (cpu, address + 1); - - for (i = 0; i < (full ? 
16 : 8); i++) - { - aarch64_set_vec_u8 (cpu, vd, 0, val1); - aarch64_set_vec_u8 (cpu, vd + 1, 0, val2); - } - break; - } - - case 1: - { - uint16_t val1 = aarch64_get_mem_u16 (cpu, address); - uint16_t val2 = aarch64_get_mem_u16 (cpu, address + 2); - - for (i = 0; i < (full ? 8 : 4); i++) - { - aarch64_set_vec_u16 (cpu, vd, 0, val1); - aarch64_set_vec_u16 (cpu, vd + 1, 0, val2); - } - break; - } - - case 2: - { - uint32_t val1 = aarch64_get_mem_u32 (cpu, address); - uint32_t val2 = aarch64_get_mem_u32 (cpu, address + 4); - - for (i = 0; i < (full ? 4 : 2); i++) - { - aarch64_set_vec_u32 (cpu, vd, 0, val1); - aarch64_set_vec_u32 (cpu, vd + 1, 0, val2); - } - break; - } - - case 3: - { - uint64_t val1 = aarch64_get_mem_u64 (cpu, address); - uint64_t val2 = aarch64_get_mem_u64 (cpu, address + 8); - - for (i = 0; i < (full ? 2 : 1); i++) - { - aarch64_set_vec_u64 (cpu, vd, 0, val1); - aarch64_set_vec_u64 (cpu, vd + 1, 0, val2); - } - break; - } - - default: - HALT_UNALLOC; + uint16_t val = aarch64_get_mem_u16 (cpu, address + (i * 2)); + aarch64_set_vec_u16 (cpu, vd + i, lane, val); + break; } - break; - case 2: /* LD3R. */ - switch (size) + case 2: { - case 0: - { - uint8_t val1 = aarch64_get_mem_u8 (cpu, address); - uint8_t val2 = aarch64_get_mem_u8 (cpu, address + 1); - uint8_t val3 = aarch64_get_mem_u8 (cpu, address + 2); - - for (i = 0; i < (full ? 16 : 8); i++) - { - aarch64_set_vec_u8 (cpu, vd, 0, val1); - aarch64_set_vec_u8 (cpu, vd + 1, 0, val2); - aarch64_set_vec_u8 (cpu, vd + 2, 0, val3); - } - } + uint32_t val = aarch64_get_mem_u32 (cpu, address + (i * 4)); + aarch64_set_vec_u32 (cpu, vd + i, lane, val); break; + } - case 1: - { - uint32_t val1 = aarch64_get_mem_u16 (cpu, address); - uint32_t val2 = aarch64_get_mem_u16 (cpu, address + 2); - uint32_t val3 = aarch64_get_mem_u16 (cpu, address + 4); - - for (i = 0; i < (full ? 8 : 4); i++) - { - aarch64_set_vec_u16 (cpu, vd, 0, val1); - aarch64_set_vec_u16 (cpu, vd + 1, 0, val2); - aarch64_set_vec_u16 (cpu, vd + 2, 0, val3); - } - } + case 3: + { + uint64_t val = aarch64_get_mem_u64 (cpu, address + (i * 8)); + aarch64_set_vec_u64 (cpu, vd + i, lane, val); break; + } + } +} - case 2: - { - uint32_t val1 = aarch64_get_mem_u32 (cpu, address); - uint32_t val2 = aarch64_get_mem_u32 (cpu, address + 4); - uint32_t val3 = aarch64_get_mem_u32 (cpu, address + 8); +/* Store single structure from one lane from N registers. */ +static void +do_vec_STn_single (sim_cpu *cpu, uint64_t address) +{ + /* instr[31] = 0 + instr[30] = element selector 0=>half, 1=>all elements + instr[29,24] = 00 1101 + instr[23] = 0=>simple, 1=>post + instr[22] = 0 + instr[21] = width: LD1-or-LD3 (0) / LD2-or-LD4 (1) + instr[20,16] = 0 0000 (simple), Vinc (reg-post-inc, no SP), + 11111 (immediate post inc) + instr[15,13] = opcode + instr[12] = S, used for lane number + instr[11,10] = size, also used for lane number + instr[9,5] = address + instr[4,0] = Vd */ - for (i = 0; i < (full ? 
+/* Load single structure into all lanes of N registers.  */
+static void
+do_vec_LDnR (sim_cpu *cpu, uint64_t address)
+{
+  /* instr[31]    = 0
+     instr[30]    = element selector 0=>half, 1=>all elements
+     instr[29,24] = 00 1101
+     instr[23]    = 0=>simple, 1=>post
+     instr[22]    = 1
+     instr[21]    = width: LD1R-or-LD3R (0) / LD2R-or-LD4R (1)
+     instr[20,16] = 0 0000 (simple), Vinc (reg-post-inc, no SP),
+                    11111 (immediate post inc)
+     instr[15,14] = 11
+     instr[13]    = width: LD1R-or-LD2R (0) / LD3R-or-LD4R (1)
+     instr[12]    = 0
+     instr[11,10] = element size 00=> byte(b), 01=> half(h),
+                                 10=> word(s), 11=> double(d)
+     instr[9,5]   = address
+     instr[4,0]   = Vd  */
+
+  unsigned full = INSTR (30, 30);
+  unsigned vd = INSTR (4, 0);
+  unsigned size = INSTR (11, 10);
+  int nregs = ((INSTR (13, 13) << 1) | INSTR (21, 21)) + 1;
+  int i, n;
+
+  NYI_assert (29, 24, 0x0D);
+  NYI_assert (22, 22, 1);
+  NYI_assert (15, 14, 3);
+  NYI_assert (12, 12, 0);
+
+  for (n = 0; n < nregs; n++)
+    switch (size)
+      {
+      case 0:
+        {
+          uint8_t val = aarch64_get_mem_u8 (cpu, address + n);
+          for (i = 0; i < (full ? 16 : 8); i++)
+            aarch64_set_vec_u8 (cpu, vd + n, i, val);
+          break;
+        }
+
+      case 1:
+        {
+          uint16_t val = aarch64_get_mem_u16 (cpu, address + (n * 2));
+          for (i = 0; i < (full ? 8 : 4); i++)
+            aarch64_set_vec_u16 (cpu, vd + n, i, val);
+          break;
+        }
+
+      case 2:
+        {
+          uint32_t val = aarch64_get_mem_u32 (cpu, address + (n * 4));
+          for (i = 0; i < (full ? 4 : 2); i++)
+            aarch64_set_vec_u32 (cpu, vd + n, i, val);
+          break;
+        }
+
+      case 3:
+        {
+          uint64_t val = aarch64_get_mem_u64 (cpu, address + (n * 8));
+          for (i = 0; i < (full ? 2 : 1); i++)
+            aarch64_set_vec_u64 (cpu, vd + n, i, val);
+          break;
+        }
+
+      default:
+        HALT_UNALLOC;
+      }
+}
 
 static void
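/* Aside (editorial sketch, not part of the patch): the LDnR forms read one
   element per register and replicate it across every lane, e.g. LD2R
   {v0.4s, v1.4s} reads 8 bytes and fills both registers.  Plain C
   rendering with illustrative data:  */

#include <stdint.h>
#include <stdio.h>

int
main (void)
{
  uint32_t mem[2] = { 0xdeadbeef, 0xcafef00d };
  uint32_t v[2][4];  /* two vector registers, four word lanes each.  */
  int n, i;

  for (n = 0; n < 2; n++)      /* nregs = 2 for LD2R.  */
    for (i = 0; i < 4; i++)    /* full => 4 word lanes.  */
      v[n][i] = mem[n];        /* element at address + n * 4, replicated.  */

  printf ("%08x %08x\n", v[0][3], v[1][0]); /* deadbeef cafef00d */
  return 0;
}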
@@ -10880,7 +11994,7 @@ do_vec_load_store (sim_cpu *cpu)
      instr[31]    = 0
      instr[30]    = element selector 0=>half, 1=>all elements
      instr[29,25] = 00110
-     instr[24]    = ?
+     instr[24]    = 0=>multiple struct, 1=>single struct
      instr[23]    = 0=>simple, 1=>post
      instr[22]    = 0=>store, 1=>load
      instr[21]    = 0 (LDn) / small(0)-large(1) selector (LDnR)
@@ -10908,62 +12022,105 @@ do_vec_load_store (sim_cpu *cpu)
      instr[9,5]   = Vn, can be SP
      instr[4,0]   = Vd  */
 
+  int single;
   int post;
   int load;
   unsigned vn;
   uint64_t address;
   int type;
 
-  if (uimm (aarch64_get_instr (cpu), 31, 31) != 0
-      || uimm (aarch64_get_instr (cpu), 29, 25) != 0x06)
+  if (INSTR (31, 31) != 0 || INSTR (29, 25) != 0x06)
     HALT_NYI;
 
-  type = uimm (aarch64_get_instr (cpu), 15, 12);
-  if (type != 0xE && type != 0xE && uimm (aarch64_get_instr (cpu), 21, 21) != 0)
-    HALT_NYI;
-
-  post = uimm (aarch64_get_instr (cpu), 23, 23);
-  load = uimm (aarch64_get_instr (cpu), 22, 22);
-  vn = uimm (aarch64_get_instr (cpu), 9, 5);
+  single = INSTR (24, 24);
+  post = INSTR (23, 23);
+  load = INSTR (22, 22);
+  type = INSTR (15, 12);
+  vn = INSTR (9, 5);
   address = aarch64_get_reg_u64 (cpu, vn, SP_OK);
 
+  if (! single && INSTR (21, 21) != 0)
+    HALT_UNALLOC;
+
   if (post)
     {
-      unsigned vm = uimm (aarch64_get_instr (cpu), 20, 16);
+      unsigned vm = INSTR (20, 16);
 
       if (vm == R31)
        {
         unsigned sizeof_operation;
 
-        switch (type)
-          {
-          case 0: sizeof_operation = 32; break;
-          case 4: sizeof_operation = 24; break;
-          case 8: sizeof_operation = 16; break;
-
-          case 0xC:
-            sizeof_operation = uimm (aarch64_get_instr (cpu), 21, 21) ? 2 : 1;
-            sizeof_operation <<= uimm (aarch64_get_instr (cpu), 11, 10);
-            break;
-
-          case 0xE:
-            sizeof_operation = uimm (aarch64_get_instr (cpu), 21, 21) ? 8 : 4;
-            sizeof_operation <<= uimm (aarch64_get_instr (cpu), 11, 10);
-            break;
-
-          case 2:
-          case 6:
-          case 10:
-          case 7:
-            sizeof_operation = 2 << uimm (aarch64_get_instr (cpu), 11, 10);
-            break;
-
-          default:
-            HALT_UNALLOC;
-          }
-
-        if (uimm (aarch64_get_instr (cpu), 30, 30))
-          sizeof_operation *= 2;
+        if (single)
+          {
+            if ((type >= 0) && (type <= 11))
+              {
+                int nregs = ((INSTR (13, 13) << 1) | INSTR (21, 21)) + 1;
+                switch (INSTR (15, 14))
+                  {
+                  case 0:
+                    sizeof_operation = nregs * 1;
+                    break;
+                  case 1:
+                    sizeof_operation = nregs * 2;
+                    break;
+                  case 2:
+                    if (INSTR (10, 10) == 0)
+                      sizeof_operation = nregs * 4;
+                    else
+                      sizeof_operation = nregs * 8;
+                    break;
+                  default:
+                    HALT_UNALLOC;
+                  }
+              }
+            else if (type == 0xC)
+              {
+                sizeof_operation = INSTR (21, 21) ? 2 : 1;
+                sizeof_operation <<= INSTR (11, 10);
+              }
+            else if (type == 0xE)
+              {
+                sizeof_operation = INSTR (21, 21) ? 4 : 3;
+                sizeof_operation <<= INSTR (11, 10);
+              }
+            else
+              HALT_UNALLOC;
+          }
+        else
+          {
+            switch (type)
+              {
+              case 0: sizeof_operation = 32; break;
+              case 4: sizeof_operation = 24; break;
+              case 8: sizeof_operation = 16; break;
+
+              case 7:
+                /* One register, immediate offset variant.  */
+                sizeof_operation = 8;
+                break;
+
+              case 10:
+                /* Two registers, immediate offset variant.  */
+                sizeof_operation = 16;
+                break;
+
+              case 6:
+                /* Three registers, immediate offset variant.  */
+                sizeof_operation = 24;
+                break;
+
+              case 2:
+                /* Four registers, immediate offset variant.  */
+                sizeof_operation = 32;
+                break;
+
+              default:
+                HALT_UNALLOC;
+              }
+
+            if (INSTR (30, 30))
+              sizeof_operation *= 2;
+          }
 
         aarch64_set_reg_u64 (cpu, vn, SP_OK, address + sizeof_operation);
        }
@@ -10976,6 +12133,29 @@ do_vec_load_store (sim_cpu *cpu)
       NYI_assert (20, 16, 0);
     }
 
+  if (single)
+    {
+      if (load)
+       {
+         if ((type >= 0) && (type <= 11))
+           do_vec_LDn_single (cpu, address);
+         else if ((type == 0xC) || (type == 0xE))
+           do_vec_LDnR (cpu, address);
+         else
+           HALT_UNALLOC;
+         return;
+       }
+
+      /* Stores.  */
+      if ((type >= 0) && (type <= 11))
+       {
+         do_vec_STn_single (cpu, address);
+         return;
+       }
+
+      HALT_UNALLOC;
+    }
+
   if (load)
     {
       switch (type)
@@ -10988,11 +12168,8 @@ do_vec_load_store (sim_cpu *cpu)
        case 10: LD1_2 (cpu, address); return;
        case 7:  LD1_1 (cpu, address); return;
 
-       case 0xE:
-       case 0xC: do_vec_LDnR (cpu, address); return;
-
        default:
-         HALT_NYI;
+         HALT_UNALLOC;
        }
     }
 
@@ -11007,7 +12184,7 @@ do_vec_load_store (sim_cpu *cpu)
        case 10: ST1_2 (cpu, address); return;
        case 7:  ST1_1 (cpu, address); return;
        default:
-         HALT_NYI;
+         HALT_UNALLOC;
        }
     }
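/* Aside (editorial sketch, not part of the patch): with register-31
   post-indexing the base register advances by the total bytes transferred,
   which for the single-structure forms is nregs * esize.  The same
   arithmetic as the writeback switch above, standalone:  */

#include <stdio.h>

static unsigned
single_struct_step (unsigned nregs, unsigned opcode_15_14,
                    unsigned size_bit10)
{
  switch (opcode_15_14)
    {
    case 0: return nregs * 1;                          /* byte lanes */
    case 1: return nregs * 2;                          /* halfword lanes */
    case 2: return size_bit10 ? nregs * 8 : nregs * 4; /* word/dword lanes */
    default: return 0;                                 /* unallocated */
    }
}

int
main (void)
{
  /* ST3 {v0-v2}[n] of halfword lanes: writeback step is 3 * 2 = 6 bytes.  */
  printf ("%u\n", single_struct_step (3, 1, 0));
  return 0;
}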
@@ -11053,36 +12230,27 @@ dexLdSt (sim_cpu *cpu)
 static void
 dexLogicalShiftedRegister (sim_cpu *cpu)
 {
-  /* assert instr[28:24] = 01010
-     instr[31]    = size : 0 ==> 32 bit, 1 ==> 64 bit
-     instr[30,29:21] = op,N : 000 ==> AND, 001 ==> BIC,
-                              010 ==> ORR, 011 ==> ORN
-                              100 ==> EOR, 101 ==> EON,
-                              110 ==> ANDS, 111 ==> BICS
+  /* instr[31]    = size : 0 ==> 32 bit, 1 ==> 64 bit
+     instr[30,29] = op
+     instr[28:24] = 01010
      instr[23,22] = shift : 0 ==> LSL, 1 ==> LSR, 2 ==> ASR, 3 ==> ROR
+     instr[21]    = N
+     instr[20,16] = Rm
      instr[15,10] = count : must be 0xxxxx for 32 bit
-     instr[9,5] = Rn
-     instr[4,0] = Rd  */
-
-  /* unsigned rm = uimm (aarch64_get_instr (cpu), 20, 16);  */
-  uint32_t dispatch;
-  Shift shiftType;
-  uint32_t size = uimm (aarch64_get_instr (cpu), 31, 31);
+     instr[9,5]   = Rn
+     instr[4,0]   = Rd  */
 
-  /* 32 bit operations must have count[5] = 0.  */
-  /* or else we have an UNALLOC.  */
-  uint32_t count = uimm (aarch64_get_instr (cpu), 15, 10);
+  uint32_t size = INSTR (31, 31);
+  Shift shiftType = INSTR (23, 22);
+  uint32_t count = INSTR (15, 10);
 
-  if (!size && uimm (count, 5, 5))
+  /* 32 bit operations must have count[5] = 0.
+     or else we have an UNALLOC.  */
+  if (size == 0 && uimm (count, 5, 5))
     HALT_UNALLOC;
 
-  shiftType = shift (aarch64_get_instr (cpu), 22);
-
-  /* dispatch on size:op:N i.e aarch64_get_instr (cpu)[31,29:21].  */
-  dispatch = ( (uimm (aarch64_get_instr (cpu), 31, 29) << 1)
-              | uimm (aarch64_get_instr (cpu), 21, 21));
-
-  switch (dispatch)
+  /* Dispatch on size:op:N.  */
+  switch ((INSTR (31, 29) << 1) | INSTR (21, 21))
     {
     case 0: and32_shift  (cpu, shiftType, count); return;
     case 1: bic32_shift  (cpu, shiftType, count); return;
@@ -11100,7 +12268,6 @@ dexLogicalShiftedRegister (sim_cpu *cpu)
     case 13:eon64_shift  (cpu, shiftType, count); return;
     case 14:ands64_shift (cpu, shiftType, count); return;
     case 15:bics64_shift (cpu, shiftType, count); return;
-    default: HALT_UNALLOC;
     }
 }
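/* Aside (editorial sketch, not part of the patch): packing size:op:N into a
   4-bit index gives the exhaustive 16-way dispatch above; bit 3 selects 32-
   vs 64-bit, bits 2..1 select AND/ORR/EOR/ANDS, and bit 0 (N) inverts the
   second operand, producing BIC/ORN/EON/BICS.  The index computation:  */

#include <stdio.h>

int
main (void)
{
  unsigned size = 1, op = 2, N = 1;      /* 64-bit EON.  */
  unsigned dispatch = (size << 3) | (op << 1) | N;
  printf ("dispatch = %u\n", dispatch);  /* 13, i.e. the eon64_shift case.  */
  return 0;
}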
@@ -11108,9 +12275,9 @@ dexLogicalShiftedRegister (sim_cpu *cpu)
 static void
 csel32 (sim_cpu *cpu, CondCode cc)
 {
-  unsigned rm = uimm (aarch64_get_instr (cpu), 20, 16);
-  unsigned rn = uimm (aarch64_get_instr (cpu), 9, 5);
-  unsigned rd = uimm (aarch64_get_instr (cpu), 4, 0);
+  unsigned rm = INSTR (20, 16);
+  unsigned rn = INSTR (9, 5);
+  unsigned rd = INSTR (4, 0);
 
   aarch64_set_reg_u64 (cpu, rd, NO_SP,
                       testConditionCode (cpu, cc)
@@ -11122,9 +12289,9 @@ csel32 (sim_cpu *cpu, CondCode cc)
 static void
 csel64 (sim_cpu *cpu, CondCode cc)
 {
-  unsigned rm = uimm (aarch64_get_instr (cpu), 20, 16);
-  unsigned rn = uimm (aarch64_get_instr (cpu), 9, 5);
-  unsigned rd = uimm (aarch64_get_instr (cpu), 4, 0);
+  unsigned rm = INSTR (20, 16);
+  unsigned rn = INSTR (9, 5);
+  unsigned rd = INSTR (4, 0);
 
   aarch64_set_reg_u64 (cpu, rd, NO_SP,
                       testConditionCode (cpu, cc)
@@ -11136,9 +12303,9 @@ csel64 (sim_cpu *cpu, CondCode cc)
 static void
 csinc32 (sim_cpu *cpu, CondCode cc)
 {
-  unsigned rm = uimm (aarch64_get_instr (cpu), 20, 16);
-  unsigned rn = uimm (aarch64_get_instr (cpu), 9, 5);
-  unsigned rd = uimm (aarch64_get_instr (cpu), 4, 0);
+  unsigned rm = INSTR (20, 16);
+  unsigned rn = INSTR (9, 5);
+  unsigned rd = INSTR (4, 0);
 
   aarch64_set_reg_u64 (cpu, rd, NO_SP,
                       testConditionCode (cpu, cc)
@@ -11150,9 +12317,9 @@ csinc32 (sim_cpu *cpu, CondCode cc)
 static void
 csinc64 (sim_cpu *cpu, CondCode cc)
 {
-  unsigned rm = uimm (aarch64_get_instr (cpu), 20, 16);
-  unsigned rn = uimm (aarch64_get_instr (cpu), 9, 5);
-  unsigned rd = uimm (aarch64_get_instr (cpu), 4, 0);
+  unsigned rm = INSTR (20, 16);
+  unsigned rn = INSTR (9, 5);
+  unsigned rd = INSTR (4, 0);
 
   aarch64_set_reg_u64 (cpu, rd, NO_SP,
                       testConditionCode (cpu, cc)
@@ -11164,9 +12331,9 @@ csinc64 (sim_cpu *cpu, CondCode cc)
 static void
 csinv32 (sim_cpu *cpu, CondCode cc)
 {
-  unsigned rm = uimm (aarch64_get_instr (cpu), 20, 16);
-  unsigned rn = uimm (aarch64_get_instr (cpu), 9, 5);
-  unsigned rd = uimm (aarch64_get_instr (cpu), 4, 0);
+  unsigned rm = INSTR (20, 16);
+  unsigned rn = INSTR (9, 5);
+  unsigned rd = INSTR (4, 0);
 
   aarch64_set_reg_u64 (cpu, rd, NO_SP,
                       testConditionCode (cpu, cc)
@@ -11178,9 +12345,9 @@ csinv32 (sim_cpu *cpu, CondCode cc)
 static void
 csinv64 (sim_cpu *cpu, CondCode cc)
 {
-  unsigned rm = uimm (aarch64_get_instr (cpu), 20, 16);
-  unsigned rn = uimm (aarch64_get_instr (cpu), 9, 5);
-  unsigned rd = uimm (aarch64_get_instr (cpu), 4, 0);
+  unsigned rm = INSTR (20, 16);
+  unsigned rn = INSTR (9, 5);
+  unsigned rd = INSTR (4, 0);
 
   aarch64_set_reg_u64 (cpu, rd, NO_SP,
                       testConditionCode (cpu, cc)
@@ -11192,9 +12359,9 @@ csinv64 (sim_cpu *cpu, CondCode cc)
 static void
 csneg32 (sim_cpu *cpu, CondCode cc)
 {
-  unsigned rm = uimm (aarch64_get_instr (cpu), 20, 16);
-  unsigned rn = uimm (aarch64_get_instr (cpu), 9, 5);
-  unsigned rd = uimm (aarch64_get_instr (cpu), 4, 0);
+  unsigned rm = INSTR (20, 16);
+  unsigned rn = INSTR (9, 5);
+  unsigned rd = INSTR (4, 0);
 
   aarch64_set_reg_u64 (cpu, rd, NO_SP,
                       testConditionCode (cpu, cc)
@@ -11206,9 +12373,9 @@ csneg32 (sim_cpu *cpu, CondCode cc)
 static void
 csneg64 (sim_cpu *cpu, CondCode cc)
 {
-  unsigned rm = uimm (aarch64_get_instr (cpu), 20, 16);
-  unsigned rn = uimm (aarch64_get_instr (cpu), 9, 5);
-  unsigned rd = uimm (aarch64_get_instr (cpu), 4, 0);
+  unsigned rm = INSTR (20, 16);
+  unsigned rn = INSTR (9, 5);
+  unsigned rd = INSTR (4, 0);
 
   aarch64_set_reg_u64 (cpu, rd, NO_SP,
                       testConditionCode (cpu, cc)
@@ -11219,8 +12386,8 @@ csneg64 (sim_cpu *cpu, CondCode cc)
 static void
 dexCondSelect (sim_cpu *cpu)
 {
-  /* assert instr[28,21] = 11011011
-     instr[31] = size : 0 ==> 32 bit, 1 ==> 64 bit
+  /* instr[28,21] = 11011011
+     instr[31]    = size : 0 ==> 32 bit, 1 ==> 64 bit
      instr[30:11,10] = op : 000 ==> CSEL, 001 ==> CSINC,
                            100 ==> CSINV, 101 ==> CSNEG,
                            _1_ ==> UNALLOC
@@ -11228,10 +12395,9 @@ dexCondSelect (sim_cpu *cpu)
      instr[15,12] = cond
      instr[29] = S : 0 ==> ok, 1 ==> UNALLOC  */
 
-  CondCode cc;
-  uint32_t dispatch;
-  uint32_t S = uimm (aarch64_get_instr (cpu), 29, 29);
-  uint32_t op2 = uimm (aarch64_get_instr (cpu), 11, 10);
+  CondCode cc = INSTR (15, 12);
+  uint32_t S = INSTR (29, 29);
+  uint32_t op2 = INSTR (11, 10);
 
   if (S == 1)
     HALT_UNALLOC;
@@ -11239,10 +12405,7 @@ dexCondSelect (sim_cpu *cpu)
   if (op2 & 0x2)
     HALT_UNALLOC;
 
-  cc = condcode (aarch64_get_instr (cpu), 12);
-  dispatch = ((uimm (aarch64_get_instr (cpu), 31, 30) << 1) | op2);
-
-  switch (dispatch)
+  switch ((INSTR (31, 30) << 1) | op2)
     {
     case 0: csel32  (cpu, cc); return;
     case 1: csinc32 (cpu, cc); return;
@@ -11252,7 +12415,6 @@ dexCondSelect (sim_cpu *cpu)
     case 5: csinc64 (cpu, cc); return;
     case 6: csinv64 (cpu, cc); return;
    case 7: csneg64 (cpu, cc); return;
-    default: HALT_UNALLOC;
     }
 }
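/* Aside (editorial sketch, not part of the patch): the four
   conditional-select flavours differ only in what they do with the second
   source when the condition fails: CSEL passes it through, CSINC adds one,
   CSINV inverts, CSNEG negates.  Standalone rendering:  */

#include <stdint.h>
#include <stdio.h>

static uint64_t csel  (int c, uint64_t n, uint64_t m) { return c ? n : m; }
static uint64_t csinc (int c, uint64_t n, uint64_t m) { return c ? n : m + 1; }
static uint64_t csinv (int c, uint64_t n, uint64_t m) { return c ? n : ~m; }
static uint64_t csneg (int c, uint64_t n, uint64_t m) { return c ? n : -m; }

int
main (void)
{
  /* CSET is CSINC with both sources the zero register and cond inverted.  */
  printf ("%llu %llu\n",
          (unsigned long long) csinc (0, 0, 0),   /* 1 */
          (unsigned long long) csneg (0, 7, 5));  /* 2^64 - 5 */
  return 0;
}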
@@ -11353,8 +12515,8 @@ leading64 (uint64_t value)
 static void
 cls32 (sim_cpu *cpu)
 {
-  unsigned rn = uimm (aarch64_get_instr (cpu), 9, 5);
-  unsigned rd = uimm (aarch64_get_instr (cpu), 4, 0);
+  unsigned rn = INSTR (9, 5);
+  unsigned rd = INSTR (4, 0);
 
   /* N.B. the result needs to exclude the leading bit.  */
   aarch64_set_reg_u64
@@ -11365,8 +12527,8 @@ cls32 (sim_cpu *cpu)
 static void
 cls64 (sim_cpu *cpu)
 {
-  unsigned rn = uimm (aarch64_get_instr (cpu), 9, 5);
-  unsigned rd = uimm (aarch64_get_instr (cpu), 4, 0);
+  unsigned rn = INSTR (9, 5);
+  unsigned rd = INSTR (4, 0);
 
   /* N.B. the result needs to exclude the leading bit.  */
   aarch64_set_reg_u64
@@ -11377,8 +12539,8 @@ cls64 (sim_cpu *cpu)
 static void
 clz32 (sim_cpu *cpu)
 {
-  unsigned rn = uimm (aarch64_get_instr (cpu), 9, 5);
-  unsigned rd = uimm (aarch64_get_instr (cpu), 4, 0);
+  unsigned rn = INSTR (9, 5);
+  unsigned rd = INSTR (4, 0);
   uint32_t value = aarch64_get_reg_u32 (cpu, rn, NO_SP);
 
   /* if the sign (top) bit is set then the count is 0.  */
@@ -11392,8 +12554,8 @@ clz32 (sim_cpu *cpu)
 static void
 clz64 (sim_cpu *cpu)
 {
-  unsigned rn = uimm (aarch64_get_instr (cpu), 9, 5);
-  unsigned rd = uimm (aarch64_get_instr (cpu), 4, 0);
+  unsigned rn = INSTR (9, 5);
+  unsigned rd = INSTR (4, 0);
   uint64_t value = aarch64_get_reg_u64 (cpu, rn, NO_SP);
 
   /* if the sign (top) bit is set then the count is 0.  */
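/* Aside (editorial sketch, not part of the patch): CLZ of a value whose top
   bit is set is 0, which is the short-circuit noted in the comments above;
   otherwise the count is how far the highest set bit sits below bit 31.  A
   portable loop version for comparison:  */

#include <stdint.h>
#include <stdio.h>

static int
clz32_loop (uint32_t v)
{
  int n = 0;
  if (v == 0)
    return 32;
  while (!(v & 0x80000000u))
    {
      v <<= 1;
      n++;
    }
  return n;
}

int
main (void)
{
  printf ("%d %d\n", clz32_loop (1), clz32_loop (0x80000000u)); /* 31 0 */
  return 0;
}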
@@ -11407,8 +12569,8 @@ clz64 (sim_cpu *cpu)
 static void
 rbit32 (sim_cpu *cpu)
 {
-  unsigned rn = uimm (aarch64_get_instr (cpu), 9, 5);
-  unsigned rd = uimm (aarch64_get_instr (cpu), 4, 0);
+  unsigned rn = INSTR (9, 5);
+  unsigned rd = INSTR (4, 0);
   uint32_t value = aarch64_get_reg_u32 (cpu, rn, NO_SP);
   uint32_t result = 0;
   int i;
@@ -11426,8 +12588,8 @@ rbit32 (sim_cpu *cpu)
 static void
 rbit64 (sim_cpu *cpu)
 {
-  unsigned rn = uimm (aarch64_get_instr (cpu), 9, 5);
-  unsigned rd = uimm (aarch64_get_instr (cpu), 4, 0);
+  unsigned rn = INSTR (9, 5);
+  unsigned rd = INSTR (4, 0);
   uint64_t value = aarch64_get_reg_u64 (cpu, rn, NO_SP);
   uint64_t result = 0;
   int i;
@@ -11435,7 +12597,7 @@ rbit64 (sim_cpu *cpu)
   for (i = 0; i < 64; i++)
     {
       result <<= 1;
-      result |= (value & 1L);
+      result |= (value & 1UL);
       value >>= 1;
     }
   aarch64_set_reg_u64 (cpu, rd, NO_SP, result);
@@ -11445,8 +12607,8 @@ rbit64 (sim_cpu *cpu)
 static void
 rev32 (sim_cpu *cpu)
 {
-  unsigned rn = uimm (aarch64_get_instr (cpu), 9, 5);
-  unsigned rd = uimm (aarch64_get_instr (cpu), 4, 0);
+  unsigned rn = INSTR (9, 5);
+  unsigned rd = INSTR (4, 0);
   uint32_t value = aarch64_get_reg_u32 (cpu, rn, NO_SP);
   uint32_t result = 0;
   int i;
@@ -11464,8 +12626,8 @@ rev32 (sim_cpu *cpu)
 static void
 rev64 (sim_cpu *cpu)
 {
-  unsigned rn = uimm (aarch64_get_instr (cpu), 9, 5);
-  unsigned rd = uimm (aarch64_get_instr (cpu), 4, 0);
+  unsigned rn = INSTR (9, 5);
+  unsigned rd = INSTR (4, 0);
   uint64_t value = aarch64_get_reg_u64 (cpu, rn, NO_SP);
   uint64_t result = 0;
   int i;
@@ -11484,8 +12646,8 @@ rev64 (sim_cpu *cpu)
 static void
 revh32 (sim_cpu *cpu)
 {
-  unsigned rn = uimm (aarch64_get_instr (cpu), 9, 5);
-  unsigned rd = uimm (aarch64_get_instr (cpu), 4, 0);
+  unsigned rn = INSTR (9, 5);
+  unsigned rd = INSTR (4, 0);
   uint32_t value = aarch64_get_reg_u32 (cpu, rn, NO_SP);
   uint32_t result = 0;
   int i;
@@ -11504,8 +12666,8 @@ revh32 (sim_cpu *cpu)
 static void
 revh64 (sim_cpu *cpu)
 {
-  unsigned rn = uimm (aarch64_get_instr (cpu), 9, 5);
-  unsigned rd = uimm (aarch64_get_instr (cpu), 4, 0);
+  unsigned rn = INSTR (9, 5);
+  unsigned rd = INSTR (4, 0);
   uint64_t value = aarch64_get_reg_u64 (cpu, rn, NO_SP);
   uint64_t result = 0;
   int i;
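/* Aside (editorial sketch, not part of the patch): the bit-reverse loop
   consumes the source LSB-first while shifting the result up, so bit i of
   the input becomes bit (width-1-i) of the output.  Standalone version with
   a round-trip self-check:  */

#include <stdint.h>
#include <stdio.h>

static uint64_t
rbit64_loop (uint64_t value)
{
  uint64_t result = 0;
  int i;
  for (i = 0; i < 64; i++)
    {
      result <<= 1;
      result |= value & 1;
      value >>= 1;
    }
  return result;
}

int
main (void)
{
  /* Reversing twice must round-trip.  */
  uint64_t x = 0x0123456789abcdefULL;
  printf ("%d\n", rbit64_loop (rbit64_loop (x)) == x); /* prints 1 */
  return 0;
}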
@@ -11522,22 +12684,22 @@ revh64 (sim_cpu *cpu)
 static void
 dexDataProc1Source (sim_cpu *cpu)
 {
-  /* assert instr[30] == 1
-     aarch64_get_instr (cpu)[28,21] == 111010110
-     instr[31] = size : 0 ==> 32 bit, 1 ==> 64 bit
-     instr[29] = S : 0 ==> ok, 1 ==> UNALLOC
+  /* instr[30]    = 1
+     instr[28,21] = 111010110
+     instr[31]    = size : 0 ==> 32 bit, 1 ==> 64 bit
+     instr[29]    = S : 0 ==> ok, 1 ==> UNALLOC
      instr[20,16] = opcode2 : 00000 ==> ok, ow ==> UNALLOC
      instr[15,10] = opcode : 000000 ==> RBIT, 000001 ==> REV16,
                              000010 ==> REV, 000011 ==> UNALLOC
                              000100 ==> CLZ, 000101 ==> CLS
                              ow ==> UNALLOC
-     instr[9,5] = rn : may not be SP
-     instr[4,0] = rd : may not be SP.  */
+     instr[9,5]   = rn : may not be SP
+     instr[4,0]   = rd : may not be SP.  */
 
-  uint32_t S = uimm (aarch64_get_instr (cpu), 29, 29);
-  uint32_t opcode2 = uimm (aarch64_get_instr (cpu), 20, 16);
-  uint32_t opcode = uimm (aarch64_get_instr (cpu), 15, 10);
-  uint32_t dispatch = ((uimm (aarch64_get_instr (cpu), 31, 31) << 3) | opcode);
+  uint32_t S = INSTR (29, 29);
+  uint32_t opcode2 = INSTR (20, 16);
+  uint32_t opcode = INSTR (15, 10);
+  uint32_t dispatch = ((INSTR (31, 31) << 3) | opcode);
 
   if (S == 1)
     HALT_UNALLOC;
@@ -11577,9 +12739,9 @@ dexDataProc1Source (sim_cpu *cpu)
 static void
 asrv32 (sim_cpu *cpu)
 {
-  unsigned rm = uimm (aarch64_get_instr (cpu), 20, 16);
-  unsigned rn = uimm (aarch64_get_instr (cpu), 9, 5);
-  unsigned rd = uimm (aarch64_get_instr (cpu), 4, 0);
+  unsigned rm = INSTR (20, 16);
+  unsigned rn = INSTR (9, 5);
+  unsigned rd = INSTR (4, 0);
 
   aarch64_set_reg_u64
     (cpu, rd, NO_SP,
@@ -11591,9 +12753,9 @@ asrv32 (sim_cpu *cpu)
 static void
 asrv64 (sim_cpu *cpu)
 {
-  unsigned rm = uimm (aarch64_get_instr (cpu), 20, 16);
-  unsigned rn = uimm (aarch64_get_instr (cpu), 9, 5);
-  unsigned rd = uimm (aarch64_get_instr (cpu), 4, 0);
+  unsigned rm = INSTR (20, 16);
+  unsigned rn = INSTR (9, 5);
+  unsigned rd = INSTR (4, 0);
 
   aarch64_set_reg_u64
     (cpu, rd, NO_SP,
@@ -11605,9 +12767,9 @@ asrv64 (sim_cpu *cpu)
 static void
 lslv32 (sim_cpu *cpu)
 {
-  unsigned rm = uimm (aarch64_get_instr (cpu), 20, 16);
-  unsigned rn = uimm (aarch64_get_instr (cpu), 9, 5);
-  unsigned rd = uimm (aarch64_get_instr (cpu), 4, 0);
+  unsigned rm = INSTR (20, 16);
+  unsigned rn = INSTR (9, 5);
+  unsigned rd = INSTR (4, 0);
 
   aarch64_set_reg_u64
     (cpu, rd, NO_SP,
@@ -11619,9 +12781,9 @@ lslv32 (sim_cpu *cpu)
 static void
 lslv64 (sim_cpu *cpu)
 {
-  unsigned rm = uimm (aarch64_get_instr (cpu), 20, 16);
-  unsigned rn = uimm (aarch64_get_instr (cpu), 9, 5);
-  unsigned rd = uimm (aarch64_get_instr (cpu), 4, 0);
+  unsigned rm = INSTR (20, 16);
+  unsigned rn = INSTR (9, 5);
+  unsigned rd = INSTR (4, 0);
 
   aarch64_set_reg_u64
    (cpu, rd, NO_SP,
@@ -11633,9 +12795,9 @@ lslv64 (sim_cpu *cpu)
 static void
 lsrv32 (sim_cpu *cpu)
 {
-  unsigned rm = uimm (aarch64_get_instr (cpu), 20, 16);
-  unsigned rn = uimm (aarch64_get_instr (cpu), 9, 5);
-  unsigned rd = uimm (aarch64_get_instr (cpu), 4, 0);
+  unsigned rm = INSTR (20, 16);
+  unsigned rn = INSTR (9, 5);
+  unsigned rd = INSTR (4, 0);
 
   aarch64_set_reg_u64
     (cpu, rd, NO_SP,
@@ -11647,9 +12809,9 @@ lsrv32 (sim_cpu *cpu)
 static void
 lsrv64 (sim_cpu *cpu)
 {
-  unsigned rm = uimm (aarch64_get_instr (cpu), 20, 16);
-  unsigned rn = uimm (aarch64_get_instr (cpu), 9, 5);
-  unsigned rd = uimm (aarch64_get_instr (cpu), 4, 0);
+  unsigned rm = INSTR (20, 16);
+  unsigned rn = INSTR (9, 5);
+  unsigned rd = INSTR (4, 0);
 
   aarch64_set_reg_u64
     (cpu, rd, NO_SP,
@@ -11661,9 +12823,9 @@ lsrv64 (sim_cpu *cpu)
 static void
 rorv32 (sim_cpu *cpu)
 {
-  unsigned rm = uimm (aarch64_get_instr (cpu), 20, 16);
-  unsigned rn = uimm (aarch64_get_instr (cpu), 9, 5);
-  unsigned rd = uimm (aarch64_get_instr (cpu), 4, 0);
+  unsigned rm = INSTR (20, 16);
+  unsigned rn = INSTR (9, 5);
+  unsigned rd = INSTR (4, 0);
 
   aarch64_set_reg_u64
     (cpu, rd, NO_SP,
@@ -11675,9 +12837,9 @@ rorv32 (sim_cpu *cpu)
 static void
 rorv64 (sim_cpu *cpu)
 {
-  unsigned rm = uimm (aarch64_get_instr (cpu), 20, 16);
-  unsigned rn = uimm (aarch64_get_instr (cpu), 9, 5);
-  unsigned rd = uimm (aarch64_get_instr (cpu), 4, 0);
+  unsigned rm = INSTR (20, 16);
+  unsigned rn = INSTR (9, 5);
+  unsigned rd = INSTR (4, 0);
 
   aarch64_set_reg_u64
     (cpu, rd, NO_SP,
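/* Aside (editorial sketch, not part of the patch): the variable-shift group
   takes the shift amount from a register modulo the operand width; ROR can
   then be built from two shifts.  A 32-bit rotate-right with the same
   masking:  */

#include <stdint.h>
#include <stdio.h>

static uint32_t
ror32 (uint32_t v, uint32_t amount)
{
  amount &= 31;                 /* shift count is taken mod 32 */
  if (amount == 0)
    return v;
  return (v >> amount) | (v << (32 - amount));
}

int
main (void)
{
  printf ("%08x\n", ror32 (0x000000f0u, 8)); /* f0000000 */
  return 0;
}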
@@ -11692,9 +12854,9 @@ rorv64 (sim_cpu *cpu)
 static void
 cpuiv32 (sim_cpu *cpu)
 {
-  unsigned rm = uimm (aarch64_get_instr (cpu), 20, 16);
-  unsigned rn = uimm (aarch64_get_instr (cpu), 9, 5);
-  unsigned rd = uimm (aarch64_get_instr (cpu), 4, 0);
+  unsigned rm = INSTR (20, 16);
+  unsigned rn = INSTR (9, 5);
+  unsigned rd = INSTR (4, 0);
 
   /* N.B. the pseudo-code does the divide using 64 bit data.  */
   /* TODO : check that this rounds towards zero as required.  */
   int64_t dividend = aarch64_get_reg_s32 (cpu, rn, NO_SP);
@@ -11708,9 +12870,9 @@ cpuiv32 (sim_cpu *cpu)
 static void
 cpuiv64 (sim_cpu *cpu)
 {
-  unsigned rm = uimm (aarch64_get_instr (cpu), 20, 16);
-  unsigned rn = uimm (aarch64_get_instr (cpu), 9, 5);
-  unsigned rd = uimm (aarch64_get_instr (cpu), 4, 0);
+  unsigned rm = INSTR (20, 16);
+  unsigned rn = INSTR (9, 5);
+  unsigned rd = INSTR (4, 0);
 
   /* TODO : check that this rounds towards zero as required.  */
   int64_t divisor = aarch64_get_reg_s64 (cpu, rm, NO_SP);
@@ -11724,9 +12886,9 @@ cpuiv64 (sim_cpu *cpu)
 static void
 udiv32 (sim_cpu *cpu)
 {
-  unsigned rm = uimm (aarch64_get_instr (cpu), 20, 16);
-  unsigned rn = uimm (aarch64_get_instr (cpu), 9, 5);
-  unsigned rd = uimm (aarch64_get_instr (cpu), 4, 0);
+  unsigned rm = INSTR (20, 16);
+  unsigned rn = INSTR (9, 5);
+  unsigned rd = INSTR (4, 0);
 
   /* N.B. the pseudo-code does the divide using 64 bit data.  */
   uint64_t dividend = aarch64_get_reg_u32 (cpu, rn, NO_SP);
@@ -11740,9 +12902,9 @@ udiv32 (sim_cpu *cpu)
 static void
 udiv64 (sim_cpu *cpu)
 {
-  unsigned rm = uimm (aarch64_get_instr (cpu), 20, 16);
-  unsigned rn = uimm (aarch64_get_instr (cpu), 9, 5);
-  unsigned rd = uimm (aarch64_get_instr (cpu), 4, 0);
+  unsigned rm = INSTR (20, 16);
+  unsigned rn = INSTR (9, 5);
+  unsigned rd = INSTR (4, 0);
 
   /* TODO : check that this rounds towards zero as required.  */
   uint64_t divisor = aarch64_get_reg_u64 (cpu, rm, NO_SP);
@@ -11765,8 +12927,8 @@ dexDataProc2Source (sim_cpu *cpu)
                              ow ==> UNALLOC.  */
 
   uint32_t dispatch;
-  uint32_t S = uimm (aarch64_get_instr (cpu), 29, 29);
-  uint32_t opcode = uimm (aarch64_get_instr (cpu), 15, 10);
+  uint32_t S = INSTR (29, 29);
+  uint32_t opcode = INSTR (15, 10);
 
   if (S == 1)
     HALT_UNALLOC;
@@ -11774,7 +12936,7 @@ dexDataProc2Source (sim_cpu *cpu)
   if (opcode & 0x34)
     HALT_UNALLOC;
 
-  dispatch = (  (uimm (aarch64_get_instr (cpu), 31, 31) << 3)
+  dispatch = (  (INSTR (31, 31) << 3)
               | (uimm (opcode, 3, 3) << 2)
               |  uimm (opcode, 1, 0));
   switch (dispatch)
@@ -11802,11 +12964,12 @@ dexDataProc2Source (sim_cpu *cpu)
 static void
 madd32 (sim_cpu *cpu)
 {
-  unsigned rm = uimm (aarch64_get_instr (cpu), 20, 16);
-  unsigned ra = uimm (aarch64_get_instr (cpu), 14, 10);
-  unsigned rn = uimm (aarch64_get_instr (cpu), 9, 5);
-  unsigned rd = uimm (aarch64_get_instr (cpu), 4, 0);
+  unsigned rm = INSTR (20, 16);
+  unsigned ra = INSTR (14, 10);
+  unsigned rn = INSTR (9, 5);
+  unsigned rd = INSTR (4, 0);
 
+  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   aarch64_set_reg_u64 (cpu, rd, NO_SP,
                       aarch64_get_reg_u32 (cpu, ra, NO_SP)
                       + aarch64_get_reg_u32 (cpu, rn, NO_SP)
@@ -11817,26 +12980,28 @@ madd32 (sim_cpu *cpu)
 static void
 madd64 (sim_cpu *cpu)
 {
-  unsigned rm = uimm (aarch64_get_instr (cpu), 20, 16);
-  unsigned ra = uimm (aarch64_get_instr (cpu), 14, 10);
-  unsigned rn = uimm (aarch64_get_instr (cpu), 9, 5);
-  unsigned rd = uimm (aarch64_get_instr (cpu), 4, 0);
+  unsigned rm = INSTR (20, 16);
+  unsigned ra = INSTR (14, 10);
+  unsigned rn = INSTR (9, 5);
+  unsigned rd = INSTR (4, 0);
 
+  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   aarch64_set_reg_u64 (cpu, rd, NO_SP,
                       aarch64_get_reg_u64 (cpu, ra, NO_SP)
-                      + aarch64_get_reg_u64 (cpu, rn, NO_SP)
-                      * aarch64_get_reg_u64 (cpu, rm, NO_SP));
+                      + (aarch64_get_reg_u64 (cpu, rn, NO_SP)
+                         * aarch64_get_reg_u64 (cpu, rm, NO_SP)));
 }
 
 /* 32 bit multiply and sub.  */
 static void
 msub32 (sim_cpu *cpu)
 {
-  unsigned rm = uimm (aarch64_get_instr (cpu), 20, 16);
-  unsigned ra = uimm (aarch64_get_instr (cpu), 14, 10);
-  unsigned rn = uimm (aarch64_get_instr (cpu), 9, 5);
-  unsigned rd = uimm (aarch64_get_instr (cpu), 4, 0);
+  unsigned rm = INSTR (20, 16);
+  unsigned ra = INSTR (14, 10);
+  unsigned rn = INSTR (9, 5);
+  unsigned rd = INSTR (4, 0);
 
+  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   aarch64_set_reg_u64 (cpu, rd, NO_SP,
                       aarch64_get_reg_u32 (cpu, ra, NO_SP)
                       - aarch64_get_reg_u32 (cpu, rn, NO_SP)
@@ -11847,11 +13012,12 @@ msub32 (sim_cpu *cpu)
 static void
 msub64 (sim_cpu *cpu)
 {
-  unsigned rm = uimm (aarch64_get_instr (cpu), 20, 16);
-  unsigned ra = uimm (aarch64_get_instr (cpu), 14, 10);
-  unsigned rn = uimm (aarch64_get_instr (cpu), 9, 5);
-  unsigned rd = uimm (aarch64_get_instr (cpu), 4, 0);
+  unsigned rm = INSTR (20, 16);
+  unsigned ra = INSTR (14, 10);
+  unsigned rn = INSTR (9, 5);
+  unsigned rd = INSTR (4, 0);
 
+  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   aarch64_set_reg_u64 (cpu, rd, NO_SP,
                       aarch64_get_reg_u64 (cpu, ra, NO_SP)
                       - aarch64_get_reg_u64 (cpu, rn, NO_SP)
@@ -11862,10 +13028,10 @@ msub64 (sim_cpu *cpu)
 static void
 smaddl (sim_cpu *cpu)
 {
-  unsigned rm = uimm (aarch64_get_instr (cpu), 20, 16);
-  unsigned ra = uimm (aarch64_get_instr (cpu), 14, 10);
-  unsigned rn = uimm (aarch64_get_instr (cpu), 9, 5);
-  unsigned rd = uimm (aarch64_get_instr (cpu), 4, 0);
+  unsigned rm = INSTR (20, 16);
+  unsigned ra = INSTR (14, 10);
+  unsigned rn = INSTR (9, 5);
+  unsigned rd = INSTR (4, 0);
 
   /* N.B. we need to multiply the signed 32 bit values in rn, rm
      to obtain a 64 bit product.  */
@@ -11880,10 +13046,10 @@ smaddl (sim_cpu *cpu)
 static void
 smsubl (sim_cpu *cpu)
 {
-  unsigned rm = uimm (aarch64_get_instr (cpu), 20, 16);
-  unsigned ra = uimm (aarch64_get_instr (cpu), 14, 10);
-  unsigned rn = uimm (aarch64_get_instr (cpu), 9, 5);
-  unsigned rd = uimm (aarch64_get_instr (cpu), 4, 0);
+  unsigned rm = INSTR (20, 16);
+  unsigned ra = INSTR (14, 10);
+  unsigned rn = INSTR (9, 5);
+  unsigned rd = INSTR (4, 0);
 
   /* N.B. we need to multiply the signed 32 bit values in rn, rm
      to obtain a 64 bit product.  */
@@ -11936,7 +13102,6 @@ mul64hi (uint64_t value1, uint64_t value2)
   uint64_t value2_hi = highWordToU64 (value2);
 
   /* Cross-multiply and collect results.  */
-  uint64_t xproductlo = value1_lo * value2_lo;
   uint64_t xproductmid1 = value1_lo * value2_hi;
   uint64_t xproductmid2 = value1_hi * value2_lo;
@@ -11962,6 +13127,8 @@ mul64hi (uint64_t value1, uint64_t value2)
   /* Drop lowest 32 bits of middle cross-product.  */
   result = resultmid1 >> 32;
 
+  /* Move carry bit to just above middle cross-product highest bit.  */
+  carry = carry << 32;
 
   /* Add top cross-product plus and any carry.  */
   result += xproducthi + carry;
@@ -11975,16 +13142,16 @@ static void
 smulh (sim_cpu *cpu)
 {
   uint64_t uresult;
-  int64_t  result;
-  unsigned rm = uimm (aarch64_get_instr (cpu), 20, 16);
-  unsigned rn = uimm (aarch64_get_instr (cpu), 9, 5);
-  unsigned rd = uimm (aarch64_get_instr (cpu), 4, 0);
-  GReg     ra = greg (aarch64_get_instr (cpu), 10);
-  int64_t  value1 = aarch64_get_reg_u64 (cpu, rn, NO_SP);
-  int64_t  value2 = aarch64_get_reg_u64 (cpu, rm, NO_SP);
+  int64_t  result;
+  unsigned rm = INSTR (20, 16);
+  unsigned rn = INSTR (9, 5);
+  unsigned rd = INSTR (4, 0);
+  GReg     ra = INSTR (14, 10);
+  int64_t  value1 = aarch64_get_reg_u64 (cpu, rn, NO_SP);
+  int64_t  value2 = aarch64_get_reg_u64 (cpu, rm, NO_SP);
   uint64_t uvalue1;
   uint64_t uvalue2;
-  int64_t  signum = 1;
+  int      negate = 0;
 
   if (ra != R31)
     HALT_UNALLOC;
@@ -11993,7 +13160,7 @@ smulh (sim_cpu *cpu)
      the fix the sign up afterwards.  */
   if (value1 < 0)
     {
-      signum *= -1L;
+      negate = !negate;
      uvalue1 = -value1;
     }
   else
@@ -12003,7 +13170,7 @@ smulh (sim_cpu *cpu)
 
   if (value2 < 0)
     {
-      signum *= -1L;
+      negate = !negate;
       uvalue2 = -value2;
     }
   else
@@ -12011,9 +13178,19 @@ smulh (sim_cpu *cpu)
       uvalue2 = value2;
     }
 
+  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
+
   uresult = mul64hi (uvalue1, uvalue2);
   result = uresult;
-  result *= signum;
+
+  if (negate)
+    {
+      /* Multiply 128-bit result by -1, which means highpart gets inverted,
+        and has carry in added only if low part is 0.  */
+      result = ~result;
+      if ((uvalue1 * uvalue2) == 0)
+       result += 1;
+    }
 
   aarch64_set_reg_s64 (cpu, rd, NO_SP, result);
 }
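/* Aside (editorial sketch, not part of the patch): the `carry = carry << 32'
   line matters because each overflow of the middle cross-product sum
   represents 2^96 in the full product, i.e. 2^32 in high-word units, not 1.
   The schoolbook high-word multiply with the carry placed correctly,
   checked against native 128-bit arithmetic where available:  */

#include <stdint.h>
#include <stdio.h>

static uint64_t
mulhi_u64 (uint64_t a, uint64_t b)
{
  uint64_t a_lo = a & 0xffffffffULL, a_hi = a >> 32;
  uint64_t b_lo = b & 0xffffffffULL, b_hi = b >> 32;
  uint64_t lo   = a_lo * b_lo;
  uint64_t mid1 = a_lo * b_hi;
  uint64_t mid2 = a_hi * b_lo;
  uint64_t hi   = a_hi * b_hi;
  uint64_t carry = 0;
  uint64_t mid = (lo >> 32) + mid1;

  if (mid < mid1)      /* overflow of the first middle addition */
    carry++;
  mid += mid2;
  if (mid < mid2)      /* overflow of the second middle addition */
    carry++;

  return hi + (carry << 32) + (mid >> 32);
}

int
main (void)
{
  uint64_t a = 0xffffffffffffffffULL, b = 0xfffffffffffffffeULL;
#ifdef __SIZEOF_INT128__
  uint64_t want = (uint64_t) (((__uint128_t) a * b) >> 64);
  printf ("%d\n", mulhi_u64 (a, b) == want);  /* prints 1 */
#else
  printf ("%016llx\n", (unsigned long long) mulhi_u64 (a, b));
#endif
  return 0;
}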
@@ -12023,11 +13200,12 @@ smulh (sim_cpu *cpu)
 static void
 umaddl (sim_cpu *cpu)
 {
-  unsigned rm = uimm (aarch64_get_instr (cpu), 20, 16);
-  unsigned ra = uimm (aarch64_get_instr (cpu), 14, 10);
-  unsigned rn = uimm (aarch64_get_instr (cpu), 9, 5);
-  unsigned rd = uimm (aarch64_get_instr (cpu), 4, 0);
+  unsigned rm = INSTR (20, 16);
+  unsigned ra = INSTR (14, 10);
+  unsigned rn = INSTR (9, 5);
+  unsigned rd = INSTR (4, 0);
 
+  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   /* N.B. we need to multiply the signed 32 bit values in rn, rm
      to obtain a 64 bit product.  */
   aarch64_set_reg_u64
@@ -12041,11 +13219,12 @@ umaddl (sim_cpu *cpu)
 static void
 umsubl (sim_cpu *cpu)
 {
-  unsigned rm = uimm (aarch64_get_instr (cpu), 20, 16);
-  unsigned ra = uimm (aarch64_get_instr (cpu), 14, 10);
-  unsigned rn = uimm (aarch64_get_instr (cpu), 9, 5);
-  unsigned rd = uimm (aarch64_get_instr (cpu), 4, 0);
+  unsigned rm = INSTR (20, 16);
+  unsigned ra = INSTR (14, 10);
+  unsigned rn = INSTR (9, 5);
+  unsigned rd = INSTR (4, 0);
 
+  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   /* N.B. we need to multiply the signed 32 bit values in rn, rm
      to obtain a 64 bit product.  */
   aarch64_set_reg_u64
@@ -12060,14 +13239,15 @@ umsubl (sim_cpu *cpu)
 static void
 umulh (sim_cpu *cpu)
 {
-  unsigned rm = uimm (aarch64_get_instr (cpu), 20, 16);
-  unsigned rn = uimm (aarch64_get_instr (cpu), 9, 5);
-  unsigned rd = uimm (aarch64_get_instr (cpu), 4, 0);
-  GReg     ra = greg (aarch64_get_instr (cpu), 10);
+  unsigned rm = INSTR (20, 16);
+  unsigned rn = INSTR (9, 5);
+  unsigned rd = INSTR (4, 0);
+  GReg     ra = INSTR (14, 10);
 
   if (ra != R31)
     HALT_UNALLOC;
 
+  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   aarch64_set_reg_u64 (cpu, rd, NO_SP,
                       mul64hi (aarch64_get_reg_u64 (cpu, rn, NO_SP),
                                aarch64_get_reg_u64 (cpu, rm, NO_SP)));
@@ -12089,10 +13269,10 @@ dexDataProc3Source (sim_cpu *cpu)
                          ow ==> UNALLOC.  */
 
   uint32_t dispatch;
-  uint32_t size = uimm (aarch64_get_instr (cpu), 31, 31);
-  uint32_t op54 = uimm (aarch64_get_instr (cpu), 30, 29);
-  uint32_t op31 = uimm (aarch64_get_instr (cpu), 23, 21);
-  uint32_t o0 = uimm (aarch64_get_instr (cpu), 15, 15);
+  uint32_t size = INSTR (31, 31);
+  uint32_t op54 = INSTR (30, 29);
+  uint32_t op31 = INSTR (23, 21);
+  uint32_t o0 = INSTR (15, 15);
 
   if (op54 != 0)
     HALT_UNALLOC;
@@ -12221,6 +13401,7 @@
 static unsigned stack_depth = 0;
 
 static void
 bl (sim_cpu *cpu, int32_t offset)
 {
+  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   aarch64_save_LR (cpu);
   aarch64_set_next_PC_by_offset (cpu, offset);
 
@@ -12231,7 +13412,8 @@ bl (sim_cpu *cpu, int32_t offset)
                    " %*scall %" PRIx64 " [%s]"
                    " [args: %" PRIx64 " %" PRIx64 " %" PRIx64 "]",
                    stack_depth, " ", aarch64_get_next_PC (cpu),
-                   aarch64_get_func (aarch64_get_next_PC (cpu)),
+                   aarch64_get_func (CPU_STATE (cpu),
+                                     aarch64_get_next_PC (cpu)),
                    aarch64_get_reg_u64 (cpu, 0, NO_SP),
                    aarch64_get_reg_u64 (cpu, 1, NO_SP),
                    aarch64_get_reg_u64 (cpu, 2, NO_SP)
@@ -12246,7 +13428,8 @@
 static void
 br (sim_cpu *cpu)
 {
-  unsigned rn = uimm (aarch64_get_instr (cpu), 9, 5);
+  unsigned rn = INSTR (9, 5);
 
+  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   aarch64_set_next_PC (cpu, aarch64_get_reg_u64 (cpu, rn, NO_SP));
 }
@@ -12254,12 +13437,12 @@ br (sim_cpu *cpu)
 static void
 blr (sim_cpu *cpu)
 {
-  unsigned rn = uimm (aarch64_get_instr (cpu), 9, 5);
+  /* Ensure we read the destination before we write LR.  */
+  uint64_t target = aarch64_get_reg_u64 (cpu, INSTR (9, 5), NO_SP);
 
-  /* The pseudo code in the spec says we update LR before fetching.
-     the value from the rn.  */
+  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   aarch64_save_LR (cpu);
-  aarch64_set_next_PC (cpu, aarch64_get_reg_u64 (cpu, rn, NO_SP));
+  aarch64_set_next_PC (cpu, target);
 
   if (TRACE_BRANCH_P (cpu))
     {
@@ -12268,7 +13451,8 @@ blr (sim_cpu *cpu)
                    " %*scall %" PRIx64 " [%s]"
                    " [args: %" PRIx64 " %" PRIx64 " %" PRIx64 "]",
                    stack_depth, " ", aarch64_get_next_PC (cpu),
-                   aarch64_get_func (aarch64_get_next_PC (cpu)),
+                   aarch64_get_func (CPU_STATE (cpu),
+                                     aarch64_get_next_PC (cpu)),
                    aarch64_get_reg_u64 (cpu, 0, NO_SP),
                    aarch64_get_reg_u64 (cpu, 1, NO_SP),
                    aarch64_get_reg_u64 (cpu, 2, NO_SP)
@@ -12282,9 +13466,10 @@ blr (sim_cpu *cpu)
 static void
 ret (sim_cpu *cpu)
 {
-  unsigned rn = uimm (aarch64_get_instr (cpu), 9, 5);
+  unsigned rn = INSTR (9, 5);
   aarch64_set_next_PC (cpu, aarch64_get_reg_u64 (cpu, rn, NO_SP));
 
+  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   if (TRACE_BRANCH_P (cpu))
     {
       TRACE_BRANCH (cpu,
@@ -12300,6 +13485,7 @@ ret (sim_cpu *cpu)
 static void
 nop (sim_cpu *cpu)
 {
+  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
 }
 
 /* Data synchronization barrier.  */
@@ -12307,6 +13493,7 @@ nop (sim_cpu *cpu)
 static void
 dsb (sim_cpu *cpu)
 {
+  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
 }
 
 /* Data memory barrier.  */
@@ -12314,6 +13501,7 @@ dsb (sim_cpu *cpu)
 static void
 dmb (sim_cpu *cpu)
 {
+  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
 }
 
 /* Instruction synchronization barrier.  */
@@ -12321,6 +13509,7 @@ dmb (sim_cpu *cpu)
 static void
 isb (sim_cpu *cpu)
 {
+  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
 }
 
 static void
@@ -12330,7 +13519,7 @@ dexBranchImmediate (sim_cpu *cpu)
      instr[31] ==> 0 == B, 1 == BL
     instr[25,0] == imm26 branch offset counted in words.  */
 
-  uint32_t top = uimm (aarch64_get_instr (cpu), 31, 31);
+  uint32_t top = INSTR (31, 31);
   /* We have a 26 byte signed word offset which we need to pass to the
      execute routine as a signed byte offset.  */
   int32_t offset = simm32 (aarch64_get_instr (cpu), 25, 0) << 2;
@@ -12355,7 +13544,8 @@ dexBranchImmediate (sim_cpu *cpu)
 static void
 bcc (sim_cpu *cpu, int32_t offset, CondCode cc)
 {
-  /* the test returns TRUE if CC is met.  */
+  /* The test returns TRUE if CC is met.  */
+  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   if (testConditionCode (cpu, cc))
     aarch64_set_next_PC_by_offset (cpu, offset);
 }
@@ -12364,8 +13554,9 @@ bcc (sim_cpu *cpu, int32_t offset, CondCode cc)
 static void
 cbnz32 (sim_cpu *cpu, int32_t offset)
 {
-  unsigned rt = uimm (aarch64_get_instr (cpu), 4, 0);
+  unsigned rt = INSTR (4, 0);
 
+  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   if (aarch64_get_reg_u32 (cpu, rt, NO_SP) != 0)
     aarch64_set_next_PC_by_offset (cpu, offset);
 }
@@ -12374,8 +13565,9 @@ cbnz32 (sim_cpu *cpu, int32_t offset)
 static void
 cbnz (sim_cpu *cpu, int32_t offset)
 {
-  unsigned rt = uimm (aarch64_get_instr (cpu), 4, 0);
+  unsigned rt = INSTR (4, 0);
 
+  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   if (aarch64_get_reg_u64 (cpu, rt, NO_SP) != 0)
     aarch64_set_next_PC_by_offset (cpu, offset);
 }
@@ -12384,8 +13576,9 @@ cbnz (sim_cpu *cpu, int32_t offset)
 static void
 cbz32 (sim_cpu *cpu, int32_t offset)
 {
-  unsigned rt = uimm (aarch64_get_instr (cpu), 4, 0);
+  unsigned rt = INSTR (4, 0);
 
+  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   if (aarch64_get_reg_u32 (cpu, rt, NO_SP) == 0)
     aarch64_set_next_PC_by_offset (cpu, offset);
 }
@@ -12394,8 +13587,9 @@ cbz32 (sim_cpu *cpu, int32_t offset)
 static void
 cbz (sim_cpu *cpu, int32_t offset)
 {
-  unsigned rt = uimm (aarch64_get_instr (cpu), 4, 0);
+  unsigned rt = INSTR (4, 0);
 
+  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   if (aarch64_get_reg_u64 (cpu, rt, NO_SP) == 0)
     aarch64_set_next_PC_by_offset (cpu, offset);
 }
@@ -12404,19 +13598,21 @@ cbz (sim_cpu *cpu, int32_t offset)
 static void
 tbnz (sim_cpu *cpu, uint32_t  pos, int32_t offset)
 {
-  unsigned rt = uimm (aarch64_get_instr (cpu), 4, 0);
+  unsigned rt = INSTR (4, 0);
 
-  if (aarch64_get_reg_u64 (cpu, rt, NO_SP) & (1 << pos))
+  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
+  if (aarch64_get_reg_u64 (cpu, rt, NO_SP) & (((uint64_t) 1) << pos))
     aarch64_set_next_PC_by_offset (cpu, offset);
 }
 
-/* branch on register bit test zero -- one size fits all.  */
+/* Branch on register bit test zero -- one size fits all.  */
 static void
 tbz (sim_cpu *cpu, uint32_t  pos, int32_t offset)
 {
-  unsigned rt = uimm (aarch64_get_instr (cpu), 4, 0);
+  unsigned rt = INSTR (4, 0);
 
-  if (!(aarch64_get_reg_u64 (cpu, rt, NO_SP) & (1 << pos)))
+  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
+  if (!(aarch64_get_reg_u64 (cpu, rt, NO_SP) & (((uint64_t) 1) << pos)))
     aarch64_set_next_PC_by_offset (cpu, offset);
 }
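/* Aside (editorial sketch, not part of the patch): the ((uint64_t) 1) << pos
   cast above is load-bearing.  A plain `1 << pos' is an int-width shift: it
   is undefined for pos >= 31 and can never reach bits 32..63, so TBZ/TBNZ
   on high bit numbers would test garbage.  Minimal demonstration:  */

#include <stdint.h>
#include <stdio.h>

int
main (void)
{
  uint64_t reg = ((uint64_t) 1) << 40;
  unsigned pos = 40;

  /* The widened mask sees the bit; an int-width mask could not.  */
  printf ("%d\n", (reg & (((uint64_t) 1) << pos)) != 0); /* prints 1 */
  return 0;
}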
@@ -12429,8 +13625,8 @@ dexCompareBranchImmediate (sim_cpu *cpu)
      instr[23,5] = simm19 branch offset counted in words
      instr[4,0]  = rt  */
 
-  uint32_t size = uimm (aarch64_get_instr (cpu), 31, 31);
-  uint32_t op   = uimm (aarch64_get_instr (cpu), 24, 24);
+  uint32_t size = INSTR (31, 31);
+  uint32_t op   = INSTR (24, 24);
   int32_t offset = simm32 (aarch64_get_instr (cpu), 23, 5) << 2;
 
   if (size == 0)
@@ -12459,13 +13655,12 @@ dexTestBranchImmediate (sim_cpu *cpu)
      instr[18,5] = simm14 : signed offset counted in words
      instr[4,0]  = uimm5  */
 
-  uint32_t pos = ((uimm (aarch64_get_instr (cpu), 31, 31) << 4)
-                 | uimm (aarch64_get_instr (cpu), 23,19));
+  uint32_t pos = ((INSTR (31, 31) << 5) | INSTR (23, 19));
   int32_t offset = simm32 (aarch64_get_instr (cpu), 18, 5) << 2;
 
   NYI_assert (30, 25, 0x1b);
 
-  if (uimm (aarch64_get_instr (cpu), 24, 24) == 0)
+  if (INSTR (24, 24) == 0)
     tbz (cpu, pos, offset);
   else
     tbnz (cpu, pos, offset);
@@ -12481,9 +13676,7 @@ dexCondBranchImmediate (sim_cpu *cpu)
      instr[3,0]  = cond  */
 
   int32_t offset;
-  CondCode cc;
-  uint32_t op = ((uimm (aarch64_get_instr (cpu), 24, 24) << 1)
-                | uimm (aarch64_get_instr (cpu), 4, 4));
+  uint32_t op = ((INSTR (24, 24) << 1) | INSTR (4, 4));
 
   NYI_assert (31, 25, 0x2a);
 
@@ -12491,9 +13684,8 @@ dexCondBranchImmediate (sim_cpu *cpu)
     HALT_UNALLOC;
 
   offset = simm32 (aarch64_get_instr (cpu), 23, 5) << 2;
-  cc = condcode (aarch64_get_instr (cpu), 0);
 
-  bcc (cpu, offset, cc);
+  bcc (cpu, offset, INSTR (3, 0));
 }
 
 static void
@@ -12505,10 +13697,10 @@ dexBranchRegister (sim_cpu *cpu)
      instr[15,10] = op3 : must be 000000
     instr[4,0]   = op2 : must be 11111.  */
 
-  uint32_t op = uimm (aarch64_get_instr (cpu), 24, 21);
-  uint32_t op2 = uimm (aarch64_get_instr (cpu), 20, 16);
-  uint32_t op3 = uimm (aarch64_get_instr (cpu), 15, 10);
-  uint32_t op4 = uimm (aarch64_get_instr (cpu), 4, 0);
+  uint32_t op = INSTR (24, 21);
+  uint32_t op2 = INSTR (20, 16);
+  uint32_t op3 = INSTR (15, 10);
+  uint32_t op4 = INSTR (4, 0);
 
   NYI_assert (31, 25, 0x6b);
 
@@ -12526,9 +13718,9 @@ dexBranchRegister (sim_cpu *cpu)
 
   else
     {
-      /* ERET and DRPS accept 0b11111 for rn = aarch64_get_instr (cpu)[4,0].  */
+      /* ERET and DRPS accept 0b11111 for rn = instr [4,0].  */
       /* anything else is unallocated.  */
-      uint32_t rn = greg (aarch64_get_instr (cpu), 0);
+      uint32_t rn = INSTR (4, 0);
 
       if (rn != 0x1f)
        HALT_UNALLOC;
@@ -12567,6 +13759,7 @@ handle_halt (sim_cpu *cpu, uint32_t val)
 {
   uint64_t result = 0;
 
+  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   if (val != 0xf000)
     {
       TRACE_SYSCALL (cpu, " HLT [0x%x]", val);
@@ -12682,9 +13875,6 @@ handle_halt (sim_cpu *cpu, uint32_t val)
          else if (fd == 1)
            {
              printf ("%.*s", (int) len, aarch64_get_mem_ptr (cpu, buf));
-             if (disas)
-               /* So that the output stays in sync with trace output.  */
-               fflush (stdout);
            }
          else if (fd == 2)
            {
@@ -12753,9 +13943,9 @@ dexExcpnGen (sim_cpu *cpu)
      instr[4,2] = opc2 000 ==> OK, ow ==> UNALLOC
     instr[1,0] = LL : discriminates opc  */
 
-  uint32_t opc = uimm (aarch64_get_instr (cpu), 23, 21);
-  uint32_t imm16 = uimm (aarch64_get_instr (cpu), 20, 5);
-  uint32_t opc2 = uimm (aarch64_get_instr (cpu), 4, 2);
+  uint32_t opc = INSTR (23, 21);
+  uint32_t imm16 = INSTR (20, 5);
+  uint32_t opc2 = INSTR (4, 2);
   uint32_t LL;
 
   NYI_assert (31, 24, 0xd4);
@@ -12763,7 +13953,7 @@ dexExcpnGen (sim_cpu *cpu)
   if (opc2 != 0)
     HALT_UNALLOC;
 
-  LL = uimm (aarch64_get_instr (cpu), 1, 0);
+  LL = INSTR (1, 0);
 
   /* We only implement HLT and BRK for now.  */
   if (opc == 1 && LL == 0)
@@ -12783,9 +13973,7 @@ dexExcpnGen (sim_cpu *cpu)
   HALT_UNALLOC;
 }
 
-/* Stub for accessing system registers.
-   We implement support for the DCZID register since this is used
-   by the C library's memset function.  */
+/* Stub for accessing system registers.  */
 
 static uint64_t
 system_get (sim_cpu *cpu, unsigned op0, unsigned op1, unsigned crn,
@@ -12794,33 +13982,147 @@ system_get (sim_cpu *cpu, unsigned op0, unsigned op1, unsigned crn,
   if (crn == 0 && op1 == 3 && crm == 0 && op2 == 7)
     /* DCZID_EL0 - the Data Cache Zero ID register.
        We do not support DC ZVA at the moment, so
-       we return a value with the disable bit set.  */
+       we return a value with the disable bit set.
+       We implement support for the DCZID register since
+       it is used by the C library's memset function.  */
    return ((uint64_t) 1) << 4;
 
+  if (crn == 0 && op1 == 3 && crm == 0 && op2 == 1)
+    /* Cache Type Register.  */
+    return 0x80008000UL;
+
+  if (crn == 13 && op1 == 3 && crm == 0 && op2 == 2)
+    /* TPIDR_EL0 - thread pointer id.  */
+    return aarch64_get_thread_id (cpu);
+
+  if (op1 == 3 && crm == 4 && op2 == 0)
+    return aarch64_get_FPCR (cpu);
+
+  if (op1 == 3 && crm == 4 && op2 == 1)
+    return aarch64_get_FPSR (cpu);
+
+  else if (op1 == 3 && crm == 2 && op2 == 0)
+    return aarch64_get_CPSR (cpu);
+
   HALT_NYI;
 }
 
+static void
+system_set (sim_cpu *cpu, unsigned op0, unsigned op1, unsigned crn,
+           unsigned crm, unsigned op2, uint64_t val)
+{
+  if (op1 == 3 && crm == 4 && op2 == 0)
+    aarch64_set_FPCR (cpu, val);
+
+  else if (op1 == 3 && crm == 4 && op2 == 1)
+    aarch64_set_FPSR (cpu, val);
+
+  else if (op1 == 3 && crm == 2 && op2 == 0)
+    aarch64_set_CPSR (cpu, val);
+
+  else
+    HALT_NYI;
+}
+
 static void
 do_mrs (sim_cpu *cpu)
 {
-  /* instr[31:20] = 1101 01010 0011
+  /* instr[31:20] = 1101 0101 0001 1
     instr[19]    = op0
     instr[18,16] = op1
     instr[15,12] = CRn
     instr[11,8]  = CRm
     instr[7,5]   = op2
     instr[4,0]   = Rt  */
-  unsigned sys_op0 = uimm (aarch64_get_instr (cpu), 19, 19) + 2;
-  unsigned sys_op1 = uimm (aarch64_get_instr (cpu), 18, 16);
-  unsigned sys_crn = uimm (aarch64_get_instr (cpu), 15, 12);
-  unsigned sys_crm = uimm (aarch64_get_instr (cpu), 11, 8);
-  unsigned sys_op2 = uimm (aarch64_get_instr (cpu), 7, 5);
-  unsigned rt = uimm (aarch64_get_instr (cpu), 4, 0);
-
+  unsigned sys_op0 = INSTR (19, 19) + 2;
+  unsigned sys_op1 = INSTR (18, 16);
+  unsigned sys_crn = INSTR (15, 12);
+  unsigned sys_crm = INSTR (11, 8);
+  unsigned sys_op2 = INSTR (7, 5);
+  unsigned rt = INSTR (4, 0);
+
+  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
   aarch64_set_reg_u64 (cpu, rt, NO_SP,
                       system_get (cpu, sys_op0, sys_op1, sys_crn,
                                   sys_crm, sys_op2));
 }
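/* Aside (editorial sketch, not part of the patch): MRS/MSR name a system
   register by the tuple (op0,op1,CRn,CRm,op2); the simulator recognizes
   only the handful it models, e.g. DCZID_EL0 above is (3,3,0,0,7) once the
   implicit op0 = bit19 + 2 is applied.  The same tuple match, standalone:  */

#include <stdio.h>

static int
is_dczid_el0 (unsigned op0, unsigned op1, unsigned crn,
              unsigned crm, unsigned op2)
{
  return op0 == 3 && op1 == 3 && crn == 0 && crm == 0 && op2 == 7;
}

int
main (void)
{
  unsigned op0 = 1 + 2;  /* instruction bit 19 set => op0 = 3.  */
  printf ("%d\n", is_dczid_el0 (op0, 3, 0, 0, 7)); /* prints 1 */
  return 0;
}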
+static void
+do_MSR_immediate (sim_cpu *cpu)
+{
+  /* instr[31:19] = 1101 0101 0000 0
+     instr[18,16] = op1
+     instr[15,12] = 0100
+     instr[11,8]  = CRm
+     instr[7,5]   = op2
+     instr[4,0]   = 1 1111  */
+
+  unsigned op1 = INSTR (18, 16);
+  /*unsigned crm = INSTR (11, 8);*/
+  unsigned op2 = INSTR (7, 5);
+
+  NYI_assert (31, 19, 0x1AA0);
+  NYI_assert (15, 12, 0x4);
+  NYI_assert (4, 0, 0x1F);
+
+  if (op1 == 0)
+    {
+      if (op2 == 5)
+       HALT_NYI; /* set SPSel.  */
+      else
+       HALT_UNALLOC;
+    }
+  else if (op1 == 3)
+    {
+      if (op2 == 6)
+       HALT_NYI; /* set DAIFset.  */
+      else if (op2 == 7)
+       HALT_NYI; /* set DAIFclr.  */
+      else
+       HALT_UNALLOC;
+    }
+  else
+    HALT_UNALLOC;
+}
+
+static void
+do_MSR_reg (sim_cpu *cpu)
+{
+  /* instr[31:20] = 1101 0101 0001
+     instr[19]    = op0
+     instr[18,16] = op1
+     instr[15,12] = CRn
+     instr[11,8]  = CRm
+     instr[7,5]   = op2
+     instr[4,0]   = Rt  */
+
+  unsigned sys_op0 = INSTR (19, 19) + 2;
+  unsigned sys_op1 = INSTR (18, 16);
+  unsigned sys_crn = INSTR (15, 12);
+  unsigned sys_crm = INSTR (11, 8);
+  unsigned sys_op2 = INSTR (7, 5);
+  unsigned rt = INSTR (4, 0);
+
+  NYI_assert (31, 20, 0xD51);
+
+  TRACE_DECODE (cpu, "emulated at line %d", __LINE__);
+  system_set (cpu, sys_op0, sys_op1, sys_crn, sys_crm, sys_op2,
+             aarch64_get_reg_u64 (cpu, rt, NO_SP));
+}
+
+static void
+do_SYS (sim_cpu *cpu)
+{
+  /* instr[31,19] = 1101 0101 0000 1
+     instr[18,16] = op1
+     instr[15,12] = CRn
+     instr[11,8]  = CRm
+     instr[7,5]   = op2
+     instr[4,0]   = Rt  */
+  NYI_assert (31, 19, 0x1AA1);
+
+  /* FIXME: For now we just silently accept system ops.  */
+}
+
 static void
 dexSystem (sim_cpu *cpu)
 {
@@ -12850,20 +14152,19 @@ dexSystem (sim_cpu *cpu)
             types :  01 ==> Reads, 10 ==> Writes,
                      11 ==> All, 00 ==> All (domain == FullSystem).  */
 
-  unsigned rt = uimm (aarch64_get_instr (cpu), 4, 0);
-  uint32_t l_op0_op1_crn = uimm (aarch64_get_instr (cpu), 21, 12);
+  unsigned rt = INSTR (4, 0);
 
   NYI_assert (31, 22, 0x354);
 
-  switch (l_op0_op1_crn)
+  switch (INSTR (21, 12))
    {
    case 0x032:
      if (rt == 0x1F)
        {
          /* NOP has CRm != 0000 OR.  */
          /* (CRm == 0000 AND (op2 == 000 OR op2 > 101)).  */
-         uint32_t crm = uimm (aarch64_get_instr (cpu), 11, 8);
-         uint32_t op2 = uimm (aarch64_get_instr (cpu), 7, 5);
+         uint32_t crm = INSTR (11, 8);
+         uint32_t op2 = INSTR (7, 5);
 
          if (crm != 0 || (op2 == 0 || op2 > 5))
            {
@@ -12876,7 +14177,7 @@ dexSystem (sim_cpu *cpu)
 
    case 0x033:
      {
-       uint32_t op2 =  uimm (aarch64_get_instr (cpu), 7, 5);
+       uint32_t op2 =  INSTR (7, 5);
 
        switch (op2)
          {
@@ -12884,31 +14185,27 @@ dexSystem (sim_cpu *cpu)
          case 4: dsb (cpu); return;
          case 5: dmb (cpu); return;
          case 6: isb (cpu); return;
-         case 7:
          default: HALT_UNALLOC;
         }
      }
 
    case 0x3B0:
-     /* MRS Wt, sys-reg.  */
-     do_mrs (cpu);
-     return;
-
    case 0x3B4:
    case 0x3BD:
-     /* MRS Xt, sys-reg.  */
      do_mrs (cpu);
      return;
 
    case 0x0B7:
-     /* DC <type>, x<n>.  */
-     HALT_NYI;
+     do_SYS (cpu); /* DC is an alias of SYS.  */
      return;
 
    default:
-     /* if (uimm (aarch64_get_instr (cpu), 21, 20) == 0x1)
-         MRS Xt, sys-reg.  */
-     HALT_NYI;
+     if (INSTR (21, 20) == 0x1)
+       do_MSR_reg (cpu);
+     else if (INSTR (21, 19) == 0 && INSTR (15, 12) == 0x4)
+       do_MSR_immediate (cpu);
+     else
+       HALT_NYI;
      return;
    }
 }
@@ -12928,7 +14225,7 @@ dexBr (sim_cpu *cpu)
     case BR_IMMCMP_001:
       /* Compare has bit 25 clear while test has it set.  */
-      if (!uimm (aarch64_get_instr (cpu), 25, 25))
+      if (!INSTR (25, 25))
        dexCompareBranchImmediate (cpu);
      else
        dexTestBranchImmediate (cpu);
@@ -12937,7 +14234,7 @@ dexBr (sim_cpu *cpu)
     case BR_IMMCOND_010:
       /* This is a conditional branch if bit 25 is clear otherwise
         unallocated.  */
-      if (!uimm (aarch64_get_instr (cpu), 25, 25))
+      if (!INSTR (25, 25))
        dexCondBranchImmediate (cpu);
      else
        HALT_UNALLOC;
@@ -12952,7 +14249,7 @@ dexBr (sim_cpu *cpu)
     case BR_IMMCMP_101:
       /* Compare has bit 25 clear while test has it set.  */
-      if (!uimm (aarch64_get_instr (cpu), 25, 25))
+      if (!INSTR (25, 25))
        dexCompareBranchImmediate (cpu);
      else
        dexTestBranchImmediate (cpu);
@@ -12960,20 +14257,20 @@ dexBr (sim_cpu *cpu)
 
     case BR_REG_110:
       /* Unconditional branch reg has bit 25 set.  */
-      if (uimm (aarch64_get_instr (cpu), 25, 25))
+      if (INSTR (25, 25))
        dexBranchRegister (cpu);
 
       /* This includes both Excpn Gen, System and unalloc operations.
        We need to decode the Excpn Gen operation BRK so we can plant
        debugger entry points.
-        Excpn Gen operations have aarch64_get_instr (cpu)[24] = 0.
+        Excpn Gen operations have instr [24] = 0.
        we need to decode at least one of the System operations NOP
       which is an alias for HINT #0.
-        System operations have aarch64_get_instr (cpu)[24,22] = 100.  */
-      else if (uimm (aarch64_get_instr (cpu), 24, 24) == 0)
+        System operations have instr [24,22] = 100.  */
+      else if (INSTR (24, 24) == 0)
        dexExcpnGen (cpu);
 
-      else if (uimm (aarch64_get_instr (cpu), 24, 22) == 4)
+      else if (INSTR (24, 22) == 4)
        dexSystem (cpu);
 
      else
@@ -13034,21 +14331,15 @@ aarch64_step (sim_cpu *cpu)
     return FALSE;
 
   aarch64_set_next_PC (cpu, pc + 4);
-  aarch64_get_instr (cpu) = aarch64_get_mem_u32 (cpu, pc);
 
-  if (TRACE_INSN_P (cpu))
-    {
-      if (disas)
-       TRACE_INSN (cpu, " pc = %" PRIx64 " ", pc);
-      else
-       TRACE_INSN (cpu, " pc = %" PRIx64 " instr = %x", pc,
-                   aarch64_get_instr (cpu));
-    }
-  else if (disas)
-    sim_io_eprintf (CPU_STATE (cpu), " %" PRIx64 " ", pc);
+  /* Code is always little-endian.  */
+  sim_core_read_buffer (CPU_STATE (cpu), cpu, read_map,
+                       & aarch64_get_instr (cpu), pc, 4);
+  aarch64_get_instr (cpu) = endian_le2h_4 (aarch64_get_instr (cpu));
 
-  if (disas)
-    aarch64_print_insn (CPU_STATE (cpu), pc);
+  TRACE_INSN (cpu, " pc = %" PRIx64 " instr = %08x", pc,
+             aarch64_get_instr (cpu));
+  TRACE_DISASM (cpu, pc);
 
   aarch64_decode_and_execute (cpu, pc);
 
@@ -13061,10 +14352,15 @@ aarch64_run (SIM_DESC sd)
   sim_cpu *cpu = STATE_CPU (sd, 0);
 
   while (aarch64_step (cpu))
-    aarch64_update_PC (cpu);
+    {
+      aarch64_update_PC (cpu);
+
+      if (sim_events_tick (sd))
+       sim_events_process (sd);
+    }
 
-  sim_engine_halt (sd, NULL, NULL, aarch64_get_PC (cpu),
-                  sim_exited, aarch64_get_reg_s32 (cpu, R0, SP_OK));
+  sim_engine_halt (sd, cpu, NULL, aarch64_get_PC (cpu),
                  sim_exited, aarch64_get_reg_s32 (cpu, R0, NO_SP));
 }
 
 void
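/* Aside (editorial sketch, not part of the patch): the new fetch path reads
   the instruction as raw bytes and byte-swaps on big-endian hosts, so
   instruction decode no longer depends on the data-memory helpers.  A
   host-independent rendering of that little-endian 32-bit assemble step:  */

#include <stdint.h>
#include <stdio.h>

static uint32_t
le32_to_host (const uint8_t *p)
{
  return (uint32_t) p[0]
         | ((uint32_t) p[1] << 8)
         | ((uint32_t) p[2] << 16)
         | ((uint32_t) p[3] << 24);
}

int
main (void)
{
  uint8_t insn_bytes[4] = { 0x1f, 0x20, 0x03, 0xd5 }; /* NOP, little-endian.  */
  printf ("%08x\n", le32_to_host (insn_bytes));       /* d503201f */
  return 0;
}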