X-Git-Url: http://drtracing.org/?a=blobdiff_plain;f=gas%2Fconfig%2Ftc-arm.c;h=8ea3ad198b7be1d4455bfdd44e85d12dd97c9331;hb=945c025aafa9f4f36cdc5bb2f1dad083fa34a6d2;hp=5ccbfc1f13460b08753b1e833e1288ffa50cecfa;hpb=6e0080dd37a13129b04c34c7aa18ec2bee85b731;p=deliverable%2Fbinutils-gdb.git diff --git a/gas/config/tc-arm.c b/gas/config/tc-arm.c index 5ccbfc1f13..8ea3ad198b 100644 --- a/gas/config/tc-arm.c +++ b/gas/config/tc-arm.c @@ -1,6 +1,6 @@ /* tc-arm.c -- Assemble for the ARM Copyright 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, - 2004, 2005 + 2004, 2005, 2006 Free Software Foundation, Inc. Contributed by Richard Earnshaw (rwe@pegasus.esprit.ec.org) Modified by David Taylor (dtaylor@armltd.co.uk) @@ -25,29 +25,24 @@ Software Foundation, 51 Franklin Street - Fifth Floor, Boston, MA 02110-1301, USA. */ -#include #include +#include #define NO_RELOC 0 #include "as.h" #include "safe-ctype.h" - -/* Need TARGET_CPU. */ -#include "config.h" #include "subsegs.h" #include "obstack.h" -#include "symbols.h" -#include "listing.h" #include "opcode/arm.h" #ifdef OBJ_ELF #include "elf/arm.h" -#include "dwarf2dbg.h" #include "dw2gencfi.h" #endif -/* XXX Set this to 1 after the next binutils release. */ -#define WARN_DEPRECATED 0 +#include "dwarf2dbg.h" + +#define WARN_DEPRECATED 1 #ifdef OBJ_ELF /* Must be at least the size of the largest unwind opcode (currently two). */ @@ -91,6 +86,15 @@ static unsigned int marked_pr_dependency = 0; #endif /* OBJ_ELF */ +/* Results from operand parsing worker functions. */ + +typedef enum +{ + PARSE_OPERAND_SUCCESS, + PARSE_OPERAND_FAIL, + PARSE_OPERAND_FAIL_NO_BACKTRACK +} parse_operand_result; + enum arm_float_abi { ARM_FLOAT_ABI_HARD, @@ -151,6 +155,7 @@ static const arm_feature_set *mcpu_fpu_opt = NULL; static const arm_feature_set *march_cpu_opt = NULL; static const arm_feature_set *march_fpu_opt = NULL; static const arm_feature_set *mfpu_opt = NULL; +static const arm_feature_set *object_arch = NULL; /* Constants for known architecture features. */ static const arm_feature_set fpu_default = FPU_DEFAULT; @@ -197,6 +202,8 @@ static const arm_feature_set arm_arch_full = ARM_FEATURE (-1, -1); static const arm_feature_set arm_arch_t2 = ARM_ARCH_THUMB2; static const arm_feature_set arm_arch_none = ARM_ARCH_NONE; +static const arm_feature_set arm_cext_iwmmxt2 = + ARM_FEATURE (0, ARM_CEXT_IWMMXT2); static const arm_feature_set arm_cext_iwmmxt = ARM_FEATURE (0, ARM_CEXT_IWMMXT); static const arm_feature_set arm_cext_xscale = @@ -225,6 +232,12 @@ static int meabi_flags = EABI_DEFAULT; # else static int meabi_flags = EF_ARM_EABI_UNKNOWN; # endif + +bfd_boolean +arm_is_eabi(void) +{ + return (EF_ARM_EABI_VERSION (meabi_flags) >= EF_ARM_EABI_VER4); +} #endif #ifdef OBJ_ELF @@ -295,6 +308,10 @@ struct arm_it int size; int size_req; int cond; + /* "uncond_value" is set to the value in place of the conditional field in + unconditional versions of the instruction, or -1 if nothing is + appropriate. */ + int uncond_value; struct neon_type vectype; /* Set to the opcode if the instruction needs relaxation. Zero if the instruction is not relaxed. */ @@ -319,7 +336,9 @@ struct arm_it /* Note: we abuse "regisimm" to mean "is Neon register" in VMOV instructions. This allows us to disambiguate ARM <-> vector insns. */ unsigned regisimm : 1; /* 64-bit immediate, reg forms high 32 bits. */ + unsigned isvec : 1; /* Is a single, double or quad VFP/Neon reg. */ unsigned isquad : 1; /* Operand is Neon quad-precision register. 
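			   (A vector operand sets the isvec flag above;
			   issingle, below, and isquad then distinguish
			   single, double and quad registers.)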
*/ + unsigned issingle : 1; /* Operand is VFP single-precision register. */ unsigned hasreloc : 1; /* Operand has relocation suffix. */ unsigned writeback : 1; /* Operand has trailing ! */ unsigned preind : 1; /* Preindexed address. */ @@ -428,7 +447,9 @@ enum arm_reg_type REG_TYPE_VFS, REG_TYPE_VFD, REG_TYPE_NQ, + REG_TYPE_VFSD, REG_TYPE_NDQ, + REG_TYPE_NSDQ, REG_TYPE_VFC, REG_TYPE_MVF, REG_TYPE_MVD, @@ -465,7 +486,9 @@ const char *const reg_expected_msgs[] = N_("VFP single precision register expected"), N_("VFP/Neon double precision register expected"), N_("Neon quad precision register expected"), + N_("VFP single or double precision register expected"), N_("Neon double or quad precision register expected"), + N_("VFP single, double or Neon quad precision register expected"), N_("VFP system register expected"), N_("Maverick MVF register expected"), N_("Maverick MVD register expected"), @@ -631,6 +654,7 @@ struct asm_opcode #define BAD_ADDR_MODE _("instruction does not accept this addressing mode"); #define BAD_BRANCH _("branch must be last instruction in IT block") #define BAD_NOT_IT _("instruction not allowed in IT block") +#define BAD_FPU _("selected FPU does not support instruction") static struct hash_control *arm_ops_hsh; static struct hash_control *arm_cond_hsh; @@ -1110,6 +1134,11 @@ parse_neon_type (struct neon_type *type, char **str) case 'p': thistype = NT_poly; break; case 's': thistype = NT_signed; break; case 'u': thistype = NT_unsigned; break; + case 'd': + thistype = NT_float; + thissize = 64; + ptr++; + goto done; default: as_bad (_("unexpected character `%c' in type specifier"), *ptr); return FAIL; @@ -1133,6 +1162,7 @@ parse_neon_type (struct neon_type *type, char **str) } } + done: if (type) { type->el[type->elems].type = thistype; @@ -1233,9 +1263,16 @@ parse_typed_reg_or_scalar (char **ccp, enum arm_reg_type type, return altreg; } - /* Undo polymorphism for Neon D and Q registers. */ - if (type == REG_TYPE_NDQ - && (reg->type == REG_TYPE_NQ || reg->type == REG_TYPE_VFD)) + /* Undo polymorphism when a set of register types may be accepted. */ + if ((type == REG_TYPE_NDQ + && (reg->type == REG_TYPE_NQ || reg->type == REG_TYPE_VFD)) + || (type == REG_TYPE_VFSD + && (reg->type == REG_TYPE_VFS || reg->type == REG_TYPE_VFD)) + || (type == REG_TYPE_NSDQ + && (reg->type == REG_TYPE_VFS || reg->type == REG_TYPE_VFD + || reg->type == REG_TYPE_NQ)) + || (type == REG_TYPE_MMXWC + && (reg->type == REG_TYPE_MMXWCG))) type = reg->type; if (type != reg->type) @@ -1525,8 +1562,9 @@ enum reg_list_els bug. 
*/ static int -parse_vfp_reg_list (char **str, unsigned int *pbase, enum reg_list_els etype) +parse_vfp_reg_list (char **ccp, unsigned int *pbase, enum reg_list_els etype) { + char *str = *ccp; int base_reg; int new_base; enum arm_reg_type regtype = 0; @@ -1536,13 +1574,13 @@ parse_vfp_reg_list (char **str, unsigned int *pbase, enum reg_list_els etype) unsigned long mask = 0; int i; - if (**str != '{') + if (*str != '{') { inst.error = _("expecting {"); return FAIL; } - (*str)++; + str++; switch (etype) { @@ -1583,7 +1621,7 @@ parse_vfp_reg_list (char **str, unsigned int *pbase, enum reg_list_els etype) { int setmask = 1, addregs = 1; - new_base = arm_typed_reg_parse (str, regtype, ®type, NULL); + new_base = arm_typed_reg_parse (&str, regtype, ®type, NULL); if (new_base == FAIL) { @@ -1622,13 +1660,13 @@ parse_vfp_reg_list (char **str, unsigned int *pbase, enum reg_list_els etype) mask |= setmask << new_base; count += addregs; - if (**str == '-') /* We have the start of a range expression */ + if (*str == '-') /* We have the start of a range expression */ { int high_range; - (*str)++; + str++; - if ((high_range = arm_typed_reg_parse (str, regtype, NULL, NULL)) + if ((high_range = arm_typed_reg_parse (&str, regtype, NULL, NULL)) == FAIL) { inst.error = gettext (reg_expected_msgs[regtype]); @@ -1663,9 +1701,9 @@ parse_vfp_reg_list (char **str, unsigned int *pbase, enum reg_list_els etype) } } } - while (skip_past_comma (str) != FAIL); + while (skip_past_comma (&str) != FAIL); - (*str)++; + str++; /* Sanity check -- should have raised a parse error above. */ if (count == 0 || count > max_regs) @@ -1684,6 +1722,8 @@ parse_vfp_reg_list (char **str, unsigned int *pbase, enum reg_list_els etype) } } + *ccp = str; + return count; } @@ -2248,7 +2288,7 @@ s_unreq (int a ATTRIBUTE_UNUSED) static enum mstate mapstate = MAP_UNDEFINED; -static void +void mapping_state (enum mstate state) { symbolS * symbolP; @@ -3245,7 +3285,57 @@ s_arm_unwind_save_fpa (int reg) } -/* Parse a directive saving VFP registers. */ +/* Parse a directive saving VFP registers for ARMv6 and above. */ + +static void +s_arm_unwind_save_vfp_armv6 (void) +{ + int count; + unsigned int start; + valueT op; + int num_vfpv3_regs = 0; + int num_regs_below_16; + + count = parse_vfp_reg_list (&input_line_pointer, &start, REGLIST_VFP_D); + if (count == FAIL) + { + as_bad (_("expected register list")); + ignore_rest_of_line (); + return; + } + + demand_empty_rest_of_line (); + + /* We always generate FSTMD/FLDMD-style unwinding opcodes (rather + than FSTMX/FLDMX-style ones). */ + + /* Generate opcode for (VFPv3) registers numbered in the range 16 .. 31. */ + if (start >= 16) + num_vfpv3_regs = count; + else if (start + count > 16) + num_vfpv3_regs = start + count - 16; + + if (num_vfpv3_regs > 0) + { + int start_offset = start > 16 ? start - 16 : 0; + op = 0xc800 | (start_offset << 4) | (num_vfpv3_regs - 1); + add_unwind_opcode (op, 2); + } + + /* Generate opcode for registers numbered in the range 0 .. 15. */ + num_regs_below_16 = num_vfpv3_regs > 0 ? 16 - (int) start : count; + assert (num_regs_below_16 + num_vfpv3_regs == count); + if (num_regs_below_16 > 0) + { + op = 0xc900 | (start << 4) | (num_regs_below_16 - 1); + add_unwind_opcode (op, 2); + } + + unwind.frame_size += count * 8; +} + + +/* Parse a directive saving VFP registers for pre-ARMv6. 
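   (For contrast, the ARMv6 handler above emits FSTMD/FLDMD-style
   opcodes; e.g. ".vsave {d8-d11}" -- count 4, start 8, no VFPv3
   registers -- yields 0xc900 | (8 << 4) | (4 - 1) = 0xc983.  This
   pre-ARMv6 form emits the older FSTMX/FLDMX-style opcodes instead.)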
*/ static void s_arm_unwind_save_vfp (void) @@ -3369,7 +3459,7 @@ s_arm_unwind_save_mmxwr (void) op = 0xffff << (reg - 1); if (reg > 0 - || ((mask & op) == (1u << (reg - 1)))) + && ((mask & op) == (1u << (reg - 1)))) { op = (1 << (reg + i + 1)) - 1; op &= ~((1 << reg) - 1); @@ -3483,10 +3573,11 @@ error: } -/* Parse an unwind_save directive. */ +/* Parse an unwind_save directive. + If the argument is non-zero, this is a .vsave directive. */ static void -s_arm_unwind_save (int ignored ATTRIBUTE_UNUSED) +s_arm_unwind_save (int arch_v6) { char *peek; struct reg_entry *reg; @@ -3523,7 +3614,12 @@ s_arm_unwind_save (int ignored ATTRIBUTE_UNUSED) return; case REG_TYPE_RN: s_arm_unwind_save_core (); return; - case REG_TYPE_VFD: s_arm_unwind_save_vfp (); return; + case REG_TYPE_VFD: + if (arch_v6) + s_arm_unwind_save_vfp_armv6 (); + else + s_arm_unwind_save_vfp (); + return; case REG_TYPE_MMXWR: s_arm_unwind_save_mmxwr (); return; case REG_TYPE_MMXWCG: s_arm_unwind_save_mmxwcg (); return; @@ -3541,6 +3637,7 @@ s_arm_unwind_movsp (int ignored ATTRIBUTE_UNUSED) { int reg; valueT op; + int offset; reg = arm_reg_parse (&input_line_pointer, REG_TYPE_RN); if (reg == FAIL) @@ -3549,6 +3646,16 @@ s_arm_unwind_movsp (int ignored ATTRIBUTE_UNUSED) ignore_rest_of_line (); return; } + + /* Optional constant. */ + if (skip_past_comma (&input_line_pointer) != FAIL) + { + if (immediate_for_directive (&offset) == FAIL) + return; + } + else + offset = 0; + demand_empty_rest_of_line (); if (reg == REG_SP || reg == REG_PC) @@ -3566,7 +3673,7 @@ s_arm_unwind_movsp (int ignored ATTRIBUTE_UNUSED) /* Record the information for later. */ unwind.fp_reg = reg; - unwind.fp_offset = unwind.frame_size; + unwind.fp_offset = unwind.frame_size - offset; unwind.sp_restored = 1; } @@ -3790,9 +3897,32 @@ bad: #endif /* OBJ_ELF */ static void s_arm_arch (int); +static void s_arm_object_arch (int); static void s_arm_cpu (int); static void s_arm_fpu (int); +#ifdef TE_PE + +static void +pe_directive_secrel (int dummy ATTRIBUTE_UNUSED) +{ + expressionS exp; + + do + { + expression (&exp); + if (exp.X_op == O_symbol) + exp.X_op = O_secrel; + + emit_expr (&exp, 4); + } + while (*input_line_pointer++ == ','); + + input_line_pointer--; + demand_empty_rest_of_line (); +} +#endif /* TE_PE */ + /* This table describes all the machine specific pseudo-ops the assembler has to support. The fields are: pseudo-op name without dot @@ -3821,6 +3951,7 @@ const pseudo_typeS md_pseudo_table[] = { "syntax", s_syntax, 0 }, { "cpu", s_arm_cpu, 0 }, { "arch", s_arm_arch, 0 }, + { "object_arch", s_arm_object_arch, 0 }, { "fpu", s_arm_fpu, 0 }, #ifdef OBJ_ELF { "word", s_arm_elf_cons, 4 }, @@ -3833,6 +3964,7 @@ const pseudo_typeS md_pseudo_table[] = { "personalityindex", s_arm_unwind_personalityindex, 0 }, { "handlerdata", s_arm_unwind_handlerdata, 0 }, { "save", s_arm_unwind_save, 0 }, + { "vsave", s_arm_unwind_save, 1 }, { "movsp", s_arm_unwind_movsp, 0 }, { "pad", s_arm_unwind_pad, 0 }, { "setfp", s_arm_unwind_setfp, 0 }, @@ -3840,10 +3972,22 @@ const pseudo_typeS md_pseudo_table[] = { "eabi_attribute", s_arm_eabi_attribute, 0 }, #else { "word", cons, 4}, + + /* These are used for dwarf. */ + {"2byte", cons, 2}, + {"4byte", cons, 4}, + {"8byte", cons, 8}, + /* These are used for dwarf2. 
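+     For example, compiler output such as
+
+	.file 1 "test.c"
+	.loc 1 42 0
+
+     is handled by these entries (the file name and numbers here are
+     illustrative).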
*/ + { "file", (void (*) (int)) dwarf2_directive_file, 0 }, + { "loc", dwarf2_directive_loc, 0 }, + { "loc_mark_labels", dwarf2_directive_loc_mark_labels, 0 }, #endif { "extend", float_cons, 'x' }, { "ldouble", float_cons, 'x' }, { "packed", float_cons, 'p' }, +#ifdef TE_PE + {"secrel32", pe_directive_secrel, 0}, +#endif { 0, 0, 0 } }; @@ -3878,7 +4022,7 @@ parse_immediate (char **str, int *val, int min, int max, } /* Less-generic immediate-value read function with the possibility of loading a - big (64-bit) immediate, as required by Neon VMOV and VMVN immediate + big (64-bit) immediate, as required by Neon VMOV, VMVN and logic immediate instructions. Puts the result directly in inst.operands[i]. */ static int @@ -3890,7 +4034,18 @@ parse_big_immediate (char **str, int i) my_get_expression (&exp, &ptr, GE_OPT_PREFIX_BIG); if (exp.X_op == O_constant) - inst.operands[i].imm = exp.X_add_number; + { + inst.operands[i].imm = exp.X_add_number & 0xffffffff; + /* If we're on a 64-bit host, then a 64-bit number can be returned using + O_constant. We have to be careful not to break compilation for + 32-bit X_add_number, though. */ + if ((exp.X_add_number & ~0xffffffffl) != 0) + { + /* X >> 32 is illegal if sizeof (exp.X_add_number) == 4. */ + inst.operands[i].reg = ((exp.X_add_number >> 16) >> 16) & 0xffffffff; + inst.operands[i].regisimm = 1; + } + } else if (exp.X_op == O_big && LITTLENUM_NUMBER_OF_BITS * exp.X_add_number > 32 && LITTLENUM_NUMBER_OF_BITS * exp.X_add_number <= 64) @@ -4241,6 +4396,168 @@ parse_shifter_operand (char **str, int i) return SUCCESS; } +/* Group relocation information. Each entry in the table contains the + textual name of the relocation as may appear in assembler source + and must end with a colon. + Along with this textual name are the relocation codes to be used if + the corresponding instruction is an ALU instruction (ADD or SUB only), + an LDR, an LDRS, or an LDC. */ + +struct group_reloc_table_entry +{ + const char *name; + int alu_code; + int ldr_code; + int ldrs_code; + int ldc_code; +}; + +typedef enum +{ + /* Varieties of non-ALU group relocation. 
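+
+     Each value selects the corresponding relocation column of
+     group_reloc_table below; e.g. an address operand written as
+     "ldr r0, [r1, #:pc_g1:(sym)]" (registers and symbol illustrative)
+     would be parsed with GROUP_LDR and so use the LDR column.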
*/ + + GROUP_LDR, + GROUP_LDRS, + GROUP_LDC +} group_reloc_type; + +static struct group_reloc_table_entry group_reloc_table[] = + { /* Program counter relative: */ + { "pc_g0_nc", + BFD_RELOC_ARM_ALU_PC_G0_NC, /* ALU */ + 0, /* LDR */ + 0, /* LDRS */ + 0 }, /* LDC */ + { "pc_g0", + BFD_RELOC_ARM_ALU_PC_G0, /* ALU */ + BFD_RELOC_ARM_LDR_PC_G0, /* LDR */ + BFD_RELOC_ARM_LDRS_PC_G0, /* LDRS */ + BFD_RELOC_ARM_LDC_PC_G0 }, /* LDC */ + { "pc_g1_nc", + BFD_RELOC_ARM_ALU_PC_G1_NC, /* ALU */ + 0, /* LDR */ + 0, /* LDRS */ + 0 }, /* LDC */ + { "pc_g1", + BFD_RELOC_ARM_ALU_PC_G1, /* ALU */ + BFD_RELOC_ARM_LDR_PC_G1, /* LDR */ + BFD_RELOC_ARM_LDRS_PC_G1, /* LDRS */ + BFD_RELOC_ARM_LDC_PC_G1 }, /* LDC */ + { "pc_g2", + BFD_RELOC_ARM_ALU_PC_G2, /* ALU */ + BFD_RELOC_ARM_LDR_PC_G2, /* LDR */ + BFD_RELOC_ARM_LDRS_PC_G2, /* LDRS */ + BFD_RELOC_ARM_LDC_PC_G2 }, /* LDC */ + /* Section base relative */ + { "sb_g0_nc", + BFD_RELOC_ARM_ALU_SB_G0_NC, /* ALU */ + 0, /* LDR */ + 0, /* LDRS */ + 0 }, /* LDC */ + { "sb_g0", + BFD_RELOC_ARM_ALU_SB_G0, /* ALU */ + BFD_RELOC_ARM_LDR_SB_G0, /* LDR */ + BFD_RELOC_ARM_LDRS_SB_G0, /* LDRS */ + BFD_RELOC_ARM_LDC_SB_G0 }, /* LDC */ + { "sb_g1_nc", + BFD_RELOC_ARM_ALU_SB_G1_NC, /* ALU */ + 0, /* LDR */ + 0, /* LDRS */ + 0 }, /* LDC */ + { "sb_g1", + BFD_RELOC_ARM_ALU_SB_G1, /* ALU */ + BFD_RELOC_ARM_LDR_SB_G1, /* LDR */ + BFD_RELOC_ARM_LDRS_SB_G1, /* LDRS */ + BFD_RELOC_ARM_LDC_SB_G1 }, /* LDC */ + { "sb_g2", + BFD_RELOC_ARM_ALU_SB_G2, /* ALU */ + BFD_RELOC_ARM_LDR_SB_G2, /* LDR */ + BFD_RELOC_ARM_LDRS_SB_G2, /* LDRS */ + BFD_RELOC_ARM_LDC_SB_G2 } }; /* LDC */ + +/* Given the address of a pointer pointing to the textual name of a group + relocation as may appear in assembler source, attempt to find its details + in group_reloc_table. The pointer will be updated to the character after + the trailing colon. On failure, FAIL will be returned; SUCCESS + otherwise. On success, *entry will be updated to point at the relevant + group_reloc_table entry. */ + +static int +find_group_reloc_table_entry (char **str, struct group_reloc_table_entry **out) +{ + unsigned int i; + for (i = 0; i < ARRAY_SIZE (group_reloc_table); i++) + { + int length = strlen (group_reloc_table[i].name); + + if (strncasecmp (group_reloc_table[i].name, *str, length) == 0 && + (*str)[length] == ':') + { + *out = &group_reloc_table[i]; + *str += (length + 1); + return SUCCESS; + } + } + + return FAIL; +} + +/* Parse a for an ARM data processing instruction + (as for parse_shifter_operand) where group relocations are allowed: + + # + #, + #:: + + , + + where is one of the strings defined in group_reloc_table. + The hashes are optional. + + Everything else is as for parse_shifter_operand. */ + +static parse_operand_result +parse_shifter_operand_group_reloc (char **str, int i) +{ + /* Determine if we have the sequence of characters #: or just : + coming next. If we do, then we check for a group relocation. + If we don't, punt the whole lot to parse_shifter_operand. */ + + if (((*str)[0] == '#' && (*str)[1] == ':') + || (*str)[0] == ':') + { + struct group_reloc_table_entry *entry; + + if ((*str)[0] == '#') + (*str) += 2; + else + (*str)++; + + /* Try to parse a group relocation. Anything else is an error. */ + if (find_group_reloc_table_entry (str, &entry) == FAIL) + { + inst.error = _("unknown group relocation"); + return PARSE_OPERAND_FAIL_NO_BACKTRACK; + } + + /* We now have the group relocation table entry corresponding to + the name in the assembler source. Next, we parse the expression. 
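+	 (e.g. the "(sym)" part of an operand written as
+	 "#:pc_g0:(sym)"; the name and symbol are illustrative).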
*/ + if (my_get_expression (&inst.reloc.exp, str, GE_NO_PREFIX)) + return PARSE_OPERAND_FAIL_NO_BACKTRACK; + + /* Record the relocation type (always the ALU variant here). */ + inst.reloc.type = entry->alu_code; + assert (inst.reloc.type != 0); + + return PARSE_OPERAND_SUCCESS; + } + else + return parse_shifter_operand (str, i) == SUCCESS + ? PARSE_OPERAND_SUCCESS : PARSE_OPERAND_FAIL; + + /* Never reached. */ +} + /* Parse all forms of an ARM address expression. Information is written to inst.operands[i] and/or inst.reloc. @@ -4273,8 +4590,9 @@ parse_shifter_operand (char **str, int i) It is the caller's responsibility to check for addressing modes not supported by the instruction, and to set inst.reloc.type. */ -static int -parse_address (char **str, int i) +static parse_operand_result +parse_address_main (char **str, int i, int group_relocations, + group_reloc_type group_type) { char *p = *str; int reg; @@ -4292,16 +4610,16 @@ parse_address (char **str, int i) /* else a load-constant pseudo op, no special treatment needed here */ if (my_get_expression (&inst.reloc.exp, &p, GE_NO_PREFIX)) - return FAIL; + return PARSE_OPERAND_FAIL; *str = p; - return SUCCESS; + return PARSE_OPERAND_SUCCESS; } if ((reg = arm_reg_parse (&p, REG_TYPE_RN)) == FAIL) { inst.error = _(reg_expected_msgs[REG_TYPE_RN]); - return FAIL; + return PARSE_OPERAND_FAIL; } inst.operands[i].reg = reg; inst.operands[i].isreg = 1; @@ -4320,7 +4638,7 @@ parse_address (char **str, int i) if (skip_past_comma (&p) == SUCCESS) if (parse_shift (&p, i, SHIFT_IMMEDIATE) == FAIL) - return FAIL; + return PARSE_OPERAND_FAIL; } else if (skip_past_char (&p, ':') == SUCCESS) { @@ -4332,7 +4650,7 @@ parse_address (char **str, int i) if (exp.X_op != O_constant) { inst.error = _("alignment must be constant"); - return FAIL; + return PARSE_OPERAND_FAIL; } inst.operands[i].imm = exp.X_add_number << 8; inst.operands[i].immisalign = 1; @@ -4346,15 +4664,68 @@ parse_address (char **str, int i) inst.operands[i].negative = 0; p--; } - if (my_get_expression (&inst.reloc.exp, &p, GE_IMM_PREFIX)) - return FAIL; + + if (group_relocations && + ((*p == '#' && *(p + 1) == ':') || *p == ':')) + + { + struct group_reloc_table_entry *entry; + + /* Skip over the #: or : sequence. */ + if (*p == '#') + p += 2; + else + p++; + + /* Try to parse a group relocation. Anything else is an + error. */ + if (find_group_reloc_table_entry (&p, &entry) == FAIL) + { + inst.error = _("unknown group relocation"); + return PARSE_OPERAND_FAIL_NO_BACKTRACK; + } + + /* We now have the group relocation table entry corresponding to + the name in the assembler source. Next, we parse the + expression. */ + if (my_get_expression (&inst.reloc.exp, &p, GE_NO_PREFIX)) + return PARSE_OPERAND_FAIL_NO_BACKTRACK; + + /* Record the relocation type. 
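+	 (the LDR, LDRS or LDC column of the matched table entry,
+	 according to which GROUP_ variant the caller requested).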
*/ + switch (group_type) + { + case GROUP_LDR: + inst.reloc.type = entry->ldr_code; + break; + + case GROUP_LDRS: + inst.reloc.type = entry->ldrs_code; + break; + + case GROUP_LDC: + inst.reloc.type = entry->ldc_code; + break; + + default: + assert (0); + } + + if (inst.reloc.type == 0) + { + inst.error = _("this group relocation is not allowed on this instruction"); + return PARSE_OPERAND_FAIL_NO_BACKTRACK; + } + } + else + if (my_get_expression (&inst.reloc.exp, &p, GE_IMM_PREFIX)) + return PARSE_OPERAND_FAIL; } } if (skip_past_char (&p, ']') == FAIL) { inst.error = _("']' expected"); - return FAIL; + return PARSE_OPERAND_FAIL; } if (skip_past_char (&p, '!') == SUCCESS) @@ -4367,20 +4738,20 @@ parse_address (char **str, int i) /* [Rn], {expr} - unindexed, with option */ if (parse_immediate (&p, &inst.operands[i].imm, 0, 255, TRUE) == FAIL) - return FAIL; + return PARSE_OPERAND_FAIL; if (skip_past_char (&p, '}') == FAIL) { inst.error = _("'}' expected at end of 'option' field"); - return FAIL; + return PARSE_OPERAND_FAIL; } if (inst.operands[i].preind) { inst.error = _("cannot combine index with option"); - return FAIL; + return PARSE_OPERAND_FAIL; } *str = p; - return SUCCESS; + return PARSE_OPERAND_SUCCESS; } else { @@ -4390,7 +4761,7 @@ parse_address (char **str, int i) if (inst.operands[i].preind) { inst.error = _("cannot combine pre- and post-indexing"); - return FAIL; + return PARSE_OPERAND_FAIL; } if (*p == '+') p++; @@ -4408,7 +4779,7 @@ parse_address (char **str, int i) if (skip_past_comma (&p) == SUCCESS) if (parse_shift (&p, i, SHIFT_IMMEDIATE) == FAIL) - return FAIL; + return PARSE_OPERAND_FAIL; } else { @@ -4418,7 +4789,7 @@ parse_address (char **str, int i) p--; } if (my_get_expression (&inst.reloc.exp, &p, GE_IMM_PREFIX)) - return FAIL; + return PARSE_OPERAND_FAIL; } } } @@ -4432,6 +4803,59 @@ parse_address (char **str, int i) inst.reloc.exp.X_add_number = 0; } *str = p; + return PARSE_OPERAND_SUCCESS; +} + +static int +parse_address (char **str, int i) +{ + return parse_address_main (str, i, 0, 0) == PARSE_OPERAND_SUCCESS + ? SUCCESS : FAIL; +} + +static parse_operand_result +parse_address_group_reloc (char **str, int i, group_reloc_type type) +{ + return parse_address_main (str, i, 1, type); +} + +/* Parse an operand for a MOVW or MOVT instruction. */ +static int +parse_half (char **str) +{ + char * p; + + p = *str; + skip_past_char (&p, '#'); + if (strncasecmp (p, ":lower16:", 9) == 0) + inst.reloc.type = BFD_RELOC_ARM_MOVW; + else if (strncasecmp (p, ":upper16:", 9) == 0) + inst.reloc.type = BFD_RELOC_ARM_MOVT; + + if (inst.reloc.type != BFD_RELOC_UNUSED) + { + p += 9; + skip_whitespace(p); + } + + if (my_get_expression (&inst.reloc.exp, &p, GE_NO_PREFIX)) + return FAIL; + + if (inst.reloc.type == BFD_RELOC_UNUSED) + { + if (inst.reloc.exp.X_op != O_constant) + { + inst.error = _("constant expression expected"); + return FAIL; + } + if (inst.reloc.exp.X_add_number < 0 + || inst.reloc.exp.X_add_number > 0xffff) + { + inst.error = _("immediate value out of range"); + return FAIL; + } + } + *str = p; return SUCCESS; } @@ -4700,9 +5124,8 @@ parse_tb (char **str) /* Parse the operands of a Neon VMOV instruction. See do_neon_mov for more information on the types the operands can take and how they are encoded. - Note particularly the abuse of ".regisimm" to signify a Neon register. - Up to three operands may be read; this function handles setting the - ".present" field for each operand itself. 
+ Up to four operands may be read; this function handles setting the + ".present" field for each read operand itself. Updates STR and WHICH_OPERAND if parsing is successful and returns SUCCESS, else returns FAIL. */ @@ -4732,7 +5155,7 @@ parse_neon_mov (char **str, int *which_operand) inst.operands[i].isreg = 1; inst.operands[i].present = 1; } - else if ((val = arm_typed_reg_parse (&ptr, REG_TYPE_NDQ, &rtype, &optype)) + else if ((val = arm_typed_reg_parse (&ptr, REG_TYPE_NSDQ, &rtype, &optype)) != FAIL) { /* Cases 0, 1, 2, 3, 5 (D only). */ @@ -4742,57 +5165,84 @@ parse_neon_mov (char **str, int *which_operand) inst.operands[i].reg = val; inst.operands[i].isreg = 1; inst.operands[i].isquad = (rtype == REG_TYPE_NQ); + inst.operands[i].issingle = (rtype == REG_TYPE_VFS); + inst.operands[i].isvec = 1; inst.operands[i].vectype = optype; inst.operands[i++].present = 1; if ((val = arm_reg_parse (&ptr, REG_TYPE_RN)) != FAIL) { - /* Case 5: VMOV , , . */ - inst.operands[i-1].regisimm = 1; + /* Case 5: VMOV , , . + Case 13: VMOV , */ inst.operands[i].reg = val; inst.operands[i].isreg = 1; - inst.operands[i++].present = 1; + inst.operands[i].present = 1; if (rtype == REG_TYPE_NQ) { first_error (_("can't use Neon quad register here")); return FAIL; } - if (skip_past_comma (&ptr) == FAIL) - goto wanted_comma; - if ((val = arm_reg_parse (&ptr, REG_TYPE_RN)) == FAIL) - goto wanted_arm; - inst.operands[i].reg = val; - inst.operands[i].isreg = 1; - inst.operands[i].present = 1; + else if (rtype != REG_TYPE_VFS) + { + i++; + if (skip_past_comma (&ptr) == FAIL) + goto wanted_comma; + if ((val = arm_reg_parse (&ptr, REG_TYPE_RN)) == FAIL) + goto wanted_arm; + inst.operands[i].reg = val; + inst.operands[i].isreg = 1; + inst.operands[i].present = 1; + } } else if (parse_qfloat_immediate (&ptr, &inst.operands[i].imm) == SUCCESS) - { /* Case 2: VMOV.
, # - Case 3: VMOV.
, # */ - if (!thumb_mode && (inst.instruction & 0xf0000000) != 0xe0000000) - goto bad_cond; - } + Case 3: VMOV.
, # + Case 10: VMOV.F32 , # + Case 11: VMOV.F64
, # */ + ; else if (parse_big_immediate (&ptr, i) == SUCCESS) - { /* Case 2: VMOV.
, # Case 3: VMOV.
, # */ - if (!thumb_mode && (inst.instruction & 0xf0000000) != 0xe0000000) - goto bad_cond; - } - else if ((val = arm_typed_reg_parse (&ptr, REG_TYPE_NDQ, &rtype, &optype)) - != FAIL) + ; + else if ((val = arm_typed_reg_parse (&ptr, REG_TYPE_NSDQ, &rtype, + &optype)) != FAIL) { /* Case 0: VMOV , - Case 1: VMOV
, */ - if (!thumb_mode && (inst.instruction & 0xf0000000) != 0xe0000000) - goto bad_cond; + Case 1: VMOV
, + Case 8: VMOV.F32 , + Case 15: VMOV , , , */ inst.operands[i].reg = val; inst.operands[i].isreg = 1; inst.operands[i].isquad = (rtype == REG_TYPE_NQ); + inst.operands[i].issingle = (rtype == REG_TYPE_VFS); + inst.operands[i].isvec = 1; inst.operands[i].vectype = optype; inst.operands[i].present = 1; + + if (skip_past_comma (&ptr) == SUCCESS) + { + /* Case 15. */ + i++; + + if ((val = arm_reg_parse (&ptr, REG_TYPE_RN)) == FAIL) + goto wanted_arm; + + inst.operands[i].reg = val; + inst.operands[i].isreg = 1; + inst.operands[i++].present = 1; + + if (skip_past_comma (&ptr) == FAIL) + goto wanted_comma; + + if ((val = arm_reg_parse (&ptr, REG_TYPE_RN)) == FAIL) + goto wanted_arm; + + inst.operands[i].reg = val; + inst.operands[i].isreg = 1; + inst.operands[i++].present = 1; + } } else { @@ -4828,18 +5278,50 @@ parse_neon_mov (char **str, int *which_operand) if (skip_past_comma (&ptr) == FAIL) goto wanted_comma; - if ((val = arm_typed_reg_parse (&ptr, REG_TYPE_VFD, NULL, &optype)) + if ((val = arm_typed_reg_parse (&ptr, REG_TYPE_VFSD, &rtype, &optype)) == FAIL) { - first_error (_(reg_expected_msgs[REG_TYPE_VFD])); + first_error (_(reg_expected_msgs[REG_TYPE_VFSD])); return FAIL; } inst.operands[i].reg = val; inst.operands[i].isreg = 1; - inst.operands[i].regisimm = 1; + inst.operands[i].isvec = 1; + inst.operands[i].issingle = (rtype == REG_TYPE_VFS); inst.operands[i].vectype = optype; inst.operands[i].present = 1; + + if (rtype == REG_TYPE_VFS) + { + /* Case 14. */ + i++; + if (skip_past_comma (&ptr) == FAIL) + goto wanted_comma; + if ((val = arm_typed_reg_parse (&ptr, REG_TYPE_VFS, NULL, + &optype)) == FAIL) + { + first_error (_(reg_expected_msgs[REG_TYPE_VFS])); + return FAIL; + } + inst.operands[i].reg = val; + inst.operands[i].isreg = 1; + inst.operands[i].isvec = 1; + inst.operands[i].issingle = 1; + inst.operands[i].vectype = optype; + inst.operands[i].present = 1; + } + } + else if ((val = arm_typed_reg_parse (&ptr, REG_TYPE_VFS, NULL, &optype)) + != FAIL) + { + /* Case 13. */ + inst.operands[i].reg = val; + inst.operands[i].isreg = 1; + inst.operands[i].isvec = 1; + inst.operands[i].issingle = 1; + inst.operands[i].vectype = optype; + inst.operands[i++].present = 1; } } else @@ -4860,10 +5342,6 @@ parse_neon_mov (char **str, int *which_operand) wanted_arm: first_error (_(reg_expected_msgs[REG_TYPE_RN])); return FAIL; - - bad_cond: - first_error (_("instruction cannot be conditionalized")); - return FAIL; } /* Matcher codes for parse_operands. */ @@ -4882,7 +5360,9 @@ enum operand_parse_code OP_RVD, /* VFP double precision register (0..15) */ OP_RND, /* Neon double precision register (0..31) */ OP_RNQ, /* Neon quad precision register */ + OP_RVSD, /* VFP single or double precision register */ OP_RNDQ, /* Neon double or quad precision register */ + OP_RNSDQ, /* Neon single, double or quad precision register */ OP_RNSC, /* Neon scalar D[X] */ OP_RVC, /* VFP control register */ OP_RMF, /* Maverick F register */ @@ -4899,17 +5379,21 @@ enum operand_parse_code OP_REGLST, /* ARM register list */ OP_VRSLST, /* VFP single-precision register list */ OP_VRDLST, /* VFP double-precision register list */ + OP_VRSDLST, /* VFP single or double-precision register list (& quad) */ OP_NRDLST, /* Neon double-precision register list (d0-d31, qN aliases) */ OP_NSTRLST, /* Neon element/structure list */ OP_NILO, /* Neon immediate/logic operands 2 or 2+3. (VBIC, VORR...) */ OP_RNDQ_I0, /* Neon D or Q reg, or immediate zero. */ + OP_RVSD_I0, /* VFP S or D reg, or immediate zero. 
*/ OP_RR_RNSC, /* ARM reg or Neon scalar. */ + OP_RNSDQ_RNSC, /* Vector S, D or Q reg, or Neon scalar. */ OP_RNDQ_RNSC, /* Neon D or Q reg, or Neon scalar. */ OP_RND_RNSC, /* Neon D reg, or Neon scalar. */ OP_VMOV, /* Neon VMOV operands. */ OP_RNDQ_IMVNb,/* Neon D or Q reg, or immediate good for VMVN. */ OP_RNDQ_I63b, /* Neon D or Q reg, or immediate for shift. */ + OP_RIWR_I32z, /* iWMMXt wR register, or immediate 0 .. 32 for iWMMXt2. */ OP_I0, /* immediate zero */ OP_I7, /* immediate value 0 .. 7 */ @@ -4925,7 +5409,6 @@ enum operand_parse_code OP_I64, /* 1 .. 64 */ OP_I64z, /* 0 .. 64 */ OP_I255, /* 0 .. 255 */ - OP_Iffff, /* 0 .. 65535 */ OP_I4b, /* immediate, prefix optional, 1 .. 4 */ OP_I7b, /* 0 .. 7 */ @@ -4933,10 +5416,15 @@ enum operand_parse_code OP_I31b, /* 0 .. 31 */ OP_SH, /* shifter operand */ + OP_SHG, /* shifter operand with possible group relocation */ OP_ADDR, /* Memory address expression (any mode) */ + OP_ADDRGLDR, /* Mem addr expr (any mode) with possible LDR group reloc */ + OP_ADDRGLDRS, /* Mem addr expr (any mode) with possible LDRS group reloc */ + OP_ADDRGLDC, /* Mem addr expr (any mode) with possible LDC group reloc */ OP_EXP, /* arbitrary expression */ OP_EXPi, /* same, with optional immediate prefix */ OP_EXPr, /* same, with optional relocation suffix */ + OP_HALF, /* 0 .. 65535 or low/high reloc. */ OP_CPSF, /* CPS flags */ OP_ENDI, /* Endianness specifier */ @@ -4944,11 +5432,15 @@ enum operand_parse_code OP_COND, /* conditional code */ OP_TB, /* Table branch. */ + OP_RVC_PSR, /* CPSR/SPSR mask for msr, or VFP control register. */ + OP_APSR_RR, /* ARM register or "APSR_nzcv". */ + OP_RRnpc_I0, /* ARM register or literal 0 */ OP_RR_EXr, /* ARM register or expression with opt. reloc suff. */ OP_RR_EXi, /* ARM register or expression with imm prefix */ OP_RF_IF, /* FPA register or immediate */ OP_RIWR_RIWC, /* iWMMXt R or C reg */ + OP_RIWC_RIWG, /* iWMMXt wC or wCG reg */ /* Optional operands. */ OP_oI7b, /* immediate, prefix optional, 0 .. 
7 */ @@ -4962,6 +5454,7 @@ enum operand_parse_code OP_oRND, /* Optional Neon double precision register */ OP_oRNQ, /* Optional Neon quad precision register */ OP_oRNDQ, /* Optional Neon double or quad precision register */ + OP_oRNSDQ, /* Optional single, double or quad precision vector register */ OP_oSHll, /* LSL immediate */ OP_oSHar, /* ASR immediate */ OP_oSHllar, /* LSL or ASR immediate */ @@ -4983,6 +5476,7 @@ parse_operands (char *str, const unsigned char *pattern) const char *backtrack_error = 0; int i, val, backtrack_index = 0; enum arm_reg_type rtype; + parse_operand_result result; #define po_char_or_fail(chr) do { \ if (skip_past_char (&str, chr) == FAIL) \ @@ -5000,6 +5494,10 @@ parse_operands (char *str, const unsigned char *pattern) inst.operands[i].reg = val; \ inst.operands[i].isreg = 1; \ inst.operands[i].isquad = (rtype == REG_TYPE_NQ); \ + inst.operands[i].issingle = (rtype == REG_TYPE_VFS); \ + inst.operands[i].isvec = (rtype == REG_TYPE_VFS \ + || rtype == REG_TYPE_VFD \ + || rtype == REG_TYPE_NQ); \ } while (0) #define po_reg_or_goto(regtype, label) do { \ @@ -5011,6 +5509,10 @@ parse_operands (char *str, const unsigned char *pattern) inst.operands[i].reg = val; \ inst.operands[i].isreg = 1; \ inst.operands[i].isquad = (rtype == REG_TYPE_NQ); \ + inst.operands[i].issingle = (rtype == REG_TYPE_VFS); \ + inst.operands[i].isvec = (rtype == REG_TYPE_VFS \ + || rtype == REG_TYPE_VFD \ + || rtype == REG_TYPE_NQ); \ } while (0) #define po_imm_or_fail(min, max, popt) do { \ @@ -5032,6 +5534,14 @@ parse_operands (char *str, const unsigned char *pattern) goto failure; \ } while (0) +#define po_misc_or_fail_no_backtrack(expr) do { \ + result = expr; \ + if (result == PARSE_OPERAND_FAIL_NO_BACKTRACK)\ + backtrack_pos = 0; \ + if (result != PARSE_OPERAND_SUCCESS) \ + goto failure; \ +} while (0) + skip_whitespace (str); for (i = 0; upat[i] != OP_stop; i++) @@ -5077,6 +5587,9 @@ parse_operands (char *str, const unsigned char *pattern) case OP_RNQ: po_reg_or_fail (REG_TYPE_NQ); break; case OP_oRNDQ: case OP_RNDQ: po_reg_or_fail (REG_TYPE_NDQ); break; + case OP_RVSD: po_reg_or_fail (REG_TYPE_VFSD); break; + case OP_oRNSDQ: + case OP_RNSDQ: po_reg_or_fail (REG_TYPE_NSDQ); break; /* Neon scalar. Using an element size of 8 means that some invalid scalars are accepted here, so deal with those in later code. */ @@ -5089,6 +5602,7 @@ parse_operands (char *str, const unsigned char *pattern) case OP_NILO: { po_reg_or_goto (REG_TYPE_NDQ, try_imm); + inst.operands[i].present = 1; i++; skip_past_comma (&str); po_reg_or_goto (REG_TYPE_NDQ, one_reg_only); @@ -5101,8 +5615,13 @@ parse_operands (char *str, const unsigned char *pattern) inst.operands[i-1].present = 0; break; try_imm: - /* Immediate gets verified properly later, so accept any now. */ - po_imm_or_fail (INT_MIN, INT_MAX, TRUE); + /* There's a possibility of getting a 64-bit immediate here, so + we need special handling. 
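+	     parse_big_immediate puts the low 32 bits of the constant
+	     in .imm and, for a value wider than 32 bits, the high 32
+	     bits in .reg with .regisimm set.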
*/ + if (parse_big_immediate (&str, i) == FAIL) + { + inst.error = _("immediate value is out of range"); + goto failure; + } } break; @@ -5115,6 +5634,10 @@ parse_operands (char *str, const unsigned char *pattern) } break; + case OP_RVSD_I0: + po_reg_or_goto (REG_TYPE_VFSD, try_imm0); + break; + case OP_RR_RNSC: { po_scalar_or_goto (8, try_rr); @@ -5124,6 +5647,15 @@ parse_operands (char *str, const unsigned char *pattern) } break; + case OP_RNSDQ_RNSC: + { + po_scalar_or_goto (8, try_nsdq); + break; + try_nsdq: + po_reg_or_fail (REG_TYPE_NSDQ); + } + break; + case OP_RNDQ_RNSC: { po_scalar_or_goto (8, try_ndq); @@ -5197,7 +5729,6 @@ parse_operands (char *str, const unsigned char *pattern) case OP_I64: po_imm_or_fail ( 1, 64, FALSE); break; case OP_I64z: po_imm_or_fail ( 0, 64, FALSE); break; case OP_I255: po_imm_or_fail ( 0, 255, FALSE); break; - case OP_Iffff: po_imm_or_fail ( 0, 0xffff, FALSE); break; case OP_I4b: po_imm_or_fail ( 1, 4, TRUE); break; case OP_oI7b: @@ -5263,6 +5794,11 @@ parse_operands (char *str, const unsigned char *pattern) } break; + /* Operand for MOVW or MOVT. */ + case OP_HALF: + po_misc_or_fail (parse_half (&str)); + break; + /* Register or expression */ case OP_RR_EXr: po_reg_or_goto (REG_TYPE_RN, EXPr); break; case OP_RR_EXi: po_reg_or_goto (REG_TYPE_RN, EXPi); break; @@ -5285,13 +5821,17 @@ parse_operands (char *str, const unsigned char *pattern) inst.operands[i].isreg = 1; break; + case OP_RIWR_I32z: po_reg_or_goto (REG_TYPE_MMXWR, I32z); break; + I32z: po_imm_or_fail (0, 32, FALSE); break; + /* Two kinds of register */ case OP_RIWR_RIWC: { struct reg_entry *rege = arm_reg_parse_multi (&str); - if (rege->type != REG_TYPE_MMXWR - && rege->type != REG_TYPE_MMXWC - && rege->type != REG_TYPE_MMXWCG) + if (!rege + || (rege->type != REG_TYPE_MMXWR + && rege->type != REG_TYPE_MMXWC + && rege->type != REG_TYPE_MMXWCG)) { inst.error = _("iWMMXt data or control register expected"); goto failure; @@ -5301,6 +5841,21 @@ parse_operands (char *str, const unsigned char *pattern) } break; + case OP_RIWC_RIWG: + { + struct reg_entry *rege = arm_reg_parse_multi (&str); + if (!rege + || (rege->type != REG_TYPE_MMXWC + && rege->type != REG_TYPE_MMXWCG)) + { + inst.error = _("iWMMXt control register expected"); + goto failure; + } + inst.operands[i].reg = rege->number; + inst.operands[i].isreg = 1; + } + break; + /* Misc */ case OP_CPSF: val = parse_cps_flags (&str); break; case OP_ENDI: val = parse_endian_specifier (&str); break; @@ -5309,11 +5864,46 @@ parse_operands (char *str, const unsigned char *pattern) case OP_COND: val = parse_cond (&str); break; case OP_oBARRIER:val = parse_barrier (&str); break; - case OP_TB: - po_misc_or_fail (parse_tb (&str)); - break; + case OP_RVC_PSR: + po_reg_or_goto (REG_TYPE_VFC, try_psr); + inst.operands[i].isvec = 1; /* Mark VFP control reg as vector. */ + break; + try_psr: + val = parse_psr (&str); + break; - /* Register lists */ + case OP_APSR_RR: + po_reg_or_goto (REG_TYPE_RN, try_apsr); + break; + try_apsr: + /* Parse "APSR_nvzc" operand (for FMSTAT-equivalent MRS + instruction). */ + if (strncasecmp (str, "APSR_", 5) == 0) + { + unsigned found = 0; + str += 5; + while (found < 15) + switch (*str++) + { + case 'c': found = (found & 1) ? 16 : found | 1; break; + case 'n': found = (found & 2) ? 16 : found | 2; break; + case 'z': found = (found & 4) ? 16 : found | 4; break; + case 'v': found = (found & 8) ? 
16 : found | 8; break; + default: found = 16; + } + if (found != 15) + goto failure; + inst.operands[i].isvec = 1; + } + else + goto failure; + break; + + case OP_TB: + po_misc_or_fail (parse_tb (&str)); + break; + + /* Register lists */ case OP_REGLST: val = parse_reg_list (&str); if (*str == '^') @@ -5331,6 +5921,19 @@ parse_operands (char *str, const unsigned char *pattern) val = parse_vfp_reg_list (&str, &inst.operands[i].reg, REGLIST_VFP_D); break; + case OP_VRSDLST: + /* Allow Q registers too. */ + val = parse_vfp_reg_list (&str, &inst.operands[i].reg, + REGLIST_NEON_D); + if (val == FAIL) + { + inst.error = NULL; + val = parse_vfp_reg_list (&str, &inst.operands[i].reg, + REGLIST_VFP_S); + inst.operands[i].issingle = 1; + } + break; + case OP_NRDLST: val = parse_vfp_reg_list (&str, &inst.operands[i].reg, REGLIST_NEON_D); @@ -5346,10 +5949,30 @@ parse_operands (char *str, const unsigned char *pattern) po_misc_or_fail (parse_address (&str, i)); break; + case OP_ADDRGLDR: + po_misc_or_fail_no_backtrack ( + parse_address_group_reloc (&str, i, GROUP_LDR)); + break; + + case OP_ADDRGLDRS: + po_misc_or_fail_no_backtrack ( + parse_address_group_reloc (&str, i, GROUP_LDRS)); + break; + + case OP_ADDRGLDC: + po_misc_or_fail_no_backtrack ( + parse_address_group_reloc (&str, i, GROUP_LDC)); + break; + case OP_SH: po_misc_or_fail (parse_shifter_operand (&str, i)); break; + case OP_SHG: + po_misc_or_fail_no_backtrack ( + parse_shifter_operand_group_reloc (&str, i)); + break; + case OP_oSHll: po_misc_or_fail (parse_shift (&str, i, SHIFT_LSL_IMMEDIATE)); break; @@ -5384,11 +6007,13 @@ parse_operands (char *str, const unsigned char *pattern) case OP_ENDI: case OP_oROR: case OP_PSR: + case OP_RVC_PSR: case OP_COND: case OP_oBARRIER: case OP_REGLST: case OP_VRSLST: case OP_VRDLST: + case OP_VRSDLST: case OP_NRDLST: case OP_NSTRLST: if (val == FAIL) @@ -5701,7 +6326,8 @@ encode_arm_addr_mode_3 (int i, bfd_boolean is_t) into a coprocessor load/store instruction. If wb_ok is false, reject use of writeback; if unind_ok is false, reject use of unindexed addressing. If reloc_override is not 0, use it instead - of BFD_ARM_CP_OFF_IMM. */ + of BFD_ARM_CP_OFF_IMM, unless the initial relocation is a group one + (in which case it is preserved). */ static int encode_arm_cp_address (int i, int wb_ok, int unind_ok, int reloc_override) @@ -5743,10 +6369,16 @@ encode_arm_cp_address (int i, int wb_ok, int unind_ok, int reloc_override) if (reloc_override) inst.reloc.type = reloc_override; - else if (thumb_mode) - inst.reloc.type = BFD_RELOC_ARM_T32_CP_OFF_IMM; - else - inst.reloc.type = BFD_RELOC_ARM_CP_OFF_IMM; + else if ((inst.reloc.type < BFD_RELOC_ARM_ALU_PC_G0_NC + || inst.reloc.type > BFD_RELOC_ARM_LDC_SB_G2) + && inst.reloc.type != BFD_RELOC_ARM_LDR_PC_G0) + { + if (thumb_mode) + inst.reloc.type = BFD_RELOC_ARM_T32_CP_OFF_IMM; + else + inst.reloc.type = BFD_RELOC_ARM_CP_OFF_IMM; + } + return SUCCESS; } @@ -6438,15 +7070,62 @@ do_mov (void) static void do_mov16 (void) { + bfd_vma imm; + bfd_boolean top; + + top = (inst.instruction & 0x00400000) != 0; + constraint (top && inst.reloc.type == BFD_RELOC_ARM_MOVW, + _(":lower16: not allowed this instruction")); + constraint (!top && inst.reloc.type == BFD_RELOC_ARM_MOVT, + _(":upper16: not allowed instruction")); inst.instruction |= inst.operands[0].reg << 12; - /* The value is in two pieces: 0:11, 16:19. 
*/ - inst.instruction |= (inst.operands[1].imm & 0x00000fff); - inst.instruction |= (inst.operands[1].imm & 0x0000f000) << 4; + if (inst.reloc.type == BFD_RELOC_UNUSED) + { + imm = inst.reloc.exp.X_add_number; + /* The value is in two pieces: 0:11, 16:19. */ + inst.instruction |= (imm & 0x00000fff); + inst.instruction |= (imm & 0x0000f000) << 4; + } +} + +static void do_vfp_nsyn_opcode (const char *); + +static int +do_vfp_nsyn_mrs (void) +{ + if (inst.operands[0].isvec) + { + if (inst.operands[1].reg != 1) + first_error (_("operand 1 must be FPSCR")); + memset (&inst.operands[0], '\0', sizeof (inst.operands[0])); + memset (&inst.operands[1], '\0', sizeof (inst.operands[1])); + do_vfp_nsyn_opcode ("fmstat"); + } + else if (inst.operands[1].isvec) + do_vfp_nsyn_opcode ("fmrx"); + else + return FAIL; + + return SUCCESS; +} + +static int +do_vfp_nsyn_msr (void) +{ + if (inst.operands[0].isvec) + do_vfp_nsyn_opcode ("fmxr"); + else + return FAIL; + + return SUCCESS; } static void do_mrs (void) { + if (do_vfp_nsyn_mrs () == SUCCESS) + return; + /* mrs only accepts CPSR/SPSR/CPSR_all/SPSR_all. */ constraint ((inst.operands[1].imm & (PSR_c|PSR_x|PSR_s|PSR_f)) != (PSR_c|PSR_f), @@ -6462,6 +7141,9 @@ do_mrs (void) static void do_msr (void) { + if (do_vfp_nsyn_msr () == SUCCESS) + return; + inst.instruction |= inst.operands[0].imm; if (inst.operands[1].isreg) inst.instruction |= inst.operands[1].reg; @@ -7043,16 +7725,16 @@ static void do_vfp_sp_const (void) { encode_arm_vfp_reg (inst.operands[0].reg, VFP_REG_Sd); - inst.instruction |= (inst.operands[1].imm & 15) << 16; - inst.instruction |= (inst.operands[1].imm >> 4); + inst.instruction |= (inst.operands[1].imm & 0xf0) << 12; + inst.instruction |= (inst.operands[1].imm & 0x0f); } static void do_vfp_dp_const (void) { encode_arm_vfp_reg (inst.operands[0].reg, VFP_REG_Dd); - inst.instruction |= (inst.operands[1].imm & 15) << 16; - inst.instruction |= (inst.operands[1].imm >> 4); + inst.instruction |= (inst.operands[1].imm & 0xf0) << 12; + inst.instruction |= (inst.operands[1].imm & 0x0f); } static void @@ -7139,6 +7821,7 @@ do_fpa_ldmstm (void) encode_arm_cp_address (2, TRUE, TRUE, 0); } + /* iWMMXt instructions: strictly in alphabetical order. 
*/ @@ -7188,6 +7871,15 @@ do_iwmmxt_waligni (void) inst.instruction |= inst.operands[3].imm << 20; } +static void +do_iwmmxt_wmerge (void) +{ + inst.instruction |= inst.operands[0].reg << 12; + inst.instruction |= inst.operands[1].reg << 16; + inst.instruction |= inst.operands[2].reg; + inst.instruction |= inst.operands[3].imm << 21; +} + static void do_iwmmxt_wmov (void) { @@ -7227,7 +7919,23 @@ static void do_iwmmxt_wldstd (void) { inst.instruction |= inst.operands[0].reg << 12; - encode_arm_cp_address (1, TRUE, FALSE, 0); + if (ARM_CPU_HAS_FEATURE (cpu_variant, arm_cext_iwmmxt2) + && inst.operands[1].immisreg) + { + inst.instruction &= ~0x1a000ff; + inst.instruction |= (0xf << 28); + if (inst.operands[1].preind) + inst.instruction |= PRE_INDEX; + if (!inst.operands[1].negative) + inst.instruction |= INDEX_UP; + if (inst.operands[1].writeback) + inst.instruction |= WRITE_BACK; + inst.instruction |= inst.operands[1].reg << 16; + inst.instruction |= inst.reloc.exp.X_add_number << 4; + inst.instruction |= inst.operands[1].imm; + } + else + encode_arm_cp_address (1, TRUE, FALSE, 0); } static void @@ -7247,6 +7955,56 @@ do_iwmmxt_wzero (void) inst.instruction |= inst.operands[0].reg << 12; inst.instruction |= inst.operands[0].reg << 16; } + +static void +do_iwmmxt_wrwrwr_or_imm5 (void) +{ + if (inst.operands[2].isreg) + do_rd_rn_rm (); + else { + constraint (!ARM_CPU_HAS_FEATURE (cpu_variant, arm_cext_iwmmxt2), + _("immediate operand requires iWMMXt2")); + do_rd_rn (); + if (inst.operands[2].imm == 0) + { + switch ((inst.instruction >> 20) & 0xf) + { + case 4: + case 5: + case 6: + case 7: + /* w...h wrd, wrn, #0 -> wrorh wrd, wrn, #16. */ + inst.operands[2].imm = 16; + inst.instruction = (inst.instruction & 0xff0fffff) | (0x7 << 20); + break; + case 8: + case 9: + case 10: + case 11: + /* w...w wrd, wrn, #0 -> wrorw wrd, wrn, #32. */ + inst.operands[2].imm = 32; + inst.instruction = (inst.instruction & 0xff0fffff) | (0xb << 20); + break; + case 12: + case 13: + case 14: + case 15: + { + /* w...d wrd, wrn, #0 -> wor wrd, wrn, wrn. */ + unsigned long wrn; + wrn = (inst.instruction >> 16) & 0xf; + inst.instruction &= 0xff0fff0f; + inst.instruction |= wrn; + /* Bail out here; the instruction is now assembled. */ + return; + } + } + } + /* Map 32 -> 0, etc. */ + inst.operands[2].imm &= 0x1f; + inst.instruction |= (0xf << 28) | ((inst.operands[2].imm & 0x10) << 4) | (inst.operands[2].imm & 0xf); + } +} /* Cirrus Maverick instructions. Simple 2-, 3-, and 4-register operations first, then control, shift, and load/store. */ @@ -7599,13 +8357,13 @@ do_t_add_sub (void) narrow = (current_it_mask != 0); if (!inst.operands[2].isreg) { + int add; + + add = (inst.instruction == T_MNEM_add + || inst.instruction == T_MNEM_adds); opcode = 0; if (inst.size_req != 4) { - int add; - - add = (inst.instruction == T_MNEM_add - || inst.instruction == T_MNEM_adds); /* Attempt to use a narrow opcode, with relaxation if appropriate. */ if (Rd == REG_SP && Rs == REG_SP && !flags) @@ -7635,12 +8393,24 @@ do_t_add_sub (void) if (inst.size_req == 4 || (inst.size_req != 2 && !opcode)) { - /* ??? Convert large immediates to addw/subw. */ - inst.instruction = THUMB_OP32 (inst.instruction); - inst.instruction = (inst.instruction & 0xe1ffffff) | 0x10000000; + if (Rs == REG_PC) + { + /* Always use addw/subw. */ + inst.instruction = add ? 
0xf20f0000 : 0xf2af0000; + inst.reloc.type = BFD_RELOC_ARM_T32_IMM12; + } + else + { + inst.instruction = THUMB_OP32 (inst.instruction); + inst.instruction = (inst.instruction & 0xe1ffffff) + | 0x10000000; + if (flags) + inst.reloc.type = BFD_RELOC_ARM_T32_IMMEDIATE; + else + inst.reloc.type = BFD_RELOC_ARM_T32_ADD_IMM; + } inst.instruction |= inst.operands[0].reg << 8; inst.instruction |= inst.operands[1].reg << 16; - inst.reloc.type = BFD_RELOC_ARM_T32_IMMEDIATE; } } else @@ -8206,7 +8976,7 @@ do_t_cpy (void) } static void -do_t_czb (void) +do_t_cbz (void) { constraint (current_it_mask, BAD_NOT_IT); constraint (inst.operands[0].reg > 7, BAD_HIREG); @@ -8709,11 +9479,30 @@ do_t_mov_cmp (void) static void do_t_mov16 (void) { + bfd_vma imm; + bfd_boolean top; + + top = (inst.instruction & 0x00800000) != 0; + if (inst.reloc.type == BFD_RELOC_ARM_MOVW) + { + constraint (top, _(":lower16: not allowed this instruction")); + inst.reloc.type = BFD_RELOC_ARM_THUMB_MOVW; + } + else if (inst.reloc.type == BFD_RELOC_ARM_MOVT) + { + constraint (!top, _(":upper16: not allowed this instruction")); + inst.reloc.type = BFD_RELOC_ARM_THUMB_MOVT; + } + inst.instruction |= inst.operands[0].reg << 8; - inst.instruction |= (inst.operands[1].imm & 0xf000) << 4; - inst.instruction |= (inst.operands[1].imm & 0x0800) << 15; - inst.instruction |= (inst.operands[1].imm & 0x0700) << 4; - inst.instruction |= (inst.operands[1].imm & 0x00ff); + if (inst.reloc.type == BFD_RELOC_UNUSED) + { + imm = inst.reloc.exp.X_add_number; + inst.instruction |= (imm & 0xf000) << 4; + inst.instruction |= (imm & 0x0800) << 15; + inst.instruction |= (imm & 0x0700) << 4; + inst.instruction |= (imm & 0x00ff); + } } static void @@ -8787,6 +9576,10 @@ static void do_t_mrs (void) { int flags; + + if (do_vfp_nsyn_mrs () == SUCCESS) + return; + flags = inst.operands[1].imm & (PSR_c|PSR_x|PSR_s|PSR_f|SPSR_BIT); if (flags == 0) { @@ -8814,6 +9607,9 @@ do_t_msr (void) { int flags; + if (do_vfp_nsyn_msr () == SUCCESS) + return; + constraint (!inst.operands[1].isreg, _("Thumb encoding does not support an immediate here")); flags = inst.operands[0].imm; @@ -9461,7 +10257,14 @@ struct neon_tab_entry X(vmovn, 0x1b20200, N_INV, N_INV), \ X(vtrn, 0x1b20080, N_INV, N_INV), \ X(vqmovn, 0x1b20200, N_INV, N_INV), \ - X(vqmovun, 0x1b20240, N_INV, N_INV) + X(vqmovun, 0x1b20240, N_INV, N_INV), \ + X(vnmul, 0xe200a40, 0xe200b40, N_INV), \ + X(vnmla, 0xe000a40, 0xe000b40, N_INV), \ + X(vnmls, 0xe100a40, 0xe100b40, N_INV), \ + X(vcmp, 0xeb40a40, 0xeb40b40, N_INV), \ + X(vcmpz, 0xeb50a40, 0xeb50b40, N_INV), \ + X(vcmpe, 0xeb40ac0, 0xeb40bc0, N_INV), \ + X(vcmpez, 0xeb50ac0, 0xeb50bc0, N_INV) enum neon_opc { @@ -9486,57 +10289,150 @@ NEON_ENC_TAB #define NEON_ENC_INTERLV(X) (neon_enc_tab[(X) & 0x0fffffff].integer) #define NEON_ENC_LANE(X) (neon_enc_tab[(X) & 0x0fffffff].float_or_poly) #define NEON_ENC_DUP(X) (neon_enc_tab[(X) & 0x0fffffff].scalar_or_imm) +#define NEON_ENC_SINGLE(X) \ + ((neon_enc_tab[(X) & 0x0fffffff].integer) | ((X) & 0xf0000000)) +#define NEON_ENC_DOUBLE(X) \ + ((neon_enc_tab[(X) & 0x0fffffff].float_or_poly) | ((X) & 0xf0000000)) -/* Shapes for instruction operands. Some (e.g. NS_DDD_QQQ) represent multiple - shapes which an instruction can accept. The following mnemonic characters - are used in the tag names for this enumeration: +/* Define shapes for instruction operands. 
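+   Each entry gives the number of operands an instruction form takes
+   and the kind of register or value expected in each position.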
The following mnemonic characters + are used in this table: + F - VFP S register D - Neon D register Q - Neon Q register I - Immediate S - Scalar R - ARM register L - D register list + + This table is used to generate various data: + - enumerations of the form NS_DDR to be used as arguments to + neon_select_shape. + - a table classifying shapes into single, double, quad, mixed. + - a table used to drive neon_select_shape. */ +#define NEON_SHAPE_DEF \ + X(3, (D, D, D), DOUBLE), \ + X(3, (Q, Q, Q), QUAD), \ + X(3, (D, D, I), DOUBLE), \ + X(3, (Q, Q, I), QUAD), \ + X(3, (D, D, S), DOUBLE), \ + X(3, (Q, Q, S), QUAD), \ + X(2, (D, D), DOUBLE), \ + X(2, (Q, Q), QUAD), \ + X(2, (D, S), DOUBLE), \ + X(2, (Q, S), QUAD), \ + X(2, (D, R), DOUBLE), \ + X(2, (Q, R), QUAD), \ + X(2, (D, I), DOUBLE), \ + X(2, (Q, I), QUAD), \ + X(3, (D, L, D), DOUBLE), \ + X(2, (D, Q), MIXED), \ + X(2, (Q, D), MIXED), \ + X(3, (D, Q, I), MIXED), \ + X(3, (Q, D, I), MIXED), \ + X(3, (Q, D, D), MIXED), \ + X(3, (D, Q, Q), MIXED), \ + X(3, (Q, Q, D), MIXED), \ + X(3, (Q, D, S), MIXED), \ + X(3, (D, Q, S), MIXED), \ + X(4, (D, D, D, I), DOUBLE), \ + X(4, (Q, Q, Q, I), QUAD), \ + X(2, (F, F), SINGLE), \ + X(3, (F, F, F), SINGLE), \ + X(2, (F, I), SINGLE), \ + X(2, (F, D), MIXED), \ + X(2, (D, F), MIXED), \ + X(3, (F, F, I), MIXED), \ + X(4, (R, R, F, F), SINGLE), \ + X(4, (F, F, R, R), SINGLE), \ + X(3, (D, R, R), DOUBLE), \ + X(3, (R, R, D), DOUBLE), \ + X(2, (S, R), SINGLE), \ + X(2, (R, S), SINGLE), \ + X(2, (F, R), SINGLE), \ + X(2, (R, F), SINGLE) + +#define S2(A,B) NS_##A##B +#define S3(A,B,C) NS_##A##B##C +#define S4(A,B,C,D) NS_##A##B##C##D + +#define X(N, L, C) S##N L + enum neon_shape { - NS_DDD_QQQ, - NS_DDD, - NS_QQQ, - NS_DDI_QQI, - NS_DDI, - NS_QQI, - NS_DDS_QQS, - NS_DDS, - NS_QQS, - NS_DD_QQ, - NS_DD, - NS_QQ, - NS_DS_QS, - NS_DS, - NS_QS, - NS_DR_QR, - NS_DR, - NS_QR, - NS_DI_QI, - NS_DI, - NS_QI, - NS_DLD, - NS_DQ, - NS_QD, - NS_DQI, - NS_QDI, - NS_QDD, - NS_QDS, - NS_QQD, - NS_DQQ, - NS_DDDI_QQQI, - NS_DDDI, - NS_QQQI, - NS_IGNORE + NEON_SHAPE_DEF, + NS_NULL +}; + +#undef X +#undef S2 +#undef S3 +#undef S4 + +enum neon_shape_class +{ + SC_SINGLE, + SC_DOUBLE, + SC_QUAD, + SC_MIXED +}; + +#define X(N, L, C) SC_##C + +static enum neon_shape_class neon_shape_class[] = +{ + NEON_SHAPE_DEF +}; + +#undef X + +enum neon_shape_el +{ + SE_F, + SE_D, + SE_Q, + SE_I, + SE_S, + SE_R, + SE_L +}; + +/* Register widths of above. */ +static unsigned neon_shape_el_size[] = +{ + 32, + 64, + 128, + 0, + 32, + 32, + 0 +}; + +struct neon_shape_info +{ + unsigned els; + enum neon_shape_el el[NEON_MAX_TYPE_ELS]; }; +#define S2(A,B) { SE_##A, SE_##B } +#define S3(A,B,C) { SE_##A, SE_##B, SE_##C } +#define S4(A,B,C,D) { SE_##A, SE_##B, SE_##C, SE_##D } + +#define X(N, L, C) { N, S##N L } + +static struct neon_shape_info neon_shape_tab[] = +{ + NEON_SHAPE_DEF +}; + +#undef X +#undef S2 +#undef S3 +#undef S4 + /* Bit masks used in type checking given instructions. 'N_EQK' means the type must be the same as (or based on in some way) the key type, which itself is marked with the 'N_KEY' bit. If the 'N_EQK' bit is @@ -9564,8 +10460,10 @@ enum neon_type_mask N_P8 = 0x010000, N_P16 = 0x020000, N_F32 = 0x040000, - N_KEY = 0x080000, /* key element (main type specifier). */ - N_EQK = 0x100000, /* given operand has the same type & size as the key. */ + N_F64 = 0x080000, + N_KEY = 0x100000, /* key element (main type specifier). */ + N_EQK = 0x200000, /* given operand has the same type & size as the key. 
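+			   (as adjusted by any of the N_DBL/N_HLF/N_SGN/
+			   N_UNS/N_INT/N_FLT/N_SIZ modifier bits below)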
*/ + N_VFP = 0x400000, /* VFP mode: operand size must match register width. */ N_DBL = 0x000001, /* if N_EQK, this operand is twice the size. */ N_HLF = 0x000002, /* if N_EQK, this operand is half the size. */ N_SGN = 0x000004, /* if N_EQK, this operand is forced to be signed. */ @@ -9574,7 +10472,7 @@ enum neon_type_mask N_FLT = 0x000020, /* if N_EQK, this operand is forced to be float. */ N_SIZ = 0x000040, /* if N_EQK, this operand is forced to be size-only. */ N_UTYP = 0, - N_MAX_NONSPECIAL = N_F32 + N_MAX_NONSPECIAL = N_F64 }; #define N_ALLMODS (N_DBL | N_HLF | N_SGN | N_UNS | N_INT | N_FLT | N_SIZ) @@ -9590,18 +10488,17 @@ enum neon_type_mask altogether. */ #define N_IGNORE_TYPE (N_KEY | N_EQK) -/* Check the shape of a Neon instruction (sizes of registers). Returns the more - specific shape when there are two alternatives. For non-polymorphic shapes, - checking is done during operand parsing, so is not implemented here. */ +/* Select a "shape" for the current instruction (describing register types or + sizes) from a list of alternatives. Return NS_NULL if the current instruction + doesn't fit. For non-polymorphic shapes, checking is usually done as a + function of operand parsing, so this function doesn't need to be called. + Shapes should be listed in order of decreasing length. */ static enum neon_shape -neon_check_shape (enum neon_shape req) +neon_select_shape (enum neon_shape shape, ...) { -#define RR(X) (inst.operands[(X)].isreg) -#define RD(X) (inst.operands[(X)].isreg && !inst.operands[(X)].isquad) -#define RQ(X) (inst.operands[(X)].isreg && inst.operands[(X)].isquad) -#define IM(X) (!inst.operands[(X)].isreg && !inst.operands[(X)].isscalar) -#define SC(X) (!inst.operands[(X)].isreg && inst.operands[(X)].isscalar) + va_list ap; + enum neon_shape first_shape = shape; /* Fix missing optional operands. FIXME: we don't know at this point how many arguments we should have, so this makes the assumption that we have @@ -9610,112 +10507,90 @@ neon_check_shape (enum neon_shape req) if (!inst.operands[1].present) inst.operands[1] = inst.operands[0]; - switch (req) - { - case NS_DDD_QQQ: - { - if (RD(0) && RD(1) && RD(2)) - return NS_DDD; - else if (RQ(0) && RQ(1) && RQ(2)) - return NS_QQQ; - else - first_error (_("expected , , or
, , " - "operands")); - } - break; - - case NS_DDI_QQI: - { - if (RD(0) && RD(1) && IM(2)) - return NS_DDI; - else if (RQ(0) && RQ(1) && IM(2)) - return NS_QQI; - else - first_error (_("expected , , # or
, , # " - "operands")); - } - break; + va_start (ap, shape); - case NS_DDDI_QQQI: - { - if (RD(0) && RD(1) && RD(2) && IM(3)) - return NS_DDDI; - if (RQ(0) && RQ(1) && RQ(2) && IM(3)) - return NS_QQQI; - else - first_error (_("expected , , , # or " - "
, , , # operands")); - } + for (; shape != NS_NULL; shape = va_arg (ap, int)) + { + unsigned j; + int matches = 1; + + for (j = 0; j < neon_shape_tab[shape].els; j++) + { + if (!inst.operands[j].present) + { + matches = 0; + break; + } + + switch (neon_shape_tab[shape].el[j]) + { + case SE_F: + if (!(inst.operands[j].isreg + && inst.operands[j].isvec + && inst.operands[j].issingle + && !inst.operands[j].isquad)) + matches = 0; + break; + + case SE_D: + if (!(inst.operands[j].isreg + && inst.operands[j].isvec + && !inst.operands[j].isquad + && !inst.operands[j].issingle)) + matches = 0; + break; + + case SE_R: + if (!(inst.operands[j].isreg + && !inst.operands[j].isvec)) + matches = 0; + break; + + case SE_Q: + if (!(inst.operands[j].isreg + && inst.operands[j].isvec + && inst.operands[j].isquad + && !inst.operands[j].issingle)) + matches = 0; + break; + + case SE_I: + if (!(!inst.operands[j].isreg + && !inst.operands[j].isscalar)) + matches = 0; + break; + + case SE_S: + if (!(!inst.operands[j].isreg + && inst.operands[j].isscalar)) + matches = 0; + break; + + case SE_L: + break; + } + } + if (matches) break; + } - case NS_DDS_QQS: - { - if (RD(0) && RD(1) && SC(2)) - return NS_DDS; - else if (RQ(0) && RQ(1) && SC(2)) - return NS_QQS; - else - first_error (_("expected , , or
, , " - "operands")); - } - break; - - case NS_DD_QQ: - { - if (RD(0) && RD(1)) - return NS_DD; - else if (RQ(0) && RQ(1)) - return NS_QQ; - else - first_error (_("expected , or
, operands")); - } - break; - - case NS_DS_QS: - { - if (RD(0) && SC(1)) - return NS_DS; - else if (RQ(0) && SC(1)) - return NS_QS; - else - first_error (_("expected , or
, operands")); - } - break; + va_end (ap); - case NS_DR_QR: - { - if (RD(0) && RR(1)) - return NS_DR; - else if (RQ(0) && RR(1)) - return NS_QR; - else - first_error (_("expected , or
, operands")); - } - break; + if (shape == NS_NULL && first_shape != NS_NULL) + first_error (_("invalid instruction shape")); - case NS_DI_QI: - { - if (RD(0) && IM(1)) - return NS_DI; - else if (RQ(0) && IM(1)) - return NS_QI; - else - first_error (_("expected , # or
, # operands")); - } - break; - - default: - abort (); - } + return shape; +} - return req; -#undef RR -#undef RD -#undef RQ -#undef IM -#undef SC +/* True if SHAPE is predominantly a quadword operation (most of the time, this + means the Q bit should be set). */ + +static int +neon_quad (enum neon_shape shape) +{ + return neon_shape_class[shape] == SC_QUAD; } - + static void neon_modify_type_size (unsigned typebits, enum neon_el_type *g_type, unsigned *g_size) @@ -9787,8 +10662,12 @@ type_chk_of_el_type (enum neon_el_type type, unsigned size) break; case NT_float: - if (size == 32) - return N_F32; + switch (size) + { + case 32: return N_F32; + case 64: return N_F64; + default: ; + } break; case NT_poly: @@ -9844,7 +10723,7 @@ el_type_of_type_chk (enum neon_el_type *type, unsigned *size, *size = 16; else if ((mask & (N_S32 | N_U32 | N_I32 | N_32 | N_F32)) != 0) *size = 32; - else if ((mask & (N_S64 | N_U64 | N_I64 | N_64)) != 0) + else if ((mask & (N_S64 | N_U64 | N_I64 | N_64 | N_F64)) != 0) *size = 64; else return FAIL; @@ -9859,7 +10738,7 @@ el_type_of_type_chk (enum neon_el_type *type, unsigned *size, *type = NT_untyped; else if ((mask & (N_P8 | N_P16)) != 0) *type = NT_poly; - else if ((mask & N_F32) != 0) + else if ((mask & (N_F32 | N_F64)) != 0) *type = NT_float; else return FAIL; @@ -9902,8 +10781,7 @@ modify_types_allowed (unsigned allowed, unsigned mods) which is set on a per-instruction basis, which is the one which matters when only one data type is written. Note: this function has side-effects (e.g. filling in missing operands). All - Neon instructions should call it before performing bit encoding. -*/ + Neon instructions should call it before performing bit encoding. */ static struct neon_type_el neon_check_type (unsigned els, enum neon_shape ns, ...) @@ -10024,6 +10902,26 @@ neon_check_type (unsigned els, enum neon_shape ns, ...) } else { + if ((thisarg & N_VFP) != 0) + { + enum neon_shape_el regshape = neon_shape_tab[ns].el[i]; + unsigned regwidth = neon_shape_el_size[regshape], match; + + /* In VFP mode, operands must match register widths. If we + have a key operand, use its width, else use the width of + the current operand. */ + if (k_size != -1u) + match = k_size; + else + match = g_size; + + if (regwidth != match) + { + first_error (_("operand size must match register width")); + return badtype; + } + } + if ((thisarg & N_EQK) == 0) { unsigned given_type = type_chk_of_el_type (g_type, g_size); @@ -10052,54 +10950,361 @@ neon_check_type (unsigned els, enum neon_shape ns, ...) return inst.vectype.el[key_el]; } -/* Fix up Neon data-processing instructions, ORing in the correct bits for - ARM mode or Thumb mode and moving the encoded bit 24 to bit 28. */ +/* Neon-style VFP instruction forwarding. */ -static unsigned -neon_dp_fixup (unsigned i) +/* Thumb VFP instructions have 0xE in the condition field. */ + +static void +do_vfp_cond_or_thumb (void) { if (thumb_mode) - { - /* The U bit is at bit 24 by default. Move to bit 28 in Thumb mode. */ - if (i & (1 << 24)) - i |= 1 << 28; - - i &= ~(1 << 24); - - i |= 0xef000000; - } + inst.instruction |= 0xe0000000; else - i |= 0xf2000000; - - return i; + inst.instruction |= inst.cond << 28; } -/* Turn a size (8, 16, 32, 64) into the respective bit number minus 3 - (0, 1, 2, 3). */ +/* Look up and encode a simple mnemonic, for use as a helper function for the + Neon-style VFP syntax. This avoids duplication of bits of the insns table, + etc. 
It is assumed that operand parsing has already been done, and that the + operands are in the form expected by the given opcode (this isn't necessarily + the same as the form in which they were parsed, hence some massaging must + take place before this function is called). + Checks current arch version against that in the looked-up opcode. */ -static unsigned -neon_logbits (unsigned x) +static void +do_vfp_nsyn_opcode (const char *opname) { - return ffs (x) - 4; -} - -#define LOW4(R) ((R) & 0xf) -#define HI1(R) (((R) >> 4) & 1) + const struct asm_opcode *opcode; + + opcode = hash_find (arm_ops_hsh, opname); -/* Encode insns with bit pattern: + if (!opcode) + abort (); - |28/24|23|22 |21 20|19 16|15 12|11 8|7|6|5|4|3 0| - | U |x |D |size | Rn | Rd |x x x x|N|Q|M|x| Rm | - - SIZE is passed in bits. -1 means size field isn't changed, in case it has a - different meaning for some instruction. */ + constraint (!ARM_CPU_HAS_FEATURE (cpu_variant, + thumb_mode ? *opcode->tvariant : *opcode->avariant), + _(BAD_FPU)); -static void -neon_three_same (int isquad, int ubit, int size) -{ - inst.instruction |= LOW4 (inst.operands[0].reg) << 12; - inst.instruction |= HI1 (inst.operands[0].reg) << 22; - inst.instruction |= LOW4 (inst.operands[1].reg) << 16; + if (thumb_mode) + { + inst.instruction = opcode->tvalue; + opcode->tencode (); + } + else + { + inst.instruction = (inst.cond << 28) | opcode->avalue; + opcode->aencode (); + } +} + +static void +do_vfp_nsyn_add_sub (enum neon_shape rs) +{ + int is_add = (inst.instruction & 0x0fffffff) == N_MNEM_vadd; + + if (rs == NS_FFF) + { + if (is_add) + do_vfp_nsyn_opcode ("fadds"); + else + do_vfp_nsyn_opcode ("fsubs"); + } + else + { + if (is_add) + do_vfp_nsyn_opcode ("faddd"); + else + do_vfp_nsyn_opcode ("fsubd"); + } +} + +/* Check operand types to see if this is a VFP instruction, and if so call + PFN (). */ + +static int +try_vfp_nsyn (int args, void (*pfn) (enum neon_shape)) +{ + enum neon_shape rs; + struct neon_type_el et; + + switch (args) + { + case 2: + rs = neon_select_shape (NS_FF, NS_DD, NS_NULL); + et = neon_check_type (2, rs, + N_EQK | N_VFP, N_F32 | N_F64 | N_KEY | N_VFP); + break; + + case 3: + rs = neon_select_shape (NS_FFF, NS_DDD, NS_NULL); + et = neon_check_type (3, rs, + N_EQK | N_VFP, N_EQK | N_VFP, N_F32 | N_F64 | N_KEY | N_VFP); + break; + + default: + abort (); + } + + if (et.type != NT_invtype) + { + pfn (rs); + return SUCCESS; + } + else + inst.error = NULL; + + return FAIL; +} + +static void +do_vfp_nsyn_mla_mls (enum neon_shape rs) +{ + int is_mla = (inst.instruction & 0x0fffffff) == N_MNEM_vmla; + + if (rs == NS_FFF) + { + if (is_mla) + do_vfp_nsyn_opcode ("fmacs"); + else + do_vfp_nsyn_opcode ("fmscs"); + } + else + { + if (is_mla) + do_vfp_nsyn_opcode ("fmacd"); + else + do_vfp_nsyn_opcode ("fmscd"); + } +} + +static void +do_vfp_nsyn_mul (enum neon_shape rs) +{ + if (rs == NS_FFF) + do_vfp_nsyn_opcode ("fmuls"); + else + do_vfp_nsyn_opcode ("fmuld"); +} + +static void +do_vfp_nsyn_abs_neg (enum neon_shape rs) +{ + int is_neg = (inst.instruction & 0x80) != 0; + neon_check_type (2, rs, N_EQK | N_VFP, N_F32 | N_F64 | N_VFP | N_KEY); + + if (rs == NS_FF) + { + if (is_neg) + do_vfp_nsyn_opcode ("fnegs"); + else + do_vfp_nsyn_opcode ("fabss"); + } + else + { + if (is_neg) + do_vfp_nsyn_opcode ("fnegd"); + else + do_vfp_nsyn_opcode ("fabsd"); + } +} + +/* Encode single-precision (only!) VFP fldm/fstm instructions. Double precision + insns belong to Neon, and are handled elsewhere. 
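The practical effect of this forwarding scheme (do_vfp_nsyn_opcode and the do_vfp_nsyn_* wrappers above): a Neon-syntax float instruction whose operands match a VFP shape is re-encoded through the existing VFP opcode entry instead of a duplicated Neon one. On my reading of do_vfp_nsyn_add_sub, the two spellings below should therefore assemble to the identical encoding:

  /* vadd.f32  s0, s1, s2    @ Neon-unified spelling, forwarded to "fadds"
     fadds     s0, s1, s2    @ traditional VFP mnemonic  */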
*/ + +static void +do_vfp_nsyn_ldm_stm (int is_dbmode) +{ + int is_ldm = (inst.instruction & (1 << 20)) != 0; + if (is_ldm) + { + if (is_dbmode) + do_vfp_nsyn_opcode ("fldmdbs"); + else + do_vfp_nsyn_opcode ("fldmias"); + } + else + { + if (is_dbmode) + do_vfp_nsyn_opcode ("fstmdbs"); + else + do_vfp_nsyn_opcode ("fstmias"); + } +} + +static void +do_vfp_nsyn_sqrt (void) +{ + enum neon_shape rs = neon_select_shape (NS_FF, NS_DD, NS_NULL); + neon_check_type (2, rs, N_EQK | N_VFP, N_F32 | N_F64 | N_KEY | N_VFP); + + if (rs == NS_FF) + do_vfp_nsyn_opcode ("fsqrts"); + else + do_vfp_nsyn_opcode ("fsqrtd"); +} + +static void +do_vfp_nsyn_div (void) +{ + enum neon_shape rs = neon_select_shape (NS_FFF, NS_DDD, NS_NULL); + neon_check_type (3, rs, N_EQK | N_VFP, N_EQK | N_VFP, + N_F32 | N_F64 | N_KEY | N_VFP); + + if (rs == NS_FFF) + do_vfp_nsyn_opcode ("fdivs"); + else + do_vfp_nsyn_opcode ("fdivd"); +} + +static void +do_vfp_nsyn_nmul (void) +{ + enum neon_shape rs = neon_select_shape (NS_FFF, NS_DDD, NS_NULL); + neon_check_type (3, rs, N_EQK | N_VFP, N_EQK | N_VFP, + N_F32 | N_F64 | N_KEY | N_VFP); + + if (rs == NS_FFF) + { + inst.instruction = NEON_ENC_SINGLE (inst.instruction); + do_vfp_sp_dyadic (); + } + else + { + inst.instruction = NEON_ENC_DOUBLE (inst.instruction); + do_vfp_dp_rd_rn_rm (); + } + do_vfp_cond_or_thumb (); +} + +static void +do_vfp_nsyn_cmp (void) +{ + if (inst.operands[1].isreg) + { + enum neon_shape rs = neon_select_shape (NS_FF, NS_DD, NS_NULL); + neon_check_type (2, rs, N_EQK | N_VFP, N_F32 | N_F64 | N_KEY | N_VFP); + + if (rs == NS_FF) + { + inst.instruction = NEON_ENC_SINGLE (inst.instruction); + do_vfp_sp_monadic (); + } + else + { + inst.instruction = NEON_ENC_DOUBLE (inst.instruction); + do_vfp_dp_rd_rm (); + } + } + else + { + enum neon_shape rs = neon_select_shape (NS_FI, NS_DI, NS_NULL); + neon_check_type (2, rs, N_F32 | N_F64 | N_KEY | N_VFP, N_EQK); + + switch (inst.instruction & 0x0fffffff) + { + case N_MNEM_vcmp: + inst.instruction += N_MNEM_vcmpz - N_MNEM_vcmp; + break; + case N_MNEM_vcmpe: + inst.instruction += N_MNEM_vcmpez - N_MNEM_vcmpe; + break; + default: + abort (); + } + + if (rs == NS_FI) + { + inst.instruction = NEON_ENC_SINGLE (inst.instruction); + do_vfp_sp_compare_z (); + } + else + { + inst.instruction = NEON_ENC_DOUBLE (inst.instruction); + do_vfp_dp_rd (); + } + } + do_vfp_cond_or_thumb (); +} + +static void +nsyn_insert_sp (void) +{ + inst.operands[1] = inst.operands[0]; + memset (&inst.operands[0], '\0', sizeof (inst.operands[0])); + inst.operands[0].reg = 13; + inst.operands[0].isreg = 1; + inst.operands[0].writeback = 1; + inst.operands[0].present = 1; +} + +static void +do_vfp_nsyn_push (void) +{ + nsyn_insert_sp (); + if (inst.operands[1].issingle) + do_vfp_nsyn_opcode ("fstmdbs"); + else + do_vfp_nsyn_opcode ("fstmdbd"); +} + +static void +do_vfp_nsyn_pop (void) +{ + nsyn_insert_sp (); + if (inst.operands[1].issingle) + do_vfp_nsyn_opcode ("fldmdbs"); + else + do_vfp_nsyn_opcode ("fldmdbd"); +} + +/* Fix up Neon data-processing instructions, ORing in the correct bits for + ARM mode or Thumb mode and moving the encoded bit 24 to bit 28. */ + +static unsigned +neon_dp_fixup (unsigned i) +{ + if (thumb_mode) + { + /* The U bit is at bit 24 by default. Move to bit 28 in Thumb mode. */ + if (i & (1 << 24)) + i |= 1 << 28; + + i &= ~(1 << 24); + + i |= 0xef000000; + } + else + i |= 0xf2000000; + + return i; +} + +/* Turn a size (8, 16, 32, 64) into the respective bit number minus 3 + (0, 1, 2, 3). 
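neon_logbits (re-added just below) leans on ffs (), which numbers bits from 1: the element sizes 8, 16, 32 and 64 each have a single set bit, at positions 4 through 7, so subtracting 4 yields the 2-bit size field. A self-contained check, not part of the patch:

  #include <assert.h>
  #include <strings.h>  /* ffs */

  static unsigned logbits (unsigned x) { return ffs (x) - 4; }

  int main (void)
  {
    assert (logbits (8) == 0 && logbits (16) == 1);
    assert (logbits (32) == 2 && logbits (64) == 3);
    return 0;
  }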
*/ + +static unsigned +neon_logbits (unsigned x) +{ + return ffs (x) - 4; +} + +#define LOW4(R) ((R) & 0xf) +#define HI1(R) (((R) >> 4) & 1) + +/* Encode insns with bit pattern: + + |28/24|23|22 |21 20|19 16|15 12|11 8|7|6|5|4|3 0| + | U |x |D |size | Rn | Rd |x x x x|N|Q|M|x| Rm | + + SIZE is passed in bits. -1 means size field isn't changed, in case it has a + different meaning for some instruction. */ + +static void +neon_three_same (int isquad, int ubit, int size) +{ + inst.instruction |= LOW4 (inst.operands[0].reg) << 12; + inst.instruction |= HI1 (inst.operands[0].reg) << 22; + inst.instruction |= LOW4 (inst.operands[1].reg) << 16; inst.instruction |= HI1 (inst.operands[1].reg) << 7; inst.instruction |= LOW4 (inst.operands[2].reg); inst.instruction |= HI1 (inst.operands[2].reg) << 5; @@ -10139,19 +11344,19 @@ neon_two_same (int qbit, int ubit, int size) static void do_neon_dyadic_i_su (void) { - enum neon_shape rs = neon_check_shape (NS_DDD_QQQ); + enum neon_shape rs = neon_select_shape (NS_DDD, NS_QQQ, NS_NULL); struct neon_type_el et = neon_check_type (3, rs, N_EQK, N_EQK, N_SU_32 | N_KEY); - neon_three_same (rs == NS_QQQ, et.type == NT_unsigned, et.size); + neon_three_same (neon_quad (rs), et.type == NT_unsigned, et.size); } static void do_neon_dyadic_i64_su (void) { - enum neon_shape rs = neon_check_shape (NS_DDD_QQQ); + enum neon_shape rs = neon_select_shape (NS_DDD, NS_QQQ, NS_NULL); struct neon_type_el et = neon_check_type (3, rs, N_EQK, N_EQK, N_SU_ALL | N_KEY); - neon_three_same (rs == NS_QQQ, et.type == NT_unsigned, et.size); + neon_three_same (neon_quad (rs), et.type == NT_unsigned, et.size); } static void @@ -10178,18 +11383,18 @@ do_neon_shl_imm (void) { if (!inst.operands[2].isreg) { - enum neon_shape rs = neon_check_shape (NS_DDI_QQI); + enum neon_shape rs = neon_select_shape (NS_DDI, NS_QQI, NS_NULL); struct neon_type_el et = neon_check_type (2, rs, N_EQK, N_KEY | N_I_ALL); inst.instruction = NEON_ENC_IMMED (inst.instruction); - neon_imm_shift (FALSE, 0, rs == NS_QQI, et, inst.operands[2].imm); + neon_imm_shift (FALSE, 0, neon_quad (rs), et, inst.operands[2].imm); } else { - enum neon_shape rs = neon_check_shape (NS_DDD_QQQ); + enum neon_shape rs = neon_select_shape (NS_DDD, NS_QQQ, NS_NULL); struct neon_type_el et = neon_check_type (3, rs, N_EQK, N_SU_ALL | N_KEY, N_EQK | N_SGN); inst.instruction = NEON_ENC_INTEGER (inst.instruction); - neon_three_same (rs == NS_QQQ, et.type == NT_unsigned, et.size); + neon_three_same (neon_quad (rs), et.type == NT_unsigned, et.size); } } @@ -10198,64 +11403,71 @@ do_neon_qshl_imm (void) { if (!inst.operands[2].isreg) { - enum neon_shape rs = neon_check_shape (NS_DDI_QQI); + enum neon_shape rs = neon_select_shape (NS_DDI, NS_QQI, NS_NULL); struct neon_type_el et = neon_check_type (2, rs, N_EQK, N_SU_ALL | N_KEY); inst.instruction = NEON_ENC_IMMED (inst.instruction); - neon_imm_shift (TRUE, et.type == NT_unsigned, rs == NS_QQI, et, + neon_imm_shift (TRUE, et.type == NT_unsigned, neon_quad (rs), et, inst.operands[2].imm); } else { - enum neon_shape rs = neon_check_shape (NS_DDD_QQQ); + enum neon_shape rs = neon_select_shape (NS_DDD, NS_QQQ, NS_NULL); struct neon_type_el et = neon_check_type (3, rs, N_EQK, N_SU_ALL | N_KEY, N_EQK | N_SGN); inst.instruction = NEON_ENC_INTEGER (inst.instruction); - neon_three_same (rs == NS_QQQ, et.type == NT_unsigned, et.size); + neon_three_same (neon_quad (rs), et.type == NT_unsigned, et.size); } } static int neon_cmode_for_logic_imm (unsigned immediate, unsigned *immbits, int size) { - /* Handle .I8 
and .I64 as pseudo-instructions. */ - switch (size) + /* Handle .I8 pseudo-instructions. */ + if (size == 8) { - case 8: /* Unfortunately, this will make everything apart from zero out-of-range. FIXME is this the intended semantics? There doesn't seem much point in accepting .I8 if so. */ immediate |= immediate << 8; size = 16; - break; - case 64: - /* Similarly, anything other than zero will be replicated in bits [63:32], - which probably isn't want we want if we specified .I64. */ - if (immediate != 0) - goto bad_immediate; - size = 32; - break; - default: ; + } + + if (size >= 32) + { + if (immediate == (immediate & 0x000000ff)) + { + *immbits = immediate; + return 0x1; + } + else if (immediate == (immediate & 0x0000ff00)) + { + *immbits = immediate >> 8; + return 0x3; + } + else if (immediate == (immediate & 0x00ff0000)) + { + *immbits = immediate >> 16; + return 0x5; + } + else if (immediate == (immediate & 0xff000000)) + { + *immbits = immediate >> 24; + return 0x7; + } + if ((immediate & 0xffff) != (immediate >> 16)) + goto bad_immediate; + immediate &= 0xffff; } if (immediate == (immediate & 0x000000ff)) { *immbits = immediate; - return (size == 16) ? 0x9 : 0x1; + return 0x9; } else if (immediate == (immediate & 0x0000ff00)) { *immbits = immediate >> 8; - return (size == 16) ? 0xb : 0x3; - } - else if (immediate == (immediate & 0x00ff0000)) - { - *immbits = immediate >> 16; - return 0x5; - } - else if (immediate == (immediate & 0xff000000)) - { - *immbits = immediate >> 24; - return 0x7; + return 0xb; } bad_immediate: @@ -10296,7 +11508,8 @@ neon_qfloat_bits (unsigned imm) the instruction. *OP is passed as the initial value of the op field, and may be set to a different value depending on the constant (i.e. "MOV I64, 0bAAAAAAAABBBB..." which uses OP = 1 despite being MOV not - MVN). */ + MVN). If the immediate looks like a repeated parttern then also + try smaller element sizes. */ static int neon_cmode_for_move_imm (unsigned immlo, unsigned immhi, unsigned *immbits, @@ -10309,63 +11522,87 @@ neon_cmode_for_move_imm (unsigned immlo, unsigned immhi, unsigned *immbits, *immbits = neon_qfloat_bits (immlo); return 0xf; } - else if (size == 64 && neon_bits_same_in_bytes (immhi) - && neon_bits_same_in_bytes (immlo)) - { - /* Check this one first so we don't have to bother with immhi in later - tests. */ - if (*op == 1) - return FAIL; - *immbits = (neon_squash_bits (immhi) << 4) | neon_squash_bits (immlo); - *op = 1; - return 0xe; - } - else if (immhi != 0) - return FAIL; - else if (immlo == (immlo & 0x000000ff)) - { - /* 64-bit case was already handled. Don't allow MVN with 8-bit - immediate. */ - if ((size != 8 && size != 16 && size != 32) - || (size == 8 && *op == 1)) - return FAIL; - *immbits = immlo; - return (size == 8) ? 0xe : (size == 16) ? 0x8 : 0x0; - } - else if (immlo == (immlo & 0x0000ff00)) - { - if (size != 16 && size != 32) - return FAIL; - *immbits = immlo >> 8; - return (size == 16) ? 
0xa : 0x2; - } - else if (immlo == (immlo & 0x00ff0000)) + + if (size == 64) { - if (size != 32) - return FAIL; - *immbits = immlo >> 16; - return 0x4; + if (neon_bits_same_in_bytes (immhi) + && neon_bits_same_in_bytes (immlo)) + { + if (*op == 1) + return FAIL; + *immbits = (neon_squash_bits (immhi) << 4) + | neon_squash_bits (immlo); + *op = 1; + return 0xe; + } + + if (immhi != immlo) + return FAIL; } - else if (immlo == (immlo & 0xff000000)) + + if (size >= 32) { - if (size != 32) - return FAIL; - *immbits = immlo >> 24; - return 0x6; + if (immlo == (immlo & 0x000000ff)) + { + *immbits = immlo; + return 0x0; + } + else if (immlo == (immlo & 0x0000ff00)) + { + *immbits = immlo >> 8; + return 0x2; + } + else if (immlo == (immlo & 0x00ff0000)) + { + *immbits = immlo >> 16; + return 0x4; + } + else if (immlo == (immlo & 0xff000000)) + { + *immbits = immlo >> 24; + return 0x6; + } + else if (immlo == ((immlo & 0x0000ff00) | 0x000000ff)) + { + *immbits = (immlo >> 8) & 0xff; + return 0xc; + } + else if (immlo == ((immlo & 0x00ff0000) | 0x0000ffff)) + { + *immbits = (immlo >> 16) & 0xff; + return 0xd; + } + + if ((immlo & 0xffff) != (immlo >> 16)) + return FAIL; + immlo &= 0xffff; } - else if (immlo == ((immlo & 0x0000ff00) | 0x000000ff)) + + if (size >= 16) { - if (size != 32) - return FAIL; - *immbits = (immlo >> 8) & 0xff; - return 0xc; + if (immlo == (immlo & 0x000000ff)) + { + *immbits = immlo; + return 0x8; + } + else if (immlo == (immlo & 0x0000ff00)) + { + *immbits = immlo >> 8; + return 0xa; + } + + if ((immlo & 0xff) != (immlo >> 8)) + return FAIL; + immlo &= 0xff; } - else if (immlo == ((immlo & 0x00ff0000) | 0x0000ffff)) + + if (immlo == (immlo & 0x000000ff)) { - if (size != 32) - return FAIL; - *immbits = (immlo >> 16) & 0xff; - return 0xd; + /* Don't allow MVN with 8-bit immediate. */ + if (*op == 1) + return FAIL; + *immbits = immlo; + return 0xe; } return FAIL; @@ -10428,17 +11665,17 @@ do_neon_logic (void) { if (inst.operands[2].present && inst.operands[2].isreg) { - enum neon_shape rs = neon_check_shape (NS_DDD_QQQ); + enum neon_shape rs = neon_select_shape (NS_DDD, NS_QQQ, NS_NULL); neon_check_type (3, rs, N_IGNORE_TYPE); /* U bit and size field were set as part of the bitmask. */ inst.instruction = NEON_ENC_INTEGER (inst.instruction); - neon_three_same (rs == NS_QQQ, 0, -1); + neon_three_same (neon_quad (rs), 0, -1); } else { - enum neon_shape rs = neon_check_shape (NS_DI_QI); - struct neon_type_el et = neon_check_type (1, rs, N_I8 | N_I16 | N_I32 - | N_I64 | N_F32); + enum neon_shape rs = neon_select_shape (NS_DI, NS_QI, NS_NULL); + struct neon_type_el et = neon_check_type (2, rs, + N_I8 | N_I16 | N_I32 | N_I64 | N_F32 | N_KEY, N_EQK); enum neon_opc opcode = inst.instruction & 0x0fffffff; unsigned immbits; int cmode; @@ -10448,28 +11685,37 @@ do_neon_logic (void) inst.instruction = NEON_ENC_IMMED (inst.instruction); + immbits = inst.operands[1].imm; + if (et.size == 64) + { + /* .i64 is a pseudo-op, so the immediate must be a repeating + pattern. */ + if (immbits != (inst.operands[1].regisimm ? + inst.operands[1].reg : 0)) + { + /* Set immbits to an invalid constant. 
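Both hunks here rest on the same observation: a wide logic immediate is only encodable when it is a repeating pattern, so a 32-bit value whose two halfwords match is retried at 16 bits, and a 16-bit value whose bytes match at 8 bits. A standalone restatement of the halfword test with worked values (my paraphrase of the checks above, not the patch itself):

  #include <assert.h>

  static int halves_repeat (unsigned v) { return (v & 0xffff) == (v >> 16); }

  int main (void)
  {
    /* 0x00ff00ff repeats per halfword, so it narrows to 0x00ff and is
       encodable as a 16-bit logic immediate (cmode 0x9, immbits 0xff).  */
    assert (halves_repeat (0x00ff00ff));
    /* 0x12345678 does not repeat, so it is rejected as a logic immediate.  */
    assert (!halves_repeat (0x12345678));
    return 0;
  }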
*/ + immbits = 0xdeadbeef; + } + } + switch (opcode) { case N_MNEM_vbic: - cmode = neon_cmode_for_logic_imm (inst.operands[1].imm, &immbits, - et.size); + cmode = neon_cmode_for_logic_imm (immbits, &immbits, et.size); break; case N_MNEM_vorr: - cmode = neon_cmode_for_logic_imm (inst.operands[1].imm, &immbits, - et.size); + cmode = neon_cmode_for_logic_imm (immbits, &immbits, et.size); break; case N_MNEM_vand: /* Pseudo-instruction for VBIC. */ - immbits = inst.operands[1].imm; neon_invert_size (&immbits, 0, et.size); cmode = neon_cmode_for_logic_imm (immbits, &immbits, et.size); break; case N_MNEM_vorn: /* Pseudo-instruction for VORR. */ - immbits = inst.operands[1].imm; neon_invert_size (&immbits, 0, et.size); cmode = neon_cmode_for_logic_imm (immbits, &immbits, et.size); break; @@ -10481,7 +11727,7 @@ do_neon_logic (void) if (cmode == FAIL) return; - inst.instruction |= (rs == NS_QI) << 6; + inst.instruction |= neon_quad (rs) << 6; inst.instruction |= LOW4 (inst.operands[0].reg) << 12; inst.instruction |= HI1 (inst.operands[0].reg) << 22; inst.instruction |= cmode << 8; @@ -10494,27 +11740,27 @@ do_neon_logic (void) static void do_neon_bitfield (void) { - enum neon_shape rs = neon_check_shape (NS_DDD_QQQ); + enum neon_shape rs = neon_select_shape (NS_DDD, NS_QQQ, NS_NULL); neon_check_type (3, rs, N_IGNORE_TYPE); - neon_three_same (rs == NS_QQQ, 0, -1); + neon_three_same (neon_quad (rs), 0, -1); } static void neon_dyadic_misc (enum neon_el_type ubit_meaning, unsigned types, unsigned destbits) { - enum neon_shape rs = neon_check_shape (NS_DDD_QQQ); + enum neon_shape rs = neon_select_shape (NS_DDD, NS_QQQ, NS_NULL); struct neon_type_el et = neon_check_type (3, rs, N_EQK | destbits, N_EQK, types | N_KEY); if (et.type == NT_float) { inst.instruction = NEON_ENC_FLOAT (inst.instruction); - neon_three_same (rs == NS_QQQ, 0, -1); + neon_three_same (neon_quad (rs), 0, -1); } else { inst.instruction = NEON_ENC_INTEGER (inst.instruction); - neon_three_same (rs == NS_QQQ, et.type == ubit_meaning, et.size); + neon_three_same (neon_quad (rs), et.type == ubit_meaning, et.size); } } @@ -10533,20 +11779,72 @@ do_neon_dyadic_if_su_d (void) } static void -do_neon_dyadic_if_i (void) +do_neon_dyadic_if_i_d (void) { - neon_dyadic_misc (NT_unsigned, N_IF_32, 0); + /* The "untyped" case can't happen. Do this to stop the "U" bit being + affected if we specify unsigned args. */ + neon_dyadic_misc (NT_untyped, N_IF_32, 0); } -static void -do_neon_dyadic_if_i_d (void) +enum vfp_or_neon_is_neon_bits +{ + NEON_CHECK_CC = 1, + NEON_CHECK_ARCH = 2 +}; + +/* Call this function if an instruction which may have belonged to the VFP or + Neon instruction sets, but turned out to be a Neon instruction (due to the + operand types involved, etc.). We have to check and/or fix-up a couple of + things: + + - Make sure the user hasn't attempted to make a Neon instruction + conditional. + - Alter the value in the condition code field if necessary. + - Make sure that the arch supports Neon instructions. + + Which of these operations take place depends on bits from enum + vfp_or_neon_is_neon_bits. + + WARNING: This function has side effects! If NEON_CHECK_CC is used and the + current instruction's condition is COND_ALWAYS, the condition field is + changed to inst.uncond_value. This is necessary because instructions shared + between VFP and Neon may be conditional for the VFP variants only, and the + unconditional Neon version must have, e.g., 0xF in the condition field. 
*/ + +static int +vfp_or_neon_is_neon (unsigned check) { - neon_dyadic_misc (NT_unsigned, N_IF_32, 0); + /* Conditions are always legal in Thumb mode (IT blocks). */ + if (!thumb_mode && (check & NEON_CHECK_CC)) + { + if (inst.cond != COND_ALWAYS) + { + first_error (_(BAD_COND)); + return FAIL; + } + if (inst.uncond_value != -1) + inst.instruction |= inst.uncond_value << 28; + } + + if ((check & NEON_CHECK_ARCH) + && !ARM_CPU_HAS_FEATURE (cpu_variant, fpu_neon_ext_v1)) + { + first_error (_(BAD_FPU)); + return FAIL; + } + + return SUCCESS; } static void do_neon_addsub_if_i (void) { + if (try_vfp_nsyn (3, do_vfp_nsyn_add_sub) == SUCCESS) + return; + + if (vfp_or_neon_is_neon (NEON_CHECK_CC | NEON_CHECK_ARCH) == FAIL) + return; + /* The "untyped" case can't happen. Do this to stop the "U" bit being affected if we specify unsigned args. */ neon_dyadic_misc (NT_untyped, N_IF_32 | N_I64, 0); @@ -10590,7 +11888,7 @@ neon_compare (unsigned regtypes, unsigned immtypes, int invert) } else { - enum neon_shape rs = neon_check_shape (NS_DDI_QQI); + enum neon_shape rs = neon_select_shape (NS_DDI, NS_QQI, NS_NULL); struct neon_type_el et = neon_check_type (2, rs, N_EQK | N_SIZ, immtypes | N_KEY); @@ -10599,7 +11897,7 @@ neon_compare (unsigned regtypes, unsigned immtypes, int invert) inst.instruction |= HI1 (inst.operands[0].reg) << 22; inst.instruction |= LOW4 (inst.operands[1].reg); inst.instruction |= HI1 (inst.operands[1].reg) << 5; - inst.instruction |= (rs == NS_QQI) << 6; + inst.instruction |= neon_quad (rs) << 6; inst.instruction |= (et.type == NT_float) << 10; inst.instruction |= neon_logbits (et.size) << 18; @@ -10685,25 +11983,35 @@ neon_mul_mac (struct neon_type_el et, int ubit) static void do_neon_mac_maybe_scalar (void) { + if (try_vfp_nsyn (3, do_vfp_nsyn_mla_mls) == SUCCESS) + return; + + if (vfp_or_neon_is_neon (NEON_CHECK_CC | NEON_CHECK_ARCH) == FAIL) + return; + if (inst.operands[2].isscalar) { - enum neon_shape rs = neon_check_shape (NS_DDS_QQS); + enum neon_shape rs = neon_select_shape (NS_DDS, NS_QQS, NS_NULL); struct neon_type_el et = neon_check_type (3, rs, N_EQK, N_EQK, N_I16 | N_I32 | N_F32 | N_KEY); inst.instruction = NEON_ENC_SCALAR (inst.instruction); - neon_mul_mac (et, rs == NS_QQS); + neon_mul_mac (et, neon_quad (rs)); } else - do_neon_dyadic_if_i (); + { + /* The "untyped" case can't happen. Do this to stop the "U" bit being + affected if we specify unsigned args. */ + neon_dyadic_misc (NT_untyped, N_IF_32, 0); + } } static void do_neon_tst (void) { - enum neon_shape rs = neon_check_shape (NS_DDD_QQQ); + enum neon_shape rs = neon_select_shape (NS_DDD, NS_QQQ, NS_NULL); struct neon_type_el et = neon_check_type (3, rs, N_EQK, N_EQK, N_8 | N_16 | N_32 | N_KEY); - neon_three_same (rs == NS_QQQ, 0, et.size); + neon_three_same (neon_quad (rs), 0, et.size); } /* VMUL with 3 registers allows the P8 type. 
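do_neon_addsub_if_i above is the template every handler shared between the VFP and Neon instruction sets follows in this patch: first try the VFP interpretation, then validate the Neon constraints, then encode. Schematically (the handler name is hypothetical; the helpers are the ones defined above):

  static void
  do_shared_dyadic (void)
  {
    /* 1. Shapes/types that denote VFP (F32/F64 on S or D registers) are
          forwarded to the existing VFP encoder.  */
    if (try_vfp_nsyn (3, do_vfp_nsyn_add_sub) == SUCCESS)
      return;

    /* 2. Otherwise the insn must be Neon: reject a condition outside an IT
          block, patch in the unconditional cond field, check the arch.  */
    if (vfp_or_neon_is_neon (NEON_CHECK_CC | NEON_CHECK_ARCH) == FAIL)
      return;

    /* 3. Only then do the Neon encoding proper.  */
    neon_dyadic_misc (NT_untyped, N_IF_32 | N_I64, 0);
  }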
The scalar version supports the @@ -10713,6 +12021,12 @@ do_neon_tst (void) static void do_neon_mul (void) { + if (try_vfp_nsyn (3, do_vfp_nsyn_mul) == SUCCESS) + return; + + if (vfp_or_neon_is_neon (NEON_CHECK_CC | NEON_CHECK_ARCH) == FAIL) + return; + if (inst.operands[2].isscalar) do_neon_mac_maybe_scalar (); else @@ -10724,30 +12038,30 @@ do_neon_qdmulh (void) { if (inst.operands[2].isscalar) { - enum neon_shape rs = neon_check_shape (NS_DDS_QQS); + enum neon_shape rs = neon_select_shape (NS_DDS, NS_QQS, NS_NULL); struct neon_type_el et = neon_check_type (3, rs, N_EQK, N_EQK, N_S16 | N_S32 | N_KEY); inst.instruction = NEON_ENC_SCALAR (inst.instruction); - neon_mul_mac (et, rs == NS_QQS); + neon_mul_mac (et, neon_quad (rs)); } else { - enum neon_shape rs = neon_check_shape (NS_DDD_QQQ); + enum neon_shape rs = neon_select_shape (NS_DDD, NS_QQQ, NS_NULL); struct neon_type_el et = neon_check_type (3, rs, N_EQK, N_EQK, N_S16 | N_S32 | N_KEY); inst.instruction = NEON_ENC_INTEGER (inst.instruction); /* The U bit (rounding) comes from bit mask. */ - neon_three_same (rs == NS_QQQ, 0, et.size); + neon_three_same (neon_quad (rs), 0, et.size); } } static void do_neon_fcmp_absolute (void) { - enum neon_shape rs = neon_check_shape (NS_DDD_QQQ); + enum neon_shape rs = neon_select_shape (NS_DDD, NS_QQQ, NS_NULL); neon_check_type (3, rs, N_EQK, N_EQK, N_F32 | N_KEY); /* Size field comes from bit mask. */ - neon_three_same (rs == NS_QQQ, 1, -1); + neon_three_same (neon_quad (rs), 1, -1); } static void @@ -10760,22 +12074,31 @@ do_neon_fcmp_absolute_inv (void) static void do_neon_step (void) { - enum neon_shape rs = neon_check_shape (NS_DDD_QQQ); + enum neon_shape rs = neon_select_shape (NS_DDD, NS_QQQ, NS_NULL); neon_check_type (3, rs, N_EQK, N_EQK, N_F32 | N_KEY); - neon_three_same (rs == NS_QQQ, 0, -1); + neon_three_same (neon_quad (rs), 0, -1); } static void do_neon_abs_neg (void) { - enum neon_shape rs = neon_check_shape (NS_DD_QQ); - struct neon_type_el et = neon_check_type (3, rs, - N_EQK, N_EQK, N_S8 | N_S16 | N_S32 | N_F32 | N_KEY); + enum neon_shape rs; + struct neon_type_el et; + + if (try_vfp_nsyn (2, do_vfp_nsyn_abs_neg) == SUCCESS) + return; + + if (vfp_or_neon_is_neon (NEON_CHECK_CC | NEON_CHECK_ARCH) == FAIL) + return; + + rs = neon_select_shape (NS_DD, NS_QQ, NS_NULL); + et = neon_check_type (2, rs, N_EQK, N_S8 | N_S16 | N_S32 | N_F32 | N_KEY); + inst.instruction |= LOW4 (inst.operands[0].reg) << 12; inst.instruction |= HI1 (inst.operands[0].reg) << 22; inst.instruction |= LOW4 (inst.operands[1].reg); inst.instruction |= HI1 (inst.operands[1].reg) << 5; - inst.instruction |= (rs == NS_QQ) << 6; + inst.instruction |= neon_quad (rs) << 6; inst.instruction |= (et.type == NT_float) << 10; inst.instruction |= neon_logbits (et.size) << 18; @@ -10785,31 +12108,31 @@ do_neon_abs_neg (void) static void do_neon_sli (void) { - enum neon_shape rs = neon_check_shape (NS_DDI_QQI); + enum neon_shape rs = neon_select_shape (NS_DDI, NS_QQI, NS_NULL); struct neon_type_el et = neon_check_type (2, rs, N_EQK, N_8 | N_16 | N_32 | N_64 | N_KEY); int imm = inst.operands[2].imm; constraint (imm < 0 || (unsigned)imm >= et.size, _("immediate out of range for insert")); - neon_imm_shift (FALSE, 0, rs == NS_QQI, et, imm); + neon_imm_shift (FALSE, 0, neon_quad (rs), et, imm); } static void do_neon_sri (void) { - enum neon_shape rs = neon_check_shape (NS_DDI_QQI); + enum neon_shape rs = neon_select_shape (NS_DDI, NS_QQI, NS_NULL); struct neon_type_el et = neon_check_type (2, rs, N_EQK, N_8 | N_16 | N_32 | N_64 | 
N_KEY); int imm = inst.operands[2].imm; constraint (imm < 1 || (unsigned)imm > et.size, _("immediate out of range for insert")); - neon_imm_shift (FALSE, 0, rs == NS_QQI, et, et.size - imm); + neon_imm_shift (FALSE, 0, neon_quad (rs), et, et.size - imm); } static void do_neon_qshlu_imm (void) { - enum neon_shape rs = neon_check_shape (NS_DDI_QQI); + enum neon_shape rs = neon_select_shape (NS_DDI, NS_QQI, NS_NULL); struct neon_type_el et = neon_check_type (2, rs, N_EQK | N_UNS, N_S8 | N_S16 | N_S32 | N_S64 | N_KEY); int imm = inst.operands[2].imm; @@ -10820,7 +12143,7 @@ do_neon_qshlu_imm (void) Unsigned types have OP set to 1. */ inst.instruction |= (et.type == NT_unsigned) << 8; /* The rest of the bits are the same as other immediate shifts. */ - neon_imm_shift (FALSE, 0, rs == NS_QQI, et, imm); + neon_imm_shift (FALSE, 0, neon_quad (rs), et, imm); } static void @@ -10969,79 +12292,229 @@ do_neon_shll (void) } } -/* Check the various types for the VCVT instruction, and return the one that +/* Check the various types for the VCVT instruction, and return which version the current instruction is. */ static int neon_cvt_flavour (enum neon_shape rs) { -#define CVT_VAR(C,X,Y) \ - et = neon_check_type (2, rs, (X), (Y)); \ - if (et.type != NT_invtype) \ - { \ - inst.error = NULL; \ - return (C); \ +#define CVT_VAR(C,X,Y) \ + et = neon_check_type (2, rs, whole_reg | (X), whole_reg | (Y)); \ + if (et.type != NT_invtype) \ + { \ + inst.error = NULL; \ + return (C); \ } struct neon_type_el et; + unsigned whole_reg = (rs == NS_FFI || rs == NS_FD || rs == NS_DF + || rs == NS_FF) ? N_VFP : 0; + /* The instruction versions which take an immediate take one register + argument, which is extended to the width of the full register. Thus the + "source" and "destination" registers must have the same width. Hack that + here by making the size equal to the key (wider, in this case) operand. */ + unsigned key = (rs == NS_QQI || rs == NS_DDI || rs == NS_FFI) ? N_KEY : 0; CVT_VAR (0, N_S32, N_F32); CVT_VAR (1, N_U32, N_F32); CVT_VAR (2, N_F32, N_S32); CVT_VAR (3, N_F32, N_U32); + whole_reg = N_VFP; + + /* VFP instructions. */ + CVT_VAR (4, N_F32, N_F64); + CVT_VAR (5, N_F64, N_F32); + CVT_VAR (6, N_S32, N_F64 | key); + CVT_VAR (7, N_U32, N_F64 | key); + CVT_VAR (8, N_F64 | key, N_S32); + CVT_VAR (9, N_F64 | key, N_U32); + /* VFP instructions with bitshift. */ + CVT_VAR (10, N_F32 | key, N_S16); + CVT_VAR (11, N_F32 | key, N_U16); + CVT_VAR (12, N_F64 | key, N_S16); + CVT_VAR (13, N_F64 | key, N_U16); + CVT_VAR (14, N_S16, N_F32 | key); + CVT_VAR (15, N_U16, N_F32 | key); + CVT_VAR (16, N_S16, N_F64 | key); + CVT_VAR (17, N_U16, N_F64 | key); + return -1; #undef CVT_VAR } +/* Neon-syntax VFP conversions. */ + static void -do_neon_cvt (void) +do_vfp_nsyn_cvt (enum neon_shape rs, int flavour) { - /* Fixed-point conversion with #0 immediate is encoded as an integer - conversion. 
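neon_cvt_flavour above probes the candidate (destination, source) type pairs in a fixed order, clearing inst.error after each failed probe, and returns the index of the first pair that type-checks; flavours 4 and up are VFP-only conversions. Two worked examples, on my reading of the tables:

  /* "vcvt.s32.f32 s0, s1" resolves to shape NS_FF; CVT_VAR (0, N_S32,
     N_F32) matches, so the flavour is 0 and do_vfp_nsyn_cvt forwards the
     insn to "ftosis".  "vcvt.f64.f32 d0, s0" resolves to NS_DF and matches
     CVT_VAR (5, N_F64, N_F32), flavour 5, i.e. "fcvtds".  */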
*/ - if (inst.operands[2].present && inst.operands[2].imm != 0) + const char *opname = 0; + + if (rs == NS_DDI || rs == NS_QQI || rs == NS_FFI) { - enum neon_shape rs = neon_check_shape (NS_DDI_QQI); - int flavour = neon_cvt_flavour (rs); - unsigned immbits = 32 - inst.operands[2].imm; - unsigned enctab[] = { 0x0000100, 0x1000100, 0x0, 0x1000000 }; - inst.instruction = NEON_ENC_IMMED (inst.instruction); - if (flavour != -1) - inst.instruction |= enctab[flavour]; - inst.instruction |= LOW4 (inst.operands[0].reg) << 12; - inst.instruction |= HI1 (inst.operands[0].reg) << 22; - inst.instruction |= LOW4 (inst.operands[1].reg); - inst.instruction |= HI1 (inst.operands[1].reg) << 5; - inst.instruction |= (rs == NS_QQI) << 6; - inst.instruction |= 1 << 21; - inst.instruction |= immbits << 16; + /* Conversions with immediate bitshift. */ + const char *enc[] = + { + "ftosls", + "ftouls", + "fsltos", + "fultos", + NULL, + NULL, + "ftosld", + "ftould", + "fsltod", + "fultod", + "fshtos", + "fuhtos", + "fshtod", + "fuhtod", + "ftoshs", + "ftouhs", + "ftoshd", + "ftouhd" + }; + + if (flavour >= 0 && flavour < (int) ARRAY_SIZE (enc)) + { + opname = enc[flavour]; + constraint (inst.operands[0].reg != inst.operands[1].reg, + _("operands 0 and 1 must be the same register")); + inst.operands[1] = inst.operands[2]; + memset (&inst.operands[2], '\0', sizeof (inst.operands[2])); + } } else { - enum neon_shape rs = neon_check_shape (NS_DD_QQ); - int flavour = neon_cvt_flavour (rs); - unsigned enctab[] = { 0x100, 0x180, 0x0, 0x080 }; - inst.instruction = NEON_ENC_INTEGER (inst.instruction); - if (flavour != -1) - inst.instruction |= enctab[flavour]; - inst.instruction |= LOW4 (inst.operands[0].reg) << 12; - inst.instruction |= HI1 (inst.operands[0].reg) << 22; - inst.instruction |= LOW4 (inst.operands[1].reg); - inst.instruction |= HI1 (inst.operands[1].reg) << 5; - inst.instruction |= (rs == NS_QQ) << 6; - inst.instruction |= 2 << 18; + /* Conversions without bitshift. */ + const char *enc[] = + { + "ftosis", + "ftouis", + "fsitos", + "fuitos", + "fcvtsd", + "fcvtds", + "ftosid", + "ftouid", + "fsitod", + "fuitod" + }; + + if (flavour >= 0 && flavour < (int) ARRAY_SIZE (enc)) + opname = enc[flavour]; + } + + if (opname) + do_vfp_nsyn_opcode (opname); +} + +static void +do_vfp_nsyn_cvtz (void) +{ + enum neon_shape rs = neon_select_shape (NS_FF, NS_FD, NS_NULL); + int flavour = neon_cvt_flavour (rs); + const char *enc[] = + { + "ftosizs", + "ftouizs", + NULL, + NULL, + NULL, + NULL, + "ftosizd", + "ftouizd" + }; + + if (flavour >= 0 && flavour < (int) ARRAY_SIZE (enc) && enc[flavour]) + do_vfp_nsyn_opcode (enc[flavour]); +} + +static void +do_neon_cvt (void) +{ + enum neon_shape rs = neon_select_shape (NS_DDI, NS_QQI, NS_FFI, NS_DD, NS_QQ, + NS_FD, NS_DF, NS_FF, NS_NULL); + int flavour = neon_cvt_flavour (rs); + + /* VFP rather than Neon conversions. */ + if (flavour >= 4) + { + do_vfp_nsyn_cvt (rs, flavour); + return; + } + + switch (rs) + { + case NS_DDI: + case NS_QQI: + { + if (vfp_or_neon_is_neon (NEON_CHECK_CC | NEON_CHECK_ARCH) == FAIL) + return; + + /* Fixed-point conversion with #0 immediate is encoded as an + integer conversion. 
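In the NS_DDI/NS_QQI arm of the rewritten do_neon_cvt here, the fixed-point fraction width is encoded as immbits = 32 - imm, written at bit 16, and a #0 immediate is rerouted to the plain integer encoding. Worked numbers, under that reading:

  /* "vcvt.s32.f32 d0, d0, #16" gives immbits = 32 - 16 = 16, OR-ed in at
     bit 16; "vcvt.s32.f32 d0, d0, #0" instead branches to int_encode and is
     emitted exactly like "vcvt.s32.f32 d0, d0".  */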
*/ + if (inst.operands[2].present && inst.operands[2].imm == 0) + goto int_encode; + unsigned immbits = 32 - inst.operands[2].imm; + unsigned enctab[] = { 0x0000100, 0x1000100, 0x0, 0x1000000 }; + inst.instruction = NEON_ENC_IMMED (inst.instruction); + if (flavour != -1) + inst.instruction |= enctab[flavour]; + inst.instruction |= LOW4 (inst.operands[0].reg) << 12; + inst.instruction |= HI1 (inst.operands[0].reg) << 22; + inst.instruction |= LOW4 (inst.operands[1].reg); + inst.instruction |= HI1 (inst.operands[1].reg) << 5; + inst.instruction |= neon_quad (rs) << 6; + inst.instruction |= 1 << 21; + inst.instruction |= immbits << 16; + + inst.instruction = neon_dp_fixup (inst.instruction); + } + break; + + case NS_DD: + case NS_QQ: + int_encode: + { + unsigned enctab[] = { 0x100, 0x180, 0x0, 0x080 }; + + inst.instruction = NEON_ENC_INTEGER (inst.instruction); + + if (vfp_or_neon_is_neon (NEON_CHECK_CC | NEON_CHECK_ARCH) == FAIL) + return; + + if (flavour != -1) + inst.instruction |= enctab[flavour]; + + inst.instruction |= LOW4 (inst.operands[0].reg) << 12; + inst.instruction |= HI1 (inst.operands[0].reg) << 22; + inst.instruction |= LOW4 (inst.operands[1].reg); + inst.instruction |= HI1 (inst.operands[1].reg) << 5; + inst.instruction |= neon_quad (rs) << 6; + inst.instruction |= 2 << 18; + + inst.instruction = neon_dp_fixup (inst.instruction); + } + break; + + default: + /* Some VFP conversions go here (s32 <-> f32, u32 <-> f32). */ + do_vfp_nsyn_cvt (rs, flavour); } - inst.instruction = neon_dp_fixup (inst.instruction); } static void neon_move_immediate (void) { - enum neon_shape rs = neon_check_shape (NS_DI_QI); - struct neon_type_el et = neon_check_type (1, rs, - N_I8 | N_I16 | N_I32 | N_I64 | N_F32); + enum neon_shape rs = neon_select_shape (NS_DI, NS_QI, NS_NULL); + struct neon_type_el et = neon_check_type (2, rs, + N_I8 | N_I16 | N_I32 | N_I64 | N_F32 | N_KEY, N_EQK); unsigned immlo, immhi = 0, immbits; int op, cmode; + constraint (et.type == NT_invtype, + _("operand size must be specified for immediate VMOV")); + /* We start out as an MVN instruction if OP = 1, MOV otherwise. */ op = (inst.instruction & (1 << 5)) != 0; @@ -11074,7 +12547,7 @@ neon_move_immediate (void) inst.instruction |= LOW4 (inst.operands[0].reg) << 12; inst.instruction |= HI1 (inst.operands[0].reg) << 22; - inst.instruction |= (rs == NS_QI) << 6; + inst.instruction |= neon_quad (rs) << 6; inst.instruction |= cmode << 8; neon_write_immbits (immbits); @@ -11085,14 +12558,14 @@ do_neon_mvn (void) { if (inst.operands[1].isreg) { - enum neon_shape rs = neon_check_shape (NS_DD_QQ); + enum neon_shape rs = neon_select_shape (NS_DD, NS_QQ, NS_NULL); inst.instruction = NEON_ENC_INTEGER (inst.instruction); inst.instruction |= LOW4 (inst.operands[0].reg) << 12; inst.instruction |= HI1 (inst.operands[0].reg) << 22; inst.instruction |= LOW4 (inst.operands[1].reg); inst.instruction |= HI1 (inst.operands[1].reg) << 5; - inst.instruction |= (rs == NS_QQ) << 6; + inst.instruction |= neon_quad (rs) << 6; } else { @@ -11180,6 +12653,9 @@ do_neon_dyadic_narrow (void) { struct neon_type_el et = neon_check_type (3, NS_QDD, N_EQK | N_DBL, N_EQK, N_I16 | N_I32 | N_I64 | N_KEY); + /* Operand sign is unimportant, and the U bit is part of the opcode, + so force the operand type to integer. 
*/ + et.type = NT_integer; neon_mixed_length (et, et.size / 2); } @@ -11211,7 +12687,7 @@ do_neon_vmull (void) static void do_neon_ext (void) { - enum neon_shape rs = neon_check_shape (NS_DDDI_QQQI); + enum neon_shape rs = neon_select_shape (NS_DDDI, NS_QQQI, NS_NULL); struct neon_type_el et = neon_check_type (3, rs, N_EQK, N_EQK, N_8 | N_16 | N_32 | N_64 | N_KEY); unsigned imm = (inst.operands[3].imm * et.size) / 8; @@ -11221,7 +12697,7 @@ do_neon_ext (void) inst.instruction |= HI1 (inst.operands[1].reg) << 7; inst.instruction |= LOW4 (inst.operands[2].reg); inst.instruction |= HI1 (inst.operands[2].reg) << 5; - inst.instruction |= (rs == NS_QQQI) << 6; + inst.instruction |= neon_quad (rs) << 6; inst.instruction |= imm << 8; inst.instruction = neon_dp_fixup (inst.instruction); @@ -11230,7 +12706,7 @@ do_neon_ext (void) static void do_neon_rev (void) { - enum neon_shape rs = neon_check_shape (NS_DD_QQ); + enum neon_shape rs = neon_select_shape (NS_DD, NS_QQ, NS_NULL); struct neon_type_el et = neon_check_type (2, rs, N_EQK, N_8 | N_16 | N_32 | N_KEY); unsigned op = (inst.instruction >> 7) & 3; @@ -11241,7 +12717,7 @@ do_neon_rev (void) assert (elsize != 0); constraint (et.size >= elsize, _("elements must be smaller than reversal region")); - neon_two_same (rs == NS_QQ, 1, et.size); + neon_two_same (neon_quad (rs), 1, et.size); } static void @@ -11249,19 +12725,23 @@ do_neon_dup (void) { if (inst.operands[1].isscalar) { - enum neon_shape rs = neon_check_shape (NS_DS_QS); + enum neon_shape rs = neon_select_shape (NS_DS, NS_QS, NS_NULL); struct neon_type_el et = neon_check_type (2, rs, N_EQK, N_8 | N_16 | N_32 | N_KEY); unsigned sizebits = et.size >> 3; unsigned dm = NEON_SCALAR_REG (inst.operands[1].reg); int logsize = neon_logbits (et.size); unsigned x = NEON_SCALAR_INDEX (inst.operands[1].reg) << logsize; + + if (vfp_or_neon_is_neon (NEON_CHECK_CC) == FAIL) + return; + inst.instruction = NEON_ENC_SCALAR (inst.instruction); inst.instruction |= LOW4 (inst.operands[0].reg) << 12; inst.instruction |= HI1 (inst.operands[0].reg) << 22; inst.instruction |= LOW4 (dm); inst.instruction |= HI1 (dm) << 5; - inst.instruction |= (rs == NS_QS) << 6; + inst.instruction |= neon_quad (rs) << 6; inst.instruction |= x << 17; inst.instruction |= sizebits << 16; @@ -11269,10 +12749,9 @@ do_neon_dup (void) } else { - enum neon_shape rs = neon_check_shape (NS_DR_QR); - struct neon_type_el et = neon_check_type (1, rs, - N_8 | N_16 | N_32 | N_KEY); - unsigned save_cond = inst.instruction & 0xf0000000; + enum neon_shape rs = neon_select_shape (NS_DR, NS_QR, NS_NULL); + struct neon_type_el et = neon_check_type (2, rs, + N_8 | N_16 | N_32 | N_KEY, N_EQK); /* Duplicate ARM register to lanes of vector. */ inst.instruction = NEON_ENC_ARMREG (inst.instruction); switch (et.size) @@ -11285,13 +12764,10 @@ do_neon_dup (void) inst.instruction |= LOW4 (inst.operands[1].reg) << 12; inst.instruction |= LOW4 (inst.operands[0].reg) << 16; inst.instruction |= HI1 (inst.operands[0].reg) << 7; - inst.instruction |= (rs == NS_QR) << 21; + inst.instruction |= neon_quad (rs) << 21; /* The encoding for this instruction is identical for the ARM and Thumb variants, except for the condition field. */ - if (thumb_mode) - inst.instruction |= 0xe0000000; - else - inst.instruction |= save_cond; + do_vfp_cond_or_thumb (); } } @@ -11310,11 +12786,23 @@ do_neon_dup (void) (Scalar to ARM register.) 7. VMOV , , (Vector to two ARM registers.) + 8. VMOV.F32 , + 9. VMOV.F64
<Dd>, <Dm> + (VFP register moves.) + 10. VMOV.F32 <Sd>, #imm + 11. VMOV.F64
, #imm + (VFP float immediate load.) + 12. VMOV , + (VFP single to ARM reg.) + 13. VMOV , + (ARM reg to VFP single.) + 14. VMOV , , , + (Two ARM regs to two VFP singles.) + 15. VMOV , , , + (Two VFP singles to two ARM regs.) - We should have just enough information to be able to disambiguate most of - these, apart from "Two ARM registers to vector" and "Vector to two ARM - registers" cases. For these, abuse the .regisimm operand field to signify a - Neon register. + These cases can be disambiguated using neon_select_shape, except cases 1/9 + and 3/11 which depend on the operand type too. All the encoded bits are hardcoded by this function. @@ -11328,139 +12816,203 @@ do_neon_dup (void) static void do_neon_mov (void) { - int nargs = inst.operands[0].present + inst.operands[1].present - + inst.operands[2].present; - unsigned save_cond = thumb_mode ? 0xe0000000 : inst.instruction & 0xf0000000; - const char *vfp_vers = "selected FPU does not support instruction"; + enum neon_shape rs = neon_select_shape (NS_RRFF, NS_FFRR, NS_DRR, NS_RRD, + NS_QQ, NS_DD, NS_QI, NS_DI, NS_SR, NS_RS, NS_FF, NS_FI, NS_RF, NS_FR, + NS_NULL); + struct neon_type_el et; + const char *ldconst = 0; - switch (nargs) + switch (rs) { - case 2: - /* Cases 0, 1, 2, 3, 4, 6. */ - if (inst.operands[1].isscalar) + case NS_DD: /* case 1/9. */ + et = neon_check_type (2, rs, N_EQK, N_F64 | N_KEY); + /* It is not an error here if no type is given. */ + inst.error = NULL; + if (et.type == NT_float && et.size == 64) { - /* Case 6. */ - struct neon_type_el et = neon_check_type (2, NS_IGNORE, - N_EQK, N_S8 | N_S16 | N_U8 | N_U16 | N_32 | N_KEY); - unsigned logsize = neon_logbits (et.size); - unsigned dn = NEON_SCALAR_REG (inst.operands[1].reg); - unsigned x = NEON_SCALAR_INDEX (inst.operands[1].reg); - unsigned abcdebits = 0; - - constraint (!ARM_CPU_HAS_FEATURE (cpu_variant, fpu_vfp_ext_v1), - _(vfp_vers)); - constraint (!ARM_CPU_HAS_FEATURE (cpu_variant, fpu_neon_ext_v1) - && et.size != 32, _(vfp_vers)); - constraint (et.type == NT_invtype, _("bad type for scalar")); - constraint (x >= 64 / et.size, _("scalar index out of range")); - - switch (et.size) - { - case 8: abcdebits = (et.type == NT_signed) ? 0x08 : 0x18; break; - case 16: abcdebits = (et.type == NT_signed) ? 0x01 : 0x11; break; - case 32: abcdebits = 0x00; break; - default: ; - } - - abcdebits |= x << logsize; - inst.instruction = save_cond; - inst.instruction |= 0xe100b10; - inst.instruction |= LOW4 (dn) << 16; - inst.instruction |= HI1 (dn) << 7; - inst.instruction |= inst.operands[0].reg << 12; - inst.instruction |= (abcdebits & 3) << 5; - inst.instruction |= (abcdebits >> 2) << 21; + do_vfp_nsyn_opcode ("fcpyd"); + break; } - else if (inst.operands[1].isreg) - { - /* Cases 0, 1, 4. */ - if (inst.operands[0].isscalar) - { - /* Case 4. 
*/ - unsigned bcdebits = 0; - struct neon_type_el et = neon_check_type (2, NS_IGNORE, - N_8 | N_16 | N_32 | N_KEY, N_EQK); - int logsize = neon_logbits (et.size); - unsigned dn = NEON_SCALAR_REG (inst.operands[0].reg); - unsigned x = NEON_SCALAR_INDEX (inst.operands[0].reg); - - constraint (!ARM_CPU_HAS_FEATURE (cpu_variant, fpu_vfp_ext_v1), - _(vfp_vers)); - constraint (!ARM_CPU_HAS_FEATURE (cpu_variant, fpu_neon_ext_v1) - && et.size != 32, _(vfp_vers)); - constraint (et.type == NT_invtype, _("bad type for scalar")); - constraint (x >= 64 / et.size, _("scalar index out of range")); - - switch (et.size) - { - case 8: bcdebits = 0x8; break; - case 16: bcdebits = 0x1; break; - case 32: bcdebits = 0x0; break; - default: ; - } + /* fall through. */ - bcdebits |= x << logsize; - inst.instruction = save_cond; - inst.instruction |= 0xe000b10; - inst.instruction |= LOW4 (dn) << 16; - inst.instruction |= HI1 (dn) << 7; - inst.instruction |= inst.operands[1].reg << 12; - inst.instruction |= (bcdebits & 3) << 5; - inst.instruction |= (bcdebits >> 2) << 21; - } - else - { - /* Cases 0, 1. */ - enum neon_shape rs = neon_check_shape (NS_DD_QQ); - /* The architecture manual I have doesn't explicitly state which - value the U bit should have for register->register moves, but - the equivalent VORR instruction has U = 0, so do that. */ - inst.instruction = 0x0200110; - inst.instruction |= LOW4 (inst.operands[0].reg) << 12; - inst.instruction |= HI1 (inst.operands[0].reg) << 22; - inst.instruction |= LOW4 (inst.operands[1].reg); - inst.instruction |= HI1 (inst.operands[1].reg) << 5; - inst.instruction |= LOW4 (inst.operands[1].reg) << 16; - inst.instruction |= HI1 (inst.operands[1].reg) << 7; - inst.instruction |= (rs == NS_QQ) << 6; - - inst.instruction = neon_dp_fixup (inst.instruction); - } - } - else + case NS_QQ: /* case 0/1. */ + { + if (vfp_or_neon_is_neon (NEON_CHECK_CC | NEON_CHECK_ARCH) == FAIL) + return; + /* The architecture manual I have doesn't explicitly state which + value the U bit should have for register->register moves, but + the equivalent VORR instruction has U = 0, so do that. */ + inst.instruction = 0x0200110; + inst.instruction |= LOW4 (inst.operands[0].reg) << 12; + inst.instruction |= HI1 (inst.operands[0].reg) << 22; + inst.instruction |= LOW4 (inst.operands[1].reg); + inst.instruction |= HI1 (inst.operands[1].reg) << 5; + inst.instruction |= LOW4 (inst.operands[1].reg) << 16; + inst.instruction |= HI1 (inst.operands[1].reg) << 7; + inst.instruction |= neon_quad (rs) << 6; + + inst.instruction = neon_dp_fixup (inst.instruction); + } + break; + + case NS_DI: /* case 3/11. */ + et = neon_check_type (2, rs, N_EQK, N_F64 | N_KEY); + inst.error = NULL; + if (et.type == NT_float && et.size == 64) { - /* Cases 2, 3. */ - inst.instruction = 0x0800010; - neon_move_immediate (); - inst.instruction = neon_dp_fixup (inst.instruction); + /* case 11 (fconstd). */ + ldconst = "fconstd"; + goto encode_fconstd; } + /* fall through. */ + + case NS_QI: /* case 2/3. */ + if (vfp_or_neon_is_neon (NEON_CHECK_CC | NEON_CHECK_ARCH) == FAIL) + return; + inst.instruction = 0x0800010; + neon_move_immediate (); + inst.instruction = neon_dp_fixup (inst.instruction); + break; + + case NS_SR: /* case 4. 
*/ + { + unsigned bcdebits = 0; + struct neon_type_el et = neon_check_type (2, NS_NULL, + N_8 | N_16 | N_32 | N_KEY, N_EQK); + int logsize = neon_logbits (et.size); + unsigned dn = NEON_SCALAR_REG (inst.operands[0].reg); + unsigned x = NEON_SCALAR_INDEX (inst.operands[0].reg); + + constraint (!ARM_CPU_HAS_FEATURE (cpu_variant, fpu_vfp_ext_v1), + _(BAD_FPU)); + constraint (!ARM_CPU_HAS_FEATURE (cpu_variant, fpu_neon_ext_v1) + && et.size != 32, _(BAD_FPU)); + constraint (et.type == NT_invtype, _("bad type for scalar")); + constraint (x >= 64 / et.size, _("scalar index out of range")); + + switch (et.size) + { + case 8: bcdebits = 0x8; break; + case 16: bcdebits = 0x1; break; + case 32: bcdebits = 0x0; break; + default: ; + } + + bcdebits |= x << logsize; + + inst.instruction = 0xe000b10; + do_vfp_cond_or_thumb (); + inst.instruction |= LOW4 (dn) << 16; + inst.instruction |= HI1 (dn) << 7; + inst.instruction |= inst.operands[1].reg << 12; + inst.instruction |= (bcdebits & 3) << 5; + inst.instruction |= (bcdebits >> 2) << 21; + } + break; + + case NS_DRR: /* case 5 (fmdrr). */ + constraint (!ARM_CPU_HAS_FEATURE (cpu_variant, fpu_vfp_ext_v2), + _(BAD_FPU)); + + inst.instruction = 0xc400b10; + do_vfp_cond_or_thumb (); + inst.instruction |= LOW4 (inst.operands[0].reg); + inst.instruction |= HI1 (inst.operands[0].reg) << 5; + inst.instruction |= inst.operands[1].reg << 12; + inst.instruction |= inst.operands[2].reg << 16; + break; + + case NS_RS: /* case 6. */ + { + struct neon_type_el et = neon_check_type (2, NS_NULL, + N_EQK, N_S8 | N_S16 | N_U8 | N_U16 | N_32 | N_KEY); + unsigned logsize = neon_logbits (et.size); + unsigned dn = NEON_SCALAR_REG (inst.operands[1].reg); + unsigned x = NEON_SCALAR_INDEX (inst.operands[1].reg); + unsigned abcdebits = 0; + + constraint (!ARM_CPU_HAS_FEATURE (cpu_variant, fpu_vfp_ext_v1), + _(BAD_FPU)); + constraint (!ARM_CPU_HAS_FEATURE (cpu_variant, fpu_neon_ext_v1) + && et.size != 32, _(BAD_FPU)); + constraint (et.type == NT_invtype, _("bad type for scalar")); + constraint (x >= 64 / et.size, _("scalar index out of range")); + + switch (et.size) + { + case 8: abcdebits = (et.type == NT_signed) ? 0x08 : 0x18; break; + case 16: abcdebits = (et.type == NT_signed) ? 0x01 : 0x11; break; + case 32: abcdebits = 0x00; break; + default: ; + } + + abcdebits |= x << logsize; + inst.instruction = 0xe100b10; + do_vfp_cond_or_thumb (); + inst.instruction |= LOW4 (dn) << 16; + inst.instruction |= HI1 (dn) << 7; + inst.instruction |= inst.operands[0].reg << 12; + inst.instruction |= (abcdebits & 3) << 5; + inst.instruction |= (abcdebits >> 2) << 21; + } + break; + + case NS_RRD: /* case 7 (fmrrd). */ + constraint (!ARM_CPU_HAS_FEATURE (cpu_variant, fpu_vfp_ext_v2), + _(BAD_FPU)); + + inst.instruction = 0xc500b10; + do_vfp_cond_or_thumb (); + inst.instruction |= inst.operands[0].reg << 12; + inst.instruction |= inst.operands[1].reg << 16; + inst.instruction |= LOW4 (inst.operands[2].reg); + inst.instruction |= HI1 (inst.operands[2].reg) << 5; + break; + + case NS_FF: /* case 8 (fcpys). */ + do_vfp_nsyn_opcode ("fcpys"); break; - case 3: - /* Cases 5, 7. */ - constraint (!ARM_CPU_HAS_FEATURE (cpu_variant, fpu_vfp_ext_v2), - _(vfp_vers)); - - if (inst.operands[0].regisimm) + case NS_FI: /* case 10 (fconsts). */ + ldconst = "fconsts"; + encode_fconstd: + if (is_quarter_float (inst.operands[1].imm)) { - /* Case 5. 
*/ - inst.instruction = save_cond; - inst.instruction |= 0xc400b10; - inst.instruction |= LOW4 (inst.operands[0].reg); - inst.instruction |= HI1 (inst.operands[0].reg) << 5; - inst.instruction |= inst.operands[1].reg << 12; - inst.instruction |= inst.operands[2].reg << 16; + inst.operands[1].imm = neon_qfloat_bits (inst.operands[1].imm); + do_vfp_nsyn_opcode (ldconst); } else - { - /* Case 7. */ - inst.instruction = save_cond; - inst.instruction |= 0xc500b10; - inst.instruction |= inst.operands[0].reg << 12; - inst.instruction |= inst.operands[1].reg << 16; - inst.instruction |= LOW4 (inst.operands[2].reg); - inst.instruction |= HI1 (inst.operands[2].reg) << 5; - } + first_error (_("immediate out of range")); + break; + + case NS_RF: /* case 12 (fmrs). */ + do_vfp_nsyn_opcode ("fmrs"); + break; + + case NS_FR: /* case 13 (fmsr). */ + do_vfp_nsyn_opcode ("fmsr"); + break; + + /* The encoders for the fmrrs and fmsrr instructions expect three operands + (one of which is a list), but we have parsed four. Do some fiddling to + make the operands what do_vfp_reg2_from_sp2 and do_vfp_sp2_from_reg2 + expect. */ + case NS_RRFF: /* case 14 (fmrrs). */ + constraint (inst.operands[3].reg != inst.operands[2].reg + 1, + _("VFP registers must be adjacent")); + inst.operands[2].imm = 2; + memset (&inst.operands[3], '\0', sizeof (inst.operands[3])); + do_vfp_nsyn_opcode ("fmrrs"); + break; + + case NS_FFRR: /* case 15 (fmsrr). */ + constraint (inst.operands[1].reg != inst.operands[0].reg + 1, + _("VFP registers must be adjacent")); + inst.operands[1] = inst.operands[2]; + inst.operands[2] = inst.operands[3]; + inst.operands[0].imm = 2; + memset (&inst.operands[3], '\0', sizeof (inst.operands[3])); + do_vfp_nsyn_opcode ("fmsrr"); break; default: @@ -11471,7 +13023,7 @@ do_neon_mov (void) static void do_neon_rshift_round_imm (void) { - enum neon_shape rs = neon_check_shape (NS_DDI_QQI); + enum neon_shape rs = neon_select_shape (NS_DDI, NS_QQI, NS_NULL); struct neon_type_el et = neon_check_type (2, rs, N_EQK, N_SU_ALL | N_KEY); int imm = inst.operands[2].imm; @@ -11485,7 +13037,7 @@ do_neon_rshift_round_imm (void) constraint (imm < 1 || (unsigned)imm > et.size, _("immediate out of range for shift")); - neon_imm_shift (TRUE, et.type == NT_unsigned, rs == NS_QQI, et, + neon_imm_shift (TRUE, et.type == NT_unsigned, neon_quad (rs), et, et.size - imm); } @@ -11502,17 +13054,17 @@ do_neon_movl (void) static void do_neon_trn (void) { - enum neon_shape rs = neon_check_shape (NS_DD_QQ); + enum neon_shape rs = neon_select_shape (NS_DD, NS_QQ, NS_NULL); struct neon_type_el et = neon_check_type (2, rs, N_EQK, N_8 | N_16 | N_32 | N_KEY); inst.instruction = NEON_ENC_INTEGER (inst.instruction); - neon_two_same (rs == NS_QQ, 1, et.size); + neon_two_same (neon_quad (rs), 1, et.size); } static void do_neon_zip_uzp (void) { - enum neon_shape rs = neon_check_shape (NS_DD_QQ); + enum neon_shape rs = neon_select_shape (NS_DD, NS_QQ, NS_NULL); struct neon_type_el et = neon_check_type (2, rs, N_EQK, N_8 | N_16 | N_32 | N_KEY); if (rs == NS_DD && et.size == 32) @@ -11522,70 +13074,70 @@ do_neon_zip_uzp (void) do_neon_trn (); return; } - neon_two_same (rs == NS_QQ, 1, et.size); + neon_two_same (neon_quad (rs), 1, et.size); } static void do_neon_sat_abs_neg (void) { - enum neon_shape rs = neon_check_shape (NS_DD_QQ); + enum neon_shape rs = neon_select_shape (NS_DD, NS_QQ, NS_NULL); struct neon_type_el et = neon_check_type (2, rs, N_EQK, N_S8 | N_S16 | N_S32 | N_KEY); - neon_two_same (rs == NS_QQ, 1, et.size); + neon_two_same (neon_quad 
(rs), 1, et.size); } static void do_neon_pair_long (void) { - enum neon_shape rs = neon_check_shape (NS_DD_QQ); + enum neon_shape rs = neon_select_shape (NS_DD, NS_QQ, NS_NULL); struct neon_type_el et = neon_check_type (2, rs, N_EQK, N_SU_32 | N_KEY); /* Unsigned is encoded in OP field (bit 7) for these instruction. */ inst.instruction |= (et.type == NT_unsigned) << 7; - neon_two_same (rs == NS_QQ, 1, et.size); + neon_two_same (neon_quad (rs), 1, et.size); } static void do_neon_recip_est (void) { - enum neon_shape rs = neon_check_shape (NS_DD_QQ); + enum neon_shape rs = neon_select_shape (NS_DD, NS_QQ, NS_NULL); struct neon_type_el et = neon_check_type (2, rs, N_EQK | N_FLT, N_F32 | N_U32 | N_KEY); inst.instruction |= (et.type == NT_float) << 8; - neon_two_same (rs == NS_QQ, 1, et.size); + neon_two_same (neon_quad (rs), 1, et.size); } static void do_neon_cls (void) { - enum neon_shape rs = neon_check_shape (NS_DD_QQ); + enum neon_shape rs = neon_select_shape (NS_DD, NS_QQ, NS_NULL); struct neon_type_el et = neon_check_type (2, rs, N_EQK, N_S8 | N_S16 | N_S32 | N_KEY); - neon_two_same (rs == NS_QQ, 1, et.size); + neon_two_same (neon_quad (rs), 1, et.size); } static void do_neon_clz (void) { - enum neon_shape rs = neon_check_shape (NS_DD_QQ); + enum neon_shape rs = neon_select_shape (NS_DD, NS_QQ, NS_NULL); struct neon_type_el et = neon_check_type (2, rs, N_EQK, N_I8 | N_I16 | N_I32 | N_KEY); - neon_two_same (rs == NS_QQ, 1, et.size); + neon_two_same (neon_quad (rs), 1, et.size); } static void do_neon_cnt (void) { - enum neon_shape rs = neon_check_shape (NS_DD_QQ); + enum neon_shape rs = neon_select_shape (NS_DD, NS_QQ, NS_NULL); struct neon_type_el et = neon_check_type (2, rs, N_EQK | N_INT, N_8 | N_KEY); - neon_two_same (rs == NS_QQ, 1, et.size); + neon_two_same (neon_quad (rs), 1, et.size); } static void do_neon_swp (void) { - enum neon_shape rs = neon_check_shape (NS_DD_QQ); - neon_two_same (rs == NS_QQ, 1, -1); + enum neon_shape rs = neon_select_shape (NS_DD, NS_QQ, NS_NULL); + neon_two_same (neon_quad (rs), 1, -1); } static void @@ -11619,6 +13171,12 @@ do_neon_ldm_stm (void) int is_dbmode = (inst.instruction & (1 << 24)) != 0; unsigned offsetbits = inst.operands[1].imm * 2; + if (inst.operands[1].issingle) + { + do_vfp_nsyn_ldm_stm (is_dbmode); + return; + } + constraint (is_dbmode && !inst.operands[0].writeback, _("writeback (!) must be used for VLDMDB and VSTMDB")); @@ -11633,60 +13191,28 @@ do_neon_ldm_stm (void) inst.instruction |= offsetbits; - if (thumb_mode) - inst.instruction |= 0xe0000000; + do_vfp_cond_or_thumb (); } static void do_neon_ldr_str (void) { - unsigned offsetbits; - int offset_up = 1; int is_ldr = (inst.instruction & (1 << 20)) != 0; - inst.instruction |= LOW4 (inst.operands[0].reg) << 12; - inst.instruction |= HI1 (inst.operands[0].reg) << 22; - - constraint (inst.reloc.pc_rel && !is_ldr, - _("PC-relative addressing unavailable with VSTR")); - - constraint (!inst.reloc.pc_rel && inst.reloc.exp.X_op != O_constant, - _("Immediate value must be a constant")); - - if (inst.reloc.exp.X_add_number < 0) + if (inst.operands[0].issingle) { - offset_up = 0; - offsetbits = -inst.reloc.exp.X_add_number / 4; + if (is_ldr) + do_vfp_nsyn_opcode ("flds"); + else + do_vfp_nsyn_opcode ("fsts"); } else - offsetbits = inst.reloc.exp.X_add_number / 4; - - /* FIXME: Does this catch everything? 
*/ - constraint (!inst.operands[1].isreg || !inst.operands[1].preind - || inst.operands[1].postind || inst.operands[1].writeback - || inst.operands[1].immisreg || inst.operands[1].shifted, - BAD_ADDR_MODE); - constraint ((inst.operands[1].imm & 3) != 0, - _("Offset must be a multiple of 4")); - constraint (offsetbits != (offsetbits & 0xff), - _("Immediate offset out of range")); - - inst.instruction |= inst.operands[1].reg << 16; - inst.instruction |= offsetbits & 0xff; - inst.instruction |= offset_up << 23; - - if (thumb_mode) - inst.instruction |= 0xe0000000; - - if (inst.reloc.pc_rel) { - if (thumb_mode) - inst.reloc.type = BFD_RELOC_ARM_T32_CP_OFF_IMM; + if (is_ldr) + do_vfp_nsyn_opcode ("fldd"); else - inst.reloc.type = BFD_RELOC_ARM_CP_OFF_IMM; + do_vfp_nsyn_opcode ("fstd"); } - else - inst.reloc.type = BFD_RELOC_UNUSED; } /* "interleave" version also handles non-interleaving register VLD1/VST1 @@ -11695,7 +13221,7 @@ do_neon_ldr_str (void) static void do_neon_ld_st_interleave (void) { - struct neon_type_el et = neon_check_type (1, NS_IGNORE, + struct neon_type_el et = neon_check_type (1, NS_NULL, N_8 | N_16 | N_32 | N_64); unsigned alignbits = 0; unsigned idx; @@ -11799,7 +13325,7 @@ neon_alignment_bit (int size, int align, int *do_align, ...) static void do_neon_ld_st_lane (void) { - struct neon_type_el et = neon_check_type (1, NS_IGNORE, N_8 | N_16 | N_32); + struct neon_type_el et = neon_check_type (1, NS_NULL, N_8 | N_16 | N_32); int align_good, do_align = 0; int logsize = neon_logbits (et.size); int align = inst.operands[1].imm >> 8; @@ -11886,7 +13412,7 @@ do_neon_ld_st_lane (void) static void do_neon_ld_dup (void) { - struct neon_type_el et = neon_check_type (1, NS_IGNORE, N_8 | N_16 | N_32); + struct neon_type_el et = neon_check_type (1, NS_NULL, N_8 | N_16 | N_32); int align_good, do_align = 0; if (et.type == NT_invtype) @@ -12053,11 +13579,9 @@ output_relax_insn (void) symbolS *sym; int offset; -#ifdef OBJ_ELF /* The size of the instruction is unknown, so tie the debug info to the start of the instruction. */ dwarf2_emit_insn (0); -#endif switch (inst.reloc.exp.X_op) { @@ -12125,9 +13649,7 @@ output_inst (const char * str) inst.size, & inst.reloc.exp, inst.reloc.pc_rel, inst.reloc.type); -#ifdef OBJ_ELF dwarf2_emit_insn (inst.size); -#endif } /* Tag values used in struct asm_opcode's tag field. */ @@ -12138,6 +13660,9 @@ enum opcode_tag OT_unconditionalF, /* Instruction cannot be conditionalized and carries 0xF in its ARM condition field. */ OT_csuffix, /* Instruction takes a conditional suffix. */ + OT_csuffixF, /* Some forms of the instruction take a conditional + suffix, others place 0xF where the condition field + would be. */ OT_cinfix3, /* Instruction takes a conditional infix, beginning at character index 3. (In unified mode, it becomes a suffix.) */ @@ -12221,11 +13746,14 @@ opcode_lookup (char **str) const struct asm_opcode *opcode; const struct asm_cond *cond; char save[2]; + bfd_boolean neon_supported; + + neon_supported = ARM_CPU_HAS_FEATURE (cpu_variant, fpu_neon_ext_v1); /* Scan up to the end of the mnemonic, which must end in white space, - '.' (in unified mode only), or end of string. */ + '.' (in unified mode, or for Neon instructions), or end of string. 
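For illustration: given "vadd.i32 d0, d1, d2" in non-unified (divided) syntax, the neon_supported test still lets the scan stop at the '.', so the base mnemonic becomes "vadd" and the ".i32" tail is handed to parse_neon_type further down; without Neon support the '.' only terminates the mnemonic in unified mode, exactly as the loop condition below encodes.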
*/ for (base = end = *str; *end != '\0'; end++) - if (*end == ' ' || (unified_syntax && *end == '.')) + if (*end == ' ' || ((unified_syntax || neon_supported) && *end == '.')) break; if (end == base) @@ -12236,9 +13764,11 @@ opcode_lookup (char **str) { int offset = 2; - if (end[1] == 'w') + /* The .w and .n suffixes are only valid if the unified syntax is in + use. */ + if (unified_syntax && end[1] == 'w') inst.size_req = 4; - else if (end[1] == 'n') + else if (unified_syntax && end[1] == 'n') inst.size_req = 2; else offset = 0; @@ -12249,7 +13779,8 @@ opcode_lookup (char **str) if (end[offset] == '.') { - /* See if we have a Neon type suffix. */ + /* See if we have a Neon type suffix (possible in either unified or + non-unified ARM syntax mode). */ if (parse_neon_type (&inst.vectype, str) == FAIL) return 0; } @@ -12306,6 +13837,7 @@ opcode_lookup (char **str) /* else fall through */ case OT_csuffix: + case OT_csuffixF: case OT_csuf_or_in3: inst.cond = cond->value; return opcode; @@ -12397,6 +13929,10 @@ md_assemble (char *str) if (opcode->tag == OT_cinfix3_deprecated) as_warn (_("s suffix on comparison instruction is deprecated")); + /* The value which unconditional instructions should have in place of the + condition field. */ + inst.uncond_value = (opcode->tag == OT_csuffixF) ? 0xf : -1; + if (thumb_mode) { arm_feature_set variant; @@ -12472,7 +14008,7 @@ md_assemble (char *str) ARM_MERGE_FEATURE_SETS (thumb_arch_used, thumb_arch_used, arm_ext_v6t2); } - else + else if (ARM_CPU_HAS_FEATURE (cpu_variant, arm_ext_v1)) { /* Check that this instruction is supported for this CPU. */ if (!opcode->avariant || @@ -12505,6 +14041,12 @@ md_assemble (char *str) ARM_MERGE_FEATURE_SETS (arm_arch_used, arm_arch_used, *opcode->avariant); } + else + { + as_bad (_("attempt to use an ARM instruction on a Thumb-only processor " + "-- `%s'"), str); + return; + } output_inst (str); } @@ -12570,9 +14112,7 @@ arm_frob_label (symbolS * sym) label_is_thumb_function_name = FALSE; } -#ifdef OBJ_ELF dwarf2_emit_label (sym); -#endif } int @@ -13032,15 +14572,27 @@ static struct asm_barrier_opt barrier_opt_names[] = /* Neon insn with conditional suffix for the ARM version, non-overloaded version. */ -#define NCE(mnem, op, nops, ops, enc) \ - { #mnem, OPS##nops ops, OT_csuffix, 0x##op, 0x##op, ARM_VARIANT, \ +#define NCE_tag(mnem, op, nops, ops, enc, tag) \ + { #mnem, OPS##nops ops, tag, 0x##op, 0x##op, ARM_VARIANT, \ THUMB_VARIANT, do_##enc, do_##enc } +#define NCE(mnem, op, nops, ops, enc) \ + NCE_tag(mnem, op, nops, ops, enc, OT_csuffix) + +#define NCEF(mnem, op, nops, ops, enc) \ + NCE_tag(mnem, op, nops, ops, enc, OT_csuffixF) + /* Neon insn with conditional suffix for the ARM version, overloaded types. */ -#define nCE(mnem, op, nops, ops, enc) \ - { #mnem, OPS##nops ops, OT_csuffix, N_MNEM_##op, N_MNEM_##op, \ +#define nCE_tag(mnem, op, nops, ops, enc, tag) \ + { #mnem, OPS##nops ops, tag, N_MNEM_##op, N_MNEM_##op, \ ARM_VARIANT, THUMB_VARIANT, do_##enc, do_##enc } +#define nCE(mnem, op, nops, ops, enc) \ + nCE_tag(mnem, op, nops, ops, enc, OT_csuffix) + +#define nCEF(mnem, op, nops, ops, enc) \ + nCE_tag(mnem, op, nops, ops, enc, OT_csuffixF) + #define do_0 0 /* Thumb-only, unconditional. 
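A sketch of how the Neon table macros defined just above expand, since the expansion is easy to lose track of: nCEF(vadd, vadd, 3, (RNSDQ, oRNSDQ, RNSDQ), neon_addsub_if_i) goes via nCE_tag to { "vadd", OPS3 (RNSDQ, oRNSDQ, RNSDQ), OT_csuffixF, N_MNEM_vadd, N_MNEM_vadd, ARM_VARIANT, THUMB_VARIANT, do_neon_addsub_if_i, do_neon_addsub_if_i }, and the OT_csuffixF tag is what makes md_assemble set inst.uncond_value to 0xf (see above) so the unconditional form carries 0xF in its condition field.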
*/ @@ -13056,8 +14608,8 @@ static const struct asm_opcode insns[] = tC3(eors, 0300000, eors, 3, (RR, oRR, SH), arit, t_arit3c), tCE(sub, 0400000, sub, 3, (RR, oRR, SH), arit, t_add_sub), tC3(subs, 0500000, subs, 3, (RR, oRR, SH), arit, t_add_sub), - tCE(add, 0800000, add, 3, (RR, oRR, SH), arit, t_add_sub), - tC3(adds, 0900000, adds, 3, (RR, oRR, SH), arit, t_add_sub), + tCE(add, 0800000, add, 3, (RR, oRR, SHG), arit, t_add_sub), + tC3(adds, 0900000, adds, 3, (RR, oRR, SHG), arit, t_add_sub), tCE(adc, 0a00000, adc, 3, (RR, oRR, SH), arit, t_arit3c), tC3(adcs, 0b00000, adcs, 3, (RR, oRR, SH), arit, t_arit3c), tCE(sbc, 0c00000, sbc, 3, (RR, oRR, SH), arit, t_arit3), @@ -13085,10 +14637,10 @@ static const struct asm_opcode insns[] = tCE(mvn, 1e00000, mvn, 2, (RR, SH), mov, t_mvn_tst), tC3(mvns, 1f00000, mvns, 2, (RR, SH), mov, t_mvn_tst), - tCE(ldr, 4100000, ldr, 2, (RR, ADDR), ldst, t_ldst), - tC3(ldrb, 4500000, ldrb, 2, (RR, ADDR), ldst, t_ldst), - tCE(str, 4000000, str, 2, (RR, ADDR), ldst, t_ldst), - tC3(strb, 4400000, strb, 2, (RR, ADDR), ldst, t_ldst), + tCE(ldr, 4100000, ldr, 2, (RR, ADDRGLDR),ldst, t_ldst), + tC3(ldrb, 4500000, ldrb, 2, (RR, ADDRGLDR),ldst, t_ldst), + tCE(str, 4000000, str, 2, (RR, ADDRGLDR),ldst, t_ldst), + tC3(strb, 4400000, strb, 2, (RR, ADDRGLDR),ldst, t_ldst), tCE(stm, 8800000, stmia, 2, (RRw, REGLST), ldmstm, t_ldmstm), tC3(stmia, 8800000, stmia, 2, (RRw, REGLST), ldmstm, t_ldmstm), @@ -13172,10 +14724,10 @@ static const struct asm_opcode insns[] = /* Generic coprocessor instructions. */ TCE(cdp, e000000, ee000000, 6, (RCP, I15b, RCN, RCN, RCN, oI7b), cdp, cdp), - TCE(ldc, c100000, ec100000, 3, (RCP, RCN, ADDR), lstc, lstc), - TC3(ldcl, c500000, ec500000, 3, (RCP, RCN, ADDR), lstc, lstc), - TCE(stc, c000000, ec000000, 3, (RCP, RCN, ADDR), lstc, lstc), - TC3(stcl, c400000, ec400000, 3, (RCP, RCN, ADDR), lstc, lstc), + TCE(ldc, c100000, ec100000, 3, (RCP, RCN, ADDRGLDC), lstc, lstc), + TC3(ldcl, c500000, ec500000, 3, (RCP, RCN, ADDRGLDC), lstc, lstc), + TCE(stc, c000000, ec000000, 3, (RCP, RCN, ADDRGLDC), lstc, lstc), + TC3(stcl, c400000, ec400000, 3, (RCP, RCN, ADDRGLDC), lstc, lstc), TCE(mcr, e000010, ee000010, 6, (RCP, I7b, RR, RCN, RCN, oI7b), co_reg, co_reg), TCE(mrc, e100010, ee100010, 6, (RCP, I7b, RR, RCN, RCN, oI7b), co_reg, co_reg), @@ -13186,8 +14738,8 @@ static const struct asm_opcode insns[] = #undef ARM_VARIANT #define ARM_VARIANT &arm_ext_v3 /* ARM 6 Status register instructions. */ - TCE(mrs, 10f0000, f3ef8000, 2, (RR, PSR), mrs, t_mrs), - TCE(msr, 120f000, f3808000, 2, (PSR, RR_EXi), msr, t_msr), + TCE(mrs, 10f0000, f3ef8000, 2, (APSR_RR, RVC_PSR), mrs, t_mrs), + TCE(msr, 120f000, f3808000, 2, (RVC_PSR, RR_EXi), msr, t_msr), #undef ARM_VARIANT #define ARM_VARIANT &arm_ext_v3m /* ARM 7M long multiplies. */ @@ -13204,12 +14756,12 @@ static const struct asm_opcode insns[] = #define ARM_VARIANT &arm_ext_v4 /* ARM Architecture 4. 
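Note that the v4 halfword and signed-byte transfers below now take the ADDRGLDRS operand class in place of plain ADDR; the GLDRS classes appear to exist so that the new group relocations (the BFD_RELOC_ARM_LDRS_* family handled in md_apply_fix later in this patch) can be written in these instructions' address operands. That reading is an inference from the naming; the operand parser itself is outside this hunk.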
*/ #undef THUMB_VARIANT #define THUMB_VARIANT &arm_ext_v4t - tC3(ldrh, 01000b0, ldrh, 2, (RR, ADDR), ldstv4, t_ldst), - tC3(strh, 00000b0, strh, 2, (RR, ADDR), ldstv4, t_ldst), - tC3(ldrsh, 01000f0, ldrsh, 2, (RR, ADDR), ldstv4, t_ldst), - tC3(ldrsb, 01000d0, ldrsb, 2, (RR, ADDR), ldstv4, t_ldst), - tCM(ld,sh, 01000f0, ldrsh, 2, (RR, ADDR), ldstv4, t_ldst), - tCM(ld,sb, 01000d0, ldrsb, 2, (RR, ADDR), ldstv4, t_ldst), + tC3(ldrh, 01000b0, ldrh, 2, (RR, ADDRGLDRS), ldstv4, t_ldst), + tC3(strh, 00000b0, strh, 2, (RR, ADDRGLDRS), ldstv4, t_ldst), + tC3(ldrsh, 01000f0, ldrsh, 2, (RR, ADDRGLDRS), ldstv4, t_ldst), + tC3(ldrsb, 01000d0, ldrsb, 2, (RR, ADDRGLDRS), ldstv4, t_ldst), + tCM(ld,sh, 01000f0, ldrsh, 2, (RR, ADDRGLDRS), ldstv4, t_ldst), + tCM(ld,sb, 01000d0, ldrsb, 2, (RR, ADDRGLDRS), ldstv4, t_ldst), #undef ARM_VARIANT #define ARM_VARIANT &arm_ext_v4t_5 @@ -13230,10 +14782,10 @@ static const struct asm_opcode insns[] = #undef THUMB_VARIANT #define THUMB_VARIANT &arm_ext_v6t2 TCE(clz, 16f0f10, fab0f080, 2, (RRnpc, RRnpc), rd_rm, t_clz), - TUF(ldc2, c100000, fc100000, 3, (RCP, RCN, ADDR), lstc, lstc), - TUF(ldc2l, c500000, fc500000, 3, (RCP, RCN, ADDR), lstc, lstc), - TUF(stc2, c000000, fc000000, 3, (RCP, RCN, ADDR), lstc, lstc), - TUF(stc2l, c400000, fc400000, 3, (RCP, RCN, ADDR), lstc, lstc), + TUF(ldc2, c100000, fc100000, 3, (RCP, RCN, ADDRGLDC), lstc, lstc), + TUF(ldc2l, c500000, fc500000, 3, (RCP, RCN, ADDRGLDC), lstc, lstc), + TUF(stc2, c000000, fc000000, 3, (RCP, RCN, ADDRGLDC), lstc, lstc), + TUF(stc2l, c400000, fc400000, 3, (RCP, RCN, ADDRGLDC), lstc, lstc), TUF(cdp2, e000000, fe000000, 6, (RCP, I15b, RCN, RCN, RCN, oI7b), cdp, cdp), TUF(mcr2, e000010, fe000010, 6, (RCP, I7b, RR, RCN, RCN, oI7b), co_reg, co_reg), TUF(mrc2, e100010, fe100010, 6, (RCP, I7b, RR, RCN, RCN, oI7b), co_reg, co_reg), @@ -13269,8 +14821,8 @@ static const struct asm_opcode insns[] = #undef ARM_VARIANT #define ARM_VARIANT &arm_ext_v5e /* ARM Architecture 5TE. */ TUF(pld, 450f000, f810f000, 1, (ADDR), pld, t_pld), - TC3(ldrd, 00000d0, e9500000, 3, (RRnpc, oRRnpc, ADDR), ldrd, t_ldstd), - TC3(strd, 00000f0, e9400000, 3, (RRnpc, oRRnpc, ADDR), ldrd, t_ldstd), + TC3(ldrd, 00000d0, e9500000, 3, (RRnpc, oRRnpc, ADDRGLDRS), ldrd, t_ldstd), + TC3(strd, 00000f0, e9400000, 3, (RRnpc, oRRnpc, ADDRGLDRS), ldrd, t_ldstd), TCE(mcrr, c400000, ec400000, 5, (RCP, I15b, RRnpc, RRnpc, RCN), co_reg2c, co_reg2c), TCE(mrrc, c500000, ec500000, 5, (RCP, I15b, RRnpc, RRnpc, RCN), co_reg2c, co_reg2c), @@ -13425,18 +14977,20 @@ static const struct asm_opcode insns[] = TCE(ubfx, 7e00050, f3c00000, 4, (RR, RR, I31, I32), bfx, t_bfx), TCE(mls, 0600090, fb000010, 4, (RRnpc, RRnpc, RRnpc, RRnpc), mlas, t_mla), - TCE(movw, 3000000, f2400000, 2, (RRnpc, Iffff), mov16, t_mov16), - TCE(movt, 3400000, f2c00000, 2, (RRnpc, Iffff), mov16, t_mov16), - TCE(rbit, 3ff0f30, fa90f0a0, 2, (RR, RR), rd_rm, t_rbit), + TCE(movw, 3000000, f2400000, 2, (RRnpc, HALF), mov16, t_mov16), + TCE(movt, 3400000, f2c00000, 2, (RRnpc, HALF), mov16, t_mov16), + TCE(rbit, 6ff0f30, fa90f0a0, 2, (RR, RR), rd_rm, t_rbit), TC3(ldrht, 03000b0, f8300e00, 2, (RR, ADDR), ldsttv4, t_ldstt), TC3(ldrsht, 03000f0, f9300e00, 2, (RR, ADDR), ldsttv4, t_ldstt), TC3(ldrsbt, 03000d0, f9100e00, 2, (RR, ADDR), ldsttv4, t_ldstt), TC3(strht, 02000b0, f8200e00, 2, (RR, ADDR), ldsttv4, t_ldstt), - UT(cbnz, b900, 2, (RR, EXP), t_czb), - UT(cbz, b100, 2, (RR, EXP), t_czb), - /* ARM does not really have an IT instruction. 
*/ + UT(cbnz, b900, 2, (RR, EXP), t_cbz), + UT(cbz, b100, 2, (RR, EXP), t_cbz), + /* ARM does not really have an IT instruction, so always allow it. */ +#undef ARM_VARIANT +#define ARM_VARIANT &arm_ext_v1 TUE(it, 0, bf08, 1, (COND), it, t_it), TUE(itt, 0, bf0c, 1, (COND), it, t_it), TUE(ite, 0, bf04, 1, (COND), it, t_it), @@ -13486,15 +15040,15 @@ static const struct asm_opcode insns[] = cCE(wfc, e400110, 1, (RR), rd), cCE(rfc, e500110, 1, (RR), rd), - cCL(ldfs, c100100, 2, (RF, ADDR), rd_cpaddr), - cCL(ldfd, c108100, 2, (RF, ADDR), rd_cpaddr), - cCL(ldfe, c500100, 2, (RF, ADDR), rd_cpaddr), - cCL(ldfp, c508100, 2, (RF, ADDR), rd_cpaddr), + cCL(ldfs, c100100, 2, (RF, ADDRGLDC), rd_cpaddr), + cCL(ldfd, c108100, 2, (RF, ADDRGLDC), rd_cpaddr), + cCL(ldfe, c500100, 2, (RF, ADDRGLDC), rd_cpaddr), + cCL(ldfp, c508100, 2, (RF, ADDRGLDC), rd_cpaddr), - cCL(stfs, c000100, 2, (RF, ADDR), rd_cpaddr), - cCL(stfd, c008100, 2, (RF, ADDR), rd_cpaddr), - cCL(stfe, c400100, 2, (RF, ADDR), rd_cpaddr), - cCL(stfp, c408100, 2, (RF, ADDR), rd_cpaddr), + cCL(stfs, c000100, 2, (RF, ADDRGLDC), rd_cpaddr), + cCL(stfd, c008100, 2, (RF, ADDRGLDC), rd_cpaddr), + cCL(stfe, c400100, 2, (RF, ADDRGLDC), rd_cpaddr), + cCL(stfp, c408100, 2, (RF, ADDRGLDC), rd_cpaddr), cCL(mvfs, e008100, 2, (RF, RF_IF), rd_rm), cCL(mvfsp, e008120, 2, (RF, RF_IF), rd_rm), @@ -13937,8 +15491,8 @@ static const struct asm_opcode insns[] = cCE(fmxr, ee00a10, 2, (RVC, RR), rn_rd), /* Memory operations. */ - cCE(flds, d100a00, 2, (RVS, ADDR), vfp_sp_ldst), - cCE(fsts, d000a00, 2, (RVS, ADDR), vfp_sp_ldst), + cCE(flds, d100a00, 2, (RVS, ADDRGLDC), vfp_sp_ldst), + cCE(fsts, d000a00, 2, (RVS, ADDRGLDC), vfp_sp_ldst), cCE(fldmias, c900a00, 2, (RRw, VRSLST), vfp_sp_ldstmia), cCE(fldmfds, c900a00, 2, (RRw, VRSLST), vfp_sp_ldstmia), cCE(fldmdbs, d300a00, 2, (RRw, VRSLST), vfp_sp_ldstmdb), @@ -13996,8 +15550,8 @@ static const struct asm_opcode insns[] = cCE(ftouizd, ebc0bc0, 2, (RVS, RVD), vfp_sp_dp_cvt), /* Memory operations. */ - cCE(fldd, d100b00, 2, (RVD, ADDR), vfp_dp_ldst), - cCE(fstd, d000b00, 2, (RVD, ADDR), vfp_dp_ldst), + cCE(fldd, d100b00, 2, (RVD, ADDRGLDC), vfp_dp_ldst), + cCE(fstd, d000b00, 2, (RVD, ADDRGLDC), vfp_dp_ldst), cCE(fldmiad, c900b00, 2, (RRw, VRDLST), vfp_dp_ldstmia), cCE(fldmfdd, c900b00, 2, (RRw, VRDLST), vfp_dp_ldstmia), cCE(fldmdbd, d300b00, 2, (RRw, VRDLST), vfp_dp_ldstmdb), @@ -14036,6 +15590,50 @@ static const struct asm_opcode insns[] = cCE(fmdrr, c400b10, 3, (RVD, RR, RR), vfp_dp_rm_rd_rn), cCE(fmrrd, c500b10, 3, (RR, RR, RVD), vfp_dp_rd_rn_rm), +/* Instructions which may belong to either the Neon or VFP instruction sets. + Individual encoder functions perform additional architecture checks. */ +#undef ARM_VARIANT +#define ARM_VARIANT &fpu_vfp_ext_v1xd +#undef THUMB_VARIANT +#define THUMB_VARIANT &fpu_vfp_ext_v1xd + /* These mnemonics are unique to VFP. */ + NCE(vsqrt, 0, 2, (RVSD, RVSD), vfp_nsyn_sqrt), + NCE(vdiv, 0, 3, (RVSD, RVSD, RVSD), vfp_nsyn_div), + nCE(vnmul, vnmul, 3, (RVSD, RVSD, RVSD), vfp_nsyn_nmul), + nCE(vnmla, vnmla, 3, (RVSD, RVSD, RVSD), vfp_nsyn_nmul), + nCE(vnmls, vnmls, 3, (RVSD, RVSD, RVSD), vfp_nsyn_nmul), + nCE(vcmp, vcmp, 2, (RVSD, RVSD_I0), vfp_nsyn_cmp), + nCE(vcmpe, vcmpe, 2, (RVSD, RVSD_I0), vfp_nsyn_cmp), + NCE(vpush, 0, 1, (VRSDLST), vfp_nsyn_push), + NCE(vpop, 0, 1, (VRSDLST), vfp_nsyn_pop), + NCE(vcvtz, 0, 2, (RVSD, RVSD), vfp_nsyn_cvtz), + + /* Mnemonics shared by Neon and VFP. 
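These entries take the RNSDQ operand class, so one mnemonic spans both register files, and the encoder inspects the register shape: plain S/D forms are rerouted to the VFP encoders through do_vfp_nsyn_opcode (compare the NS_FF, NS_RF and similar cases earlier), while Q-register or typed forms get Neon encodings. Roughly, and for illustration only: vadd.f32 s0, s1, s2 ends up as the VFP fadds encoding, whereas vadd.i32 q0, q1, q2 is encoded as a Neon integer add.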
*/ + nCEF(vmul, vmul, 3, (RNSDQ, oRNSDQ, RNSDQ_RNSC), neon_mul), + nCEF(vmla, vmla, 3, (RNSDQ, oRNSDQ, RNSDQ_RNSC), neon_mac_maybe_scalar), + nCEF(vmls, vmls, 3, (RNSDQ, oRNSDQ, RNSDQ_RNSC), neon_mac_maybe_scalar), + + nCEF(vadd, vadd, 3, (RNSDQ, oRNSDQ, RNSDQ), neon_addsub_if_i), + nCEF(vsub, vsub, 3, (RNSDQ, oRNSDQ, RNSDQ), neon_addsub_if_i), + + NCEF(vabs, 1b10300, 2, (RNSDQ, RNSDQ), neon_abs_neg), + NCEF(vneg, 1b10380, 2, (RNSDQ, RNSDQ), neon_abs_neg), + + NCE(vldm, c900b00, 2, (RRw, VRSDLST), neon_ldm_stm), + NCE(vldmia, c900b00, 2, (RRw, VRSDLST), neon_ldm_stm), + NCE(vldmdb, d100b00, 2, (RRw, VRSDLST), neon_ldm_stm), + NCE(vstm, c800b00, 2, (RRw, VRSDLST), neon_ldm_stm), + NCE(vstmia, c800b00, 2, (RRw, VRSDLST), neon_ldm_stm), + NCE(vstmdb, d000b00, 2, (RRw, VRSDLST), neon_ldm_stm), + NCE(vldr, d100b00, 2, (RVSD, ADDRGLDC), neon_ldr_str), + NCE(vstr, d000b00, 2, (RVSD, ADDRGLDC), neon_ldr_str), + + nCEF(vcvt, vcvt, 3, (RNSDQ, RNSDQ, oI32b), neon_cvt), + + /* NOTE: All VMOV encoding is special-cased! */ + NCE(vmov, 0, 1, (VMOV), neon_mov), + NCE(vmovq, 0, 1, (VMOV), neon_mov), + #undef THUMB_VARIANT #define THUMB_VARIANT &fpu_neon_ext_v1 #undef ARM_VARIANT @@ -14101,30 +15699,24 @@ static const struct asm_opcode insns[] = nUF(vcltq, vclt, 3, (RNQ, oRNQ, RNDQ_I0), neon_cmp_inv), nUF(vcle, vcle, 3, (RNDQ, oRNDQ, RNDQ_I0), neon_cmp_inv), nUF(vcleq, vcle, 3, (RNQ, oRNQ, RNDQ_I0), neon_cmp_inv), - /* Comparison. Type I8 I16 I32 F32. Non-immediate -> neon_dyadic_if_i. */ + /* Comparison. Type I8 I16 I32 F32. */ nUF(vceq, vceq, 3, (RNDQ, oRNDQ, RNDQ_I0), neon_ceq), nUF(vceqq, vceq, 3, (RNQ, oRNQ, RNDQ_I0), neon_ceq), /* As above, D registers only. */ nUF(vpmax, vpmax, 3, (RND, oRND, RND), neon_dyadic_if_su_d), nUF(vpmin, vpmin, 3, (RND, oRND, RND), neon_dyadic_if_su_d), /* Int and float variants, signedness unimportant. */ - /* If not scalar, fall back to neon_dyadic_if_i. */ - nUF(vmla, vmla, 3, (RNDQ, oRNDQ, RNDQ_RNSC), neon_mac_maybe_scalar), nUF(vmlaq, vmla, 3, (RNQ, oRNQ, RNDQ_RNSC), neon_mac_maybe_scalar), - nUF(vmls, vmls, 3, (RNDQ, oRNDQ, RNDQ_RNSC), neon_mac_maybe_scalar), nUF(vmlsq, vmls, 3, (RNQ, oRNQ, RNDQ_RNSC), neon_mac_maybe_scalar), nUF(vpadd, vpadd, 3, (RND, oRND, RND), neon_dyadic_if_i_d), /* Add/sub take types I8 I16 I32 I64 F32. */ - nUF(vadd, vadd, 3, (RNDQ, oRNDQ, RNDQ), neon_addsub_if_i), nUF(vaddq, vadd, 3, (RNQ, oRNQ, RNQ), neon_addsub_if_i), - nUF(vsub, vsub, 3, (RNDQ, oRNDQ, RNDQ), neon_addsub_if_i), nUF(vsubq, vsub, 3, (RNQ, oRNQ, RNQ), neon_addsub_if_i), /* vtst takes sizes 8, 16, 32. */ NUF(vtst, 0000810, 3, (RNDQ, oRNDQ, RNDQ), neon_tst), NUF(vtstq, 0000810, 3, (RNQ, oRNQ, RNQ), neon_tst), /* VMUL takes I8 I16 I32 F32 P8. */ - nUF(vmul, vmul, 3, (RNDQ, oRNDQ, RNDQ_RNSC), neon_mul), - nUF(vmulq, vmul, 3, (RNQ, oRNQ, RNDQ_RNSC), neon_mul), + nUF(vmulq, vmul, 3, (RNQ, oRNQ, RNDQ_RNSC), neon_mul), /* VQD{R}MULH takes S16 S32. */ nUF(vqdmulh, vqdmulh, 3, (RNDQ, oRNDQ, RNDQ_RNSC), neon_qdmulh), nUF(vqdmulhq, vqdmulh, 3, (RNQ, oRNQ, RNDQ_RNSC), neon_qdmulh), @@ -14144,9 +15736,7 @@ static const struct asm_opcode insns[] = NUF(vrsqrtsq, 0200f10, 3, (RNQ, oRNQ, RNQ), neon_step), /* Two address, int/float. Types S8 S16 S32 F32. */ - NUF(vabs, 1b10300, 2, (RNDQ, RNDQ), neon_abs_neg), NUF(vabsq, 1b10300, 2, (RNQ, RNQ), neon_abs_neg), - NUF(vneg, 1b10380, 2, (RNDQ, RNDQ), neon_abs_neg), NUF(vnegq, 1b10380, 2, (RNQ, RNQ), neon_abs_neg), /* Data processing with two registers and a shift amount. 
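As a worked example of the immediate range rule enforced by do_neon_rshift_round_imm earlier in the file (the shift must satisfy 1 <= imm <= element size), and taking vrshr.s16 as a representative user of that encoder: vrshr.s16 d0, d1, #16 is accepted, while #0 or #17 draws "immediate out of range for shift".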
*/ @@ -14181,21 +15771,8 @@ static const struct asm_opcode insns[] = /* Special case. Types S8 S16 S32 U8 U16 U32. Handles max shift variant. */ nUF(vshll, vshll, 3, (RNQ, RND, I32), neon_shll), /* CVT with optional immediate for fixed-point variant. */ - nUF(vcvt, vcvt, 3, (RNDQ, RNDQ, oI32b), neon_cvt), - nUF(vcvtq, vcvt, 3, (RNQ, RNQ, oI32b), neon_cvt), - - /* One register and an immediate value. All encoding special-cased! */ -#undef THUMB_VARIANT -#define THUMB_VARIANT &fpu_vfp_ext_v1 -#undef ARM_VARIANT -#define ARM_VARIANT &fpu_vfp_ext_v1 - NCE(vmov, 0, 1, (VMOV), neon_mov), + nUF(vcvtq, vcvt, 3, (RNQ, RNQ, oI32b), neon_cvt), -#undef THUMB_VARIANT -#define THUMB_VARIANT &fpu_neon_ext_v1 -#undef ARM_VARIANT -#define ARM_VARIANT &fpu_neon_ext_v1 - NCE(vmovq, 0, 1, (VMOV), neon_mov), nUF(vmvn, vmvn, 2, (RNDQ, RNDQ_IMVNb), neon_mvn), nUF(vmvnq, vmvn, 2, (RNQ, RNDQ_IMVNb), neon_mvn), @@ -14288,26 +15865,10 @@ static const struct asm_opcode insns[] = NUF(vtbl, 1b00800, 3, (RND, NRDLST, RND), neon_tbl_tbx), NUF(vtbx, 1b00840, 3, (RND, NRDLST, RND), neon_tbl_tbx), -#undef THUMB_VARIANT -#define THUMB_VARIANT &fpu_vfp_ext_v1xd -#undef ARM_VARIANT -#define ARM_VARIANT &fpu_vfp_ext_v1xd - - /* Load/store instructions. Available in Neon or VFPv3. */ - NCE(vldm, c900b00, 2, (RRw, NRDLST), neon_ldm_stm), - NCE(vldmia, c900b00, 2, (RRw, NRDLST), neon_ldm_stm), - NCE(vldmdb, d100b00, 2, (RRw, NRDLST), neon_ldm_stm), - NCE(vstm, c800b00, 2, (RRw, NRDLST), neon_ldm_stm), - NCE(vstmia, c800b00, 2, (RRw, NRDLST), neon_ldm_stm), - NCE(vstmdb, d000b00, 2, (RRw, NRDLST), neon_ldm_stm), - NCE(vldr, d100b00, 2, (RND, ADDR), neon_ldr_str), - NCE(vstr, d000b00, 2, (RND, ADDR), neon_ldr_str), - #undef THUMB_VARIANT #define THUMB_VARIANT &fpu_vfp_v3_or_neon_ext #undef ARM_VARIANT #define ARM_VARIANT &fpu_vfp_v3_or_neon_ext - /* Neon element/structure load/store. 
*/ nUF(vld1, vld1, 2, (NSTRLST, ADDR), neon_ldx_stx), nUF(vst1, vst1, 2, (NSTRLST, ADDR), neon_ldx_stx), @@ -14322,7 +15883,6 @@ static const struct asm_opcode insns[] = #define THUMB_VARIANT &fpu_vfp_ext_v3 #undef ARM_VARIANT #define ARM_VARIANT &fpu_vfp_ext_v3 - cCE(fconsts, eb00a00, 2, (RVS, I255), vfp_sp_const), cCE(fconstd, eb00b00, 2, (RVD, I255), vfp_dp_const), cCE(fshtos, eba0a40, 2, (RVS, I16z), vfp_sp_conv_16), @@ -14374,7 +15934,7 @@ static const struct asm_opcode insns[] = cCE(tinsrb, e600010, 3, (RIWR, RR, I7), iwmmxt_tinsr), cCE(tinsrh, e600050, 3, (RIWR, RR, I7), iwmmxt_tinsr), cCE(tinsrw, e600090, 3, (RIWR, RR, I7), iwmmxt_tinsr), - cCE(tmcr, e000110, 2, (RIWC, RR), rn_rd), + cCE(tmcr, e000110, 2, (RIWC_RIWG, RR), rn_rd), cCE(tmcrr, c400000, 3, (RIWR, RR, RR), rm_rd_rn), cCE(tmia, e200010, 3, (RIWR, RR, RR), iwmmxt_tmia), cCE(tmiaph, e280010, 3, (RIWR, RR, RR), iwmmxt_tmia), @@ -14385,7 +15945,7 @@ static const struct asm_opcode insns[] = cCE(tmovmskb, e100030, 2, (RR, RIWR), rd_rn), cCE(tmovmskh, e500030, 2, (RR, RIWR), rd_rn), cCE(tmovmskw, e900030, 2, (RR, RIWR), rd_rn), - cCE(tmrc, e100110, 2, (RR, RIWC), rd_rn), + cCE(tmrc, e100110, 2, (RR, RIWC_RIWG), rd_rn), cCE(tmrrc, c500000, 3, (RR, RR, RIWR), rd_rn_rm), cCE(torcb, e13f150, 1, (RR), iwmmxt_tandorc), cCE(torch, e53f150, 1, (RR), iwmmxt_tandorc), @@ -14456,34 +16016,34 @@ static const struct asm_opcode insns[] = cCE(wpackwus, e900080, 3, (RIWR, RIWR, RIWR), rd_rn_rm), cCE(wpackdss, ef00080, 3, (RIWR, RIWR, RIWR), rd_rn_rm), cCE(wpackdus, ed00080, 3, (RIWR, RIWR, RIWR), rd_rn_rm), - cCE(wrorh, e700040, 3, (RIWR, RIWR, RIWR), rd_rn_rm), + cCE(wrorh, e700040, 3, (RIWR, RIWR, RIWR_I32z),iwmmxt_wrwrwr_or_imm5), cCE(wrorhg, e700148, 3, (RIWR, RIWR, RIWG), rd_rn_rm), - cCE(wrorw, eb00040, 3, (RIWR, RIWR, RIWR), rd_rn_rm), + cCE(wrorw, eb00040, 3, (RIWR, RIWR, RIWR_I32z),iwmmxt_wrwrwr_or_imm5), cCE(wrorwg, eb00148, 3, (RIWR, RIWR, RIWG), rd_rn_rm), - cCE(wrord, ef00040, 3, (RIWR, RIWR, RIWR), rd_rn_rm), + cCE(wrord, ef00040, 3, (RIWR, RIWR, RIWR_I32z),iwmmxt_wrwrwr_or_imm5), cCE(wrordg, ef00148, 3, (RIWR, RIWR, RIWG), rd_rn_rm), cCE(wsadb, e000120, 3, (RIWR, RIWR, RIWR), rd_rn_rm), cCE(wsadbz, e100120, 3, (RIWR, RIWR, RIWR), rd_rn_rm), cCE(wsadh, e400120, 3, (RIWR, RIWR, RIWR), rd_rn_rm), cCE(wsadhz, e500120, 3, (RIWR, RIWR, RIWR), rd_rn_rm), cCE(wshufh, e0001e0, 3, (RIWR, RIWR, I255), iwmmxt_wshufh), - cCE(wsllh, e500040, 3, (RIWR, RIWR, RIWR), rd_rn_rm), + cCE(wsllh, e500040, 3, (RIWR, RIWR, RIWR_I32z),iwmmxt_wrwrwr_or_imm5), cCE(wsllhg, e500148, 3, (RIWR, RIWR, RIWG), rd_rn_rm), - cCE(wsllw, e900040, 3, (RIWR, RIWR, RIWR), rd_rn_rm), + cCE(wsllw, e900040, 3, (RIWR, RIWR, RIWR_I32z),iwmmxt_wrwrwr_or_imm5), cCE(wsllwg, e900148, 3, (RIWR, RIWR, RIWG), rd_rn_rm), - cCE(wslld, ed00040, 3, (RIWR, RIWR, RIWR), rd_rn_rm), + cCE(wslld, ed00040, 3, (RIWR, RIWR, RIWR_I32z),iwmmxt_wrwrwr_or_imm5), cCE(wslldg, ed00148, 3, (RIWR, RIWR, RIWG), rd_rn_rm), - cCE(wsrah, e400040, 3, (RIWR, RIWR, RIWR), rd_rn_rm), + cCE(wsrah, e400040, 3, (RIWR, RIWR, RIWR_I32z),iwmmxt_wrwrwr_or_imm5), cCE(wsrahg, e400148, 3, (RIWR, RIWR, RIWG), rd_rn_rm), - cCE(wsraw, e800040, 3, (RIWR, RIWR, RIWR), rd_rn_rm), + cCE(wsraw, e800040, 3, (RIWR, RIWR, RIWR_I32z),iwmmxt_wrwrwr_or_imm5), cCE(wsrawg, e800148, 3, (RIWR, RIWR, RIWG), rd_rn_rm), - cCE(wsrad, ec00040, 3, (RIWR, RIWR, RIWR), rd_rn_rm), + cCE(wsrad, ec00040, 3, (RIWR, RIWR, RIWR_I32z),iwmmxt_wrwrwr_or_imm5), cCE(wsradg, ec00148, 3, (RIWR, RIWR, RIWG), rd_rn_rm), - cCE(wsrlh, e600040, 3, (RIWR, RIWR, 
RIWR), rd_rn_rm), + cCE(wsrlh, e600040, 3, (RIWR, RIWR, RIWR_I32z),iwmmxt_wrwrwr_or_imm5), cCE(wsrlhg, e600148, 3, (RIWR, RIWR, RIWG), rd_rn_rm), - cCE(wsrlw, ea00040, 3, (RIWR, RIWR, RIWR), rd_rn_rm), + cCE(wsrlw, ea00040, 3, (RIWR, RIWR, RIWR_I32z),iwmmxt_wrwrwr_or_imm5), cCE(wsrlwg, ea00148, 3, (RIWR, RIWR, RIWG), rd_rn_rm), - cCE(wsrld, ee00040, 3, (RIWR, RIWR, RIWR), rd_rn_rm), + cCE(wsrld, ee00040, 3, (RIWR, RIWR, RIWR_I32z),iwmmxt_wrwrwr_or_imm5), cCE(wsrldg, ee00148, 3, (RIWR, RIWR, RIWG), rd_rn_rm), cCE(wstrb, c000000, 2, (RIWR, ADDR), iwmmxt_wldstbh), cCE(wstrh, c400000, 2, (RIWR, ADDR), iwmmxt_wldstbh), @@ -14519,16 +16079,76 @@ static const struct asm_opcode insns[] = cCE(wxor, e100000, 3, (RIWR, RIWR, RIWR), rd_rn_rm), cCE(wzero, e300000, 1, (RIWR), iwmmxt_wzero), +#undef ARM_VARIANT +#define ARM_VARIANT &arm_cext_iwmmxt2 /* Intel Wireless MMX technology, version 2. */ + cCE(torvscb, e13f190, 1, (RR), iwmmxt_tandorc), + cCE(torvsch, e53f190, 1, (RR), iwmmxt_tandorc), + cCE(torvscw, e93f190, 1, (RR), iwmmxt_tandorc), + cCE(wabsb, e2001c0, 2, (RIWR, RIWR), rd_rn), + cCE(wabsh, e6001c0, 2, (RIWR, RIWR), rd_rn), + cCE(wabsw, ea001c0, 2, (RIWR, RIWR), rd_rn), + cCE(wabsdiffb, e1001c0, 3, (RIWR, RIWR, RIWR), rd_rn_rm), + cCE(wabsdiffh, e5001c0, 3, (RIWR, RIWR, RIWR), rd_rn_rm), + cCE(wabsdiffw, e9001c0, 3, (RIWR, RIWR, RIWR), rd_rn_rm), + cCE(waddbhusl, e2001a0, 3, (RIWR, RIWR, RIWR), rd_rn_rm), + cCE(waddbhusm, e6001a0, 3, (RIWR, RIWR, RIWR), rd_rn_rm), + cCE(waddhc, e600180, 3, (RIWR, RIWR, RIWR), rd_rn_rm), + cCE(waddwc, ea00180, 3, (RIWR, RIWR, RIWR), rd_rn_rm), + cCE(waddsubhx, ea001a0, 3, (RIWR, RIWR, RIWR), rd_rn_rm), + cCE(wavg4, e400000, 3, (RIWR, RIWR, RIWR), rd_rn_rm), + cCE(wavg4r, e500000, 3, (RIWR, RIWR, RIWR), rd_rn_rm), + cCE(wmaddsn, ee00100, 3, (RIWR, RIWR, RIWR), rd_rn_rm), + cCE(wmaddsx, eb00100, 3, (RIWR, RIWR, RIWR), rd_rn_rm), + cCE(wmaddun, ec00100, 3, (RIWR, RIWR, RIWR), rd_rn_rm), + cCE(wmaddux, e900100, 3, (RIWR, RIWR, RIWR), rd_rn_rm), + cCE(wmerge, e000080, 4, (RIWR, RIWR, RIWR, I7), iwmmxt_wmerge), + cCE(wmiabb, e0000a0, 3, (RIWR, RIWR, RIWR), rd_rn_rm), + cCE(wmiabt, e1000a0, 3, (RIWR, RIWR, RIWR), rd_rn_rm), + cCE(wmiatb, e2000a0, 3, (RIWR, RIWR, RIWR), rd_rn_rm), + cCE(wmiatt, e3000a0, 3, (RIWR, RIWR, RIWR), rd_rn_rm), + cCE(wmiabbn, e4000a0, 3, (RIWR, RIWR, RIWR), rd_rn_rm), + cCE(wmiabtn, e5000a0, 3, (RIWR, RIWR, RIWR), rd_rn_rm), + cCE(wmiatbn, e6000a0, 3, (RIWR, RIWR, RIWR), rd_rn_rm), + cCE(wmiattn, e7000a0, 3, (RIWR, RIWR, RIWR), rd_rn_rm), + cCE(wmiawbb, e800120, 3, (RIWR, RIWR, RIWR), rd_rn_rm), + cCE(wmiawbt, e900120, 3, (RIWR, RIWR, RIWR), rd_rn_rm), + cCE(wmiawtb, ea00120, 3, (RIWR, RIWR, RIWR), rd_rn_rm), + cCE(wmiawtt, eb00120, 3, (RIWR, RIWR, RIWR), rd_rn_rm), + cCE(wmiawbbn, ec00120, 3, (RIWR, RIWR, RIWR), rd_rn_rm), + cCE(wmiawbtn, ed00120, 3, (RIWR, RIWR, RIWR), rd_rn_rm), + cCE(wmiawtbn, ee00120, 3, (RIWR, RIWR, RIWR), rd_rn_rm), + cCE(wmiawttn, ef00120, 3, (RIWR, RIWR, RIWR), rd_rn_rm), + cCE(wmulsmr, ef00100, 3, (RIWR, RIWR, RIWR), rd_rn_rm), + cCE(wmulumr, ed00100, 3, (RIWR, RIWR, RIWR), rd_rn_rm), + cCE(wmulwumr, ec000c0, 3, (RIWR, RIWR, RIWR), rd_rn_rm), + cCE(wmulwsmr, ee000c0, 3, (RIWR, RIWR, RIWR), rd_rn_rm), + cCE(wmulwum, ed000c0, 3, (RIWR, RIWR, RIWR), rd_rn_rm), + cCE(wmulwsm, ef000c0, 3, (RIWR, RIWR, RIWR), rd_rn_rm), + cCE(wmulwl, eb000c0, 3, (RIWR, RIWR, RIWR), rd_rn_rm), + cCE(wqmiabb, e8000a0, 3, (RIWR, RIWR, RIWR), rd_rn_rm), + cCE(wqmiabt, e9000a0, 3, (RIWR, RIWR, RIWR), rd_rn_rm), + cCE(wqmiatb, ea000a0, 3, 
(RIWR, RIWR, RIWR), rd_rn_rm), + cCE(wqmiatt, eb000a0, 3, (RIWR, RIWR, RIWR), rd_rn_rm), + cCE(wqmiabbn, ec000a0, 3, (RIWR, RIWR, RIWR), rd_rn_rm), + cCE(wqmiabtn, ed000a0, 3, (RIWR, RIWR, RIWR), rd_rn_rm), + cCE(wqmiatbn, ee000a0, 3, (RIWR, RIWR, RIWR), rd_rn_rm), + cCE(wqmiattn, ef000a0, 3, (RIWR, RIWR, RIWR), rd_rn_rm), + cCE(wqmulm, e100080, 3, (RIWR, RIWR, RIWR), rd_rn_rm), + cCE(wqmulmr, e300080, 3, (RIWR, RIWR, RIWR), rd_rn_rm), + cCE(wqmulwm, ec000e0, 3, (RIWR, RIWR, RIWR), rd_rn_rm), + cCE(wqmulwmr, ee000e0, 3, (RIWR, RIWR, RIWR), rd_rn_rm), + cCE(wsubaddhx, ed001c0, 3, (RIWR, RIWR, RIWR), rd_rn_rm), + #undef ARM_VARIANT #define ARM_VARIANT &arm_cext_maverick /* Cirrus Maverick instructions. */ - cCE(cfldrs, c100400, 2, (RMF, ADDR), rd_cpaddr), - cCE(cfldrd, c500400, 2, (RMD, ADDR), rd_cpaddr), - cCE(cfldr32, c100500, 2, (RMFX, ADDR), rd_cpaddr), - cCE(cfldr64, c500500, 2, (RMDX, ADDR), rd_cpaddr), - cCE(cfstrs, c000400, 2, (RMF, ADDR), rd_cpaddr), - cCE(cfstrd, c400400, 2, (RMD, ADDR), rd_cpaddr), - cCE(cfstr32, c000500, 2, (RMFX, ADDR), rd_cpaddr), - cCE(cfstr64, c400500, 2, (RMDX, ADDR), rd_cpaddr), + cCE(cfldrs, c100400, 2, (RMF, ADDRGLDC), rd_cpaddr), + cCE(cfldrd, c500400, 2, (RMD, ADDRGLDC), rd_cpaddr), + cCE(cfldr32, c100500, 2, (RMFX, ADDRGLDC), rd_cpaddr), + cCE(cfldr64, c500500, 2, (RMDX, ADDRGLDC), rd_cpaddr), + cCE(cfstrs, c000400, 2, (RMF, ADDRGLDC), rd_cpaddr), + cCE(cfstrd, c400400, 2, (RMD, ADDRGLDC), rd_cpaddr), + cCE(cfstr32, c000500, 2, (RMFX, ADDRGLDC), rd_cpaddr), + cCE(cfstr64, c400500, 2, (RMDX, ADDRGLDC), rd_cpaddr), cCE(cfmvsr, e000450, 2, (RMF, RR), rn_rd), cCE(cfmvrs, e100450, 2, (RR, RMF), rd_rn), cCE(cfmvdlr, e000410, 2, (RMD, RR), rn_rd), @@ -14810,7 +16430,10 @@ md_convert_frag (bfd *abfd, segT asec ATTRIBUTE_UNUSED, fragS *fragp) insn = THUMB_OP32 (opcode); insn |= (old_op & 0xf0) << 4; put_thumb32_insn (buf, insn); - reloc_type = BFD_RELOC_ARM_T32_IMMEDIATE; + if (opcode == T_MNEM_add_pc) + reloc_type = BFD_RELOC_ARM_T32_IMM12; + else + reloc_type = BFD_RELOC_ARM_T32_ADD_IMM; } else reloc_type = BFD_RELOC_ARM_THUMB_ADD; @@ -14827,7 +16450,10 @@ md_convert_frag (bfd *abfd, segT asec ATTRIBUTE_UNUSED, fragS *fragp) insn |= (old_op & 0xf0) << 4; insn |= (old_op & 0xf) << 16; put_thumb32_insn (buf, insn); - reloc_type = BFD_RELOC_ARM_T32_IMMEDIATE; + if (insn & (1 << 20)) + reloc_type = BFD_RELOC_ARM_T32_ADD_IMM; + else + reloc_type = BFD_RELOC_ARM_T32_IMMEDIATE; } else reloc_type = BFD_RELOC_ARM_THUMB_ADD; @@ -15018,12 +16644,22 @@ valueT md_section_align (segT segment ATTRIBUTE_UNUSED, valueT size) { -#ifdef OBJ_ELF - return size; -#else - /* Round all sects to multiple of 4. */ - return (size + 3) & ~3; +#if (defined (OBJ_AOUT) || defined (OBJ_MAYBE_AOUT)) + if (OUTPUT_FLAVOR == bfd_target_aout_flavour) + { + /* For a.out, force the section size to be aligned. If we don't do + this, BFD will align it for us, but it will not write out the + final bytes of the section. This may be a bug in BFD, but it is + easier to fix it here since that is how the other a.out targets + work. */ + int align; + + align = bfd_get_section_alignment (stdoutput, segment); + size = ((size + (1 << align) - 1) & ((valueT) -1 << align)); + } #endif + + return size; } /* This is called from HANDLE_ALIGN in write.c. Fill in the contents @@ -15523,12 +17159,22 @@ create_unwind_entry (int have_data) return 0; } + +/* Initialize the DWARF-2 unwind information for this procedure. 
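This is the hook dw2gencfi invokes for .cfi_startproc; the single rule below declares the initial CFA to be SP (r13) plus 0, the canonical state on entry to a procedure before the prologue moves the stack pointer.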
*/ + +void +tc_arm_frame_initial_instructions (void) +{ + cfi_add_CFA_def_cfa (REG_SP, 0); +} +#endif /* OBJ_ELF */ + /* Convert REGNAME to a DWARF-2 register number. */ int -tc_arm_regname_to_dw2regnum (const char *regname) +tc_arm_regname_to_dw2regnum (char *regname) { - int reg = arm_reg_parse ((char **) ®name, REG_TYPE_RN); + int reg = arm_reg_parse (®name, REG_TYPE_RN); if (reg == FAIL) return -1; @@ -15536,15 +17182,18 @@ tc_arm_regname_to_dw2regnum (const char *regname) return reg; } -/* Initialize the DWARF-2 unwind information for this procedure. */ - +#ifdef TE_PE void -tc_arm_frame_initial_instructions (void) +tc_pe_dwarf2_emit_offset (symbolS *symbol, unsigned int size) { - cfi_add_CFA_def_cfa (REG_SP, 0); -} -#endif /* OBJ_ELF */ + expressionS expr; + expr.X_op = O_secrel; + expr.X_add_symbol = symbol; + expr.X_add_number = 0; + emit_expr (&expr, size); +} +#endif /* MD interface: Symbol and relocation handling. */ @@ -15561,10 +17210,16 @@ md_pcrel_from_section (fixS * fixP, segT seg) /* If this is pc-relative and we are going to emit a relocation then we just want to put out any pipeline compensation that the linker - will need. Otherwise we want to use the calculated base. */ + will need. Otherwise we want to use the calculated base. + For WinCE we skip the bias for externals as well, since this + is how the MS ARM-CE assembler behaves and we want to be compatible. */ if (fixP->fx_pcrel && ((fixP->fx_addsy && S_GET_SEGMENT (fixP->fx_addsy) != seg) - || arm_force_relocation (fixP))) + || (arm_force_relocation (fixP) +#ifdef TE_WINCE + && !S_IS_EXTERNAL (fixP->fx_addsy) +#endif + ))) base = 0; switch (fixP->fx_r_type) @@ -15601,6 +17256,17 @@ md_pcrel_from_section (fixS * fixP, segT seg) case BFD_RELOC_ARM_PCREL_BLX: case BFD_RELOC_ARM_PLT32: #ifdef TE_WINCE + /* When handling fixups immediately, because we have already + discovered the value of a symbol, or the address of the frag involved + we must account for the offset by +8, as the OS loader will never see the reloc. + see fixup_segment() in write.c + The S_IS_EXTERNAL test handles the case of global symbols. + Those need the calculated base, not just the pipe compensation the linker will need. */ + if (fixP->fx_pcrel + && fixP->fx_addsy != NULL + && (S_GET_SEGMENT (fixP->fx_addsy) == seg) + && (S_IS_EXTERNAL (fixP->fx_addsy) || !arm_force_relocation (fixP))) + return base + 8; return base; #else return base + 8; @@ -15788,11 +17454,11 @@ negate_data_op (unsigned long * instruction, /* Like negate_data_op, but for Thumb-2. */ static unsigned int -thumb32_negate_data_op (offsetT *instruction, offsetT value) +thumb32_negate_data_op (offsetT *instruction, unsigned int value) { int op, new_inst; int rd; - offsetT negated, inverted; + unsigned int negated, inverted; negated = encode_thumb32_immediate (-value); inverted = encode_thumb32_immediate (~value); @@ -15853,7 +17519,7 @@ thumb32_negate_data_op (offsetT *instruction, offsetT value) return FAIL; } - if (value == FAIL) + if (value == (unsigned int)FAIL) return FAIL; *instruction &= T2_OPCODE_MASK; @@ -15910,6 +17576,7 @@ md_apply_fix (fixS * fixP, assert (fixP->fx_r_type <= BFD_RELOC_UNUSED); /* Note whether this will delete the relocation. 
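When fx_done is set the fixup has been fully resolved in the assembler and no relocation is emitted for it; that is why the cases below patch the instruction bytes under "fixP->fx_done || !seg->use_rela_p" -- for REL-style targets the addend must live in the section contents even when a relocation survives.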
*/ + if (fixP->fx_addsy == 0 && !fixP->fx_pcrel) fixP->fx_done = 1; @@ -16217,6 +17884,7 @@ md_apply_fix (fixS * fixP, break; case BFD_RELOC_ARM_T32_IMMEDIATE: + case BFD_RELOC_ARM_T32_ADD_IMM: case BFD_RELOC_ARM_T32_IMM12: case BFD_RELOC_ARM_T32_ADD_PC12: /* We claim that this fixup has been processed here, @@ -16237,15 +17905,21 @@ md_apply_fix (fixS * fixP, newval <<= 16; newval |= md_chars_to_number (buf+2, THUMB_SIZE); - /* FUTURE: Implement analogue of negate_data_op for T32. */ - if (fixP->fx_r_type == BFD_RELOC_ARM_T32_IMMEDIATE) + newimm = FAIL; + if (fixP->fx_r_type == BFD_RELOC_ARM_T32_IMMEDIATE + || fixP->fx_r_type == BFD_RELOC_ARM_T32_ADD_IMM) { newimm = encode_thumb32_immediate (value); if (newimm == (unsigned int) FAIL) newimm = thumb32_negate_data_op (&newval, value); } - else + if (fixP->fx_r_type != BFD_RELOC_ARM_T32_IMMEDIATE + && newimm == (unsigned int) FAIL) { + /* Turn add/sum into addw/subw. */ + if (fixP->fx_r_type == BFD_RELOC_ARM_T32_ADD_IMM) + newval = (newval & 0xfeffffff) | 0x02000000; + /* 12 bit immediate for addw/subw. */ if (value < 0) { @@ -16359,8 +18033,8 @@ md_apply_fix (fixS * fixP, } break; - case BFD_RELOC_THUMB_PCREL_BRANCH7: /* CZB */ - /* CZB can only branch forward. */ + case BFD_RELOC_THUMB_PCREL_BRANCH7: /* CBZ */ + /* CBZ can only branch forward. */ if (value & ~0x7e) as_bad_where (fixP->fx_file, fixP->fx_line, _("branch out of range")); @@ -16511,8 +18185,15 @@ md_apply_fix (fixS * fixP, case BFD_RELOC_ARM_ROSEGREL32: case BFD_RELOC_ARM_SBREL32: case BFD_RELOC_32_PCREL: +#ifdef TE_PE + case BFD_RELOC_32_SECREL: +#endif if (fixP->fx_done || !seg->use_rela_p) - md_number_to_chars (buf, value, 4); +#ifdef TE_WINCE + /* For WinCE we only do this for pcrel fixups. */ + if (fixP->fx_done || fixP->fx_pcrel) +#endif + md_number_to_chars (buf, value, 4); break; #ifdef OBJ_ELF @@ -16547,8 +18228,6 @@ md_apply_fix (fixS * fixP, newval = get_thumb32_insn (buf); newval &= 0xff7fff00; newval |= (value >> 2) | (sign ? INDEX_UP : 0); - if (value == 0) - newval &= ~WRITE_BACK; if (fixP->fx_r_type == BFD_RELOC_ARM_CP_OFF_IMM || fixP->fx_r_type == BFD_RELOC_ARM_CP_OFF_IMM_S2) md_number_to_chars (buf, newval, INSN_SIZE); @@ -16740,6 +18419,216 @@ md_apply_fix (fixS * fixP, fixP->fx_done = 0; return; + case BFD_RELOC_ARM_MOVW: + case BFD_RELOC_ARM_MOVT: + case BFD_RELOC_ARM_THUMB_MOVW: + case BFD_RELOC_ARM_THUMB_MOVT: + if (fixP->fx_done || !seg->use_rela_p) + { + /* REL format relocations are limited to a 16-bit addend. 
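When the value is applied below, the Thumb form scatters it as imm4:i:imm3:imm8 -- value bits 12-15 go to insn bits 16-19, bit 11 to bit 26, bits 8-10 to bits 12-14, and bits 0-7 stay in place -- while the ARM form splits it 4:12. Worked example for the ARM form: value 0xABCD sets the four-bit field at insn bits 16-19 to 0xA and the low twelve bits to 0xBCD.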
*/ + if (!fixP->fx_done) + { + if (value < -0x1000 || value > 0xffff) + as_bad_where (fixP->fx_file, fixP->fx_line, + _("offset too big")); + } + else if (fixP->fx_r_type == BFD_RELOC_ARM_MOVT + || fixP->fx_r_type == BFD_RELOC_ARM_THUMB_MOVT) + { + value >>= 16; + } + + if (fixP->fx_r_type == BFD_RELOC_ARM_THUMB_MOVW + || fixP->fx_r_type == BFD_RELOC_ARM_THUMB_MOVT) + { + newval = get_thumb32_insn (buf); + newval &= 0xfbf08f00; + newval |= (value & 0xf000) << 4; + newval |= (value & 0x0800) << 15; + newval |= (value & 0x0700) << 4; + newval |= (value & 0x00ff); + put_thumb32_insn (buf, newval); + } + else + { + newval = md_chars_to_number (buf, 4); + newval &= 0xfff0f000; + newval |= value & 0x0fff; + newval |= (value & 0xf000) << 4; + md_number_to_chars (buf, newval, 4); + } + } + return; + + case BFD_RELOC_ARM_ALU_PC_G0_NC: + case BFD_RELOC_ARM_ALU_PC_G0: + case BFD_RELOC_ARM_ALU_PC_G1_NC: + case BFD_RELOC_ARM_ALU_PC_G1: + case BFD_RELOC_ARM_ALU_PC_G2: + case BFD_RELOC_ARM_ALU_SB_G0_NC: + case BFD_RELOC_ARM_ALU_SB_G0: + case BFD_RELOC_ARM_ALU_SB_G1_NC: + case BFD_RELOC_ARM_ALU_SB_G1: + case BFD_RELOC_ARM_ALU_SB_G2: + assert (!fixP->fx_done); + if (!seg->use_rela_p) + { + bfd_vma insn; + bfd_vma encoded_addend; + bfd_vma addend_abs = abs (value); + + /* Check that the absolute value of the addend can be + expressed as an 8-bit constant plus a rotation. */ + encoded_addend = encode_arm_immediate (addend_abs); + if (encoded_addend == (unsigned int) FAIL) + as_bad_where (fixP->fx_file, fixP->fx_line, + _("the offset 0x%08lX is not representable"), + addend_abs); + + /* Extract the instruction. */ + insn = md_chars_to_number (buf, INSN_SIZE); + + /* If the addend is positive, use an ADD instruction. + Otherwise use a SUB. Take care not to destroy the S bit. */ + insn &= 0xff1fffff; + if (value < 0) + insn |= 1 << 22; + else + insn |= 1 << 23; + + /* Place the encoded addend into the first 12 bits of the + instruction. */ + insn &= 0xfffff000; + insn |= encoded_addend; + + /* Update the instruction. */ + md_number_to_chars (buf, insn, INSN_SIZE); + } + break; + + case BFD_RELOC_ARM_LDR_PC_G0: + case BFD_RELOC_ARM_LDR_PC_G1: + case BFD_RELOC_ARM_LDR_PC_G2: + case BFD_RELOC_ARM_LDR_SB_G0: + case BFD_RELOC_ARM_LDR_SB_G1: + case BFD_RELOC_ARM_LDR_SB_G2: + assert (!fixP->fx_done); + if (!seg->use_rela_p) + { + bfd_vma insn; + bfd_vma addend_abs = abs (value); + + /* Check that the absolute value of the addend can be + encoded in 12 bits. */ + if (addend_abs >= 0x1000) + as_bad_where (fixP->fx_file, fixP->fx_line, + _("bad offset 0x%08lX (only 12 bits available for the magnitude)"), + addend_abs); + + /* Extract the instruction. */ + insn = md_chars_to_number (buf, INSN_SIZE); + + /* If the addend is negative, clear bit 23 of the instruction. + Otherwise set it. */ + if (value < 0) + insn &= ~(1 << 23); + else + insn |= 1 << 23; + + /* Place the absolute value of the addend into the first 12 bits + of the instruction. */ + insn &= 0xfffff000; + insn |= addend_abs; + + /* Update the instruction. */ + md_number_to_chars (buf, insn, INSN_SIZE); + } + break; + + case BFD_RELOC_ARM_LDRS_PC_G0: + case BFD_RELOC_ARM_LDRS_PC_G1: + case BFD_RELOC_ARM_LDRS_PC_G2: + case BFD_RELOC_ARM_LDRS_SB_G0: + case BFD_RELOC_ARM_LDRS_SB_G1: + case BFD_RELOC_ARM_LDRS_SB_G2: + assert (!fixP->fx_done); + if (!seg->use_rela_p) + { + bfd_vma insn; + bfd_vma addend_abs = abs (value); + + /* Check that the absolute value of the addend can be + encoded in 8 bits. 
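The magnitude is then split across the halfword-transfer immediate fields, low nibble into insn bits 0-3 and high nibble into bits 8-11, as the masking below shows. Worked example: an addend of 0x5A puts 5 in bits 8-11 and 0xA in bits 0-3, with bit 23 recording the sign.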
*/ + if (addend_abs >= 0x100) + as_bad_where (fixP->fx_file, fixP->fx_line, + _("bad offset 0x%08lX (only 8 bits available for the magnitude)"), + addend_abs); + + /* Extract the instruction. */ + insn = md_chars_to_number (buf, INSN_SIZE); + + /* If the addend is negative, clear bit 23 of the instruction. + Otherwise set it. */ + if (value < 0) + insn &= ~(1 << 23); + else + insn |= 1 << 23; + + /* Place the first four bits of the absolute value of the addend + into the first 4 bits of the instruction, and the remaining + four into bits 8 .. 11. */ + insn &= 0xfffff0f0; + insn |= (addend_abs & 0xf) | ((addend_abs & 0xf0) << 4); + + /* Update the instruction. */ + md_number_to_chars (buf, insn, INSN_SIZE); + } + break; + + case BFD_RELOC_ARM_LDC_PC_G0: + case BFD_RELOC_ARM_LDC_PC_G1: + case BFD_RELOC_ARM_LDC_PC_G2: + case BFD_RELOC_ARM_LDC_SB_G0: + case BFD_RELOC_ARM_LDC_SB_G1: + case BFD_RELOC_ARM_LDC_SB_G2: + assert (!fixP->fx_done); + if (!seg->use_rela_p) + { + bfd_vma insn; + bfd_vma addend_abs = abs (value); + + /* Check that the absolute value of the addend is a multiple of + four and, when divided by four, fits in 8 bits. */ + if (addend_abs & 0x3) + as_bad_where (fixP->fx_file, fixP->fx_line, + _("bad offset 0x%08lX (must be word-aligned)"), + addend_abs); + + if ((addend_abs >> 2) > 0xff) + as_bad_where (fixP->fx_file, fixP->fx_line, + _("bad offset 0x%08lX (must be an 8-bit number of words)"), + addend_abs); + + /* Extract the instruction. */ + insn = md_chars_to_number (buf, INSN_SIZE); + + /* If the addend is negative, clear bit 23 of the instruction. + Otherwise set it. */ + if (value < 0) + insn &= ~(1 << 23); + else + insn |= 1 << 23; + + /* Place the addend (divided by four) into the first eight + bits of the instruction. */ + insn &= 0xfffffff0; + insn |= addend_abs >> 2; + + /* Update the instruction. 
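For illustration: an addend of 0x28 passes both checks above (it is word-aligned, and 0x28 >> 2 = 0x0A fits in eight bits) and stores 0x0A in insn bits 0-7, with bit 23 again selecting whether the offset is added or subtracted.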
*/ + md_number_to_chars (buf, insn, INSN_SIZE); + } + break; + case BFD_RELOC_UNUSED: default: as_bad_where (fixP->fx_file, fixP->fx_line, @@ -16794,6 +18683,34 @@ tc_gen_reloc (asection *section, fixS *fixp) break; } + case BFD_RELOC_ARM_MOVW: + if (fixp->fx_pcrel) + { + code = BFD_RELOC_ARM_MOVW_PCREL; + break; + } + + case BFD_RELOC_ARM_MOVT: + if (fixp->fx_pcrel) + { + code = BFD_RELOC_ARM_MOVT_PCREL; + break; + } + + case BFD_RELOC_ARM_THUMB_MOVW: + if (fixp->fx_pcrel) + { + code = BFD_RELOC_ARM_THUMB_MOVW_PCREL; + break; + } + + case BFD_RELOC_ARM_THUMB_MOVT: + if (fixp->fx_pcrel) + { + code = BFD_RELOC_ARM_THUMB_MOVT_PCREL; + break; + } + case BFD_RELOC_NONE: case BFD_RELOC_ARM_PCREL_BRANCH: case BFD_RELOC_ARM_PCREL_BLX: @@ -16807,6 +18724,9 @@ tc_gen_reloc (asection *section, fixS *fixp) case BFD_RELOC_THUMB_PCREL_BLX: case BFD_RELOC_VTABLE_ENTRY: case BFD_RELOC_VTABLE_INHERIT: +#ifdef TE_PE + case BFD_RELOC_32_SECREL: +#endif code = fixp->fx_r_type; break; @@ -16831,6 +18751,34 @@ tc_gen_reloc (asection *section, fixS *fixp) case BFD_RELOC_ARM_TLS_LDO32: case BFD_RELOC_ARM_PCREL_CALL: case BFD_RELOC_ARM_PCREL_JUMP: + case BFD_RELOC_ARM_ALU_PC_G0_NC: + case BFD_RELOC_ARM_ALU_PC_G0: + case BFD_RELOC_ARM_ALU_PC_G1_NC: + case BFD_RELOC_ARM_ALU_PC_G1: + case BFD_RELOC_ARM_ALU_PC_G2: + case BFD_RELOC_ARM_LDR_PC_G0: + case BFD_RELOC_ARM_LDR_PC_G1: + case BFD_RELOC_ARM_LDR_PC_G2: + case BFD_RELOC_ARM_LDRS_PC_G0: + case BFD_RELOC_ARM_LDRS_PC_G1: + case BFD_RELOC_ARM_LDRS_PC_G2: + case BFD_RELOC_ARM_LDC_PC_G0: + case BFD_RELOC_ARM_LDC_PC_G1: + case BFD_RELOC_ARM_LDC_PC_G2: + case BFD_RELOC_ARM_ALU_SB_G0_NC: + case BFD_RELOC_ARM_ALU_SB_G0: + case BFD_RELOC_ARM_ALU_SB_G1_NC: + case BFD_RELOC_ARM_ALU_SB_G1: + case BFD_RELOC_ARM_ALU_SB_G2: + case BFD_RELOC_ARM_LDR_SB_G0: + case BFD_RELOC_ARM_LDR_SB_G1: + case BFD_RELOC_ARM_LDR_SB_G2: + case BFD_RELOC_ARM_LDRS_SB_G0: + case BFD_RELOC_ARM_LDRS_SB_G1: + case BFD_RELOC_ARM_LDRS_SB_G2: + case BFD_RELOC_ARM_LDC_SB_G0: + case BFD_RELOC_ARM_LDC_SB_G1: + case BFD_RELOC_ARM_LDC_SB_G2: code = fixp->fx_r_type; break; @@ -16961,6 +18909,14 @@ cons_fix_new_arm (fragS * frag, break; } +#ifdef TE_PE + if (exp->X_op == O_secrel) + { + exp->X_op = O_symbol; + type = BFD_RELOC_32_SECREL; + } +#endif + fix_new_exp (frag, where, (int) size, exp, pcrel, type); } @@ -16994,47 +18950,25 @@ arm_force_relocation (struct fix * fixp) if (fixp->fx_r_type == BFD_RELOC_ARM_IMMEDIATE || fixp->fx_r_type == BFD_RELOC_ARM_OFFSET_IMM || fixp->fx_r_type == BFD_RELOC_ARM_ADRL_IMMEDIATE + || fixp->fx_r_type == BFD_RELOC_ARM_T32_ADD_IMM || fixp->fx_r_type == BFD_RELOC_ARM_T32_IMMEDIATE || fixp->fx_r_type == BFD_RELOC_ARM_T32_IMM12 || fixp->fx_r_type == BFD_RELOC_ARM_T32_ADD_PC12) return 0; - return generic_force_reloc (fixp); -} - -#ifdef OBJ_COFF -bfd_boolean -arm_fix_adjustable (fixS * fixP) -{ - /* This is a little hack to help the gas/arm/adrl.s test. It prevents - local labels from being added to the output symbol table when they - are used with the ADRL pseudo op. The ADRL relocation should always - be resolved before the binbary is emitted, so it is safe to say that - it is adjustable. */ - if (fixP->fx_r_type == BFD_RELOC_ARM_ADRL_IMMEDIATE) + /* Always leave these relocations for the linker. */ + if ((fixp->fx_r_type >= BFD_RELOC_ARM_ALU_PC_G0_NC + && fixp->fx_r_type <= BFD_RELOC_ARM_LDC_SB_G2) + || fixp->fx_r_type == BFD_RELOC_ARM_LDR_PC_G0) return 1; - /* This is a hack for the gas/all/redef2.s test. 
This test causes symbols - to be cloned, and without this test relocs would still be generated - against the original, pre-cloned symbol. Such symbols would not appear - in the symbol table however, and so a valid reloc could not be - generated. So check to see if the fixup is against a symbol which has - been removed from the symbol chain, and if it is, then allow it to be - adjusted into a reloc against a section symbol. */ - if (fixP->fx_addsy != NULL - && ! S_IS_LOCAL (fixP->fx_addsy) - && symbol_next (fixP->fx_addsy) == NULL - && symbol_next (fixP->fx_addsy) == symbol_previous (fixP->fx_addsy)) - return 1; - - return 0; + return generic_force_reloc (fixp); } -#endif -#ifdef OBJ_ELF -/* Relocations against Thumb function names must be left unadjusted, - so that the linker can use this information to correctly set the - bottom bit of their addresses. The MIPS version of this function +#if defined (OBJ_ELF) || defined (OBJ_COFF) +/* Relocations against function names must be left unadjusted, + so that the linker can use this information to generate interworking + stubs. The MIPS version of this function also prevents relocations that are mips-16 specific, but I do not know why it does this. @@ -17051,6 +18985,10 @@ arm_fix_adjustable (fixS * fixP) if (fixP->fx_addsy == NULL) return 1; + /* Preserve relocations against symbols with function type. */ + if (symbol_get_bfdsym (fixP->fx_addsy)->flags & BSF_FUNCTION) + return 0; + if (THUMB_IS_FUNC (fixP->fx_addsy) && fixP->fx_subsy == NULL) return 0; @@ -17072,8 +19010,17 @@ arm_fix_adjustable (fixS * fixP) || fixP->fx_r_type == BFD_RELOC_ARM_TARGET2) return 0; + /* Similarly for group relocations. */ + if ((fixP->fx_r_type >= BFD_RELOC_ARM_ALU_PC_G0_NC + && fixP->fx_r_type <= BFD_RELOC_ARM_LDC_SB_G2) + || fixP->fx_r_type == BFD_RELOC_ARM_LDR_PC_G0) + return 0; + return 1; } +#endif /* defined (OBJ_ELF) || defined (OBJ_COFF) */ + +#ifdef OBJ_ELF const char * elf32_arm_target_format (void) @@ -17184,14 +19131,15 @@ arm_adjust_symtab (void) elf_sym = elf_symbol (symbol_get_bfdsym (sym)); bind = ELF_ST_BIND (elf_sym->internal_elf_sym.st_info); - if (! bfd_is_arm_mapping_symbol_name (elf_sym->symbol.name)) + if (! bfd_is_arm_special_symbol_name (elf_sym->symbol.name, + BFD_ARM_SPECIAL_SYM_TYPE_ANY)) { /* If it's a .thumb_func, declare it as so, otherwise tag label as .code 16. */ if (THUMB_IS_FUNC (sym)) elf_sym->internal_elf_sym.st_info = ELF_ST_INFO (bind, STT_ARM_TFUNC); - else + else if (EF_ARM_EABI_VERSION (meabi_flags) < EF_ARM_EABI_VER4) elf_sym->internal_elf_sym.st_info = ELF_ST_INFO (bind, STT_ARM_16BIT); } @@ -17212,6 +19160,16 @@ set_constant_flonums (void) abort (); } +/* Auto-select Thumb mode if it's the only available instruction set for the + given architecture. */ + +static void +autoselect_thumb_from_cpu_variant (void) +{ + if (!ARM_CPU_HAS_FEATURE (cpu_variant, arm_ext_v1)) + opcode_select (16); +} + void md_begin (void) { @@ -17312,6 +19270,8 @@ md_begin (void) ARM_MERGE_FEATURE_SETS (cpu_variant, *mcpu_cpu_opt, *mfpu_opt); + autoselect_thumb_from_cpu_variant (); + arm_arch_used = thumb_arch_used = arm_arch_none; #if defined OBJ_COFF || defined OBJ_ELF @@ -17387,7 +19347,9 @@ md_begin (void) #endif /* Record the CPU type as well. 
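The tests below run from the most specific coprocessor extension down to the base architectures, so a CPU with iWMMXt2 is reported as bfd_mach_arm_iWMMXt2 rather than as plain iWMMXt or XScale.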
*/ - if (ARM_CPU_HAS_FEATURE (cpu_variant, arm_cext_iwmmxt)) + if (ARM_CPU_HAS_FEATURE (cpu_variant, arm_cext_iwmmxt2)) + mach = bfd_mach_arm_iWMMXt2; + else if (ARM_CPU_HAS_FEATURE (cpu_variant, arm_cext_iwmmxt)) mach = bfd_mach_arm_iWMMXt; else if (ARM_CPU_HAS_FEATURE (cpu_variant, arm_cext_xscale)) mach = bfd_mach_arm_XScale; @@ -17766,6 +19728,7 @@ static const struct arm_cpu_option_table arm_cpus[] = {"xscale", ARM_ARCH_XSCALE, FPU_ARCH_VFP_V2, NULL}, /* ??? iwmmxt is not a processor. */ {"iwmmxt", ARM_ARCH_IWMMXT, FPU_ARCH_VFP_V2, NULL}, + {"iwmmxt2", ARM_ARCH_IWMMXT2,FPU_ARCH_VFP_V2, NULL}, {"i80200", ARM_ARCH_XSCALE, FPU_ARCH_VFP_V2, NULL}, /* Maverick */ {"ep9312", ARM_FEATURE(ARM_AEXT_V4T, ARM_CEXT_MAVERICK), FPU_ARCH_MAVERICK, "ARM920T"}, @@ -17815,6 +19778,7 @@ static const struct arm_arch_option_table arm_archs[] = {"armv7m", ARM_ARCH_V7M, FPU_ARCH_VFP}, {"xscale", ARM_ARCH_XSCALE, FPU_ARCH_VFP}, {"iwmmxt", ARM_ARCH_IWMMXT, FPU_ARCH_VFP}, + {"iwmmxt2", ARM_ARCH_IWMMXT2,FPU_ARCH_VFP}, {NULL, ARM_ARCH_NONE, ARM_ARCH_NONE} }; @@ -17830,6 +19794,7 @@ static const struct arm_option_cpu_value_table arm_extensions[] = {"maverick", ARM_FEATURE (0, ARM_CEXT_MAVERICK)}, {"xscale", ARM_FEATURE (0, ARM_CEXT_XSCALE)}, {"iwmmxt", ARM_FEATURE (0, ARM_CEXT_IWMMXT)}, + {"iwmmxt2", ARM_FEATURE (0, ARM_CEXT_IWMMXT2)}, {NULL, ARM_ARCH_NONE} }; @@ -18254,7 +20219,13 @@ aeabi_set_public_attributes (void) ARM_MERGE_FEATURE_SETS (flags, arm_arch_used, thumb_arch_used); ARM_MERGE_FEATURE_SETS (flags, flags, *mfpu_opt); ARM_MERGE_FEATURE_SETS (flags, flags, selected_cpu); - + /*Allow the user to override the reported architecture. */ + if (object_arch) + { + ARM_CLEAR_FEATURE (flags, flags, arm_arch_any); + ARM_MERGE_FEATURE_SETS (flags, flags, *object_arch); + } + tmp = flags; arch = 0; for (p = cpu_arch_ver; p->val; p++) @@ -18418,6 +20389,37 @@ s_arm_arch (int ignored ATTRIBUTE_UNUSED) } +/* Parse a .object_arch directive. */ + +static void +s_arm_object_arch (int ignored ATTRIBUTE_UNUSED) +{ + const struct arm_arch_option_table *opt; + char saved_char; + char *name; + + name = input_line_pointer; + while (*input_line_pointer && !ISSPACE(*input_line_pointer)) + input_line_pointer++; + saved_char = *input_line_pointer; + *input_line_pointer = 0; + + /* Skip the first "all" entry. */ + for (opt = arm_archs + 1; opt->name != NULL; opt++) + if (streq (opt->name, name)) + { + object_arch = &opt->value; + *input_line_pointer = saved_char; + demand_empty_rest_of_line (); + return; + } + + as_bad (_("unknown architecture `%s'\n"), name); + *input_line_pointer = saved_char; + ignore_rest_of_line (); +} + + /* Parse a .fpu directive. */ static void