X-Git-Url: http://drtracing.org/?a=blobdiff_plain;f=gas%2Fconfig%2Ftc-arm.c;h=9a367ca72895f268400bc5b3007ba8d89c081bd0;hb=c4a23bf878f2e9a64034006c91596401faf6db3e;hp=d2f04d742795746a4a31594fb511788cb49cc421;hpb=35c228db7089caf9525c1ef4cb35f6a8335eeea9;p=deliverable%2Fbinutils-gdb.git diff --git a/gas/config/tc-arm.c b/gas/config/tc-arm.c index d2f04d7427..9a367ca728 100644 --- a/gas/config/tc-arm.c +++ b/gas/config/tc-arm.c @@ -508,8 +508,12 @@ struct arm_it struct neon_type_el vectype; unsigned present : 1; /* Operand present. */ unsigned isreg : 1; /* Operand was a register. */ - unsigned immisreg : 1; /* .imm field is a second register. */ - unsigned isscalar : 1; /* Operand is a (Neon) scalar. */ + unsigned immisreg : 2; /* .imm field is a second register. + 0: imm, 1: gpr, 2: MVE Q-register. */ + unsigned isscalar : 2; /* Operand is a (SIMD) scalar: + 0) not scalar, + 1) Neon scalar, + 2) MVE scalar. */ unsigned immisalign : 1; /* Immediate is an alignment specifier. */ unsigned immisfloat : 1; /* Immediate was parsed as a float. */ /* Note: we abuse "regisimm" to mean "is Neon register" in VMOV @@ -518,6 +522,7 @@ struct arm_it unsigned isvec : 1; /* Is a single, double or quad VFP/Neon reg. */ unsigned isquad : 1; /* Operand is SIMD quad register. */ unsigned issingle : 1; /* Operand is VFP single-precision register. */ + unsigned iszr : 1; /* Operand is ZR register. */ unsigned hasreloc : 1; /* Operand has relocation suffix. */ unsigned writeback : 1; /* Operand has trailing ! */ unsigned preind : 1; /* Preindexed address. */ @@ -642,6 +647,7 @@ enum arm_reg_type REG_TYPE_MMXWCG, REG_TYPE_XSCALE, REG_TYPE_RNB, + REG_TYPE_ZR }; /* Structure for a hash table entry for a register. @@ -894,6 +900,7 @@ struct asm_opcode #define BAD_MVE_SRCDEST _("Warning: 32-bit element size and same destination "\ "and source operands makes instruction UNPREDICTABLE") #define BAD_EL_TYPE _("bad element type for instruction") +#define MVE_BAD_QREG _("MVE vector register Q[0..7] expected") static struct hash_control * arm_ops_hsh; static struct hash_control * arm_cond_hsh; @@ -1002,6 +1009,9 @@ static void it_fsm_post_encode (void); } \ while (0) +/* Toggle value[pos]. */ +#define TOGGLE_BIT(value, pos) (value ^ (1 << pos)) + /* Pure syntax. */ /* This array holds the chars that always start a comment. If the @@ -1027,7 +1037,7 @@ const char EXP_CHARS[] = "eE"; /* As in 0f12.456 */ /* or 0d1.2345e12 */ -const char FLT_CHARS[] = "rRsSfFdDxXeEpP"; +const char FLT_CHARS[] = "rRsSfFdDxXeEpPHh"; /* Prefix characters that indicate the start of an immediate value. 
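   Both '#' and '$' are accepted as that prefix, so "mov r0, #1" and
   "mov r0, $1" assemble identically.  The 'H'/'h' letters added to
   FLT_CHARS above mark half-precision literals, emitted for example by
   the new directives (an illustrative usage sketch, not part of this
   change):

       .float16_format ieee
       .float16 1.0, -2.5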
*/ @@ -1037,6 +1047,16 @@ const char FLT_CHARS[] = "rRsSfFdDxXeEpP"; #define skip_whitespace(str) do { if (*(str) == ' ') ++(str); } while (0) +enum fp_16bit_format +{ + ARM_FP16_FORMAT_IEEE = 0x1, + ARM_FP16_FORMAT_ALTERNATIVE = 0x2, + ARM_FP16_FORMAT_DEFAULT = 0x3 +}; + +static enum fp_16bit_format fp16_format = ARM_FP16_FORMAT_DEFAULT; + + static inline int skip_past_char (char ** str, char c) { @@ -1178,6 +1198,11 @@ md_atof (int type, char * litP, int * sizeP) switch (type) { + case 'H': + case 'h': + prec = 1; + break; + case 'f': case 'F': case 's': @@ -1212,34 +1237,29 @@ md_atof (int type, char * litP, int * sizeP) input_line_pointer = t; *sizeP = prec * sizeof (LITTLENUM_TYPE); - if (target_big_endian) - { - for (i = 0; i < prec; i++) - { - md_number_to_chars (litP, (valueT) words[i], sizeof (LITTLENUM_TYPE)); - litP += sizeof (LITTLENUM_TYPE); - } - } + if (target_big_endian || prec == 1) + for (i = 0; i < prec; i++) + { + md_number_to_chars (litP, (valueT) words[i], sizeof (LITTLENUM_TYPE)); + litP += sizeof (LITTLENUM_TYPE); + } + else if (ARM_CPU_HAS_FEATURE (cpu_variant, fpu_endian_pure)) + for (i = prec - 1; i >= 0; i--) + { + md_number_to_chars (litP, (valueT) words[i], sizeof (LITTLENUM_TYPE)); + litP += sizeof (LITTLENUM_TYPE); + } else - { - if (ARM_CPU_HAS_FEATURE (cpu_variant, fpu_endian_pure)) - for (i = prec - 1; i >= 0; i--) - { - md_number_to_chars (litP, (valueT) words[i], sizeof (LITTLENUM_TYPE)); - litP += sizeof (LITTLENUM_TYPE); - } - else - /* For a 4 byte float the order of elements in `words' is 1 0. - For an 8 byte float the order is 1 0 3 2. */ - for (i = 0; i < prec; i += 2) - { - md_number_to_chars (litP, (valueT) words[i + 1], - sizeof (LITTLENUM_TYPE)); - md_number_to_chars (litP + sizeof (LITTLENUM_TYPE), - (valueT) words[i], sizeof (LITTLENUM_TYPE)); - litP += 2 * sizeof (LITTLENUM_TYPE); - } - } + /* For a 4 byte float the order of elements in `words' is 1 0. + For an 8 byte float the order is 1 0 3 2. */ + for (i = 0; i < prec; i += 2) + { + md_number_to_chars (litP, (valueT) words[i + 1], + sizeof (LITTLENUM_TYPE)); + md_number_to_chars (litP + sizeof (LITTLENUM_TYPE), + (valueT) words[i], sizeof (LITTLENUM_TYPE)); + litP += 2 * sizeof (LITTLENUM_TYPE); + } return NULL; } @@ -1652,9 +1672,14 @@ parse_typed_reg_or_scalar (char **ccp, enum arm_reg_type type, { if (type != REG_TYPE_VFD && !(type == REG_TYPE_VFS - && ARM_CPU_HAS_FEATURE (cpu_variant, arm_ext_v8_2))) + && ARM_CPU_HAS_FEATURE (cpu_variant, arm_ext_v8_2)) + && !(type == REG_TYPE_NQ + && ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext))) { - first_error (_("only D registers may be indexed")); + if (ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext)) + first_error (_("only D and Q registers may be indexed")); + else + first_error (_("only D registers may be indexed")); return FAIL; } @@ -1743,27 +1768,41 @@ arm_typed_reg_parse (char **ccp, enum arm_reg_type type, just do easy checks here, and do further checks later. 
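   For instance "d3[1]" is a Neon D-register scalar, and with MVE
   "q2[5]" is now also acceptable; the new reg_type argument below picks
   the register file to accept (REG_TYPE_VFS, REG_TYPE_VFD or
   REG_TYPE_MQ), and the matching reg_size (32, 64 or 128 bits) bounds
   the lane index for the given element size.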
*/ static int -parse_scalar (char **ccp, int elsize, struct neon_type_el *type) +parse_scalar (char **ccp, int elsize, struct neon_type_el *type, enum + arm_reg_type reg_type) { int reg; char *str = *ccp; struct neon_typed_alias atype; - enum arm_reg_type reg_type = REG_TYPE_VFD; - - if (elsize == 4) - reg_type = REG_TYPE_VFS; + unsigned reg_size; reg = parse_typed_reg_or_scalar (&str, reg_type, NULL, &atype); + switch (reg_type) + { + case REG_TYPE_VFS: + reg_size = 32; + break; + case REG_TYPE_VFD: + reg_size = 64; + break; + case REG_TYPE_MQ: + reg_size = 128; + break; + default: + gas_assert (0); + return FAIL; + } + if (reg == FAIL || (atype.defined & NTA_HASINDEX) == 0) return FAIL; - if (atype.index == NEON_ALL_LANES) + if (reg_type != REG_TYPE_MQ && atype.index == NEON_ALL_LANES) { first_error (_("scalar must have an index")); return FAIL; } - else if (atype.index >= 64 / elsize) + else if (atype.index >= reg_size / elsize) { first_error (_("scalar index out of range")); return FAIL; @@ -4896,6 +4935,55 @@ pe_directive_secrel (int dummy ATTRIBUTE_UNUSED) } #endif /* TE_PE */ +int +arm_is_largest_exponent_ok (int precision) +{ + /* precision == 1 ensures that this will only return + true for 16 bit floats. */ + return (precision == 1) && (fp16_format == ARM_FP16_FORMAT_ALTERNATIVE); +} + +static void +set_fp16_format (int dummy ATTRIBUTE_UNUSED) +{ + char saved_char; + char* name; + enum fp_16bit_format new_format; + + new_format = ARM_FP16_FORMAT_DEFAULT; + + name = input_line_pointer; + while (*input_line_pointer && !ISSPACE (*input_line_pointer)) + input_line_pointer++; + + saved_char = *input_line_pointer; + *input_line_pointer = 0; + + if (strcasecmp (name, "ieee") == 0) + new_format = ARM_FP16_FORMAT_IEEE; + else if (strcasecmp (name, "alternative") == 0) + new_format = ARM_FP16_FORMAT_ALTERNATIVE; + else + { + as_bad (_("unrecognised float16 format \"%s\""), name); + goto cleanup; + } + + /* Only set fp16_format if it is still the default (aka not already + been set yet). */ + if (fp16_format == ARM_FP16_FORMAT_DEFAULT) + fp16_format = new_format; + else + { + if (new_format != fp16_format) + as_warn (_("float16 format cannot be set more than once, ignoring.")); + } + +cleanup: + *input_line_pointer = saved_char; + ignore_rest_of_line (); +} + /* This table describes all the machine specific pseudo-ops the assembler has to support. The fields are: pseudo-op name without dot @@ -4973,9 +5061,12 @@ const pseudo_typeS md_pseudo_table[] = {"asmfunc", s_ccs_asmfunc, 0}, {"endasmfunc", s_ccs_endasmfunc, 0}, + {"float16", float_cons, 'h' }, + {"float16_format", set_fp16_format, 0 }, + { 0, 0, 0 } }; - + /* Parser functions used exclusively in instruction operands. */ /* Generic immediate-value read function for use in insn parsing. @@ -5289,7 +5380,7 @@ parse_qfloat_immediate (char **ccp, int *immed) /* Shift operands. */ enum shift_kind { - SHIFT_LSL, SHIFT_LSR, SHIFT_ASR, SHIFT_ROR, SHIFT_RRX + SHIFT_LSL, SHIFT_LSR, SHIFT_ASR, SHIFT_ROR, SHIFT_RRX, SHIFT_UXTW }; struct asm_shift_name @@ -5306,6 +5397,7 @@ enum parse_shift_mode SHIFT_LSL_OR_ASR_IMMEDIATE, /* Shift must be LSL or ASR immediate. */ SHIFT_ASR_IMMEDIATE, /* Shift must be ASR immediate. */ SHIFT_LSL_IMMEDIATE, /* Shift must be LSL immediate. */ + SHIFT_UXTW_IMMEDIATE /* Shift must be UXTW immediate. */ }; /* Parse a specifier on an ARM data processing instruction. 
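(The parse_shift/parse_address_main hunks below add a fifth shift kind,
UXTW, which is only valid in the MVE vector scatter/gather addressing
forms; SHIFT_UXTW_IMMEDIATE lets the address parser demand exactly that
shift.  An illustrative example:

    vldrw.u32 q0, [r0, q1, uxtw #2]

Any other shift in that position is rejected with "'UXTW' required".)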
@@ -5350,7 +5442,13 @@ parse_shift (char **str, int i, enum parse_shift_mode mode) switch (mode) { case NO_SHIFT_RESTRICT: - case SHIFT_IMMEDIATE: break; + case SHIFT_IMMEDIATE: + if (shift == SHIFT_UXTW) + { + inst.error = _("'UXTW' not allowed here"); + return FAIL; + } + break; case SHIFT_LSL_OR_ASR_IMMEDIATE: if (shift != SHIFT_LSL && shift != SHIFT_ASR) @@ -5375,6 +5473,13 @@ parse_shift (char **str, int i, enum parse_shift_mode mode) return FAIL; } break; + case SHIFT_UXTW_IMMEDIATE: + if (shift != SHIFT_UXTW) + { + inst.error = _("'UXTW' required"); + return FAIL; + } + break; default: abort (); } @@ -5744,7 +5849,21 @@ parse_address_main (char **str, int i, int group_relocations, /* PR gas/14887: Allow for whitespace after the opening bracket. */ skip_whitespace (p); - if ((reg = arm_reg_parse (&p, REG_TYPE_RN)) == FAIL) + if (group_type == GROUP_MVE) + { + enum arm_reg_type rtype = REG_TYPE_MQ; + struct neon_type_el et; + if ((reg = arm_typed_reg_parse (&p, rtype, &rtype, &et)) != FAIL) + { + inst.operands[i].isquad = 1; + } + else if ((reg = arm_reg_parse (&p, REG_TYPE_RN)) == FAIL) + { + inst.error = BAD_ADDR_MODE; + return PARSE_OPERAND_FAIL; + } + } + else if ((reg = arm_reg_parse (&p, REG_TYPE_RN)) == FAIL) { if (group_type == GROUP_MVE) inst.error = BAD_ADDR_MODE; @@ -5762,7 +5881,26 @@ parse_address_main (char **str, int i, int group_relocations, if (*p == '+') p++; else if (*p == '-') p++, inst.operands[i].negative = 1; - if ((reg = arm_reg_parse (&p, REG_TYPE_RN)) != FAIL) + enum arm_reg_type rtype = REG_TYPE_MQ; + struct neon_type_el et; + if (group_type == GROUP_MVE + && (reg = arm_typed_reg_parse (&p, rtype, &rtype, &et)) != FAIL) + { + inst.operands[i].immisreg = 2; + inst.operands[i].imm = reg; + + if (skip_past_comma (&p) == SUCCESS) + { + if (parse_shift (&p, i, SHIFT_UXTW_IMMEDIATE) == SUCCESS) + { + inst.operands[i].imm |= inst.relocs[0].exp.X_add_number << 5; + inst.relocs[0].exp.X_add_number = 0; + } + else + return PARSE_OPERAND_FAIL; + } + } + else if ((reg = arm_reg_parse (&p, REG_TYPE_RN)) != FAIL) { inst.operands[i].imm = reg; inst.operands[i].immisreg = 1; @@ -5919,7 +6057,15 @@ parse_address_main (char **str, int i, int group_relocations, if (*p == '+') p++; else if (*p == '-') p++, inst.operands[i].negative = 1; - if ((reg = arm_reg_parse (&p, REG_TYPE_RN)) != FAIL) + enum arm_reg_type rtype = REG_TYPE_MQ; + struct neon_type_el et; + if (group_type == GROUP_MVE + && (reg = arm_typed_reg_parse (&p, rtype, &rtype, &et)) != FAIL) + { + inst.operands[i].immisreg = 2; + inst.operands[i].imm = reg; + } + else if ((reg = arm_reg_parse (&p, REG_TYPE_RN)) != FAIL) { /* We might be using the immediate for alignment already. If we are, OR the register number into the low-order bits. */ @@ -6483,7 +6629,61 @@ parse_neon_mov (char **str, int *which_operand) char *ptr = *str; struct neon_type_el optype; - if ((val = parse_scalar (&ptr, 8, &optype)) != FAIL) + if ((val = parse_scalar (&ptr, 8, &optype, REG_TYPE_MQ)) != FAIL) + { + /* Cases 17 or 19. */ + inst.operands[i].reg = val; + inst.operands[i].isvec = 1; + inst.operands[i].isscalar = 2; + inst.operands[i].vectype = optype; + inst.operands[i++].present = 1; + + if (skip_past_comma (&ptr) == FAIL) + goto wanted_comma; + + if ((val = arm_reg_parse (&ptr, REG_TYPE_RN)) != FAIL) + { + /* Case 17: VMOV.
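<dt> <Qd[idx]>, <Rt>, an MVE vector-lane write such as
	     "vmov.16 q0[2], r1"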
, */ + inst.operands[i].reg = val; + inst.operands[i].isreg = 1; + inst.operands[i].present = 1; + } + else if ((val = parse_scalar (&ptr, 8, &optype, REG_TYPE_MQ)) != FAIL) + { + /* Case 19: VMOV , , , */ + inst.operands[i].reg = val; + inst.operands[i].isvec = 1; + inst.operands[i].isscalar = 2; + inst.operands[i].vectype = optype; + inst.operands[i++].present = 1; + + if (skip_past_comma (&ptr) == FAIL) + goto wanted_comma; + + if ((val = arm_reg_parse (&ptr, REG_TYPE_RN)) == FAIL) + goto wanted_arm; + + inst.operands[i].reg = val; + inst.operands[i].isreg = 1; + inst.operands[i++].present = 1; + + if (skip_past_comma (&ptr) == FAIL) + goto wanted_comma; + + if ((val = arm_reg_parse (&ptr, REG_TYPE_RN)) == FAIL) + goto wanted_arm; + + inst.operands[i].reg = val; + inst.operands[i].isreg = 1; + inst.operands[i].present = 1; + } + else + { + first_error (_("expected ARM or MVE vector register")); + return FAIL; + } + } + else if ((val = parse_scalar (&ptr, 8, &optype, REG_TYPE_VFD)) != FAIL) { /* Case 4: VMOV. , . */ inst.operands[i].reg = val; @@ -6501,8 +6701,10 @@ parse_neon_mov (char **str, int *which_operand) inst.operands[i].isreg = 1; inst.operands[i].present = 1; } - else if ((val = arm_typed_reg_parse (&ptr, REG_TYPE_NSDQ, &rtype, &optype)) - != FAIL) + else if (((val = arm_typed_reg_parse (&ptr, REG_TYPE_NSDQ, &rtype, &optype)) + != FAIL) + || ((val = arm_typed_reg_parse (&ptr, REG_TYPE_MQ, &rtype, &optype)) + != FAIL)) { /* Cases 0, 1, 2, 3, 5 (D only). */ if (skip_past_comma (&ptr) == FAIL) @@ -6541,8 +6743,10 @@ parse_neon_mov (char **str, int *which_operand) inst.operands[i].present = 1; } } - else if ((val = arm_typed_reg_parse (&ptr, REG_TYPE_NSDQ, &rtype, - &optype)) != FAIL) + else if (((val = arm_typed_reg_parse (&ptr, REG_TYPE_NSDQ, &rtype, + &optype)) != FAIL) + || ((val = arm_typed_reg_parse (&ptr, REG_TYPE_MQ, &rtype, + &optype)) != FAIL)) { /* Case 0: VMOV , Case 1: VMOV
<c><q> <Dd>, <Dm>
@@ -6599,7 +6803,7 @@
 	}
       else if ((val = arm_reg_parse (&ptr, REG_TYPE_RN)) != FAIL)
 	{
-	  /* Cases 6, 7.  */
+	  /* Cases 6, 7, 16, 18.  */
 	  inst.operands[i].reg = val;
 	  inst.operands[i].isreg = 1;
 	  inst.operands[i++].present = 1;
@@ -6607,7 +6811,15 @@
       if (skip_past_comma (&ptr) == FAIL)
 	goto wanted_comma;
 
-      if ((val = parse_scalar (&ptr, 8, &optype)) != FAIL)
+      if ((val = parse_scalar (&ptr, 8, &optype, REG_TYPE_MQ)) != FAIL)
+	{
+	  /* Case 18: VMOV<c>.<dt> <Rt>, <Qn[idx]>  */
+	  inst.operands[i].reg = val;
+	  inst.operands[i].isscalar = 2;
+	  inst.operands[i].present = 1;
+	  inst.operands[i].vectype = optype;
+	}
+      else if ((val = parse_scalar (&ptr, 8, &optype, REG_TYPE_VFD)) != FAIL)
 	{
 	  /* Case 6: VMOV<c><q>.<dt> <Dd[x]>
, */ inst.operands[i].reg = val; @@ -6617,7 +6829,6 @@ parse_neon_mov (char **str, int *which_operand) } else if ((val = arm_reg_parse (&ptr, REG_TYPE_RN)) != FAIL) { - /* Case 7: VMOV , , */ inst.operands[i].reg = val; inst.operands[i].isreg = 1; inst.operands[i++].present = 1; @@ -6626,37 +6837,70 @@ parse_neon_mov (char **str, int *which_operand) goto wanted_comma; if ((val = arm_typed_reg_parse (&ptr, REG_TYPE_VFSD, &rtype, &optype)) - == FAIL) + != FAIL) { - first_error (_(reg_expected_msgs[REG_TYPE_VFSD])); - return FAIL; - } - - inst.operands[i].reg = val; - inst.operands[i].isreg = 1; - inst.operands[i].isvec = 1; - inst.operands[i].issingle = (rtype == REG_TYPE_VFS); - inst.operands[i].vectype = optype; - inst.operands[i].present = 1; + /* Case 7: VMOV , , */ - if (rtype == REG_TYPE_VFS) - { - /* Case 14. */ - i++; - if (skip_past_comma (&ptr) == FAIL) - goto wanted_comma; - if ((val = arm_typed_reg_parse (&ptr, REG_TYPE_VFS, NULL, - &optype)) == FAIL) - { - first_error (_(reg_expected_msgs[REG_TYPE_VFS])); - return FAIL; - } inst.operands[i].reg = val; inst.operands[i].isreg = 1; inst.operands[i].isvec = 1; - inst.operands[i].issingle = 1; + inst.operands[i].issingle = (rtype == REG_TYPE_VFS); inst.operands[i].vectype = optype; inst.operands[i].present = 1; + + if (rtype == REG_TYPE_VFS) + { + /* Case 14. */ + i++; + if (skip_past_comma (&ptr) == FAIL) + goto wanted_comma; + if ((val = arm_typed_reg_parse (&ptr, REG_TYPE_VFS, NULL, + &optype)) == FAIL) + { + first_error (_(reg_expected_msgs[REG_TYPE_VFS])); + return FAIL; + } + inst.operands[i].reg = val; + inst.operands[i].isreg = 1; + inst.operands[i].isvec = 1; + inst.operands[i].issingle = 1; + inst.operands[i].vectype = optype; + inst.operands[i].present = 1; + } + } + else + { + if ((val = parse_scalar (&ptr, 8, &optype, REG_TYPE_MQ)) + != FAIL) + { + /* Case 16: VMOV , , , */ + inst.operands[i].reg = val; + inst.operands[i].isvec = 1; + inst.operands[i].isscalar = 2; + inst.operands[i].vectype = optype; + inst.operands[i++].present = 1; + + if (skip_past_comma (&ptr) == FAIL) + goto wanted_comma; + + if ((val = parse_scalar (&ptr, 8, &optype, REG_TYPE_MQ)) + == FAIL) + { + first_error (_(reg_expected_msgs[REG_TYPE_MQ])); + return FAIL; + } + inst.operands[i].reg = val; + inst.operands[i].isvec = 1; + inst.operands[i].isscalar = 2; + inst.operands[i].vectype = optype; + inst.operands[i].present = 1; + } + else + { + first_error (_("VFP single, double or MVE vector register" + " expected")); + return FAIL; + } } } else if ((val = arm_typed_reg_parse (&ptr, REG_TYPE_VFS, NULL, &optype)) @@ -6720,9 +6964,12 @@ enum operand_parse_code OP_RNQ, /* Neon quad precision register */ OP_RNQMQ, /* Neon quad or MVE vector register. */ OP_RVSD, /* VFP single or double precision register */ + OP_RVSD_COND, /* VFP single, double precision register or condition code. */ + OP_RVSDMQ, /* VFP single, double precision or MVE vector register. */ OP_RNSD, /* Neon single or double precision register */ OP_RNDQ, /* Neon double or quad precision register */ OP_RNDQMQ, /* Neon double, quad or MVE vector register. */ + OP_RNDQMQR, /* Neon double, quad, MVE vector or ARM register. */ OP_RNSDQ, /* Neon single, double or quad precision register */ OP_RNSC, /* Neon scalar D[X] */ OP_RVC, /* VFP control register */ @@ -6742,12 +6989,15 @@ enum operand_parse_code OP_RNSDQMQR, /* Neon single, double or quad register, MVE vector register or GPR (no SP/SP) */ OP_RMQ, /* MVE vector register. */ + OP_RMQRZ, /* MVE vector or ARM register including ZR. 
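   ZR is the Armv8.1-M zero register: spelt "zr", it shares encoding 15
   with the PC (see REGDEF (zr, 15, ZR) further down) and reads as zero
   in the conditional-execution instructions.  That is also why
   "cset r0, eq" encodes as "csinc r0, zr, zr, ne": do_t_cond sets
   Rn = Rm = REG_PC and inverts the low bit of the condition with
   TOGGLE_BIT (cond, 0), turning 0x0 (eq) into 0x1 (ne).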
*/ + OP_RMQRR, /* MVE vector or ARM register. */ /* New operands for Armv8.1-M Mainline. */ OP_LR, /* ARM LR register */ OP_RRe, /* ARM register, only even numbered. */ OP_RRo, /* ARM register, only odd numbered, not r13 or r15. */ OP_RRnpcsp_I32, /* ARM register (no BadReg) or literal 1 .. 32 */ + OP_RR_ZR, /* ARM register or ZR but no PC */ OP_REGLST, /* ARM register list */ OP_CLRMLST, /* CLRM register list */ @@ -6763,16 +7013,28 @@ enum operand_parse_code OP_RNDQ_I0, /* Neon D or Q reg, or immediate zero. */ OP_RVSD_I0, /* VFP S or D reg, or immediate zero. */ OP_RSVD_FI0, /* VFP S or D reg, or floating point immediate zero. */ + OP_RSVDMQ_FI0, /* VFP S, D, MVE vector register or floating point immediate + zero. */ OP_RR_RNSC, /* ARM reg or Neon scalar. */ OP_RNSD_RNSC, /* Neon S or D reg, or Neon scalar. */ OP_RNSDQ_RNSC, /* Vector S, D or Q reg, or Neon scalar. */ OP_RNSDQ_RNSC_MQ, /* Vector S, D or Q reg, Neon scalar or MVE vector register. */ + OP_RNSDQ_RNSC_MQ_RR, /* Vector S, D or Q reg, or MVE vector reg , or Neon + scalar, or ARM register. */ OP_RNDQ_RNSC, /* Neon D or Q reg, or Neon scalar. */ + OP_RNDQ_RNSC_RR, /* Neon D or Q reg, Neon scalar, or ARM register. */ + OP_RNDQMQ_RNSC_RR, /* Neon D or Q reg, Neon scalar, MVE vector or ARM + register. */ + OP_RNDQMQ_RNSC, /* Neon D, Q or MVE vector reg, or Neon scalar. */ OP_RND_RNSC, /* Neon D reg, or Neon scalar. */ OP_VMOV, /* Neon VMOV operands. */ OP_RNDQ_Ibig, /* Neon D or Q reg, or big immediate for logic and VMVN. */ + /* Neon D, Q or MVE vector register, or big immediate for logic and VMVN. */ + OP_RNDQMQ_Ibig, OP_RNDQ_I63b, /* Neon D or Q reg, or immediate for shift. */ + OP_RNDQMQ_I63b_RR, /* Neon D or Q reg, immediate for shift, MVE vector or + ARM register. */ OP_RIWR_I32z, /* iWMMXt wR register, or immediate 0 .. 32 for iWMMXt2. */ OP_VLDR, /* VLDR operand. */ @@ -6785,6 +7047,7 @@ enum operand_parse_code OP_I31w, /* 0 .. 31, optional trailing ! */ OP_I32, /* 1 .. 32 */ OP_I32z, /* 0 .. 32 */ + OP_I48_I64, /* 48 or 64 */ OP_I63, /* 0 .. 63 */ OP_I63s, /* -64 .. 63 */ OP_I64, /* 1 .. 64 */ @@ -6853,6 +7116,8 @@ enum operand_parse_code OP_oROR, /* ROR 0/8/16/24 */ OP_oBARRIER_I15, /* Option argument for a barrier instruction. */ + OP_oRMQRZ, /* optional MVE vector or ARM register including ZR. */ + /* Some pre-defined mixed (ARM/THUMB) operands. 
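   Each entry packs an ARM-state operand code together with a
   Thumb-state one, so a single opcode-table entry can impose different
   restrictions per instruction set; OP_RR_npcsp below, for instance,
   accepts SP in ARM state but treats it as a BadReg in Thumb state.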
*/ OP_RR_npcsp = MIX_ARM_THUMB_OPERANDS (OP_RR, OP_RRnpcsp), OP_RRnpc_npcsp = MIX_ARM_THUMB_OPERANDS (OP_RRnpc, OP_RRnpcsp), @@ -6902,6 +7167,7 @@ parse_operands (char *str, const unsigned int *pattern, bfd_boolean thumb) inst.operands[i].isvec = (rtype == REG_TYPE_VFS \ || rtype == REG_TYPE_VFD \ || rtype == REG_TYPE_NQ); \ + inst.operands[i].iszr = (rtype == REG_TYPE_ZR); \ } \ while (0) @@ -6920,6 +7186,7 @@ parse_operands (char *str, const unsigned int *pattern, bfd_boolean thumb) inst.operands[i].isvec = (rtype == REG_TYPE_VFS \ || rtype == REG_TYPE_VFD \ || rtype == REG_TYPE_NQ); \ + inst.operands[i].iszr = (rtype == REG_TYPE_ZR); \ } \ while (0) @@ -6932,10 +7199,30 @@ parse_operands (char *str, const unsigned int *pattern, bfd_boolean thumb) } \ while (0) -#define po_scalar_or_goto(elsz, label) \ +#define po_imm1_or_imm2_or_fail(imm1, imm2, popt) \ + do \ + { \ + expressionS exp; \ + my_get_expression (&exp, &str, popt); \ + if (exp.X_op != O_constant) \ + { \ + inst.error = _("constant expression required"); \ + goto failure; \ + } \ + if (exp.X_add_number != imm1 && exp.X_add_number != imm2) \ + { \ + inst.error = _("immediate value 48 or 64 expected"); \ + goto failure; \ + } \ + inst.operands[i].imm = exp.X_add_number; \ + } \ + while (0) + +#define po_scalar_or_goto(elsz, label, reg_type) \ do \ { \ - val = parse_scalar (& str, elsz, & inst.operands[i].vectype); \ + val = parse_scalar (& str, elsz, & inst.operands[i].vectype, \ + reg_type); \ if (val == FAIL) \ goto label; \ inst.operands[i].reg = val; \ @@ -6992,7 +7279,6 @@ parse_operands (char *str, const unsigned int *pattern, bfd_boolean thumb) if (op_parse_code >= OP_FIRST_OPTIONAL) { /* Remember where we are in case we need to backtrack. */ - gas_assert (!backtrack_pos); backtrack_pos = str; backtrack_error = inst.error; backtrack_index = i; @@ -7034,7 +7320,20 @@ parse_operands (char *str, const unsigned int *pattern, bfd_boolean thumb) break; /* Also accept generic coprocessor regs for unknown registers. */ coproc_reg: - po_reg_or_fail (REG_TYPE_CN); + po_reg_or_goto (REG_TYPE_CN, vpr_po); + break; + /* Also accept P0 or p0 for VPR.P0. Since P0 is already an + existing register with a value of 0, this seems like the + best way to parse P0. 
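   ("P0" already names coprocessor 0 via REGSET (p, CP), so it cannot
   simply be given its own REGDEF; register number 13 is the
   system-register encoding that do_vmrs/do_vmsr expect for P0,
   alongside vpr = 12 and fpscr_nzcvqc = 2.)  Both of these parse
   through this path, for example:

     vmrs r2, p0
     vmsr p0, r2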
*/ + vpr_po: + if (strncasecmp (str, "P0", 2) == 0) + { + str += 2; + inst.operands[i].isreg = 1; + inst.operands[i].reg = 13; + } + else + goto failure; break; case OP_RMF: po_reg_or_fail (REG_TYPE_MVF); break; case OP_RMD: po_reg_or_fail (REG_TYPE_MVD); break; @@ -7053,6 +7352,10 @@ parse_operands (char *str, const unsigned int *pattern, bfd_boolean thumb) try_nq: case OP_RNQ: po_reg_or_fail (REG_TYPE_NQ); break; case OP_RNSD: po_reg_or_fail (REG_TYPE_NSD); break; + case OP_RNDQMQR: + po_reg_or_goto (REG_TYPE_RN, try_rndqmq); + break; + try_rndqmq: case OP_oRNDQMQ: case OP_RNDQMQ: po_reg_or_goto (REG_TYPE_MQ, try_rndq); @@ -7060,7 +7363,14 @@ parse_operands (char *str, const unsigned int *pattern, bfd_boolean thumb) try_rndq: case OP_oRNDQ: case OP_RNDQ: po_reg_or_fail (REG_TYPE_NDQ); break; + case OP_RVSDMQ: + po_reg_or_goto (REG_TYPE_MQ, try_rvsd); + break; + try_rvsd: case OP_RVSD: po_reg_or_fail (REG_TYPE_VFSD); break; + case OP_RVSD_COND: + po_reg_or_goto (REG_TYPE_VFSD, try_cond); + break; case OP_oRNSDQ: case OP_RNSDQ: po_reg_or_fail (REG_TYPE_NSDQ); break; case OP_RNSDQMQR: @@ -7075,12 +7385,16 @@ parse_operands (char *str, const unsigned int *pattern, bfd_boolean thumb) po_reg_or_fail (REG_TYPE_NSDQ); inst.error = 0; break; + case OP_RMQRR: + po_reg_or_goto (REG_TYPE_RN, try_rmq); + break; + try_rmq: case OP_RMQ: po_reg_or_fail (REG_TYPE_MQ); break; /* Neon scalar. Using an element size of 8 means that some invalid scalars are accepted here, so deal with those in later code. */ - case OP_RNSC: po_scalar_or_goto (8, failure); break; + case OP_RNSC: po_scalar_or_goto (8, failure, REG_TYPE_VFD); break; case OP_RNDQ_I0: { @@ -7095,6 +7409,10 @@ parse_operands (char *str, const unsigned int *pattern, bfd_boolean thumb) po_reg_or_goto (REG_TYPE_VFSD, try_imm0); break; + case OP_RSVDMQ_FI0: + po_reg_or_goto (REG_TYPE_MQ, try_rsvd_fi0); + break; + try_rsvd_fi0: case OP_RSVD_FI0: { po_reg_or_goto (REG_TYPE_VFSD, try_ifimm0); @@ -7113,41 +7431,58 @@ parse_operands (char *str, const unsigned int *pattern, bfd_boolean thumb) case OP_RR_RNSC: { - po_scalar_or_goto (8, try_rr); + po_scalar_or_goto (8, try_rr, REG_TYPE_VFD); break; try_rr: po_reg_or_fail (REG_TYPE_RN); } break; + case OP_RNSDQ_RNSC_MQ_RR: + po_reg_or_goto (REG_TYPE_RN, try_rnsdq_rnsc_mq); + break; + try_rnsdq_rnsc_mq: case OP_RNSDQ_RNSC_MQ: po_reg_or_goto (REG_TYPE_MQ, try_rnsdq_rnsc); break; try_rnsdq_rnsc: case OP_RNSDQ_RNSC: { - po_scalar_or_goto (8, try_nsdq); + po_scalar_or_goto (8, try_nsdq, REG_TYPE_VFD); + inst.error = 0; break; try_nsdq: po_reg_or_fail (REG_TYPE_NSDQ); + inst.error = 0; } break; case OP_RNSD_RNSC: { - po_scalar_or_goto (8, try_s_scalar); + po_scalar_or_goto (8, try_s_scalar, REG_TYPE_VFD); break; try_s_scalar: - po_scalar_or_goto (4, try_nsd); + po_scalar_or_goto (4, try_nsd, REG_TYPE_VFS); break; try_nsd: po_reg_or_fail (REG_TYPE_NSD); } break; + case OP_RNDQMQ_RNSC_RR: + po_reg_or_goto (REG_TYPE_MQ, try_rndq_rnsc_rr); + break; + try_rndq_rnsc_rr: + case OP_RNDQ_RNSC_RR: + po_reg_or_goto (REG_TYPE_RN, try_rndq_rnsc); + break; + case OP_RNDQMQ_RNSC: + po_reg_or_goto (REG_TYPE_MQ, try_rndq_rnsc); + break; + try_rndq_rnsc: case OP_RNDQ_RNSC: { - po_scalar_or_goto (8, try_ndq); + po_scalar_or_goto (8, try_ndq, REG_TYPE_VFD); break; try_ndq: po_reg_or_fail (REG_TYPE_NDQ); @@ -7156,7 +7491,7 @@ parse_operands (char *str, const unsigned int *pattern, bfd_boolean thumb) case OP_RND_RNSC: { - po_scalar_or_goto (8, try_vfd); + po_scalar_or_goto (8, try_vfd, REG_TYPE_VFD); break; try_vfd: po_reg_or_fail 
(REG_TYPE_VFD); @@ -7169,6 +7504,10 @@ parse_operands (char *str, const unsigned int *pattern, bfd_boolean thumb) po_misc_or_fail (parse_neon_mov (&str, &i) == FAIL); break; + case OP_RNDQMQ_Ibig: + po_reg_or_goto (REG_TYPE_MQ, try_rndq_ibig); + break; + try_rndq_ibig: case OP_RNDQ_Ibig: { po_reg_or_goto (REG_TYPE_NDQ, try_immbig); @@ -7185,6 +7524,13 @@ parse_operands (char *str, const unsigned int *pattern, bfd_boolean thumb) } break; + case OP_RNDQMQ_I63b_RR: + po_reg_or_goto (REG_TYPE_MQ, try_rndq_i63b_rr); + break; + try_rndq_i63b_rr: + po_reg_or_goto (REG_TYPE_RN, try_rndq_i63b); + break; + try_rndq_i63b: case OP_RNDQ_I63b: { po_reg_or_goto (REG_TYPE_NDQ, try_shimm); @@ -7216,6 +7562,7 @@ parse_operands (char *str, const unsigned int *pattern, bfd_boolean thumb) case OP_I31: po_imm_or_fail ( 0, 31, FALSE); break; case OP_I32: po_imm_or_fail ( 1, 32, FALSE); break; case OP_I32z: po_imm_or_fail ( 0, 32, FALSE); break; + case OP_I48_I64: po_imm1_or_imm2_or_fail (48, 64, FALSE); break; case OP_I63s: po_imm_or_fail (-64, 63, FALSE); break; case OP_I63: po_imm_or_fail ( 0, 63, FALSE); break; case OP_I64: po_imm_or_fail ( 1, 64, FALSE); break; @@ -7314,6 +7661,9 @@ parse_operands (char *str, const unsigned int *pattern, bfd_boolean thumb) case OP_RRnpc_I0: po_reg_or_goto (REG_TYPE_RN, I0); break; I0: po_imm_or_fail (0, 0, FALSE); break; + case OP_RRnpcsp_I32: po_reg_or_goto (REG_TYPE_RN, I32); break; + I32: po_imm_or_fail (1, 32, FALSE); break; + case OP_RF_IF: po_reg_or_goto (REG_TYPE_FN, IF); break; IF: if (!is_immediate_prefix (*str)) @@ -7367,6 +7717,7 @@ parse_operands (char *str, const unsigned int *pattern, bfd_boolean thumb) case OP_CPSF: val = parse_cps_flags (&str); break; case OP_ENDI: val = parse_endian_specifier (&str); break; case OP_oROR: val = parse_ror (&str); break; + try_cond: case OP_COND: val = parse_cond (&str); break; case OP_oBARRIER_I15: po_barrier_or_imm (str); break; @@ -7540,6 +7891,19 @@ parse_operands (char *str, const unsigned int *pattern, bfd_boolean thumb) po_misc_or_fail (parse_shift (&str, i, SHIFT_LSL_OR_ASR_IMMEDIATE)); break; + case OP_RMQRZ: + case OP_oRMQRZ: + po_reg_or_goto (REG_TYPE_MQ, try_rr_zr); + break; + + case OP_RR_ZR: + try_rr_zr: + po_reg_or_goto (REG_TYPE_RN, ZR); + break; + ZR: + po_reg_or_fail (REG_TYPE_ZR); + break; + default: as_fatal (_("unhandled operand code %d"), op_parse_code); } @@ -7561,6 +7925,7 @@ parse_operands (char *str, const unsigned int *pattern, bfd_boolean thumb) case OP_oRRnpcsp: case OP_RRnpcsp: + case OP_RRnpcsp_I32: if (inst.operands[i].isreg) { if (inst.operands[i].reg == REG_PC) @@ -7583,10 +7948,12 @@ parse_operands (char *str, const unsigned int *pattern, bfd_boolean thumb) inst.error = BAD_PC; break; + case OP_RVSD_COND: case OP_VLDR: if (inst.operands[i].isreg) break; /* fall through. 
*/ + case OP_CPSF: case OP_ENDI: case OP_oROR: @@ -7615,6 +7982,13 @@ parse_operands (char *str, const unsigned int *pattern, bfd_boolean thumb) inst.error = _("operand must be LR register"); break; + case OP_RMQRZ: + case OP_oRMQRZ: + case OP_RR_ZR: + if (!inst.operands[i].iszr && inst.operands[i].reg == REG_PC) + inst.error = BAD_PC; + break; + case OP_RRe: if (inst.operands[i].isreg && (inst.operands[i].reg & 0x00000001) != 0) @@ -8431,6 +8805,11 @@ move_or_literal_pool (int i, enum lit_type t, bfd_boolean mode_3) inst.instruction |= (imm & 0x0800) << 15; inst.instruction |= (imm & 0x0700) << 4; inst.instruction |= (imm & 0x00ff); + /* In case this replacement is being done on Armv8-M + Baseline we need to make sure to disable the + instruction size check, as otherwise GAS will reject + the use of this T32 instruction. */ + inst.size_req = 0; return TRUE; } } @@ -9555,10 +9934,42 @@ do_vmrs (void) return; } - /* MVFR2 is only valid at ARMv8-A. */ - if (inst.operands[1].reg == 5) - constraint (!ARM_CPU_HAS_FEATURE (cpu_variant, fpu_vfp_ext_armv8), - _(BAD_FPU)); + switch (inst.operands[1].reg) + { + /* MVFR2 is only valid for Armv8-A. */ + case 5: + constraint (!ARM_CPU_HAS_FEATURE (cpu_variant, fpu_vfp_ext_armv8), + _(BAD_FPU)); + break; + + /* Check for new Armv8.1-M Mainline changes to . */ + case 1: /* fpscr. */ + constraint (!(ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext) + || ARM_CPU_HAS_FEATURE (cpu_variant, fpu_vfp_ext_v1xd)), + _(BAD_FPU)); + break; + + case 14: /* fpcxt_ns. */ + case 15: /* fpcxt_s. */ + constraint (!ARM_CPU_HAS_FEATURE (cpu_variant, arm_ext_v8_1m_main), + _("selected processor does not support instruction")); + break; + + case 2: /* fpscr_nzcvqc. */ + case 12: /* vpr. */ + case 13: /* p0. */ + constraint (!ARM_CPU_HAS_FEATURE (cpu_variant, arm_ext_v8_1m_main) + || (!ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext) + && !ARM_CPU_HAS_FEATURE (cpu_variant, fpu_vfp_ext_v1xd)), + _("selected processor does not support instruction")); + if (inst.operands[0].reg != 2 + && !ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext)) + as_warn (_("accessing MVE system register without MVE is UNPREDICTABLE")); + break; + + default: + break; + } /* APSR_ sets isvec. All other refs to PC are illegal. */ if (!inst.operands[0].isvec && Rt == REG_PC) @@ -9586,10 +9997,42 @@ do_vmsr (void) return; } - /* MVFR2 is only valid for ARMv8-A. */ - if (inst.operands[0].reg == 5) - constraint (!ARM_CPU_HAS_FEATURE (cpu_variant, fpu_vfp_ext_armv8), - _(BAD_FPU)); + switch (inst.operands[0].reg) + { + /* MVFR2 is only valid for Armv8-A. */ + case 5: + constraint (!ARM_CPU_HAS_FEATURE (cpu_variant, fpu_vfp_ext_armv8), + _(BAD_FPU)); + break; + + /* Check for new Armv8.1-M Mainline changes to . */ + case 1: /* fpcr. */ + constraint (!(ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext) + || ARM_CPU_HAS_FEATURE (cpu_variant, fpu_vfp_ext_v1xd)), + _(BAD_FPU)); + break; + + case 14: /* fpcxt_ns. */ + case 15: /* fpcxt_s. */ + constraint (!ARM_CPU_HAS_FEATURE (cpu_variant, arm_ext_v8_1m_main), + _("selected processor does not support instruction")); + break; + + case 2: /* fpscr_nzcvqc. */ + case 12: /* vpr. */ + case 13: /* p0. 
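   As in do_vmrs above: fpscr_nzcvqc (2), vpr (12) and p0 (13) only
   exist from Armv8.1-M Mainline on, and touching the MVE registers
   without MVE present merely warns, e.g.:

     vmsr fpscr_nzcvqc, r1
     vmsr p0, r3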
*/ + constraint (!ARM_CPU_HAS_FEATURE (cpu_variant, arm_ext_v8_1m_main) + || (!ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext) + && !ARM_CPU_HAS_FEATURE (cpu_variant, fpu_vfp_ext_v1xd)), + _("selected processor does not support instruction")); + if (inst.operands[0].reg != 2 + && !ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext)) + as_warn (_("accessing MVE system register without MVE is UNPREDICTABLE")); + break; + + default: + break; + } /* If we get through parsing the register name, we just insert the number generated into the instruction without further validation. */ @@ -9889,6 +10332,9 @@ do_shift (void) static void do_smc (void) { + unsigned int value = inst.relocs[0].exp.X_add_number; + constraint (value > 0xf, _("immediate too large (bigger than 0xF)")); + inst.relocs[0].type = BFD_RELOC_ARM_SMC; inst.relocs[0].pc_rel = 0; } @@ -10109,6 +10555,10 @@ do_sxth (void) static void do_vfp_sp_monadic (void) { + constraint (!ARM_CPU_HAS_FEATURE (cpu_variant, fpu_vfp_ext_v1xd) + && !ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext), + _(BAD_FPU)); + encode_arm_vfp_reg (inst.operands[0].reg, VFP_REG_Sd); encode_arm_vfp_reg (inst.operands[1].reg, VFP_REG_Sm); } @@ -10144,6 +10594,10 @@ do_vfp_sp_dp_cvt (void) static void do_vfp_reg_from_sp (void) { + constraint (!ARM_CPU_HAS_FEATURE (cpu_variant, fpu_vfp_ext_v1xd) + && !ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext), + _(BAD_FPU)); + inst.instruction |= inst.operands[0].reg << 12; encode_arm_vfp_reg (inst.operands[1].reg, VFP_REG_Sn); } @@ -10161,6 +10615,10 @@ do_vfp_reg2_from_sp2 (void) static void do_vfp_sp_from_reg (void) { + constraint (!ARM_CPU_HAS_FEATURE (cpu_variant, fpu_vfp_ext_v1xd) + && !ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext), + _(BAD_FPU)); + encode_arm_vfp_reg (inst.operands[0].reg, VFP_REG_Sn); inst.instruction |= inst.operands[1].reg << 12; } @@ -10263,6 +10721,10 @@ do_vfp_xp_ldstmdb (void) static void do_vfp_dp_rd_rm (void) { + constraint (!ARM_CPU_HAS_FEATURE (cpu_variant, fpu_vfp_ext_v1) + && !ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext), + _(BAD_FPU)); + encode_arm_vfp_reg (inst.operands[0].reg, VFP_REG_Dd); encode_arm_vfp_reg (inst.operands[1].reg, VFP_REG_Dm); } @@ -10284,6 +10746,10 @@ do_vfp_dp_rd_rn (void) static void do_vfp_dp_rd_rn_rm (void) { + constraint (!ARM_CPU_HAS_FEATURE (cpu_variant, fpu_vfp_ext_v2) + && !ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext), + _(BAD_FPU)); + encode_arm_vfp_reg (inst.operands[0].reg, VFP_REG_Dd); encode_arm_vfp_reg (inst.operands[1].reg, VFP_REG_Dn); encode_arm_vfp_reg (inst.operands[2].reg, VFP_REG_Dm); @@ -10298,6 +10764,10 @@ do_vfp_dp_rd (void) static void do_vfp_dp_rm_rd_rn (void) { + constraint (!ARM_CPU_HAS_FEATURE (cpu_variant, fpu_vfp_ext_v2) + && !ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext), + _(BAD_FPU)); + encode_arm_vfp_reg (inst.operands[0].reg, VFP_REG_Dm); encode_arm_vfp_reg (inst.operands[1].reg, VFP_REG_Dd); encode_arm_vfp_reg (inst.operands[2].reg, VFP_REG_Dn); @@ -10809,7 +11279,7 @@ encode_thumb32_addr_mode (int i, bfd_boolean is_t, bfd_boolean is_d) inst.error = _("instruction does not accept unindexed addressing"); } -/* Table of Thumb instructions which exist in both 16- and 32-bit +/* Table of Thumb instructions which exist in 16- and/or 32-bit encodings (the latter only in post-V6T2 cores). The index is the value used in the insns table below. 
When there is more than one possible 16-bit encoding for the instruction, this table always @@ -10838,16 +11308,27 @@ encode_thumb32_addr_mode (int i, bfd_boolean is_t, bfd_boolean is_d) X(_bflx, 0000, f070e001), \ X(_bic, 4380, ea200000), \ X(_bics, 4380, ea300000), \ + X(_cinc, 0000, ea509000), \ + X(_cinv, 0000, ea50a000), \ X(_cmn, 42c0, eb100f00), \ X(_cmp, 2800, ebb00f00), \ + X(_cneg, 0000, ea50b000), \ X(_cpsie, b660, f3af8400), \ X(_cpsid, b670, f3af8600), \ X(_cpy, 4600, ea4f0000), \ + X(_csel, 0000, ea508000), \ + X(_cset, 0000, ea5f900f), \ + X(_csetm, 0000, ea5fa00f), \ + X(_csinc, 0000, ea509000), \ + X(_csinv, 0000, ea50a000), \ + X(_csneg, 0000, ea50b000), \ X(_dec_sp,80dd, f1ad0d00), \ X(_dls, 0000, f040e001), \ + X(_dlstp, 0000, f000e001), \ X(_eor, 4040, ea800000), \ X(_eors, 4040, ea900000), \ X(_inc_sp,00dd, f10d0d00), \ + X(_lctp, 0000, f00fe001), \ X(_ldmia, c800, e8900000), \ X(_ldr, 6800, f8500000), \ X(_ldrb, 7800, f8100000), \ @@ -10858,6 +11339,7 @@ encode_thumb32_addr_mode (int i, bfd_boolean is_t, bfd_boolean is_d) X(_ldr_pc2,4800, f85f0000), \ X(_ldr_sp,9800, f85d0000), \ X(_le, 0000, f00fc001), \ + X(_letp, 0000, f01fc001), \ X(_lsl, 0000, fa00f000), \ X(_lsls, 0000, fa10f000), \ X(_lsr, 0800, fa20f000), \ @@ -10900,6 +11382,7 @@ encode_thumb32_addr_mode (int i, bfd_boolean is_t, bfd_boolean is_d) X(_wfe, bf20, f3af8002), \ X(_wfi, bf30, f3af8003), \ X(_wls, 0000, f040c001), \ + X(_wlstp, 0000, f000c001), \ X(_sev, bf40, f3af8004), \ X(_sevl, bf50, f3af8005), \ X(_udf, de00, f7f0a000) @@ -11653,6 +12136,60 @@ do_t_clz (void) inst.instruction |= Rm; } +/* For the Armv8.1-M conditional instructions. */ +static void +do_t_cond (void) +{ + unsigned Rd, Rn, Rm; + signed int cond; + + constraint (inst.cond != COND_ALWAYS, BAD_COND); + + Rd = inst.operands[0].reg; + switch (inst.instruction) + { + case T_MNEM_csinc: + case T_MNEM_csinv: + case T_MNEM_csneg: + case T_MNEM_csel: + Rn = inst.operands[1].reg; + Rm = inst.operands[2].reg; + cond = inst.operands[3].imm; + constraint (Rn == REG_SP, BAD_SP); + constraint (Rm == REG_SP, BAD_SP); + break; + + case T_MNEM_cinc: + case T_MNEM_cinv: + case T_MNEM_cneg: + Rn = inst.operands[1].reg; + cond = inst.operands[2].imm; + /* Invert the last bit to invert the cond. */ + cond = TOGGLE_BIT (cond, 0); + constraint (Rn == REG_SP, BAD_SP); + Rm = Rn; + break; + + case T_MNEM_csetm: + case T_MNEM_cset: + cond = inst.operands[1].imm; + /* Invert the last bit to invert the cond. */ + cond = TOGGLE_BIT (cond, 0); + Rn = REG_PC; + Rm = REG_PC; + break; + + default: abort (); + } + + set_pred_insn_type (OUTSIDE_PRED_INSN); + inst.instruction = THUMB_OP32 (inst.instruction); + inst.instruction |= Rd << 8; + inst.instruction |= Rn << 16; + inst.instruction |= Rm; + inst.instruction |= cond << 4; +} + static void do_t_csdb (void) { @@ -11801,18 +12338,6 @@ do_t_it (void) inst.instruction |= cond << 4; } -static void -do_mve_vpt (void) -{ - /* We are dealing with a vector predicated block. */ - set_pred_insn_type (VPT_INSN); - now_pred.cc = 0; - now_pred.mask = ((inst.instruction & 0x00400000) >> 19) - | ((inst.instruction & 0xe000) >> 13); - now_pred.warn_deprecated = FALSE; - now_pred.type = VECTOR_PRED; -} - /* Helper function used for both push/pop and ldm/stm. 
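   push/pop are stmdb/ldmia on the stack pointer with writeback, so both
   spellings funnel into this one encoder, e.g.:

     push {r0, lr}   @ same encoding as  stmdb sp!, {r0, lr}
     pop  {r0, pc}   @ same encoding as  ldmia sp!, {r0, pc}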
*/ static void encode_thumb2_multi (bfd_boolean do_io, int base, unsigned mask, @@ -13440,10 +13965,11 @@ do_t_smc (void) _("SMC is not permitted on this architecture")); constraint (inst.relocs[0].exp.X_op != O_constant, _("expression too complex")); + constraint (value > 0xf, _("immediate too large (bigger than 0xF)")); + inst.relocs[0].type = BFD_RELOC_UNUSED; - inst.instruction |= (value & 0xf000) >> 12; - inst.instruction |= (value & 0x0ff0); inst.instruction |= (value & 0x000f) << 16; + /* PR gas/15623: SMC instructions must be last in an IT block. */ set_pred_insn_type_last (); } @@ -13837,35 +14363,52 @@ v8_1_loop_reloc (int is_le) } } -/* To handle the Scalar Low Overhead Loop instructions - in Armv8.1-M Mainline. */ +/* For shifts with four operands in MVE. */ static void -do_t_loloop (void) +do_mve_scalar_shift1 (void) { - unsigned long insn = inst.instruction; + unsigned int value = inst.operands[2].imm; - set_pred_insn_type (OUTSIDE_PRED_INSN); - inst.instruction = THUMB_OP32 (inst.instruction); + inst.instruction |= inst.operands[0].reg << 16; + inst.instruction |= inst.operands[1].reg << 8; - switch (insn) - { - case T_MNEM_le: - /* le
,
@@ -17455,6 +19826,10 @@ do_neon_dup (void)
 	  (Two ARM regs to two VFP singles.)
      15. VMOV <Rn>, <Rm>, <Sd>, <Se>
 	  (Two VFP singles to two ARM regs.)
+     16. VMOV<c> <Rt>, <Rt2>, <Qd[idx]>, <Qd[idx2]>
+     17. VMOV<c> <Qd[idx]>, <Qd[idx2]>, <Rt>, <Rt2>
+     18. VMOV<c>.<dt> <Rt>, <Qn[idx]>
+     19. VMOV<c>.<dt> <Qd[idx]>
, These cases can be disambiguated using neon_select_shape, except cases 1/9 and 3/11 which depend on the operand type too. @@ -17470,10 +19845,11 @@ do_neon_dup (void) static void do_neon_mov (void) { - enum neon_shape rs = neon_select_shape (NS_RRFF, NS_FFRR, NS_DRR, NS_RRD, - NS_QQ, NS_DD, NS_QI, NS_DI, NS_SR, - NS_RS, NS_FF, NS_FI, NS_RF, NS_FR, - NS_HR, NS_RH, NS_HI, NS_NULL); + enum neon_shape rs = neon_select_shape (NS_RRSS, NS_SSRR, NS_RRFF, NS_FFRR, + NS_DRR, NS_RRD, NS_QQ, NS_DD, NS_QI, + NS_DI, NS_SR, NS_RS, NS_FF, NS_FI, + NS_RF, NS_FR, NS_HR, NS_RH, NS_HI, + NS_NULL); struct neon_type_el et; const char *ldconst = 0; @@ -17492,7 +19868,8 @@ do_neon_mov (void) case NS_QQ: /* case 0/1. */ { - if (vfp_or_neon_is_neon (NEON_CHECK_CC | NEON_CHECK_ARCH) == FAIL) + if (!check_simd_pred_availability (FALSE, + NEON_CHECK_CC | NEON_CHECK_ARCH)) return; /* The architecture manual I have doesn't explicitly state which value the U bit should have for register->register moves, but @@ -17522,7 +19899,8 @@ do_neon_mov (void) /* fall through. */ case NS_QI: /* case 2/3. */ - if (vfp_or_neon_is_neon (NEON_CHECK_CC | NEON_CHECK_ARCH) == FAIL) + if (!check_simd_pred_availability (FALSE, + NEON_CHECK_CC | NEON_CHECK_ARCH)) return; inst.instruction = 0x0800010; neon_move_immediate (); @@ -17549,12 +19927,31 @@ do_neon_mov (void) et = neon_check_type (2, NS_NULL, N_8 | N_16 | N_32 | N_KEY, N_EQK); logsize = neon_logbits (et.size); - constraint (!ARM_CPU_HAS_FEATURE (cpu_variant, fpu_vfp_ext_v1), - _(BAD_FPU)); - constraint (!ARM_CPU_HAS_FEATURE (cpu_variant, fpu_neon_ext_v1) - && et.size != 32, _(BAD_FPU)); + if (et.size != 32) + { + if (!ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext) + && vfp_or_neon_is_neon (NEON_CHECK_ARCH) == FAIL) + return; + } + else + { + constraint (!ARM_CPU_HAS_FEATURE (cpu_variant, fpu_vfp_ext_v1) + && !ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext), + _(BAD_FPU)); + } + + if (ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext)) + { + if (inst.operands[1].reg == REG_SP) + as_tsktsk (MVE_BAD_SP); + else if (inst.operands[1].reg == REG_PC) + as_tsktsk (MVE_BAD_PC); + } + unsigned size = inst.operands[0].isscalar == 1 ? 64 : 128; + constraint (et.type == NT_invtype, _("bad type for scalar")); - constraint (x >= 64 / et.size, _("scalar index out of range")); + constraint (x >= size / et.size, _("scalar index out of range")); + switch (et.size) { @@ -17564,7 +19961,7 @@ do_neon_mov (void) default: ; } - bcdebits |= x << logsize; + bcdebits |= (x & ((1 << (3-logsize)) - 1)) << logsize; inst.instruction = 0xe000b10; do_vfp_cond_or_thumb (); @@ -17572,12 +19969,14 @@ do_neon_mov (void) inst.instruction |= HI1 (dn) << 7; inst.instruction |= inst.operands[1].reg << 12; inst.instruction |= (bcdebits & 3) << 5; - inst.instruction |= (bcdebits >> 2) << 21; + inst.instruction |= ((bcdebits >> 2) & 3) << 21; + inst.instruction |= (x >> (3-logsize)) << 16; } break; case NS_DRR: /* case 5 (fmdrr). 
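   Case 5 moves two core registers into one doubleword register, e.g.
   "vmov d0, r2, r3"; the feature test now also admits MVE, which
   provides this form without full VFPv2.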
*/ - constraint (!ARM_CPU_HAS_FEATURE (cpu_variant, fpu_vfp_ext_v2), + constraint (!ARM_CPU_HAS_FEATURE (cpu_variant, fpu_vfp_ext_v2) + && !ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext), _(BAD_FPU)); inst.instruction = 0xc400b10; @@ -17609,12 +20008,32 @@ do_neon_mov (void) N_EQK, N_S8 | N_S16 | N_U8 | N_U16 | N_32 | N_KEY); logsize = neon_logbits (et.size); - constraint (!ARM_CPU_HAS_FEATURE (cpu_variant, fpu_vfp_ext_v1), - _(BAD_FPU)); - constraint (!ARM_CPU_HAS_FEATURE (cpu_variant, fpu_neon_ext_v1) - && et.size != 32, _(BAD_FPU)); + if (et.size != 32) + { + if (!ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext) + && vfp_or_neon_is_neon (NEON_CHECK_CC + | NEON_CHECK_ARCH) == FAIL) + return; + } + else + { + constraint (!ARM_CPU_HAS_FEATURE (cpu_variant, fpu_vfp_ext_v1) + && !ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext), + _(BAD_FPU)); + } + + if (ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext)) + { + if (inst.operands[0].reg == REG_SP) + as_tsktsk (MVE_BAD_SP); + else if (inst.operands[0].reg == REG_PC) + as_tsktsk (MVE_BAD_PC); + } + + unsigned size = inst.operands[1].isscalar == 1 ? 64 : 128; + constraint (et.type == NT_invtype, _("bad type for scalar")); - constraint (x >= 64 / et.size, _("scalar index out of range")); + constraint (x >= size / et.size, _("scalar index out of range")); switch (et.size) { @@ -17624,7 +20043,7 @@ do_neon_mov (void) default: ; } - abcdebits |= x << logsize; + abcdebits |= (x & ((1 << (3-logsize)) - 1)) << logsize; inst.instruction = 0xe100b10; do_vfp_cond_or_thumb (); inst.instruction |= LOW4 (dn) << 16; @@ -17632,11 +20051,13 @@ do_neon_mov (void) inst.instruction |= inst.operands[0].reg << 12; inst.instruction |= (abcdebits & 3) << 5; inst.instruction |= (abcdebits >> 2) << 21; + inst.instruction |= (x >> (3-logsize)) << 16; } break; case NS_RRD: /* case 7 (fmrrd). */ - constraint (!ARM_CPU_HAS_FEATURE (cpu_variant, fpu_vfp_ext_v2), + constraint (!ARM_CPU_HAS_FEATURE (cpu_variant, fpu_vfp_ext_v2) + && !ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext), _(BAD_FPU)); inst.instruction = 0xc500b10; @@ -17703,11 +20124,21 @@ do_neon_mov (void) do_scalar_fp16_v82_encode (); break; + case NS_RRSS: + do_mve_mov (0); + break; + case NS_SSRR: + do_mve_mov (1); + break; + /* The encoders for the fmrrs and fmsrr instructions expect three operands (one of which is a list), but we have parsed four. Do some fiddling to make the operands what do_vfp_reg2_from_sp2 and do_vfp_sp2_from_reg2 expect. */ case NS_RRFF: /* case 14 (fmrrs). */ + constraint (!ARM_CPU_HAS_FEATURE (cpu_variant, fpu_vfp_ext_v2) + && !ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext), + _(BAD_FPU)); constraint (inst.operands[3].reg != inst.operands[2].reg + 1, _("VFP registers must be adjacent")); inst.operands[2].imm = 2; @@ -17716,6 +20147,9 @@ do_neon_mov (void) break; case NS_FFRR: /* case 15 (fmsrr). 
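   This is the paired move into two consecutive singles, e.g.
   "vmov s4, s5, r0, r1"; as with fmrrs above, the two singles must be
   adjacent, and MVE is now accepted alongside VFPv2.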
*/ + constraint (!ARM_CPU_HAS_FEATURE (cpu_variant, fpu_vfp_ext_v2) + && !ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext), + _(BAD_FPU)); constraint (inst.operands[1].reg != inst.operands[0].reg + 1, _("VFP registers must be adjacent")); inst.operands[1] = inst.operands[2]; @@ -17735,11 +20169,58 @@ do_neon_mov (void) } } +static void +do_mve_movl (void) +{ + if (!(inst.operands[0].present && inst.operands[0].isquad + && inst.operands[1].present && inst.operands[1].isquad + && !inst.operands[2].present)) + { + inst.instruction = 0; + inst.cond = 0xb; + if (thumb_mode) + set_pred_insn_type (INSIDE_IT_INSN); + do_neon_mov (); + return; + } + + if (!ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext)) + return; + + if (inst.cond != COND_ALWAYS) + inst.pred_insn_type = INSIDE_VPT_INSN; + + struct neon_type_el et = neon_check_type (2, NS_QQ, N_EQK, N_S8 | N_U8 + | N_S16 | N_U16 | N_KEY); + + inst.instruction |= (et.type == NT_unsigned) << 28; + inst.instruction |= HI1 (inst.operands[0].reg) << 22; + inst.instruction |= (neon_logbits (et.size) + 1) << 19; + inst.instruction |= LOW4 (inst.operands[0].reg) << 12; + inst.instruction |= HI1 (inst.operands[1].reg) << 5; + inst.instruction |= LOW4 (inst.operands[1].reg); + inst.is_neon = 1; +} + static void do_neon_rshift_round_imm (void) { - enum neon_shape rs = neon_select_shape (NS_DDI, NS_QQI, NS_NULL); - struct neon_type_el et = neon_check_type (2, rs, N_EQK, N_SU_ALL | N_KEY); + if (!check_simd_pred_availability (FALSE, NEON_CHECK_ARCH | NEON_CHECK_CC)) + return; + + enum neon_shape rs; + struct neon_type_el et; + + if (ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext)) + { + rs = neon_select_shape (NS_QQI, NS_NULL); + et = neon_check_type (2, rs, N_EQK, N_SU_MVE | N_KEY); + } + else + { + rs = neon_select_shape (NS_DDI, NS_QQI, NS_NULL); + et = neon_check_type (2, rs, N_EQK, N_SU_ALL | N_KEY); + } int imm = inst.operands[2].imm; /* imm == 0 case is encoded as VMOV for V{R}SHR. 
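   e.g. "vshr.s32 q0, q1, #0" and "vrshr.s32 q0, q1, #0" are emitted
   with the VMOV encoding, since a right shift by zero, rounded or not,
   is the identity.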
*/ @@ -17821,7 +20302,14 @@ do_neon_zip_uzp (void) static void do_neon_sat_abs_neg (void) { - enum neon_shape rs = neon_select_shape (NS_DD, NS_QQ, NS_NULL); + if (!check_simd_pred_availability (FALSE, NEON_CHECK_CC | NEON_CHECK_ARCH)) + return; + + enum neon_shape rs; + if (ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext)) + rs = neon_select_shape (NS_QQ, NS_NULL); + else + rs = neon_select_shape (NS_DD, NS_QQ, NS_NULL); struct neon_type_el et = neon_check_type (2, rs, N_EQK, N_S8 | N_S16 | N_S32 | N_KEY); neon_two_same (neon_quad (rs), 1, et.size); @@ -17850,7 +20338,15 @@ do_neon_recip_est (void) static void do_neon_cls (void) { - enum neon_shape rs = neon_select_shape (NS_DD, NS_QQ, NS_NULL); + if (!check_simd_pred_availability (FALSE, NEON_CHECK_ARCH | NEON_CHECK_CC)) + return; + + enum neon_shape rs; + if (ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext)) + rs = neon_select_shape (NS_QQ, NS_NULL); + else + rs = neon_select_shape (NS_DD, NS_QQ, NS_NULL); + struct neon_type_el et = neon_check_type (2, rs, N_EQK, N_S8 | N_S16 | N_S32 | N_KEY); neon_two_same (neon_quad (rs), 1, et.size); @@ -17859,7 +20355,15 @@ do_neon_cls (void) static void do_neon_clz (void) { - enum neon_shape rs = neon_select_shape (NS_DD, NS_QQ, NS_NULL); + if (!check_simd_pred_availability (FALSE, NEON_CHECK_ARCH | NEON_CHECK_CC)) + return; + + enum neon_shape rs; + if (ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext)) + rs = neon_select_shape (NS_QQ, NS_NULL); + else + rs = neon_select_shape (NS_DD, NS_QQ, NS_NULL); + struct neon_type_el et = neon_check_type (2, rs, N_EQK, N_I8 | N_I16 | N_I32 | N_KEY); neon_two_same (neon_quad (rs), 1, et.size); @@ -18398,12 +20902,13 @@ do_vsel (void) static void do_vmaxnm (void) { - set_pred_insn_type (OUTSIDE_PRED_INSN); + if (!ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext)) + set_pred_insn_type (OUTSIDE_PRED_INSN); if (try_vfp_nsyn (3, do_vfp_nsyn_fpv8) == SUCCESS) return; - if (vfp_or_neon_is_neon (NEON_CHECK_CC | NEON_CHECK_ARCH8) == FAIL) + if (!check_simd_pred_availability (TRUE, NEON_CHECK_CC | NEON_CHECK_ARCH8)) return; neon_dyadic_misc (NT_untyped, N_F_16_32, 0); @@ -18467,12 +20972,12 @@ do_vrint_1 (enum neon_cvt_mode mode) if (et.type == NT_invtype) return; - set_pred_insn_type (OUTSIDE_PRED_INSN); - NEON_ENCODE (FLOAT, inst); - - if (vfp_or_neon_is_neon (NEON_CHECK_CC | NEON_CHECK_ARCH8) == FAIL) + if (!check_simd_pred_availability (TRUE, + NEON_CHECK_CC | NEON_CHECK_ARCH8)) return; + NEON_ENCODE (FLOAT, inst); + inst.instruction |= LOW4 (inst.operands[0].reg) << 12; inst.instruction |= HI1 (inst.operands[0].reg) << 22; inst.instruction |= LOW4 (inst.operands[1].reg); @@ -18561,16 +21066,24 @@ neon_scalar_for_vcmla (unsigned opnd, unsigned elsize) static void do_vcmla (void) { - constraint (!ARM_CPU_HAS_FEATURE (cpu_variant, fpu_neon_ext_armv8), - _(BAD_FPU)); + constraint (!ARM_CPU_HAS_FEATURE (cpu_variant, mve_fp_ext) + && (!ARM_CPU_HAS_FEATURE (cpu_variant, fpu_neon_ext_armv8) + || !mark_feature_used (&arm_ext_v8_3)), (BAD_FPU)); constraint (inst.relocs[0].exp.X_op != O_constant, _("expression too complex")); unsigned rot = inst.relocs[0].exp.X_add_number; constraint (rot != 0 && rot != 90 && rot != 180 && rot != 270, _("immediate out of range")); rot /= 90; + + if (!check_simd_pred_availability (TRUE, + NEON_CHECK_ARCH8 | NEON_CHECK_CC)) + return; + if (inst.operands[2].isscalar) { + if (ARM_CPU_HAS_FEATURE (cpu_variant, mve_fp_ext)) + first_error (_("invalid instruction shape")); enum neon_shape rs = neon_select_shape (NS_DDSI, NS_QQSI, NS_NULL); unsigned size = neon_check_type (3, 
rs, N_EQK, N_EQK, N_KEY | N_F16 | N_F32).size; @@ -18589,9 +21102,19 @@ do_vcmla (void) } else { - enum neon_shape rs = neon_select_shape (NS_DDDI, NS_QQQI, NS_NULL); + enum neon_shape rs; + if (ARM_CPU_HAS_FEATURE (cpu_variant, mve_fp_ext)) + rs = neon_select_shape (NS_QQQI, NS_NULL); + else + rs = neon_select_shape (NS_DDDI, NS_QQQI, NS_NULL); + unsigned size = neon_check_type (3, rs, N_EQK, N_EQK, N_KEY | N_F16 | N_F32).size; + if (ARM_CPU_HAS_FEATURE (cpu_variant, mve_fp_ext) && size == 32 + && (inst.operands[0].reg == inst.operands[1].reg + || inst.operands[0].reg == inst.operands[2].reg)) + as_tsktsk (BAD_MVE_SRCDEST); + neon_three_same (neon_quad (rs), 0, -1); inst.instruction &= 0x00ffffff; /* Undo neon_dp_fixup. */ inst.instruction |= 0xfc200800; @@ -18603,20 +21126,60 @@ do_vcmla (void) static void do_vcadd (void) { - constraint (!ARM_CPU_HAS_FEATURE (cpu_variant, fpu_neon_ext_armv8), - _(BAD_FPU)); + constraint (!ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext) + && (!ARM_CPU_HAS_FEATURE (cpu_variant, fpu_neon_ext_armv8) + || !mark_feature_used (&arm_ext_v8_3)), (BAD_FPU)); constraint (inst.relocs[0].exp.X_op != O_constant, _("expression too complex")); + unsigned rot = inst.relocs[0].exp.X_add_number; constraint (rot != 90 && rot != 270, _("immediate out of range")); - enum neon_shape rs = neon_select_shape (NS_DDDI, NS_QQQI, NS_NULL); - unsigned size = neon_check_type (3, rs, N_EQK, N_EQK, - N_KEY | N_F16 | N_F32).size; - neon_three_same (neon_quad (rs), 0, -1); - inst.instruction &= 0x00ffffff; /* Undo neon_dp_fixup. */ - inst.instruction |= 0xfc800800; - inst.instruction |= (rot == 270) << 24; - inst.instruction |= (size == 32) << 20; + enum neon_shape rs; + struct neon_type_el et; + if (!ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext)) + { + rs = neon_select_shape (NS_DDDI, NS_QQQI, NS_NULL); + et = neon_check_type (3, rs, N_EQK, N_EQK, N_KEY | N_F16 | N_F32); + } + else + { + rs = neon_select_shape (NS_QQQI, NS_NULL); + et = neon_check_type (3, rs, N_EQK, N_EQK, N_KEY | N_F16 | N_F32 | N_I8 + | N_I16 | N_I32); + if (et.size == 32 && inst.operands[0].reg == inst.operands[2].reg) + as_tsktsk (_("Warning: 32-bit element size and same first and third " + "operand makes instruction UNPREDICTABLE")); + } + + if (et.type == NT_invtype) + return; + + if (!check_simd_pred_availability (et.type == NT_float, + NEON_CHECK_ARCH8 | NEON_CHECK_CC)) + return; + + if (et.type == NT_float) + { + neon_three_same (neon_quad (rs), 0, -1); + inst.instruction &= 0x00ffffff; /* Undo neon_dp_fixup. */ + inst.instruction |= 0xfc800800; + inst.instruction |= (rot == 270) << 24; + inst.instruction |= (et.size == 32) << 20; + } + else + { + constraint (!ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext), BAD_FPU); + inst.instruction = 0xfe000f00; + inst.instruction |= HI1 (inst.operands[0].reg) << 22; + inst.instruction |= neon_logbits (et.size) << 20; + inst.instruction |= LOW4 (inst.operands[1].reg) << 16; + inst.instruction |= LOW4 (inst.operands[0].reg) << 12; + inst.instruction |= (rot == 270) << 12; + inst.instruction |= HI1 (inst.operands[1].reg) << 7; + inst.instruction |= HI1 (inst.operands[2].reg) << 5; + inst.instruction |= LOW4 (inst.operands[2].reg); + inst.is_neon = 1; + } } /* Dot Product instructions encoding support. */ @@ -20362,6 +22925,10 @@ static const struct reg_entry reg_names[] = REGDEF(WR, 7,RN), REGDEF(SB, 9,RN), REGDEF(SL,10,RN), REGDEF(FP,11,RN), REGDEF(IP,12,RN), REGDEF(SP,13,RN), REGDEF(LR,14,RN), REGDEF(PC,15,RN), + /* Defining the new Zero register from ARMv8.1-M. 
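   "zr" deliberately reuses encoding 15, the PC slot, and is only
   reachable through operand codes that explicitly allow it (OP_RR_ZR,
   OP_RMQRZ); in the conditional-select family it supplies a zero
   source, e.g. "csel r0, r1, zr, ne".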
*/ + REGDEF(zr,15,ZR), + REGDEF(ZR,15,ZR), + /* Coprocessor numbers. */ REGSET(p, CP), REGSET(P, CP), @@ -20425,6 +22992,10 @@ static const struct reg_entry reg_names[] = REGDEF(mvfr0,7,VFC), REGDEF(mvfr1,6,VFC), REGDEF(MVFR0,7,VFC), REGDEF(MVFR1,6,VFC), REGDEF(mvfr2,5,VFC), REGDEF(MVFR2,5,VFC), + REGDEF(fpscr_nzcvqc,2,VFC), REGDEF(FPSCR_nzcvqc,2,VFC), + REGDEF(vpr,12,VFC), REGDEF(VPR,12,VFC), + REGDEF(fpcxt_ns,14,VFC), REGDEF(FPCXT_NS,14,VFC), + REGDEF(fpcxt_s,15,VFC), REGDEF(FPCXT_S,15,VFC), /* Maverick DSP coprocessor registers. */ REGSET(mvf,MVF), REGSET(mvd,MVD), REGSET(mvfx,MVFX), REGSET(mvdx,MVDX), @@ -20578,7 +23149,8 @@ static const struct asm_shift_name shift_names [] = { "lsr", SHIFT_LSR }, { "LSR", SHIFT_LSR }, { "asr", SHIFT_ASR }, { "ASR", SHIFT_ASR }, { "ror", SHIFT_ROR }, { "ROR", SHIFT_ROR }, - { "rrx", SHIFT_RRX }, { "RRX", SHIFT_RRX } + { "rrx", SHIFT_RRX }, { "RRX", SHIFT_RRX }, + { "uxtw", SHIFT_UXTW}, { "UXTW", SHIFT_UXTW} }; /* Table of all explicit relocation names. */ @@ -20779,6 +23351,10 @@ static struct asm_barrier_opt barrier_opt_names[] = #define cCE(mnem, op, nops, ops, ae) \ { mnem, OPS##nops ops, OT_csuffix, 0x##op, 0xe##op, ARM_VARIANT, ARM_VARIANT, do_##ae, do_##ae, 0 } +/* mov instructions that are shared between coprocessor and MVE. */ +#define mcCE(mnem, op, nops, ops, ae) \ + { #mnem, OPS##nops ops, OT_csuffix, 0x##op, 0xe##op, ARM_VARIANT, THUMB_VARIANT, do_##ae, do_##ae, 0 } + /* Legacy coprocessor instructions where conditional infix and conditional suffix are ambiguous. For consistency this includes all FPA instructions, not just the potentially ambiguous ones. */ @@ -21540,19 +24116,13 @@ static const struct asm_opcode insns[] = nUF(vselvs, _vselvs, 3, (RVSD, RVSD, RVSD), vsel), nUF(vselge, _vselge, 3, (RVSD, RVSD, RVSD), vsel), nUF(vselgt, _vselgt, 3, (RVSD, RVSD, RVSD), vsel), - nUF(vmaxnm, _vmaxnm, 3, (RNSDQ, oRNSDQ, RNSDQ), vmaxnm), - nUF(vminnm, _vminnm, 3, (RNSDQ, oRNSDQ, RNSDQ), vmaxnm), - nUF(vcvta, _vcvta, 2, (RNSDQ, oRNSDQ), neon_cvta), - nUF(vcvtn, _vcvta, 2, (RNSDQ, oRNSDQ), neon_cvtn), - nUF(vcvtp, _vcvta, 2, (RNSDQ, oRNSDQ), neon_cvtp), - nUF(vcvtm, _vcvta, 2, (RNSDQ, oRNSDQ), neon_cvtm), nCE(vrintr, _vrintr, 2, (RNSDQ, oRNSDQ), vrintr), - nCE(vrintz, _vrintr, 2, (RNSDQ, oRNSDQ), vrintz), - nCE(vrintx, _vrintr, 2, (RNSDQ, oRNSDQ), vrintx), - nUF(vrinta, _vrinta, 2, (RNSDQ, oRNSDQ), vrinta), - nUF(vrintn, _vrinta, 2, (RNSDQ, oRNSDQ), vrintn), - nUF(vrintp, _vrinta, 2, (RNSDQ, oRNSDQ), vrintp), - nUF(vrintm, _vrinta, 2, (RNSDQ, oRNSDQ), vrintm), + mnCE(vrintz, _vrintr, 2, (RNSDQMQ, oRNSDQMQ), vrintz), + mnCE(vrintx, _vrintr, 2, (RNSDQMQ, oRNSDQMQ), vrintx), + mnUF(vrinta, _vrinta, 2, (RNSDQMQ, oRNSDQMQ), vrinta), + mnUF(vrintn, _vrinta, 2, (RNSDQMQ, oRNSDQMQ), vrintn), + mnUF(vrintp, _vrinta, 2, (RNSDQMQ, oRNSDQMQ), vrintp), + mnUF(vrintm, _vrinta, 2, (RNSDQMQ, oRNSDQMQ), vrintm), /* Crypto v1 extensions. */ #undef ARM_VARIANT @@ -21598,8 +24168,6 @@ static const struct asm_opcode insns[] = #undef THUMB_VARIANT #define THUMB_VARIANT & arm_ext_v8_3 NCE (vjcvt, eb90bc0, 2, (RVS, RVD), vjcvt), - NUF (vcmla, 0, 4, (RNDQ, RNDQ, RNDQ_RNSC, EXPi), vcmla), - NUF (vcadd, 0, 4, (RNDQ, RNDQ, RNDQ, EXPi), vcadd), #undef ARM_VARIANT #define ARM_VARIANT & fpu_neon_ext_dotprod @@ -22055,14 +24623,14 @@ static const struct asm_opcode insns[] = #undef ARM_VARIANT #define ARM_VARIANT & fpu_vfp_ext_v1xd /* VFP V1xD (single precision). 
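   vmrs/vmsr become mcCE entries so that Armv8.1-M targets with MVE but
   no scalar FPU still accept them in Thumb state (their THUMB_VARIANT
   is arm_ext_v6t2), e.g.:

     vmrs APSR_nzcv, fpscr   @ the modern spelling of fmstat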
+#undef  THUMB_VARIANT
+#define THUMB_VARIANT  & arm_ext_v6t2
+ mcCE(vmrs,     ef00a10, 2, (APSR_RR, RVC),   vmrs),
+ mcCE(vmsr,     ee00a10, 2, (RVC, RR),        vmsr),
+#undef  THUMB_VARIANT

  /* Moves and type conversions.  */
- cCE("fcpys",   eb00a40, 2, (RVS, RVS),       vfp_sp_monadic),
- cCE("fmrs",    e100a10, 2, (RR, RVS),        vfp_reg_from_sp),
- cCE("fmsr",    e000a10, 2, (RVS, RR),        vfp_sp_from_reg),
  cCE("fmstat",  ef1fa10, 0, (),               noargs),
- cCE("vmrs",    ef00a10, 2, (APSR_RR, RVC),   vmrs),
- cCE("vmsr",    ee00a10, 2, (RVC, RR),        vmsr),
  cCE("fsitos",  eb80ac0, 2, (RVS, RVS),       vfp_sp_monadic),
  cCE("fuitos",  eb80a40, 2, (RVS, RVS),       vfp_sp_monadic),
  cCE("ftosis",  ebd0a40, 2, (RVS, RVS),       vfp_sp_monadic),
@@ -22131,7 +24699,6 @@ static const struct asm_opcode insns[] =
 #define ARM_VARIANT  & fpu_vfp_ext_v1 /* VFP V1 (Double precision).  */

  /* Moves and type conversions.  */
- cCE("fcpyd",   eb00b40, 2, (RVD, RVD),       vfp_dp_rd_rm),
  cCE("fcvtds",  eb70ac0, 2, (RVD, RVS),       vfp_dp_sp_cvt),
  cCE("fcvtsd",  eb70bc0, 2, (RVS, RVD),       vfp_sp_dp_cvt),
  cCE("fmdhr",   e200b10, 2, (RVD, RR),        vfp_dp_rn_rd),
@@ -22167,14 +24734,6 @@ static const struct asm_opcode insns[] =
  cCE("fcmped",  eb40bc0, 2, (RVD, RVD),       vfp_dp_rd_rm),
  cCE("fcmpezd", eb50bc0, 1, (RVD),            vfp_dp_rd),

-#undef  ARM_VARIANT
-#define ARM_VARIANT  & fpu_vfp_ext_v2
-
- cCE("fmsrr",   c400a10, 3, (VRSLST, RR, RR), vfp_sp2_from_reg2),
- cCE("fmrrs",   c500a10, 3, (RR, RR, VRSLST), vfp_reg2_from_sp2),
- cCE("fmdrr",   c400b10, 3, (RVD, RR, RR),    vfp_dp_rm_rd_rn),
- cCE("fmrrd",   c500b10, 3, (RR, RR, RVD),    vfp_dp_rd_rn_rm),
-
 /* Instructions which may belong to either the Neon or VFP instruction sets.
    Individual encoder functions perform additional architecture checks.  */
 #undef  ARM_VARIANT
@@ -22188,15 +24747,11 @@ static const struct asm_opcode insns[] =
  nCE(vnmul, _vnmul, 3, (RVSD, RVSD, RVSD), vfp_nsyn_nmul),
  nCE(vnmla, _vnmla, 3, (RVSD, RVSD, RVSD), vfp_nsyn_nmul),
  nCE(vnmls, _vnmls, 3, (RVSD, RVSD, RVSD), vfp_nsyn_nmul),
- nCE(vcmp,  _vcmp,  2, (RVSD, RSVD_FI0),   vfp_nsyn_cmp),
- nCE(vcmpe, _vcmpe, 2, (RVSD, RSVD_FI0),   vfp_nsyn_cmp),
  NCE(vpush, 0,      1, (VRSDLST),          vfp_nsyn_push),
  NCE(vpop,  0,      1, (VRSDLST),          vfp_nsyn_pop),
  NCE(vcvtz, 0,      2, (RVSD, RVSD),       vfp_nsyn_cvtz),

  /* Mnemonics shared by Neon and VFP.  */
- nCEF(vmul, _vmul,  3, (RNSDQ, oRNSDQ, RNSDQ_RNSC), neon_mul),
- nCEF(vmla, _vmla,  3, (RNSDQ, oRNSDQ, RNSDQ_RNSC), neon_mac_maybe_scalar),
  nCEF(vmls, _vmls,  3, (RNSDQ, oRNSDQ, RNSDQ_RNSC), neon_mac_maybe_scalar),

  NCE(vldm,   c900b00, 2, (RRnpctw, VRSDLST), neon_ldm_stm),
@@ -22206,14 +24761,13 @@ static const struct asm_opcode insns[] =
  NCE(vstmia, c800b00, 2, (RRnpctw, VRSDLST), neon_ldm_stm),
  NCE(vstmdb, d000b00, 2, (RRnpctw, VRSDLST), neon_ldm_stm),

- nCEF(vcvt,  _vcvt,   3, (RNSDQ, RNSDQ, oI32z), neon_cvt),
+ mnCEF(vcvt, _vcvt,   3, (RNSDQMQ, RNSDQMQ, oI32z), neon_cvt),
  nCEF(vcvtr, _vcvt,   2, (RNSDQ, RNSDQ), neon_cvtr),
- NCEF(vcvtb, eb20a40, 2, (RVSD, RVSD), neon_cvtb),
- NCEF(vcvtt, eb20a40, 2, (RVSD, RVSD), neon_cvtt),
+ MNCEF(vcvtb, eb20a40, 3, (RVSDMQ, RVSDMQ, oI32b), neon_cvtb),
+ MNCEF(vcvtt, eb20a40, 3, (RVSDMQ, RVSDMQ, oI32b), neon_cvtt),

  /* NOTE: All VMOV encoding is special-cased!  */
- NCE(vmov,  0, 1, (VMOV), neon_mov),
  NCE(vmovq, 0, 1, (VMOV), neon_mov),

 #undef  THUMB_VARIANT
@@ -22247,38 +24801,24 @@ static const struct asm_opcode insns[] =
  /* integer ops, valid types S8 S16 S32 U8 U16 U32.  */
  NUF(vaba,    0000710, 3, (RNDQ, RNDQ,  RNDQ), neon_dyadic_i_su),
  NUF(vabaq,   0000710, 3, (RNQ,  RNQ,   RNQ),  neon_dyadic_i_su),
- NUF(vhadd,   0000000, 3, (RNDQ, oRNDQ, RNDQ), neon_dyadic_i_su),
  NUF(vhaddq,  0000000, 3, (RNQ,  oRNQ,  RNQ),  neon_dyadic_i_su),
- NUF(vrhadd,  0000100, 3, (RNDQ, oRNDQ, RNDQ), neon_dyadic_i_su),
  NUF(vrhaddq, 0000100, 3, (RNQ,  oRNQ,  RNQ),  neon_dyadic_i_su),
- NUF(vhsub,   0000200, 3, (RNDQ, oRNDQ, RNDQ), neon_dyadic_i_su),
  NUF(vhsubq,  0000200, 3, (RNQ,  oRNQ,  RNQ),  neon_dyadic_i_su),

  /* integer ops, valid types S8 S16 S32 S64 U8 U16 U32 U64.  */
- NUF(vqadd,   0000010, 3, (RNDQ, oRNDQ, RNDQ), neon_dyadic_i64_su),
  NUF(vqaddq,  0000010, 3, (RNQ,  oRNQ,  RNQ),  neon_dyadic_i64_su),
- NUF(vqsub,   0000210, 3, (RNDQ, oRNDQ, RNDQ), neon_dyadic_i64_su),
  NUF(vqsubq,  0000210, 3, (RNQ,  oRNQ,  RNQ),  neon_dyadic_i64_su),
- NUF(vrshl,   0000500, 3, (RNDQ, oRNDQ, RNDQ), neon_rshl),
  NUF(vrshlq,  0000500, 3, (RNQ,  oRNQ,  RNQ),  neon_rshl),
- NUF(vqrshl,  0000510, 3, (RNDQ, oRNDQ, RNDQ), neon_rshl),
  NUF(vqrshlq, 0000510, 3, (RNQ,  oRNQ,  RNQ),  neon_rshl),

  /* If not immediate, fall back to neon_dyadic_i64_su.
-    shl_imm should accept I8 I16 I32 I64,
-    qshl_imm should accept S8 S16 S32 S64 U8 U16 U32 U64.  */
- nUF(vshl,    _vshl,   3, (RNDQ, oRNDQ, RNDQ_I63b), neon_shl_imm),
- nUF(vshlq,   _vshl,   3, (RNQ,  oRNQ,  RNDQ_I63b), neon_shl_imm),
- nUF(vqshl,   _vqshl,  3, (RNDQ, oRNDQ, RNDQ_I63b), neon_qshl_imm),
- nUF(vqshlq,  _vqshl,  3, (RNQ,  oRNQ,  RNDQ_I63b), neon_qshl_imm),
+    shl should accept I8 I16 I32 I64,
+    qshl should accept S8 S16 S32 S64 U8 U16 U32 U64.  */
+ nUF(vshlq,   _vshl,   3, (RNQ,  oRNQ,  RNDQ_I63b), neon_shl),
+ nUF(vqshlq,  _vqshl,  3, (RNQ,  oRNQ,  RNDQ_I63b), neon_qshl),

  /* Logic ops, types optional & ignored.  */
- nUF(vand,    _vand,   3, (RNDQ, oRNDQ, RNDQ_Ibig), neon_logic),
  nUF(vandq,   _vand,   3, (RNQ,  oRNQ,  RNDQ_Ibig), neon_logic),
- nUF(vbic,    _vbic,   3, (RNDQ, oRNDQ, RNDQ_Ibig), neon_logic),
  nUF(vbicq,   _vbic,   3, (RNQ,  oRNQ,  RNDQ_Ibig), neon_logic),
- nUF(vorr,    _vorr,   3, (RNDQ, oRNDQ, RNDQ_Ibig), neon_logic),
  nUF(vorrq,   _vorr,   3, (RNQ,  oRNQ,  RNDQ_Ibig), neon_logic),
- nUF(vorn,    _vorn,   3, (RNDQ, oRNDQ, RNDQ_Ibig), neon_logic),
  nUF(vornq,   _vorn,   3, (RNQ,  oRNQ,  RNDQ_Ibig), neon_logic),
- nUF(veor,    _veor,   3, (RNDQ, oRNDQ, RNDQ),      neon_logic),
  nUF(veorq,   _veor,   3, (RNQ,  oRNQ,  RNQ),       neon_logic),

  /* Bitfield ops, untyped.  */
  NUF(vbsl,    1100110, 3, (RNDQ, RNDQ, RNDQ), neon_bitfield),
@@ -22289,9 +24829,7 @@ static const struct asm_opcode insns[] =
  NUF(vbifq,   1300110, 3, (RNQ,  RNQ,  RNQ),  neon_bitfield),

  /* Int and float variants, types S8 S16 S32 U8 U16 U32 F16 F32.  */
  nUF(vabdq,   _vabd,   3, (RNQ,  oRNQ,  RNQ),  neon_dyadic_if_su),
- nUF(vmax,    _vmax,   3, (RNDQ, oRNDQ, RNDQ), neon_dyadic_if_su),
  nUF(vmaxq,   _vmax,   3, (RNQ,  oRNQ,  RNQ),  neon_dyadic_if_su),
- nUF(vmin,    _vmin,   3, (RNDQ, oRNDQ, RNDQ), neon_dyadic_if_su),
  nUF(vminq,   _vmin,   3, (RNQ,  oRNQ,  RNQ),  neon_dyadic_if_su),

  /* Comparisons.  Types S8 S16 S32 U8 U16 U32 F32.  Non-immediate versions
    fall back to neon_dyadic_if_su.  */
@@ -22322,9 +24860,7 @@ static const struct asm_opcode insns[] =
  /* VMUL takes I8 I16 I32 F32 P8.  */
  nUF(vmulq,   _vmul,   3, (RNQ,  oRNQ,  RNDQ_RNSC), neon_mul),
  /* VQD{R}MULH takes S16 S32.  */
- nUF(vqdmulh,    _vqdmulh,  3, (RNDQ, oRNDQ, RNDQ_RNSC), neon_qdmulh),
  nUF(vqdmulhq,   _vqdmulh,  3, (RNQ,  oRNQ,  RNDQ_RNSC), neon_qdmulh),
- nUF(vqrdmulh,   _vqrdmulh, 3, (RNDQ, oRNDQ, RNDQ_RNSC), neon_qdmulh),
  nUF(vqrdmulhq,  _vqrdmulh, 3, (RNQ,  oRNQ,  RNDQ_RNSC), neon_qdmulh),
  NUF(vacge,      0000e10,   3, (RNDQ, oRNDQ, RNDQ), neon_fcmp_absolute),
  NUF(vacgeq,     0000e10,   3, (RNQ,  oRNQ,  RNQ),  neon_fcmp_absolute),
@@ -22339,7 +24875,6 @@ static const struct asm_opcode insns[] =
  NUF(vrsqrts,    0200f10,   3, (RNDQ, oRNDQ, RNDQ), neon_step),
  NUF(vrsqrtsq,   0200f10,   3, (RNQ,  oRNQ,  RNQ),  neon_step),
  /* ARM v8.1 extension.  */
- nUF (vqrdmlah,  _vqrdmlah, 3, (RNDQ, oRNDQ, RNDQ_RNSC), neon_qrdmlah),
  nUF (vqrdmlahq, _vqrdmlah, 3, (RNQ,  oRNQ,  RNDQ_RNSC), neon_qrdmlah),
  nUF (vqrdmlsh,  _vqrdmlsh, 3, (RNDQ, oRNDQ, RNDQ_RNSC), neon_qrdmlah),
  nUF (vqrdmlshq, _vqrdmlsh, 3, (RNQ,  oRNQ,  RNDQ_RNSC), neon_qrdmlah),
@@ -22351,21 +24886,16 @@ static const struct asm_opcode insns[] =
   /* Data processing with two registers and a shift amount.  */
  /* Right shifts, and variants with rounding.
    Types accepted S8 S16 S32 S64 U8 U16 U32 U64.  */
- NUF(vshr,    0800010, 3, (RNDQ, oRNDQ, I64z), neon_rshift_round_imm),
  NUF(vshrq,   0800010, 3, (RNQ,  oRNQ,  I64z), neon_rshift_round_imm),
- NUF(vrshr,   0800210, 3, (RNDQ, oRNDQ, I64z), neon_rshift_round_imm),
  NUF(vrshrq,  0800210, 3, (RNQ,  oRNQ,  I64z), neon_rshift_round_imm),
  NUF(vsra,    0800110, 3, (RNDQ, oRNDQ, I64),  neon_rshift_round_imm),
  NUF(vsraq,   0800110, 3, (RNQ,  oRNQ,  I64),  neon_rshift_round_imm),
  NUF(vrsra,   0800310, 3, (RNDQ, oRNDQ, I64),  neon_rshift_round_imm),
  NUF(vrsraq,  0800310, 3, (RNQ,  oRNQ,  I64),  neon_rshift_round_imm),
  /* Shift and insert. Sizes accepted 8 16 32 64.  */
- NUF(vsli,    1800510, 3, (RNDQ, oRNDQ, I63), neon_sli),
  NUF(vsliq,   1800510, 3, (RNQ,  oRNQ,  I63), neon_sli),
- NUF(vsri,    1800410, 3, (RNDQ, oRNDQ, I64), neon_sri),
  NUF(vsriq,   1800410, 3, (RNQ,  oRNQ,  I64), neon_sri),
  /* QSHL{U} immediate accepts S8 S16 S32 S64 U8 U16 U32 U64.  */
- NUF(vqshlu,  1800610, 3, (RNDQ, oRNDQ, I63), neon_qshlu_imm),
  NUF(vqshluq, 1800610, 3, (RNQ,  oRNQ,  I63), neon_qshlu_imm),
  /* Right shift immediate, saturating & narrowing, with rounding variants.
    Types accepted S16 S32 S64 U16 U32 U64.  */
@@ -22382,7 +24912,6 @@ static const struct asm_opcode insns[] =

  /* CVT with optional immediate for fixed-point variant.  */
  nUF(vcvtq,   _vcvt,   3, (RNQ, RNQ, oI32b), neon_cvt),

- nUF(vmvn,    _vmvn,   2, (RNDQ, RNDQ_Ibig), neon_mvn),
  nUF(vmvnq,   _vmvn,   2, (RNQ,  RNDQ_Ibig), neon_mvn),

  /* Data processing, three registers of different lengths.  */
@@ -22414,14 +24943,10 @@ static const struct asm_opcode insns[] =
  /* Two registers, miscellaneous.  */
  /* Reverse.  Sizes 8 16 32 (must be < size in opcode).  */
- NUF(vrev64,  1b00000, 2, (RNDQ, RNDQ), neon_rev),
  NUF(vrev64q, 1b00000, 2, (RNQ,  RNQ),  neon_rev),
- NUF(vrev32,  1b00080, 2, (RNDQ, RNDQ), neon_rev),
  NUF(vrev32q, 1b00080, 2, (RNQ,  RNQ),  neon_rev),
- NUF(vrev16,  1b00100, 2, (RNDQ, RNDQ), neon_rev),
  NUF(vrev16q, 1b00100, 2, (RNQ,  RNQ),  neon_rev),
  /* Vector replicate. Sizes 8 16 32.  */
- nCE(vdup,    _vdup,   2, (RNDQ, RR_RNSC), neon_dup),
  nCE(vdupq,   _vdup,   2, (RNQ,  RR_RNSC), neon_dup),
  /* VMOVL. Types S8 S16 S32 U8 U16 U32.  */
  NUF(vmovl,   0800a10, 2, (RNQ, RND), neon_movl),
@@ -22437,9 +24962,7 @@ static const struct asm_opcode insns[] =
  NUF(vuzp,    1b20100, 2, (RNDQ, RNDQ), neon_zip_uzp),
  NUF(vuzpq,   1b20100, 2, (RNQ,  RNQ),  neon_zip_uzp),
  /* VQABS / VQNEG.  Types S8 S16 S32.  */
- NUF(vqabs,   1b00700, 2, (RNDQ, RNDQ), neon_sat_abs_neg),
  NUF(vqabsq,  1b00700, 2, (RNQ,  RNQ),  neon_sat_abs_neg),
- NUF(vqneg,   1b00780, 2, (RNDQ, RNDQ), neon_sat_abs_neg),
  NUF(vqnegq,  1b00780, 2, (RNQ,  RNQ),  neon_sat_abs_neg),
  /* Pairwise, lengthening.  Types S8 S16 S32 U8 U16 U32.  */
  NUF(vpadal,  1b00600, 2, (RNDQ, RNDQ), neon_pair_long),
@@ -22452,10 +24975,8 @@ static const struct asm_opcode insns[] =
  NUF(vrsqrte,  1b30480, 2, (RNDQ, RNDQ), neon_recip_est),
  NUF(vrsqrteq, 1b30480, 2, (RNQ,  RNQ),  neon_recip_est),
  /* VCLS. Types S8 S16 S32.  */
- NUF(vcls,    1b00400, 2, (RNDQ, RNDQ), neon_cls),
  NUF(vclsq,   1b00400, 2, (RNQ,  RNQ),  neon_cls),
  /* VCLZ. Types I8 I16 I32.  */
- NUF(vclz,    1b00480, 2, (RNDQ, RNDQ), neon_clz),
  NUF(vclzq,   1b00480, 2, (RNQ,  RNQ),  neon_clz),
  /* VCNT. Size 8.  */
  NUF(vcnt,    1b00500, 2, (RNDQ, RNDQ), neon_cnt),
@@ -22519,11 +25040,12 @@ static const struct asm_opcode insns[] =
 #define ARM_VARIANT    & fpu_vfp_ext_fma
 #undef  THUMB_VARIANT
 #define THUMB_VARIANT  & fpu_vfp_ext_fma
- /* Mnemonics shared by Neon and VFP.  These are included in the
+ /* Mnemonics shared by Neon, VFP and MVE.  These are included in the
    VFP FMA variant; NEON and VFP FMA always includes the NEON
    FMA instructions.  */
- nCEF(vfma,   _vfma,   3, (RNSDQ, oRNSDQ, RNSDQ), neon_fmac),
- nCEF(vfms,   _vfms,   3, (RNSDQ, oRNSDQ, RNSDQ), neon_fmac),
+ mnCEF(vfma,  _vfma,   3, (RNSDQMQ, oRNSDQMQ, RNSDQMQR), neon_fmac),
+ mnCEF(vfms,  _vfms,   3, (RNSDQMQ, oRNSDQMQ, RNSDQMQ),  neon_fmac),
+
  /* ffmas/ffmad/ffmss/ffmsd are dummy mnemonics to satisfy gas;
    the v form should always be used.  */
  cCE("ffmas", ea00a00, 3, (RVS, RVS, RVS), vfp_sp_dyadic),
@@ -22891,6 +25413,16 @@ static const struct asm_opcode insns[] =
 /* Armv8.1-M Mainline instructions.  */
 #undef  THUMB_VARIANT
 #define THUMB_VARIANT & arm_ext_v8_1m_main
+ toU("cinc",  _cinc,  3, (RRnpcsp, RR_ZR, COND),        t_cond),
+ toU("cinv",  _cinv,  3, (RRnpcsp, RR_ZR, COND),        t_cond),
+ toU("cneg",  _cneg,  3, (RRnpcsp, RR_ZR, COND),        t_cond),
+ toU("csel",  _csel,  4, (RRnpcsp, RR_ZR, RR_ZR, COND), t_cond),
+ toU("csetm", _csetm, 2, (RRnpcsp, COND),               t_cond),
+ toU("cset",  _cset,  2, (RRnpcsp, COND),               t_cond),
+ toU("csinc", _csinc, 4, (RRnpcsp, RR_ZR, RR_ZR, COND), t_cond),
+ toU("csinv", _csinv, 4, (RRnpcsp, RR_ZR, RR_ZR, COND), t_cond),
+ toU("csneg", _csneg, 4, (RRnpcsp, RR_ZR, RR_ZR, COND), t_cond),
+
 toC("bf",     _bf,     2, (EXPs, EXPs),             t_branch_future),
 toU("bfcsel", _bfcsel, 4, (EXPs, EXPs, EXPs, COND), t_branch_future),
 toC("bfx",    _bfx,    2, (EXPs, RRnpcsp),          t_branch_future),
@@ -22906,6 +25438,38 @@ static const struct asm_opcode insns[] =

 #undef  THUMB_VARIANT
 #define THUMB_VARIANT & mve_ext
+ ToC("lsll",    ea50010d, 3, (RRe, RRo, RRnpcsp_I32), mve_scalar_shift),
+ ToC("lsrl",    ea50011f, 3, (RRe, RRo, I32),         mve_scalar_shift),
+ ToC("asrl",    ea50012d, 3, (RRe, RRo, RRnpcsp_I32), mve_scalar_shift),
+ ToC("uqrshll", ea51010d, 4, (RRe, RRo, I48_I64, RRnpcsp), mve_scalar_shift1),
+ ToC("sqrshrl", ea51012d, 4, (RRe, RRo, I48_I64, RRnpcsp), mve_scalar_shift1),
+ ToC("uqshll",  ea51010f, 3, (RRe, RRo, I32),         mve_scalar_shift),
+ ToC("urshrl",  ea51011f, 3, (RRe, RRo, I32),         mve_scalar_shift),
+ ToC("srshrl",  ea51012f, 3, (RRe, RRo, I32),         mve_scalar_shift),
+ ToC("sqshll",  ea51013f, 3, (RRe, RRo, I32),         mve_scalar_shift),
+ ToC("uqrshl",  ea500f0d, 2, (RRnpcsp, RRnpcsp),      mve_scalar_shift),
+ ToC("sqrshr",  ea500f2d, 2, (RRnpcsp, RRnpcsp),      mve_scalar_shift),
+ ToC("uqshl",   ea500f0f, 2, (RRnpcsp, I32),          mve_scalar_shift),
+ ToC("urshr",   ea500f1f, 2, (RRnpcsp, I32),          mve_scalar_shift),
+ ToC("srshr",   ea500f2f, 2, (RRnpcsp, I32),          mve_scalar_shift),
+ ToC("sqshl",   ea500f3f, 2, (RRnpcsp, I32),          mve_scalar_shift),
+
+ ToC("vpt",     ee410f00, 3, (COND, RMQ, RMQRZ), mve_vpt),
+ ToC("vptt",    ee018f00, 3, (COND, RMQ, RMQRZ), mve_vpt),
+ ToC("vpte",    ee418f00, 3, (COND, RMQ, RMQRZ), mve_vpt),
+ ToC("vpttt",   ee014f00, 3, (COND, RMQ, RMQRZ), mve_vpt),
+ ToC("vptte",   ee01cf00, 3, (COND, RMQ, RMQRZ), mve_vpt),
+ ToC("vptet",   ee41cf00, 3, (COND, RMQ, RMQRZ), mve_vpt),
+ ToC("vptee",   ee414f00, 3, (COND, RMQ, RMQRZ), mve_vpt),
+ ToC("vptttt",  ee012f00, 3, (COND, RMQ, RMQRZ), mve_vpt),
+ ToC("vpttte",  ee016f00, 3, (COND, RMQ, RMQRZ), mve_vpt),
+ ToC("vpttet",  ee01ef00, 3, (COND, RMQ, RMQRZ), mve_vpt),
+ ToC("vpttee",  ee01af00, 3, (COND, RMQ, RMQRZ), mve_vpt),
+ ToC("vptett",  ee41af00, 3, (COND, RMQ, RMQRZ), mve_vpt),
+ ToC("vptete",  ee41ef00, 3, (COND, RMQ, RMQRZ), mve_vpt),
+ ToC("vpteet",  ee416f00, 3, (COND, RMQ, RMQRZ), mve_vpt),
+ ToC("vpteee",  ee412f00, 3, (COND, RMQ, RMQRZ), mve_vpt),
+
 ToC("vpst",    fe710f4d, 0, (), mve_vpt),
 ToC("vpstt",   fe318f4d, 0, (), mve_vpt),
 ToC("vpste",   fe718f4d, 0, (), mve_vpt),
@@ -22923,6 +25487,11 @@ static const struct asm_opcode insns[] =
 ToC("vpsteee", fe712f4d, 0, (), mve_vpt),

 /* MVE and MVE FP only.  */
+ mToC("vhcadd", ee000f00, 4, (RMQ, RMQ, RMQ, EXPi), mve_vhcadd),
+ mCEF(vadc,     _vadc,    3, (RMQ, RMQ, RMQ),       mve_vadc),
+ mCEF(vadci,    _vadci,   3, (RMQ, RMQ, RMQ),       mve_vadc),
+ mToC("vsbc",   fe300f00, 3, (RMQ, RMQ, RMQ),       mve_vsbc),
+ mToC("vsbci",  fe301f00, 3, (RMQ, RMQ, RMQ),       mve_vsbc),
 mCEF(vmullb,   _vmullb,  3, (RMQ, RMQ, RMQ),       mve_vmull),
 mCEF(vabav,    _vabav,   3, (RRnpcsp, RMQ, RMQ),   mve_vabav),
 mCEF(vmladav,  _vmladav, 3, (RRe, RMQ, RMQ),       mve_vmladav),
@@ -22948,11 +25517,127 @@ static const struct asm_opcode insns[] =
 mCEF(vld41,    _vld41,   2, (MSTRLST4, ADDRMVE),   mve_vst_vld),
 mCEF(vld42,    _vld42,   2, (MSTRLST4, ADDRMVE),   mve_vst_vld),
 mCEF(vld43,    _vld43,   2, (MSTRLST4, ADDRMVE),   mve_vst_vld),
+ mCEF(vstrb,   _vstrb,   2, (RMQ, ADDRMVE),        mve_vstr_vldr),
+ mCEF(vstrh,   _vstrh,   2, (RMQ, ADDRMVE),        mve_vstr_vldr),
+ mCEF(vstrw,   _vstrw,   2, (RMQ, ADDRMVE),        mve_vstr_vldr),
+ mCEF(vstrd,   _vstrd,   2, (RMQ, ADDRMVE),        mve_vstr_vldr),
+ mCEF(vldrb,   _vldrb,   2, (RMQ, ADDRMVE),        mve_vstr_vldr),
+ mCEF(vldrh,   _vldrh,   2, (RMQ, ADDRMVE),        mve_vstr_vldr),
+ mCEF(vldrw,   _vldrw,   2, (RMQ, ADDRMVE),        mve_vstr_vldr),
+ mCEF(vldrd,   _vldrd,   2, (RMQ, ADDRMVE),        mve_vstr_vldr),
+
+ mCEF(vmovnt,  _vmovnt,  2, (RMQ, RMQ),            mve_movn),
+ mCEF(vmovnb,  _vmovnb,  2, (RMQ, RMQ),            mve_movn),
+ mCEF(vbrsr,   _vbrsr,   3, (RMQ, RMQ, RR),        mve_vbrsr),
+ mCEF(vaddlv,  _vaddlv,  3, (RRe, RRo, RMQ),       mve_vaddlv),
+ mCEF(vaddlva, _vaddlva, 3, (RRe, RRo, RMQ),       mve_vaddlv),
+ mCEF(vaddv,   _vaddv,   2, (RRe, RMQ),            mve_vaddv),
+ mCEF(vaddva,  _vaddva,  2, (RRe, RMQ),            mve_vaddv),
+ mCEF(vddup,   _vddup,   3, (RMQ, RRe, EXPi),      mve_viddup),
+ mCEF(vdwdup,  _vdwdup,  4, (RMQ, RRe, RR, EXPi),  mve_viddup),
+ mCEF(vidup,   _vidup,   3, (RMQ, RRe, EXPi),      mve_viddup),
+ mCEF(viwdup,  _viwdup,  4, (RMQ, RRe, RR, EXPi),  mve_viddup),
+ mToC("vmaxa", ee330e81, 2, (RMQ, RMQ),            mve_vmaxa_vmina),
+ mToC("vmina", ee331e81, 2, (RMQ, RMQ),            mve_vmaxa_vmina),
+ mCEF(vmaxv,   _vmaxv,   2, (RR, RMQ),             mve_vmaxv),
+ mCEF(vmaxav,  _vmaxav,  2, (RR, RMQ),             mve_vmaxv),
+ mCEF(vminv,   _vminv,   2, (RR, RMQ),             mve_vmaxv),
+ mCEF(vminav,  _vminav,  2, (RR, RMQ),             mve_vmaxv),
+
+ mCEF(vmlaldav,   _vmlaldav,   4, (RRe, RRo, RMQ, RMQ), mve_vmlaldav),
+ mCEF(vmlaldava,  _vmlaldava,  4, (RRe, RRo, RMQ, RMQ), mve_vmlaldav),
+ mCEF(vmlaldavx,  _vmlaldavx,  4, (RRe, RRo, RMQ, RMQ), mve_vmlaldav),
+ mCEF(vmlaldavax, _vmlaldavax, 4, (RRe, RRo, RMQ, RMQ), mve_vmlaldav),
+ mCEF(vmlalv,     _vmlaldav,   4, (RRe, RRo, RMQ, RMQ), mve_vmlaldav),
+ mCEF(vmlalva,    _vmlaldava,  4, (RRe, RRo, RMQ, RMQ), mve_vmlaldav),
+ mCEF(vmlsldav,   _vmlsldav,   4, (RRe, RRo, RMQ, RMQ), mve_vmlaldav),
+ mCEF(vmlsldava,  _vmlsldava,  4, (RRe, RRo, RMQ, RMQ), mve_vmlaldav),
+ mCEF(vmlsldavx,  _vmlsldavx,  4, (RRe, RRo, RMQ, RMQ), mve_vmlaldav),
+ mCEF(vmlsldavax, _vmlsldavax, 4, (RRe, RRo, RMQ, RMQ), mve_vmlaldav),
+ mToC("vrmlaldavh",  ee800f00,      4, (RRe, RR, RMQ, RMQ), mve_vrmlaldavh),
+ mToC("vrmlaldavha", ee800f20,      4, (RRe, RR, RMQ, RMQ), mve_vrmlaldavh),
+ mCEF(vrmlaldavhx,   _vrmlaldavhx,  4, (RRe, RR, RMQ, RMQ), mve_vrmlaldavh),
+ mCEF(vrmlaldavhax,  _vrmlaldavhax, 4, (RRe, RR, RMQ, RMQ), mve_vrmlaldavh),
+ mToC("vrmlalvh",    ee800f00,      4, (RRe, RR, RMQ, RMQ), mve_vrmlaldavh),
+ mToC("vrmlalvha",   ee800f20,      4, (RRe, RR, RMQ, RMQ), mve_vrmlaldavh),
+ mCEF(vrmlsldavh,    _vrmlsldavh,   4, (RRe, RR, RMQ, RMQ), mve_vrmlaldavh),
+ mCEF(vrmlsldavha,   _vrmlsldavha,  4, (RRe, RR, RMQ, RMQ), mve_vrmlaldavh),
+ mCEF(vrmlsldavhx,   _vrmlsldavhx,  4, (RRe, RR, RMQ, RMQ), mve_vrmlaldavh),
+ mCEF(vrmlsldavhax,  _vrmlsldavhax, 4, (RRe, RR, RMQ, RMQ), mve_vrmlaldavh),
+
+ mToC("vmlas",  ee011e40, 3, (RMQ, RMQ, RR),  mve_vmlas),
+ mToC("vmulh",  ee010e01, 3, (RMQ, RMQ, RMQ), mve_vmulh),
+ mToC("vrmulh", ee011e01, 3, (RMQ, RMQ, RMQ), mve_vmulh),
+ mToC("vpnot",  fe310f4d, 0, (),              mve_vpnot),
+ mToC("vpsel",  fe310f01, 3, (RMQ, RMQ, RMQ), mve_vpsel),
+
+ mToC("vqdmladh",   ee000e00, 3, (RMQ, RMQ, RMQ),   mve_vqdmladh),
+ mToC("vqdmladhx",  ee001e00, 3, (RMQ, RMQ, RMQ),   mve_vqdmladh),
+ mToC("vqrdmladh",  ee000e01, 3, (RMQ, RMQ, RMQ),   mve_vqdmladh),
+ mToC("vqrdmladhx", ee001e01, 3, (RMQ, RMQ, RMQ),   mve_vqdmladh),
+ mToC("vqdmlsdh",   fe000e00, 3, (RMQ, RMQ, RMQ),   mve_vqdmladh),
+ mToC("vqdmlsdhx",  fe001e00, 3, (RMQ, RMQ, RMQ),   mve_vqdmladh),
+ mToC("vqrdmlsdh",  fe000e01, 3, (RMQ, RMQ, RMQ),   mve_vqdmladh),
+ mToC("vqrdmlsdhx", fe001e01, 3, (RMQ, RMQ, RMQ),   mve_vqdmladh),
+ mToC("vqdmlah",    ee000e60, 3, (RMQ, RMQ, RR),    mve_vqdmlah),
+ mToC("vqdmlash",   ee001e60, 3, (RMQ, RMQ, RR),    mve_vqdmlah),
+ mToC("vqrdmlash",  ee001e40, 3, (RMQ, RMQ, RR),    mve_vqdmlah),
+ mToC("vqdmullt",   ee301f00, 3, (RMQ, RMQ, RMQRR), mve_vqdmull),
+ mToC("vqdmullb",   ee300f00, 3, (RMQ, RMQ, RMQRR), mve_vqdmull),
+ mCEF(vqmovnt,  _vqmovnt,  2, (RMQ, RMQ), mve_vqmovn),
+ mCEF(vqmovnb,  _vqmovnb,  2, (RMQ, RMQ), mve_vqmovn),
+ mCEF(vqmovunt, _vqmovunt, 2, (RMQ, RMQ), mve_vqmovn),
+ mCEF(vqmovunb, _vqmovunb, 2, (RMQ, RMQ), mve_vqmovn),
+
+ mCEF(vshrnt,    _vshrnt,    3, (RMQ, RMQ, I32z), mve_vshrn),
+ mCEF(vshrnb,    _vshrnb,    3, (RMQ, RMQ, I32z), mve_vshrn),
+ mCEF(vrshrnt,   _vrshrnt,   3, (RMQ, RMQ, I32z), mve_vshrn),
+ mCEF(vrshrnb,   _vrshrnb,   3, (RMQ, RMQ, I32z), mve_vshrn),
+ mCEF(vqshrnt,   _vqrshrnt,  3, (RMQ, RMQ, I32z), mve_vshrn),
+ mCEF(vqshrnb,   _vqrshrnb,  3, (RMQ, RMQ, I32z), mve_vshrn),
+ mCEF(vqshrunt,  _vqrshrunt, 3, (RMQ, RMQ, I32z), mve_vshrn),
+ mCEF(vqshrunb,  _vqrshrunb, 3, (RMQ, RMQ, I32z), mve_vshrn),
+ mCEF(vqrshrnt,  _vqrshrnt,  3, (RMQ, RMQ, I32z), mve_vshrn),
+ mCEF(vqrshrnb,  _vqrshrnb,  3, (RMQ, RMQ, I32z), mve_vshrn),
+ mCEF(vqrshrunt, _vqrshrunt, 3, (RMQ, RMQ, I32z), mve_vshrn),
+ mCEF(vqrshrunb, _vqrshrunb, 3, (RMQ, RMQ, I32z), mve_vshrn),
+
+ mToC("vshlc",  eea00fc0, 3, (RMQ, RR, I32z), mve_vshlc),
+ mToC("vshllt", ee201e00, 3, (RMQ, RMQ, I32), mve_vshll),
+ mToC("vshllb", ee200e00, 3, (RMQ, RMQ, I32), mve_vshll),
+
+ toU("dlstp",   _dlstp, 2, (LR, RR),      t_loloop),
+ toU("wlstp",   _wlstp, 3, (LR, RR, EXP), t_loloop),
+ toU("letp",    _letp,  2, (LR, EXP),     t_loloop),
+ toU("lctp",    _lctp,  0, (),            t_loloop),
+
+#undef THUMB_VARIANT
+#define THUMB_VARIANT & mve_fp_ext
+ mToC("vcmul",    ee300e00, 4, (RMQ, RMQ, RMQ, EXPi), mve_vcmul),
+ mToC("vfmas",    ee311e40, 3, (RMQ, RMQ, RR),        mve_vfmas),
+ mToC("vmaxnma",  ee3f0e81, 2, (RMQ, RMQ),            mve_vmaxnma_vminnma),
+ mToC("vminnma",  ee3f1e81, 2, (RMQ, RMQ),            mve_vmaxnma_vminnma),
+ mToC("vmaxnmv",  eeee0f00, 2, (RR, RMQ),             mve_vmaxnmv),
+ mToC("vmaxnmav", eeec0f00, 2, (RR, RMQ),             mve_vmaxnmv),
+ mToC("vminnmv",  eeee0f80, 2, (RR, RMQ),             mve_vmaxnmv),
+ mToC("vminnmav", eeec0f80, 2, (RR, RMQ),             mve_vmaxnmv),

 #undef ARM_VARIANT
-#define ARM_VARIANT & fpu_vfp_ext_v1xd
+#define ARM_VARIANT & fpu_vfp_ext_v1
 #undef THUMB_VARIANT
 #define THUMB_VARIANT & arm_ext_v6t2
+ mnCEF(vmla, _vmla, 3, (RNSDQMQ, oRNSDQMQ, RNSDQ_RNSC_MQ_RR), neon_mac_maybe_scalar),
+ mnCEF(vmul, _vmul, 3, (RNSDQMQ, oRNSDQMQ, RNSDQ_RNSC_MQ_RR), neon_mul),
+
+ mcCE(fcpyd, eb00b40, 2, (RVD, RVD), vfp_dp_rd_rm),
+
+#undef ARM_VARIANT
+#define ARM_VARIANT & fpu_vfp_ext_v1xd
+
+ MNCE(vmov,  0,       1, (VMOV),     neon_mov),
+ mcCE(fmrs,  e100a10, 2, (RR, RVS),  vfp_reg_from_sp),
+ mcCE(fmsr,  e000a10, 2, (RVS, RR),  vfp_sp_from_reg),
+ mcCE(fcpys, eb00a40, 2, (RVS, RVS), vfp_sp_monadic),

 mCEF(vmullt, _vmullt, 3, (RNSDQMQ, oRNSDQMQ, RNSDQ_RNSC_MQ), mve_vmull),
 mnCEF(vadd,  _vadd,   3, (RNSDQMQ, oRNSDQMQ, RNSDQMQR),      neon_addsub_if_i),
@@ -22961,12 +25646,75 @@ static const struct asm_opcode insns[] =
 MNCEF(vabs,  1b10300, 2, (RNSDQMQ, RNSDQMQ), neon_abs_neg),
 MNCEF(vneg,  1b10380, 2, (RNSDQMQ, RNSDQMQ), neon_abs_neg),

-#undef ARM_VARIANT
+ mCEF(vmovlt, _vmovlt, 1, (VMOV), mve_movl),
+ mCEF(vmovlb, _vmovlb, 1, (VMOV), mve_movl),
+
+ mnCE(vcmp,  _vcmp,  3, (RVSD_COND, RSVDMQ_FI0, oRMQRZ), vfp_nsyn_cmp),
+ mnCE(vcmpe, _vcmpe, 3, (RVSD_COND, RSVDMQ_FI0, oRMQRZ), vfp_nsyn_cmp),
+
+#undef ARM_VARIANT
+#define ARM_VARIANT & fpu_vfp_ext_v2
+
+ mcCE(fmsrr, c400a10, 3, (VRSLST, RR, RR), vfp_sp2_from_reg2),
+ mcCE(fmrrs, c500a10, 3, (RR, RR, VRSLST), vfp_reg2_from_sp2),
+ mcCE(fmdrr, c400b10, 3, (RVD, RR, RR),    vfp_dp_rm_rd_rn),
+ mcCE(fmrrd, c500b10, 3, (RR, RR, RVD),    vfp_dp_rd_rn_rm),
+
+#undef ARM_VARIANT
+#define ARM_VARIANT & fpu_vfp_ext_armv8xd
+ mnUF(vcvta,  _vcvta,  2, (RNSDQMQ, oRNSDQMQ),          neon_cvta),
+ mnUF(vcvtp,  _vcvta,  2, (RNSDQMQ, oRNSDQMQ),          neon_cvtp),
+ mnUF(vcvtn,  _vcvta,  3, (RNSDQMQ, oRNSDQMQ, oI32z),   neon_cvtn),
+ mnUF(vcvtm,  _vcvta,  2, (RNSDQMQ, oRNSDQMQ),          neon_cvtm),
+ mnUF(vmaxnm, _vmaxnm, 3, (RNSDQMQ, oRNSDQMQ, RNSDQMQ), vmaxnm),
+ mnUF(vminnm, _vminnm, 3, (RNSDQMQ, oRNSDQMQ, RNSDQMQ), vmaxnm),
+
+#undef ARM_VARIANT
 #define ARM_VARIANT & fpu_neon_ext_v1
- mnUF(vabd,      _vabd,    3, (RNDQMQ, oRNDQMQ, RNDQMQ),      neon_dyadic_if_su),
+ mnUF(vabd,     _vabd,    3, (RNDQMQ, oRNDQMQ, RNDQMQ),      neon_dyadic_if_su),
 mnUF(vabdl,     _vabdl,   3, (RNQMQ, RNDMQ, RNDMQ),          neon_dyadic_long),
 mnUF(vaddl,     _vaddl,   3, (RNQMQ, RNDMQ, RNDMQR),         neon_dyadic_long),
 mnUF(vsubl,     _vsubl,   3, (RNQMQ, RNDMQ, RNDMQR),         neon_dyadic_long),
+ mnUF(vand,     _vand,    3, (RNDQMQ, oRNDQMQ, RNDQMQ_Ibig), neon_logic),
+ mnUF(vbic,     _vbic,    3, (RNDQMQ, oRNDQMQ, RNDQMQ_Ibig), neon_logic),
+ mnUF(vorr,     _vorr,    3, (RNDQMQ, oRNDQMQ, RNDQMQ_Ibig), neon_logic),
+ mnUF(vorn,     _vorn,    3, (RNDQMQ, oRNDQMQ, RNDQMQ_Ibig), neon_logic),
+ mnUF(veor,     _veor,    3, (RNDQMQ, oRNDQMQ, RNDQMQ),      neon_logic),
+ MNUF(vcls,     1b00400,  2, (RNDQMQ, RNDQMQ),               neon_cls),
+ MNUF(vclz,     1b00480,  2, (RNDQMQ, RNDQMQ),               neon_clz),
+ mnCE(vdup,     _vdup,    2, (RNDQMQ, RR_RNSC),              neon_dup),
+ MNUF(vhadd,    00000000, 3, (RNDQMQ, oRNDQMQ, RNDQMQR),     neon_dyadic_i_su),
+ MNUF(vrhadd,   00000100, 3, (RNDQMQ, oRNDQMQ, RNDQMQ),      neon_dyadic_i_su),
+ MNUF(vhsub,    00000200, 3, (RNDQMQ, oRNDQMQ, RNDQMQR),     neon_dyadic_i_su),
+ mnUF(vmin,     _vmin,    3, (RNDQMQ, oRNDQMQ, RNDQMQ),      neon_dyadic_if_su),
+ mnUF(vmax,     _vmax,    3, (RNDQMQ, oRNDQMQ, RNDQMQ),      neon_dyadic_if_su),
+ MNUF(vqadd,    0000010,  3, (RNDQMQ, oRNDQMQ, RNDQMQR),     neon_dyadic_i64_su),
+ MNUF(vqsub,    0000210,  3, (RNDQMQ, oRNDQMQ, RNDQMQR),     neon_dyadic_i64_su),
+ mnUF(vmvn,     _vmvn,    2, (RNDQMQ, RNDQMQ_Ibig),          neon_mvn),
+ MNUF(vqabs,    1b00700,  2, (RNDQMQ, RNDQMQ),               neon_sat_abs_neg),
+ MNUF(vqneg,    1b00780,  2, (RNDQMQ, RNDQMQ),               neon_sat_abs_neg),
+ mnUF(vqrdmlah, _vqrdmlah, 3, (RNDQMQ, oRNDQMQ, RNDQ_RNSC_RR),   neon_qrdmlah),
+ mnUF(vqdmulh,  _vqdmulh,  3, (RNDQMQ, oRNDQMQ, RNDQMQ_RNSC_RR), neon_qdmulh),
+ mnUF(vqrdmulh, _vqrdmulh, 3, (RNDQMQ, oRNDQMQ, RNDQMQ_RNSC_RR), neon_qdmulh),
+ MNUF(vqrshl,   0000510,  3, (RNDQMQ, oRNDQMQ, RNDQMQR),     neon_rshl),
+ MNUF(vrshl,    0000500,  3, (RNDQMQ, oRNDQMQ, RNDQMQR),     neon_rshl),
+ MNUF(vshr,     0800010,  3, (RNDQMQ, oRNDQMQ, I64z),        neon_rshift_round_imm),
+ MNUF(vrshr,    0800210,  3, (RNDQMQ, oRNDQMQ, I64z),        neon_rshift_round_imm),
+ MNUF(vsli,     1800510,  3, (RNDQMQ, oRNDQMQ, I63),         neon_sli),
+ MNUF(vsri,     1800410,  3, (RNDQMQ, oRNDQMQ, I64z),        neon_sri),
+ MNUF(vrev64,   1b00000,  2, (RNDQMQ, RNDQMQ),               neon_rev),
+ MNUF(vrev32,   1b00080,  2, (RNDQMQ, RNDQMQ),               neon_rev),
+ MNUF(vrev16,   1b00100,  2, (RNDQMQ, RNDQMQ),               neon_rev),
+ mnUF(vshl,     _vshl,    3, (RNDQMQ, oRNDQMQ, RNDQMQ_I63b_RR), neon_shl),
+ mnUF(vqshl,    _vqshl,   3, (RNDQMQ, oRNDQMQ, RNDQMQ_I63b_RR), neon_qshl),
+ MNUF(vqshlu,   1800610,  3, (RNDQMQ, oRNDQMQ, I63),            neon_qshlu_imm),
+
+#undef ARM_VARIANT
+#define ARM_VARIANT & arm_ext_v8_3
+#undef THUMB_VARIANT
+#define THUMB_VARIANT & arm_ext_v6t2_v8m
+ MNUF (vcadd, 0, 4, (RNDQMQ, RNDQMQ, RNDQMQ, EXPi),      vcadd),
+ MNUF (vcmla, 0, 4, (RNDQMQ, RNDQMQ, RNDQMQ_RNSC, EXPi), vcmla),
 };
 #undef ARM_VARIANT
 #undef THUMB_VARIANT
@@ -25119,11 +27867,12 @@ md_apply_fix (fixS * fixP,
       break;

     case BFD_RELOC_ARM_SMC:
-      if (((unsigned long) value) > 0xffff)
+      if (((unsigned long) value) > 0xf)
        as_bad_where (fixP->fx_file, fixP->fx_line,
                      _("invalid smc expression"));
+
       newval = md_chars_to_number (buf, INSN_SIZE);
-      newval |= (value & 0xf) | ((value & 0xfff0) << 4);
+      newval |= (value & 0xf);
       md_number_to_chars (buf, newval, INSN_SIZE);
       break;
@@ -26264,9 +29013,10 @@ md_apply_fix (fixS * fixP,
        }

       bfd_vma insn = get_thumb32_insn (buf);
-      /* le lr,