X-Git-Url: http://drtracing.org/?a=blobdiff_plain;f=gas%2Fconfig%2Ftc-arm.c;h=2a884b13dcdeb11066ebc23d73b08ca48fa68929;hb=3528c362d9471524cfe8a76c692081838b292d64;hp=b806916cc59400c3a435ccc1718c4157273e1f50;hpb=1b8833198c014f2d7f7a67097061f3b990707084;p=deliverable%2Fbinutils-gdb.git

diff --git a/gas/config/tc-arm.c b/gas/config/tc-arm.c
index b806916cc5..2a884b13dc 100644
--- a/gas/config/tc-arm.c
+++ b/gas/config/tc-arm.c
@@ -32,6 +32,7 @@
 #include "obstack.h"
 #include "libiberty.h"
 #include "opcode/arm.h"
+#include "cpu-arm.h"
 
 #ifdef OBJ_ELF
 #include "elf/arm.h"
@@ -106,6 +107,15 @@ enum arm_float_abi
    should define CPU_DEFAULT here.  */
 #endif
 
+/* Perform range checks on positive and negative overflows by checking if the
+   VALUE given fits within the range of an BITS sized immediate.  */
+static bfd_boolean out_of_range_p (offsetT value, offsetT bits)
+ {
+  gas_assert (bits < (offsetT)(sizeof (value) * 8));
+  return (value & ~((1 << bits)-1))
+	  && ((value & ~((1 << bits)-1)) != ~((1 << bits)-1));
+}
+
 #ifndef FPU_DEFAULT
 # ifdef TE_LINUX
 #  define FPU_DEFAULT FPU_ARCH_FPA
@@ -265,6 +275,10 @@ static const arm_feature_set arm_ext_sb =
   ARM_FEATURE_CORE_HIGH (ARM_EXT2_SB);
 static const arm_feature_set arm_ext_predres =
   ARM_FEATURE_CORE_HIGH (ARM_EXT2_PREDRES);
+static const arm_feature_set arm_ext_bf16 =
+  ARM_FEATURE_CORE_HIGH (ARM_EXT2_BF16);
+static const arm_feature_set arm_ext_i8mm =
+  ARM_FEATURE_CORE_HIGH (ARM_EXT2_I8MM);
 
 static const arm_feature_set arm_arch_any = ARM_ANY;
 #ifdef OBJ_ELF
@@ -345,6 +359,7 @@ static arm_feature_set selected_fpu = FPU_NONE;
 /* Feature bits selected by the last .object_arch directive.  */
 static arm_feature_set selected_object_arch = ARM_ARCH_NONE;
 /* Must be long enough to hold any of the names in arm_cpus.  */
+static const struct arm_ext_table * selected_ctx_ext_table = NULL;
 static char selected_cpu_name[20];
 
 extern FLONUM_TYPE generic_floating_point_number;
@@ -436,6 +451,7 @@ enum neon_el_type
   NT_float,
   NT_poly,
   NT_signed,
+  NT_bfloat,
   NT_unsigned
 };
 
@@ -883,6 +899,7 @@ struct asm_opcode
 	_("cannot use writeback with PC-relative addressing")
 #define BAD_RANGE	_("branch out of range")
 #define BAD_FP16	_("selected processor does not support fp16 instruction")
+#define BAD_BF16	_("selected processor does not support bf16 instruction")
 #define UNPRED_REG(R)	_("using " R " results in unpredictable behaviour")
 #define THUMB1_RELOC_ONLY  _("relocation valid in thumb1 code only")
 #define MVE_NOT_IT	_("Warning: instruction is UNPREDICTABLE in an IT " \
@@ -1009,6 +1026,9 @@ static void it_fsm_post_encode (void);
     }							\
   while (0)
 
+/* Toggle value[pos].  */
+#define TOGGLE_BIT(value, pos) (value ^ (1 << pos))
+
 /* Pure syntax.	 */
 
 /* This array holds the chars that always start a comment.  If the
@@ -1034,7 +1054,7 @@ const char EXP_CHARS[] = "eE";
 /* As in 0f12.456  */
 /* or	 0d1.2345e12  */
 
-const char FLT_CHARS[] = "rRsSfFdDxXeEpP";
+const char FLT_CHARS[] = "rRsSfFdDxXeEpPHh";
 
 /* Prefix characters that indicate the start of an immediate
    value.  */
@@ -1044,6 +1064,16 @@ const char FLT_CHARS[] = "rRsSfFdDxXeEpP";
 
 #define skip_whitespace(str)  do { if (*(str) == ' ') ++(str); } while (0)
 
+enum fp_16bit_format
+{
+  ARM_FP16_FORMAT_IEEE		= 0x1,
+  ARM_FP16_FORMAT_ALTERNATIVE	= 0x2,
+  ARM_FP16_FORMAT_DEFAULT	= 0x3
+};
+
+static enum fp_16bit_format fp16_format = ARM_FP16_FORMAT_DEFAULT;
+
+
 static inline int
 skip_past_char (char ** str, char c)
 {
@@ -1185,6 +1215,57 @@ md_atof (int type, char * litP, int * sizeP)
 
   switch (type)
     {
+    case 'H':
+    case 'h':
+      prec = 1;
+      break;
+
+    /* If this is a bfloat16, then parse it slightly differently, as it
+       does not follow the IEEE specification for floating point numbers
+       exactly.  */
+    case 'b':
+      {
+	FLONUM_TYPE generic_float;
+
+	t = atof_ieee_detail (input_line_pointer, 1, 8, words, &generic_float);
+
+	if (t)
+	  input_line_pointer = t;
+	else
+	  return _("invalid floating point number");
+
+	switch (generic_float.sign)
+	  {
+	  /* Is +Inf.  */
+	  case 'P':
+	    words[0] = 0x7f80;
+	    break;
+
+	  /* Is -Inf.  */
+	  case 'N':
+	    words[0] = 0xff80;
+	    break;
+
+	  /* Is NaN.  */
+	  /* bfloat16 has two types of NaN - quiet and signalling.
+	     Quiet NaN has bit[6] == 1 && faction != 0, whereas
+	     signalling NaN's have bit[0] == 0 && fraction != 0.
+	     Chosen this specific encoding as it is the same form
+	     as used by other IEEE 754 encodings in GAS.  */
+	  case 0:
+	    words[0] = 0x7fff;
+	    break;
+
+	  default:
+	    break;
+	  }
+
+	*sizeP = 2;
+
+	md_number_to_chars (litP, (valueT) words[0], sizeof (LITTLENUM_TYPE));
+
+	return NULL;
+      }
     case 'f':
     case 'F':
     case 's':
@@ -1219,34 +1300,29 @@ md_atof (int type, char * litP, int * sizeP)
     input_line_pointer = t;
   *sizeP = prec * sizeof (LITTLENUM_TYPE);
 
-  if (target_big_endian)
-    {
-      for (i = 0; i < prec; i++)
-	{
-	  md_number_to_chars (litP, (valueT) words[i], sizeof (LITTLENUM_TYPE));
-	  litP += sizeof (LITTLENUM_TYPE);
-	}
-    }
+  if (target_big_endian || prec == 1)
+    for (i = 0; i < prec; i++)
+      {
+	md_number_to_chars (litP, (valueT) words[i], sizeof (LITTLENUM_TYPE));
+	litP += sizeof (LITTLENUM_TYPE);
+      }
+  else if (ARM_CPU_HAS_FEATURE (cpu_variant, fpu_endian_pure))
+    for (i = prec - 1; i >= 0; i--)
+      {
+	md_number_to_chars (litP, (valueT) words[i], sizeof (LITTLENUM_TYPE));
+	litP += sizeof (LITTLENUM_TYPE);
+      }
   else
-    {
-      if (ARM_CPU_HAS_FEATURE (cpu_variant, fpu_endian_pure))
-	for (i = prec - 1; i >= 0; i--)
-	  {
-	    md_number_to_chars (litP, (valueT) words[i], sizeof (LITTLENUM_TYPE));
-	    litP += sizeof (LITTLENUM_TYPE);
-	  }
-      else
-	/* For a 4 byte float the order of elements in `words' is 1 0.
-	   For an 8 byte float the order is 1 0 3 2.  */
-	for (i = 0; i < prec; i += 2)
-	  {
-	    md_number_to_chars (litP, (valueT) words[i + 1],
-				sizeof (LITTLENUM_TYPE));
-	    md_number_to_chars (litP + sizeof (LITTLENUM_TYPE),
-				(valueT) words[i], sizeof (LITTLENUM_TYPE));
-	    litP += 2 * sizeof (LITTLENUM_TYPE);
-	  }
-    }
+    /* For a 4 byte float the order of elements in `words' is 1 0.
+       For an 8 byte float the order is 1 0 3 2.  */
+    for (i = 0; i < prec; i += 2)
+      {
+	md_number_to_chars (litP, (valueT) words[i + 1],
+			    sizeof (LITTLENUM_TYPE));
+	md_number_to_chars (litP + sizeof (LITTLENUM_TYPE),
+			    (valueT) words[i], sizeof (LITTLENUM_TYPE));
+	litP += 2 * sizeof (LITTLENUM_TYPE);
+      }
 
   return NULL;
 }
@@ -1445,6 +1521,28 @@ parse_neon_type (struct neon_type *type, char **str)
 	  thissize = 64;
 	  ptr++;
 	  goto done;
+	case 'b':
+	  thistype = NT_bfloat;
+	  switch (TOLOWER (*(++ptr)))
+	    {
+	    case 'f':
+	      ptr += 1;
+	      thissize = strtoul (ptr, &ptr, 10);
+	      if (thissize != 16)
+		{
+		  as_bad (_("bad size %d in type specifier"), thissize);
+		  return FAIL;
+		}
+	      goto done;
+	    case '0': case '1': case '2': case '3': case '4':
+	    case '5': case '6': case '7': case '8': case '9':
+	    case ' ': case '.':
+	      as_bad (_("unexpected type character `b' -- did you mean `bf'?"));
+	      return FAIL;
+	    default:
+	      break;
+	    }
+	  break;
 	default:
 	  as_bad (_("unexpected character `%c' in type specifier"), *ptr);
 	  return FAIL;
@@ -4922,6 +5020,55 @@ pe_directive_secrel (int dummy ATTRIBUTE_UNUSED)
 }
 #endif /* TE_PE */
 
+int
+arm_is_largest_exponent_ok (int precision)
+{
+  /* precision == 1 ensures that this will only return
+     true for 16 bit floats.  */
+  return (precision == 1) && (fp16_format == ARM_FP16_FORMAT_ALTERNATIVE);
+}
+
+static void
+set_fp16_format (int dummy ATTRIBUTE_UNUSED)
+{
+  char saved_char;
+  char* name;
+  enum fp_16bit_format new_format;
+
+  new_format = ARM_FP16_FORMAT_DEFAULT;
+
+  name = input_line_pointer;
+  while (*input_line_pointer && !ISSPACE (*input_line_pointer))
+    input_line_pointer++;
+
+  saved_char = *input_line_pointer;
+  *input_line_pointer = 0;
+
+  if (strcasecmp (name, "ieee") == 0)
+    new_format = ARM_FP16_FORMAT_IEEE;
+  else if (strcasecmp (name, "alternative") == 0)
+    new_format = ARM_FP16_FORMAT_ALTERNATIVE;
+  else
+    {
+      as_bad (_("unrecognised float16 format \"%s\""), name);
+      goto cleanup;
+    }
+
+  /* Only set fp16_format if it is still the default (aka not already
+     been set yet).  */
+  if (fp16_format == ARM_FP16_FORMAT_DEFAULT)
+    fp16_format = new_format;
+  else
+    {
+      if (new_format != fp16_format)
+	as_warn (_("float16 format cannot be set more than once, ignoring."));
+    }
+
+cleanup:
+  *input_line_pointer = saved_char;
+  ignore_rest_of_line ();
+}
+
 /* This table describes all the machine specific pseudo-ops the assembler
    has to support.  The fields are:
      pseudo-op name without dot
@@ -4989,6 +5136,7 @@ const pseudo_typeS md_pseudo_table[] =
   { "extend",	   float_cons, 'x' },
   { "ldouble",	   float_cons, 'x' },
   { "packed",	   float_cons, 'p' },
+  { "bfloat16",	   float_cons, 'b' },
 #ifdef TE_PE
   {"secrel32", pe_directive_secrel, 0},
 #endif
@@ -4999,9 +5147,12 @@ const pseudo_typeS md_pseudo_table[] =
   {"asmfunc",      s_ccs_asmfunc,    0},
   {"endasmfunc",   s_ccs_endasmfunc, 0},
 
+  {"float16", float_cons, 'h' },
+  {"float16_format", set_fp16_format, 0 },
+
   { 0, 0, 0 }
 };
-
+
 /* Parser functions used exclusively in instruction operands.  */
 
 /* Generic immediate-value read function for use in insn parsing.
@@ -6678,8 +6829,10 @@ parse_neon_mov (char **str, int *which_operand)
 	      inst.operands[i].present = 1;
 	    }
 	}
-      else if ((val = arm_typed_reg_parse (&ptr, REG_TYPE_NSDQ, &rtype,
-					   &optype)) != FAIL)
+      else if (((val = arm_typed_reg_parse (&ptr, REG_TYPE_NSDQ, &rtype,
+		&optype)) != FAIL)
+	       || ((val = arm_typed_reg_parse (&ptr, REG_TYPE_MQ, &rtype,
+		   &optype)) != FAIL))
 	{
 	  /* Case 0: VMOV<c><q> <Qd>, <Qm>
 	     Case 1: VMOV<c><q> <Dd>, <Dm>
@@ -6902,6 +7055,7 @@ enum operand_parse_code
   OP_RNSD,      /* Neon single or double precision register */
   OP_RNDQ,      /* Neon double or quad precision register */
   OP_RNDQMQ,     /* Neon double, quad or MVE vector register.  */
+  OP_RNDQMQR,   /* Neon double, quad, MVE vector or ARM register.  */
   OP_RNSDQ,	/* Neon single, double or quad precision register */
   OP_RNSC,      /* Neon scalar D[X] */
   OP_RVC,	/* VFP control register */
@@ -6922,12 +7076,14 @@ enum operand_parse_code
 		   GPR (no SP/SP)  */
   OP_RMQ,	/* MVE vector register.  */
   OP_RMQRZ,	/* MVE vector or ARM register including ZR.  */
+  OP_RMQRR,     /* MVE vector or ARM register.  */
 
   /* New operands for Armv8.1-M Mainline.  */
   OP_LR,	/* ARM LR register */
   OP_RRe,	/* ARM register, only even numbered.  */
   OP_RRo,	/* ARM register, only odd numbered, not r13 or r15.  */
   OP_RRnpcsp_I32, /* ARM register (no BadReg) or literal 1 .. 32 */
+  OP_RR_ZR,	/* ARM register or ZR but no PC */
 
   OP_REGLST,	/* ARM register list */
   OP_CLRMLST,	/* CLRM register list */
@@ -6950,11 +7106,21 @@ enum operand_parse_code
   OP_RNSDQ_RNSC, /* Vector S, D or Q reg, or Neon scalar.  */
   OP_RNSDQ_RNSC_MQ, /* Vector S, D or Q reg, Neon scalar or MVE vector register.
 		     */
+  OP_RNSDQ_RNSC_MQ_RR, /* Vector S, D or Q reg, or MVE vector reg , or Neon
+			  scalar, or ARM register.  */
   OP_RNDQ_RNSC, /* Neon D or Q reg, or Neon scalar.  */
+  OP_RNDQ_RNSC_RR, /* Neon D or Q reg, Neon scalar, or ARM register.  */
+  OP_RNDQMQ_RNSC_RR, /* Neon D or Q reg, Neon scalar, MVE vector or ARM
+			register.  */
+  OP_RNDQMQ_RNSC, /* Neon D, Q or MVE vector reg, or Neon scalar.  */
   OP_RND_RNSC,  /* Neon D reg, or Neon scalar.  */
   OP_VMOV,      /* Neon VMOV operands.  */
   OP_RNDQ_Ibig,	/* Neon D or Q reg, or big immediate for logic and VMVN.  */
+  /* Neon D, Q or MVE vector register, or big immediate for logic and VMVN.  */
+  OP_RNDQMQ_Ibig,
   OP_RNDQ_I63b, /* Neon D or Q reg, or immediate for shift.  */
+  OP_RNDQMQ_I63b_RR, /* Neon D or Q reg, immediate for shift, MVE vector or
+			ARM register.  */
   OP_RIWR_I32z, /* iWMMXt wR register, or immediate 0 .. 32 for iWMMXt2.  */
   OP_VLDR,	/* VLDR operand.  */
 
@@ -6967,6 +7133,7 @@ enum operand_parse_code
   OP_I31w,	/*		   0 .. 31, optional trailing ! */
   OP_I32,	/*		   1 .. 32 */
   OP_I32z,	/*		   0 .. 32 */
+  OP_I48_I64,	/*		   48 or 64 */
   OP_I63,	/*		   0 .. 63 */
   OP_I63s,	/*		 -64 .. 63 */
   OP_I64,	/*		   1 .. 64 */
@@ -7118,6 +7285,25 @@ parse_operands (char *str, const unsigned int *pattern, bfd_boolean thumb)
     }								\
   while (0)
 
+#define po_imm1_or_imm2_or_fail(imm1, imm2, popt)		\
+  do								\
+    {								\
+      expressionS exp;						\
+      my_get_expression (&exp, &str, popt);			\
+      if (exp.X_op != O_constant)				\
+	{							\
+	  inst.error = _("constant expression required");	\
+	  goto failure;						\
+	}							\
+      if (exp.X_add_number != imm1 && exp.X_add_number != imm2) \
+	{							\
+	  inst.error = _("immediate value 48 or 64 expected");	\
+	  goto failure;						\
+	}							\
+      inst.operands[i].imm = exp.X_add_number;			\
+    }								\
+  while (0)
+
 #define po_scalar_or_goto(elsz, label, reg_type)			\
   do									\
     {									\
@@ -7220,7 +7406,20 @@ parse_operands (char *str, const unsigned int *pattern, bfd_boolean thumb)
 	  break;
 	  /* Also accept generic coprocessor regs for unknown registers.  */
 	  coproc_reg:
-	  po_reg_or_fail (REG_TYPE_CN);
+	  po_reg_or_goto (REG_TYPE_CN, vpr_po);
+	  break;
+	  /* Also accept P0 or p0 for VPR.P0.  Since P0 is already an
+	     existing register with a value of 0, this seems like the
+	     best way to parse P0.  */
+	  vpr_po:
+	  if (strncasecmp (str, "P0", 2) == 0)
+	    {
+	      str += 2;
+	      inst.operands[i].isreg = 1;
+	      inst.operands[i].reg = 13;
+	    }
+	  else
+	    goto failure;
 	  break;
 	case OP_RMF:   po_reg_or_fail (REG_TYPE_MVF);	  break;
 	case OP_RMD:   po_reg_or_fail (REG_TYPE_MVD);	  break;
@@ -7239,6 +7438,10 @@ parse_operands (char *str, const unsigned int *pattern, bfd_boolean thumb)
 	try_nq:
 	case OP_RNQ:   po_reg_or_fail (REG_TYPE_NQ);      break;
 	case OP_RNSD:  po_reg_or_fail (REG_TYPE_NSD);     break;
+	case OP_RNDQMQR:
+	  po_reg_or_goto (REG_TYPE_RN, try_rndqmq);
+	  break;
+	try_rndqmq:
 	case OP_oRNDQMQ:
 	case OP_RNDQMQ:
 	  po_reg_or_goto (REG_TYPE_MQ, try_rndq);
@@ -7268,6 +7471,10 @@ parse_operands (char *str, const unsigned int *pattern, bfd_boolean thumb)
 	  po_reg_or_fail (REG_TYPE_NSDQ);
 	  inst.error = 0;
 	  break;
+	case OP_RMQRR:
+	  po_reg_or_goto (REG_TYPE_RN, try_rmq);
+	  break;
+	try_rmq:
 	case OP_RMQ:
 	  po_reg_or_fail (REG_TYPE_MQ);
 	  break;
@@ -7317,6 +7524,10 @@ parse_operands (char *str, const unsigned int *pattern, bfd_boolean thumb)
 	  }
 	  break;
 
+	case OP_RNSDQ_RNSC_MQ_RR:
+	  po_reg_or_goto (REG_TYPE_RN, try_rnsdq_rnsc_mq);
+	  break;
+	try_rnsdq_rnsc_mq:
 	case OP_RNSDQ_RNSC_MQ:
 	  po_reg_or_goto (REG_TYPE_MQ, try_rnsdq_rnsc);
 	  break;
@@ -7344,6 +7555,17 @@ parse_operands (char *str, const unsigned int *pattern, bfd_boolean thumb)
 	  }
 	  break;
 
+	case OP_RNDQMQ_RNSC_RR:
+	  po_reg_or_goto (REG_TYPE_MQ, try_rndq_rnsc_rr);
+	  break;
+	try_rndq_rnsc_rr:
+	case OP_RNDQ_RNSC_RR:
+	  po_reg_or_goto (REG_TYPE_RN, try_rndq_rnsc);
+	  break;
+	case OP_RNDQMQ_RNSC:
+	  po_reg_or_goto (REG_TYPE_MQ, try_rndq_rnsc);
+	  break;
+	try_rndq_rnsc:
 	case OP_RNDQ_RNSC:
 	  {
 	    po_scalar_or_goto (8, try_ndq, REG_TYPE_VFD);
@@ -7368,6 +7590,10 @@ parse_operands (char *str, const unsigned int *pattern, bfd_boolean thumb)
 	  po_misc_or_fail (parse_neon_mov (&str, &i) == FAIL);
 	  break;
 
+	case OP_RNDQMQ_Ibig:
+	  po_reg_or_goto (REG_TYPE_MQ, try_rndq_ibig);
+	  break;
+	try_rndq_ibig:
 	case OP_RNDQ_Ibig:
 	  {
 	    po_reg_or_goto (REG_TYPE_NDQ, try_immbig);
@@ -7384,6 +7610,13 @@ parse_operands (char *str, const unsigned int *pattern, bfd_boolean thumb)
 	  }
 	  break;
 
+	case OP_RNDQMQ_I63b_RR:
+	  po_reg_or_goto (REG_TYPE_MQ, try_rndq_i63b_rr);
+	  break;
+	try_rndq_i63b_rr:
+	  po_reg_or_goto (REG_TYPE_RN, try_rndq_i63b);
+	  break;
+	try_rndq_i63b:
 	case OP_RNDQ_I63b:
 	  {
 	    po_reg_or_goto (REG_TYPE_NDQ, try_shimm);
@@ -7415,6 +7648,7 @@ parse_operands (char *str, const unsigned int *pattern, bfd_boolean thumb)
 	case OP_I31:	 po_imm_or_fail (  0,	  31, FALSE);	break;
 	case OP_I32:	 po_imm_or_fail (  1,	  32, FALSE);	break;
 	case OP_I32z:	 po_imm_or_fail (  0,     32, FALSE);   break;
+	case OP_I48_I64: po_imm1_or_imm2_or_fail (48, 64, FALSE); break;
 	case OP_I63s:	 po_imm_or_fail (-64,	  63, FALSE);	break;
 	case OP_I63:	 po_imm_or_fail (  0,     63, FALSE);   break;
 	case OP_I64:	 po_imm_or_fail (  1,     64, FALSE);   break;
@@ -7513,6 +7747,9 @@ parse_operands (char *str, const unsigned int *pattern, bfd_boolean thumb)
 	case OP_RRnpc_I0: po_reg_or_goto (REG_TYPE_RN, I0);   break;
 	I0:		  po_imm_or_fail (0, 0, FALSE);	      break;
 
+	case OP_RRnpcsp_I32: po_reg_or_goto (REG_TYPE_RN, I32);	break;
+	I32:		     po_imm_or_fail (1, 32, FALSE);	break;
+
 	case OP_RF_IF:    po_reg_or_goto (REG_TYPE_FN, IF);   break;
 	IF:
 	  if (!is_immediate_prefix (*str))
@@ -7744,6 +7981,8 @@ parse_operands (char *str, const unsigned int *pattern, bfd_boolean thumb)
 	case OP_oRMQRZ:
 	  po_reg_or_goto (REG_TYPE_MQ, try_rr_zr);
 	  break;
+
+	case OP_RR_ZR:
 	try_rr_zr:
 	  po_reg_or_goto (REG_TYPE_RN, ZR);
 	  break;
@@ -7772,6 +8011,7 @@ parse_operands (char *str, const unsigned int *pattern, bfd_boolean thumb)
 
 	case OP_oRRnpcsp:
 	case OP_RRnpcsp:
+	case OP_RRnpcsp_I32:
 	  if (inst.operands[i].isreg)
 	    {
 	      if (inst.operands[i].reg == REG_PC)
@@ -7830,6 +8070,7 @@ parse_operands (char *str, const unsigned int *pattern, bfd_boolean thumb)
 
 	case OP_RMQRZ:
 	case OP_oRMQRZ:
+	case OP_RR_ZR:
 	  if (!inst.operands[i].iszr && inst.operands[i].reg == REG_PC)
 	    inst.error = BAD_PC;
 	  break;
@@ -8650,6 +8891,11 @@ move_or_literal_pool (int i, enum lit_type t, bfd_boolean mode_3)
 		      inst.instruction |= (imm & 0x0800) << 15;
 		      inst.instruction |= (imm & 0x0700) << 4;
 		      inst.instruction |= (imm & 0x00ff);
+		      /*  In case this replacement is being done on Armv8-M
+			  Baseline we need to make sure to disable the
+			  instruction size check, as otherwise GAS will reject
+			  the use of this T32 instruction.  */
+		      inst.size_req = 0;
 		      return TRUE;
 		    }
 		}
@@ -9774,10 +10020,42 @@ do_vmrs (void)
       return;
     }
 
-  /* MVFR2 is only valid at ARMv8-A.  */
-  if (inst.operands[1].reg == 5)
-    constraint (!ARM_CPU_HAS_FEATURE (cpu_variant, fpu_vfp_ext_armv8),
-		_(BAD_FPU));
+  switch (inst.operands[1].reg)
+    {
+    /* MVFR2 is only valid for Armv8-A.  */
+    case 5:
+      constraint (!ARM_CPU_HAS_FEATURE (cpu_variant, fpu_vfp_ext_armv8),
+		  _(BAD_FPU));
+      break;
+
+    /* Check for new Armv8.1-M Mainline changes to <spec_reg>.  */
+    case 1: /* fpscr.  */
+      constraint (!(ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext)
+		    || ARM_CPU_HAS_FEATURE (cpu_variant, fpu_vfp_ext_v1xd)),
+		  _(BAD_FPU));
+      break;
+
+    case 14: /* fpcxt_ns.  */
+    case 15: /* fpcxt_s.  */
+      constraint (!ARM_CPU_HAS_FEATURE (cpu_variant, arm_ext_v8_1m_main),
+		  _("selected processor does not support instruction"));
+      break;
+
+    case  2: /* fpscr_nzcvqc.  */
+    case 12: /* vpr.  */
+    case 13: /* p0.  */
+      constraint (!ARM_CPU_HAS_FEATURE (cpu_variant, arm_ext_v8_1m_main)
+		  || (!ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext)
+		      && !ARM_CPU_HAS_FEATURE (cpu_variant, fpu_vfp_ext_v1xd)),
+		  _("selected processor does not support instruction"));
+      if (inst.operands[0].reg != 2
+	  && !ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext))
+	as_warn (_("accessing MVE system register without MVE is UNPREDICTABLE"));
+      break;
+
+    default:
+      break;
+    }
 
   /* APSR_ sets isvec. All other refs to PC are illegal.  */
   if (!inst.operands[0].isvec && Rt == REG_PC)
@@ -9805,10 +10083,42 @@ do_vmsr (void)
       return;
     }
 
-  /* MVFR2 is only valid for ARMv8-A.  */
-  if (inst.operands[0].reg == 5)
-    constraint (!ARM_CPU_HAS_FEATURE (cpu_variant, fpu_vfp_ext_armv8),
-		_(BAD_FPU));
+  switch (inst.operands[0].reg)
+    {
+    /* MVFR2 is only valid for Armv8-A.  */
+    case 5:
+      constraint (!ARM_CPU_HAS_FEATURE (cpu_variant, fpu_vfp_ext_armv8),
+		  _(BAD_FPU));
+      break;
+
+    /* Check for new Armv8.1-M Mainline changes to <spec_reg>.  */
+    case  1: /* fpcr.  */
+      constraint (!(ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext)
+		    || ARM_CPU_HAS_FEATURE (cpu_variant, fpu_vfp_ext_v1xd)),
+		  _(BAD_FPU));
+      break;
+
+    case 14: /* fpcxt_ns.  */
+    case 15: /* fpcxt_s.  */
+      constraint (!ARM_CPU_HAS_FEATURE (cpu_variant, arm_ext_v8_1m_main),
+		  _("selected processor does not support instruction"));
+      break;
+
+    case  2: /* fpscr_nzcvqc.  */
+    case 12: /* vpr.  */
+    case 13: /* p0.  */
+      constraint (!ARM_CPU_HAS_FEATURE (cpu_variant, arm_ext_v8_1m_main)
+		  || (!ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext)
+		      && !ARM_CPU_HAS_FEATURE (cpu_variant, fpu_vfp_ext_v1xd)),
+		  _("selected processor does not support instruction"));
+      if (inst.operands[0].reg != 2
+	  && !ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext))
+	as_warn (_("accessing MVE system register without MVE is UNPREDICTABLE"));
+      break;
+
+    default:
+      break;
+    }
 
   /* If we get through parsing the register name, we just insert the number
      generated into the instruction without further validation.  */
@@ -10108,6 +10418,9 @@ do_shift (void)
 static void
 do_smc (void)
 {
+  unsigned int value = inst.relocs[0].exp.X_add_number;
+  constraint (value > 0xf, _("immediate too large (bigger than 0xF)"));
+
   inst.relocs[0].type = BFD_RELOC_ARM_SMC;
   inst.relocs[0].pc_rel = 0;
 }
@@ -11052,7 +11365,7 @@ encode_thumb32_addr_mode (int i, bfd_boolean is_t, bfd_boolean is_d)
     inst.error = _("instruction does not accept unindexed addressing");
 }
 
-/* Table of Thumb instructions which exist in both 16- and 32-bit
+/* Table of Thumb instructions which exist in 16- and/or 32-bit
    encodings (the latter only in post-V6T2 cores).  The index is the
    value used in the insns table below.  When there is more than one
    possible 16-bit encoding for the instruction, this table always
@@ -11081,16 +11394,27 @@ encode_thumb32_addr_mode (int i, bfd_boolean is_t, bfd_boolean is_d)
   X(_bflx,  0000, f070e001),			\
   X(_bic,   4380, ea200000),			\
   X(_bics,  4380, ea300000),			\
+  X(_cinc,  0000, ea509000),			\
+  X(_cinv,  0000, ea50a000),			\
   X(_cmn,   42c0, eb100f00),			\
   X(_cmp,   2800, ebb00f00),			\
+  X(_cneg,  0000, ea50b000),			\
   X(_cpsie, b660, f3af8400),			\
   X(_cpsid, b670, f3af8600),			\
   X(_cpy,   4600, ea4f0000),			\
+  X(_csel,  0000, ea508000),			\
+  X(_cset,  0000, ea5f900f),			\
+  X(_csetm, 0000, ea5fa00f),			\
+  X(_csinc, 0000, ea509000),			\
+  X(_csinv, 0000, ea50a000),			\
+  X(_csneg, 0000, ea50b000),			\
   X(_dec_sp,80dd, f1ad0d00),			\
   X(_dls,   0000, f040e001),			\
+  X(_dlstp, 0000, f000e001),			\
   X(_eor,   4040, ea800000),			\
   X(_eors,  4040, ea900000),			\
   X(_inc_sp,00dd, f10d0d00),			\
+  X(_lctp,  0000, f00fe001),			\
   X(_ldmia, c800, e8900000),			\
   X(_ldr,   6800, f8500000),			\
   X(_ldrb,  7800, f8100000),			\
@@ -11101,6 +11425,7 @@ encode_thumb32_addr_mode (int i, bfd_boolean is_t, bfd_boolean is_d)
   X(_ldr_pc2,4800, f85f0000),			\
   X(_ldr_sp,9800, f85d0000),			\
   X(_le,    0000, f00fc001),			\
+  X(_letp,  0000, f01fc001),			\
   X(_lsl,   0000, fa00f000),			\
   X(_lsls,  0000, fa10f000),			\
   X(_lsr,   0800, fa20f000),			\
@@ -11143,6 +11468,7 @@ encode_thumb32_addr_mode (int i, bfd_boolean is_t, bfd_boolean is_d)
   X(_wfe,   bf20, f3af8002),			\
   X(_wfi,   bf30, f3af8003),			\
   X(_wls,   0000, f040c001),			\
+  X(_wlstp, 0000, f000c001),			\
   X(_sev,   bf40, f3af8004),                    \
   X(_sevl,  bf50, f3af8005),			\
   X(_udf,   de00, f7f0a000)
@@ -11896,6 +12222,60 @@ do_t_clz (void)
   inst.instruction |= Rm;
 }
 
+/* For the Armv8.1-M conditional instructions.  */
+static void
+do_t_cond (void)
+{
+  unsigned Rd, Rn, Rm;
+  signed int cond;
+
+  constraint (inst.cond != COND_ALWAYS, BAD_COND);
+
+  Rd = inst.operands[0].reg;
+  switch (inst.instruction)
+    {
+      case T_MNEM_csinc:
+      case T_MNEM_csinv:
+      case T_MNEM_csneg:
+      case T_MNEM_csel:
+	Rn = inst.operands[1].reg;
+	Rm = inst.operands[2].reg;
+	cond = inst.operands[3].imm;
+	constraint (Rn == REG_SP, BAD_SP);
+	constraint (Rm == REG_SP, BAD_SP);
+	break;
+
+      case T_MNEM_cinc:
+      case T_MNEM_cinv:
+      case T_MNEM_cneg:
+	Rn = inst.operands[1].reg;
+	cond = inst.operands[2].imm;
+	/* Invert the last bit to invert the cond.  */
+	cond = TOGGLE_BIT (cond, 0);
+	constraint (Rn == REG_SP, BAD_SP);
+	Rm = Rn;
+	break;
+
+      case T_MNEM_csetm:
+      case T_MNEM_cset:
+	cond = inst.operands[1].imm;
+	/* Invert the last bit to invert the cond.  */
+	cond = TOGGLE_BIT (cond, 0);
+	Rn = REG_PC;
+	Rm = REG_PC;
+	break;
+
+      default: abort ();
+    }
+
+  set_pred_insn_type (OUTSIDE_PRED_INSN);
+  inst.instruction = THUMB_OP32 (inst.instruction);
+  inst.instruction |= Rd << 8;
+  inst.instruction |= Rn << 16;
+  inst.instruction |= Rm;
+  inst.instruction |= cond << 4;
+}
+
 static void
 do_t_csdb (void)
 {
@@ -13671,10 +14051,11 @@ do_t_smc (void)
 	      _("SMC is not permitted on this architecture"));
   constraint (inst.relocs[0].exp.X_op != O_constant,
 	      _("expression too complex"));
+  constraint (value > 0xf, _("immediate too large (bigger than 0xF)"));
+
   inst.relocs[0].type = BFD_RELOC_UNUSED;
-  inst.instruction |= (value & 0xf000) >> 12;
-  inst.instruction |= (value & 0x0ff0);
   inst.instruction |= (value & 0x000f) << 16;
+
   /* PR gas/15623: SMC instructions must be last in an IT block.  */
   set_pred_insn_type_last ();
 }
@@ -14068,35 +14449,52 @@ v8_1_loop_reloc (int is_le)
     }
 }
 
-/* To handle the Scalar Low Overhead Loop instructions
-   in Armv8.1-M Mainline.  */
+/* For shifts with four operands in MVE.  */
 static void
-do_t_loloop (void)
+do_mve_scalar_shift1 (void)
 {
-  unsigned long insn = inst.instruction;
+  unsigned int value = inst.operands[2].imm;
 
-  set_pred_insn_type (OUTSIDE_PRED_INSN);
-  inst.instruction = THUMB_OP32 (inst.instruction);
+  inst.instruction |= inst.operands[0].reg << 16;
+  inst.instruction |= inst.operands[1].reg << 8;
 
-  switch (insn)
-    {
-    case T_MNEM_le:
-      /* le <label>.  */
-      if (!inst.operands[0].present)
-	inst.instruction |= 1 << 21;
+  /* Setting the bit for saturation.  */
+  inst.instruction |= ((value == 64) ? 0: 1) << 7;
 
-      v8_1_loop_reloc (TRUE);
-      break;
+  /* Assuming Rm is already checked not to be 11x1.  */
+  constraint (inst.operands[3].reg == inst.operands[0].reg, BAD_OVERLAP);
+  constraint (inst.operands[3].reg == inst.operands[1].reg, BAD_OVERLAP);
+  inst.instruction |= inst.operands[3].reg << 12;
+}
 
-    case T_MNEM_wls:
-      v8_1_loop_reloc (FALSE);
-      /* Fall through.  */
-    case T_MNEM_dls:
-      constraint (inst.operands[1].isreg != 1, BAD_ARGS);
-      inst.instruction |= (inst.operands[1].reg << 16);
-      break;
+/* For shifts in MVE.  */
+static void
+do_mve_scalar_shift (void)
+{
+  if (!inst.operands[2].present)
+    {
+      inst.operands[2] = inst.operands[1];
+      inst.operands[1].reg = 0xf;
+    }
+
+  inst.instruction |= inst.operands[0].reg << 16;
+  inst.instruction |= inst.operands[1].reg << 8;
 
-    default: abort();
+  if (inst.operands[2].isreg)
+    {
+      /* Assuming Rm is already checked not to be 11x1.  */
+      constraint (inst.operands[2].reg == inst.operands[0].reg, BAD_OVERLAP);
+      constraint (inst.operands[2].reg == inst.operands[1].reg, BAD_OVERLAP);
+      inst.instruction |= inst.operands[2].reg << 12;
+    }
+  else
+    {
+      /* Assuming imm is already checked as [1,32].  */
+      unsigned int value = inst.operands[2].imm;
+      inst.instruction |= (value & 0x1c) << 10;
+      inst.instruction |= (value & 0x03) << 6;
+      /* Change last 4 bits from 0xd to 0xf.  */
+      inst.instruction |= 0x2;
     }
 }
 
@@ -14112,6 +14510,7 @@ do_t_loloop (void)
 #define M_MNEM_vmlsdavax  0xeef01e21
 #define M_MNEM_vmullt	0xee011e00
 #define M_MNEM_vmullb	0xee010e00
+#define M_MNEM_vctp	0xf000e801
 #define M_MNEM_vst20	0xfc801e00
 #define M_MNEM_vst21	0xfc801e20
 #define M_MNEM_vst40	0xfc801e01
@@ -14136,6 +14535,55 @@ do_t_loloop (void)
 #define M_MNEM_vmovlb	0xeea00f40
 #define M_MNEM_vmovnt	0xfe311e81
 #define M_MNEM_vmovnb	0xfe310e81
+#define M_MNEM_vadc	0xee300f00
+#define M_MNEM_vadci	0xee301f00
+#define M_MNEM_vbrsr	0xfe011e60
+#define M_MNEM_vaddlv	0xee890f00
+#define M_MNEM_vaddlva	0xee890f20
+#define M_MNEM_vaddv	0xeef10f00
+#define M_MNEM_vaddva	0xeef10f20
+#define M_MNEM_vddup	0xee011f6e
+#define M_MNEM_vdwdup	0xee011f60
+#define M_MNEM_vidup	0xee010f6e
+#define M_MNEM_viwdup	0xee010f60
+#define M_MNEM_vmaxv	0xeee20f00
+#define M_MNEM_vmaxav	0xeee00f00
+#define M_MNEM_vminv	0xeee20f80
+#define M_MNEM_vminav	0xeee00f80
+#define M_MNEM_vmlaldav	  0xee800e00
+#define M_MNEM_vmlaldava  0xee800e20
+#define M_MNEM_vmlaldavx  0xee801e00
+#define M_MNEM_vmlaldavax 0xee801e20
+#define M_MNEM_vmlsldav	  0xee800e01
+#define M_MNEM_vmlsldava  0xee800e21
+#define M_MNEM_vmlsldavx  0xee801e01
+#define M_MNEM_vmlsldavax 0xee801e21
+#define M_MNEM_vrmlaldavhx  0xee801f00
+#define M_MNEM_vrmlaldavhax 0xee801f20
+#define M_MNEM_vrmlsldavh   0xfe800e01
+#define M_MNEM_vrmlsldavha  0xfe800e21
+#define M_MNEM_vrmlsldavhx  0xfe801e01
+#define M_MNEM_vrmlsldavhax 0xfe801e21
+#define M_MNEM_vqmovnt	  0xee331e01
+#define M_MNEM_vqmovnb	  0xee330e01
+#define M_MNEM_vqmovunt	  0xee311e81
+#define M_MNEM_vqmovunb	  0xee310e81
+#define M_MNEM_vshrnt	    0xee801fc1
+#define M_MNEM_vshrnb	    0xee800fc1
+#define M_MNEM_vrshrnt	    0xfe801fc1
+#define M_MNEM_vqshrnt	    0xee801f40
+#define M_MNEM_vqshrnb	    0xee800f40
+#define M_MNEM_vqshrunt	    0xee801fc0
+#define M_MNEM_vqshrunb	    0xee800fc0
+#define M_MNEM_vrshrnb	    0xfe800fc1
+#define M_MNEM_vqrshrnt	    0xee801f41
+#define M_MNEM_vqrshrnb	    0xee800f41
+#define M_MNEM_vqrshrunt    0xfe801fc0
+#define M_MNEM_vqrshrunb    0xfe800fc0
+
+/* Bfloat16 instruction encoder helpers.  */
+#define B_MNEM_vfmat 0xfc300850
+#define B_MNEM_vfmab 0xfc300810
 
 /* Neon instruction encoder helpers.  */
 
@@ -14300,8 +14748,11 @@ NEON_ENC_TAB
      - a table used to drive neon_select_shape.  */
 
 #define NEON_SHAPE_DEF			\
+  X(4, (R, R, Q, Q), QUAD),		\
+  X(4, (Q, R, R, I), QUAD),		\
   X(4, (R, R, S, S), QUAD),		\
   X(4, (S, S, R, R), QUAD),		\
+  X(3, (Q, R, I), QUAD),		\
   X(3, (I, Q, Q), QUAD),		\
   X(3, (I, Q, R), QUAD),		\
   X(3, (R, Q, Q), QUAD),		\
@@ -14312,6 +14763,8 @@ NEON_ENC_TAB
   X(3, (D, D, S), DOUBLE),		\
   X(3, (Q, Q, S), QUAD),		\
   X(3, (Q, Q, R), QUAD),		\
+  X(3, (R, R, Q), QUAD),		\
+  X(2, (R, Q),	  QUAD),		\
   X(2, (D, D), DOUBLE),			\
   X(2, (Q, Q), QUAD),			\
   X(2, (D, S), DOUBLE),			\
@@ -14348,6 +14801,8 @@ NEON_ENC_TAB
   X(2, (R, S), SINGLE),			\
   X(2, (F, R), SINGLE),			\
   X(2, (R, F), SINGLE),			\
+/* Used for MVE tail predicated loop instructions.  */\
+  X(2, (R, R), QUAD),			\
 /* Half float shape supported so far.  */\
   X (2, (H, D), MIXED),			\
   X (2, (D, H), MIXED),			\
@@ -14475,6 +14930,7 @@ enum neon_type_mask
   N_F32  = 0x0080000,
   N_F64  = 0x0100000,
   N_P64	 = 0x0200000,
+  N_BF16 = 0x0400000,
   N_KEY  = 0x1000000, /* Key element (main type specifier).  */
   N_EQK  = 0x2000000, /* Given operand has the same type & size as the key.  */
   N_VFP  = 0x4000000, /* VFP mode: operand size must match register width.  */
@@ -14773,6 +15229,10 @@ type_chk_of_el_type (enum neon_el_type type, unsigned size)
 	}
       break;
 
+    case NT_bfloat:
+      if (size == 16) return N_BF16;
+      break;
+
     default: ;
     }
 
@@ -14791,7 +15251,8 @@ el_type_of_type_chk (enum neon_el_type *type, unsigned *size,
 
   if ((mask & (N_S8 | N_U8 | N_I8 | N_8 | N_P8)) != 0)
     *size = 8;
-  else if ((mask & (N_S16 | N_U16 | N_I16 | N_16 | N_F16 | N_P16)) != 0)
+  else if ((mask & (N_S16 | N_U16 | N_I16 | N_16 | N_F16 | N_P16 | N_BF16))
+	   != 0)
     *size = 16;
   else if ((mask & (N_S32 | N_U32 | N_I32 | N_32 | N_F32)) != 0)
     *size = 32;
@@ -14812,6 +15273,8 @@ el_type_of_type_chk (enum neon_el_type *type, unsigned *size,
     *type = NT_poly;
   else if ((mask & (N_F_ALL)) != 0)
     *type = NT_float;
+  else if ((mask & (N_BF16)) != 0)
+    *type = NT_bfloat;
   else
     return FAIL;
 
@@ -15429,6 +15892,45 @@ mve_get_vcmp_vpt_cond (struct neon_type_el et)
   abort ();
 }
 
+/* For VCTP (create vector tail predicate) in MVE.  */
+static void
+do_mve_vctp (void)
+{
+  int dt = 0;
+  unsigned size = 0x0;
+
+  if (inst.cond > COND_ALWAYS)
+    inst.pred_insn_type = INSIDE_VPT_INSN;
+  else
+    inst.pred_insn_type = MVE_OUTSIDE_PRED_INSN;
+
+  /* This is a typical MVE instruction which has no type but have size 8, 16,
+     32 and 64.  For instructions with no type, inst.vectype.el[j].type is set
+     to NT_untyped and size is updated in inst.vectype.el[j].size.  */
+  if ((inst.operands[0].present) && (inst.vectype.el[0].type == NT_untyped))
+    dt = inst.vectype.el[0].size;
+
+  /* Setting this does not indicate an actual NEON instruction, but only
+     indicates that the mnemonic accepts neon-style type suffixes.  */
+  inst.is_neon = 1;
+
+  switch (dt)
+    {
+      case 8:
+	break;
+      case 16:
+	size = 0x1; break;
+      case 32:
+	size = 0x2; break;
+      case 64:
+	size = 0x3; break;
+      default:
+	first_error (_("Type is not allowed for this instruction"));
+    }
+  inst.instruction |= size << 20;
+  inst.instruction |= inst.operands[0].reg << 16;
+}
+
 static void
 do_mve_vpt (void)
 {
@@ -15560,20 +16062,410 @@ do_mve_vcmp (void)
 }
 
 static void
-do_vfp_nsyn_cmp (void)
+do_mve_vmaxa_vmina (void)
 {
-  enum neon_shape rs;
-  if (!inst.operands[0].isreg)
-    {
-      do_mve_vcmp ();
-      return;
-    }
+  if (inst.cond > COND_ALWAYS)
+    inst.pred_insn_type = INSIDE_VPT_INSN;
   else
-    {
-      constraint (inst.operands[2].present, BAD_SYNTAX);
-      constraint (!ARM_CPU_HAS_FEATURE (cpu_variant, fpu_vfp_ext_v1xd),
-		  BAD_FPU);
-    }
+    inst.pred_insn_type = MVE_OUTSIDE_PRED_INSN;
+
+  enum neon_shape rs = neon_select_shape (NS_QQ, NS_NULL);
+  struct neon_type_el et
+    = neon_check_type (2, rs, N_EQK, N_KEY | N_S8 | N_S16 | N_S32);
+
+  inst.instruction |= HI1 (inst.operands[0].reg) << 22;
+  inst.instruction |= neon_logbits (et.size) << 18;
+  inst.instruction |= LOW4 (inst.operands[0].reg) << 12;
+  inst.instruction |= HI1 (inst.operands[1].reg) << 5;
+  inst.instruction |= LOW4 (inst.operands[1].reg);
+  inst.is_neon = 1;
+}
+
+static void
+do_mve_vfmas (void)
+{
+  enum neon_shape rs = neon_select_shape (NS_QQR, NS_NULL);
+  struct neon_type_el et
+    = neon_check_type (3, rs, N_F_MVE | N_KEY, N_EQK, N_EQK);
+
+  if (inst.cond > COND_ALWAYS)
+    inst.pred_insn_type = INSIDE_VPT_INSN;
+  else
+    inst.pred_insn_type = MVE_OUTSIDE_PRED_INSN;
+
+  if (inst.operands[2].reg == REG_SP)
+    as_tsktsk (MVE_BAD_SP);
+  else if (inst.operands[2].reg == REG_PC)
+    as_tsktsk (MVE_BAD_PC);
+
+  inst.instruction |= (et.size == 16) << 28;
+  inst.instruction |= HI1 (inst.operands[0].reg) << 22;
+  inst.instruction |= LOW4 (inst.operands[1].reg) << 16;
+  inst.instruction |= LOW4 (inst.operands[0].reg) << 12;
+  inst.instruction |= HI1 (inst.operands[1].reg) << 7;
+  inst.instruction |= inst.operands[2].reg;
+  inst.is_neon = 1;
+}
+
+static void
+do_mve_viddup (void)
+{
+  if (inst.cond > COND_ALWAYS)
+    inst.pred_insn_type = INSIDE_VPT_INSN;
+  else
+    inst.pred_insn_type = MVE_OUTSIDE_PRED_INSN;
+
+  unsigned imm = inst.relocs[0].exp.X_add_number;
+  constraint (imm != 1 && imm != 2 && imm != 4 && imm != 8,
+	      _("immediate must be either 1, 2, 4 or 8"));
+
+  enum neon_shape rs;
+  struct neon_type_el et;
+  unsigned Rm;
+  if (inst.instruction == M_MNEM_vddup || inst.instruction == M_MNEM_vidup)
+    {
+      rs = neon_select_shape (NS_QRI, NS_NULL);
+      et = neon_check_type (2, rs, N_KEY | N_U8 | N_U16 | N_U32, N_EQK);
+      Rm = 7;
+    }
+  else
+    {
+      constraint ((inst.operands[2].reg % 2) != 1, BAD_EVEN);
+      if (inst.operands[2].reg == REG_SP)
+	as_tsktsk (MVE_BAD_SP);
+      else if (inst.operands[2].reg == REG_PC)
+	first_error (BAD_PC);
+
+      rs = neon_select_shape (NS_QRRI, NS_NULL);
+      et = neon_check_type (3, rs, N_KEY | N_U8 | N_U16 | N_U32, N_EQK, N_EQK);
+      Rm = inst.operands[2].reg >> 1;
+    }
+  inst.instruction |= HI1 (inst.operands[0].reg) << 22;
+  inst.instruction |= neon_logbits (et.size) << 20;
+  inst.instruction |= inst.operands[1].reg << 16;
+  inst.instruction |= LOW4 (inst.operands[0].reg) << 12;
+  inst.instruction |= (imm > 2) << 7;
+  inst.instruction |= Rm << 1;
+  inst.instruction |= (imm == 2 || imm == 8);
+  inst.is_neon = 1;
+}
+
+static void
+do_mve_vmlas (void)
+{
+  enum neon_shape rs = neon_select_shape (NS_QQR, NS_NULL);
+  struct neon_type_el et
+    = neon_check_type (3, rs, N_EQK, N_EQK, N_SU_MVE | N_KEY);
+
+  if (inst.operands[2].reg == REG_PC)
+    as_tsktsk (MVE_BAD_PC);
+  else if (inst.operands[2].reg == REG_SP)
+    as_tsktsk (MVE_BAD_SP);
+
+  if (inst.cond > COND_ALWAYS)
+    inst.pred_insn_type = INSIDE_VPT_INSN;
+  else
+    inst.pred_insn_type = MVE_OUTSIDE_PRED_INSN;
+
+  inst.instruction |= (et.type == NT_unsigned) << 28;
+  inst.instruction |= HI1 (inst.operands[0].reg) << 22;
+  inst.instruction |= neon_logbits (et.size) << 20;
+  inst.instruction |= LOW4 (inst.operands[1].reg) << 16;
+  inst.instruction |= LOW4 (inst.operands[0].reg) << 12;
+  inst.instruction |= HI1 (inst.operands[1].reg) << 7;
+  inst.instruction |= inst.operands[2].reg;
+  inst.is_neon = 1;
+}
+
+static void
+do_mve_vshll (void)
+{
+  struct neon_type_el et
+    = neon_check_type (2, NS_QQI, N_EQK, N_S8 | N_U8 | N_S16 | N_U16 | N_KEY);
+
+  if (inst.cond > COND_ALWAYS)
+    inst.pred_insn_type = INSIDE_VPT_INSN;
+  else
+    inst.pred_insn_type = MVE_OUTSIDE_PRED_INSN;
+
+  int imm = inst.operands[2].imm;
+  constraint (imm < 1 || (unsigned)imm > et.size,
+	      _("immediate value out of range"));
+
+  if ((unsigned)imm == et.size)
+    {
+      inst.instruction |= neon_logbits (et.size) << 18;
+      inst.instruction |= 0x110001;
+    }
+  else
+    {
+      inst.instruction |= (et.size + imm) << 16;
+      inst.instruction |= 0x800140;
+    }
+
+  inst.instruction |= (et.type == NT_unsigned) << 28;
+  inst.instruction |= HI1 (inst.operands[0].reg) << 22;
+  inst.instruction |= LOW4 (inst.operands[0].reg) << 12;
+  inst.instruction |= HI1 (inst.operands[1].reg) << 5;
+  inst.instruction |= LOW4 (inst.operands[1].reg);
+  inst.is_neon = 1;
+}
+
+static void
+do_mve_vshlc (void)
+{
+  if (inst.cond > COND_ALWAYS)
+    inst.pred_insn_type = INSIDE_VPT_INSN;
+  else
+    inst.pred_insn_type = MVE_OUTSIDE_PRED_INSN;
+
+  if (inst.operands[1].reg == REG_PC)
+    as_tsktsk (MVE_BAD_PC);
+  else if (inst.operands[1].reg == REG_SP)
+    as_tsktsk (MVE_BAD_SP);
+
+  int imm = inst.operands[2].imm;
+  constraint (imm < 1 || imm > 32, _("immediate value out of range"));
+
+  inst.instruction |= HI1 (inst.operands[0].reg) << 22;
+  inst.instruction |= (imm & 0x1f) << 16;
+  inst.instruction |= LOW4 (inst.operands[0].reg) << 12;
+  inst.instruction |= inst.operands[1].reg;
+  inst.is_neon = 1;
+}
+
+static void
+do_mve_vshrn (void)
+{
+  unsigned types;
+  switch (inst.instruction)
+    {
+    case M_MNEM_vshrnt:
+    case M_MNEM_vshrnb:
+    case M_MNEM_vrshrnt:
+    case M_MNEM_vrshrnb:
+      types = N_I16 | N_I32;
+      break;
+    case M_MNEM_vqshrnt:
+    case M_MNEM_vqshrnb:
+    case M_MNEM_vqrshrnt:
+    case M_MNEM_vqrshrnb:
+      types = N_U16 | N_U32 | N_S16 | N_S32;
+      break;
+    case M_MNEM_vqshrunt:
+    case M_MNEM_vqshrunb:
+    case M_MNEM_vqrshrunt:
+    case M_MNEM_vqrshrunb:
+      types = N_S16 | N_S32;
+      break;
+    default:
+      abort ();
+    }
+
+  struct neon_type_el et = neon_check_type (2, NS_QQI, N_EQK, types | N_KEY);
+
+  if (inst.cond > COND_ALWAYS)
+    inst.pred_insn_type = INSIDE_VPT_INSN;
+  else
+    inst.pred_insn_type = MVE_OUTSIDE_PRED_INSN;
+
+  unsigned Qd = inst.operands[0].reg;
+  unsigned Qm = inst.operands[1].reg;
+  unsigned imm = inst.operands[2].imm;
+  constraint (imm < 1 || ((unsigned) imm) > (et.size / 2),
+	      et.size == 16
+	      ? _("immediate operand expected in the range [1,8]")
+	      : _("immediate operand expected in the range [1,16]"));
+
+  inst.instruction |= (et.type == NT_unsigned) << 28;
+  inst.instruction |= HI1 (Qd) << 22;
+  inst.instruction |= (et.size - imm) << 16;
+  inst.instruction |= LOW4 (Qd) << 12;
+  inst.instruction |= HI1 (Qm) << 5;
+  inst.instruction |= LOW4 (Qm);
+  inst.is_neon = 1;
+}
+
+static void
+do_mve_vqmovn (void)
+{
+  struct neon_type_el et;
+  if (inst.instruction == M_MNEM_vqmovnt
+     || inst.instruction == M_MNEM_vqmovnb)
+    et = neon_check_type (2, NS_QQ, N_EQK,
+			  N_U16 | N_U32 | N_S16 | N_S32 | N_KEY);
+  else
+    et = neon_check_type (2, NS_QQ, N_EQK, N_S16 | N_S32 | N_KEY);
+
+  if (inst.cond > COND_ALWAYS)
+    inst.pred_insn_type = INSIDE_VPT_INSN;
+  else
+    inst.pred_insn_type = MVE_OUTSIDE_PRED_INSN;
+
+  inst.instruction |= (et.type == NT_unsigned) << 28;
+  inst.instruction |= HI1 (inst.operands[0].reg) << 22;
+  inst.instruction |= (et.size == 32) << 18;
+  inst.instruction |= LOW4 (inst.operands[0].reg) << 12;
+  inst.instruction |= HI1 (inst.operands[1].reg) << 5;
+  inst.instruction |= LOW4 (inst.operands[1].reg);
+  inst.is_neon = 1;
+}
+
+static void
+do_mve_vpsel (void)
+{
+  neon_select_shape (NS_QQQ, NS_NULL);
+
+  if (inst.cond > COND_ALWAYS)
+    inst.pred_insn_type = INSIDE_VPT_INSN;
+  else
+    inst.pred_insn_type = MVE_OUTSIDE_PRED_INSN;
+
+  inst.instruction |= HI1 (inst.operands[0].reg) << 22;
+  inst.instruction |= LOW4 (inst.operands[1].reg) << 16;
+  inst.instruction |= LOW4 (inst.operands[0].reg) << 12;
+  inst.instruction |= HI1 (inst.operands[1].reg) << 7;
+  inst.instruction |= HI1 (inst.operands[2].reg) << 5;
+  inst.instruction |= LOW4 (inst.operands[2].reg);
+  inst.is_neon = 1;
+}
+
+static void
+do_mve_vpnot (void)
+{
+  if (inst.cond > COND_ALWAYS)
+    inst.pred_insn_type = INSIDE_VPT_INSN;
+  else
+    inst.pred_insn_type = MVE_OUTSIDE_PRED_INSN;
+}
+
+static void
+do_mve_vmaxnma_vminnma (void)
+{
+  enum neon_shape rs = neon_select_shape (NS_QQ, NS_NULL);
+  struct neon_type_el et
+    = neon_check_type (2, rs, N_EQK, N_F_MVE | N_KEY);
+
+  if (inst.cond > COND_ALWAYS)
+    inst.pred_insn_type = INSIDE_VPT_INSN;
+  else
+    inst.pred_insn_type = MVE_OUTSIDE_PRED_INSN;
+
+  inst.instruction |= (et.size == 16) << 28;
+  inst.instruction |= HI1 (inst.operands[0].reg) << 22;
+  inst.instruction |= LOW4 (inst.operands[0].reg) << 12;
+  inst.instruction |= HI1 (inst.operands[1].reg) << 5;
+  inst.instruction |= LOW4 (inst.operands[1].reg);
+  inst.is_neon = 1;
+}
+
+static void
+do_mve_vcmul (void)
+{
+  enum neon_shape rs = neon_select_shape (NS_QQQI, NS_NULL);
+  struct neon_type_el et
+    = neon_check_type (3, rs, N_EQK, N_EQK, N_F_MVE | N_KEY);
+
+  if (inst.cond > COND_ALWAYS)
+    inst.pred_insn_type = INSIDE_VPT_INSN;
+  else
+    inst.pred_insn_type = MVE_OUTSIDE_PRED_INSN;
+
+  unsigned rot = inst.relocs[0].exp.X_add_number;
+  constraint (rot != 0 && rot != 90 && rot != 180 && rot != 270,
+	      _("immediate out of range"));
+
+  if (et.size == 32 && (inst.operands[0].reg == inst.operands[1].reg
+			|| inst.operands[0].reg == inst.operands[2].reg))
+    as_tsktsk (BAD_MVE_SRCDEST);
+
+  inst.instruction |= (et.size == 32) << 28;
+  inst.instruction |= HI1 (inst.operands[0].reg) << 22;
+  inst.instruction |= LOW4 (inst.operands[1].reg) << 16;
+  inst.instruction |= LOW4 (inst.operands[0].reg) << 12;
+  inst.instruction |= (rot > 90) << 12;
+  inst.instruction |= HI1 (inst.operands[1].reg) << 7;
+  inst.instruction |= HI1 (inst.operands[2].reg) << 5;
+  inst.instruction |= LOW4 (inst.operands[2].reg);
+  inst.instruction |= (rot == 90 || rot == 270);
+  inst.is_neon = 1;
+}
+
+/* To handle the Low Overhead Loop instructions
+   in Armv8.1-M Mainline and MVE.  */
+static void
+do_t_loloop (void)
+{
+  unsigned long insn = inst.instruction;
+
+  inst.instruction = THUMB_OP32 (inst.instruction);
+
+  if (insn == T_MNEM_lctp)
+    return;
+
+  set_pred_insn_type (MVE_OUTSIDE_PRED_INSN);
+
+  if (insn == T_MNEM_wlstp || insn == T_MNEM_dlstp)
+    {
+      struct neon_type_el et
+       = neon_check_type (2, NS_RR, N_EQK, N_8 | N_16 | N_32 | N_64 | N_KEY);
+      inst.instruction |= neon_logbits (et.size) << 20;
+      inst.is_neon = 1;
+    }
+
+  switch (insn)
+    {
+    case T_MNEM_letp:
+      constraint (!inst.operands[0].present,
+		  _("expected LR"));
+      /* fall through.  */
+    case T_MNEM_le:
+      /* le <label>.  */
+      if (!inst.operands[0].present)
+       inst.instruction |= 1 << 21;
+
+      v8_1_loop_reloc (TRUE);
+      break;
+
+    case T_MNEM_wls:
+    case T_MNEM_wlstp:
+      v8_1_loop_reloc (FALSE);
+      /* fall through.  */
+    case T_MNEM_dlstp:
+    case T_MNEM_dls:
+      constraint (inst.operands[1].isreg != 1, BAD_ARGS);
+
+      if (insn == T_MNEM_wlstp || insn == T_MNEM_dlstp)
+       constraint (inst.operands[1].reg == REG_PC, BAD_PC);
+      else if (inst.operands[1].reg == REG_PC)
+       as_tsktsk (MVE_BAD_PC);
+      if (inst.operands[1].reg == REG_SP)
+       as_tsktsk (MVE_BAD_SP);
+
+      inst.instruction |= (inst.operands[1].reg << 16);
+      break;
+
+    default:
+      abort ();
+    }
+}
+
+
+static void
+do_vfp_nsyn_cmp (void)
+{
+  enum neon_shape rs;
+  if (!inst.operands[0].isreg)
+    {
+      do_mve_vcmp ();
+      return;
+    }
+  else
+    {
+      constraint (inst.operands[2].present, BAD_SYNTAX);
+      constraint (!ARM_CPU_HAS_FEATURE (cpu_variant, fpu_vfp_ext_v1xd),
+		  BAD_FPU);
+    }
 
   if (inst.operands[1].isreg)
     {
@@ -15693,7 +16585,7 @@ neon_dp_fixup (struct arm_it* insn)
 }
 
 static void
-mve_encode_qqr (int size, int fp)
+mve_encode_qqr (int size, int U, int fp)
 {
   if (inst.operands[2].reg == REG_SP)
     as_tsktsk (MVE_BAD_SP);
@@ -15708,6 +16600,9 @@ mve_encode_qqr (int size, int fp)
       /* vsub.  */
       else if (((unsigned)inst.instruction) == 0x200d00)
 	inst.instruction = 0xee301f40;
+      /* vmul.  */
+      else if (((unsigned)inst.instruction) == 0x1000d10)
+	inst.instruction = 0xee310e60;
 
       /* Setting size which is 1 for F16 and 0 for F32.  */
       inst.instruction |= (size == 16) << 28;
@@ -15720,6 +16615,37 @@ mve_encode_qqr (int size, int fp)
       /* vsub.  */
       else if (((unsigned)inst.instruction) == 0x1000800)
 	inst.instruction = 0xee011f40;
+      /* vhadd.  */
+      else if (((unsigned)inst.instruction) == 0)
+	inst.instruction = 0xee000f40;
+      /* vhsub.  */
+      else if (((unsigned)inst.instruction) == 0x200)
+	inst.instruction = 0xee001f40;
+      /* vmla.  */
+      else if (((unsigned)inst.instruction) == 0x900)
+	inst.instruction = 0xee010e40;
+      /* vmul.  */
+      else if (((unsigned)inst.instruction) == 0x910)
+	inst.instruction = 0xee011e60;
+      /* vqadd.  */
+      else if (((unsigned)inst.instruction) == 0x10)
+	inst.instruction = 0xee000f60;
+      /* vqsub.  */
+      else if (((unsigned)inst.instruction) == 0x210)
+	inst.instruction = 0xee001f60;
+      /* vqrdmlah.  */
+      else if (((unsigned)inst.instruction) == 0x3000b10)
+	inst.instruction = 0xee000e40;
+      /* vqdmulh.  */
+      else if (((unsigned)inst.instruction) == 0x0000b00)
+	inst.instruction = 0xee010e60;
+      /* vqrdmulh.  */
+      else if (((unsigned)inst.instruction) == 0x1000b00)
+	inst.instruction = 0xfe010e60;
+
+      /* Set U-bit.  */
+      inst.instruction |= U << 28;
+
       /* Setting bits for size.  */
       inst.instruction |= neon_logbits (size) << 20;
     }
@@ -15760,17 +16686,34 @@ mve_encode_qqq (int ubit, int size)
   inst.is_neon = 1;
 }
 
+static void
+mve_encode_rq (unsigned bit28, unsigned size)
+{
+  inst.instruction |= bit28 << 28;
+  inst.instruction |= neon_logbits (size) << 18;
+  inst.instruction |= inst.operands[0].reg << 12;
+  inst.instruction |= LOW4 (inst.operands[1].reg);
+  inst.is_neon = 1;
+}
 
-/* Encode insns with bit pattern:
-
-  |28/24|23|22 |21 20|19 16|15 12|11    8|7|6|5|4|3  0|
-  |  U  |x |D  |size | Rn  | Rd  |x x x x|N|Q|M|x| Rm |
+static void
+mve_encode_rrqq (unsigned U, unsigned size)
+{
+  constraint (inst.operands[3].reg > 14, MVE_BAD_QREG);
 
-  SIZE is passed in bits. -1 means size field isn't changed, in case it has a
-  different meaning for some instruction.  */
+  inst.instruction |= U << 28;
+  inst.instruction |= (inst.operands[1].reg >> 1) << 20;
+  inst.instruction |= LOW4 (inst.operands[2].reg) << 16;
+  inst.instruction |= (size == 32) << 16;
+  inst.instruction |= inst.operands[0].reg << 12;
+  inst.instruction |= HI1 (inst.operands[2].reg) << 7;
+  inst.instruction |= inst.operands[3].reg;
+  inst.is_neon = 1;
+}
 
+/* Helper function for neon_three_same handling the operands.  */
 static void
-neon_three_same (int isquad, int ubit, int size)
+neon_three_args (int isquad)
 {
   inst.instruction |= LOW4 (inst.operands[0].reg) << 12;
   inst.instruction |= HI1 (inst.operands[0].reg) << 22;
@@ -15779,6 +16722,21 @@ neon_three_same (int isquad, int ubit, int size)
   inst.instruction |= LOW4 (inst.operands[2].reg);
   inst.instruction |= HI1 (inst.operands[2].reg) << 5;
   inst.instruction |= (isquad != 0) << 6;
+  inst.is_neon = 1;
+}
+
+/* Encode insns with bit pattern:
+
+  |28/24|23|22 |21 20|19 16|15 12|11    8|7|6|5|4|3  0|
+  |  U  |x |D  |size | Rn  | Rd  |x x x x|N|Q|M|x| Rm |
+
+  SIZE is passed in bits. -1 means size field isn't changed, in case it has a
+  different meaning for some instruction.  */
+
+static void
+neon_three_same (int isquad, int ubit, int size)
+{
+  neon_three_args (isquad);
   inst.instruction |= (ubit != 0) << 24;
   if (size != -1)
     inst.instruction |= neon_logbits (size) << 20;
@@ -15809,24 +16767,141 @@ neon_two_same (int qbit, int ubit, int size)
   neon_dp_fixup (&inst);
 }
 
+enum vfp_or_neon_is_neon_bits
+{
+NEON_CHECK_CC = 1,
+NEON_CHECK_ARCH = 2,
+NEON_CHECK_ARCH8 = 4
+};
+
+/* Call this function if an instruction which may have belonged to the VFP or
+ Neon instruction sets, but turned out to be a Neon instruction (due to the
+ operand types involved, etc.). We have to check and/or fix-up a couple of
+ things:
+
+   - Make sure the user hasn't attempted to make a Neon instruction
+     conditional.
+   - Alter the value in the condition code field if necessary.
+   - Make sure that the arch supports Neon instructions.
+
+ Which of these operations take place depends on bits from enum
+ vfp_or_neon_is_neon_bits.
+
+ WARNING: This function has side effects! If NEON_CHECK_CC is used and the
+ current instruction's condition is COND_ALWAYS, the condition field is
+ changed to inst.uncond_value.  This is necessary because instructions shared
+ between VFP and Neon may be conditional for the VFP variants only, and the
+ unconditional Neon version must have, e.g., 0xF in the condition field.  */
+
+static int
+vfp_or_neon_is_neon (unsigned check)
+{
+/* Conditions are always legal in Thumb mode (IT blocks).  */
+if (!thumb_mode && (check & NEON_CHECK_CC))
+  {
+    if (inst.cond != COND_ALWAYS)
+      {
+	first_error (_(BAD_COND));
+	return FAIL;
+      }
+    if (inst.uncond_value != -1)
+      inst.instruction |= inst.uncond_value << 28;
+  }
+
+
+  if (((check & NEON_CHECK_ARCH) && !mark_feature_used (&fpu_neon_ext_v1))
+      || ((check & NEON_CHECK_ARCH8)
+	  && !mark_feature_used (&fpu_neon_ext_armv8)))
+    {
+      first_error (_(BAD_FPU));
+      return FAIL;
+    }
+
+return SUCCESS;
+}
+
+
+/* Return TRUE if the SIMD instruction is available for the current
+   cpu_variant.  FP is set to TRUE if this is a SIMD floating-point
+   instruction.  CHECK contains th.  CHECK contains the set of bits to pass to
+   vfp_or_neon_is_neon for the NEON specific checks.  */
+
+static bfd_boolean
+check_simd_pred_availability (int fp, unsigned check)
+{
+if (inst.cond > COND_ALWAYS)
+  {
+    if (!ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext))
+      {
+	inst.error = BAD_FPU;
+	return FALSE;
+      }
+    inst.pred_insn_type = INSIDE_VPT_INSN;
+  }
+else if (inst.cond < COND_ALWAYS)
+  {
+    if (ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext))
+      inst.pred_insn_type = MVE_OUTSIDE_PRED_INSN;
+    else if (vfp_or_neon_is_neon (check) == FAIL)
+      return FALSE;
+  }
+else
+  {
+    if (!ARM_CPU_HAS_FEATURE (cpu_variant, fp ? mve_fp_ext : mve_ext)
+	&& vfp_or_neon_is_neon (check) == FAIL)
+      return FALSE;
+
+    if (ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext))
+      inst.pred_insn_type = MVE_OUTSIDE_PRED_INSN;
+  }
+return TRUE;
+}
+
 /* Neon instruction encoders, in approximate order of appearance.  */
 
 static void
 do_neon_dyadic_i_su (void)
 {
-  enum neon_shape rs = neon_select_shape (NS_DDD, NS_QQQ, NS_NULL);
-  struct neon_type_el et = neon_check_type (3, rs,
-    N_EQK, N_EQK, N_SU_32 | N_KEY);
-  neon_three_same (neon_quad (rs), et.type == NT_unsigned, et.size);
+  if (!check_simd_pred_availability (FALSE, NEON_CHECK_ARCH | NEON_CHECK_CC))
+   return;
+
+  enum neon_shape rs;
+  struct neon_type_el et;
+  if (ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext))
+    rs = neon_select_shape (NS_QQQ, NS_QQR, NS_NULL);
+  else
+    rs = neon_select_shape (NS_DDD, NS_QQQ, NS_NULL);
+
+  et = neon_check_type (3, rs, N_EQK, N_EQK, N_SU_32 | N_KEY);
+
+
+  if (rs != NS_QQR)
+    neon_three_same (neon_quad (rs), et.type == NT_unsigned, et.size);
+  else
+    mve_encode_qqr (et.size, et.type == NT_unsigned, 0);
 }
 
 static void
 do_neon_dyadic_i64_su (void)
 {
-  enum neon_shape rs = neon_select_shape (NS_DDD, NS_QQQ, NS_NULL);
-  struct neon_type_el et = neon_check_type (3, rs,
-    N_EQK, N_EQK, N_SU_ALL | N_KEY);
-  neon_three_same (neon_quad (rs), et.type == NT_unsigned, et.size);
+  if (!check_simd_pred_availability (FALSE, NEON_CHECK_CC | NEON_CHECK_ARCH))
+    return;
+  enum neon_shape rs;
+  struct neon_type_el et;
+  if (ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext))
+    {
+      rs = neon_select_shape (NS_QQR, NS_QQQ, NS_NULL);
+      et = neon_check_type (3, rs, N_EQK, N_EQK, N_SU_MVE | N_KEY);
+    }
+  else
+    {
+      rs = neon_select_shape (NS_DDD, NS_QQQ, NS_NULL);
+      et = neon_check_type (3, rs, N_EQK, N_EQK, N_SU_ALL | N_KEY);
+    }
+  if (rs == NS_QQR)
+    mve_encode_qqr (et.size, et.type == NT_unsigned, 0);
+  else
+    neon_three_same (neon_quad (rs), et.type == NT_unsigned, et.size);
 }
 
 static void
@@ -15849,12 +16924,25 @@ neon_imm_shift (int write_ubit, int uval, int isquad, struct neon_type_el et,
 }
 
 static void
-do_neon_shl_imm (void)
+do_neon_shl (void)
 {
+  if (!check_simd_pred_availability (FALSE, NEON_CHECK_ARCH | NEON_CHECK_CC))
+   return;
+
   if (!inst.operands[2].isreg)
     {
-      enum neon_shape rs = neon_select_shape (NS_DDI, NS_QQI, NS_NULL);
-      struct neon_type_el et = neon_check_type (2, rs, N_EQK, N_KEY | N_I_ALL);
+      enum neon_shape rs;
+      struct neon_type_el et;
+      if (ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext))
+	{
+	  rs = neon_select_shape (NS_QQI, NS_NULL);
+	  et = neon_check_type (2, rs, N_EQK, N_KEY | N_I_MVE);
+	}
+      else
+	{
+	  rs = neon_select_shape (NS_DDI, NS_QQI, NS_NULL);
+	  et = neon_check_type (2, rs, N_EQK, N_KEY | N_I_ALL);
+	}
       int imm = inst.operands[2].imm;
 
       constraint (imm < 0 || (unsigned)imm >= et.size,
@@ -15864,33 +16952,77 @@ do_neon_shl_imm (void)
     }
   else
     {
-      enum neon_shape rs = neon_select_shape (NS_DDD, NS_QQQ, NS_NULL);
-      struct neon_type_el et = neon_check_type (3, rs,
-	N_EQK, N_SU_ALL | N_KEY, N_EQK | N_SGN);
-      unsigned int tmp;
-
-      /* VSHL/VQSHL 3-register variants have syntax such as:
-	   vshl.xx Dd, Dm, Dn
-	 whereas other 3-register operations encoded by neon_three_same have
-	 syntax like:
-	   vadd.xx Dd, Dn, Dm
-	 (i.e. with Dn & Dm reversed). Swap operands[1].reg and operands[2].reg
-	 here.  */
-      tmp = inst.operands[2].reg;
-      inst.operands[2].reg = inst.operands[1].reg;
-      inst.operands[1].reg = tmp;
-      NEON_ENCODE (INTEGER, inst);
-      neon_three_same (neon_quad (rs), et.type == NT_unsigned, et.size);
+      enum neon_shape rs;
+      struct neon_type_el et;
+      if (ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext))
+	{
+	  rs = neon_select_shape (NS_QQQ, NS_QQR, NS_NULL);
+	  et = neon_check_type (3, rs, N_EQK, N_SU_MVE | N_KEY, N_EQK | N_EQK);
+	}
+      else
+	{
+	  rs = neon_select_shape (NS_DDD, NS_QQQ, NS_NULL);
+	  et = neon_check_type (3, rs, N_EQK, N_SU_ALL | N_KEY, N_EQK | N_SGN);
+	}
+
+
+      if (rs == NS_QQR)
+	{
+	  constraint (inst.operands[0].reg != inst.operands[1].reg,
+		       _("invalid instruction shape"));
+	  if (inst.operands[2].reg == REG_SP)
+	    as_tsktsk (MVE_BAD_SP);
+	  else if (inst.operands[2].reg == REG_PC)
+	    as_tsktsk (MVE_BAD_PC);
+
+	  inst.instruction = 0xee311e60;
+	  inst.instruction |= (et.type == NT_unsigned) << 28;
+	  inst.instruction |= HI1 (inst.operands[0].reg) << 22;
+	  inst.instruction |= neon_logbits (et.size) << 18;
+	  inst.instruction |= LOW4 (inst.operands[0].reg) << 12;
+	  inst.instruction |= inst.operands[2].reg;
+	  inst.is_neon = 1;
+	}
+      else
+	{
+	  unsigned int tmp;
+
+	  /* VSHL/VQSHL 3-register variants have syntax such as:
+	       vshl.xx Dd, Dm, Dn
+	     whereas other 3-register operations encoded by neon_three_same have
+	     syntax like:
+	       vadd.xx Dd, Dn, Dm
+	     (i.e. with Dn & Dm reversed). Swap operands[1].reg and
+	     operands[2].reg here.  */
+	  tmp = inst.operands[2].reg;
+	  inst.operands[2].reg = inst.operands[1].reg;
+	  inst.operands[1].reg = tmp;
+	  NEON_ENCODE (INTEGER, inst);
+	  neon_three_same (neon_quad (rs), et.type == NT_unsigned, et.size);
+	}
     }
 }
 
 static void
-do_neon_qshl_imm (void)
+do_neon_qshl (void)
 {
+  if (!check_simd_pred_availability (FALSE, NEON_CHECK_ARCH | NEON_CHECK_CC))
+   return;
+
   if (!inst.operands[2].isreg)
     {
-      enum neon_shape rs = neon_select_shape (NS_DDI, NS_QQI, NS_NULL);
-      struct neon_type_el et = neon_check_type (2, rs, N_EQK, N_SU_ALL | N_KEY);
+      enum neon_shape rs;
+      struct neon_type_el et;
+      if (ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext))
+	{
+	  rs = neon_select_shape (NS_QQI, NS_NULL);
+	  et = neon_check_type (2, rs, N_EQK, N_KEY | N_SU_MVE);
+	}
+      else
+	{
+	  rs = neon_select_shape (NS_DDI, NS_QQI, NS_NULL);
+	  et = neon_check_type (2, rs, N_EQK, N_SU_ALL | N_KEY);
+	}
       int imm = inst.operands[2].imm;
 
       constraint (imm < 0 || (unsigned)imm >= et.size,
@@ -15900,32 +17032,103 @@ do_neon_qshl_imm (void)
     }
   else
     {
-      enum neon_shape rs = neon_select_shape (NS_DDD, NS_QQQ, NS_NULL);
-      struct neon_type_el et = neon_check_type (3, rs,
-	N_EQK, N_SU_ALL | N_KEY, N_EQK | N_SGN);
-      unsigned int tmp;
+      enum neon_shape rs;
+      struct neon_type_el et;
 
-      /* See note in do_neon_shl_imm.  */
-      tmp = inst.operands[2].reg;
-      inst.operands[2].reg = inst.operands[1].reg;
-      inst.operands[1].reg = tmp;
-      NEON_ENCODE (INTEGER, inst);
-      neon_three_same (neon_quad (rs), et.type == NT_unsigned, et.size);
+      if (ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext))
+	{
+	  rs = neon_select_shape (NS_QQQ, NS_QQR, NS_NULL);
+	  et = neon_check_type (3, rs, N_EQK, N_SU_MVE | N_KEY, N_EQK | N_EQK);
+	}
+      else
+	{
+	  rs = neon_select_shape (NS_DDD, NS_QQQ, NS_NULL);
+	  et = neon_check_type (3, rs, N_EQK, N_SU_ALL | N_KEY, N_EQK | N_SGN);
+	}
+
+      if (rs == NS_QQR)
+	{
+	  constraint (inst.operands[0].reg != inst.operands[1].reg,
+		       _("invalid instruction shape"));
+	  if (inst.operands[2].reg == REG_SP)
+	    as_tsktsk (MVE_BAD_SP);
+	  else if (inst.operands[2].reg == REG_PC)
+	    as_tsktsk (MVE_BAD_PC);
+
+	  inst.instruction = 0xee311ee0;
+	  inst.instruction |= (et.type == NT_unsigned) << 28;
+	  inst.instruction |= HI1 (inst.operands[0].reg) << 22;
+	  inst.instruction |= neon_logbits (et.size) << 18;
+	  inst.instruction |= LOW4 (inst.operands[0].reg) << 12;
+	  inst.instruction |= inst.operands[2].reg;
+	  inst.is_neon = 1;
+	}
+      else
+	{
+	  unsigned int tmp;
+
+	  /* See note in do_neon_shl.  */
+	  tmp = inst.operands[2].reg;
+	  inst.operands[2].reg = inst.operands[1].reg;
+	  inst.operands[1].reg = tmp;
+	  NEON_ENCODE (INTEGER, inst);
+	  neon_three_same (neon_quad (rs), et.type == NT_unsigned, et.size);
+	}
     }
 }
 
 static void
 do_neon_rshl (void)
 {
-  enum neon_shape rs = neon_select_shape (NS_DDD, NS_QQQ, NS_NULL);
-  struct neon_type_el et = neon_check_type (3, rs,
-    N_EQK, N_EQK, N_SU_ALL | N_KEY);
+  if (!check_simd_pred_availability (FALSE, NEON_CHECK_ARCH | NEON_CHECK_CC))
+   return;
+
+  enum neon_shape rs;
+  struct neon_type_el et;
+  if (ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext))
+    {
+      rs = neon_select_shape (NS_QQR, NS_QQQ, NS_NULL);
+      et = neon_check_type (3, rs, N_EQK, N_EQK, N_SU_MVE | N_KEY);
+    }
+  else
+    {
+      rs = neon_select_shape (NS_DDD, NS_QQQ, NS_NULL);
+      et = neon_check_type (3, rs, N_EQK, N_EQK, N_SU_ALL | N_KEY);
+    }
+
   unsigned int tmp;
 
-  tmp = inst.operands[2].reg;
-  inst.operands[2].reg = inst.operands[1].reg;
-  inst.operands[1].reg = tmp;
-  neon_three_same (neon_quad (rs), et.type == NT_unsigned, et.size);
+  if (rs == NS_QQR)
+    {
+      if (inst.operands[2].reg == REG_PC)
+	as_tsktsk (MVE_BAD_PC);
+      else if (inst.operands[2].reg == REG_SP)
+	as_tsktsk (MVE_BAD_SP);
+
+      constraint (inst.operands[0].reg != inst.operands[1].reg,
+		  _("invalid instruction shape"));
+
+      if (inst.instruction == 0x0000510)
+	/* We are dealing with vqrshl.  */
+	inst.instruction = 0xee331ee0;
+      else
+	/* We are dealing with vrshl.  */
+	inst.instruction = 0xee331e60;
+
+      inst.instruction |= (et.type == NT_unsigned) << 28;
+      inst.instruction |= HI1 (inst.operands[0].reg) << 22;
+      inst.instruction |= neon_logbits (et.size) << 18;
+      inst.instruction |= LOW4 (inst.operands[0].reg) << 12;
+      inst.instruction |= inst.operands[2].reg;
+      inst.is_neon = 1;
+    }
+  else
+    {
+      tmp = inst.operands[2].reg;
+      inst.operands[2].reg = inst.operands[1].reg;
+      inst.operands[1].reg = tmp;
+      neon_three_same (neon_quad (rs), et.type == NT_unsigned, et.size);
+    }
 }
 
 static int
@@ -15990,6 +17193,14 @@ do_neon_logic (void)
   if (inst.operands[2].present && inst.operands[2].isreg)
     {
       enum neon_shape rs = neon_select_shape (NS_DDD, NS_QQQ, NS_NULL);
+      if (rs == NS_QQQ
+	  && !check_simd_pred_availability (FALSE,
+					    NEON_CHECK_ARCH | NEON_CHECK_CC))
+	return;
+      else if (rs != NS_QQQ
+	       && !ARM_CPU_HAS_FEATURE (cpu_variant, fpu_neon_ext_v1))
+	first_error (BAD_FPU);
+
       neon_check_type (3, rs, N_IGNORE_TYPE);
       /* U bit and size field were set as part of the bitmask.  */
       NEON_ENCODE (INTEGER, inst);
@@ -16003,14 +17214,29 @@ do_neon_logic (void)
       enum neon_shape rs = (three_ops_form
 			    ? neon_select_shape (NS_DDI, NS_QQI, NS_NULL)
 			    : neon_select_shape (NS_DI, NS_QI, NS_NULL));
-      struct neon_type_el et = neon_check_type (2, rs,
-	N_I8 | N_I16 | N_I32 | N_I64 | N_F32 | N_KEY, N_EQK);
+      /* Because neon_select_shape makes the second operand a copy of the first
+	 if the second operand is not present.  */
+      if (rs == NS_QQI
+	  && !check_simd_pred_availability (FALSE,
+					    NEON_CHECK_ARCH | NEON_CHECK_CC))
+	return;
+      else if (rs != NS_QQI
+	       && !ARM_CPU_HAS_FEATURE (cpu_variant, fpu_neon_ext_v1))
+	first_error (BAD_FPU);
+
+      struct neon_type_el et;
+      if (ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext))
+	et = neon_check_type (2, rs, N_I32 | N_I16 | N_KEY, N_EQK);
+      else
+	et = neon_check_type (2, rs, N_I8 | N_I16 | N_I32 | N_I64 | N_F32
+			      | N_KEY, N_EQK);
+
+      if (et.type == NT_invtype)
+	return;
       enum neon_opc opcode = (enum neon_opc) inst.instruction & 0x0fffffff;
       unsigned immbits;
       int cmode;
 
-      if (et.type == NT_invtype)
-	return;
 
       if (three_ops_form)
 	constraint (inst.operands[0].reg != inst.operands[1].reg,
@@ -16080,128 +17306,44 @@ do_neon_bitfield (void)
 
 static void
 neon_dyadic_misc (enum neon_el_type ubit_meaning, unsigned types,
-		  unsigned destbits)
-{
-  enum neon_shape rs = neon_select_shape (NS_DDD, NS_QQQ, NS_QQR, NS_NULL);
-  struct neon_type_el et = neon_check_type (3, rs, N_EQK | destbits, N_EQK,
-					    types | N_KEY);
-  if (et.type == NT_float)
-    {
-      NEON_ENCODE (FLOAT, inst);
-      if (rs == NS_QQR)
-	mve_encode_qqr (et.size, 1);
-      else
-	neon_three_same (neon_quad (rs), 0, et.size == 16 ? (int) et.size : -1);
-    }
-  else
-    {
-      NEON_ENCODE (INTEGER, inst);
-      if (rs == NS_QQR)
-	mve_encode_qqr (et.size, 0);
-      else
-	neon_three_same (neon_quad (rs), et.type == ubit_meaning, et.size);
-    }
-}
-
-
-static void
-do_neon_dyadic_if_su_d (void)
-{
-  /* This version only allow D registers, but that constraint is enforced during
-     operand parsing so we don't need to do anything extra here.  */
-  neon_dyadic_misc (NT_unsigned, N_SUF_32, 0);
-}
-
-static void
-do_neon_dyadic_if_i_d (void)
-{
-  /* The "untyped" case can't happen. Do this to stop the "U" bit being
-     affected if we specify unsigned args.  */
-  neon_dyadic_misc (NT_untyped, N_IF_32, 0);
-}
-
-enum vfp_or_neon_is_neon_bits
-{
-  NEON_CHECK_CC = 1,
-  NEON_CHECK_ARCH = 2,
-  NEON_CHECK_ARCH8 = 4
-};
-
-/* Call this function if an instruction which may have belonged to the VFP or
-   Neon instruction sets, but turned out to be a Neon instruction (due to the
-   operand types involved, etc.). We have to check and/or fix-up a couple of
-   things:
-
-     - Make sure the user hasn't attempted to make a Neon instruction
-       conditional.
-     - Alter the value in the condition code field if necessary.
-     - Make sure that the arch supports Neon instructions.
-
-   Which of these operations take place depends on bits from enum
-   vfp_or_neon_is_neon_bits.
-
-   WARNING: This function has side effects! If NEON_CHECK_CC is used and the
-   current instruction's condition is COND_ALWAYS, the condition field is
-   changed to inst.uncond_value. This is necessary because instructions shared
-   between VFP and Neon may be conditional for the VFP variants only, and the
-   unconditional Neon version must have, e.g., 0xF in the condition field.  */
-
-static int
-vfp_or_neon_is_neon (unsigned check)
-{
-  /* Conditions are always legal in Thumb mode (IT blocks).  */
-  if (!thumb_mode && (check & NEON_CHECK_CC))
-    {
-      if (inst.cond != COND_ALWAYS)
-	{
-	  first_error (_(BAD_COND));
-	  return FAIL;
-	}
-      if (inst.uncond_value != -1)
-	inst.instruction |= inst.uncond_value << 28;
-    }
-
-
-    if (((check & NEON_CHECK_ARCH) && !mark_feature_used (&fpu_neon_ext_v1))
-	|| ((check & NEON_CHECK_ARCH8)
-	    && !mark_feature_used (&fpu_neon_ext_armv8)))
-      {
-	first_error (_(BAD_FPU));
-	return FAIL;
-      }
-
-  return SUCCESS;
-}
-
-static int
-check_simd_pred_availability (int fp, unsigned check)
+		  unsigned destbits)
 {
-  if (inst.cond > COND_ALWAYS)
-    {
-      if (!ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext))
-	{
-	  inst.error = BAD_FPU;
-	  return 1;
-	}
-      inst.pred_insn_type = INSIDE_VPT_INSN;
-    }
-  else if (inst.cond < COND_ALWAYS)
+  enum neon_shape rs = neon_select_shape (NS_DDD, NS_QQQ, NS_QQR, NS_NULL);
+  struct neon_type_el et = neon_check_type (3, rs, N_EQK | destbits, N_EQK,
+					    types | N_KEY);
+  if (et.type == NT_float)
     {
-      if (ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext))
-	inst.pred_insn_type = MVE_OUTSIDE_PRED_INSN;
-      else if (vfp_or_neon_is_neon (check) == FAIL)
-	return 2;
+      NEON_ENCODE (FLOAT, inst);
+      if (rs == NS_QQR)
+	mve_encode_qqr (et.size, 0, 1);
+      else
+	neon_three_same (neon_quad (rs), 0, et.size == 16 ? (int) et.size : -1);
     }
   else
     {
-      if (!ARM_CPU_HAS_FEATURE (cpu_variant, fp ? mve_fp_ext : mve_ext)
-	  && vfp_or_neon_is_neon (check) == FAIL)
-	return 3;
-
-      if (ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext))
-	inst.pred_insn_type = MVE_OUTSIDE_PRED_INSN;
+      NEON_ENCODE (INTEGER, inst);
+      if (rs == NS_QQR)
+	mve_encode_qqr (et.size, et.type == ubit_meaning, 0);
+      else
+	neon_three_same (neon_quad (rs), et.type == ubit_meaning, et.size);
     }
-  return 0;
+}
+
+
+static void
+do_neon_dyadic_if_su_d (void)
+{
+  /* This version only allow D registers, but that constraint is enforced during
+     operand parsing so we don't need to do anything extra here.  */
+  neon_dyadic_misc (NT_unsigned, N_SUF_32, 0);
+}
+
+static void
+do_neon_dyadic_if_i_d (void)
+{
+  /* The "untyped" case can't happen. Do this to stop the "U" bit being
+     affected if we specify unsigned args.  */
+  neon_dyadic_misc (NT_untyped, N_IF_32, 0);
 }
 
 static void
@@ -16245,6 +17387,7 @@ static void
 do_mve_vstr_vldr_RQ (int size, int elsize, int load)
 {
     unsigned os = inst.operands[1].imm >> 5;
+    unsigned type = inst.vectype.el[0].type;
     constraint (os != 0 && size == 8,
 		_("can not shift offsets when accessing less than half-word"));
     constraint (os && os != neon_logbits (size),
@@ -16275,15 +17418,14 @@ do_mve_vstr_vldr_RQ (int size, int elsize, int load)
 	constraint (inst.operands[0].reg == (inst.operands[1].imm & 0x1f),
 		    _("destination register and offset register may not be"
 		    " the same"));
-	constraint (size == elsize && inst.vectype.el[0].type != NT_unsigned,
+	constraint (size == elsize && type == NT_signed, BAD_EL_TYPE);
+	constraint (size != elsize && type != NT_unsigned && type != NT_signed,
 		    BAD_EL_TYPE);
-	constraint (inst.vectype.el[0].type != NT_unsigned
-		    && inst.vectype.el[0].type != NT_signed, BAD_EL_TYPE);
-	inst.instruction |= (inst.vectype.el[0].type == NT_unsigned) << 28;
+	inst.instruction |= ((size == elsize) || (type == NT_unsigned)) << 28;
       }
     else
       {
-	constraint (inst.vectype.el[0].type != NT_untyped, BAD_EL_TYPE);
+	constraint (type != NT_untyped, BAD_EL_TYPE);
       }
 
     inst.instruction |= 1 << 23;
@@ -16483,6 +17625,30 @@ do_mve_vst_vld (void)
   inst.is_neon = 1;
 }
 
+static void
+do_mve_vaddlv (void)
+{
+  enum neon_shape rs = neon_select_shape (NS_RRQ, NS_NULL);
+  struct neon_type_el et
+    = neon_check_type (3, rs, N_EQK, N_EQK, N_S32 | N_U32 | N_KEY);
+
+  if (et.type == NT_invtype)
+    first_error (BAD_EL_TYPE);
+
+  if (inst.cond > COND_ALWAYS)
+    inst.pred_insn_type = INSIDE_VPT_INSN;
+  else
+    inst.pred_insn_type = MVE_OUTSIDE_PRED_INSN;
+
+  constraint (inst.operands[1].reg > 14, MVE_BAD_QREG);
+
+  inst.instruction |= (et.type == NT_unsigned) << 28;
+  inst.instruction |= inst.operands[1].reg << 19;
+  inst.instruction |= inst.operands[0].reg << 12;
+  inst.instruction |= inst.operands[2].reg;
+  inst.is_neon = 1;
+}
+
 static void
 do_neon_dyadic_if_su (void)
 {
@@ -16490,8 +17656,13 @@ do_neon_dyadic_if_su (void)
   struct neon_type_el et = neon_check_type (3, rs, N_EQK , N_EQK,
 					    N_SUF_32 | N_KEY);
 
-  if (check_simd_pred_availability (et.type == NT_float,
-				    NEON_CHECK_ARCH | NEON_CHECK_CC))
+  constraint ((inst.instruction == ((unsigned) N_MNEM_vmax)
+	       || inst.instruction == ((unsigned) N_MNEM_vmin))
+	      && et.type == NT_float
+	      && !ARM_CPU_HAS_FEATURE (cpu_variant,fpu_neon_ext_v1), BAD_FPU);
+
+  if (!check_simd_pred_availability (et.type == NT_float,
+				     NEON_CHECK_ARCH | NEON_CHECK_CC))
     return;
 
   neon_dyadic_misc (NT_unsigned, N_SUF_32, 0);
@@ -16515,8 +17686,8 @@ do_neon_addsub_if_i (void)
      they are predicated or not.  */
   if ((rs == NS_QQQ || rs == NS_QQR) && et.size != 64)
     {
-      if (check_simd_pred_availability (et.type == NT_float,
-					NEON_CHECK_ARCH | NEON_CHECK_CC))
+      if (!check_simd_pred_availability (et.type == NT_float,
+					 NEON_CHECK_ARCH | NEON_CHECK_CC))
 	return;
     }
   else
@@ -16633,131 +17804,434 @@ neon_scalar_for_mul (unsigned scalar, unsigned elsize)
 	goto bad_scalar;
       return regno | (elno << 3);
 
-    case 32:
-      if (regno > 15 || elno > 1)
-	goto bad_scalar;
-      return regno | (elno << 4);
+    case 32:
+      if (regno > 15 || elno > 1)
+	goto bad_scalar;
+      return regno | (elno << 4);
+
+    default:
+    bad_scalar:
+      first_error (_("scalar out of range for multiply instruction"));
+    }
+
+  return 0;
+}
+
+/* Encode multiply / multiply-accumulate scalar instructions.  */
+
+static void
+neon_mul_mac (struct neon_type_el et, int ubit)
+{
+  unsigned scalar;
+
+  /* Give a more helpful error message if we have an invalid type.  */
+  if (et.type == NT_invtype)
+    return;
+
+  scalar = neon_scalar_for_mul (inst.operands[2].reg, et.size);
+  inst.instruction |= LOW4 (inst.operands[0].reg) << 12;
+  inst.instruction |= HI1 (inst.operands[0].reg) << 22;
+  inst.instruction |= LOW4 (inst.operands[1].reg) << 16;
+  inst.instruction |= HI1 (inst.operands[1].reg) << 7;
+  inst.instruction |= LOW4 (scalar);
+  inst.instruction |= HI1 (scalar) << 5;
+  inst.instruction |= (et.type == NT_float) << 8;
+  inst.instruction |= neon_logbits (et.size) << 20;
+  inst.instruction |= (ubit != 0) << 24;
+
+  neon_dp_fixup (&inst);
+}
+
+static void
+do_neon_mac_maybe_scalar (void)
+{
+  if (try_vfp_nsyn (3, do_vfp_nsyn_mla_mls) == SUCCESS)
+    return;
+
+  if (!check_simd_pred_availability (FALSE, NEON_CHECK_CC | NEON_CHECK_ARCH))
+    return;
+
+  if (inst.operands[2].isscalar)
+    {
+      constraint (ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext), BAD_FPU);
+      enum neon_shape rs = neon_select_shape (NS_DDS, NS_QQS, NS_NULL);
+      struct neon_type_el et = neon_check_type (3, rs,
+	N_EQK, N_EQK, N_I16 | N_I32 | N_F_16_32 | N_KEY);
+      NEON_ENCODE (SCALAR, inst);
+      neon_mul_mac (et, neon_quad (rs));
+    }
+  else if (!inst.operands[2].isvec)
+    {
+      constraint (!ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext), BAD_FPU);
+
+      enum neon_shape rs = neon_select_shape (NS_QQR, NS_NULL);
+      neon_check_type (3, rs, N_EQK, N_EQK, N_SU_MVE | N_KEY);
+
+      neon_dyadic_misc (NT_unsigned, N_SU_MVE, 0);
+    }
+  else
+    {
+      constraint (ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext), BAD_FPU);
+      /* The "untyped" case can't happen.  Do this to stop the "U" bit being
+	 affected if we specify unsigned args.  */
+      neon_dyadic_misc (NT_untyped, N_IF_32, 0);
+    }
+}
+
+static void
+do_bfloat_vfma (void)
+{
+  constraint (!mark_feature_used (&fpu_neon_ext_armv8), _(BAD_FPU));
+  constraint (!mark_feature_used (&arm_ext_bf16), _(BAD_BF16));
+  enum neon_shape rs;
+  int t_bit = 0;
+
+  if (inst.instruction != B_MNEM_vfmab)
+  {
+      t_bit = 1;
+      inst.instruction = B_MNEM_vfmat;
+  }
+
+  if (inst.operands[2].isscalar)
+    {
+      rs = neon_select_shape (NS_QQS, NS_NULL);
+      neon_check_type (3, rs, N_EQK, N_EQK, N_BF16 | N_KEY);
+
+      inst.instruction |= (1 << 25);
+      int index = inst.operands[2].reg & 0xf;
+      constraint (!(index < 4), _("index must be in the range 0 to 3"));
+      inst.operands[2].reg >>= 4;
+      constraint (!(inst.operands[2].reg < 8),
+		  _("indexed register must be less than 8"));
+      neon_three_args (t_bit);
+      inst.instruction |= ((index & 1) << 3);
+      inst.instruction |= ((index & 2) << 4);
+    }
+  else
+    {
+      rs = neon_select_shape (NS_QQQ, NS_NULL);
+      neon_check_type (3, rs, N_EQK, N_EQK, N_BF16 | N_KEY);
+      neon_three_args (t_bit);
+    }
+
+}
+
+static void
+do_neon_fmac (void)
+{
+  if (ARM_CPU_HAS_FEATURE (cpu_variant, fpu_vfp_ext_fma)
+      && try_vfp_nsyn (3, do_vfp_nsyn_fma_fms) == SUCCESS)
+    return;
+
+  if (!check_simd_pred_availability (TRUE, NEON_CHECK_CC | NEON_CHECK_ARCH))
+    return;
+
+  if (ARM_CPU_HAS_FEATURE (cpu_variant, mve_fp_ext))
+    {
+      enum neon_shape rs = neon_select_shape (NS_QQQ, NS_QQR, NS_NULL);
+      struct neon_type_el et = neon_check_type (3, rs, N_F_MVE | N_KEY, N_EQK,
+						N_EQK);
+
+      if (rs == NS_QQR)
+	{
+
+	  if (inst.operands[2].reg == REG_SP)
+	    as_tsktsk (MVE_BAD_SP);
+	  else if (inst.operands[2].reg == REG_PC)
+	    as_tsktsk (MVE_BAD_PC);
+
+	  inst.instruction = 0xee310e40;
+	  inst.instruction |= (et.size == 16) << 28;
+	  inst.instruction |= HI1 (inst.operands[0].reg) << 22;
+	  inst.instruction |= LOW4 (inst.operands[1].reg) << 16;
+	  inst.instruction |= LOW4 (inst.operands[0].reg) << 12;
+	  inst.instruction |= HI1 (inst.operands[1].reg) << 6;
+	  inst.instruction |= inst.operands[2].reg;
+	  inst.is_neon = 1;
+	  return;
+	}
+    }
+  else
+    {
+      constraint (!inst.operands[2].isvec, BAD_FPU);
+    }
+
+  neon_dyadic_misc (NT_untyped, N_IF_32, 0);
+}
+
+static void
+do_mve_vfma (void)
+{
+  if (!ARM_CPU_HAS_FEATURE (cpu_variant, arm_ext_bf16) &&
+      inst.cond == COND_ALWAYS)
+    {
+      constraint (!ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext), BAD_FPU);
+      inst.instruction = N_MNEM_vfma;
+      inst.pred_insn_type = INSIDE_VPT_INSN;
+      inst.cond = 0xf;
+      return do_neon_fmac();
+    }
+  else
+    {
+      do_bfloat_vfma();
+    }
+}
+
+static void
+do_neon_tst (void)
+{
+  enum neon_shape rs = neon_select_shape (NS_DDD, NS_QQQ, NS_NULL);
+  struct neon_type_el et = neon_check_type (3, rs,
+    N_EQK, N_EQK, N_8 | N_16 | N_32 | N_KEY);
+  neon_three_same (neon_quad (rs), 0, et.size);
+}
+
+/* VMUL with 3 registers allows the P8 type. The scalar version supports the
+   same types as the MAC equivalents. The polynomial type for this instruction
+   is encoded the same as the integer type.  */
+
+static void
+do_neon_mul (void)
+{
+  if (try_vfp_nsyn (3, do_vfp_nsyn_mul) == SUCCESS)
+    return;
+
+  if (!check_simd_pred_availability (FALSE, NEON_CHECK_CC | NEON_CHECK_ARCH))
+    return;
+
+  if (inst.operands[2].isscalar)
+    {
+      constraint (ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext), BAD_FPU);
+      do_neon_mac_maybe_scalar ();
+    }
+  else
+    {
+      if (ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext))
+	{
+	  enum neon_shape rs = neon_select_shape (NS_QQR, NS_QQQ, NS_NULL);
+	  struct neon_type_el et
+	    = neon_check_type (3, rs, N_EQK, N_EQK, N_I_MVE | N_F_MVE | N_KEY);
+	  if (et.type == NT_float)
+	    constraint (!ARM_CPU_HAS_FEATURE (cpu_variant, mve_fp_ext),
+			BAD_FPU);
+
+	  neon_dyadic_misc (NT_float, N_I_MVE | N_F_MVE, 0);
+	}
+      else
+	{
+	  constraint (!inst.operands[2].isvec, BAD_FPU);
+	  neon_dyadic_misc (NT_poly,
+			    N_I8 | N_I16 | N_I32 | N_F16 | N_F32 | N_P8, 0);
+	}
+    }
+}
+
+static void
+do_neon_qdmulh (void)
+{
+  if (!check_simd_pred_availability (FALSE, NEON_CHECK_ARCH | NEON_CHECK_CC))
+   return;
+
+  if (inst.operands[2].isscalar)
+    {
+      constraint (ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext), BAD_FPU);
+      enum neon_shape rs = neon_select_shape (NS_DDS, NS_QQS, NS_NULL);
+      struct neon_type_el et = neon_check_type (3, rs,
+	N_EQK, N_EQK, N_S16 | N_S32 | N_KEY);
+      NEON_ENCODE (SCALAR, inst);
+      neon_mul_mac (et, neon_quad (rs));
+    }
+  else
+    {
+      enum neon_shape rs;
+      struct neon_type_el et;
+      if (ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext))
+	{
+	  rs = neon_select_shape (NS_QQR, NS_QQQ, NS_NULL);
+	  et = neon_check_type (3, rs,
+	    N_EQK, N_EQK, N_S8 | N_S16 | N_S32 | N_KEY);
+	}
+      else
+	{
+	  rs = neon_select_shape (NS_DDD, NS_QQQ, NS_NULL);
+	  et = neon_check_type (3, rs,
+	    N_EQK, N_EQK, N_S16 | N_S32 | N_KEY);
+	}
+
+      NEON_ENCODE (INTEGER, inst);
+      if (rs == NS_QQR)
+	mve_encode_qqr (et.size, 0, 0);
+      else
+	/* The U bit (rounding) comes from bit mask.  */
+	neon_three_same (neon_quad (rs), 0, et.size);
+    }
+}
+
+static void
+do_mve_vaddv (void)
+{
+  enum neon_shape rs = neon_select_shape (NS_RQ, NS_NULL);
+  struct neon_type_el et
+    = neon_check_type (2, rs, N_EQK,  N_SU_32 | N_KEY);
+
+  if (et.type == NT_invtype)
+    first_error (BAD_EL_TYPE);
+
+  if (inst.cond > COND_ALWAYS)
+    inst.pred_insn_type = INSIDE_VPT_INSN;
+  else
+    inst.pred_insn_type = MVE_OUTSIDE_PRED_INSN;
 
-    default:
-    bad_scalar:
-      first_error (_("scalar out of range for multiply instruction"));
-    }
+  constraint (inst.operands[1].reg > 14, MVE_BAD_QREG);
 
-  return 0;
+  mve_encode_rq (et.type == NT_unsigned, et.size);
 }
 
-/* Encode multiply / multiply-accumulate scalar instructions.  */
-
 static void
-neon_mul_mac (struct neon_type_el et, int ubit)
+do_mve_vhcadd (void)
 {
-  unsigned scalar;
+  enum neon_shape rs = neon_select_shape (NS_QQQI, NS_NULL);
+  struct neon_type_el et
+    = neon_check_type (3, rs, N_EQK, N_EQK, N_S8 | N_S16 | N_S32 | N_KEY);
 
-  /* Give a more helpful error message if we have an invalid type.  */
-  if (et.type == NT_invtype)
-    return;
+  if (inst.cond > COND_ALWAYS)
+    inst.pred_insn_type = INSIDE_VPT_INSN;
+  else
+    inst.pred_insn_type = MVE_OUTSIDE_PRED_INSN;
 
-  scalar = neon_scalar_for_mul (inst.operands[2].reg, et.size);
-  inst.instruction |= LOW4 (inst.operands[0].reg) << 12;
-  inst.instruction |= HI1 (inst.operands[0].reg) << 22;
-  inst.instruction |= LOW4 (inst.operands[1].reg) << 16;
-  inst.instruction |= HI1 (inst.operands[1].reg) << 7;
-  inst.instruction |= LOW4 (scalar);
-  inst.instruction |= HI1 (scalar) << 5;
-  inst.instruction |= (et.type == NT_float) << 8;
-  inst.instruction |= neon_logbits (et.size) << 20;
-  inst.instruction |= (ubit != 0) << 24;
+  unsigned rot = inst.relocs[0].exp.X_add_number;
+  constraint (rot != 90 && rot != 270, _("immediate out of range"));
 
-  neon_dp_fixup (&inst);
+  if (et.size == 32 && inst.operands[0].reg == inst.operands[2].reg)
+    as_tsktsk (_("Warning: 32-bit element size and same first and third "
+		 "operand makes instruction UNPREDICTABLE"));
+
+  mve_encode_qqq (0, et.size);
+  inst.instruction |= (rot == 270) << 12;
+  inst.is_neon = 1;
 }
 
 static void
-do_neon_mac_maybe_scalar (void)
+do_mve_vqdmull (void)
 {
-  if (try_vfp_nsyn (3, do_vfp_nsyn_mla_mls) == SUCCESS)
-    return;
+  enum neon_shape rs = neon_select_shape (NS_QQQ, NS_QQR, NS_NULL);
+  struct neon_type_el et
+    = neon_check_type (3, rs, N_EQK, N_EQK, N_S16 | N_S32 | N_KEY);
 
-  if (vfp_or_neon_is_neon (NEON_CHECK_CC | NEON_CHECK_ARCH) == FAIL)
-    return;
+  if (et.size == 32
+      && (inst.operands[0].reg == inst.operands[1].reg
+	  || (rs == NS_QQQ && inst.operands[0].reg == inst.operands[2].reg)))
+    as_tsktsk (BAD_MVE_SRCDEST);
 
-  if (inst.operands[2].isscalar)
+  if (inst.cond > COND_ALWAYS)
+    inst.pred_insn_type = INSIDE_VPT_INSN;
+  else
+    inst.pred_insn_type = MVE_OUTSIDE_PRED_INSN;
+
+  if (rs == NS_QQQ)
     {
-      enum neon_shape rs = neon_select_shape (NS_DDS, NS_QQS, NS_NULL);
-      struct neon_type_el et = neon_check_type (3, rs,
-	N_EQK, N_EQK, N_I16 | N_I32 | N_F_16_32 | N_KEY);
-      NEON_ENCODE (SCALAR, inst);
-      neon_mul_mac (et, neon_quad (rs));
+      mve_encode_qqq (et.size == 32, 64);
+      inst.instruction |= 1;
     }
   else
     {
-      /* The "untyped" case can't happen.  Do this to stop the "U" bit being
-	 affected if we specify unsigned args.  */
-      neon_dyadic_misc (NT_untyped, N_IF_32, 0);
+      mve_encode_qqr (64, et.size == 32, 0);
+      inst.instruction |= 0x3 << 5;
     }
 }
 
 static void
-do_neon_fmac (void)
+do_mve_vadc (void)
 {
-  if (try_vfp_nsyn (3, do_vfp_nsyn_fma_fms) == SUCCESS)
-    return;
+  enum neon_shape rs = neon_select_shape (NS_QQQ, NS_NULL);
+  struct neon_type_el et
+    = neon_check_type (3, rs, N_KEY | N_I32, N_EQK, N_EQK);
 
-  if (vfp_or_neon_is_neon (NEON_CHECK_CC | NEON_CHECK_ARCH) == FAIL)
-    return;
+  if (et.type == NT_invtype)
+    first_error (BAD_EL_TYPE);
 
-  neon_dyadic_misc (NT_untyped, N_IF_32, 0);
+  if (inst.cond > COND_ALWAYS)
+    inst.pred_insn_type = INSIDE_VPT_INSN;
+  else
+    inst.pred_insn_type = MVE_OUTSIDE_PRED_INSN;
+
+  mve_encode_qqq (0, 64);
 }
 
 static void
-do_neon_tst (void)
+do_mve_vbrsr (void)
 {
-  enum neon_shape rs = neon_select_shape (NS_DDD, NS_QQQ, NS_NULL);
-  struct neon_type_el et = neon_check_type (3, rs,
-    N_EQK, N_EQK, N_8 | N_16 | N_32 | N_KEY);
-  neon_three_same (neon_quad (rs), 0, et.size);
+  enum neon_shape rs = neon_select_shape (NS_QQR, NS_NULL);
+  struct neon_type_el et
+    = neon_check_type (3, rs, N_EQK, N_EQK, N_8 | N_16 | N_32 | N_KEY);
+
+  if (inst.cond > COND_ALWAYS)
+    inst.pred_insn_type = INSIDE_VPT_INSN;
+  else
+    inst.pred_insn_type = MVE_OUTSIDE_PRED_INSN;
+
+  mve_encode_qqr (et.size, 0, 0);
 }
 
-/* VMUL with 3 registers allows the P8 type. The scalar version supports the
-   same types as the MAC equivalents. The polynomial type for this instruction
-   is encoded the same as the integer type.  */
+static void
+do_mve_vsbc (void)
+{
+  neon_check_type (3, NS_QQQ, N_EQK, N_EQK, N_I32 | N_KEY);
+
+  if (inst.cond > COND_ALWAYS)
+    inst.pred_insn_type = INSIDE_VPT_INSN;
+  else
+    inst.pred_insn_type = MVE_OUTSIDE_PRED_INSN;
+
+  mve_encode_qqq (1, 64);
+}
 
 static void
-do_neon_mul (void)
+do_mve_vmulh (void)
 {
-  if (try_vfp_nsyn (3, do_vfp_nsyn_mul) == SUCCESS)
-    return;
+  enum neon_shape rs = neon_select_shape (NS_QQQ, NS_NULL);
+  struct neon_type_el et
+    = neon_check_type (3, rs, N_EQK, N_EQK, N_SU_MVE | N_KEY);
 
-  if (vfp_or_neon_is_neon (NEON_CHECK_CC | NEON_CHECK_ARCH) == FAIL)
-    return;
+  if (inst.cond > COND_ALWAYS)
+    inst.pred_insn_type = INSIDE_VPT_INSN;
+  else
+    inst.pred_insn_type = MVE_OUTSIDE_PRED_INSN;
 
-  if (inst.operands[2].isscalar)
-    do_neon_mac_maybe_scalar ();
+  mve_encode_qqq (et.type == NT_unsigned, et.size);
+}
+
+static void
+do_mve_vqdmlah (void)
+{
+  enum neon_shape rs = neon_select_shape (NS_QQR, NS_NULL);
+  struct neon_type_el et
+    = neon_check_type (3, rs, N_EQK, N_EQK, N_S_32 | N_KEY);
+
+  if (inst.cond > COND_ALWAYS)
+    inst.pred_insn_type = INSIDE_VPT_INSN;
   else
-    neon_dyadic_misc (NT_poly, N_I8 | N_I16 | N_I32 | N_F16 | N_F32 | N_P8, 0);
+    inst.pred_insn_type = MVE_OUTSIDE_PRED_INSN;
+
+  mve_encode_qqr (et.size, et.type == NT_unsigned, 0);
 }
 
 static void
-do_neon_qdmulh (void)
+do_mve_vqdmladh (void)
 {
-  if (inst.operands[2].isscalar)
-    {
-      enum neon_shape rs = neon_select_shape (NS_DDS, NS_QQS, NS_NULL);
-      struct neon_type_el et = neon_check_type (3, rs,
-	N_EQK, N_EQK, N_S16 | N_S32 | N_KEY);
-      NEON_ENCODE (SCALAR, inst);
-      neon_mul_mac (et, neon_quad (rs));
-    }
+  enum neon_shape rs = neon_select_shape (NS_QQQ, NS_NULL);
+  struct neon_type_el et
+    = neon_check_type (3, rs, N_EQK, N_EQK, N_S8 | N_S16 | N_S32 | N_KEY);
+
+  if (inst.cond > COND_ALWAYS)
+    inst.pred_insn_type = INSIDE_VPT_INSN;
   else
-    {
-      enum neon_shape rs = neon_select_shape (NS_DDD, NS_QQQ, NS_NULL);
-      struct neon_type_el et = neon_check_type (3, rs,
-	N_EQK, N_EQK, N_S16 | N_S32 | N_KEY);
-      NEON_ENCODE (INTEGER, inst);
-      /* The U bit (rounding) comes from bit mask.  */
-      neon_three_same (neon_quad (rs), 0, et.size);
-    }
+    inst.pred_insn_type = MVE_OUTSIDE_PRED_INSN;
+
+  mve_encode_qqq (0, et.size);
 }
 
+
 static void
 do_mve_vmull (void)
 {
@@ -16873,34 +18347,159 @@ do_mve_vmladav (void)
 }
 
 static void
-do_neon_qrdmlah (void)
+do_mve_vmlaldav (void)
+{
+  enum neon_shape rs = neon_select_shape (NS_RRQQ, NS_NULL);
+  struct neon_type_el et
+    = neon_check_type (4, rs, N_EQK, N_EQK, N_EQK,
+		       N_S16 | N_S32 | N_U16 | N_U32 | N_KEY);
+
+  if (et.type == NT_unsigned
+      && (inst.instruction == M_MNEM_vmlsldav
+	  || inst.instruction == M_MNEM_vmlsldava
+	  || inst.instruction == M_MNEM_vmlsldavx
+	  || inst.instruction == M_MNEM_vmlsldavax))
+    first_error (BAD_SIMD_TYPE);
+
+  if (inst.cond > COND_ALWAYS)
+    inst.pred_insn_type = INSIDE_VPT_INSN;
+  else
+    inst.pred_insn_type = MVE_OUTSIDE_PRED_INSN;
+
+  mve_encode_rrqq (et.type == NT_unsigned, et.size);
+}
+
+static void
+do_mve_vrmlaldavh (void)
 {
-  /* Check we're on the correct architecture.  */
-  if (!mark_feature_used (&fpu_neon_ext_armv8))
-    inst.error =
-      _("instruction form not available on this architecture.");
-  else if (!mark_feature_used (&fpu_neon_ext_v8_1))
+  struct neon_type_el et;
+  if (inst.instruction == M_MNEM_vrmlsldavh
+     || inst.instruction == M_MNEM_vrmlsldavha
+     || inst.instruction == M_MNEM_vrmlsldavhx
+     || inst.instruction == M_MNEM_vrmlsldavhax)
     {
-      as_warn (_("this instruction implies use of ARMv8.1 AdvSIMD."));
-      record_feature_use (&fpu_neon_ext_v8_1);
+      et = neon_check_type (4, NS_RRQQ, N_EQK, N_EQK, N_EQK, N_S32 | N_KEY);
+      if (inst.operands[1].reg == REG_SP)
+	as_tsktsk (MVE_BAD_SP);
     }
+  else
+    {
+      if (inst.instruction == M_MNEM_vrmlaldavhx
+	  || inst.instruction == M_MNEM_vrmlaldavhax)
+	et = neon_check_type (4, NS_RRQQ, N_EQK, N_EQK, N_EQK, N_S32 | N_KEY);
+      else
+	et = neon_check_type (4, NS_RRQQ, N_EQK, N_EQK, N_EQK,
+			      N_U32 | N_S32 | N_KEY);
+      /* vrmlaldavh's encoding with SP as the second, odd, GPR operand may alias
+	 with vmax/min instructions, making the use of SP in assembly really
+	 nonsensical, so instead of issuing a warning like we do for other uses
+	 of SP for the odd register operand we error out.  */
+      constraint (inst.operands[1].reg == REG_SP, BAD_SP);
+    }
+
+  /* Make sure we still check the second operand is an odd one and that PC is
+     disallowed.  This because we are parsing for any GPR operand, to be able
+     to distinguish between giving a warning or an error for SP as described
+     above.  */
+  constraint ((inst.operands[1].reg % 2) != 1, BAD_EVEN);
+  constraint (inst.operands[1].reg == REG_PC, BAD_PC);
 
-  if (inst.operands[2].isscalar)
+  if (inst.cond > COND_ALWAYS)
+    inst.pred_insn_type = INSIDE_VPT_INSN;
+  else
+    inst.pred_insn_type = MVE_OUTSIDE_PRED_INSN;
+
+  mve_encode_rrqq (et.type == NT_unsigned, 0);
+}
+
+
+static void
+do_mve_vmaxnmv (void)
+{
+  enum neon_shape rs = neon_select_shape (NS_RQ, NS_NULL);
+  struct neon_type_el et
+    = neon_check_type (2, rs, N_EQK, N_F_MVE | N_KEY);
+
+  if (inst.cond > COND_ALWAYS)
+    inst.pred_insn_type = INSIDE_VPT_INSN;
+  else
+    inst.pred_insn_type = MVE_OUTSIDE_PRED_INSN;
+
+  if (inst.operands[0].reg == REG_SP)
+    as_tsktsk (MVE_BAD_SP);
+  else if (inst.operands[0].reg == REG_PC)
+    as_tsktsk (MVE_BAD_PC);
+
+  mve_encode_rq (et.size == 16, 64);
+}
+
+static void
+do_mve_vmaxv (void)
+{
+  enum neon_shape rs = neon_select_shape (NS_RQ, NS_NULL);
+  struct neon_type_el et;
+
+  if (inst.instruction == M_MNEM_vmaxv || inst.instruction == M_MNEM_vminv)
+    et = neon_check_type (2, rs, N_EQK, N_SU_MVE | N_KEY);
+  else
+    et = neon_check_type (2, rs, N_EQK, N_S8 | N_S16 | N_S32 | N_KEY);
+
+  if (inst.cond > COND_ALWAYS)
+    inst.pred_insn_type = INSIDE_VPT_INSN;
+  else
+    inst.pred_insn_type = MVE_OUTSIDE_PRED_INSN;
+
+  if (inst.operands[0].reg == REG_SP)
+    as_tsktsk (MVE_BAD_SP);
+  else if (inst.operands[0].reg == REG_PC)
+    as_tsktsk (MVE_BAD_PC);
+
+  mve_encode_rq (et.type == NT_unsigned, et.size);
+}
+
+
+static void
+do_neon_qrdmlah (void)
+{
+  if (!check_simd_pred_availability (FALSE, NEON_CHECK_ARCH | NEON_CHECK_CC))
+   return;
+  if (!ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext))
     {
-      enum neon_shape rs = neon_select_shape (NS_DDS, NS_QQS, NS_NULL);
-      struct neon_type_el et = neon_check_type (3, rs,
-	N_EQK, N_EQK, N_S16 | N_S32 | N_KEY);
-      NEON_ENCODE (SCALAR, inst);
-      neon_mul_mac (et, neon_quad (rs));
+      /* Check we're on the correct architecture.  */
+      if (!mark_feature_used (&fpu_neon_ext_armv8))
+	inst.error
+	  = _("instruction form not available on this architecture.");
+      else if (!mark_feature_used (&fpu_neon_ext_v8_1))
+	{
+	  as_warn (_("this instruction implies use of ARMv8.1 AdvSIMD."));
+	  record_feature_use (&fpu_neon_ext_v8_1);
+	}
+	if (inst.operands[2].isscalar)
+	  {
+	    enum neon_shape rs = neon_select_shape (NS_DDS, NS_QQS, NS_NULL);
+	    struct neon_type_el et = neon_check_type (3, rs,
+	      N_EQK, N_EQK, N_S16 | N_S32 | N_KEY);
+	    NEON_ENCODE (SCALAR, inst);
+	    neon_mul_mac (et, neon_quad (rs));
+	  }
+	else
+	  {
+	    enum neon_shape rs = neon_select_shape (NS_DDD, NS_QQQ, NS_NULL);
+	    struct neon_type_el et = neon_check_type (3, rs,
+	      N_EQK, N_EQK, N_S16 | N_S32 | N_KEY);
+	    NEON_ENCODE (INTEGER, inst);
+	    /* The U bit (rounding) comes from bit mask.  */
+	    neon_three_same (neon_quad (rs), 0, et.size);
+	  }
     }
   else
     {
-      enum neon_shape rs = neon_select_shape (NS_DDD, NS_QQQ, NS_NULL);
-      struct neon_type_el et = neon_check_type (3, rs,
-	N_EQK, N_EQK, N_S16 | N_S32 | N_KEY);
+      enum neon_shape rs = neon_select_shape (NS_QQR, NS_NULL);
+      struct neon_type_el et
+	= neon_check_type (3, rs, N_EQK, N_EQK, N_S_32 | N_KEY);
+
       NEON_ENCODE (INTEGER, inst);
-      /* The U bit (rounding) comes from bit mask.  */
-      neon_three_same (neon_quad (rs), 0, et.size);
+      mve_encode_qqr (et.size, et.type == NT_unsigned, 0);
     }
 }
 
@@ -16942,8 +18541,8 @@ do_neon_abs_neg (void)
   rs = neon_select_shape (NS_DD, NS_QQ, NS_NULL);
   et = neon_check_type (2, rs, N_EQK, N_S_32 | N_F_16_32 | N_KEY);
 
-  if (check_simd_pred_availability (et.type == NT_float,
-				    NEON_CHECK_ARCH | NEON_CHECK_CC))
+  if (!check_simd_pred_availability (et.type == NT_float,
+				     NEON_CHECK_ARCH | NEON_CHECK_CC))
     return;
 
   inst.instruction |= LOW4 (inst.operands[0].reg) << 12;
@@ -16960,9 +18559,23 @@ do_neon_abs_neg (void)
 static void
 do_neon_sli (void)
 {
-  enum neon_shape rs = neon_select_shape (NS_DDI, NS_QQI, NS_NULL);
-  struct neon_type_el et = neon_check_type (2, rs,
-    N_EQK, N_8 | N_16 | N_32 | N_64 | N_KEY);
+  if (!check_simd_pred_availability (FALSE, NEON_CHECK_ARCH | NEON_CHECK_CC))
+    return;
+
+  enum neon_shape rs;
+  struct neon_type_el et;
+  if (ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext))
+    {
+      rs = neon_select_shape (NS_QQI, NS_NULL);
+      et = neon_check_type (2, rs, N_EQK, N_8 | N_16 | N_32 | N_KEY);
+    }
+  else
+    {
+      rs = neon_select_shape (NS_DDI, NS_QQI, NS_NULL);
+      et = neon_check_type (2, rs, N_EQK, N_8 | N_16 | N_32 | N_64 | N_KEY);
+    }
+
+
   int imm = inst.operands[2].imm;
   constraint (imm < 0 || (unsigned)imm >= et.size,
 	      _("immediate out of range for insert"));
@@ -16972,21 +18585,48 @@ do_neon_sli (void)
 static void
 do_neon_sri (void)
 {
-  enum neon_shape rs = neon_select_shape (NS_DDI, NS_QQI, NS_NULL);
-  struct neon_type_el et = neon_check_type (2, rs,
-    N_EQK, N_8 | N_16 | N_32 | N_64 | N_KEY);
+  if (!check_simd_pred_availability (FALSE, NEON_CHECK_ARCH | NEON_CHECK_CC))
+    return;
+
+  enum neon_shape rs;
+  struct neon_type_el et;
+  if (ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext))
+    {
+      rs = neon_select_shape (NS_QQI, NS_NULL);
+      et = neon_check_type (2, rs, N_EQK, N_8 | N_16 | N_32 | N_KEY);
+    }
+  else
+    {
+      rs = neon_select_shape (NS_DDI, NS_QQI, NS_NULL);
+      et = neon_check_type (2, rs, N_EQK, N_8 | N_16 | N_32 | N_64 | N_KEY);
+    }
+
   int imm = inst.operands[2].imm;
   constraint (imm < 1 || (unsigned)imm > et.size,
 	      _("immediate out of range for insert"));
   neon_imm_shift (FALSE, 0, neon_quad (rs), et, et.size - imm);
 }
 
-static void
-do_neon_qshlu_imm (void)
-{
-  enum neon_shape rs = neon_select_shape (NS_DDI, NS_QQI, NS_NULL);
-  struct neon_type_el et = neon_check_type (2, rs,
-    N_EQK | N_UNS, N_S8 | N_S16 | N_S32 | N_S64 | N_KEY);
+static void
+do_neon_qshlu_imm (void)
+{
+  if (!check_simd_pred_availability (FALSE, NEON_CHECK_ARCH | NEON_CHECK_CC))
+    return;
+
+  enum neon_shape rs;
+  struct neon_type_el et;
+  if (ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext))
+    {
+      rs = neon_select_shape (NS_QQI, NS_NULL);
+      et = neon_check_type (2, rs, N_EQK, N_S8 | N_S16 | N_S32 | N_KEY);
+    }
+  else
+    {
+      rs = neon_select_shape (NS_DDI, NS_QQI, NS_NULL);
+      et = neon_check_type (2, rs, N_EQK | N_UNS,
+			    N_S8 | N_S16 | N_S32 | N_S64 | N_KEY);
+    }
+
   int imm = inst.operands[2].imm;
   constraint (imm < 0 || (unsigned)imm >= et.size,
 	      _("immediate out of range for shift"));
@@ -17166,6 +18806,7 @@ do_neon_shll (void)
   CVT_VAR (f16_u32, N_F16 | N_KEY, N_U32, N_VFP, "fultos", "fuitos", NULL)    \
   CVT_VAR (u32_f16, N_U32, N_F16 | N_KEY, N_VFP, "ftouls", "ftouis", "ftouizs")\
   CVT_VAR (s32_f16, N_S32, N_F16 | N_KEY, N_VFP, "ftosls", "ftosis", "ftosizs")\
+  CVT_VAR (bf16_f32, N_BF16, N_F32, whole_reg,   NULL, NULL, NULL)	      \
   /* VFP instructions.  */						      \
   CVT_VAR (f32_f64, N_F32, N_F64, N_VFP,       NULL,     "fcvtsd", NULL)      \
   CVT_VAR (f64_f32, N_F64, N_F32, N_VFP,       NULL,     "fcvtds", NULL)      \
@@ -17443,7 +19084,8 @@ do_neon_cvt_1 (enum neon_cvt_mode mode)
 	      || flavour == neon_cvt_flavour_s32_f32
 	      || flavour == neon_cvt_flavour_u32_f32))
 	{
-	  if (check_simd_pred_availability (1, NEON_CHECK_CC | NEON_CHECK_ARCH))
+	  if (!check_simd_pred_availability (TRUE,
+					     NEON_CHECK_CC | NEON_CHECK_ARCH))
 	    return;
 	}
       else if (mode == neon_cvt_mode_n)
@@ -17530,8 +19172,8 @@ do_neon_cvt_1 (enum neon_cvt_mode mode)
 	      || flavour == neon_cvt_flavour_s32_f32
 	      || flavour == neon_cvt_flavour_u32_f32))
 	{
-	  if (check_simd_pred_availability (1,
-					    NEON_CHECK_CC | NEON_CHECK_ARCH8))
+	  if (!check_simd_pred_availability (TRUE,
+					     NEON_CHECK_CC | NEON_CHECK_ARCH8))
 	    return;
 	}
       else if (mode == neon_cvt_mode_z
@@ -17544,8 +19186,8 @@ do_neon_cvt_1 (enum neon_cvt_mode mode)
 		   || flavour == neon_cvt_flavour_s32_f32
 		   || flavour == neon_cvt_flavour_u32_f32))
 	{
-	  if (check_simd_pred_availability (1,
-					    NEON_CHECK_CC | NEON_CHECK_ARCH))
+	  if (!check_simd_pred_availability (TRUE,
+					     NEON_CHECK_CC | NEON_CHECK_ARCH))
 	    return;
 	}
       /* fall through.  */
@@ -17554,8 +19196,8 @@ do_neon_cvt_1 (enum neon_cvt_mode mode)
 	{
 
 	  NEON_ENCODE (FLOAT, inst);
-	  if (check_simd_pred_availability (1,
-					    NEON_CHECK_CC | NEON_CHECK_ARCH8))
+	  if (!check_simd_pred_availability (TRUE,
+					     NEON_CHECK_CC | NEON_CHECK_ARCH8))
 	    return;
 
 	  inst.instruction |= LOW4 (inst.operands[0].reg) << 12;
@@ -17632,8 +19274,21 @@ do_neon_cvt_1 (enum neon_cvt_mode mode)
 	  }
 
       if (rs == NS_DQ)
-	inst.instruction = 0x3b60600;
+	{
+	  if (flavour == neon_cvt_flavour_bf16_f32)
+	    {
+	      if (vfp_or_neon_is_neon (NEON_CHECK_ARCH8) == FAIL)
+		return;
+	      constraint (!mark_feature_used (&arm_ext_bf16), _(BAD_BF16));
+	      /* VCVT.bf16.f32.  */
+	      inst.instruction = 0x11b60640;
+	    }
+	  else
+	    /* VCVT.f16.f32.  */
+	    inst.instruction = 0x3b60600;
+	}
       else
+	/* VCVT.f32.f16.  */
 	inst.instruction = 0x3b60700;
 
       inst.instruction |= LOW4 (inst.operands[0].reg) << 12;
@@ -17715,7 +19370,7 @@ do_neon_cvttb_1 (bfd_boolean t)
   else if (rs == NS_QQ || rs == NS_QQI)
     {
       int single_to_half = 0;
-      if (check_simd_pred_availability (1, NEON_CHECK_ARCH))
+      if (!check_simd_pred_availability (TRUE, NEON_CHECK_ARCH))
 	return;
 
       enum neon_cvt_flavour flavour = get_neon_cvt_flavour (rs);
@@ -17783,6 +19438,14 @@ do_neon_cvttb_1 (bfd_boolean t)
       inst.error = NULL;
       do_neon_cvttb_2 (t, /*to=*/FALSE, /*is_double=*/TRUE);
     }
+  else if (neon_check_type (2, rs, N_BF16 | N_VFP, N_F32).type != NT_invtype)
+    {
+      constraint (!mark_feature_used (&arm_ext_bf16), _(BAD_BF16));
+      inst.error = NULL;
+      inst.instruction |= (1 << 8);
+      inst.instruction &= ~(1 << 9);
+      do_neon_cvttb_2 (t, /*to=*/TRUE, /*is_double=*/FALSE);
+    }
   else
     return;
 }
@@ -17855,9 +19518,16 @@ neon_move_immediate (void)
 static void
 do_neon_mvn (void)
 {
+  if (!check_simd_pred_availability (FALSE, NEON_CHECK_CC | NEON_CHECK_ARCH))
+    return;
+
   if (inst.operands[1].isreg)
     {
-      enum neon_shape rs = neon_select_shape (NS_DD, NS_QQ, NS_NULL);
+      enum neon_shape rs;
+      if (ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext))
+	rs = neon_select_shape (NS_QQ, NS_NULL);
+      else
+	rs = neon_select_shape (NS_DD, NS_QQ, NS_NULL);
 
       NEON_ENCODE (INTEGER, inst);
       inst.instruction |= LOW4 (inst.operands[0].reg) << 12;
@@ -17873,6 +19543,13 @@ do_neon_mvn (void)
     }
 
   neon_dp_fixup (&inst);
+
+  if (ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext))
+    {
+      constraint (!inst.operands[1].isreg && !inst.operands[0].isquad, BAD_FPU);
+      constraint ((inst.instruction & 0xd00) == 0xd00,
+		  _("immediate value out of range"));
+    }
 }
 
 /* Encode instructions of form:
@@ -18020,16 +19697,6 @@ do_neon_fmac_maybe_scalar_long (int subtype)
      0x2.  */
   int size = -1;
 
-  if (inst.cond != COND_ALWAYS)
-    as_warn (_("vfmal/vfmsl with FP16 type cannot be conditional, the "
-	       "behaviour is UNPREDICTABLE"));
-
-  constraint (!ARM_CPU_HAS_FEATURE (cpu_variant, arm_ext_fp16_fml),
-	      _(BAD_FP16));
-
-  constraint (!ARM_CPU_HAS_FEATURE (cpu_variant, fpu_neon_ext_armv8),
-	      _(BAD_FPU));
-
   /* vfmal/vfmsl are in three-same D/Q register format or the third operand can
      be a scalar index register.  */
   if (inst.operands[2].isscalar)
@@ -18048,7 +19715,16 @@ do_neon_fmac_maybe_scalar_long (int subtype)
       rs = neon_select_shape (NS_DHH, NS_QDD, NS_NULL);
     }
 
-  neon_check_type (3, rs, N_EQK, N_EQK, N_KEY | N_F16);
+
+  if (inst.cond != COND_ALWAYS)
+    as_warn (_("vfmal/vfmsl with FP16 type cannot be conditional, the "
+	       "behaviour is UNPREDICTABLE"));
+
+  constraint (!ARM_CPU_HAS_FEATURE (cpu_variant, arm_ext_fp16_fml),
+	      _(BAD_FP16));
+
+  constraint (!ARM_CPU_HAS_FEATURE (cpu_variant, fpu_neon_ext_armv8),
+	      _(BAD_FPU));
 
   /* "opcode" from template has included "ubit", so simply pass 0 here.  Also,
      the "S" bit in size field has been reused to differentiate vfmal and vfmsl,
@@ -18185,14 +19861,29 @@ do_neon_ext (void)
 static void
 do_neon_rev (void)
 {
-  enum neon_shape rs = neon_select_shape (NS_DD, NS_QQ, NS_NULL);
+  if (!check_simd_pred_availability (FALSE, NEON_CHECK_ARCH | NEON_CHECK_CC))
+   return;
+
+  enum neon_shape rs;
+  if (ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext))
+    rs = neon_select_shape (NS_QQ, NS_NULL);
+  else
+    rs = neon_select_shape (NS_DD, NS_QQ, NS_NULL);
+
   struct neon_type_el et = neon_check_type (2, rs,
     N_EQK, N_8 | N_16 | N_32 | N_KEY);
+
   unsigned op = (inst.instruction >> 7) & 3;
   /* N (width of reversed regions) is encoded as part of the bitmask. We
      extract it here to check the elements to be reversed are smaller.
      Otherwise we'd get a reserved instruction.  */
   unsigned elsize = (op == 2) ? 16 : (op == 1) ? 32 : (op == 0) ? 64 : 0;
+
+  if (ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext) && elsize == 64
+      && inst.operands[0].reg == inst.operands[1].reg)
+    as_tsktsk (_("Warning: 64-bit element size and same destination and source"
+		 " operands makes instruction UNPREDICTABLE"));
+
   gas_assert (elsize != 0);
   constraint (et.size >= elsize,
 	      _("elements must be smaller than reversal region"));
@@ -18204,6 +19895,8 @@ do_neon_dup (void)
 {
   if (inst.operands[1].isscalar)
     {
+      constraint (!ARM_CPU_HAS_FEATURE (cpu_variant, fpu_neon_ext_v1),
+		  BAD_FPU);
       enum neon_shape rs = neon_select_shape (NS_DS, NS_QS, NS_NULL);
       struct neon_type_el et = neon_check_type (2, rs,
 	N_EQK, N_8 | N_16 | N_32 | N_KEY);
@@ -18231,6 +19924,23 @@ do_neon_dup (void)
       enum neon_shape rs = neon_select_shape (NS_DR, NS_QR, NS_NULL);
       struct neon_type_el et = neon_check_type (2, rs,
 	N_8 | N_16 | N_32 | N_KEY, N_EQK);
+      if (rs == NS_QR)
+	{
+	  if (!check_simd_pred_availability (FALSE, NEON_CHECK_ARCH))
+	    return;
+	}
+      else
+	constraint (!ARM_CPU_HAS_FEATURE (cpu_variant, fpu_neon_ext_v1),
+		    BAD_FPU);
+
+      if (ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext))
+	{
+	  if (inst.operands[1].reg == REG_SP)
+	    as_tsktsk (MVE_BAD_SP);
+	  else if (inst.operands[1].reg == REG_PC)
+	    as_tsktsk (MVE_BAD_PC);
+	}
+
       /* Duplicate ARM register to lanes of vector.  */
       NEON_ENCODE (ARMREG, inst);
       switch (et.size)
@@ -18373,7 +20083,13 @@ do_neon_mov (void)
       et = neon_check_type (2, rs, N_EQK, N_F64 | N_KEY);
       /* It is not an error here if no type is given.  */
       inst.error = NULL;
-      if (et.type == NT_float && et.size == 64)
+
+      /* In MVE we interpret the following instructions as same, so ignoring
+	 the following type (float) and size (64) checks.
+	 a: VMOV<c><q> <Dd>, <Dm>
+	 b: VMOV<c><q>.F64 <Dd>, <Dm>.  */
+      if ((et.type == NT_float && et.size == 64)
+	  || (ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext)))
 	{
 	  do_vfp_nsyn_opcode ("fcpyd");
 	  break;
@@ -18382,7 +20098,8 @@ do_neon_mov (void)
 
     case NS_QQ:  /* case 0/1.  */
       {
-	if (check_simd_pred_availability (0, NEON_CHECK_CC | NEON_CHECK_ARCH))
+	if (!check_simd_pred_availability (FALSE,
+					   NEON_CHECK_CC | NEON_CHECK_ARCH))
 	  return;
 	/* The architecture manual I have doesn't explicitly state which
 	   value the U bit should have for register->register moves, but
@@ -18412,7 +20129,8 @@ do_neon_mov (void)
       /* fall through.  */
 
     case NS_QI:  /* case 2/3.  */
-      if (check_simd_pred_availability (0, NEON_CHECK_CC | NEON_CHECK_ARCH))
+      if (!check_simd_pred_availability (FALSE,
+					 NEON_CHECK_CC | NEON_CHECK_ARCH))
 	return;
       inst.instruction = 0x0800010;
       neon_move_immediate ();
@@ -18717,8 +20435,22 @@ do_mve_movl (void)
 static void
 do_neon_rshift_round_imm (void)
 {
-  enum neon_shape rs = neon_select_shape (NS_DDI, NS_QQI, NS_NULL);
-  struct neon_type_el et = neon_check_type (2, rs, N_EQK, N_SU_ALL | N_KEY);
+  if (!check_simd_pred_availability (FALSE, NEON_CHECK_ARCH | NEON_CHECK_CC))
+   return;
+
+  enum neon_shape rs;
+  struct neon_type_el et;
+
+  if (ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext))
+    {
+      rs = neon_select_shape (NS_QQI, NS_NULL);
+      et = neon_check_type (2, rs, N_EQK, N_SU_MVE | N_KEY);
+    }
+  else
+    {
+      rs = neon_select_shape (NS_DDI, NS_QQI, NS_NULL);
+      et = neon_check_type (2, rs, N_EQK, N_SU_ALL | N_KEY);
+    }
   int imm = inst.operands[2].imm;
 
   /* imm == 0 case is encoded as VMOV for V{R}SHR.  */
@@ -18800,7 +20532,14 @@ do_neon_zip_uzp (void)
 static void
 do_neon_sat_abs_neg (void)
 {
-  enum neon_shape rs = neon_select_shape (NS_DD, NS_QQ, NS_NULL);
+  if (!check_simd_pred_availability (FALSE, NEON_CHECK_CC | NEON_CHECK_ARCH))
+    return;
+
+  enum neon_shape rs;
+  if (ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext))
+    rs = neon_select_shape (NS_QQ, NS_NULL);
+  else
+    rs = neon_select_shape (NS_DD, NS_QQ, NS_NULL);
   struct neon_type_el et = neon_check_type (2, rs,
     N_EQK, N_S8 | N_S16 | N_S32 | N_KEY);
   neon_two_same (neon_quad (rs), 1, et.size);
@@ -18829,7 +20568,15 @@ do_neon_recip_est (void)
 static void
 do_neon_cls (void)
 {
-  enum neon_shape rs = neon_select_shape (NS_DD, NS_QQ, NS_NULL);
+  if (!check_simd_pred_availability (FALSE, NEON_CHECK_ARCH | NEON_CHECK_CC))
+    return;
+
+  enum neon_shape rs;
+  if (ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext))
+   rs = neon_select_shape (NS_QQ, NS_NULL);
+  else
+   rs = neon_select_shape (NS_DD, NS_QQ, NS_NULL);
+
   struct neon_type_el et = neon_check_type (2, rs,
     N_EQK, N_S8 | N_S16 | N_S32 | N_KEY);
   neon_two_same (neon_quad (rs), 1, et.size);
@@ -18838,7 +20585,15 @@ do_neon_cls (void)
 static void
 do_neon_clz (void)
 {
-  enum neon_shape rs = neon_select_shape (NS_DD, NS_QQ, NS_NULL);
+  if (!check_simd_pred_availability (FALSE, NEON_CHECK_ARCH | NEON_CHECK_CC))
+    return;
+
+  enum neon_shape rs;
+  if (ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext))
+   rs = neon_select_shape (NS_QQ, NS_NULL);
+  else
+   rs = neon_select_shape (NS_DD, NS_QQ, NS_NULL);
+
   struct neon_type_el et = neon_check_type (2, rs,
     N_EQK, N_I8 | N_I16 | N_I32 | N_KEY);
   neon_two_same (neon_quad (rs), 1, et.size);
@@ -19377,12 +21132,13 @@ do_vsel (void)
 static void
 do_vmaxnm (void)
 {
-  set_pred_insn_type (OUTSIDE_PRED_INSN);
+  if (!ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext))
+    set_pred_insn_type (OUTSIDE_PRED_INSN);
 
   if (try_vfp_nsyn (3, do_vfp_nsyn_fpv8) == SUCCESS)
     return;
 
-  if (vfp_or_neon_is_neon (NEON_CHECK_CC | NEON_CHECK_ARCH8) == FAIL)
+  if (!check_simd_pred_availability (TRUE, NEON_CHECK_CC | NEON_CHECK_ARCH8))
     return;
 
   neon_dyadic_misc (NT_untyped, N_F_16_32, 0);
@@ -19446,12 +21202,12 @@ do_vrint_1 (enum neon_cvt_mode mode)
       if (et.type == NT_invtype)
 	return;
 
-      set_pred_insn_type (OUTSIDE_PRED_INSN);
-      NEON_ENCODE (FLOAT, inst);
-
-      if (vfp_or_neon_is_neon (NEON_CHECK_CC | NEON_CHECK_ARCH8) == FAIL)
+      if (!check_simd_pred_availability (TRUE,
+					 NEON_CHECK_CC | NEON_CHECK_ARCH8))
 	return;
 
+      NEON_ENCODE (FLOAT, inst);
+
       inst.instruction |= LOW4 (inst.operands[0].reg) << 12;
       inst.instruction |= HI1 (inst.operands[0].reg) << 22;
       inst.instruction |= LOW4 (inst.operands[1].reg);
@@ -19540,16 +21296,24 @@ neon_scalar_for_vcmla (unsigned opnd, unsigned elsize)
 static void
 do_vcmla (void)
 {
-  constraint (!ARM_CPU_HAS_FEATURE (cpu_variant, fpu_neon_ext_armv8),
-	      _(BAD_FPU));
+  constraint (!ARM_CPU_HAS_FEATURE (cpu_variant, mve_fp_ext)
+	      && (!ARM_CPU_HAS_FEATURE (cpu_variant, fpu_neon_ext_armv8)
+		  || !mark_feature_used (&arm_ext_v8_3)), (BAD_FPU));
   constraint (inst.relocs[0].exp.X_op != O_constant,
 	      _("expression too complex"));
   unsigned rot = inst.relocs[0].exp.X_add_number;
   constraint (rot != 0 && rot != 90 && rot != 180 && rot != 270,
 	      _("immediate out of range"));
   rot /= 90;
+
+  if (!check_simd_pred_availability (TRUE,
+				     NEON_CHECK_ARCH8 | NEON_CHECK_CC))
+    return;
+
   if (inst.operands[2].isscalar)
     {
+      if (ARM_CPU_HAS_FEATURE (cpu_variant, mve_fp_ext))
+	first_error (_("invalid instruction shape"));
       enum neon_shape rs = neon_select_shape (NS_DDSI, NS_QQSI, NS_NULL);
       unsigned size = neon_check_type (3, rs, N_EQK, N_EQK,
 				       N_KEY | N_F16 | N_F32).size;
@@ -19568,9 +21332,19 @@ do_vcmla (void)
     }
   else
     {
-      enum neon_shape rs = neon_select_shape (NS_DDDI, NS_QQQI, NS_NULL);
+      enum neon_shape rs;
+      if (ARM_CPU_HAS_FEATURE (cpu_variant, mve_fp_ext))
+	rs = neon_select_shape (NS_QQQI, NS_NULL);
+      else
+	rs = neon_select_shape (NS_DDDI, NS_QQQI, NS_NULL);
+
       unsigned size = neon_check_type (3, rs, N_EQK, N_EQK,
 				       N_KEY | N_F16 | N_F32).size;
+      if (ARM_CPU_HAS_FEATURE (cpu_variant, mve_fp_ext) && size == 32
+	  && (inst.operands[0].reg == inst.operands[1].reg
+	      || inst.operands[0].reg == inst.operands[2].reg))
+	as_tsktsk (BAD_MVE_SRCDEST);
+
       neon_three_same (neon_quad (rs), 0, -1);
       inst.instruction &= 0x00ffffff; /* Undo neon_dp_fixup.  */
       inst.instruction |= 0xfc200800;
@@ -19582,20 +21356,60 @@ do_vcmla (void)
 static void
 do_vcadd (void)
 {
-  constraint (!ARM_CPU_HAS_FEATURE (cpu_variant, fpu_neon_ext_armv8),
-	      _(BAD_FPU));
+  constraint (!ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext)
+	      && (!ARM_CPU_HAS_FEATURE (cpu_variant, fpu_neon_ext_armv8)
+		  || !mark_feature_used (&arm_ext_v8_3)), (BAD_FPU));
   constraint (inst.relocs[0].exp.X_op != O_constant,
 	      _("expression too complex"));
+
   unsigned rot = inst.relocs[0].exp.X_add_number;
   constraint (rot != 90 && rot != 270, _("immediate out of range"));
-  enum neon_shape rs = neon_select_shape (NS_DDDI, NS_QQQI, NS_NULL);
-  unsigned size = neon_check_type (3, rs, N_EQK, N_EQK,
-				   N_KEY | N_F16 | N_F32).size;
-  neon_three_same (neon_quad (rs), 0, -1);
-  inst.instruction &= 0x00ffffff; /* Undo neon_dp_fixup.  */
-  inst.instruction |= 0xfc800800;
-  inst.instruction |= (rot == 270) << 24;
-  inst.instruction |= (size == 32) << 20;
+  enum neon_shape rs;
+  struct neon_type_el et;
+  if (!ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext))
+    {
+      rs = neon_select_shape (NS_DDDI, NS_QQQI, NS_NULL);
+      et = neon_check_type (3, rs, N_EQK, N_EQK, N_KEY | N_F16 | N_F32);
+    }
+  else
+    {
+      rs = neon_select_shape (NS_QQQI, NS_NULL);
+      et = neon_check_type (3, rs, N_EQK, N_EQK, N_KEY | N_F16 | N_F32 | N_I8
+			    | N_I16 | N_I32);
+      if (et.size == 32 && inst.operands[0].reg == inst.operands[2].reg)
+	as_tsktsk (_("Warning: 32-bit element size and same first and third "
+		     "operand makes instruction UNPREDICTABLE"));
+    }
+
+  if (et.type == NT_invtype)
+    return;
+
+  if (!check_simd_pred_availability (et.type == NT_float,
+				     NEON_CHECK_ARCH8 | NEON_CHECK_CC))
+    return;
+
+  if (et.type == NT_float)
+    {
+      neon_three_same (neon_quad (rs), 0, -1);
+      inst.instruction &= 0x00ffffff; /* Undo neon_dp_fixup.  */
+      inst.instruction |= 0xfc800800;
+      inst.instruction |= (rot == 270) << 24;
+      inst.instruction |= (et.size == 32) << 20;
+    }
+  else
+    {
+      constraint (!ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext), BAD_FPU);
+      inst.instruction = 0xfe000f00;
+      inst.instruction |= HI1 (inst.operands[0].reg) << 22;
+      inst.instruction |= neon_logbits (et.size) << 20;
+      inst.instruction |= LOW4 (inst.operands[1].reg) << 16;
+      inst.instruction |= LOW4 (inst.operands[0].reg) << 12;
+      inst.instruction |= (rot == 270) << 12;
+      inst.instruction |= HI1 (inst.operands[1].reg) << 7;
+      inst.instruction |= HI1 (inst.operands[2].reg) << 5;
+      inst.instruction |= LOW4 (inst.operands[2].reg);
+      inst.is_neon = 1;
+    }
 }
 
 /* Dot Product instructions encoding support.  */
@@ -19671,6 +21485,79 @@ do_neon_dotproduct_u (void)
   return do_neon_dotproduct (1);
 }
 
+static void
+do_vusdot (void)
+{
+  enum neon_shape rs;
+  set_pred_insn_type (OUTSIDE_PRED_INSN);
+  if (inst.operands[2].isscalar)
+    {
+      rs = neon_select_shape (NS_DDS, NS_QQS, NS_NULL);
+      neon_check_type (3, rs, N_EQK, N_EQK, N_S8 | N_KEY);
+
+      inst.instruction |= (1 << 25);
+      int index = inst.operands[2].reg & 0xf;
+      constraint ((index != 1 && index != 0), _("index must be 0 or 1"));
+      inst.operands[2].reg >>= 4;
+      constraint (!(inst.operands[2].reg < 16),
+		  _("indexed register must be less than 16"));
+      neon_three_args (rs == NS_QQS);
+      inst.instruction |= (index << 5);
+    }
+  else
+    {
+      inst.instruction |= (1 << 21);
+      rs = neon_select_shape (NS_DDD, NS_QQQ, NS_NULL);
+      neon_check_type (3, rs, N_EQK, N_EQK, N_S8 | N_KEY);
+      neon_three_args (rs == NS_QQQ);
+    }
+}
+
+static void
+do_vsudot (void)
+{
+  enum neon_shape rs;
+  set_pred_insn_type (OUTSIDE_PRED_INSN);
+  if (inst.operands[2].isscalar)
+    {
+      rs = neon_select_shape (NS_DDS, NS_QQS, NS_NULL);
+      neon_check_type (3, rs, N_EQK, N_EQK, N_U8 | N_KEY);
+
+      inst.instruction |= (1 << 25);
+      int index = inst.operands[2].reg & 0xf;
+      constraint ((index != 1 && index != 0), _("index must be 0 or 1"));
+      inst.operands[2].reg >>= 4;
+      constraint (!(inst.operands[2].reg < 16),
+		  _("indexed register must be less than 16"));
+      neon_three_args (rs == NS_QQS);
+      inst.instruction |= (index << 5);
+    }
+}
+
+static void
+do_vsmmla (void)
+{
+  enum neon_shape rs = neon_select_shape (NS_QQQ, NS_NULL);
+  neon_check_type (3, rs, N_EQK, N_EQK, N_S8 | N_KEY);
+
+  set_pred_insn_type (OUTSIDE_PRED_INSN);
+
+  neon_three_args (1);
+
+}
+
+static void
+do_vummla (void)
+{
+  enum neon_shape rs = neon_select_shape (NS_QQQ, NS_NULL);
+  neon_check_type (3, rs, N_EQK, N_EQK, N_U8 | N_KEY);
+
+  set_pred_insn_type (OUTSIDE_PRED_INSN);
+
+  neon_three_args (1);
+
+}
+
 /* Crypto v1 instructions.  */
 static void
 do_crypto_2op_1 (unsigned elttype, int op)
@@ -19860,6 +21747,46 @@ do_vjcvt (void)
   do_vfp_cond_or_thumb ();
 }
 
+static void
+do_vdot (void)
+{
+  enum neon_shape rs;
+  constraint (!mark_feature_used (&fpu_neon_ext_armv8), _(BAD_FPU));
+  set_pred_insn_type (OUTSIDE_PRED_INSN);
+  if (inst.operands[2].isscalar)
+    {
+      rs = neon_select_shape (NS_DDS, NS_QQS, NS_NULL);
+      neon_check_type (3, rs, N_EQK, N_EQK, N_BF16 | N_KEY);
+
+      inst.instruction |= (1 << 25);
+      int index = inst.operands[2].reg & 0xf;
+      constraint ((index != 1 && index != 0), _("index must be 0 or 1"));
+      inst.operands[2].reg >>= 4;
+      constraint (!(inst.operands[2].reg < 16),
+		  _("indexed register must be less than 16"));
+      neon_three_args (rs == NS_QQS);
+      inst.instruction |= (index << 5);
+    }
+  else
+    {
+      rs = neon_select_shape (NS_DDD, NS_QQQ, NS_NULL);
+      neon_check_type (3, rs, N_EQK, N_EQK, N_BF16 | N_KEY);
+      neon_three_args (rs == NS_QQQ);
+    }
+}
+
+static void
+do_vmmla (void)
+{
+  enum neon_shape rs = neon_select_shape (NS_QQQ, NS_NULL);
+  neon_check_type (3, rs, N_EQK, N_EQK, N_BF16 | N_KEY);
+
+  constraint (!mark_feature_used (&fpu_neon_ext_armv8), _(BAD_FPU));
+  set_pred_insn_type (OUTSIDE_PRED_INSN);
+
+  neon_three_args (1);
+}
+
 
 /* Overall per-instruction processing.	*/
 
@@ -21248,7 +23175,7 @@ arm_frob_label (symbolS * sym)
      out of the jump table, and chaos would ensue.  */
   if (label_is_thumb_function_name
       && (S_GET_NAME (sym)[0] != '.' || S_GET_NAME (sym)[1] != 'L')
-      && (bfd_get_section_flags (stdoutput, now_seg) & SEC_CODE) != 0)
+      && (bfd_section_flags (now_seg) & SEC_CODE) != 0)
     {
       /* When the address of a Thumb function is taken the bottom
 	 bit of that address should be set.  This will allow
@@ -21408,6 +23335,10 @@ static const struct reg_entry reg_names[] =
   REGDEF(mvfr0,7,VFC), REGDEF(mvfr1,6,VFC),
   REGDEF(MVFR0,7,VFC), REGDEF(MVFR1,6,VFC),
   REGDEF(mvfr2,5,VFC), REGDEF(MVFR2,5,VFC),
+  REGDEF(fpscr_nzcvqc,2,VFC), REGDEF(FPSCR_nzcvqc,2,VFC),
+  REGDEF(vpr,12,VFC), REGDEF(VPR,12,VFC),
+  REGDEF(fpcxt_ns,14,VFC), REGDEF(FPCXT_NS,14,VFC),
+  REGDEF(fpcxt_s,15,VFC), REGDEF(FPCXT_S,15,VFC),
 
   /* Maverick DSP coprocessor registers.  */
   REGSET(mvf,MVF),  REGSET(mvd,MVD),  REGSET(mvfx,MVFX),  REGSET(mvdx,MVDX),
@@ -22528,15 +24459,13 @@ static const struct asm_opcode insns[] =
   nUF(vselvs, _vselvs, 3, (RVSD, RVSD, RVSD),		vsel),
   nUF(vselge, _vselge, 3, (RVSD, RVSD, RVSD),		vsel),
   nUF(vselgt, _vselgt, 3, (RVSD, RVSD, RVSD),		vsel),
-  nUF(vmaxnm, _vmaxnm, 3, (RNSDQ, oRNSDQ, RNSDQ),	vmaxnm),
-  nUF(vminnm, _vminnm, 3, (RNSDQ, oRNSDQ, RNSDQ),	vmaxnm),
   nCE(vrintr, _vrintr, 2, (RNSDQ, oRNSDQ),		vrintr),
-  nCE(vrintz, _vrintr, 2, (RNSDQ, oRNSDQ),		vrintz),
-  nCE(vrintx, _vrintr, 2, (RNSDQ, oRNSDQ),		vrintx),
-  nUF(vrinta, _vrinta, 2, (RNSDQ, oRNSDQ),		vrinta),
-  nUF(vrintn, _vrinta, 2, (RNSDQ, oRNSDQ),		vrintn),
-  nUF(vrintp, _vrinta, 2, (RNSDQ, oRNSDQ),		vrintp),
-  nUF(vrintm, _vrinta, 2, (RNSDQ, oRNSDQ),		vrintm),
+  mnCE(vrintz, _vrintr, 2, (RNSDQMQ, oRNSDQMQ),		vrintz),
+  mnCE(vrintx, _vrintr, 2, (RNSDQMQ, oRNSDQMQ),		vrintx),
+  mnUF(vrinta, _vrinta, 2, (RNSDQMQ, oRNSDQMQ),		vrinta),
+  mnUF(vrintn, _vrinta, 2, (RNSDQMQ, oRNSDQMQ),		vrintn),
+  mnUF(vrintp, _vrinta, 2, (RNSDQMQ, oRNSDQMQ),		vrintp),
+  mnUF(vrintm, _vrinta, 2, (RNSDQMQ, oRNSDQMQ),		vrintm),
 
   /* Crypto v1 extensions.  */
 #undef  ARM_VARIANT
@@ -22582,8 +24511,6 @@ static const struct asm_opcode insns[] =
 #undef  THUMB_VARIANT
 #define THUMB_VARIANT & arm_ext_v8_3
  NCE (vjcvt, eb90bc0, 2, (RVS, RVD), vjcvt),
- NUF (vcmla, 0, 4, (RNDQ, RNDQ, RNDQ_RNSC, EXPi), vcmla),
- NUF (vcadd, 0, 4, (RNDQ, RNDQ, RNDQ, EXPi), vcadd),
 
 #undef  ARM_VARIANT
 #define ARM_VARIANT   & fpu_neon_ext_dotprod
@@ -23039,11 +24966,14 @@ static const struct asm_opcode insns[] =
 
 #undef  ARM_VARIANT
 #define ARM_VARIANT  & fpu_vfp_ext_v1xd  /* VFP V1xD (single precision).  */
+#undef THUMB_VARIANT
+#define THUMB_VARIANT  & arm_ext_v6t2
+ mcCE(vmrs,	ef00a10, 2, (APSR_RR, RVC),   vmrs),
+ mcCE(vmsr,	ee00a10, 2, (RVC, RR),        vmsr),
+#undef THUMB_VARIANT
 
   /* Moves and type conversions.  */
  cCE("fmstat",	ef1fa10, 0, (),		      noargs),
- cCE("vmrs",	ef00a10, 2, (APSR_RR, RVC),   vmrs),
- cCE("vmsr",	ee00a10, 2, (RVC, RR),        vmsr),
  cCE("fsitos",	eb80ac0, 2, (RVS, RVS),	      vfp_sp_monadic),
  cCE("fuitos",	eb80a40, 2, (RVS, RVS),	      vfp_sp_monadic),
  cCE("ftosis",	ebd0a40, 2, (RVS, RVS),	      vfp_sp_monadic),
@@ -23165,8 +25095,6 @@ static const struct asm_opcode insns[] =
  NCE(vcvtz,     0,       2, (RVSD, RVSD),       vfp_nsyn_cvtz),
 
   /* Mnemonics shared by Neon and VFP.  */
- nCEF(vmul,     _vmul,    3, (RNSDQ, oRNSDQ, RNSDQ_RNSC), neon_mul),
- nCEF(vmla,     _vmla,    3, (RNSDQ, oRNSDQ, RNSDQ_RNSC), neon_mac_maybe_scalar),
  nCEF(vmls,     _vmls,    3, (RNSDQ, oRNSDQ, RNSDQ_RNSC), neon_mac_maybe_scalar),
 
  NCE(vldm,      c900b00, 2, (RRnpctw, VRSDLST), neon_ldm_stm),
@@ -23204,8 +25132,8 @@ static const struct asm_opcode insns[] =
  NCE (vins,      eb00ac0,       2, (RVS, RVS), neon_movhf),
 
  /* New backported fma/fms instructions optional in v8.2.  */
- NCE (vfmal, 810, 3, (RNDQ, RNSD, RNSD_RNSC), neon_vfmal),
- NCE (vfmsl, 810, 3, (RNDQ, RNSD, RNSD_RNSC), neon_vfmsl),
+ NUF (vfmsl, 810, 3, (RNDQ, RNSD, RNSD_RNSC), neon_vfmsl),
+ NUF (vfmal, 810, 3, (RNDQ, RNSD, RNSD_RNSC), neon_vfmal),
 
 #undef  THUMB_VARIANT
 #define THUMB_VARIANT  & fpu_neon_ext_v1
@@ -23216,38 +25144,24 @@ static const struct asm_opcode insns[] =
   /* integer ops, valid types S8 S16 S32 U8 U16 U32.  */
  NUF(vaba,      0000710, 3, (RNDQ, RNDQ,  RNDQ), neon_dyadic_i_su),
  NUF(vabaq,     0000710, 3, (RNQ,  RNQ,   RNQ),  neon_dyadic_i_su),
- NUF(vhadd,     0000000, 3, (RNDQ, oRNDQ, RNDQ), neon_dyadic_i_su),
  NUF(vhaddq,    0000000, 3, (RNQ,  oRNQ,  RNQ),  neon_dyadic_i_su),
- NUF(vrhadd,    0000100, 3, (RNDQ, oRNDQ, RNDQ), neon_dyadic_i_su),
  NUF(vrhaddq,   0000100, 3, (RNQ,  oRNQ,  RNQ),  neon_dyadic_i_su),
- NUF(vhsub,     0000200, 3, (RNDQ, oRNDQ, RNDQ), neon_dyadic_i_su),
  NUF(vhsubq,    0000200, 3, (RNQ,  oRNQ,  RNQ),  neon_dyadic_i_su),
   /* integer ops, valid types S8 S16 S32 S64 U8 U16 U32 U64.  */
- NUF(vqadd,     0000010, 3, (RNDQ, oRNDQ, RNDQ), neon_dyadic_i64_su),
  NUF(vqaddq,    0000010, 3, (RNQ,  oRNQ,  RNQ),  neon_dyadic_i64_su),
- NUF(vqsub,     0000210, 3, (RNDQ, oRNDQ, RNDQ), neon_dyadic_i64_su),
  NUF(vqsubq,    0000210, 3, (RNQ,  oRNQ,  RNQ),  neon_dyadic_i64_su),
- NUF(vrshl,     0000500, 3, (RNDQ, oRNDQ, RNDQ), neon_rshl),
  NUF(vrshlq,    0000500, 3, (RNQ,  oRNQ,  RNQ),  neon_rshl),
- NUF(vqrshl,    0000510, 3, (RNDQ, oRNDQ, RNDQ), neon_rshl),
  NUF(vqrshlq,   0000510, 3, (RNQ,  oRNQ,  RNQ),  neon_rshl),
   /* If not immediate, fall back to neon_dyadic_i64_su.
-     shl_imm should accept I8 I16 I32 I64,
-     qshl_imm should accept S8 S16 S32 S64 U8 U16 U32 U64.  */
- nUF(vshl,      _vshl,    3, (RNDQ, oRNDQ, RNDQ_I63b), neon_shl_imm),
- nUF(vshlq,     _vshl,    3, (RNQ,  oRNQ,  RNDQ_I63b), neon_shl_imm),
- nUF(vqshl,     _vqshl,   3, (RNDQ, oRNDQ, RNDQ_I63b), neon_qshl_imm),
- nUF(vqshlq,    _vqshl,   3, (RNQ,  oRNQ,  RNDQ_I63b), neon_qshl_imm),
+     shl should accept I8 I16 I32 I64,
+     qshl should accept S8 S16 S32 S64 U8 U16 U32 U64.  */
+ nUF(vshlq,     _vshl,    3, (RNQ,  oRNQ,  RNDQ_I63b), neon_shl),
+ nUF(vqshlq,    _vqshl,   3, (RNQ,  oRNQ,  RNDQ_I63b), neon_qshl),
   /* Logic ops, types optional & ignored.  */
- nUF(vand,      _vand,    3, (RNDQ, oRNDQ, RNDQ_Ibig), neon_logic),
  nUF(vandq,     _vand,    3, (RNQ,  oRNQ,  RNDQ_Ibig), neon_logic),
- nUF(vbic,      _vbic,    3, (RNDQ, oRNDQ, RNDQ_Ibig), neon_logic),
  nUF(vbicq,     _vbic,    3, (RNQ,  oRNQ,  RNDQ_Ibig), neon_logic),
- nUF(vorr,      _vorr,    3, (RNDQ, oRNDQ, RNDQ_Ibig), neon_logic),
  nUF(vorrq,     _vorr,    3, (RNQ,  oRNQ,  RNDQ_Ibig), neon_logic),
- nUF(vorn,      _vorn,    3, (RNDQ, oRNDQ, RNDQ_Ibig), neon_logic),
  nUF(vornq,     _vorn,    3, (RNQ,  oRNQ,  RNDQ_Ibig), neon_logic),
- nUF(veor,      _veor,    3, (RNDQ, oRNDQ, RNDQ),      neon_logic),
  nUF(veorq,     _veor,    3, (RNQ,  oRNQ,  RNQ),       neon_logic),
   /* Bitfield ops, untyped.  */
  NUF(vbsl,      1100110, 3, (RNDQ, RNDQ, RNDQ), neon_bitfield),
@@ -23258,9 +25172,7 @@ static const struct asm_opcode insns[] =
  NUF(vbifq,     1300110, 3, (RNQ,  RNQ,  RNQ),  neon_bitfield),
   /* Int and float variants, types S8 S16 S32 U8 U16 U32 F16 F32.  */
  nUF(vabdq,     _vabd,    3, (RNQ,  oRNQ,  RNQ),  neon_dyadic_if_su),
- nUF(vmax,      _vmax,    3, (RNDQ, oRNDQ, RNDQ), neon_dyadic_if_su),
  nUF(vmaxq,     _vmax,    3, (RNQ,  oRNQ,  RNQ),  neon_dyadic_if_su),
- nUF(vmin,      _vmin,    3, (RNDQ, oRNDQ, RNDQ), neon_dyadic_if_su),
  nUF(vminq,     _vmin,    3, (RNQ,  oRNQ,  RNQ),  neon_dyadic_if_su),
   /* Comparisons. Types S8 S16 S32 U8 U16 U32 F32. Non-immediate versions fall
      back to neon_dyadic_if_su.  */
@@ -23291,9 +25203,7 @@ static const struct asm_opcode insns[] =
   /* VMUL takes I8 I16 I32 F32 P8.  */
  nUF(vmulq,     _vmul,     3, (RNQ,  oRNQ,  RNDQ_RNSC), neon_mul),
   /* VQD{R}MULH takes S16 S32.  */
- nUF(vqdmulh,   _vqdmulh,  3, (RNDQ, oRNDQ, RNDQ_RNSC), neon_qdmulh),
  nUF(vqdmulhq,  _vqdmulh,  3, (RNQ,  oRNQ,  RNDQ_RNSC), neon_qdmulh),
- nUF(vqrdmulh,  _vqrdmulh, 3, (RNDQ, oRNDQ, RNDQ_RNSC), neon_qdmulh),
  nUF(vqrdmulhq, _vqrdmulh, 3, (RNQ,  oRNQ,  RNDQ_RNSC), neon_qdmulh),
  NUF(vacge,     0000e10,  3, (RNDQ, oRNDQ, RNDQ), neon_fcmp_absolute),
  NUF(vacgeq,    0000e10,  3, (RNQ,  oRNQ,  RNQ),  neon_fcmp_absolute),
@@ -23308,7 +25218,6 @@ static const struct asm_opcode insns[] =
  NUF(vrsqrts,   0200f10,  3, (RNDQ, oRNDQ, RNDQ), neon_step),
  NUF(vrsqrtsq,  0200f10,  3, (RNQ,  oRNQ,  RNQ),  neon_step),
  /* ARM v8.1 extension.  */
- nUF (vqrdmlah,  _vqrdmlah, 3, (RNDQ, oRNDQ, RNDQ_RNSC), neon_qrdmlah),
  nUF (vqrdmlahq, _vqrdmlah, 3, (RNQ,  oRNQ,  RNDQ_RNSC), neon_qrdmlah),
  nUF (vqrdmlsh,  _vqrdmlsh, 3, (RNDQ, oRNDQ, RNDQ_RNSC), neon_qrdmlah),
  nUF (vqrdmlshq, _vqrdmlsh, 3, (RNQ,  oRNQ,  RNDQ_RNSC), neon_qrdmlah),
@@ -23320,21 +25229,16 @@ static const struct asm_opcode insns[] =
   /* Data processing with two registers and a shift amount.  */
   /* Right shifts, and variants with rounding.
      Types accepted S8 S16 S32 S64 U8 U16 U32 U64.  */
- NUF(vshr,      0800010, 3, (RNDQ, oRNDQ, I64z), neon_rshift_round_imm),
  NUF(vshrq,     0800010, 3, (RNQ,  oRNQ,  I64z), neon_rshift_round_imm),
- NUF(vrshr,     0800210, 3, (RNDQ, oRNDQ, I64z), neon_rshift_round_imm),
  NUF(vrshrq,    0800210, 3, (RNQ,  oRNQ,  I64z), neon_rshift_round_imm),
  NUF(vsra,      0800110, 3, (RNDQ, oRNDQ, I64),  neon_rshift_round_imm),
  NUF(vsraq,     0800110, 3, (RNQ,  oRNQ,  I64),  neon_rshift_round_imm),
  NUF(vrsra,     0800310, 3, (RNDQ, oRNDQ, I64),  neon_rshift_round_imm),
  NUF(vrsraq,    0800310, 3, (RNQ,  oRNQ,  I64),  neon_rshift_round_imm),
   /* Shift and insert. Sizes accepted 8 16 32 64.  */
- NUF(vsli,      1800510, 3, (RNDQ, oRNDQ, I63), neon_sli),
  NUF(vsliq,     1800510, 3, (RNQ,  oRNQ,  I63), neon_sli),
- NUF(vsri,      1800410, 3, (RNDQ, oRNDQ, I64), neon_sri),
  NUF(vsriq,     1800410, 3, (RNQ,  oRNQ,  I64), neon_sri),
   /* QSHL{U} immediate accepts S8 S16 S32 S64 U8 U16 U32 U64.  */
- NUF(vqshlu,    1800610, 3, (RNDQ, oRNDQ, I63), neon_qshlu_imm),
  NUF(vqshluq,   1800610, 3, (RNQ,  oRNQ,  I63), neon_qshlu_imm),
   /* Right shift immediate, saturating & narrowing, with rounding variants.
      Types accepted S16 S32 S64 U16 U32 U64.  */
@@ -23351,7 +25255,6 @@ static const struct asm_opcode insns[] =
   /* CVT with optional immediate for fixed-point variant.  */
  nUF(vcvtq,     _vcvt,    3, (RNQ, RNQ, oI32b), neon_cvt),
 
- nUF(vmvn,      _vmvn,    2, (RNDQ, RNDQ_Ibig), neon_mvn),
  nUF(vmvnq,     _vmvn,    2, (RNQ,  RNDQ_Ibig), neon_mvn),
 
   /* Data processing, three registers of different lengths.  */
@@ -23383,14 +25286,10 @@ static const struct asm_opcode insns[] =
 
   /* Two registers, miscellaneous.  */
   /* Reverse. Sizes 8 16 32 (must be < size in opcode).  */
- NUF(vrev64,    1b00000, 2, (RNDQ, RNDQ),     neon_rev),
  NUF(vrev64q,   1b00000, 2, (RNQ,  RNQ),      neon_rev),
- NUF(vrev32,    1b00080, 2, (RNDQ, RNDQ),     neon_rev),
  NUF(vrev32q,   1b00080, 2, (RNQ,  RNQ),      neon_rev),
- NUF(vrev16,    1b00100, 2, (RNDQ, RNDQ),     neon_rev),
  NUF(vrev16q,   1b00100, 2, (RNQ,  RNQ),      neon_rev),
   /* Vector replicate. Sizes 8 16 32.  */
- nCE(vdup,      _vdup,    2, (RNDQ, RR_RNSC),  neon_dup),
  nCE(vdupq,     _vdup,    2, (RNQ,  RR_RNSC),  neon_dup),
   /* VMOVL. Types S8 S16 S32 U8 U16 U32.  */
  NUF(vmovl,     0800a10, 2, (RNQ, RND),       neon_movl),
@@ -23406,9 +25305,7 @@ static const struct asm_opcode insns[] =
  NUF(vuzp,      1b20100, 2, (RNDQ, RNDQ),     neon_zip_uzp),
  NUF(vuzpq,     1b20100, 2, (RNQ,  RNQ),      neon_zip_uzp),
   /* VQABS / VQNEG. Types S8 S16 S32.  */
- NUF(vqabs,     1b00700, 2, (RNDQ, RNDQ),     neon_sat_abs_neg),
  NUF(vqabsq,    1b00700, 2, (RNQ,  RNQ),      neon_sat_abs_neg),
- NUF(vqneg,     1b00780, 2, (RNDQ, RNDQ),     neon_sat_abs_neg),
  NUF(vqnegq,    1b00780, 2, (RNQ,  RNQ),      neon_sat_abs_neg),
   /* Pairwise, lengthening. Types S8 S16 S32 U8 U16 U32.  */
  NUF(vpadal,    1b00600, 2, (RNDQ, RNDQ),     neon_pair_long),
@@ -23421,10 +25318,8 @@ static const struct asm_opcode insns[] =
  NUF(vrsqrte,   1b30480, 2, (RNDQ, RNDQ),     neon_recip_est),
  NUF(vrsqrteq,  1b30480, 2, (RNQ,  RNQ),      neon_recip_est),
   /* VCLS. Types S8 S16 S32.  */
- NUF(vcls,      1b00400, 2, (RNDQ, RNDQ),     neon_cls),
  NUF(vclsq,     1b00400, 2, (RNQ,  RNQ),      neon_cls),
   /* VCLZ. Types I8 I16 I32.  */
- NUF(vclz,      1b00480, 2, (RNDQ, RNDQ),     neon_clz),
  NUF(vclzq,     1b00480, 2, (RNQ,  RNQ),      neon_clz),
   /* VCNT. Size 8.  */
  NUF(vcnt,      1b00500, 2, (RNDQ, RNDQ),     neon_cnt),
@@ -23488,11 +25383,13 @@ static const struct asm_opcode insns[] =
 #define ARM_VARIANT    & fpu_vfp_ext_fma
 #undef  THUMB_VARIANT
 #define THUMB_VARIANT  & fpu_vfp_ext_fma
- /* Mnemonics shared by Neon and VFP.  These are included in the
+ /* Mnemonics shared by Neon, VFP, MVE and BF16.  These are included in the
     VFP FMA variant; NEON and VFP FMA always includes the NEON
     FMA instructions.  */
- nCEF(vfma,     _vfma,    3, (RNSDQ, oRNSDQ, RNSDQ), neon_fmac),
- nCEF(vfms,     _vfms,    3, (RNSDQ, oRNSDQ, RNSDQ), neon_fmac),
+ mnCEF(vfma,     _vfma,    3, (RNSDQMQ, oRNSDQMQ, RNSDQMQR), neon_fmac),
+ TUF ("vfmat",    c300850,    fc300850,  3, (RNSDQMQ, oRNSDQMQ, RNSDQ_RNSC_MQ_RR), mve_vfma, mve_vfma),
+ mnCEF(vfms,     _vfms,    3, (RNSDQMQ, oRNSDQMQ, RNSDQMQ),  neon_fmac),
+
  /* ffmas/ffmad/ffmss/ffmsd are dummy mnemonics to satisfy gas;
     the v form should always be used.  */
  cCE("ffmas",	ea00a00, 3, (RVS, RVS, RVS),  vfp_sp_dyadic),
@@ -23860,6 +25757,16 @@ static const struct asm_opcode insns[] =
  /* Armv8.1-M Mainline instructions.  */
 #undef  THUMB_VARIANT
 #define THUMB_VARIANT & arm_ext_v8_1m_main
+ toU("cinc",  _cinc,  3, (RRnpcsp, RR_ZR, COND),	t_cond),
+ toU("cinv",  _cinv,  3, (RRnpcsp, RR_ZR, COND),	t_cond),
+ toU("cneg",  _cneg,  3, (RRnpcsp, RR_ZR, COND),	t_cond),
+ toU("csel",  _csel,  4, (RRnpcsp, RR_ZR, RR_ZR, COND),	t_cond),
+ toU("csetm", _csetm, 2, (RRnpcsp, COND),		t_cond),
+ toU("cset",  _cset,  2, (RRnpcsp, COND),		t_cond),
+ toU("csinc", _csinc, 4, (RRnpcsp, RR_ZR, RR_ZR, COND),	t_cond),
+ toU("csinv", _csinv, 4, (RRnpcsp, RR_ZR, RR_ZR, COND),	t_cond),
+ toU("csneg", _csneg, 4, (RRnpcsp, RR_ZR, RR_ZR, COND),	t_cond),
+
  toC("bf",     _bf,	2, (EXPs, EXPs),	     t_branch_future),
  toU("bfcsel", _bfcsel,	4, (EXPs, EXPs, EXPs, COND), t_branch_future),
  toC("bfx",    _bfx,	2, (EXPs, RRnpcsp),	     t_branch_future),
@@ -23875,6 +25782,21 @@ static const struct asm_opcode insns[] =
 
 #undef  THUMB_VARIANT
 #define THUMB_VARIANT & mve_ext
+ ToC("lsll",	ea50010d, 3, (RRe, RRo, RRnpcsp_I32), mve_scalar_shift),
+ ToC("lsrl",	ea50011f, 3, (RRe, RRo, I32),	      mve_scalar_shift),
+ ToC("asrl",	ea50012d, 3, (RRe, RRo, RRnpcsp_I32), mve_scalar_shift),
+ ToC("uqrshll",	ea51010d, 4, (RRe, RRo, I48_I64, RRnpcsp), mve_scalar_shift1),
+ ToC("sqrshrl",	ea51012d, 4, (RRe, RRo, I48_I64, RRnpcsp), mve_scalar_shift1),
+ ToC("uqshll",	ea51010f, 3, (RRe, RRo, I32),	      mve_scalar_shift),
+ ToC("urshrl",	ea51011f, 3, (RRe, RRo, I32),	      mve_scalar_shift),
+ ToC("srshrl",	ea51012f, 3, (RRe, RRo, I32),	      mve_scalar_shift),
+ ToC("sqshll",	ea51013f, 3, (RRe, RRo, I32),	      mve_scalar_shift),
+ ToC("uqrshl",	ea500f0d, 2, (RRnpcsp, RRnpcsp),      mve_scalar_shift),
+ ToC("sqrshr",	ea500f2d, 2, (RRnpcsp, RRnpcsp),      mve_scalar_shift),
+ ToC("uqshl",	ea500f0f, 2, (RRnpcsp, I32),	      mve_scalar_shift),
+ ToC("urshr",	ea500f1f, 2, (RRnpcsp, I32),	      mve_scalar_shift),
+ ToC("srshr",	ea500f2f, 2, (RRnpcsp, I32),	      mve_scalar_shift),
+ ToC("sqshl",	ea500f3f, 2, (RRnpcsp, I32),	      mve_scalar_shift),
 
  ToC("vpt",	ee410f00, 3, (COND, RMQ, RMQRZ), mve_vpt),
  ToC("vptt",	ee018f00, 3, (COND, RMQ, RMQRZ), mve_vpt),
@@ -23909,6 +25831,12 @@ static const struct asm_opcode insns[] =
  ToC("vpsteee",	fe712f4d, 0, (), mve_vpt),
 
  /* MVE and MVE FP only.  */
+ mToC("vhcadd",	ee000f00,   4, (RMQ, RMQ, RMQ, EXPi),		  mve_vhcadd),
+ mCEF(vctp,	_vctp,      1, (RRnpc),				  mve_vctp),
+ mCEF(vadc,	_vadc,      3, (RMQ, RMQ, RMQ),			  mve_vadc),
+ mCEF(vadci,	_vadci,     3, (RMQ, RMQ, RMQ),			  mve_vadc),
+ mToC("vsbc",	fe300f00,   3, (RMQ, RMQ, RMQ),			  mve_vsbc),
+ mToC("vsbci",	fe301f00,   3, (RMQ, RMQ, RMQ),			  mve_vsbc),
  mCEF(vmullb,	_vmullb,    3, (RMQ, RMQ, RMQ),			  mve_vmull),
  mCEF(vabav,	_vabav,	    3, (RRnpcsp, RMQ, RMQ),		  mve_vabav),
  mCEF(vmladav,	  _vmladav,	3, (RRe, RMQ, RMQ),		mve_vmladav),
@@ -23945,11 +25873,106 @@ static const struct asm_opcode insns[] =
 
  mCEF(vmovnt,	_vmovnt,    2, (RMQ, RMQ),			  mve_movn),
  mCEF(vmovnb,	_vmovnb,    2, (RMQ, RMQ),			  mve_movn),
+ mCEF(vbrsr,	_vbrsr,     3, (RMQ, RMQ, RR),			  mve_vbrsr),
+ mCEF(vaddlv,	_vaddlv,    3, (RRe, RRo, RMQ),			  mve_vaddlv),
+ mCEF(vaddlva,	_vaddlva,   3, (RRe, RRo, RMQ),			  mve_vaddlv),
+ mCEF(vaddv,	_vaddv,	    2, (RRe, RMQ),			  mve_vaddv),
+ mCEF(vaddva,	_vaddva,    2, (RRe, RMQ),			  mve_vaddv),
+ mCEF(vddup,	_vddup,	    3, (RMQ, RRe, EXPi),		  mve_viddup),
+ mCEF(vdwdup,	_vdwdup,    4, (RMQ, RRe, RR, EXPi),		  mve_viddup),
+ mCEF(vidup,	_vidup,	    3, (RMQ, RRe, EXPi),		  mve_viddup),
+ mCEF(viwdup,	_viwdup,    4, (RMQ, RRe, RR, EXPi),		  mve_viddup),
+ mToC("vmaxa",	ee330e81,   2, (RMQ, RMQ),			  mve_vmaxa_vmina),
+ mToC("vmina",	ee331e81,   2, (RMQ, RMQ),			  mve_vmaxa_vmina),
+ mCEF(vmaxv,	_vmaxv,	  2, (RR, RMQ),				  mve_vmaxv),
+ mCEF(vmaxav,	_vmaxav,  2, (RR, RMQ),				  mve_vmaxv),
+ mCEF(vminv,	_vminv,	  2, (RR, RMQ),				  mve_vmaxv),
+ mCEF(vminav,	_vminav,  2, (RR, RMQ),				  mve_vmaxv),
+
+ mCEF(vmlaldav,	  _vmlaldav,	4, (RRe, RRo, RMQ, RMQ),	mve_vmlaldav),
+ mCEF(vmlaldava,  _vmlaldava,	4, (RRe, RRo, RMQ, RMQ),	mve_vmlaldav),
+ mCEF(vmlaldavx,  _vmlaldavx,	4, (RRe, RRo, RMQ, RMQ),	mve_vmlaldav),
+ mCEF(vmlaldavax, _vmlaldavax,	4, (RRe, RRo, RMQ, RMQ),	mve_vmlaldav),
+ mCEF(vmlalv,	  _vmlaldav,	4, (RRe, RRo, RMQ, RMQ),	mve_vmlaldav),
+ mCEF(vmlalva,	  _vmlaldava,	4, (RRe, RRo, RMQ, RMQ),	mve_vmlaldav),
+ mCEF(vmlsldav,	  _vmlsldav,	4, (RRe, RRo, RMQ, RMQ),	mve_vmlaldav),
+ mCEF(vmlsldava,  _vmlsldava,	4, (RRe, RRo, RMQ, RMQ),	mve_vmlaldav),
+ mCEF(vmlsldavx,  _vmlsldavx,	4, (RRe, RRo, RMQ, RMQ),	mve_vmlaldav),
+ mCEF(vmlsldavax, _vmlsldavax,	4, (RRe, RRo, RMQ, RMQ),	mve_vmlaldav),
+ mToC("vrmlaldavh", ee800f00,	   4, (RRe, RR, RMQ, RMQ),  mve_vrmlaldavh),
+ mToC("vrmlaldavha",ee800f20,	   4, (RRe, RR, RMQ, RMQ),  mve_vrmlaldavh),
+ mCEF(vrmlaldavhx,  _vrmlaldavhx,  4, (RRe, RR, RMQ, RMQ),  mve_vrmlaldavh),
+ mCEF(vrmlaldavhax, _vrmlaldavhax, 4, (RRe, RR, RMQ, RMQ),  mve_vrmlaldavh),
+ mToC("vrmlalvh",   ee800f00,	   4, (RRe, RR, RMQ, RMQ),  mve_vrmlaldavh),
+ mToC("vrmlalvha",  ee800f20,	   4, (RRe, RR, RMQ, RMQ),  mve_vrmlaldavh),
+ mCEF(vrmlsldavh,   _vrmlsldavh,   4, (RRe, RR, RMQ, RMQ),  mve_vrmlaldavh),
+ mCEF(vrmlsldavha,  _vrmlsldavha,  4, (RRe, RR, RMQ, RMQ),  mve_vrmlaldavh),
+ mCEF(vrmlsldavhx,  _vrmlsldavhx,  4, (RRe, RR, RMQ, RMQ),  mve_vrmlaldavh),
+ mCEF(vrmlsldavhax, _vrmlsldavhax, 4, (RRe, RR, RMQ, RMQ),  mve_vrmlaldavh),
+
+ mToC("vmlas",	  ee011e40,	3, (RMQ, RMQ, RR),		mve_vmlas),
+ mToC("vmulh",	  ee010e01,	3, (RMQ, RMQ, RMQ),		mve_vmulh),
+ mToC("vrmulh",	  ee011e01,	3, (RMQ, RMQ, RMQ),		mve_vmulh),
+ mToC("vpnot",	  fe310f4d,	0, (),				mve_vpnot),
+ mToC("vpsel",	  fe310f01,	3, (RMQ, RMQ, RMQ),		mve_vpsel),
+
+ mToC("vqdmladh",  ee000e00,	3, (RMQ, RMQ, RMQ),		mve_vqdmladh),
+ mToC("vqdmladhx", ee001e00,	3, (RMQ, RMQ, RMQ),		mve_vqdmladh),
+ mToC("vqrdmladh", ee000e01,	3, (RMQ, RMQ, RMQ),		mve_vqdmladh),
+ mToC("vqrdmladhx",ee001e01,	3, (RMQ, RMQ, RMQ),		mve_vqdmladh),
+ mToC("vqdmlsdh",  fe000e00,	3, (RMQ, RMQ, RMQ),		mve_vqdmladh),
+ mToC("vqdmlsdhx", fe001e00,	3, (RMQ, RMQ, RMQ),		mve_vqdmladh),
+ mToC("vqrdmlsdh", fe000e01,	3, (RMQ, RMQ, RMQ),		mve_vqdmladh),
+ mToC("vqrdmlsdhx",fe001e01,	3, (RMQ, RMQ, RMQ),		mve_vqdmladh),
+ mToC("vqdmlah",   ee000e60,	3, (RMQ, RMQ, RR),		mve_vqdmlah),
+ mToC("vqdmlash",  ee001e60,	3, (RMQ, RMQ, RR),		mve_vqdmlah),
+ mToC("vqrdmlash", ee001e40,	3, (RMQ, RMQ, RR),		mve_vqdmlah),
+ mToC("vqdmullt",  ee301f00,	3, (RMQ, RMQ, RMQRR),		mve_vqdmull),
+ mToC("vqdmullb",  ee300f00,	3, (RMQ, RMQ, RMQRR),		mve_vqdmull),
+ mCEF(vqmovnt,	  _vqmovnt,	2, (RMQ, RMQ),			mve_vqmovn),
+ mCEF(vqmovnb,	  _vqmovnb,	2, (RMQ, RMQ),			mve_vqmovn),
+ mCEF(vqmovunt,	  _vqmovunt,	2, (RMQ, RMQ),			mve_vqmovn),
+ mCEF(vqmovunb,	  _vqmovunb,	2, (RMQ, RMQ),			mve_vqmovn),
+
+ mCEF(vshrnt,	  _vshrnt,	3, (RMQ, RMQ, I32z),	mve_vshrn),
+ mCEF(vshrnb,	  _vshrnb,	3, (RMQ, RMQ, I32z),	mve_vshrn),
+ mCEF(vrshrnt,	  _vrshrnt,	3, (RMQ, RMQ, I32z),	mve_vshrn),
+ mCEF(vrshrnb,	  _vrshrnb,	3, (RMQ, RMQ, I32z),	mve_vshrn),
+ mCEF(vqshrnt,	  _vqrshrnt,	3, (RMQ, RMQ, I32z),	mve_vshrn),
+ mCEF(vqshrnb,	  _vqrshrnb,	3, (RMQ, RMQ, I32z),	mve_vshrn),
+ mCEF(vqshrunt,	  _vqrshrunt,	3, (RMQ, RMQ, I32z),	mve_vshrn),
+ mCEF(vqshrunb,	  _vqrshrunb,	3, (RMQ, RMQ, I32z),	mve_vshrn),
+ mCEF(vqrshrnt,	  _vqrshrnt,	3, (RMQ, RMQ, I32z),	mve_vshrn),
+ mCEF(vqrshrnb,	  _vqrshrnb,	3, (RMQ, RMQ, I32z),	mve_vshrn),
+ mCEF(vqrshrunt,  _vqrshrunt,	3, (RMQ, RMQ, I32z),	mve_vshrn),
+ mCEF(vqrshrunb,  _vqrshrunb,	3, (RMQ, RMQ, I32z),	mve_vshrn),
+
+ mToC("vshlc",	    eea00fc0,	   3, (RMQ, RR, I32z),	    mve_vshlc),
+ mToC("vshllt",	    ee201e00,	   3, (RMQ, RMQ, I32),	    mve_vshll),
+ mToC("vshllb",	    ee200e00,	   3, (RMQ, RMQ, I32),	    mve_vshll),
+
+ toU("dlstp",	_dlstp, 2, (LR, RR),      t_loloop),
+ toU("wlstp",	_wlstp, 3, (LR, RR, EXP), t_loloop),
+ toU("letp",	_letp,  2, (LR, EXP),	  t_loloop),
+ toU("lctp",	_lctp,  0, (),		  t_loloop),
+
+#undef THUMB_VARIANT
+#define THUMB_VARIANT & mve_fp_ext
+ mToC("vcmul", ee300e00,   4, (RMQ, RMQ, RMQ, EXPi),		  mve_vcmul),
+ mToC("vfmas", ee311e40,   3, (RMQ, RMQ, RR),			  mve_vfmas),
+ mToC("vmaxnma", ee3f0e81, 2, (RMQ, RMQ),			  mve_vmaxnma_vminnma),
+ mToC("vminnma", ee3f1e81, 2, (RMQ, RMQ),			  mve_vmaxnma_vminnma),
+ mToC("vmaxnmv", eeee0f00, 2, (RR, RMQ),			  mve_vmaxnmv),
+ mToC("vmaxnmav",eeec0f00, 2, (RR, RMQ),			  mve_vmaxnmv),
+ mToC("vminnmv", eeee0f80, 2, (RR, RMQ),			  mve_vmaxnmv),
+ mToC("vminnmav",eeec0f80, 2, (RR, RMQ),			  mve_vmaxnmv),
 
 #undef  ARM_VARIANT
 #define ARM_VARIANT  & fpu_vfp_ext_v1
 #undef  THUMB_VARIANT
 #define THUMB_VARIANT  & arm_ext_v6t2
+ mnCEF(vmla,     _vmla,    3, (RNSDQMQ, oRNSDQMQ, RNSDQ_RNSC_MQ_RR), neon_mac_maybe_scalar),
+ mnCEF(vmul,     _vmul,    3, (RNSDQMQ, oRNSDQMQ, RNSDQ_RNSC_MQ_RR), neon_mul),
 
  mcCE(fcpyd,	eb00b40, 2, (RVD, RVD),	      vfp_dp_rd_rm),
 
@@ -23988,13 +26011,73 @@ static const struct asm_opcode insns[] =
  mnUF(vcvtp,  _vcvta,  2, (RNSDQMQ, oRNSDQMQ),		neon_cvtp),
  mnUF(vcvtn,  _vcvta,  3, (RNSDQMQ, oRNSDQMQ, oI32z),	neon_cvtn),
  mnUF(vcvtm,  _vcvta,  2, (RNSDQMQ, oRNSDQMQ),		neon_cvtm),
+ mnUF(vmaxnm, _vmaxnm, 3, (RNSDQMQ, oRNSDQMQ, RNSDQMQ),	vmaxnm),
+ mnUF(vminnm, _vminnm, 3, (RNSDQMQ, oRNSDQMQ, RNSDQMQ),	vmaxnm),
 
 #undef	ARM_VARIANT
 #define ARM_VARIANT & fpu_neon_ext_v1
- mnUF(vabd,      _vabd,    3, (RNDQMQ, oRNDQMQ, RNDQMQ), neon_dyadic_if_su),
+ mnUF(vabd,      _vabd,		  3, (RNDQMQ, oRNDQMQ, RNDQMQ), neon_dyadic_if_su),
  mnUF(vabdl,     _vabdl,	  3, (RNQMQ, RNDMQ, RNDMQ),   neon_dyadic_long),
  mnUF(vaddl,     _vaddl,	  3, (RNQMQ, RNDMQ, RNDMQR),  neon_dyadic_long),
  mnUF(vsubl,     _vsubl,	  3, (RNQMQ, RNDMQ, RNDMQR),  neon_dyadic_long),
+ mnUF(vand,      _vand,		  3, (RNDQMQ, oRNDQMQ, RNDQMQ_Ibig), neon_logic),
+ mnUF(vbic,      _vbic,		  3, (RNDQMQ, oRNDQMQ, RNDQMQ_Ibig), neon_logic),
+ mnUF(vorr,      _vorr,		  3, (RNDQMQ, oRNDQMQ, RNDQMQ_Ibig), neon_logic),
+ mnUF(vorn,      _vorn,		  3, (RNDQMQ, oRNDQMQ, RNDQMQ_Ibig), neon_logic),
+ mnUF(veor,      _veor,		  3, (RNDQMQ, oRNDQMQ, RNDQMQ),      neon_logic),
+ MNUF(vcls,      1b00400,	  2, (RNDQMQ, RNDQMQ),		     neon_cls),
+ MNUF(vclz,      1b00480,	  2, (RNDQMQ, RNDQMQ),		     neon_clz),
+ mnCE(vdup,      _vdup,		  2, (RNDQMQ, RR_RNSC),		     neon_dup),
+ MNUF(vhadd,     00000000,	  3, (RNDQMQ, oRNDQMQ, RNDQMQR),  neon_dyadic_i_su),
+ MNUF(vrhadd,    00000100,	  3, (RNDQMQ, oRNDQMQ, RNDQMQ),	  neon_dyadic_i_su),
+ MNUF(vhsub,     00000200,	  3, (RNDQMQ, oRNDQMQ, RNDQMQR),  neon_dyadic_i_su),
+ mnUF(vmin,      _vmin,    3, (RNDQMQ, oRNDQMQ, RNDQMQ), neon_dyadic_if_su),
+ mnUF(vmax,      _vmax,    3, (RNDQMQ, oRNDQMQ, RNDQMQ), neon_dyadic_if_su),
+ MNUF(vqadd,     0000010,  3, (RNDQMQ, oRNDQMQ, RNDQMQR), neon_dyadic_i64_su),
+ MNUF(vqsub,     0000210,  3, (RNDQMQ, oRNDQMQ, RNDQMQR), neon_dyadic_i64_su),
+ mnUF(vmvn,      _vmvn,    2, (RNDQMQ, RNDQMQ_Ibig), neon_mvn),
+ MNUF(vqabs,     1b00700,  2, (RNDQMQ, RNDQMQ),     neon_sat_abs_neg),
+ MNUF(vqneg,     1b00780,  2, (RNDQMQ, RNDQMQ),     neon_sat_abs_neg),
+ mnUF(vqrdmlah,  _vqrdmlah,3, (RNDQMQ, oRNDQMQ, RNDQ_RNSC_RR), neon_qrdmlah),
+ mnUF(vqdmulh,   _vqdmulh, 3, (RNDQMQ, oRNDQMQ, RNDQMQ_RNSC_RR), neon_qdmulh),
+ mnUF(vqrdmulh,  _vqrdmulh,3, (RNDQMQ, oRNDQMQ, RNDQMQ_RNSC_RR), neon_qdmulh),
+ MNUF(vqrshl,    0000510,  3, (RNDQMQ, oRNDQMQ, RNDQMQR), neon_rshl),
+ MNUF(vrshl,     0000500,  3, (RNDQMQ, oRNDQMQ, RNDQMQR), neon_rshl),
+ MNUF(vshr,      0800010,  3, (RNDQMQ, oRNDQMQ, I64z), neon_rshift_round_imm),
+ MNUF(vrshr,     0800210,  3, (RNDQMQ, oRNDQMQ, I64z), neon_rshift_round_imm),
+ MNUF(vsli,      1800510,  3, (RNDQMQ, oRNDQMQ, I63),  neon_sli),
+ MNUF(vsri,      1800410,  3, (RNDQMQ, oRNDQMQ, I64z), neon_sri),
+ MNUF(vrev64,    1b00000,  2, (RNDQMQ, RNDQMQ),     neon_rev),
+ MNUF(vrev32,    1b00080,  2, (RNDQMQ, RNDQMQ),     neon_rev),
+ MNUF(vrev16,    1b00100,  2, (RNDQMQ, RNDQMQ),     neon_rev),
+ mnUF(vshl,	 _vshl,    3, (RNDQMQ, oRNDQMQ, RNDQMQ_I63b_RR), neon_shl),
+ mnUF(vqshl,     _vqshl,   3, (RNDQMQ, oRNDQMQ, RNDQMQ_I63b_RR), neon_qshl),
+ MNUF(vqshlu,    1800610,  3, (RNDQMQ, oRNDQMQ, I63),		 neon_qshlu_imm),
+
+#undef	ARM_VARIANT
+#define ARM_VARIANT & arm_ext_v8_3
+#undef	THUMB_VARIANT
+#define	THUMB_VARIANT & arm_ext_v6t2_v8m
+ MNUF (vcadd, 0, 4, (RNDQMQ, RNDQMQ, RNDQMQ, EXPi), vcadd),
+ MNUF (vcmla, 0, 4, (RNDQMQ, RNDQMQ, RNDQMQ_RNSC, EXPi), vcmla),
+
+#undef	ARM_VARIANT
+#define ARM_VARIANT &arm_ext_bf16
+#undef	THUMB_VARIANT
+#define	THUMB_VARIANT &arm_ext_bf16
+ TUF ("vdot", c000d00, fc000d00, 3, (RNDQ, RNDQ, RNDQ_RNSC), vdot, vdot),
+ TUF ("vmmla", c000c40, fc000c40, 3, (RNQ, RNQ, RNQ), vmmla, vmmla),
+ TUF ("vfmab", c300810, fc300810, 3, (RNDQ, RNDQ, RNDQ_RNSC), bfloat_vfma, bfloat_vfma),
+
+#undef	ARM_VARIANT
+#define ARM_VARIANT &arm_ext_i8mm
+#undef	THUMB_VARIANT
+#define	THUMB_VARIANT &arm_ext_i8mm
+ TUF ("vsmmla", c200c40, fc200c40, 3, (RNQ, RNQ, RNQ), vsmmla, vsmmla),
+ TUF ("vummla", c200c50, fc200c50, 3, (RNQ, RNQ, RNQ), vummla, vummla),
+ TUF ("vusmmla", ca00c40, fca00c40, 3, (RNQ, RNQ, RNQ), vsmmla, vsmmla),
+ TUF ("vusdot", c800d00, fc800d00, 3, (RNDQ, RNDQ, RNDQ_RNSC), vusdot, vusdot),
+ TUF ("vsudot", c800d10, fc800d10, 3, (RNDQ, RNDQ, RNSC), vsudot, vsudot),
 };
 #undef ARM_VARIANT
 #undef THUMB_VARIANT
@@ -24724,7 +26807,7 @@ arm_init_frag (fragS * fragP, int max_chars)
 
   /* PR 21809: Do not set a mapping state for debug sections
      - it just confuses other tools.  */
-  if (bfd_get_section_flags (NULL, now_seg) & SEC_DEBUGGING)
+  if (bfd_section_flags (now_seg) & SEC_DEBUGGING)
     return;
 
   frag_thumb_mode = fragP->tc_frag_data.thumb_mode ^ MODE_RECORDED;
@@ -26147,11 +28230,12 @@ md_apply_fix (fixS *	fixP,
       break;
 
     case BFD_RELOC_ARM_SMC:
-      if (((unsigned long) value) > 0xffff)
+      if (((unsigned long) value) > 0xf)
 	as_bad_where (fixP->fx_file, fixP->fx_line,
 		      _("invalid smc expression"));
+
       newval = md_chars_to_number (buf, INSN_SIZE);
-      newval |= (value & 0xf) | ((value & 0xfff0) << 4);
+      newval |= (value & 0xf);
       md_number_to_chars (buf, newval, INSN_SIZE);
       break;
 
@@ -26320,7 +28404,7 @@ md_apply_fix (fixS *	fixP,
       break;
 
     case BFD_RELOC_THUMB_PCREL_BRANCH9: /* Conditional branch.	*/
-      if ((value & ~0xff) && ((value & ~0xff) != ~0xff))
+      if (out_of_range_p (value, 8))
 	as_bad_where (fixP->fx_file, fixP->fx_line, BAD_RANGE);
 
       if (fixP->fx_done || !seg->use_rela_p)
@@ -26332,7 +28416,7 @@ md_apply_fix (fixS *	fixP,
       break;
 
     case BFD_RELOC_THUMB_PCREL_BRANCH12: /* Unconditional branch.  */
-      if ((value & ~0x7ff) && ((value & ~0x7ff) != ~0x7ff))
+      if (out_of_range_p (value, 11))
 	as_bad_where (fixP->fx_file, fixP->fx_line, BAD_RANGE);
 
       if (fixP->fx_done || !seg->use_rela_p)
@@ -26343,6 +28427,7 @@ md_apply_fix (fixS *	fixP,
 	}
       break;
 
+    /* This relocation is misnamed, it should be BRANCH21.  */
     case BFD_RELOC_THUMB_PCREL_BRANCH20:
       if (fixP->fx_addsy
 	  && (S_GET_SEGMENT (fixP->fx_addsy) == seg)
@@ -26353,7 +28438,7 @@ md_apply_fix (fixS *	fixP,
 	  /* Force a relocation for a branch 20 bits wide.  */
 	  fixP->fx_done = 0;
 	}
-      if ((value & ~0x1fffff) && ((value & ~0x0fffff) != ~0x0fffff))
+      if (out_of_range_p (value, 20))
 	as_bad_where (fixP->fx_file, fixP->fx_line,
 		      _("conditional branch out of range"));
 
@@ -26432,12 +28517,11 @@ md_apply_fix (fixS *	fixP,
 	 fixP->fx_r_type = BFD_RELOC_THUMB_PCREL_BRANCH23;
 #endif
 
-      if ((value & ~0x3fffff) && ((value & ~0x3fffff) != ~0x3fffff))
+      if (out_of_range_p (value, 22))
 	{
 	  if (!(ARM_CPU_HAS_FEATURE (cpu_variant, arm_ext_v6t2)))
 	    as_bad_where (fixP->fx_file, fixP->fx_line, BAD_RANGE);
-	  else if ((value & ~0x1ffffff)
-		   && ((value & ~0x1ffffff) != ~0x1ffffff))
+	  else if (out_of_range_p (value, 24))
 	    as_bad_where (fixP->fx_file, fixP->fx_line,
 			  _("Thumb2 branch out of range"));
 	}
@@ -26448,7 +28532,7 @@ md_apply_fix (fixS *	fixP,
       break;
 
     case BFD_RELOC_THUMB_PCREL_BRANCH25:
-      if ((value & ~0x0ffffff) && ((value & ~0x0ffffff) != ~0x0ffffff))
+      if (out_of_range_p (value, 24))
 	as_bad_where (fixP->fx_file, fixP->fx_line, BAD_RANGE);
 
       if (fixP->fx_done || !seg->use_rela_p)
@@ -27292,9 +29376,10 @@ md_apply_fix (fixS *	fixP,
 	}
 
       bfd_vma insn = get_thumb32_insn (buf);
-      /* le lr, <label> or le <label> */
+      /* le lr, <label>, le <label> or letp lr, <label> */
       if (((insn & 0xffffffff) == 0xf00fc001)
-	  || ((insn & 0xffffffff) == 0xf02fc001))
+	  || ((insn & 0xffffffff) == 0xf02fc001)
+	  || ((insn & 0xffffffff) == 0xf01fc001))
 	value = -value;
 
       if (v8_1_branch_value_check (value, 12, FALSE) == FAIL)
@@ -28250,9 +30335,8 @@ md_begin (void)
 
 	if (sec != NULL)
 	  {
-	    bfd_set_section_flags
-	      (stdoutput, sec, SEC_READONLY | SEC_DEBUGGING /* | SEC_HAS_CONTENTS */);
-	    bfd_set_section_size (stdoutput, sec, 0);
+	    bfd_set_section_flags (sec, SEC_READONLY | SEC_DEBUGGING);
+	    bfd_set_section_size (sec, 0);
 	    bfd_set_section_contents (stdoutput, sec, NULL, 0, 0);
 	  }
       }
@@ -28871,6 +30955,12 @@ static const struct arm_cpu_option_table arm_cpus[] =
   ARM_CPU_OPT ("cortex-a76",    "Cortex-A76",	       ARM_ARCH_V8_2A,
 	       ARM_FEATURE_CORE_HIGH (ARM_EXT2_FP16_INST),
 	       FPU_ARCH_CRYPTO_NEON_VFP_ARMV8_DOTPROD),
+  ARM_CPU_OPT ("cortex-a76ae",    "Cortex-A76AE",      ARM_ARCH_V8_2A,
+	       ARM_FEATURE_CORE_HIGH (ARM_EXT2_FP16_INST),
+	       FPU_ARCH_CRYPTO_NEON_VFP_ARMV8_DOTPROD),
+  ARM_CPU_OPT ("cortex-a77",    "Cortex-A77",	       ARM_ARCH_V8_2A,
+	       ARM_FEATURE_CORE_HIGH (ARM_EXT2_FP16_INST),
+	       FPU_ARCH_CRYPTO_NEON_VFP_ARMV8_DOTPROD),
   ARM_CPU_OPT ("ares",    "Ares",	       ARM_ARCH_V8_2A,
 	       ARM_FEATURE_CORE_HIGH (ARM_EXT2_FP16_INST),
 	       FPU_ARCH_CRYPTO_NEON_VFP_ARMV8_DOTPROD),
@@ -28892,6 +30982,9 @@ static const struct arm_cpu_option_table arm_cpus[] =
   ARM_CPU_OPT ("cortex-r52",	  "Cortex-R52",	       ARM_ARCH_V8R,
 	      ARM_FEATURE_COPROC (CRC_EXT_ARMV8),
 	      FPU_ARCH_NEON_VFP_ARMV8),
+  ARM_CPU_OPT ("cortex-m35p",	  "Cortex-M35P",       ARM_ARCH_V8M_MAIN,
+	       ARM_FEATURE_CORE_LOW (ARM_EXT_V5ExP | ARM_EXT_V6_DSP),
+	       FPU_NONE),
   ARM_CPU_OPT ("cortex-m33",	  "Cortex-M33",	       ARM_ARCH_V8M_MAIN,
 	       ARM_FEATURE_CORE_LOW (ARM_EXT_V5ExP | ARM_EXT_V6_DSP),
 	       FPU_NONE),
@@ -29109,6 +31202,8 @@ static const struct arm_ext_table armv82a_ext_table[] =
   ARM_ADD ("simd", FPU_ARCH_NEON_VFP_ARMV8_1),
   ARM_ADD ("fp16", FPU_ARCH_NEON_VFP_ARMV8_2_FP16),
   ARM_ADD ("fp16fml", FPU_ARCH_NEON_VFP_ARMV8_2_FP16FML),
+  ARM_ADD ("bf16", ARM_FEATURE_CORE_HIGH (ARM_EXT2_BF16)),
+  ARM_ADD ("i8mm", ARM_FEATURE_CORE_HIGH (ARM_EXT2_I8MM)),
   ARM_EXT ("crypto", FPU_ARCH_CRYPTO_NEON_VFP_ARMV8_1,
 	   ARM_FEATURE_COPROC (FPU_CRYPTO_ARMV8)),
   ARM_ADD ("dotprod", FPU_ARCH_DOTPROD_NEON_VFP_ARMV8),
@@ -29125,6 +31220,8 @@ static const struct arm_ext_table armv84a_ext_table[] =
 {
   ARM_ADD ("simd", FPU_ARCH_DOTPROD_NEON_VFP_ARMV8),
   ARM_ADD ("fp16", FPU_ARCH_NEON_VFP_ARMV8_4_FP16FML),
+  ARM_ADD ("bf16", ARM_FEATURE_CORE_HIGH (ARM_EXT2_BF16)),
+  ARM_ADD ("i8mm", ARM_FEATURE_CORE_HIGH (ARM_EXT2_I8MM)),
   ARM_EXT ("crypto", FPU_ARCH_CRYPTO_NEON_VFP_ARMV8_4,
 	   ARM_FEATURE_COPROC (FPU_CRYPTO_ARMV8)),
 
@@ -29140,6 +31237,8 @@ static const struct arm_ext_table armv85a_ext_table[] =
 {
   ARM_ADD ("simd", FPU_ARCH_DOTPROD_NEON_VFP_ARMV8),
   ARM_ADD ("fp16", FPU_ARCH_NEON_VFP_ARMV8_4_FP16FML),
+  ARM_ADD ("bf16", ARM_FEATURE_CORE_HIGH (ARM_EXT2_BF16)),
+  ARM_ADD ("i8mm", ARM_FEATURE_CORE_HIGH (ARM_EXT2_I8MM)),
   ARM_EXT ("crypto", FPU_ARCH_CRYPTO_NEON_VFP_ARMV8_4,
 	   ARM_FEATURE_COPROC (FPU_CRYPTO_ARMV8)),
 
@@ -29149,6 +31248,12 @@ static const struct arm_ext_table armv85a_ext_table[] =
   { NULL, 0, ARM_ARCH_NONE, ARM_ARCH_NONE }
 };
 
+static const struct arm_ext_table armv86a_ext_table[] =
+{
+  ARM_ADD ("i8mm", ARM_FEATURE_CORE_HIGH (ARM_EXT2_I8MM)),
+  { NULL, 0, ARM_ARCH_NONE, ARM_ARCH_NONE }
+};
+
 static const struct arm_ext_table armv8m_main_ext_table[] =
 {
   ARM_EXT ("dsp", ARM_FEATURE_CORE_LOW (ARM_EXT_V5ExP | ARM_EXT_V6_DSP),
@@ -29254,6 +31359,7 @@ static const struct arm_arch_option_table arm_archs[] =
   ARM_ARCH_OPT2 ("armv8-r",	  ARM_ARCH_V8R,		FPU_ARCH_VFP, armv8r),
   ARM_ARCH_OPT2 ("armv8.4-a",	  ARM_ARCH_V8_4A,	FPU_ARCH_VFP, armv84a),
   ARM_ARCH_OPT2 ("armv8.5-a",	  ARM_ARCH_V8_5A,	FPU_ARCH_VFP, armv85a),
+  ARM_ARCH_OPT2 ("armv8.6-a",	  ARM_ARCH_V8_6A,	FPU_ARCH_VFP, armv86a),
   ARM_ARCH_OPT ("xscale",	  ARM_ARCH_XSCALE,	FPU_ARCH_VFP),
   ARM_ARCH_OPT ("iwmmxt",	  ARM_ARCH_IWMMXT,	FPU_ARCH_VFP),
   ARM_ARCH_OPT ("iwmmxt2",	  ARM_ARCH_IWMMXT2,	FPU_ARCH_VFP),
@@ -29620,6 +31726,22 @@ arm_parse_extension (const char *str, const arm_feature_set *opt_set,
   return TRUE;
 }
 
+static bfd_boolean
+arm_parse_fp16_opt (const char *str)
+{
+  if (strcasecmp (str, "ieee") == 0)
+    fp16_format = ARM_FP16_FORMAT_IEEE;
+  else if (strcasecmp (str, "alternative") == 0)
+    fp16_format = ARM_FP16_FORMAT_ALTERNATIVE;
+  else
+    {
+      as_bad (_("unrecognised float16 format \"%s\""), str);
+      return FALSE;
+    }
+
+  return TRUE;
+}
+
 static bfd_boolean
 arm_parse_cpu (const char *str)
 {
@@ -29699,6 +31821,7 @@ arm_parse_arch (const char *str)
 	  march_ext_opt = XNEW (arm_feature_set);
 	*march_ext_opt = arm_arch_none;
 	march_fpu_opt = &opt->default_fpu;
+	selected_ctx_ext_table = opt->ext_table;
 	strcpy (selected_cpu_name, opt->name);
 
 	if (ext != NULL)
@@ -29811,6 +31934,12 @@ struct arm_long_option_table arm_long_opts[] =
    arm_parse_it_mode, NULL},
   {"mccs", N_("\t\t\t  TI CodeComposer Studio syntax compatibility mode"),
    arm_ccs_mode, NULL},
+  {"mfp16-format=",
+   N_("[ieee|alternative]\n\
+                          set the encoding for half precision floating point "
+			  "numbers to IEEE\n\
+                          or Arm alternative format."),
+   arm_parse_fp16_opt, NULL },
   {NULL, NULL, 0, NULL}
 };
 
@@ -30012,7 +32141,8 @@ static const cpu_arch_ver_table cpu_arch_ver[] =
     {TAG_CPU_ARCH_V8,	      ARM_ARCH_V8_4A},
     {TAG_CPU_ARCH_V8,	      ARM_ARCH_V8_5A},
     {TAG_CPU_ARCH_V8_1M_MAIN, ARM_ARCH_V8_1M_MAIN},
-    {-1,		      ARM_ARCH_NONE}
+    {TAG_CPU_ARCH_V8,	    ARM_ARCH_V8_6A},
+    {-1,		    ARM_ARCH_NONE}
 };
 
 /* Set an attribute if it has not already been set by the user.  */
@@ -30392,6 +32522,9 @@ aeabi_set_public_attributes (void)
     virt_sec |= 2;
   if (virt_sec != 0)
     aeabi_set_attribute_int (Tag_Virtualization_use, virt_sec);
+
+  if (fp16_format != ARM_FP16_FORMAT_DEFAULT)
+    aeabi_set_attribute_int (Tag_ABI_FP_16bit_format, fp16_format);
 }
 
 /* Post relaxation hook.  Recompute ARM attributes now that relaxation is
@@ -30549,6 +32682,35 @@ s_arm_arch_extension (int ignored ATTRIBUTE_UNUSED)
       name += 2;
     }
 
+  /* Check the context specific extension table */
+  if (selected_ctx_ext_table)
+    {
+      const struct arm_ext_table * ext_opt;
+      for (ext_opt = selected_ctx_ext_table; ext_opt->name != NULL; ext_opt++)
+        {
+          if (streq (ext_opt->name, name))
+	    {
+	      if (adding_value)
+		{
+		  if (ARM_FEATURE_ZERO (ext_opt->merge))
+		    /* TODO: Option not supported.  When we remove the
+		    legacy table this case should error out.  */
+		    continue;
+		  ARM_MERGE_FEATURE_SETS (selected_ext, selected_ext,
+					  ext_opt->merge);
+		}
+	      else
+		ARM_CLEAR_FEATURE (selected_ext, selected_ext, ext_opt->clear);
+
+	      ARM_MERGE_FEATURE_SETS (selected_cpu, selected_arch, selected_ext);
+	      ARM_MERGE_FEATURE_SETS (cpu_variant, selected_cpu, selected_fpu);
+	      *input_line_pointer = saved_char;
+	      demand_empty_rest_of_line ();
+	      return;
+	    }
+	}
+    }
+
   for (opt = arm_extensions; opt->name != NULL; opt++)
     if (streq (opt->name, name))
       {