Port gas/config/* to str_htab.

[deliverable/binutils-gdb.git] / gas / config / tc-arm.c
diff --git a/gas/config/tc-arm.c b/gas/config/tc-arm.c

index 717f1b1a58a0e343a6625d41b33310f2d87991e5..3b17e20d4d397fe3085bd7aa9ff5a318a9fa3f15 100644 (file)
--- a/gas/config/tc-arm.c
+++ b/gas/config/tc-arm.c
@@ -1,5 +1,5 @@
  /* tc-arm.c -- Assemble for the ARM
-   Copyright (C) 1994-2019 Free Software Foundation, Inc.
+   Copyright (C) 1994-2020 Free Software Foundation, Inc.
     Contributed by Richard Earnshaw (rwe@pegasus.esprit.ec.org)
         Modified by David Taylor (dtaylor@armltd.co.uk)
         Cirrus coprocessor mods by Aldy Hernandez (aldyh@redhat.com)
@@ -32,6 +32,7 @@
  #include "obstack.h"
  #include "libiberty.h"
  #include "opcode/arm.h"
+#include "cpu-arm.h"
  
  #ifdef OBJ_ELF
  #include "elf/arm.h"
@@ -106,6 +107,15 @@ enum arm_float_abi
     should define CPU_DEFAULT here.  */
  #endif
  
+/* Return nonzero if bits BITS and up of VALUE are neither all clear nor all
+   set.  The mask is built in valueT so that BITS >= 31 shifts correctly.  */
+static bfd_boolean out_of_range_p (offsetT value, offsetT bits)
+{
+  gas_assert (bits >= 0 && bits < (offsetT)(sizeof (value) * 8));
+  return (value & ~(((valueT) 1 << bits) - 1)) != 0
+         && (value & ~(((valueT) 1 << bits) - 1)) != ~(((valueT) 1 << bits) - 1);
+}
+
  #ifndef FPU_DEFAULT
  # ifdef TE_LINUX
  #  define FPU_DEFAULT FPU_ARCH_FPA
@@ -144,6 +154,7 @@ static int pic_code      = FALSE;
  static int fix_v4bx         = FALSE;
  /* Warn on using deprecated features.  */
  static int warn_on_deprecated = TRUE;
+static int warn_on_restrict_it = FALSE;
  
  /* Understand CodeComposer Studio assembly syntax.  */
  bfd_boolean codecomposer_syntax = FALSE;
@@ -219,6 +230,7 @@ static const arm_feature_set arm_ext_div = ARM_FEATURE_CORE_LOW (ARM_EXT_DIV);
  static const arm_feature_set arm_ext_v7 = ARM_FEATURE_CORE_LOW (ARM_EXT_V7);
  static const arm_feature_set arm_ext_v7a = ARM_FEATURE_CORE_LOW (ARM_EXT_V7A);
  static const arm_feature_set arm_ext_v7r = ARM_FEATURE_CORE_LOW (ARM_EXT_V7R);
+static const arm_feature_set arm_ext_v8r = ARM_FEATURE_CORE_HIGH (ARM_EXT2_V8R);
  #ifdef OBJ_ELF
  static const arm_feature_set ATTRIBUTE_UNUSED arm_ext_v7m = ARM_FEATURE_CORE_LOW (ARM_EXT_V7M);
  #endif
@@ -265,11 +277,33 @@ static const arm_feature_set arm_ext_sb =
    ARM_FEATURE_CORE_HIGH (ARM_EXT2_SB);
  static const arm_feature_set arm_ext_predres =
    ARM_FEATURE_CORE_HIGH (ARM_EXT2_PREDRES);
+static const arm_feature_set arm_ext_bf16 =
+  ARM_FEATURE_CORE_HIGH (ARM_EXT2_BF16);
+static const arm_feature_set arm_ext_i8mm =
+  ARM_FEATURE_CORE_HIGH (ARM_EXT2_I8MM);
+static const arm_feature_set arm_ext_crc =
+  ARM_FEATURE_CORE_HIGH (ARM_EXT2_CRC);
+static const arm_feature_set arm_ext_cde =
+  ARM_FEATURE_CORE_HIGH (ARM_EXT2_CDE);
+static const arm_feature_set arm_ext_cde0 =
+  ARM_FEATURE_CORE_HIGH (ARM_EXT2_CDE0);
+static const arm_feature_set arm_ext_cde1 =
+  ARM_FEATURE_CORE_HIGH (ARM_EXT2_CDE1);
+static const arm_feature_set arm_ext_cde2 =
+  ARM_FEATURE_CORE_HIGH (ARM_EXT2_CDE2);
+static const arm_feature_set arm_ext_cde3 =
+  ARM_FEATURE_CORE_HIGH (ARM_EXT2_CDE3);
+static const arm_feature_set arm_ext_cde4 =
+  ARM_FEATURE_CORE_HIGH (ARM_EXT2_CDE4);
+static const arm_feature_set arm_ext_cde5 =
+  ARM_FEATURE_CORE_HIGH (ARM_EXT2_CDE5);
+static const arm_feature_set arm_ext_cde6 =
+  ARM_FEATURE_CORE_HIGH (ARM_EXT2_CDE6);
+static const arm_feature_set arm_ext_cde7 =
+  ARM_FEATURE_CORE_HIGH (ARM_EXT2_CDE7);
  
  static const arm_feature_set arm_arch_any = ARM_ANY;
-#ifdef OBJ_ELF
  static const arm_feature_set fpu_any = FPU_ANY;
-#endif
  static const arm_feature_set arm_arch_full ATTRIBUTE_UNUSED = ARM_FEATURE (-1, -1, -1);
  static const arm_feature_set arm_arch_t2 = ARM_ARCH_THUMB2;
  static const arm_feature_set arm_arch_none = ARM_ARCH_NONE;
@@ -303,9 +337,14 @@ static const arm_feature_set fpu_neon_ext_v1 =
  static const arm_feature_set fpu_vfp_v3_or_neon_ext =
    ARM_FEATURE_COPROC (FPU_NEON_EXT_V1 | FPU_VFP_EXT_V3);
  static const arm_feature_set mve_ext =
-  ARM_FEATURE_COPROC (FPU_MVE);
+  ARM_FEATURE_CORE_HIGH (ARM_EXT2_MVE);
  static const arm_feature_set mve_fp_ext =
-  ARM_FEATURE_COPROC (FPU_MVE_FP);
+  ARM_FEATURE_CORE_HIGH (ARM_EXT2_MVE_FP);
+/* Note: This has more than one bit set, which means using it with
+   mark_feature_used (which returns if *any* of the bits are set in the current
+   cpu variant) can give surprising results.  */
+static const arm_feature_set armv8m_fp =
+  ARM_FEATURE_COPROC (FPU_VFP_V5_SP_D16);
  #ifdef OBJ_ELF
  static const arm_feature_set fpu_vfp_fp16 =
    ARM_FEATURE_COPROC (FPU_VFP_EXT_FP16);
@@ -322,8 +361,6 @@ static const arm_feature_set fpu_neon_ext_armv8 =
    ARM_FEATURE_COPROC (FPU_NEON_EXT_ARMV8);
  static const arm_feature_set fpu_crypto_ext_armv8 =
    ARM_FEATURE_COPROC (FPU_CRYPTO_EXT_ARMV8);
-static const arm_feature_set crc_ext_armv8 =
-  ARM_FEATURE_COPROC (CRC_EXT_ARMV8);
  static const arm_feature_set fpu_neon_ext_v8_1 =
    ARM_FEATURE_COPROC (FPU_NEON_EXT_RDMA);
  static const arm_feature_set fpu_neon_ext_dotprod =
@@ -345,6 +382,7 @@ static arm_feature_set selected_fpu = FPU_NONE;
  /* Feature bits selected by the last .object_arch directive.  */
  static arm_feature_set selected_object_arch = ARM_ARCH_NONE;
  /* Must be long enough to hold any of the names in arm_cpus.  */
+static const struct arm_ext_table * selected_ctx_ext_table = NULL;
  static char selected_cpu_name[20];
  
  extern FLONUM_TYPE generic_floating_point_number;
@@ -436,6 +474,7 @@ enum neon_el_type
    NT_float,
    NT_poly,
    NT_signed,
+  NT_bfloat,
    NT_unsigned
  };
  
@@ -445,7 +484,7 @@ struct neon_type_el
    unsigned size;
  };
  
-#define NEON_MAX_TYPE_ELS 4
+#define NEON_MAX_TYPE_ELS 5
  
  struct neon_type
  {
@@ -467,7 +506,7 @@ enum pred_instruction_type
     VPT_INSN,              /* The VPT/VPST insn has been parsed.  */
     MVE_OUTSIDE_PRED_INSN , /* Instruction to indicate a MVE instruction without
                               a predication code.  */
-   MVE_UNPREDICABLE_INSN   /* MVE instruction that is non-predicable.  */
+   MVE_UNPREDICABLE_INSN,  /* MVE instruction that is non-predicable.  */
  };
  
  /* The maximum number of operands we need.  */
@@ -510,7 +549,10 @@ struct arm_it
      unsigned isreg     : 1;  /* Operand was a register.  */
      unsigned immisreg  : 2;  /* .imm field is a second register.
                                  0: imm, 1: gpr, 2: MVE Q-register.  */
-    unsigned isscalar   : 1;  /* Operand is a (Neon) scalar.  */
+    unsigned isscalar   : 2;  /* Operand is a (SIMD) scalar:
+                                0) not scalar,
+                                1) Neon scalar,
+                                2) MVE scalar.  */
      unsigned immisalign : 1;  /* Immediate is an alignment specifier.  */
      unsigned immisfloat : 1;  /* Immediate was parsed as a float.  */
      /* Note: we abuse "regisimm" to mean "is Neon register" in VMOV
@@ -519,6 +561,7 @@ struct arm_it
      unsigned isvec      : 1;  /* Is a single, double or quad VFP/Neon reg.  */
      unsigned isquad     : 1;  /* Operand is SIMD quad register.  */
      unsigned issingle   : 1;  /* Operand is VFP single-precision register.  */
+    unsigned iszr      : 1;  /* Operand is ZR register.  */
      unsigned hasreloc  : 1;  /* Operand has relocation suffix.  */
      unsigned writeback : 1;  /* Operand has trailing !  */
      unsigned preind    : 1;  /* Preindexed address.  */
@@ -643,6 +686,7 @@ enum arm_reg_type
    REG_TYPE_MMXWCG,
    REG_TYPE_XSCALE,
    REG_TYPE_RNB,
+  REG_TYPE_ZR
  };
  
  /* Structure for a hash table entry for a register.
@@ -685,7 +729,7 @@ const char * const reg_expected_msgs[] =
    [REG_TYPE_MMXWCG] = N_("iWMMXt scalar register expected"),
    [REG_TYPE_XSCALE] = N_("XScale accumulator register expected"),
    [REG_TYPE_MQ]            = N_("MVE vector register expected"),
-  [REG_TYPE_RNB]    = N_("")
+  [REG_TYPE_RNB]    = ""
  };
  
  /* Some well known registers that we refer to directly elsewhere.  */
@@ -862,6 +906,7 @@ struct asm_opcode
  #define BAD_ADDR_MODE   _("instruction does not accept this addressing mode")
  #define BAD_BRANCH     _("branch must be last instruction in IT block")
  #define BAD_BRANCH_OFF _("branch out of range or not a multiple of 2")
+#define BAD_NO_VPT     _("instruction not allowed in VPT block")
  #define BAD_NOT_IT     _("instruction not allowed in IT block")
  #define BAD_NOT_VPT    _("instruction missing MVE vector predication code")
  #define BAD_FPU                _("selected FPU does not support instruction")
@@ -878,6 +923,9 @@ struct asm_opcode
         _("cannot use writeback with PC-relative addressing")
  #define BAD_RANGE      _("branch out of range")
  #define BAD_FP16       _("selected processor does not support fp16 instruction")
+#define BAD_BF16       _("selected processor does not support bf16 instruction")
+#define BAD_CDE        _("selected processor does not support cde instruction")
+#define BAD_CDE_COPROC _("coprocessor for insn is not enabled for cde")
  #define UNPRED_REG(R)  _("using " R " results in unpredictable behaviour")
  #define THUMB1_RELOC_ONLY  _("relocation valid in thumb1 code only")
  #define MVE_NOT_IT     _("Warning: instruction is UNPREDICTABLE in an IT " \
@@ -895,16 +943,17 @@ struct asm_opcode
  #define BAD_MVE_SRCDEST        _("Warning: 32-bit element size and same destination "\
                           "and source operands makes instruction UNPREDICTABLE")
  #define BAD_EL_TYPE    _("bad element type for instruction")
-
-static struct hash_control * arm_ops_hsh;
-static struct hash_control * arm_cond_hsh;
-static struct hash_control * arm_vcond_hsh;
-static struct hash_control * arm_shift_hsh;
-static struct hash_control * arm_psr_hsh;
-static struct hash_control * arm_v7m_psr_hsh;
-static struct hash_control * arm_reg_hsh;
-static struct hash_control * arm_reloc_hsh;
-static struct hash_control * arm_barrier_opt_hsh;
+#define MVE_BAD_QREG   _("MVE vector register Q[0..7] expected")
+
+static htab_t  arm_ops_hsh;
+static htab_t  arm_cond_hsh;        /* struct asm_cond; see parse_cond.  */
+static htab_t  arm_vcond_hsh;
+static htab_t  arm_shift_hsh;       /* struct asm_shift_name; see parse_shift.  */
+static htab_t  arm_psr_hsh;         /* struct asm_psr; see parse_psr.  */
+static htab_t  arm_v7m_psr_hsh;     /* struct asm_psr; see parse_psr.  */
+static htab_t  arm_reg_hsh;         /* struct reg_entry, keyed by name.  */
+static htab_t  arm_reloc_hsh;       /* struct reloc_entry; see parse_reloc.  */
+static htab_t  arm_barrier_opt_hsh; /* struct asm_barrier_opt; see parse_barrier.  */
  
  /* Stuff needed to resolve the label ambiguity
     As:
@@ -1003,6 +1052,9 @@ static void it_fsm_post_encode (void);
      }                                                  \
    while (0)
  
+/* Yield VALUE with bit POS inverted; VALUE itself is not modified.  */
+#define TOGGLE_BIT(value, pos) ((value) ^ (1 << (pos)))
+
  /* Pure syntax.         */
  
  /* This array holds the chars that always start a comment.  If the
@@ -1028,7 +1080,7 @@ const char EXP_CHARS[] = "eE";
  /* As in 0f12.456  */
  /* or   0d1.2345e12  */
  
-const char FLT_CHARS[] = "rRsSfFdDxXeEpP";
+const char FLT_CHARS[] = "rRsSfFdDxXeEpPHh";
  
  /* Prefix characters that indicate the start of an immediate
     value.  */
@@ -1038,6 +1090,16 @@ const char FLT_CHARS[] = "rRsSfFdDxXeEpP";
  
  #define skip_whitespace(str)  do { if (*(str) == ' ') ++(str); } while (0)
  
+enum fp_16bit_format  /* Values held by fp16_format.  */
+{
+  ARM_FP16_FORMAT_IEEE         = 0x1,  /* Chosen by the name "ieee".  */
+  ARM_FP16_FORMAT_ALTERNATIVE  = 0x2,  /* Chosen by the name "alternative".  */
+  ARM_FP16_FORMAT_DEFAULT      = 0x3   /* Nothing has been chosen yet.  */
+};
+
+static enum fp_16bit_format fp16_format = ARM_FP16_FORMAT_DEFAULT;
+
+
  static inline int
  skip_past_char (char ** str, char c)
  {
@@ -1179,6 +1241,57 @@ md_atof (int type, char * litP, int * sizeP)
  
    switch (type)
      {
+    case 'H':
+    case 'h':
+      prec = 1;
+      break;
+
+    /* If this is a bfloat16, then parse it slightly differently, as it
+       does not follow the IEEE specification for floating point numbers
+       exactly.  */
+    case 'b':
+      {
+       FLONUM_TYPE generic_float;
+
+       t = atof_ieee_detail (input_line_pointer, 1, 8, words, &generic_float);
+
+       if (t)
+         input_line_pointer = t;
+       else
+         return _("invalid floating point number");
+
+       switch (generic_float.sign)
+         {
+         /* Is +Inf.  */
+         case 'P':
+           words[0] = 0x7f80;
+           break;
+
+         /* Is -Inf.  */
+         case 'N':
+           words[0] = 0xff80;
+           break;
+
+         /* Is NaN.  */
+         /* bfloat16 has two types of NaN - quiet and signalling.
+            A quiet NaN has bit[6] == 1 && fraction != 0, whereas a
+            signalling NaN has bit[6] == 0 && fraction != 0.
+            This specific encoding was chosen as it is the same form
+            as used by other IEEE 754 encodings in GAS.  */
+         case 0:
+           words[0] = 0x7fff;
+           break;
+
+         default:
+           break;
+         }
+
+       *sizeP = 2;
+
+       md_number_to_chars (litP, (valueT) words[0], sizeof (LITTLENUM_TYPE));
+
+       return NULL;
+      }
      case 'f':
      case 'F':
      case 's':
@@ -1213,34 +1326,29 @@ md_atof (int type, char * litP, int * sizeP)
      input_line_pointer = t;
    *sizeP = prec * sizeof (LITTLENUM_TYPE);
  
-  if (target_big_endian)
-    {
-      for (i = 0; i < prec; i++)
-       {
-         md_number_to_chars (litP, (valueT) words[i], sizeof (LITTLENUM_TYPE));
-         litP += sizeof (LITTLENUM_TYPE);
-       }
-    }
+  if (target_big_endian || prec == 1)
+    for (i = 0; i < prec; i++)
+      {
+       md_number_to_chars (litP, (valueT) words[i], sizeof (LITTLENUM_TYPE));
+       litP += sizeof (LITTLENUM_TYPE);
+      }
+  else if (ARM_CPU_HAS_FEATURE (cpu_variant, fpu_endian_pure))
+    for (i = prec - 1; i >= 0; i--)
+      {
+       md_number_to_chars (litP, (valueT) words[i], sizeof (LITTLENUM_TYPE));
+       litP += sizeof (LITTLENUM_TYPE);
+      }
    else
-    {
-      if (ARM_CPU_HAS_FEATURE (cpu_variant, fpu_endian_pure))
-       for (i = prec - 1; i >= 0; i--)
-         {
-           md_number_to_chars (litP, (valueT) words[i], sizeof (LITTLENUM_TYPE));
-           litP += sizeof (LITTLENUM_TYPE);
-         }
-      else
-       /* For a 4 byte float the order of elements in `words' is 1 0.
-          For an 8 byte float the order is 1 0 3 2.  */
-       for (i = 0; i < prec; i += 2)
-         {
-           md_number_to_chars (litP, (valueT) words[i + 1],
-                               sizeof (LITTLENUM_TYPE));
-           md_number_to_chars (litP + sizeof (LITTLENUM_TYPE),
-                               (valueT) words[i], sizeof (LITTLENUM_TYPE));
-           litP += 2 * sizeof (LITTLENUM_TYPE);
-         }
-    }
+    /* For a 4 byte float the order of elements in `words' is 1 0.
+       For an 8 byte float the order is 1 0 3 2.  */
+    for (i = 0; i < prec; i += 2)
+      {
+       md_number_to_chars (litP, (valueT) words[i + 1],
+                           sizeof (LITTLENUM_TYPE));
+       md_number_to_chars (litP + sizeof (LITTLENUM_TYPE),
+                           (valueT) words[i], sizeof (LITTLENUM_TYPE));
+       litP += 2 * sizeof (LITTLENUM_TYPE);
+      }
  
    return NULL;
  }
@@ -1319,7 +1427,7 @@ arm_reg_parse_multi (char **ccp)
      p++;
    while (ISALPHA (*p) || ISDIGIT (*p) || *p == '_');
  
-  reg = (struct reg_entry *) hash_find_n (arm_reg_hsh, start, p - start);
+  reg = (struct reg_entry *) str_hash_find_n (arm_reg_hsh, start, p - start);
  
    if (!reg)
      return NULL;
@@ -1439,6 +1547,28 @@ parse_neon_type (struct neon_type *type, char **str)
           thissize = 64;
           ptr++;
           goto done;
+       case 'b':
+         thistype = NT_bfloat;
+         switch (TOLOWER (*(++ptr)))
+           {
+           case 'f':
+             ptr += 1;
+             thissize = strtoul (ptr, &ptr, 10);
+             if (thissize != 16)
+               {
+                 as_bad (_("bad size %d in type specifier"), thissize);
+                 return FAIL;
+               }
+             goto done;
+           case '0': case '1': case '2': case '3': case '4':
+           case '5': case '6': case '7': case '8': case '9':
+           case ' ': case '.':
+             as_bad (_("unexpected type character `b' -- did you mean `bf'?"));
+             return FAIL;
+           default:
+             break;
+           }
+         break;
         default:
           as_bad (_("unexpected character `%c' in type specifier"), *ptr);
           return FAIL;
@@ -1653,9 +1783,14 @@ parse_typed_reg_or_scalar (char **ccp, enum arm_reg_type type,
      {
        if (type != REG_TYPE_VFD
           && !(type == REG_TYPE_VFS
-              && ARM_CPU_HAS_FEATURE (cpu_variant, arm_ext_v8_2)))
+              && ARM_CPU_HAS_FEATURE (cpu_variant, arm_ext_v8_2))
+         && !(type == REG_TYPE_NQ
+              && ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext)))
         {
-         first_error (_("only D registers may be indexed"));
+         if (ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext))
+           first_error (_("only D and Q registers may be indexed"));
+         else
+           first_error (_("only D registers may be indexed"));
           return FAIL;
         }
  
@@ -1744,27 +1879,41 @@ arm_typed_reg_parse (char **ccp, enum arm_reg_type type,
     just do easy checks here, and do further checks later.  */
  
  static int
-parse_scalar (char **ccp, int elsize, struct neon_type_el *type)
+parse_scalar (char **ccp, int elsize, struct neon_type_el *type, enum
+             arm_reg_type reg_type)
  {
    int reg;
    char *str = *ccp;
    struct neon_typed_alias atype;
-  enum arm_reg_type reg_type = REG_TYPE_VFD;
-
-  if (elsize == 4)
-    reg_type = REG_TYPE_VFS;
+  unsigned reg_size;
  
    reg = parse_typed_reg_or_scalar (&str, reg_type, NULL, &atype);
  
+  switch (reg_type)
+    {
+    case REG_TYPE_VFS:
+      reg_size = 32;
+      break;
+    case REG_TYPE_VFD:
+      reg_size = 64;
+      break;
+    case REG_TYPE_MQ:
+      reg_size = 128;
+      break;
+    default:
+      gas_assert (0);
+      return FAIL;
+    }
+
    if (reg == FAIL || (atype.defined & NTA_HASINDEX) == 0)
      return FAIL;
  
-  if (atype.index == NEON_ALL_LANES)
+  if (reg_type != REG_TYPE_MQ && atype.index == NEON_ALL_LANES)
      {
        first_error (_("scalar must have an index"));
        return FAIL;
      }
-  else if (atype.index >= 64 / elsize)
+  else if (atype.index >= reg_size / elsize)
      {
        first_error (_("scalar index out of range"));
        return FAIL;
@@ -1821,7 +1970,7 @@ parse_reg_list (char ** strp, enum reg_list_els etype)
               const char apsr_str[] = "apsr";
               int apsr_str_len = strlen (apsr_str);
  
-             reg = arm_reg_parse (&str, REGLIST_RN);
+             reg = arm_reg_parse (&str, REG_TYPE_RN);
               if (etype == REGLIST_CLRM)
                 {
                   if (reg == REG_SP || reg == REG_PC)
@@ -2397,7 +2546,7 @@ parse_reloc (char **str)
      return -1;
  
    if ((r = (struct reloc_entry *)
-       hash_find_n (arm_reloc_hsh, p, q - p)) == NULL)
+       str_hash_find_n (arm_reloc_hsh, p, q - p)) == NULL)
      return -1;
  
    *str = q + 1;
@@ -2412,7 +2561,7 @@ insert_reg_alias (char *str, unsigned number, int type)
    struct reg_entry *new_reg;
    const char *name;
  
-  if ((new_reg = (struct reg_entry *) hash_find (arm_reg_hsh, str)) != 0)
+  if ((new_reg = (struct reg_entry *) str_hash_find (arm_reg_hsh, str)) != 0)
      {
        if (new_reg->builtin)
         as_warn (_("ignoring attempt to redefine built-in register '%s'"), str);
@@ -2434,8 +2583,7 @@ insert_reg_alias (char *str, unsigned number, int type)
    new_reg->builtin = FALSE;
    new_reg->neon = NULL;
  
-  if (hash_insert (arm_reg_hsh, name, (void *) new_reg))
-    abort ();
+  str_hash_insert (arm_reg_hsh, name, new_reg);
  
    return new_reg;
  }
@@ -2483,7 +2631,7 @@ create_register_alias (char * newname, char *p)
    if (*oldname == '\0')
      return FALSE;
  
-  old = (struct reg_entry *) hash_find (arm_reg_hsh, oldname);
+  old = (struct reg_entry *) str_hash_find (arm_reg_hsh, oldname);
    if (!old)
      {
        as_warn (_("unknown register '%s' -- .req ignored"), oldname);
@@ -2735,7 +2883,7 @@ s_unreq (int a ATTRIBUTE_UNUSED)
      as_bad (_("invalid syntax for .unreq directive"));
    else
      {
-      struct reg_entry *reg = (struct reg_entry *) hash_find (arm_reg_hsh,
+      struct reg_entry *reg = (struct reg_entry *) str_hash_find (arm_reg_hsh,
                                                               name);
  
        if (!reg)
@@ -2748,10 +2896,9 @@ s_unreq (int a ATTRIBUTE_UNUSED)
           char * p;
           char * nbuf;
  
-         hash_delete (arm_reg_hsh, name, FALSE);
+         str_hash_delete (arm_reg_hsh, name);
           free ((char *) reg->name);
-         if (reg->neon)
-           free (reg->neon);
+         free (reg->neon);
           free (reg);
  
           /* Also locate the all upper case and all lower case versions.
@@ -2761,25 +2908,23 @@ s_unreq (int a ATTRIBUTE_UNUSED)
           nbuf = strdup (name);
           for (p = nbuf; *p; p++)
             *p = TOUPPER (*p);
-         reg = (struct reg_entry *) hash_find (arm_reg_hsh, nbuf);
+         reg = (struct reg_entry *) str_hash_find (arm_reg_hsh, nbuf);
           if (reg)
             {
-             hash_delete (arm_reg_hsh, nbuf, FALSE);
+             str_hash_delete (arm_reg_hsh, nbuf);
               free ((char *) reg->name);
-             if (reg->neon)
-               free (reg->neon);
+             free (reg->neon);
               free (reg);
             }
  
           for (p = nbuf; *p; p++)
             *p = TOLOWER (*p);
-         reg = (struct reg_entry *) hash_find (arm_reg_hsh, nbuf);
+         reg = (struct reg_entry *) str_hash_find (arm_reg_hsh, nbuf);
           if (reg)
             {
-             hash_delete (arm_reg_hsh, nbuf, FALSE);
+             str_hash_delete (arm_reg_hsh, nbuf);
               free ((char *) reg->name);
-             if (reg->neon)
-               free (reg->neon);
+             free (reg->neon);
               free (reg);
             }
  
@@ -4498,7 +4643,7 @@ s_arm_unwind_save_mmxwr (void)
      }
  
    return;
-error:
+ error:
    ignore_rest_of_line ();
  }
  
@@ -4566,7 +4711,7 @@ s_arm_unwind_save_mmxwcg (void)
    op = 0xc700 | mask;
    add_unwind_opcode (op, 2);
    return;
-error:
+ error:
    ignore_rest_of_line ();
  }
  
@@ -4897,6 +5042,55 @@ pe_directive_secrel (int dummy ATTRIBUTE_UNUSED)
  }
  #endif /* TE_PE */
  
+int
+arm_is_largest_exponent_ok (int precision)
+{
+  /* A precision of 1 denotes a 16 bit float; only those, and only in the
+     alternative float16 format, get a nonzero answer.  */
+  return fp16_format == ARM_FP16_FORMAT_ALTERNATIVE && precision == 1;
+}
+
+/* Handle the .float16_format directive.  Reads one whitespace-delimited
+   word from the input line; "ieee" and "alternative" (compared without
+   regard to case) select the matching fp_16bit_format, anything else
+   is reported with as_bad.  The first accepted setting is kept; a
+   later, different one only produces a warning.  The rest of the line
+   is always discarded.  */
+static void
+set_fp16_format (int dummy ATTRIBUTE_UNUSED)
+{
+  char *word = input_line_pointer;
+  char *end = word;
+  char terminator;
+  enum fp_16bit_format requested = ARM_FP16_FORMAT_DEFAULT;
+
+  /* The word runs up to the next whitespace character or NUL.  */
+  while (*end && !ISSPACE (*end))
+    end++;
+  input_line_pointer = end;
+
+  /* Temporarily NUL-terminate the word so it can be compared.  */
+  terminator = *input_line_pointer;
+  *input_line_pointer = 0;
+
+  if (strcasecmp (word, "alternative") == 0)
+    requested = ARM_FP16_FORMAT_ALTERNATIVE;
+  else if (strcasecmp (word, "ieee") == 0)
+    requested = ARM_FP16_FORMAT_IEEE;
+
+  if (requested == ARM_FP16_FORMAT_DEFAULT)
+    as_bad (_("unrecognised float16 format \"%s\""), word);
+  /* fp16_format still at its default means nothing was set before.  */
+  else if (fp16_format == ARM_FP16_FORMAT_DEFAULT)
+    fp16_format = requested;
+  else if (requested != fp16_format)
+    as_warn (_("float16 format cannot be set more than once, ignoring."));
+
+  /* Put back the character that was overwritten above.  */
+  *input_line_pointer = terminator;
+  ignore_rest_of_line ();
+}
+
  /* This table describes all the machine specific pseudo-ops the assembler
     has to support.  The fields are:
       pseudo-op name without dot
@@ -4964,6 +5158,7 @@ const pseudo_typeS md_pseudo_table[] =
    { "extend",     float_cons, 'x' },
    { "ldouble",    float_cons, 'x' },
    { "packed",     float_cons, 'p' },
+  { "bfloat16",           float_cons, 'b' },
  #ifdef TE_PE
    {"secrel32", pe_directive_secrel, 0},
  #endif
@@ -4974,9 +5169,12 @@ const pseudo_typeS md_pseudo_table[] =
    {"asmfunc",      s_ccs_asmfunc,    0},
    {"endasmfunc",   s_ccs_endasmfunc, 0},
  
+  {"float16", float_cons, 'h' },
+  {"float16_format", set_fp16_format, 0 },
+
    { 0, 0, 0 }
  };
-\f
+
  /* Parser functions used exclusively in instruction operands.  */
  
  /* Generic immediate-value read function for use in insn parsing.
@@ -5338,7 +5536,7 @@ parse_shift (char **str, int i, enum parse_shift_mode mode)
        return FAIL;
      }
  
-  shift_name = (const struct asm_shift_name *) hash_find_n (arm_shift_hsh, *str,
+  shift_name = (const struct asm_shift_name *) str_hash_find_n (arm_shift_hsh, *str,
                                                             p - *str);
  
    if (shift_name == NULL)
@@ -6139,7 +6337,7 @@ parse_psr (char **str, bfd_boolean lhs)
           || strncasecmp (start, "psr", 3) == 0)
         p = start + strcspn (start, "rR") + 1;
  
-      psr = (const struct asm_psr *) hash_find_n (arm_v7m_psr_hsh, start,
+      psr = (const struct asm_psr *) str_hash_find_n (arm_v7m_psr_hsh, start,
                                                   p - start);
  
        if (!psr)
@@ -6165,7 +6363,7 @@ parse_psr (char **str, bfd_boolean lhs)
      goto unsupported_psr;
  
    p += 4;
-check_suffix:
+ check_suffix:
    if (*p == '_')
      {
        /* A suffix follows.  */
@@ -6242,7 +6440,7 @@ check_suffix:
         }
        else
         {
-         psr = (const struct asm_psr *) hash_find_n (arm_psr_hsh, start,
+         psr = (const struct asm_psr *) str_hash_find_n (arm_psr_hsh, start,
                                                       p - start);
           if (!psr)
             goto error;
@@ -6434,7 +6632,7 @@ parse_cond (char **str)
        n++;
      }
  
-  c = (const struct asm_cond *) hash_find_n (arm_cond_hsh, cond, n);
+  c = (const struct asm_cond *) str_hash_find_n (arm_cond_hsh, cond, n);
    if (!c)
      {
        inst.error = _("condition required");
@@ -6457,7 +6655,7 @@ parse_barrier (char **str)
    while (ISALPHA (*q))
      q++;
  
-  o = (const struct asm_barrier_opt *) hash_find_n (arm_barrier_opt_hsh, p,
+  o = (const struct asm_barrier_opt *) str_hash_find_n (arm_barrier_opt_hsh, p,
                                                     q - p);
    if (!o)
      return FAIL;
@@ -6539,7 +6737,61 @@ parse_neon_mov (char **str, int *which_operand)
    char *ptr = *str;
    struct neon_type_el optype;
  
-  if ((val = parse_scalar (&ptr, 8, &optype)) != FAIL)
+   if ((val = parse_scalar (&ptr, 8, &optype, REG_TYPE_MQ)) != FAIL)
+    {
+      /* Cases 17 or 19.  */
+      inst.operands[i].reg = val;
+      inst.operands[i].isvec = 1;
+      inst.operands[i].isscalar = 2;
+      inst.operands[i].vectype = optype;
+      inst.operands[i++].present = 1;
+
+      if (skip_past_comma (&ptr) == FAIL)
+       goto wanted_comma;
+
+      if ((val = arm_reg_parse (&ptr, REG_TYPE_RN)) != FAIL)
+       {
+         /* Case 17: VMOV<c>.<dt> <Qd[idx]>, <Rt>  */
+         inst.operands[i].reg = val;
+         inst.operands[i].isreg = 1;
+         inst.operands[i].present = 1;
+       }
+      else if ((val = parse_scalar (&ptr, 8, &optype, REG_TYPE_MQ)) != FAIL)
+       {
+         /* Case 19: VMOV<c> <Qd[idx]>, <Qd[idx2]>, <Rt>, <Rt2>  */
+         inst.operands[i].reg = val;
+         inst.operands[i].isvec = 1;
+         inst.operands[i].isscalar = 2;
+         inst.operands[i].vectype = optype;
+         inst.operands[i++].present = 1;
+
+         if (skip_past_comma (&ptr) == FAIL)
+           goto wanted_comma;
+
+         if ((val = arm_reg_parse (&ptr, REG_TYPE_RN)) == FAIL)
+           goto wanted_arm;
+
+         inst.operands[i].reg = val;
+         inst.operands[i].isreg = 1;
+         inst.operands[i++].present = 1;
+
+         if (skip_past_comma (&ptr) == FAIL)
+           goto wanted_comma;
+
+         if ((val = arm_reg_parse (&ptr, REG_TYPE_RN)) == FAIL)
+           goto wanted_arm;
+
+         inst.operands[i].reg = val;
+         inst.operands[i].isreg = 1;
+         inst.operands[i].present = 1;
+       }
+      else
+       {
+         first_error (_("expected ARM or MVE vector register"));
+         return FAIL;
+       }
+    }
+   else if ((val = parse_scalar (&ptr, 8, &optype, REG_TYPE_VFD)) != FAIL)
      {
        /* Case 4: VMOV<c><q>.<size> <Dn[x]>, <Rd>.  */
        inst.operands[i].reg = val;
@@ -6557,8 +6809,10 @@ parse_neon_mov (char **str, int *which_operand)
        inst.operands[i].isreg = 1;
        inst.operands[i].present = 1;
      }
-  else if ((val = arm_typed_reg_parse (&ptr, REG_TYPE_NSDQ, &rtype, &optype))
-          != FAIL)
+  else if (((val = arm_typed_reg_parse (&ptr, REG_TYPE_NSDQ, &rtype, &optype))
+           != FAIL)
+          || ((val = arm_typed_reg_parse (&ptr, REG_TYPE_MQ, &rtype, &optype))
+              != FAIL))
      {
        /* Cases 0, 1, 2, 3, 5 (D only).  */
        if (skip_past_comma (&ptr) == FAIL)
@@ -6597,8 +6851,10 @@ parse_neon_mov (char **str, int *which_operand)
               inst.operands[i].present = 1;
             }
         }
-      else if ((val = arm_typed_reg_parse (&ptr, REG_TYPE_NSDQ, &rtype,
-                                          &optype)) != FAIL)
+      else if (((val = arm_typed_reg_parse (&ptr, REG_TYPE_NSDQ, &rtype,
+               &optype)) != FAIL)
+              || ((val = arm_typed_reg_parse (&ptr, REG_TYPE_MQ, &rtype,
+                  &optype)) != FAIL))
         {
           /* Case 0: VMOV<c><q> <Qd>, <Qm>
              Case 1: VMOV<c><q> <Dd>, <Dm>
@@ -6655,7 +6911,7 @@ parse_neon_mov (char **str, int *which_operand)
      }
    else if ((val = arm_reg_parse (&ptr, REG_TYPE_RN)) != FAIL)
      {
-      /* Cases 6, 7.  */
+      /* Cases 6, 7, 16, 18.  */
        inst.operands[i].reg = val;
        inst.operands[i].isreg = 1;
        inst.operands[i++].present = 1;
@@ -6663,7 +6919,15 @@ parse_neon_mov (char **str, int *which_operand)
        if (skip_past_comma (&ptr) == FAIL)
         goto wanted_comma;
  
-      if ((val = parse_scalar (&ptr, 8, &optype)) != FAIL)
+      if ((val = parse_scalar (&ptr, 8, &optype, REG_TYPE_MQ)) != FAIL)
+       {
+         /* Case 18: VMOV<c>.<dt> <Rt>, <Qn[idx]>  */
+         inst.operands[i].reg = val;
+         inst.operands[i].isscalar = 2;
+         inst.operands[i].present = 1;
+         inst.operands[i].vectype = optype;
+       }
+      else if ((val = parse_scalar (&ptr, 8, &optype, REG_TYPE_VFD)) != FAIL)
         {
           /* Case 6: VMOV<c><q>.<dt> <Rd>, <Dn[x]>  */
           inst.operands[i].reg = val;
@@ -6673,7 +6937,6 @@ parse_neon_mov (char **str, int *which_operand)
         }
        else if ((val = arm_reg_parse (&ptr, REG_TYPE_RN)) != FAIL)
         {
-         /* Case 7: VMOV<c><q> <Rd>, <Rn>, <Dm>  */
           inst.operands[i].reg = val;
           inst.operands[i].isreg = 1;
           inst.operands[i++].present = 1;
@@ -6682,37 +6945,70 @@ parse_neon_mov (char **str, int *which_operand)
             goto wanted_comma;
  
           if ((val = arm_typed_reg_parse (&ptr, REG_TYPE_VFSD, &rtype, &optype))
-             == FAIL)
+             != FAIL)
             {
-             first_error (_(reg_expected_msgs[REG_TYPE_VFSD]));
-             return FAIL;
-           }
-
-         inst.operands[i].reg = val;
-         inst.operands[i].isreg = 1;
-         inst.operands[i].isvec = 1;
-         inst.operands[i].issingle = (rtype == REG_TYPE_VFS);
-         inst.operands[i].vectype = optype;
-         inst.operands[i].present = 1;
+             /* Case 7: VMOV<c><q> <Rd>, <Rn>, <Dm>  */
  
-         if (rtype == REG_TYPE_VFS)
-           {
-             /* Case 14.  */
-             i++;
-             if (skip_past_comma (&ptr) == FAIL)
-               goto wanted_comma;
-             if ((val = arm_typed_reg_parse (&ptr, REG_TYPE_VFS, NULL,
-                                             &optype)) == FAIL)
-               {
-                 first_error (_(reg_expected_msgs[REG_TYPE_VFS]));
-                 return FAIL;
-               }
               inst.operands[i].reg = val;
               inst.operands[i].isreg = 1;
               inst.operands[i].isvec = 1;
-             inst.operands[i].issingle = 1;
+             inst.operands[i].issingle = (rtype == REG_TYPE_VFS);
               inst.operands[i].vectype = optype;
               inst.operands[i].present = 1;
+
+             if (rtype == REG_TYPE_VFS)
+               {
+                 /* Case 14.  */
+                 i++;
+                 if (skip_past_comma (&ptr) == FAIL)
+                   goto wanted_comma;
+                 if ((val = arm_typed_reg_parse (&ptr, REG_TYPE_VFS, NULL,
+                                                 &optype)) == FAIL)
+                   {
+                     first_error (_(reg_expected_msgs[REG_TYPE_VFS]));
+                     return FAIL;
+                   }
+                 inst.operands[i].reg = val;
+                 inst.operands[i].isreg = 1;
+                 inst.operands[i].isvec = 1;
+                 inst.operands[i].issingle = 1;
+                 inst.operands[i].vectype = optype;
+                 inst.operands[i].present = 1;
+               }
+           }
+         else
+           {
+             if ((val = parse_scalar (&ptr, 8, &optype, REG_TYPE_MQ))
+                      != FAIL)
+               {
+                 /* Case 16: VMOV<c> <Rt>, <Rt2>, <Qd[idx]>, <Qd[idx2]>  */
+                 inst.operands[i].reg = val;
+                 inst.operands[i].isvec = 1;
+                 inst.operands[i].isscalar = 2;
+                 inst.operands[i].vectype = optype;
+                 inst.operands[i++].present = 1;
+
+                 if (skip_past_comma (&ptr) == FAIL)
+                   goto wanted_comma;
+
+                 if ((val = parse_scalar (&ptr, 8, &optype, REG_TYPE_MQ))
+                     == FAIL)
+                   {
+                     first_error (_(reg_expected_msgs[REG_TYPE_MQ]));
+                     return FAIL;
+                   }
+                 inst.operands[i].reg = val;
+                 inst.operands[i].isvec = 1;
+                 inst.operands[i].isscalar = 2;
+                 inst.operands[i].vectype = optype;
+                 inst.operands[i].present = 1;
+               }
+             else
+               {
+                 first_error (_("VFP single, double or MVE vector register"
+                              " expected"));
+                 return FAIL;
+               }
             }
         }
        else if ((val = arm_typed_reg_parse (&ptr, REG_TYPE_VFS, NULL, &optype))
@@ -6773,12 +7069,17 @@ enum operand_parse_code
    OP_RNDMQ,     /* Neon double precision (0..31) or MVE vector register.  */
    OP_RNDMQR,    /* Neon double precision (0..31), MVE vector or ARM register.
                  */
+  OP_RNSDMQR,    /* Neon single or double precision, MVE vector or ARM register.
+                */
    OP_RNQ,      /* Neon quad precision register */
    OP_RNQMQ,    /* Neon quad or MVE vector register.  */
    OP_RVSD,     /* VFP single or double precision register */
+  OP_RVSD_COND,        /* VFP single, double precision register or condition code.  */
+  OP_RVSDMQ,   /* VFP single, double precision or MVE vector register.  */
    OP_RNSD,      /* Neon single or double precision register */
    OP_RNDQ,      /* Neon double or quad precision register */
    OP_RNDQMQ,     /* Neon double, quad or MVE vector register.  */
+  OP_RNDQMQR,   /* Neon double, quad, MVE vector or ARM register.  */
    OP_RNSDQ,    /* Neon single, double or quad precision register */
    OP_RNSC,      /* Neon scalar D[X] */
    OP_RVC,      /* VFP control register */
@@ -6793,17 +7094,21 @@ enum operand_parse_code
    OP_RIWG,     /* iWMMXt wCG register */
    OP_RXA,      /* XScale accumulator register */
  
+  OP_RNSDMQ,   /* Neon single, double or MVE vector register */
    OP_RNSDQMQ,  /* Neon single, double or quad register or MVE vector register
                  */
    OP_RNSDQMQR, /* Neon single, double or quad register, MVE vector register or
                    GPR (no SP/SP)  */
    OP_RMQ,      /* MVE vector register.  */
+  OP_RMQRZ,    /* MVE vector or ARM register including ZR.  */
+  OP_RMQRR,     /* MVE vector or ARM register.  */
  
    /* New operands for Armv8.1-M Mainline.  */
    OP_LR,       /* ARM LR register */
    OP_RRe,      /* ARM register, only even numbered.  */
    OP_RRo,      /* ARM register, only odd numbered, not r13 or r15.  */
    OP_RRnpcsp_I32, /* ARM register (no BadReg) or literal 1 .. 32 */
+  OP_RR_ZR,    /* ARM register or ZR but no PC */
  
    OP_REGLST,   /* ARM register list */
    OP_CLRMLST,  /* CLRM register list */
@@ -6819,16 +7124,28 @@ enum operand_parse_code
    OP_RNDQ_I0,   /* Neon D or Q reg, or immediate zero.  */
    OP_RVSD_I0,  /* VFP S or D reg, or immediate zero.  */
    OP_RSVD_FI0, /* VFP S or D reg, or floating point immediate zero.  */
+  OP_RSVDMQ_FI0, /* VFP S, D, MVE vector register or floating point immediate
+                   zero.  */
    OP_RR_RNSC,   /* ARM reg or Neon scalar.  */
    OP_RNSD_RNSC, /* Neon S or D reg, or Neon scalar.  */
    OP_RNSDQ_RNSC, /* Vector S, D or Q reg, or Neon scalar.  */
    OP_RNSDQ_RNSC_MQ, /* Vector S, D or Q reg, Neon scalar or MVE vector register.
                      */
+  OP_RNSDQ_RNSC_MQ_RR, /* Vector S, D or Q reg, or MVE vector reg, or Neon
+                         scalar, or ARM register.  */
    OP_RNDQ_RNSC, /* Neon D or Q reg, or Neon scalar.  */
+  OP_RNDQ_RNSC_RR, /* Neon D or Q reg, Neon scalar, or ARM register.  */
+  OP_RNDQMQ_RNSC_RR, /* Neon D or Q reg, Neon scalar, MVE vector or ARM
+                       register.  */
+  OP_RNDQMQ_RNSC, /* Neon D, Q or MVE vector reg, or Neon scalar.  */
    OP_RND_RNSC,  /* Neon D reg, or Neon scalar.  */
    OP_VMOV,      /* Neon VMOV operands.  */
    OP_RNDQ_Ibig,        /* Neon D or Q reg, or big immediate for logic and VMVN.  */
+  /* Neon D, Q or MVE vector register, or big immediate for logic and VMVN.  */
+  OP_RNDQMQ_Ibig,
    OP_RNDQ_I63b, /* Neon D or Q reg, or immediate for shift.  */
+  OP_RNDQMQ_I63b_RR, /* Neon D or Q reg, immediate for shift, MVE vector or
+                       ARM register.  */
    OP_RIWR_I32z, /* iWMMXt wR register, or immediate 0 .. 32 for iWMMXt2.  */
    OP_VLDR,     /* VLDR operand.  */
  
@@ -6841,12 +7158,16 @@ enum operand_parse_code
    OP_I31w,     /*                 0 .. 31, optional trailing ! */
    OP_I32,      /*                 1 .. 32 */
    OP_I32z,     /*                 0 .. 32 */
+  OP_I48_I64,  /*                 48 or 64 */
    OP_I63,      /*                 0 .. 63 */
    OP_I63s,     /*               -64 .. 63 */
    OP_I64,      /*                 1 .. 64 */
    OP_I64z,     /*                 0 .. 64 */
+  OP_I127,     /*                 0 .. 127 */
    OP_I255,     /*                 0 .. 255 */
-
+  OP_I511,     /*                 0 .. 511 */
+  OP_I4095,    /*                 0 .. 4095 */
+  OP_I8191,    /*                 0 .. 8191 */
    OP_I4b,      /* immediate, prefix optional, 1 .. 4 */
    OP_I7b,      /*                             0 .. 7 */
    OP_I15b,     /*                             0 .. 15 */
@@ -6903,12 +7224,16 @@ enum operand_parse_code
    OP_oRNSDQ,    /* Optional single, double or quad precision vector register */
    OP_oRNSDQMQ,  /* Optional single, double or quad register or MVE vector
                     register.  */
+  OP_oRNSDMQ,   /* Optional single, double register or MVE vector
+                   register.  */
    OP_oSHll,     /* LSL immediate */
    OP_oSHar,     /* ASR immediate */
    OP_oSHllar,   /* LSL or ASR immediate */
    OP_oROR,      /* ROR 0/8/16/24 */
    OP_oBARRIER_I15, /* Option argument for a barrier instruction.  */
  
+  OP_oRMQRZ,   /* optional MVE vector or ARM register including ZR.  */
+
    /* Some pre-defined mixed (ARM/THUMB) operands.  */
    OP_RR_npcsp          = MIX_ARM_THUMB_OPERANDS (OP_RR, OP_RRnpcsp),
    OP_RRnpc_npcsp       = MIX_ARM_THUMB_OPERANDS (OP_RRnpc, OP_RRnpcsp),
@@ -6958,6 +7283,7 @@ parse_operands (char *str, const unsigned int *pattern, bfd_boolean thumb)
        inst.operands[i].isvec = (rtype == REG_TYPE_VFS          \
                              || rtype == REG_TYPE_VFD           \
                              || rtype == REG_TYPE_NQ);          \
+      inst.operands[i].iszr = (rtype == REG_TYPE_ZR);          \
      }                                                          \
    while (0)
  
@@ -6976,6 +7302,7 @@ parse_operands (char *str, const unsigned int *pattern, bfd_boolean thumb)
        inst.operands[i].isvec = (rtype == REG_TYPE_VFS          \
                              || rtype == REG_TYPE_VFD           \
                              || rtype == REG_TYPE_NQ);          \
+      inst.operands[i].iszr = (rtype == REG_TYPE_ZR);          \
      }                                                          \
    while (0)
  
@@ -6988,10 +7315,30 @@ parse_operands (char *str, const unsigned int *pattern, bfd_boolean thumb)
      }                                                          \
    while (0)
  
-#define po_scalar_or_goto(elsz, label)                                 \
+#define po_imm1_or_imm2_or_fail(imm1, imm2, popt)              \
+  do                                                           \
+    {                                                          \
+      expressionS exp;                                         \
+      my_get_expression (&exp, &str, popt);                    \
+      if (exp.X_op != O_constant)                              \
+       {                                                       \
+         inst.error = _("constant expression required");       \
+         goto failure;                                         \
+       }                                                       \
+      if (exp.X_add_number != imm1 && exp.X_add_number != imm2) \
+       {                                                       \
+         inst.error = _("immediate value 48 or 64 expected");  \
+         goto failure;                                         \
+       }                                                       \
+      inst.operands[i].imm = exp.X_add_number;                 \
+    }                                                          \
+  while (0)
+
+#define po_scalar_or_goto(elsz, label, reg_type)                       \
    do                                                                   \
      {                                                                  \
-      val = parse_scalar (& str, elsz, & inst.operands[i].vectype);    \
+      val = parse_scalar (& str, elsz, & inst.operands[i].vectype,     \
+                         reg_type);                                    \
        if (val == FAIL)                                                 \
         goto label;                                                     \
        inst.operands[i].reg = val;                                      \
@@ -7048,7 +7395,6 @@ parse_operands (char *str, const unsigned int *pattern, bfd_boolean thumb)
        if (op_parse_code >= OP_FIRST_OPTIONAL)
         {
           /* Remember where we are in case we need to backtrack.  */
-         gas_assert (!backtrack_pos);
           backtrack_pos = str;
           backtrack_error = inst.error;
           backtrack_index = i;
@@ -7076,6 +7422,10 @@ parse_operands (char *str, const unsigned int *pattern, bfd_boolean thumb)
         case OP_RVS:   po_reg_or_fail (REG_TYPE_VFS);     break;
         case OP_RVD:   po_reg_or_fail (REG_TYPE_VFD);     break;
         case OP_oRND:
+       case OP_RNSDMQR:
+         po_reg_or_goto (REG_TYPE_VFS, try_rndmqr);
+         break;
+       try_rndmqr:
         case OP_RNDMQR:
           po_reg_or_goto (REG_TYPE_RN, try_rndmq);
           break;
@@ -7090,7 +7440,20 @@ parse_operands (char *str, const unsigned int *pattern, bfd_boolean thumb)
           break;
           /* Also accept generic coprocessor regs for unknown registers.  */
           coproc_reg:
-         po_reg_or_fail (REG_TYPE_CN);
+         po_reg_or_goto (REG_TYPE_CN, vpr_po);
+         break;
+         /* Also accept P0 or p0 for VPR.P0.  Since P0 is already an
+            existing register with a value of 0, this seems like the
+            best way to parse P0.  */
+         vpr_po:
+         if (strncasecmp (str, "P0", 2) == 0)
+           {
+             str += 2;
+             inst.operands[i].isreg = 1;
+             inst.operands[i].reg = 13;
+           }
+         else
+           goto failure;
           break;
         case OP_RMF:   po_reg_or_fail (REG_TYPE_MVF);     break;
         case OP_RMD:   po_reg_or_fail (REG_TYPE_MVD);     break;
@@ -7109,6 +7472,10 @@ parse_operands (char *str, const unsigned int *pattern, bfd_boolean thumb)
         try_nq:
         case OP_RNQ:   po_reg_or_fail (REG_TYPE_NQ);      break;
         case OP_RNSD:  po_reg_or_fail (REG_TYPE_NSD);     break;
+       case OP_RNDQMQR:
+         po_reg_or_goto (REG_TYPE_RN, try_rndqmq);
+         break;
+       try_rndqmq:
         case OP_oRNDQMQ:
         case OP_RNDQMQ:
           po_reg_or_goto (REG_TYPE_MQ, try_rndq);
@@ -7116,7 +7483,21 @@ parse_operands (char *str, const unsigned int *pattern, bfd_boolean thumb)
         try_rndq:
         case OP_oRNDQ:
         case OP_RNDQ:  po_reg_or_fail (REG_TYPE_NDQ);     break;
+       case OP_RVSDMQ:
+         po_reg_or_goto (REG_TYPE_MQ, try_rvsd);
+         break;
+       try_rvsd:
         case OP_RVSD:  po_reg_or_fail (REG_TYPE_VFSD);    break;
+       case OP_RVSD_COND:
+         po_reg_or_goto (REG_TYPE_VFSD, try_cond);
+         break;
+       case OP_oRNSDMQ:
+       case OP_RNSDMQ:
+         po_reg_or_goto (REG_TYPE_NSD, try_mq2);
+         break;
+         try_mq2:
+         po_reg_or_fail (REG_TYPE_MQ);
+         break;
         case OP_oRNSDQ:
         case OP_RNSDQ: po_reg_or_fail (REG_TYPE_NSDQ);    break;
         case OP_RNSDQMQR:
@@ -7131,12 +7512,16 @@ parse_operands (char *str, const unsigned int *pattern, bfd_boolean thumb)
           po_reg_or_fail (REG_TYPE_NSDQ);
           inst.error = 0;
           break;
+       case OP_RMQRR:
+         po_reg_or_goto (REG_TYPE_RN, try_rmq);
+         break;
+       try_rmq:
         case OP_RMQ:
           po_reg_or_fail (REG_TYPE_MQ);
           break;
         /* Neon scalar. Using an element size of 8 means that some invalid
            scalars are accepted here, so deal with those in later code.  */
-       case OP_RNSC:  po_scalar_or_goto (8, failure);    break;
+       case OP_RNSC:  po_scalar_or_goto (8, failure, REG_TYPE_VFD);    break;
  
         case OP_RNDQ_I0:
           {
@@ -7151,6 +7536,10 @@ parse_operands (char *str, const unsigned int *pattern, bfd_boolean thumb)
           po_reg_or_goto (REG_TYPE_VFSD, try_imm0);
           break;
  
+       case OP_RSVDMQ_FI0:
+         po_reg_or_goto (REG_TYPE_MQ, try_rsvd_fi0);
+         break;
+       try_rsvd_fi0:
         case OP_RSVD_FI0:
           {
             po_reg_or_goto (REG_TYPE_VFSD, try_ifimm0);
@@ -7169,41 +7558,58 @@ parse_operands (char *str, const unsigned int *pattern, bfd_boolean thumb)
  
         case OP_RR_RNSC:
           {
-           po_scalar_or_goto (8, try_rr);
+           po_scalar_or_goto (8, try_rr, REG_TYPE_VFD);
             break;
             try_rr:
             po_reg_or_fail (REG_TYPE_RN);
           }
           break;
  
+       case OP_RNSDQ_RNSC_MQ_RR:
+         po_reg_or_goto (REG_TYPE_RN, try_rnsdq_rnsc_mq);
+         break;
+       try_rnsdq_rnsc_mq:
         case OP_RNSDQ_RNSC_MQ:
           po_reg_or_goto (REG_TYPE_MQ, try_rnsdq_rnsc);
           break;
         try_rnsdq_rnsc:
         case OP_RNSDQ_RNSC:
           {
-           po_scalar_or_goto (8, try_nsdq);
+           po_scalar_or_goto (8, try_nsdq, REG_TYPE_VFD);
+           inst.error = 0;
             break;
             try_nsdq:
             po_reg_or_fail (REG_TYPE_NSDQ);
+           inst.error = 0;
           }
           break;
  
         case OP_RNSD_RNSC:
           {
-           po_scalar_or_goto (8, try_s_scalar);
+           po_scalar_or_goto (8, try_s_scalar, REG_TYPE_VFD);
             break;
             try_s_scalar:
-           po_scalar_or_goto (4, try_nsd);
+           po_scalar_or_goto (4, try_nsd, REG_TYPE_VFS);
             break;
             try_nsd:
             po_reg_or_fail (REG_TYPE_NSD);
           }
           break;
  
+       case OP_RNDQMQ_RNSC_RR:
+         po_reg_or_goto (REG_TYPE_MQ, try_rndq_rnsc_rr);
+         break;
+       try_rndq_rnsc_rr:
+       case OP_RNDQ_RNSC_RR:
+         po_reg_or_goto (REG_TYPE_RN, try_rndq_rnsc);
+         break;
+       case OP_RNDQMQ_RNSC:
+         po_reg_or_goto (REG_TYPE_MQ, try_rndq_rnsc);
+         break;
+       try_rndq_rnsc:
         case OP_RNDQ_RNSC:
           {
-           po_scalar_or_goto (8, try_ndq);
+           po_scalar_or_goto (8, try_ndq, REG_TYPE_VFD);
             break;
             try_ndq:
             po_reg_or_fail (REG_TYPE_NDQ);
@@ -7212,7 +7618,7 @@ parse_operands (char *str, const unsigned int *pattern, bfd_boolean thumb)
  
         case OP_RND_RNSC:
           {
-           po_scalar_or_goto (8, try_vfd);
+           po_scalar_or_goto (8, try_vfd, REG_TYPE_VFD);
             break;
             try_vfd:
             po_reg_or_fail (REG_TYPE_VFD);
@@ -7225,6 +7631,10 @@ parse_operands (char *str, const unsigned int *pattern, bfd_boolean thumb)
           po_misc_or_fail (parse_neon_mov (&str, &i) == FAIL);
           break;
  
+       case OP_RNDQMQ_Ibig:
+         po_reg_or_goto (REG_TYPE_MQ, try_rndq_ibig);
+         break;
+       try_rndq_ibig:
         case OP_RNDQ_Ibig:
           {
             po_reg_or_goto (REG_TYPE_NDQ, try_immbig);
@@ -7241,6 +7651,13 @@ parse_operands (char *str, const unsigned int *pattern, bfd_boolean thumb)
           }
           break;
  
+       case OP_RNDQMQ_I63b_RR:
+         po_reg_or_goto (REG_TYPE_MQ, try_rndq_i63b_rr);
+         break;
+       try_rndq_i63b_rr:
+         po_reg_or_goto (REG_TYPE_RN, try_rndq_i63b);
+         break;
+       try_rndq_i63b:
         case OP_RNDQ_I63b:
           {
             po_reg_or_goto (REG_TYPE_NDQ, try_shimm);
@@ -7272,12 +7689,16 @@ parse_operands (char *str, const unsigned int *pattern, bfd_boolean thumb)
         case OP_I31:     po_imm_or_fail (  0,     31, FALSE);   break;
         case OP_I32:     po_imm_or_fail (  1,     32, FALSE);   break;
         case OP_I32z:    po_imm_or_fail (  0,     32, FALSE);   break;
+       case OP_I48_I64: po_imm1_or_imm2_or_fail (48, 64, FALSE); break;
         case OP_I63s:    po_imm_or_fail (-64,     63, FALSE);   break;
         case OP_I63:     po_imm_or_fail (  0,     63, FALSE);   break;
         case OP_I64:     po_imm_or_fail (  1,     64, FALSE);   break;
         case OP_I64z:    po_imm_or_fail (  0,     64, FALSE);   break;
+       case OP_I127:    po_imm_or_fail (  0,    127, FALSE);   break;
         case OP_I255:    po_imm_or_fail (  0,    255, FALSE);   break;
-
+       case OP_I511:    po_imm_or_fail (  0,    511, FALSE);   break;
+       case OP_I4095:   po_imm_or_fail (  0,    4095, FALSE);  break;
+       case OP_I8191:   po_imm_or_fail (  0,    8191, FALSE);  break;
         case OP_I4b:     po_imm_or_fail (  1,      4, TRUE);    break;
         case OP_oI7b:
         case OP_I7b:     po_imm_or_fail (  0,      7, TRUE);    break;
@@ -7370,6 +7791,9 @@ parse_operands (char *str, const unsigned int *pattern, bfd_boolean thumb)
         case OP_RRnpc_I0: po_reg_or_goto (REG_TYPE_RN, I0);   break;
         I0:               po_imm_or_fail (0, 0, FALSE);       break;
  
+       case OP_RRnpcsp_I32: po_reg_or_goto (REG_TYPE_RN, I32); break;
+       I32:                 po_imm_or_fail (1, 32, FALSE);     break;
+
         case OP_RF_IF:    po_reg_or_goto (REG_TYPE_FN, IF);   break;
         IF:
           if (!is_immediate_prefix (*str))
@@ -7423,6 +7847,7 @@ parse_operands (char *str, const unsigned int *pattern, bfd_boolean thumb)
         case OP_CPSF:    val = parse_cps_flags (&str);          break;
         case OP_ENDI:    val = parse_endian_specifier (&str);   break;
         case OP_oROR:    val = parse_ror (&str);                break;
+       try_cond:
         case OP_COND:    val = parse_cond (&str);               break;
         case OP_oBARRIER_I15:
           po_barrier_or_imm (str); break;
@@ -7596,6 +8021,19 @@ parse_operands (char *str, const unsigned int *pattern, bfd_boolean thumb)
           po_misc_or_fail (parse_shift (&str, i, SHIFT_LSL_OR_ASR_IMMEDIATE));
           break;
  
+       case OP_RMQRZ:
+       case OP_oRMQRZ:
+         po_reg_or_goto (REG_TYPE_MQ, try_rr_zr);
+         break;
+
+       case OP_RR_ZR:
+       try_rr_zr:
+         po_reg_or_goto (REG_TYPE_RN, ZR);
+         break;
+       ZR:
+         po_reg_or_fail (REG_TYPE_ZR);
+         break;
+
         default:
           as_fatal (_("unhandled operand code %d"), op_parse_code);
         }
@@ -7617,6 +8055,7 @@ parse_operands (char *str, const unsigned int *pattern, bfd_boolean thumb)
  
         case OP_oRRnpcsp:
         case OP_RRnpcsp:
+       case OP_RRnpcsp_I32:
           if (inst.operands[i].isreg)
             {
               if (inst.operands[i].reg == REG_PC)
@@ -7639,10 +8078,12 @@ parse_operands (char *str, const unsigned int *pattern, bfd_boolean thumb)
             inst.error = BAD_PC;
           break;
  
+       case OP_RVSD_COND:
         case OP_VLDR:
           if (inst.operands[i].isreg)
             break;
         /* fall through.  */
+
         case OP_CPSF:
         case OP_ENDI:
         case OP_oROR:
@@ -7671,6 +8112,13 @@ parse_operands (char *str, const unsigned int *pattern, bfd_boolean thumb)
             inst.error = _("operand must be LR register");
           break;
  
+       case OP_RMQRZ:
+       case OP_oRMQRZ:
+       case OP_RR_ZR:
+         if (!inst.operands[i].iszr && inst.operands[i].reg == REG_PC)
+           inst.error = BAD_PC;
+         break;
+
         case OP_RRe:
           if (inst.operands[i].isreg
               && (inst.operands[i].reg & 0x00000001) != 0)
@@ -8487,6 +8935,11 @@ move_or_literal_pool (int i, enum lit_type t, bfd_boolean mode_3)
                       inst.instruction |= (imm & 0x0800) << 15;
                       inst.instruction |= (imm & 0x0700) << 4;
                       inst.instruction |= (imm & 0x00ff);
+                     /*  In case this replacement is being done on Armv8-M
+                         Baseline we need to make sure to disable the
+                         instruction size check, as otherwise GAS will reject
+                         the use of this T32 instruction.  */
+                     inst.size_req = 0;
                       return TRUE;
                     }
                 }
@@ -9611,10 +10064,42 @@ do_vmrs (void)
        return;
      }
  
-  /* MVFR2 is only valid at ARMv8-A.  */
-  if (inst.operands[1].reg == 5)
-    constraint (!ARM_CPU_HAS_FEATURE (cpu_variant, fpu_vfp_ext_armv8),
-               _(BAD_FPU));
+  switch (inst.operands[1].reg)
+    {
+    /* MVFR2 is only valid for Armv8-A.  */
+    case 5:
+      constraint (!ARM_CPU_HAS_FEATURE (cpu_variant, fpu_vfp_ext_armv8),
+                 _(BAD_FPU));
+      break;
+
+    /* Check for new Armv8.1-M Mainline changes to <spec_reg>.  */
+    case 1: /* fpscr.  */
+      constraint (!(ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext)
+                   || ARM_CPU_HAS_FEATURE (cpu_variant, fpu_vfp_ext_v1xd)),
+                 _(BAD_FPU));
+      break;
+
+    case 14: /* fpcxt_ns.  */
+    case 15: /* fpcxt_s.  */
+      constraint (!ARM_CPU_HAS_FEATURE (cpu_variant, arm_ext_v8_1m_main),
+                 _("selected processor does not support instruction"));
+      break;
+
+    case  2: /* fpscr_nzcvqc (exempt from the MVE warning below).  */
+    case 12: /* vpr.  */
+    case 13: /* p0.  */
+      constraint (!ARM_CPU_HAS_FEATURE (cpu_variant, arm_ext_v8_1m_main)
+                 || (!ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext)
+                     && !ARM_CPU_HAS_FEATURE (cpu_variant, fpu_vfp_ext_v1xd)),
+                 _("selected processor does not support instruction"));
+      if (inst.operands[1].reg != 2
+         && !ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext))
+       as_warn (_("accessing MVE system register without MVE is UNPREDICTABLE"));
+      break;
+
+    default:
+      break;
+    }
  
    /* APSR_ sets isvec. All other refs to PC are illegal.  */
    if (!inst.operands[0].isvec && Rt == REG_PC)
@@ -9642,11 +10127,43 @@ do_vmsr (void)
        return;
      }
  
-  /* MVFR2 is only valid for ARMv8-A.  */
-  if (inst.operands[0].reg == 5)
-    constraint (!ARM_CPU_HAS_FEATURE (cpu_variant, fpu_vfp_ext_armv8),
-               _(BAD_FPU));
-
+  switch (inst.operands[0].reg)
+    {
+    /* MVFR2 is only valid for Armv8-A.  */
+    case 5:
+      constraint (!ARM_CPU_HAS_FEATURE (cpu_variant, fpu_vfp_ext_armv8),
+                 _(BAD_FPU));
+      break;
+
+    /* Check for new Armv8.1-M Mainline changes to <spec_reg>.  */
+    case  1: /* fpscr.  */
+      constraint (!(ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext)
+                   || ARM_CPU_HAS_FEATURE (cpu_variant, fpu_vfp_ext_v1xd)),
+                 _(BAD_FPU));
+      break;
+
+    case 14: /* fpcxt_ns.  */
+    case 15: /* fpcxt_s.  */
+      constraint (!ARM_CPU_HAS_FEATURE (cpu_variant, arm_ext_v8_1m_main),
+                 _("selected processor does not support instruction"));
+      break;
+
+    case  2: /* fpscr_nzcvqc.  */
+    case 12: /* vpr.  */
+    case 13: /* p0.  */
+      constraint (!ARM_CPU_HAS_FEATURE (cpu_variant, arm_ext_v8_1m_main)
+                 || (!ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext)
+                     && !ARM_CPU_HAS_FEATURE (cpu_variant, fpu_vfp_ext_v1xd)),
+                 _("selected processor does not support instruction"));
+      if (inst.operands[0].reg != 2
+         && !ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext))
+       as_warn (_("accessing MVE system register without MVE is UNPREDICTABLE"));
+      break;
+
+    default:
+      break;
+    }
+
    /* If we get through parsing the register name, we just insert the number
       generated into the instruction without further validation.  */
    inst.instruction |= (inst.operands[0].reg << 16);
@@ -9945,6 +10462,9 @@ do_shift (void)
  static void
  do_smc (void)
  {
+  unsigned int value = inst.relocs[0].exp.X_add_number;
+  constraint (value > 0xf, _("immediate too large (bigger than 0xF)"));
+
    inst.relocs[0].type = BFD_RELOC_ARM_SMC;
    inst.relocs[0].pc_rel = 0;
  }
@@ -10165,6 +10685,10 @@ do_sxth (void)
  static void
  do_vfp_sp_monadic (void)
  {
+  constraint (!ARM_CPU_HAS_FEATURE (cpu_variant, fpu_vfp_ext_v1xd)
+             && !ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext),
+             _(BAD_FPU));
+
    encode_arm_vfp_reg (inst.operands[0].reg, VFP_REG_Sd);
    encode_arm_vfp_reg (inst.operands[1].reg, VFP_REG_Sm);
  }
@@ -10200,6 +10724,10 @@ do_vfp_sp_dp_cvt (void)
  static void
  do_vfp_reg_from_sp (void)
  {
+  constraint (!ARM_CPU_HAS_FEATURE (cpu_variant, fpu_vfp_ext_v1xd)
+            && !ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext),
+            _(BAD_FPU));
+
    inst.instruction |= inst.operands[0].reg << 12;
    encode_arm_vfp_reg (inst.operands[1].reg, VFP_REG_Sn);
  }
@@ -10217,6 +10745,10 @@ do_vfp_reg2_from_sp2 (void)
  static void
  do_vfp_sp_from_reg (void)
  {
+  constraint (!ARM_CPU_HAS_FEATURE (cpu_variant, fpu_vfp_ext_v1xd)
+            && !ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext),
+            _(BAD_FPU));
+
    encode_arm_vfp_reg (inst.operands[0].reg, VFP_REG_Sn);
    inst.instruction |= inst.operands[1].reg << 12;
  }
@@ -10319,6 +10851,10 @@ do_vfp_xp_ldstmdb (void)
  static void
  do_vfp_dp_rd_rm (void)
  {
+  constraint (!ARM_CPU_HAS_FEATURE (cpu_variant, fpu_vfp_ext_v1)
+             && !ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext),
+             _(BAD_FPU));
+
    encode_arm_vfp_reg (inst.operands[0].reg, VFP_REG_Dd);
    encode_arm_vfp_reg (inst.operands[1].reg, VFP_REG_Dm);
  }
@@ -10340,6 +10876,10 @@ do_vfp_dp_rd_rn (void)
  static void
  do_vfp_dp_rd_rn_rm (void)
  {
+  constraint (!ARM_CPU_HAS_FEATURE (cpu_variant, fpu_vfp_ext_v2)
+             && !ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext),
+             _(BAD_FPU));
+
    encode_arm_vfp_reg (inst.operands[0].reg, VFP_REG_Dd);
    encode_arm_vfp_reg (inst.operands[1].reg, VFP_REG_Dn);
    encode_arm_vfp_reg (inst.operands[2].reg, VFP_REG_Dm);
@@ -10354,6 +10894,10 @@ do_vfp_dp_rd (void)
  static void
  do_vfp_dp_rm_rd_rn (void)
  {
+  constraint (!ARM_CPU_HAS_FEATURE (cpu_variant, fpu_vfp_ext_v2)
+             && !ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext),
+             _(BAD_FPU));
+
    encode_arm_vfp_reg (inst.operands[0].reg, VFP_REG_Dm);
    encode_arm_vfp_reg (inst.operands[1].reg, VFP_REG_Dd);
    encode_arm_vfp_reg (inst.operands[2].reg, VFP_REG_Dn);
@@ -10865,7 +11409,7 @@ encode_thumb32_addr_mode (int i, bfd_boolean is_t, bfd_boolean is_d)
      inst.error = _("instruction does not accept unindexed addressing");
  }
  
-/* Table of Thumb instructions which exist in both 16- and 32-bit
+/* Table of Thumb instructions which exist in 16- and/or 32-bit
     encodings (the latter only in post-V6T2 cores).  The index is the
     value used in the insns table below.  When there is more than one
     possible 16-bit encoding for the instruction, this table always
@@ -10894,16 +11438,27 @@ encode_thumb32_addr_mode (int i, bfd_boolean is_t, bfd_boolean is_d)
    X(_bflx,  0000, f070e001),                   \
    X(_bic,   4380, ea200000),                   \
    X(_bics,  4380, ea300000),                   \
+  X(_cinc,  0000, ea509000),                   \
+  X(_cinv,  0000, ea50a000),                   \
    X(_cmn,   42c0, eb100f00),                   \
    X(_cmp,   2800, ebb00f00),                   \
+  X(_cneg,  0000, ea50b000),                   \
    X(_cpsie, b660, f3af8400),                   \
    X(_cpsid, b670, f3af8600),                   \
    X(_cpy,   4600, ea4f0000),                   \
+  X(_csel,  0000, ea508000),                   \
+  X(_cset,  0000, ea5f900f),                   \
+  X(_csetm, 0000, ea5fa00f),                   \
+  X(_csinc, 0000, ea509000),                   \
+  X(_csinv, 0000, ea50a000),                   \
+  X(_csneg, 0000, ea50b000),                   \
    X(_dec_sp,80dd, f1ad0d00),                   \
    X(_dls,   0000, f040e001),                   \
+  X(_dlstp, 0000, f000e001),                   \
    X(_eor,   4040, ea800000),                   \
    X(_eors,  4040, ea900000),                   \
    X(_inc_sp,00dd, f10d0d00),                   \
+  X(_lctp,  0000, f00fe001),                   \
    X(_ldmia, c800, e8900000),                   \
    X(_ldr,   6800, f8500000),                   \
    X(_ldrb,  7800, f8100000),                   \
@@ -10914,6 +11469,7 @@ encode_thumb32_addr_mode (int i, bfd_boolean is_t, bfd_boolean is_d)
    X(_ldr_pc2,4800, f85f0000),                  \
    X(_ldr_sp,9800, f85d0000),                   \
    X(_le,    0000, f00fc001),                   \
+  X(_letp,  0000, f01fc001),                   \
    X(_lsl,   0000, fa00f000),                   \
    X(_lsls,  0000, fa10f000),                   \
    X(_lsr,   0800, fa20f000),                   \
@@ -10956,6 +11512,7 @@ encode_thumb32_addr_mode (int i, bfd_boolean is_t, bfd_boolean is_d)
    X(_wfe,   bf20, f3af8002),                   \
    X(_wfi,   bf30, f3af8003),                   \
    X(_wls,   0000, f040c001),                   \
+  X(_wlstp, 0000, f000c001),                   \
    X(_sev,   bf40, f3af8004),                    \
    X(_sevl,  bf50, f3af8005),                   \
    X(_udf,   de00, f7f0a000)
@@ -11709,6 +12266,60 @@ do_t_clz (void)
    inst.instruction |= Rm;
  }
  
+/* For the Armv8.1-M conditional instructions.  */
+static void
+do_t_cond (void)
+{
+  unsigned Rd, Rn, Rm;
+  signed int cond;
+
+  constraint (inst.cond != COND_ALWAYS, BAD_COND);
+
+  Rd = inst.operands[0].reg;
+  switch (inst.instruction)
+    {
+      case T_MNEM_csinc:
+      case T_MNEM_csinv:
+      case T_MNEM_csneg:
+      case T_MNEM_csel:
+       Rn = inst.operands[1].reg;
+       Rm = inst.operands[2].reg;
+       cond = inst.operands[3].imm;
+       constraint (Rn == REG_SP, BAD_SP);
+       constraint (Rm == REG_SP, BAD_SP);
+       break;
+
+      case T_MNEM_cinc:
+      case T_MNEM_cinv:
+      case T_MNEM_cneg:
+       Rn = inst.operands[1].reg;
+       cond = inst.operands[2].imm;
+       /* Invert the last bit to invert the cond.  */
+       cond = TOGGLE_BIT (cond, 0);
+       constraint (Rn == REG_SP, BAD_SP);
+       Rm = Rn;
+       break;
+
+      case T_MNEM_csetm:
+      case T_MNEM_cset:
+       cond = inst.operands[1].imm;
+       /* Invert the last bit to invert the cond.  */
+       cond = TOGGLE_BIT (cond, 0);
+       Rn = REG_PC;
+       Rm = REG_PC;
+       break;
+
+      default: abort ();
+    }
+
+  set_pred_insn_type (OUTSIDE_PRED_INSN);
+  inst.instruction = THUMB_OP32 (inst.instruction);
+  inst.instruction |= Rd << 8;
+  inst.instruction |= Rn << 16;
+  inst.instruction |= Rm;
+  inst.instruction |= cond << 4;
+}
+
  static void
  do_t_csdb (void)
  {
@@ -11857,18 +12468,6 @@ do_t_it (void)
    inst.instruction |= cond << 4;
  }
  
-static void
-do_mve_vpt (void)
-{
-  /* We are dealing with a vector predicated block.  */
-  set_pred_insn_type (VPT_INSN);
-  now_pred.cc = 0;
-  now_pred.mask = ((inst.instruction & 0x00400000) >> 19)
-                 | ((inst.instruction & 0xe000) >> 13);
-  now_pred.warn_deprecated = FALSE;
-  now_pred.type = VECTOR_PRED;
-}
-
  /* Helper function used for both push/pop and ldm/stm.  */
  static void
  encode_thumb2_multi (bfd_boolean do_io, int base, unsigned mask,
@@ -13496,10 +14095,11 @@ do_t_smc (void)
               _("SMC is not permitted on this architecture"));
    constraint (inst.relocs[0].exp.X_op != O_constant,
               _("expression too complex"));
+  constraint (value > 0xf, _("immediate too large (bigger than 0xF)"));
+
    inst.relocs[0].type = BFD_RELOC_UNUSED;
-  inst.instruction |= (value & 0xf000) >> 12;
-  inst.instruction |= (value & 0x0ff0);
    inst.instruction |= (value & 0x000f) << 16;
+
    /* PR gas/15623: SMC instructions must be last in an IT block.  */
    set_pred_insn_type_last ();
  }
@@ -13893,35 +14493,52 @@ v8_1_loop_reloc (int is_le)
      }
  }
  
-/* To handle the Scalar Low Overhead Loop instructions
-   in Armv8.1-M Mainline.  */
+/* For shifts with four operands in MVE.  */
  static void
-do_t_loloop (void)
+do_mve_scalar_shift1 (void)
  {
-  unsigned long insn = inst.instruction;
+  unsigned int value = inst.operands[2].imm;
  
-  set_pred_insn_type (OUTSIDE_PRED_INSN);
-  inst.instruction = THUMB_OP32 (inst.instruction);
+  inst.instruction |= inst.operands[0].reg << 16;
+  inst.instruction |= inst.operands[1].reg << 8;
  
-  switch (insn)
-    {
-    case T_MNEM_le:
-      /* le <label>.  */
-      if (!inst.operands[0].present)
-       inst.instruction |= 1 << 21;
+  /* Setting the bit for saturation.  */
+  inst.instruction |= ((value == 64) ? 0: 1) << 7;
  
-      v8_1_loop_reloc (TRUE);
-      break;
+  /* Assuming Rm is already checked not to be 11x1.  */
+  constraint (inst.operands[3].reg == inst.operands[0].reg, BAD_OVERLAP);
+  constraint (inst.operands[3].reg == inst.operands[1].reg, BAD_OVERLAP);
+  inst.instruction |= inst.operands[3].reg << 12;
+}
  
-    case T_MNEM_wls:
-      v8_1_loop_reloc (FALSE);
-      /* Fall through.  */
-    case T_MNEM_dls:
-      constraint (inst.operands[1].isreg != 1, BAD_ARGS);
-      inst.instruction |= (inst.operands[1].reg << 16);
-      break;
+/* For shifts in MVE.  */
+static void
+do_mve_scalar_shift (void)
+{
+  if (!inst.operands[2].present)
+    {
+      inst.operands[2] = inst.operands[1];
+      inst.operands[1].reg = 0xf;
+    }
+
+  inst.instruction |= inst.operands[0].reg << 16;
+  inst.instruction |= inst.operands[1].reg << 8;
  
-    default: abort();
+  if (inst.operands[2].isreg)
+    {
+      /* Assuming Rm is already checked not to be 11x1.  */
+      constraint (inst.operands[2].reg == inst.operands[0].reg, BAD_OVERLAP);
+      constraint (inst.operands[2].reg == inst.operands[1].reg, BAD_OVERLAP);
+      inst.instruction |= inst.operands[2].reg << 12;
+    }
+  else
+    {
+      /* Assuming imm is already checked as [1,32].  */
+      unsigned int value = inst.operands[2].imm;
+      inst.instruction |= (value & 0x1c) << 10;
+      inst.instruction |= (value & 0x03) << 6;
+      /* Change last 4 bits from 0xd to 0xf.  */
+      inst.instruction |= 0x2;
      }
  }
  
@@ -13937,6 +14554,7 @@ do_t_loloop (void)
  #define M_MNEM_vmlsdavax  0xeef01e21
  #define M_MNEM_vmullt  0xee011e00
  #define M_MNEM_vmullb  0xee010e00
+#define M_MNEM_vctp    0xf000e801
  #define M_MNEM_vst20   0xfc801e00
  #define M_MNEM_vst21   0xfc801e20
  #define M_MNEM_vst40   0xfc801e01
@@ -13957,6 +14575,59 @@ do_t_loloop (void)
  #define M_MNEM_vldrh   0xec100e10
  #define M_MNEM_vldrw   0xec100e40
  #define M_MNEM_vldrd   0xec100e50
+#define M_MNEM_vmovlt  0xeea01f40
+#define M_MNEM_vmovlb  0xeea00f40
+#define M_MNEM_vmovnt  0xfe311e81
+#define M_MNEM_vmovnb  0xfe310e81
+#define M_MNEM_vadc    0xee300f00
+#define M_MNEM_vadci   0xee301f00
+#define M_MNEM_vbrsr   0xfe011e60
+#define M_MNEM_vaddlv  0xee890f00
+#define M_MNEM_vaddlva 0xee890f20
+#define M_MNEM_vaddv   0xeef10f00
+#define M_MNEM_vaddva  0xeef10f20
+#define M_MNEM_vddup   0xee011f6e
+#define M_MNEM_vdwdup  0xee011f60
+#define M_MNEM_vidup   0xee010f6e
+#define M_MNEM_viwdup  0xee010f60
+#define M_MNEM_vmaxv   0xeee20f00
+#define M_MNEM_vmaxav  0xeee00f00
+#define M_MNEM_vminv   0xeee20f80
+#define M_MNEM_vminav  0xeee00f80
+#define M_MNEM_vmlaldav          0xee800e00
+#define M_MNEM_vmlaldava  0xee800e20
+#define M_MNEM_vmlaldavx  0xee801e00
+#define M_MNEM_vmlaldavax 0xee801e20
+#define M_MNEM_vmlsldav          0xee800e01
+#define M_MNEM_vmlsldava  0xee800e21
+#define M_MNEM_vmlsldavx  0xee801e01
+#define M_MNEM_vmlsldavax 0xee801e21
+#define M_MNEM_vrmlaldavhx  0xee801f00
+#define M_MNEM_vrmlaldavhax 0xee801f20
+#define M_MNEM_vrmlsldavh   0xfe800e01
+#define M_MNEM_vrmlsldavha  0xfe800e21
+#define M_MNEM_vrmlsldavhx  0xfe801e01
+#define M_MNEM_vrmlsldavhax 0xfe801e21
+#define M_MNEM_vqmovnt   0xee331e01
+#define M_MNEM_vqmovnb   0xee330e01
+#define M_MNEM_vqmovunt          0xee311e81
+#define M_MNEM_vqmovunb          0xee310e81
+#define M_MNEM_vshrnt      0xee801fc1
+#define M_MNEM_vshrnb      0xee800fc1
+#define M_MNEM_vrshrnt     0xfe801fc1
+#define M_MNEM_vqshrnt     0xee801f40
+#define M_MNEM_vqshrnb     0xee800f40
+#define M_MNEM_vqshrunt            0xee801fc0
+#define M_MNEM_vqshrunb            0xee800fc0
+#define M_MNEM_vrshrnb     0xfe800fc1
+#define M_MNEM_vqrshrnt            0xee801f41
+#define M_MNEM_vqrshrnb            0xee800f41
+#define M_MNEM_vqrshrunt    0xfe801fc0
+#define M_MNEM_vqrshrunb    0xfe800fc0
+
+/* Bfloat16 instruction encoder helpers.  */
+#define B_MNEM_vfmat 0xfc300850
+#define B_MNEM_vfmab 0xfc300810
  
  /* Neon instruction encoder helpers.  */
  
@@ -14121,6 +14792,13 @@ NEON_ENC_TAB
       - a table used to drive neon_select_shape.  */
  
  #define NEON_SHAPE_DEF                 \
+  X(4, (R, R, Q, Q), QUAD),            \
+  X(4, (Q, R, R, I), QUAD),            \
+  X(4, (R, R, S, S), QUAD),            \
+  X(4, (S, S, R, R), QUAD),            \
+  X(3, (Q, R, I), QUAD),               \
+  X(3, (I, Q, Q), QUAD),               \
+  X(3, (I, Q, R), QUAD),               \
    X(3, (R, Q, Q), QUAD),               \
    X(3, (D, D, D), DOUBLE),             \
    X(3, (Q, Q, Q), QUAD),               \
@@ -14129,6 +14807,8 @@ NEON_ENC_TAB
    X(3, (D, D, S), DOUBLE),             \
    X(3, (Q, Q, S), QUAD),               \
    X(3, (Q, Q, R), QUAD),               \
+  X(3, (R, R, Q), QUAD),               \
+  X(2, (R, Q),   QUAD),                \
    X(2, (D, D), DOUBLE),                        \
    X(2, (Q, Q), QUAD),                  \
    X(2, (D, S), DOUBLE),                        \
@@ -14137,6 +14817,15 @@ NEON_ENC_TAB
    X(2, (Q, R), QUAD),                  \
    X(2, (D, I), DOUBLE),                        \
    X(2, (Q, I), QUAD),                  \
+  X(3, (P, F, I), SINGLE),             \
+  X(3, (P, D, I), DOUBLE),             \
+  X(3, (P, Q, I), QUAD),               \
+  X(4, (P, F, F, I), SINGLE),          \
+  X(4, (P, D, D, I), DOUBLE),          \
+  X(4, (P, Q, Q, I), QUAD),            \
+  X(5, (P, F, F, F, I), SINGLE),       \
+  X(5, (P, D, D, D, I), DOUBLE),       \
+  X(5, (P, Q, Q, Q, I), QUAD),         \
    X(3, (D, L, D), DOUBLE),             \
    X(2, (D, Q), MIXED),                 \
    X(2, (Q, D), MIXED),                 \
@@ -14165,6 +14854,8 @@ NEON_ENC_TAB
    X(2, (R, S), SINGLE),                        \
    X(2, (F, R), SINGLE),                        \
    X(2, (R, F), SINGLE),                        \
+/* Used for MVE tail predicated loop instructions.  */\
+  X(2, (R, R), QUAD),                  \
  /* Half float shape supported so far.  */\
    X (2, (H, D), MIXED),                        \
    X (2, (D, H), MIXED),                        \
@@ -14183,6 +14874,7 @@ NEON_ENC_TAB
  #define S2(A,B)                NS_##A##B
  #define S3(A,B,C)      NS_##A##B##C
  #define S4(A,B,C,D)    NS_##A##B##C##D
+#define S5(A,B,C,D,E)  NS_##A##B##C##D##E
  
  #define X(N, L, C) S##N L
  
@@ -14196,6 +14888,7 @@ enum neon_shape
  #undef S2
  #undef S3
  #undef S4
+#undef S5
  
  enum neon_shape_class
  {
@@ -14224,7 +14917,8 @@ enum neon_shape_el
    SE_I,
    SE_S,
    SE_R,
-  SE_L
+  SE_L,
+  SE_P
  };
  
  /* Register widths of above.  */
@@ -14237,6 +14931,7 @@ static unsigned neon_shape_el_size[] =
    0,
    32,
    32,
+  0,
    0
  };
  
@@ -14249,6 +14944,7 @@ struct neon_shape_info
  #define S2(A,B)                { SE_##A, SE_##B }
  #define S3(A,B,C)      { SE_##A, SE_##B, SE_##C }
  #define S4(A,B,C,D)    { SE_##A, SE_##B, SE_##C, SE_##D }
+#define S5(A,B,C,D,E)  { SE_##A, SE_##B, SE_##C, SE_##D, SE_##E }
  
  #define X(N, L, C) { N, S##N L }
  
@@ -14261,6 +14957,7 @@ static struct neon_shape_info neon_shape_tab[] =
  #undef S2
  #undef S3
  #undef S4
+#undef S5
  
  /* Bit masks used in type checking given instructions.
    'N_EQK' means the type must be the same as (or based on in some way) the key
@@ -14292,6 +14989,7 @@ enum neon_type_mask
    N_F32  = 0x0080000,
    N_F64  = 0x0100000,
    N_P64         = 0x0200000,
+  N_BF16 = 0x0400000,
    N_KEY  = 0x1000000, /* Key element (main type specifier).  */
    N_EQK  = 0x2000000, /* Given operand has the same type & size as the key.  */
    N_VFP  = 0x4000000, /* VFP mode: operand size must match register width.  */
@@ -14449,6 +15147,7 @@ neon_select_shape (enum neon_shape shape, ...)
                 matches = 0;
               break;
  
+           case SE_P:
             case SE_L:
               break;
             }
@@ -14590,6 +15289,10 @@ type_chk_of_el_type (enum neon_el_type type, unsigned size)
         }
        break;
  
+    case NT_bfloat:
+      if (size == 16) return N_BF16;
+      break;
+
      default: ;
      }
  
@@ -14608,7 +15311,8 @@ el_type_of_type_chk (enum neon_el_type *type, unsigned *size,
  
    if ((mask & (N_S8 | N_U8 | N_I8 | N_8 | N_P8)) != 0)
      *size = 8;
-  else if ((mask & (N_S16 | N_U16 | N_I16 | N_16 | N_F16 | N_P16)) != 0)
+  else if ((mask & (N_S16 | N_U16 | N_I16 | N_16 | N_F16 | N_P16 | N_BF16))
+          != 0)
      *size = 16;
    else if ((mask & (N_S32 | N_U32 | N_I32 | N_32 | N_F32)) != 0)
      *size = 32;
@@ -14629,6 +15333,8 @@ el_type_of_type_chk (enum neon_el_type *type, unsigned *size,
      *type = NT_poly;
    else if ((mask & (N_F_ALL)) != 0)
      *type = NT_float;
+  else if ((mask & (N_BF16)) != 0)
+    *type = NT_bfloat;
    else
      return FAIL;
  
@@ -14899,7 +15605,7 @@ do_vfp_nsyn_opcode (const char *opname)
  {
    const struct asm_opcode *opcode;
  
-  opcode = (const struct asm_opcode *) hash_find (arm_ops_hsh, opname);
+  opcode = (const struct asm_opcode *) str_hash_find (arm_ops_hsh, opname);
  
    if (!opcode)
      abort ();
@@ -15159,1777 +15865,3138 @@ do_vfp_nsyn_nmul (void)
  
  }
  
-static void
-do_vfp_nsyn_cmp (void)
+/* Turn a size (8, 16, 32, 64) into the respective bit number minus 3
+   (0, 1, 2, 3).  */
+
+static unsigned
+neon_logbits (unsigned x)
  {
-  enum neon_shape rs;
-  if (inst.operands[1].isreg)
-    {
-      rs = neon_select_shape (NS_HH, NS_FF, NS_DD, NS_NULL);
-      neon_check_type (2, rs, N_EQK | N_VFP, N_F_ALL | N_KEY | N_VFP);
+  return ffs (x) - 4;
+}
  
-      if (rs == NS_FF || rs == NS_HH)
-       {
-         NEON_ENCODE (SINGLE, inst);
-         do_vfp_sp_monadic ();
-       }
-      else
-       {
-         NEON_ENCODE (DOUBLE, inst);
-         do_vfp_dp_rd_rm ();
-       }
-    }
-  else
-    {
-      rs = neon_select_shape (NS_HI, NS_FI, NS_DI, NS_NULL);
-      neon_check_type (2, rs, N_F_ALL | N_KEY | N_VFP, N_EQK);
+#define LOW4(R) ((R) & 0xf)
+#define HI1(R) (((R) >> 4) & 1)
+#define LOW1(R) ((R) & 0x1)
+#define HI4(R) (((R) >> 1) & 0xf)
  
-      switch (inst.instruction & 0x0fffffff)
+static unsigned
+mve_get_vcmp_vpt_cond (struct neon_type_el et)
+{
+  switch (et.type)
+    {
+    default:
+      first_error (BAD_EL_TYPE);
+      return 0;
+    case NT_float:
+      switch (inst.operands[0].imm)
         {
-       case N_MNEM_vcmp:
-         inst.instruction += N_MNEM_vcmpz - N_MNEM_vcmp;
-         break;
-       case N_MNEM_vcmpe:
-         inst.instruction += N_MNEM_vcmpez - N_MNEM_vcmpe;
-         break;
         default:
-         abort ();
+         first_error (_("invalid condition"));
+         return 0;
+       case 0x0:
+         /* eq.  */
+         return 0;
+       case 0x1:
+         /* ne.  */
+         return 1;
+       case 0xa:
+         /* ge.  */
+         return 4;
+       case 0xb:
+         /* lt.  */
+         return 5;
+       case 0xc:
+         /* gt.  */
+         return 6;
+       case 0xd:
+         /* le.  */
+         return 7;
         }
-
-      if (rs == NS_FI || rs == NS_HI)
+    case NT_integer:
+      /* Only accept eq and ne.  */
+      if (inst.operands[0].imm > 1)
         {
-         NEON_ENCODE (SINGLE, inst);
-         do_vfp_sp_compare_z ();
+         first_error (_("invalid condition"));
+         return 0;
         }
+      return inst.operands[0].imm;
+    case NT_unsigned:
+      if (inst.operands[0].imm == 0x2)
+       return 2;
+      else if (inst.operands[0].imm == 0x8)
+       return 3;
        else
         {
-         NEON_ENCODE (DOUBLE, inst);
-         do_vfp_dp_rd ();
+         first_error (_("invalid condition"));
+         return 0;
+       }
+    case NT_signed:
+      switch (inst.operands[0].imm)
+       {
+         default:
+           first_error (_("invalid condition"));
+           return 0;
+         case 0xa:
+           /* ge.  */
+           return 4;
+         case 0xb:
+           /* lt.  */
+           return 5;
+         case 0xc:
+           /* gt.  */
+           return 6;
+         case 0xd:
+           /* le.  */
+           return 7;
         }
      }
-  do_vfp_cond_or_thumb ();
-
-  /* ARMv8.2 fp16 instruction.  */
-  if (rs == NS_HI || rs == NS_HH)
-    do_scalar_fp16_v82_encode ();
+  /* Should be unreachable.  */
+  abort ();
  }
  
+/* For VCTP (create vector tail predicate) in MVE.  */
  static void
-nsyn_insert_sp (void)
+do_mve_vctp (void)
  {
-  inst.operands[1] = inst.operands[0];
-  memset (&inst.operands[0], '\0', sizeof (inst.operands[0]));
-  inst.operands[0].reg = REG_SP;
-  inst.operands[0].isreg = 1;
-  inst.operands[0].writeback = 1;
-  inst.operands[0].present = 1;
-}
+  int dt = 0;
+  unsigned size = 0x0;
  
-static void
-do_vfp_nsyn_push (void)
-{
-  nsyn_insert_sp ();
+  if (inst.cond > COND_ALWAYS)
+    inst.pred_insn_type = INSIDE_VPT_INSN;
+  else
+    inst.pred_insn_type = MVE_OUTSIDE_PRED_INSN;
  
-  constraint (inst.operands[1].imm < 1 || inst.operands[1].imm > 16,
-             _("register list must contain at least 1 and at most 16 "
-               "registers"));
+  /* This is a typical MVE instruction which has no type but has size 8, 16,
+     32 or 64.  For instructions with no type, inst.vectype.el[j].type is set
+     to NT_untyped and size is updated in inst.vectype.el[j].size.  */
+  if ((inst.operands[0].present) && (inst.vectype.el[0].type == NT_untyped))
+    dt = inst.vectype.el[0].size;
  
-  if (inst.operands[1].issingle)
-    do_vfp_nsyn_opcode ("fstmdbs");
-  else
-    do_vfp_nsyn_opcode ("fstmdbd");
+  /* Setting this does not indicate an actual NEON instruction, but only
+     indicates that the mnemonic accepts neon-style type suffixes.  */
+  inst.is_neon = 1;
+
+  switch (dt)
+    {
+      case 8:
+       break;
+      case 16:
+       size = 0x1; break;
+      case 32:
+       size = 0x2; break;
+      case 64:
+       size = 0x3; break;
+      default:
+       first_error (_("Type is not allowed for this instruction"));
+    }
+  inst.instruction |= size << 20;
+  inst.instruction |= inst.operands[0].reg << 16;
  }
  
  static void
-do_vfp_nsyn_pop (void)
+do_mve_vpt (void)
  {
-  nsyn_insert_sp ();
+  /* We are dealing with a vector predicated block.  */
+  if (inst.operands[0].present)
+    {
+      enum neon_shape rs = neon_select_shape (NS_IQQ, NS_IQR, NS_NULL);
+      struct neon_type_el et
+       = neon_check_type (3, rs, N_EQK, N_KEY | N_F_MVE | N_I_MVE | N_SU_32,
+                          N_EQK);
  
-  constraint (inst.operands[1].imm < 1 || inst.operands[1].imm > 16,
-             _("register list must contain at least 1 and at most 16 "
-               "registers"));
+      unsigned fcond = mve_get_vcmp_vpt_cond (et);
  
-  if (inst.operands[1].issingle)
-    do_vfp_nsyn_opcode ("fldmias");
-  else
-    do_vfp_nsyn_opcode ("fldmiad");
-}
+      constraint (inst.operands[1].reg > 14, MVE_BAD_QREG);
  
-/* Fix up Neon data-processing instructions, ORing in the correct bits for
-   ARM mode or Thumb mode and moving the encoded bit 24 to bit 28.  */
+      if (et.type == NT_invtype)
+       return;
+
+      if (et.type == NT_float)
+       {
+         constraint (!ARM_CPU_HAS_FEATURE (cpu_variant, mve_fp_ext),
+                     BAD_FPU);
+         constraint (et.size != 16 && et.size != 32, BAD_EL_TYPE);
+         inst.instruction |= (et.size == 16) << 28;
+         inst.instruction |= 0x3 << 20;
+       }
+      else
+       {
+         constraint (et.size != 8 && et.size != 16 && et.size != 32,
+                     BAD_EL_TYPE);
+         inst.instruction |= 1 << 28;
+         inst.instruction |= neon_logbits (et.size) << 20;
+       }
+
+      if (inst.operands[2].isquad)
+       {
+         inst.instruction |= HI1 (inst.operands[2].reg) << 5;
+         inst.instruction |= LOW4 (inst.operands[2].reg);
+         inst.instruction |= (fcond & 0x2) >> 1;
+       }
+      else
+       {
+         if (inst.operands[2].reg == REG_SP)
+           as_tsktsk (MVE_BAD_SP);
+         inst.instruction |= 1 << 6;
+         inst.instruction |= (fcond & 0x2) << 4;
+         inst.instruction |= inst.operands[2].reg;
+       }
+      inst.instruction |= LOW4 (inst.operands[1].reg) << 16;
+      inst.instruction |= (fcond & 0x4) << 10;
+      inst.instruction |= (fcond & 0x1) << 7;
+
+    }
+    set_pred_insn_type (VPT_INSN);
+    now_pred.cc = 0;
+    now_pred.mask = ((inst.instruction & 0x00400000) >> 19)
+                   | ((inst.instruction & 0xe000) >> 13);
+    now_pred.warn_deprecated = FALSE;
+    now_pred.type = VECTOR_PRED;
+    inst.is_neon = 1;
+}
  
  static void
-neon_dp_fixup (struct arm_it* insn)
+do_mve_vcmp (void)
  {
-  unsigned int i = insn->instruction;
-  insn->is_neon = 1;
+  constraint (!ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext), BAD_FPU);
+  if (!inst.operands[1].isreg || !inst.operands[1].isquad)
+    first_error (_(reg_expected_msgs[REG_TYPE_MQ]));
+  if (!inst.operands[2].present)
+    first_error (_("MVE vector or ARM register expected"));
+  constraint (inst.operands[1].reg > 14, MVE_BAD_QREG);
  
-  if (thumb_mode)
+  /* Deal with MVE's 'else' conditional vcmp, which is parsed as vcmpe.  */
+  if ((inst.instruction & 0xffffffff) == N_MNEM_vcmpe
+      && inst.operands[1].isquad)
      {
-      /* The U bit is at bit 24 by default. Move to bit 28 in Thumb mode.  */
-      if (i & (1 << 24))
-       i |= 1 << 28;
-
-      i &= ~(1 << 24);
-
-      i |= 0xef000000;
+      inst.instruction = N_MNEM_vcmp;
+      inst.cond = 0x10;
      }
+
+  if (inst.cond > COND_ALWAYS)
+    inst.pred_insn_type = INSIDE_VPT_INSN;
    else
-    i |= 0xf2000000;
+    inst.pred_insn_type = MVE_OUTSIDE_PRED_INSN;
  
-  insn->instruction = i;
-}
+  enum neon_shape rs = neon_select_shape (NS_IQQ, NS_IQR, NS_NULL);
+  struct neon_type_el et
+    = neon_check_type (3, rs, N_EQK, N_KEY | N_F_MVE | N_I_MVE | N_SU_32,
+                      N_EQK);
  
-/* Turn a size (8, 16, 32, 64) into the respective bit number minus 3
-   (0, 1, 2, 3).  */
+  constraint (rs == NS_IQR && inst.operands[2].reg == REG_PC
+             && !inst.operands[2].iszr, BAD_PC);
  
-static unsigned
-neon_logbits (unsigned x)
-{
-  return ffs (x) - 4;
+  unsigned fcond = mve_get_vcmp_vpt_cond (et);
+
+  inst.instruction = 0xee010f00;
+  inst.instruction |= LOW4 (inst.operands[1].reg) << 16;
+  inst.instruction |= (fcond & 0x4) << 10;
+  inst.instruction |= (fcond & 0x1) << 7;
+  if (et.type == NT_float)
+    {
+      constraint (!ARM_CPU_HAS_FEATURE (cpu_variant, mve_fp_ext),
+                 BAD_FPU);
+      inst.instruction |= (et.size == 16) << 28;
+      inst.instruction |= 0x3 << 20;
+    }
+  else
+    {
+      inst.instruction |= 1 << 28;
+      inst.instruction |= neon_logbits (et.size) << 20;
+    }
+  if (inst.operands[2].isquad)
+    {
+      inst.instruction |= HI1 (inst.operands[2].reg) << 5;
+      inst.instruction |= (fcond & 0x2) >> 1;
+      inst.instruction |= LOW4 (inst.operands[2].reg);
+    }
+  else
+    {
+      if (inst.operands[2].reg == REG_SP)
+       as_tsktsk (MVE_BAD_SP);
+      inst.instruction |= 1 << 6;
+      inst.instruction |= (fcond & 0x2) << 4;
+      inst.instruction |= inst.operands[2].reg;
+    }
+
+  inst.is_neon = 1;
+  return;
  }
  
-#define LOW4(R) ((R) & 0xf)
-#define HI1(R) (((R) >> 4) & 1)
+static void
+do_mve_vmaxa_vmina (void)
+{
+  if (inst.cond > COND_ALWAYS)
+    inst.pred_insn_type = INSIDE_VPT_INSN;
+  else
+    inst.pred_insn_type = MVE_OUTSIDE_PRED_INSN;
+
+  enum neon_shape rs = neon_select_shape (NS_QQ, NS_NULL);
+  struct neon_type_el et
+    = neon_check_type (2, rs, N_EQK, N_KEY | N_S8 | N_S16 | N_S32);
+
+  inst.instruction |= HI1 (inst.operands[0].reg) << 22;
+  inst.instruction |= neon_logbits (et.size) << 18;
+  inst.instruction |= LOW4 (inst.operands[0].reg) << 12;
+  inst.instruction |= HI1 (inst.operands[1].reg) << 5;
+  inst.instruction |= LOW4 (inst.operands[1].reg);
+  inst.is_neon = 1;
+}
  
  static void
-mve_encode_qqr (int size, int fp)
+do_mve_vfmas (void)
  {
+  enum neon_shape rs = neon_select_shape (NS_QQR, NS_NULL);
+  struct neon_type_el et
+    = neon_check_type (3, rs, N_F_MVE | N_KEY, N_EQK, N_EQK);
+
+  if (inst.cond > COND_ALWAYS)
+    inst.pred_insn_type = INSIDE_VPT_INSN;
+  else
+    inst.pred_insn_type = MVE_OUTSIDE_PRED_INSN;
+
    if (inst.operands[2].reg == REG_SP)
      as_tsktsk (MVE_BAD_SP);
    else if (inst.operands[2].reg == REG_PC)
      as_tsktsk (MVE_BAD_PC);
  
-  if (fp)
-    {
-      /* vadd.  */
-      if (((unsigned)inst.instruction) == 0xd00)
-       inst.instruction = 0xee300f40;
-      /* vsub.  */
-      else if (((unsigned)inst.instruction) == 0x200d00)
-       inst.instruction = 0xee301f40;
-
-      /* Setting size which is 1 for F16 and 0 for F32.  */
-      inst.instruction |= (size == 16) << 28;
-    }
-  else
-    {
-      /* vadd.  */
-      if (((unsigned)inst.instruction) == 0x800)
-       inst.instruction = 0xee010f40;
-      /* vsub.  */
-      else if (((unsigned)inst.instruction) == 0x1000800)
-       inst.instruction = 0xee011f40;
-      /* Setting bits for size.  */
-      inst.instruction |= neon_logbits (size) << 20;
-    }
-  inst.instruction |= LOW4 (inst.operands[0].reg) << 12;
+  inst.instruction |= (et.size == 16) << 28;
    inst.instruction |= HI1 (inst.operands[0].reg) << 22;
    inst.instruction |= LOW4 (inst.operands[1].reg) << 16;
+  inst.instruction |= LOW4 (inst.operands[0].reg) << 12;
    inst.instruction |= HI1 (inst.operands[1].reg) << 7;
    inst.instruction |= inst.operands[2].reg;
    inst.is_neon = 1;
  }
  
  static void
-mve_encode_rqq (unsigned bit28, unsigned size)
+do_mve_viddup (void)
  {
-  inst.instruction |= bit28 << 28;
-  inst.instruction |= neon_logbits (size) << 20;
-  inst.instruction |= LOW4 (inst.operands[1].reg) << 16;
-  inst.instruction |= inst.operands[0].reg << 12;
-  inst.instruction |= HI1 (inst.operands[1].reg) << 7;
-  inst.instruction |= HI1 (inst.operands[2].reg) << 5;
-  inst.instruction |= LOW4 (inst.operands[2].reg);
+  if (inst.cond > COND_ALWAYS)
+    inst.pred_insn_type = INSIDE_VPT_INSN;
+  else
+    inst.pred_insn_type = MVE_OUTSIDE_PRED_INSN;
+
+  unsigned imm = inst.relocs[0].exp.X_add_number;
+  constraint (imm != 1 && imm != 2 && imm != 4 && imm != 8,
+             _("immediate must be either 1, 2, 4 or 8"));
+
+  enum neon_shape rs;
+  struct neon_type_el et;
+  unsigned Rm;
+  if (inst.instruction == M_MNEM_vddup || inst.instruction == M_MNEM_vidup)
+    {
+      rs = neon_select_shape (NS_QRI, NS_NULL);
+      et = neon_check_type (2, rs, N_KEY | N_U8 | N_U16 | N_U32, N_EQK);
+      Rm = 7;
+    }
+  else
+    {
+      constraint ((inst.operands[2].reg % 2) != 1, BAD_EVEN);
+      if (inst.operands[2].reg == REG_SP)
+       as_tsktsk (MVE_BAD_SP);
+      else if (inst.operands[2].reg == REG_PC)
+       first_error (BAD_PC);
+
+      rs = neon_select_shape (NS_QRRI, NS_NULL);
+      et = neon_check_type (3, rs, N_KEY | N_U8 | N_U16 | N_U32, N_EQK, N_EQK);
+      Rm = inst.operands[2].reg >> 1;
+    }
+  inst.instruction |= HI1 (inst.operands[0].reg) << 22;
+  inst.instruction |= neon_logbits (et.size) << 20;
+  inst.instruction |= inst.operands[1].reg << 16;
+  inst.instruction |= LOW4 (inst.operands[0].reg) << 12;
+  inst.instruction |= (imm > 2) << 7;
+  inst.instruction |= Rm << 1;
+  inst.instruction |= (imm == 2 || imm == 8);
    inst.is_neon = 1;
  }
  
  static void
-mve_encode_qqq (int ubit, int size)
+do_mve_vmlas (void)
  {
+  enum neon_shape rs = neon_select_shape (NS_QQR, NS_NULL);
+  struct neon_type_el et
+    = neon_check_type (3, rs, N_EQK, N_EQK, N_SU_MVE | N_KEY);
  
-  inst.instruction |= (ubit != 0) << 28;
+  if (inst.operands[2].reg == REG_PC)
+    as_tsktsk (MVE_BAD_PC);
+  else if (inst.operands[2].reg == REG_SP)
+    as_tsktsk (MVE_BAD_SP);
+
+  if (inst.cond > COND_ALWAYS)
+    inst.pred_insn_type = INSIDE_VPT_INSN;
+  else
+    inst.pred_insn_type = MVE_OUTSIDE_PRED_INSN;
+
+  inst.instruction |= (et.type == NT_unsigned) << 28;
    inst.instruction |= HI1 (inst.operands[0].reg) << 22;
-  inst.instruction |= neon_logbits (size) << 20;
+  inst.instruction |= neon_logbits (et.size) << 20;
    inst.instruction |= LOW4 (inst.operands[1].reg) << 16;
    inst.instruction |= LOW4 (inst.operands[0].reg) << 12;
    inst.instruction |= HI1 (inst.operands[1].reg) << 7;
-  inst.instruction |= HI1 (inst.operands[2].reg) << 5;
-  inst.instruction |= LOW4 (inst.operands[2].reg);
-
+  inst.instruction |= inst.operands[2].reg;
    inst.is_neon = 1;
  }
  
+static void
+do_mve_vshll (void)
+{
+  struct neon_type_el et
+    = neon_check_type (2, NS_QQI, N_EQK, N_S8 | N_U8 | N_S16 | N_U16 | N_KEY);
  
-/* Encode insns with bit pattern:
+  if (inst.cond > COND_ALWAYS)
+    inst.pred_insn_type = INSIDE_VPT_INSN;
+  else
+    inst.pred_insn_type = MVE_OUTSIDE_PRED_INSN;
  
-  |28/24|23|22 |21 20|19 16|15 12|11    8|7|6|5|4|3  0|
-  |  U  |x |D  |size | Rn  | Rd  |x x x x|N|Q|M|x| Rm |
+  int imm = inst.operands[2].imm;
+  constraint (imm < 1 || (unsigned)imm > et.size,
+             _("immediate value out of range"));
  
-  SIZE is passed in bits. -1 means size field isn't changed, in case it has a
-  different meaning for some instruction.  */
+  if ((unsigned)imm == et.size)
+    {
+      inst.instruction |= neon_logbits (et.size) << 18;
+      inst.instruction |= 0x110001;
+    }
+  else
+    {
+      inst.instruction |= (et.size + imm) << 16;
+      inst.instruction |= 0x800140;
+    }
  
-static void
-neon_three_same (int isquad, int ubit, int size)
-{
-  inst.instruction |= LOW4 (inst.operands[0].reg) << 12;
+  inst.instruction |= (et.type == NT_unsigned) << 28;
    inst.instruction |= HI1 (inst.operands[0].reg) << 22;
-  inst.instruction |= LOW4 (inst.operands[1].reg) << 16;
-  inst.instruction |= HI1 (inst.operands[1].reg) << 7;
-  inst.instruction |= LOW4 (inst.operands[2].reg);
-  inst.instruction |= HI1 (inst.operands[2].reg) << 5;
-  inst.instruction |= (isquad != 0) << 6;
-  inst.instruction |= (ubit != 0) << 24;
-  if (size != -1)
-    inst.instruction |= neon_logbits (size) << 20;
-
-  neon_dp_fixup (&inst);
+  inst.instruction |= LOW4 (inst.operands[0].reg) << 12;
+  inst.instruction |= HI1 (inst.operands[1].reg) << 5;
+  inst.instruction |= LOW4 (inst.operands[1].reg);
+  inst.is_neon = 1;
  }
  
-/* Encode instructions of the form:
+static void
+do_mve_vshlc (void)
+{
+  if (inst.cond > COND_ALWAYS)
+    inst.pred_insn_type = INSIDE_VPT_INSN;
+  else
+    inst.pred_insn_type = MVE_OUTSIDE_PRED_INSN;
  
-  |28/24|23|22|21 20|19 18|17 16|15 12|11      7|6|5|4|3  0|
-  |  U  |x |D |x  x |size |x  x | Rd  |x x x x x|Q|M|x| Rm |
+  if (inst.operands[1].reg == REG_PC)
+    as_tsktsk (MVE_BAD_PC);
+  else if (inst.operands[1].reg == REG_SP)
+    as_tsktsk (MVE_BAD_SP);
  
-  Don't write size if SIZE == -1.  */
+  int imm = inst.operands[2].imm;
+  constraint (imm < 1 || imm > 32, _("immediate value out of range"));
+
+  inst.instruction |= HI1 (inst.operands[0].reg) << 22;
+  inst.instruction |= (imm & 0x1f) << 16;
+  inst.instruction |= LOW4 (inst.operands[0].reg) << 12;
+  inst.instruction |= inst.operands[1].reg;
+  inst.is_neon = 1;
+}
  
  static void
-neon_two_same (int qbit, int ubit, int size)
+do_mve_vshrn (void)
  {
-  inst.instruction |= LOW4 (inst.operands[0].reg) << 12;
-  inst.instruction |= HI1 (inst.operands[0].reg) << 22;
-  inst.instruction |= LOW4 (inst.operands[1].reg);
-  inst.instruction |= HI1 (inst.operands[1].reg) << 5;
-  inst.instruction |= (qbit != 0) << 6;
-  inst.instruction |= (ubit != 0) << 24;
+  unsigned types;
+  switch (inst.instruction)
+    {
+    case M_MNEM_vshrnt:
+    case M_MNEM_vshrnb:
+    case M_MNEM_vrshrnt:
+    case M_MNEM_vrshrnb:
+      types = N_I16 | N_I32;
+      break;
+    case M_MNEM_vqshrnt:
+    case M_MNEM_vqshrnb:
+    case M_MNEM_vqrshrnt:
+    case M_MNEM_vqrshrnb:
+      types = N_U16 | N_U32 | N_S16 | N_S32;
+      break;
+    case M_MNEM_vqshrunt:
+    case M_MNEM_vqshrunb:
+    case M_MNEM_vqrshrunt:
+    case M_MNEM_vqrshrunb:
+      types = N_S16 | N_S32;
+      break;
+    default:
+      abort ();
+    }
  
-  if (size != -1)
-    inst.instruction |= neon_logbits (size) << 18;
+  struct neon_type_el et = neon_check_type (2, NS_QQI, N_EQK, types | N_KEY);
  
-  neon_dp_fixup (&inst);
-}
+  if (inst.cond > COND_ALWAYS)
+    inst.pred_insn_type = INSIDE_VPT_INSN;
+  else
+    inst.pred_insn_type = MVE_OUTSIDE_PRED_INSN;
  
-/* Neon instruction encoders, in approximate order of appearance.  */
+  unsigned Qd = inst.operands[0].reg;
+  unsigned Qm = inst.operands[1].reg;
+  unsigned imm = inst.operands[2].imm;
+  constraint (imm < 1 || ((unsigned) imm) > (et.size / 2),
+             et.size == 16
+             ? _("immediate operand expected in the range [1,8]")
+             : _("immediate operand expected in the range [1,16]"));
+
+  inst.instruction |= (et.type == NT_unsigned) << 28;
+  inst.instruction |= HI1 (Qd) << 22;
+  inst.instruction |= (et.size - imm) << 16;
+  inst.instruction |= LOW4 (Qd) << 12;
+  inst.instruction |= HI1 (Qm) << 5;
+  inst.instruction |= LOW4 (Qm);
+  inst.is_neon = 1;
+}
  
  static void
-do_neon_dyadic_i_su (void)
+do_mve_vqmovn (void)
  {
-  enum neon_shape rs = neon_select_shape (NS_DDD, NS_QQQ, NS_NULL);
-  struct neon_type_el et = neon_check_type (3, rs,
-    N_EQK, N_EQK, N_SU_32 | N_KEY);
-  neon_three_same (neon_quad (rs), et.type == NT_unsigned, et.size);
+  struct neon_type_el et;
+  if (inst.instruction == M_MNEM_vqmovnt
+     || inst.instruction == M_MNEM_vqmovnb)
+    et = neon_check_type (2, NS_QQ, N_EQK,
+                         N_U16 | N_U32 | N_S16 | N_S32 | N_KEY);
+  else
+    et = neon_check_type (2, NS_QQ, N_EQK, N_S16 | N_S32 | N_KEY);
+
+  if (inst.cond > COND_ALWAYS)
+    inst.pred_insn_type = INSIDE_VPT_INSN;
+  else
+    inst.pred_insn_type = MVE_OUTSIDE_PRED_INSN;
+
+  inst.instruction |= (et.type == NT_unsigned) << 28;
+  inst.instruction |= HI1 (inst.operands[0].reg) << 22;
+  inst.instruction |= (et.size == 32) << 18;
+  inst.instruction |= LOW4 (inst.operands[0].reg) << 12;
+  inst.instruction |= HI1 (inst.operands[1].reg) << 5;
+  inst.instruction |= LOW4 (inst.operands[1].reg);
+  inst.is_neon = 1;
  }
  
  static void
-do_neon_dyadic_i64_su (void)
+do_mve_vpsel (void)
  {
-  enum neon_shape rs = neon_select_shape (NS_DDD, NS_QQQ, NS_NULL);
-  struct neon_type_el et = neon_check_type (3, rs,
-    N_EQK, N_EQK, N_SU_ALL | N_KEY);
-  neon_three_same (neon_quad (rs), et.type == NT_unsigned, et.size);
+  neon_select_shape (NS_QQQ, NS_NULL);
+
+  if (inst.cond > COND_ALWAYS)
+    inst.pred_insn_type = INSIDE_VPT_INSN;
+  else
+    inst.pred_insn_type = MVE_OUTSIDE_PRED_INSN;
+
+  inst.instruction |= HI1 (inst.operands[0].reg) << 22;
+  inst.instruction |= LOW4 (inst.operands[1].reg) << 16;
+  inst.instruction |= LOW4 (inst.operands[0].reg) << 12;
+  inst.instruction |= HI1 (inst.operands[1].reg) << 7;
+  inst.instruction |= HI1 (inst.operands[2].reg) << 5;
+  inst.instruction |= LOW4 (inst.operands[2].reg);
+  inst.is_neon = 1;
  }
  
  static void
-neon_imm_shift (int write_ubit, int uval, int isquad, struct neon_type_el et,
-               unsigned immbits)
+do_mve_vpnot (void)
  {
-  unsigned size = et.size >> 3;
-  inst.instruction |= LOW4 (inst.operands[0].reg) << 12;
-  inst.instruction |= HI1 (inst.operands[0].reg) << 22;
-  inst.instruction |= LOW4 (inst.operands[1].reg);
-  inst.instruction |= HI1 (inst.operands[1].reg) << 5;
-  inst.instruction |= (isquad != 0) << 6;
-  inst.instruction |= immbits << 16;
-  inst.instruction |= (size >> 3) << 7;
-  inst.instruction |= (size & 0x7) << 19;
-  if (write_ubit)
-    inst.instruction |= (uval != 0) << 24;
-
-  neon_dp_fixup (&inst);
+  if (inst.cond > COND_ALWAYS)
+    inst.pred_insn_type = INSIDE_VPT_INSN;
+  else
+    inst.pred_insn_type = MVE_OUTSIDE_PRED_INSN;
  }
  
  static void
-do_neon_shl_imm (void)
+do_mve_vmaxnma_vminnma (void)
  {
-  if (!inst.operands[2].isreg)
-    {
-      enum neon_shape rs = neon_select_shape (NS_DDI, NS_QQI, NS_NULL);
-      struct neon_type_el et = neon_check_type (2, rs, N_EQK, N_KEY | N_I_ALL);
-      int imm = inst.operands[2].imm;
+  enum neon_shape rs = neon_select_shape (NS_QQ, NS_NULL);
+  struct neon_type_el et
+    = neon_check_type (2, rs, N_EQK, N_F_MVE | N_KEY);
  
-      constraint (imm < 0 || (unsigned)imm >= et.size,
-                 _("immediate out of range for shift"));
-      NEON_ENCODE (IMMED, inst);
-      neon_imm_shift (FALSE, 0, neon_quad (rs), et, imm);
-    }
+  if (inst.cond > COND_ALWAYS)
+    inst.pred_insn_type = INSIDE_VPT_INSN;
    else
-    {
-      enum neon_shape rs = neon_select_shape (NS_DDD, NS_QQQ, NS_NULL);
-      struct neon_type_el et = neon_check_type (3, rs,
-       N_EQK, N_SU_ALL | N_KEY, N_EQK | N_SGN);
-      unsigned int tmp;
-
-      /* VSHL/VQSHL 3-register variants have syntax such as:
-          vshl.xx Dd, Dm, Dn
-        whereas other 3-register operations encoded by neon_three_same have
-        syntax like:
-          vadd.xx Dd, Dn, Dm
-        (i.e. with Dn & Dm reversed). Swap operands[1].reg and operands[2].reg
-        here.  */
-      tmp = inst.operands[2].reg;
-      inst.operands[2].reg = inst.operands[1].reg;
-      inst.operands[1].reg = tmp;
-      NEON_ENCODE (INTEGER, inst);
-      neon_three_same (neon_quad (rs), et.type == NT_unsigned, et.size);
-    }
+    inst.pred_insn_type = MVE_OUTSIDE_PRED_INSN;
+
+  inst.instruction |= (et.size == 16) << 28;
+  inst.instruction |= HI1 (inst.operands[0].reg) << 22;
+  inst.instruction |= LOW4 (inst.operands[0].reg) << 12;
+  inst.instruction |= HI1 (inst.operands[1].reg) << 5;
+  inst.instruction |= LOW4 (inst.operands[1].reg);
+  inst.is_neon = 1;
  }
  
  static void
-do_neon_qshl_imm (void)
+do_mve_vcmul (void)
  {
-  if (!inst.operands[2].isreg)
-    {
-      enum neon_shape rs = neon_select_shape (NS_DDI, NS_QQI, NS_NULL);
-      struct neon_type_el et = neon_check_type (2, rs, N_EQK, N_SU_ALL | N_KEY);
-      int imm = inst.operands[2].imm;
+  enum neon_shape rs = neon_select_shape (NS_QQQI, NS_NULL);
+  struct neon_type_el et
+    = neon_check_type (3, rs, N_EQK, N_EQK, N_F_MVE | N_KEY);
  
-      constraint (imm < 0 || (unsigned)imm >= et.size,
-                 _("immediate out of range for shift"));
-      NEON_ENCODE (IMMED, inst);
-      neon_imm_shift (TRUE, et.type == NT_unsigned, neon_quad (rs), et, imm);
-    }
+  if (inst.cond > COND_ALWAYS)
+    inst.pred_insn_type = INSIDE_VPT_INSN;
    else
-    {
-      enum neon_shape rs = neon_select_shape (NS_DDD, NS_QQQ, NS_NULL);
-      struct neon_type_el et = neon_check_type (3, rs,
-       N_EQK, N_SU_ALL | N_KEY, N_EQK | N_SGN);
-      unsigned int tmp;
+    inst.pred_insn_type = MVE_OUTSIDE_PRED_INSN;
  
-      /* See note in do_neon_shl_imm.  */
-      tmp = inst.operands[2].reg;
-      inst.operands[2].reg = inst.operands[1].reg;
-      inst.operands[1].reg = tmp;
-      NEON_ENCODE (INTEGER, inst);
-      neon_three_same (neon_quad (rs), et.type == NT_unsigned, et.size);
-    }
+  unsigned rot = inst.relocs[0].exp.X_add_number;
+  constraint (rot != 0 && rot != 90 && rot != 180 && rot != 270,
+             _("immediate out of range"));
+
+  if (et.size == 32 && (inst.operands[0].reg == inst.operands[1].reg
+                       || inst.operands[0].reg == inst.operands[2].reg))
+    as_tsktsk (BAD_MVE_SRCDEST);
+
+  inst.instruction |= (et.size == 32) << 28;
+  inst.instruction |= HI1 (inst.operands[0].reg) << 22;
+  inst.instruction |= LOW4 (inst.operands[1].reg) << 16;
+  inst.instruction |= LOW4 (inst.operands[0].reg) << 12;
+  inst.instruction |= (rot > 90) << 12;
+  inst.instruction |= HI1 (inst.operands[1].reg) << 7;
+  inst.instruction |= HI1 (inst.operands[2].reg) << 5;
+  inst.instruction |= LOW4 (inst.operands[2].reg);
+  inst.instruction |= (rot == 90 || rot == 270);
+  inst.is_neon = 1;
  }
  
+/* To handle the Low Overhead Loop instructions
+   in Armv8.1-M Mainline and MVE.  */
  static void
-do_neon_rshl (void)
+do_t_loloop (void)
  {
-  enum neon_shape rs = neon_select_shape (NS_DDD, NS_QQQ, NS_NULL);
-  struct neon_type_el et = neon_check_type (3, rs,
-    N_EQK, N_EQK, N_SU_ALL | N_KEY);
-  unsigned int tmp;
+  unsigned long insn = inst.instruction;
  
-  tmp = inst.operands[2].reg;
-  inst.operands[2].reg = inst.operands[1].reg;
-  inst.operands[1].reg = tmp;
-  neon_three_same (neon_quad (rs), et.type == NT_unsigned, et.size);
-}
+  inst.instruction = THUMB_OP32 (inst.instruction);
  
-static int
-neon_cmode_for_logic_imm (unsigned immediate, unsigned *immbits, int size)
-{
-  /* Handle .I8 pseudo-instructions.  */
-  if (size == 8)
-    {
-      /* Unfortunately, this will make everything apart from zero out-of-range.
-        FIXME is this the intended semantics? There doesn't seem much point in
-        accepting .I8 if so.  */
-      immediate |= immediate << 8;
-      size = 16;
-    }
+  if (insn == T_MNEM_lctp)
+    return;
  
-  if (size >= 32)
-    {
-      if (immediate == (immediate & 0x000000ff))
-       {
-         *immbits = immediate;
-         return 0x1;
-       }
-      else if (immediate == (immediate & 0x0000ff00))
-       {
-         *immbits = immediate >> 8;
-         return 0x3;
-       }
-      else if (immediate == (immediate & 0x00ff0000))
-       {
-         *immbits = immediate >> 16;
-         return 0x5;
-       }
-      else if (immediate == (immediate & 0xff000000))
-       {
-         *immbits = immediate >> 24;
-         return 0x7;
-       }
-      if ((immediate & 0xffff) != (immediate >> 16))
-       goto bad_immediate;
-      immediate &= 0xffff;
-    }
+  set_pred_insn_type (MVE_OUTSIDE_PRED_INSN);
  
-  if (immediate == (immediate & 0x000000ff))
+  if (insn == T_MNEM_wlstp || insn == T_MNEM_dlstp)
      {
-      *immbits = immediate;
-      return 0x9;
+      struct neon_type_el et
+       = neon_check_type (2, NS_RR, N_EQK, N_8 | N_16 | N_32 | N_64 | N_KEY);
+      inst.instruction |= neon_logbits (et.size) << 20;
+      inst.is_neon = 1;
      }
-  else if (immediate == (immediate & 0x0000ff00))
+
+  switch (insn)
      {
-      *immbits = immediate >> 8;
-      return 0xb;
-    }
+    case T_MNEM_letp:
+      constraint (!inst.operands[0].present,
+                 _("expected LR"));
+      /* fall through.  */
+    case T_MNEM_le:
+      /* le <label>.  */
+      if (!inst.operands[0].present)
+       inst.instruction |= 1 << 21;
  
-  bad_immediate:
-  first_error (_("immediate value out of range"));
-  return FAIL;
+      v8_1_loop_reloc (TRUE);
+      break;
+
+    case T_MNEM_wls:
+    case T_MNEM_wlstp:
+      v8_1_loop_reloc (FALSE);
+      /* fall through.  */
+    case T_MNEM_dlstp:
+    case T_MNEM_dls:
+      constraint (inst.operands[1].isreg != 1, BAD_ARGS);
+
+      if (insn == T_MNEM_wlstp || insn == T_MNEM_dlstp)
+       constraint (inst.operands[1].reg == REG_PC, BAD_PC);
+      else if (inst.operands[1].reg == REG_PC)
+       as_tsktsk (MVE_BAD_PC);
+      if (inst.operands[1].reg == REG_SP)
+       as_tsktsk (MVE_BAD_SP);
+
+      inst.instruction |= (inst.operands[1].reg << 16);
+      break;
+
+    default:
+      abort ();
+    }
  }
  
+
  static void
-do_neon_logic (void)
+do_vfp_nsyn_cmp (void)
  {
-  if (inst.operands[2].present && inst.operands[2].isreg)
+  enum neon_shape rs;
+  if (!inst.operands[0].isreg)
      {
-      enum neon_shape rs = neon_select_shape (NS_DDD, NS_QQQ, NS_NULL);
-      neon_check_type (3, rs, N_IGNORE_TYPE);
-      /* U bit and size field were set as part of the bitmask.  */
-      NEON_ENCODE (INTEGER, inst);
-      neon_three_same (neon_quad (rs), 0, -1);
+      do_mve_vcmp ();
+      return;
      }
    else
      {
-      const int three_ops_form = (inst.operands[2].present
-                                 && !inst.operands[2].isreg);
-      const int immoperand = (three_ops_form ? 2 : 1);
-      enum neon_shape rs = (three_ops_form
-                           ? neon_select_shape (NS_DDI, NS_QQI, NS_NULL)
-                           : neon_select_shape (NS_DI, NS_QI, NS_NULL));
-      struct neon_type_el et = neon_check_type (2, rs,
-       N_I8 | N_I16 | N_I32 | N_I64 | N_F32 | N_KEY, N_EQK);
-      enum neon_opc opcode = (enum neon_opc) inst.instruction & 0x0fffffff;
-      unsigned immbits;
-      int cmode;
-
-      if (et.type == NT_invtype)
-       return;
-
-      if (three_ops_form)
-       constraint (inst.operands[0].reg != inst.operands[1].reg,
-                   _("first and second operands shall be the same register"));
+      constraint (inst.operands[2].present, BAD_SYNTAX);
+      constraint (!ARM_CPU_HAS_FEATURE (cpu_variant, fpu_vfp_ext_v1xd),
+                 BAD_FPU);
+    }
  
-      NEON_ENCODE (IMMED, inst);
+  if (inst.operands[1].isreg)
+    {
+      rs = neon_select_shape (NS_HH, NS_FF, NS_DD, NS_NULL);
+      neon_check_type (2, rs, N_EQK | N_VFP, N_F_ALL | N_KEY | N_VFP);
  
-      immbits = inst.operands[immoperand].imm;
-      if (et.size == 64)
+      if (rs == NS_FF || rs == NS_HH)
         {
-         /* .i64 is a pseudo-op, so the immediate must be a repeating
-            pattern.  */
-         if (immbits != (inst.operands[immoperand].regisimm ?
-                         inst.operands[immoperand].reg : 0))
-           {
-             /* Set immbits to an invalid constant.  */
-             immbits = 0xdeadbeef;
-           }
+         NEON_ENCODE (SINGLE, inst);
+         do_vfp_sp_monadic ();
         }
-
-      switch (opcode)
+      else
         {
-       case N_MNEM_vbic:
-         cmode = neon_cmode_for_logic_imm (immbits, &immbits, et.size);
-         break;
-
-       case N_MNEM_vorr:
-         cmode = neon_cmode_for_logic_imm (immbits, &immbits, et.size);
-         break;
+         NEON_ENCODE (DOUBLE, inst);
+         do_vfp_dp_rd_rm ();
+       }
+    }
+  else
+    {
+      rs = neon_select_shape (NS_HI, NS_FI, NS_DI, NS_NULL);
+      neon_check_type (2, rs, N_F_ALL | N_KEY | N_VFP, N_EQK);
  
-       case N_MNEM_vand:
-         /* Pseudo-instruction for VBIC.  */
-         neon_invert_size (&immbits, 0, et.size);
-         cmode = neon_cmode_for_logic_imm (immbits, &immbits, et.size);
+      switch (inst.instruction & 0x0fffffff)
+       {
+       case N_MNEM_vcmp:
+         inst.instruction += N_MNEM_vcmpz - N_MNEM_vcmp;
           break;
-
-       case N_MNEM_vorn:
-         /* Pseudo-instruction for VORR.  */
-         neon_invert_size (&immbits, 0, et.size);
-         cmode = neon_cmode_for_logic_imm (immbits, &immbits, et.size);
+       case N_MNEM_vcmpe:
+         inst.instruction += N_MNEM_vcmpez - N_MNEM_vcmpe;
           break;
-
         default:
           abort ();
         }
  
-      if (cmode == FAIL)
-       return;
-
-      inst.instruction |= neon_quad (rs) << 6;
-      inst.instruction |= LOW4 (inst.operands[0].reg) << 12;
-      inst.instruction |= HI1 (inst.operands[0].reg) << 22;
-      inst.instruction |= cmode << 8;
-      neon_write_immbits (immbits);
-
-      neon_dp_fixup (&inst);
+      if (rs == NS_FI || rs == NS_HI)
+       {
+         NEON_ENCODE (SINGLE, inst);
+         do_vfp_sp_compare_z ();
+       }
+      else
+       {
+         NEON_ENCODE (DOUBLE, inst);
+         do_vfp_dp_rd ();
+       }
      }
-}
+  do_vfp_cond_or_thumb ();
  
-static void
-do_neon_bitfield (void)
-{
-  enum neon_shape rs = neon_select_shape (NS_DDD, NS_QQQ, NS_NULL);
-  neon_check_type (3, rs, N_IGNORE_TYPE);
-  neon_three_same (neon_quad (rs), 0, -1);
+  /* ARMv8.2 fp16 instruction.  */
+  if (rs == NS_HI || rs == NS_HH)
+    do_scalar_fp16_v82_encode ();
  }
  
  static void
-neon_dyadic_misc (enum neon_el_type ubit_meaning, unsigned types,
-                 unsigned destbits)
+nsyn_insert_sp (void)
  {
-  enum neon_shape rs = neon_select_shape (NS_DDD, NS_QQQ, NS_QQR, NS_NULL);
-  struct neon_type_el et = neon_check_type (3, rs, N_EQK | destbits, N_EQK,
-                                           types | N_KEY);
-  if (et.type == NT_float)
+  inst.operands[1] = inst.operands[0];
+  memset (&inst.operands[0], '\0', sizeof (inst.operands[0]));
+  inst.operands[0].reg = REG_SP;
+  inst.operands[0].isreg = 1;
+  inst.operands[0].writeback = 1;
+  inst.operands[0].present = 1;
+}
+
+/* Fix up Neon data-processing instructions, ORing in the correct bits for
+   ARM mode or Thumb mode and moving the encoded bit 24 to bit 28.  */
+
+static void
+neon_dp_fixup (struct arm_it* insn)
+{
+  unsigned int i = insn->instruction;
+  insn->is_neon = 1;
+
+  if (thumb_mode)
      {
-      NEON_ENCODE (FLOAT, inst);
-      if (rs == NS_QQR)
-       mve_encode_qqr (et.size, 1);
-      else
-       neon_three_same (neon_quad (rs), 0, et.size == 16 ? (int) et.size : -1);
+      /* The U bit is at bit 24 by default. Move to bit 28 in Thumb mode.  */
+      if (i & (1 << 24))
+       i |= 1 << 28;
+
+      i &= ~(1 << 24);
+
+      i |= 0xef000000;
      }
    else
+    i |= 0xf2000000;
+
+  insn->instruction = i;
+}
+
+static void
+mve_encode_qqr (int size, int U, int fp)
+{
+  if (inst.operands[2].reg == REG_SP)
+    as_tsktsk (MVE_BAD_SP);
+  else if (inst.operands[2].reg == REG_PC)
+    as_tsktsk (MVE_BAD_PC);
+
+  if (fp)
      {
-      NEON_ENCODE (INTEGER, inst);
-      if (rs == NS_QQR)
-       mve_encode_qqr (et.size, 0);
-      else
-       neon_three_same (neon_quad (rs), et.type == ubit_meaning, et.size);
+      /* vadd.  */
+      if (((unsigned)inst.instruction) == 0xd00)
+       inst.instruction = 0xee300f40;
+      /* vsub.  */
+      else if (((unsigned)inst.instruction) == 0x200d00)
+       inst.instruction = 0xee301f40;
+      /* vmul.  */
+      else if (((unsigned)inst.instruction) == 0x1000d10)
+       inst.instruction = 0xee310e60;
+
+      /* Setting size which is 1 for F16 and 0 for F32.  */
+      inst.instruction |= (size == 16) << 28;
+    }
+  else
+    {
+      /* vadd.  */
+      if (((unsigned)inst.instruction) == 0x800)
+       inst.instruction = 0xee010f40;
+      /* vsub.  */
+      else if (((unsigned)inst.instruction) == 0x1000800)
+       inst.instruction = 0xee011f40;
+      /* vhadd.  */
+      else if (((unsigned)inst.instruction) == 0)
+       inst.instruction = 0xee000f40;
+      /* vhsub.  */
+      else if (((unsigned)inst.instruction) == 0x200)
+       inst.instruction = 0xee001f40;
+      /* vmla.  */
+      else if (((unsigned)inst.instruction) == 0x900)
+       inst.instruction = 0xee010e40;
+      /* vmul.  */
+      else if (((unsigned)inst.instruction) == 0x910)
+       inst.instruction = 0xee011e60;
+      /* vqadd.  */
+      else if (((unsigned)inst.instruction) == 0x10)
+       inst.instruction = 0xee000f60;
+      /* vqsub.  */
+      else if (((unsigned)inst.instruction) == 0x210)
+       inst.instruction = 0xee001f60;
+      /* vqrdmlah.  */
+      else if (((unsigned)inst.instruction) == 0x3000b10)
+       inst.instruction = 0xee000e40;
+      /* vqdmulh.  */
+      else if (((unsigned)inst.instruction) == 0x0000b00)
+       inst.instruction = 0xee010e60;
+      /* vqrdmulh.  */
+      else if (((unsigned)inst.instruction) == 0x1000b00)
+       inst.instruction = 0xfe010e60;
+
+      /* Set U-bit.  */
+      inst.instruction |= U << 28;
+
+      /* Setting bits for size.  */
+      inst.instruction |= neon_logbits (size) << 20;
      }
+  inst.instruction |= LOW4 (inst.operands[0].reg) << 12;
+  inst.instruction |= HI1 (inst.operands[0].reg) << 22;
+  inst.instruction |= LOW4 (inst.operands[1].reg) << 16;
+  inst.instruction |= HI1 (inst.operands[1].reg) << 7;
+  inst.instruction |= inst.operands[2].reg;
+  inst.is_neon = 1;
  }
  
+static void
+mve_encode_rqq (unsigned bit28, unsigned size)
+{
+  inst.instruction |= bit28 << 28;
+  inst.instruction |= neon_logbits (size) << 20;
+  inst.instruction |= LOW4 (inst.operands[1].reg) << 16;
+  inst.instruction |= inst.operands[0].reg << 12;
+  inst.instruction |= HI1 (inst.operands[1].reg) << 7;
+  inst.instruction |= HI1 (inst.operands[2].reg) << 5;
+  inst.instruction |= LOW4 (inst.operands[2].reg);
+  inst.is_neon = 1;
+}
  
  static void
-do_neon_dyadic_if_su_d (void)
+mve_encode_qqq (int ubit, int size)
  {
-  /* This version only allow D registers, but that constraint is enforced during
-     operand parsing so we don't need to do anything extra here.  */
-  neon_dyadic_misc (NT_unsigned, N_SUF_32, 0);
+
+  inst.instruction |= (ubit != 0) << 28;
+  inst.instruction |= HI1 (inst.operands[0].reg) << 22;
+  inst.instruction |= neon_logbits (size) << 20;
+  inst.instruction |= LOW4 (inst.operands[1].reg) << 16;
+  inst.instruction |= LOW4 (inst.operands[0].reg) << 12;
+  inst.instruction |= HI1 (inst.operands[1].reg) << 7;
+  inst.instruction |= HI1 (inst.operands[2].reg) << 5;
+  inst.instruction |= LOW4 (inst.operands[2].reg);
+
+  inst.is_neon = 1;
  }
  
  static void
-do_neon_dyadic_if_i_d (void)
+mve_encode_rq (unsigned bit28, unsigned size)
  {
-  /* The "untyped" case can't happen. Do this to stop the "U" bit being
-     affected if we specify unsigned args.  */
-  neon_dyadic_misc (NT_untyped, N_IF_32, 0);
+  inst.instruction |= bit28 << 28;
+  inst.instruction |= neon_logbits (size) << 18;
+  inst.instruction |= inst.operands[0].reg << 12;
+  inst.instruction |= LOW4 (inst.operands[1].reg);
+  inst.is_neon = 1;
+}
+
+static void
+mve_encode_rrqq (unsigned U, unsigned size)
+{
+  constraint (inst.operands[3].reg > 14, MVE_BAD_QREG);
+
+  inst.instruction |= U << 28;
+  inst.instruction |= (inst.operands[1].reg >> 1) << 20;
+  inst.instruction |= LOW4 (inst.operands[2].reg) << 16;
+  inst.instruction |= (size == 32) << 16;
+  inst.instruction |= inst.operands[0].reg << 12;
+  inst.instruction |= HI1 (inst.operands[2].reg) << 7;
+  inst.instruction |= inst.operands[3].reg;
+  inst.is_neon = 1;
+}
+
+/* Helper function for neon_three_same handling the operands.  */
+static void
+neon_three_args (int isquad)
+{
+  inst.instruction |= LOW4 (inst.operands[0].reg) << 12;
+  inst.instruction |= HI1 (inst.operands[0].reg) << 22;
+  inst.instruction |= LOW4 (inst.operands[1].reg) << 16;
+  inst.instruction |= HI1 (inst.operands[1].reg) << 7;
+  inst.instruction |= LOW4 (inst.operands[2].reg);
+  inst.instruction |= HI1 (inst.operands[2].reg) << 5;
+  inst.instruction |= (isquad != 0) << 6;
+  inst.is_neon = 1;
+}
+
+/* Encode insns with bit pattern:
+
+  |28/24|23|22 |21 20|19 16|15 12|11    8|7|6|5|4|3  0|
+  |  U  |x |D  |size | Rn  | Rd  |x x x x|N|Q|M|x| Rm |
+
+  SIZE is passed in bits. -1 means size field isn't changed, in case it has a
+  different meaning for some instruction.  */
+
+static void
+neon_three_same (int isquad, int ubit, int size)
+{
+  neon_three_args (isquad);
+  inst.instruction |= (ubit != 0) << 24;
+  if (size != -1)
+    inst.instruction |= neon_logbits (size) << 20;
+
+  neon_dp_fixup (&inst);
+}
+
+/* Encode instructions of the form:
+
+  |28/24|23|22|21 20|19 18|17 16|15 12|11      7|6|5|4|3  0|
+  |  U  |x |D |x  x |size |x  x | Rd  |x x x x x|Q|M|x| Rm |
+
+  Don't write size if SIZE == -1.  */
+
+static void
+neon_two_same (int qbit, int ubit, int size)
+{
+  inst.instruction |= LOW4 (inst.operands[0].reg) << 12;
+  inst.instruction |= HI1 (inst.operands[0].reg) << 22;
+  inst.instruction |= LOW4 (inst.operands[1].reg);
+  inst.instruction |= HI1 (inst.operands[1].reg) << 5;
+  inst.instruction |= (qbit != 0) << 6;
+  inst.instruction |= (ubit != 0) << 24;
+
+  if (size != -1)
+    inst.instruction |= neon_logbits (size) << 18;
+
+  neon_dp_fixup (&inst);
  }
  
  enum vfp_or_neon_is_neon_bits
  {
-  NEON_CHECK_CC = 1,
-  NEON_CHECK_ARCH = 2,
-  NEON_CHECK_ARCH8 = 4
+NEON_CHECK_CC = 1,
+NEON_CHECK_ARCH = 2,
+NEON_CHECK_ARCH8 = 4
  };
  
  /* Call this function if an instruction which may have belonged to the VFP or
-   Neon instruction sets, but turned out to be a Neon instruction (due to the
-   operand types involved, etc.). We have to check and/or fix-up a couple of
-   things:
+ Neon instruction sets, but turned out to be a Neon instruction (due to the
+ operand types involved, etc.). We have to check and/or fix-up a couple of
+ things:
  
-     - Make sure the user hasn't attempted to make a Neon instruction
-       conditional.
-     - Alter the value in the condition code field if necessary.
-     - Make sure that the arch supports Neon instructions.
+   - Make sure the user hasn't attempted to make a Neon instruction
+     conditional.
+   - Alter the value in the condition code field if necessary.
+   - Make sure that the arch supports Neon instructions.
  
-   Which of these operations take place depends on bits from enum
-   vfp_or_neon_is_neon_bits.
+ Which of these operations take place depends on bits from enum
+ vfp_or_neon_is_neon_bits.
  
-   WARNING: This function has side effects! If NEON_CHECK_CC is used and the
-   current instruction's condition is COND_ALWAYS, the condition field is
-   changed to inst.uncond_value. This is necessary because instructions shared
-   between VFP and Neon may be conditional for the VFP variants only, and the
-   unconditional Neon version must have, e.g., 0xF in the condition field.  */
+ WARNING: This function has side effects! If NEON_CHECK_CC is used and the
+ current instruction's condition is COND_ALWAYS, the condition field is
+ changed to inst.uncond_value.  This is necessary because instructions shared
+ between VFP and Neon may be conditional for the VFP variants only, and the
+ unconditional Neon version must have, e.g., 0xF in the condition field.  */
  
  static int
  vfp_or_neon_is_neon (unsigned check)
  {
-  /* Conditions are always legal in Thumb mode (IT blocks).  */
-  if (!thumb_mode && (check & NEON_CHECK_CC))
+/* Conditions are always legal in Thumb mode (IT blocks).  */
+if (!thumb_mode && (check & NEON_CHECK_CC))
+  {
+    if (inst.cond != COND_ALWAYS)
+      {
+       first_error (_(BAD_COND));
+       return FAIL;
+      }
+    if (inst.uncond_value != -1)
+      inst.instruction |= inst.uncond_value << 28;
+  }
+
+
+  if (((check & NEON_CHECK_ARCH) && !mark_feature_used (&fpu_neon_ext_v1))
+      || ((check & NEON_CHECK_ARCH8)
+         && !mark_feature_used (&fpu_neon_ext_armv8)))
      {
-      if (inst.cond != COND_ALWAYS)
-       {
-         first_error (_(BAD_COND));
-         return FAIL;
-       }
-      if (inst.uncond_value != -1)
-       inst.instruction |= inst.uncond_value << 28;
+      first_error (_(BAD_FPU));
+      return FAIL;
      }
  
+return SUCCESS;
+}
+
  
-    if (((check & NEON_CHECK_ARCH) && !mark_feature_used (&fpu_neon_ext_v1))
-       || ((check & NEON_CHECK_ARCH8)
-           && !mark_feature_used (&fpu_neon_ext_armv8)))
+/* Return TRUE if the SIMD instruction is available for the current
+   cpu_variant.  FP is set to TRUE if this is a SIMD floating-point
+   instruction.  CHECK contains the set of bits to pass to
+   vfp_or_neon_is_neon for the NEON specific checks.  */
+
+static bfd_boolean
+check_simd_pred_availability (int fp, unsigned check)
+{
+if (inst.cond > COND_ALWAYS)
+  {
+    if (!ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext))
        {
-       first_error (_(BAD_FPU));
-       return FAIL;
+       inst.error = BAD_FPU;
+       return FALSE;
        }
+    inst.pred_insn_type = INSIDE_VPT_INSN;
+  }
+else if (inst.cond < COND_ALWAYS)
+  {
+    if (ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext))
+      inst.pred_insn_type = MVE_OUTSIDE_PRED_INSN;
+    else if (vfp_or_neon_is_neon (check) == FAIL)
+      return FALSE;
+  }
+else
+  {
+    if (!ARM_CPU_HAS_FEATURE (cpu_variant, fp ? mve_fp_ext : mve_ext)
+       && vfp_or_neon_is_neon (check) == FAIL)
+      return FALSE;
  
-  return SUCCESS;
+    if (ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext))
+      inst.pred_insn_type = MVE_OUTSIDE_PRED_INSN;
+  }
+return TRUE;
  }
  
-static int
-check_simd_pred_availability (int fp, unsigned check)
+/* Neon instruction encoders, in approximate order of appearance.  */
+
+static void
+do_neon_dyadic_i_su (void)
  {
-  if (inst.cond > COND_ALWAYS)
-    {
-      if (!ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext))
-       {
-         inst.error = BAD_FPU;
-         return 1;
-       }
-      inst.pred_insn_type = INSIDE_VPT_INSN;
-    }
-  else if (inst.cond < COND_ALWAYS)
+  if (!check_simd_pred_availability (FALSE, NEON_CHECK_ARCH | NEON_CHECK_CC))
+   return;
+
+  enum neon_shape rs;
+  struct neon_type_el et;
+  if (ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext))
+    rs = neon_select_shape (NS_QQQ, NS_QQR, NS_NULL);
+  else
+    rs = neon_select_shape (NS_DDD, NS_QQQ, NS_NULL);
+
+  et = neon_check_type (3, rs, N_EQK, N_EQK, N_SU_32 | N_KEY);
+
+
+  if (rs != NS_QQR)
+    neon_three_same (neon_quad (rs), et.type == NT_unsigned, et.size);
+  else
+    mve_encode_qqr (et.size, et.type == NT_unsigned, 0);
+}
+
+static void
+do_neon_dyadic_i64_su (void)
+{
+  if (!check_simd_pred_availability (FALSE, NEON_CHECK_CC | NEON_CHECK_ARCH))
+    return;
+  enum neon_shape rs;
+  struct neon_type_el et;
+  if (ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext))
      {
-      if (ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext))
-       inst.pred_insn_type = MVE_OUTSIDE_PRED_INSN;
-      else if (vfp_or_neon_is_neon (check) == FAIL)
-       return 2;
+      rs = neon_select_shape (NS_QQR, NS_QQQ, NS_NULL);
+      et = neon_check_type (3, rs, N_EQK, N_EQK, N_SU_MVE | N_KEY);
      }
    else
      {
-      if (!ARM_CPU_HAS_FEATURE (cpu_variant, fp ? mve_fp_ext : mve_ext)
-         && vfp_or_neon_is_neon (check) == FAIL)
-       return 3;
-
-      if (ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext))
-       inst.pred_insn_type = MVE_OUTSIDE_PRED_INSN;
+      rs = neon_select_shape (NS_DDD, NS_QQQ, NS_NULL);
+      et = neon_check_type (3, rs, N_EQK, N_EQK, N_SU_ALL | N_KEY);
      }
-  return 0;
+  if (rs == NS_QQR)
+    mve_encode_qqr (et.size, et.type == NT_unsigned, 0);
+  else
+    neon_three_same (neon_quad (rs), et.type == NT_unsigned, et.size);
  }
  
  static void
-do_mve_vstr_vldr_QI (int size, int elsize, int load)
+neon_imm_shift (int write_ubit, int uval, int isquad, struct neon_type_el et,
+               unsigned immbits)
  {
-  constraint (size < 32, BAD_ADDR_MODE);
-  constraint (size != elsize, BAD_EL_TYPE);
-  constraint (inst.operands[1].immisreg, BAD_ADDR_MODE);
-  constraint (!inst.operands[1].preind, BAD_ADDR_MODE);
-  constraint (load && inst.operands[0].reg == inst.operands[1].reg,
-             _("destination register and offset register may not be the"
-               " same"));
-
-  int imm = inst.relocs[0].exp.X_add_number;
-  int add = 1;
-  if (imm < 0)
-    {
-      add = 0;
-      imm = -imm;
-    }
-  constraint ((imm % (size / 8) != 0)
-             || imm > (0x7f << neon_logbits (size)),
-             (size == 32) ? _("immediate must be a multiple of 4 in the"
-                              " range of +/-[0,508]")
-                          : _("immediate must be a multiple of 8 in the"
-                              " range of +/-[0,1016]"));
-  inst.instruction |= 0x11 << 24;
-  inst.instruction |= add << 23;
-  inst.instruction |= HI1 (inst.operands[0].reg) << 22;
-  inst.instruction |= inst.operands[1].writeback << 21;
-  inst.instruction |= LOW4 (inst.operands[1].reg) << 16;
+  unsigned size = et.size >> 3;
    inst.instruction |= LOW4 (inst.operands[0].reg) << 12;
-  inst.instruction |= 1 << 12;
-  inst.instruction |= (size == 64) << 8;
-  inst.instruction &= 0xffffff00;
-  inst.instruction |= HI1 (inst.operands[1].reg) << 7;
-  inst.instruction |= imm >> neon_logbits (size);
-}
-
-static void
-do_mve_vstr_vldr_RQ (int size, int elsize, int load)
-{
-    unsigned os = inst.operands[1].imm >> 5;
-    constraint (os != 0 && size == 8,
-               _("can not shift offsets when accessing less than half-word"));
-    constraint (os && os != neon_logbits (size),
-               _("shift immediate must be 1, 2 or 3 for half-word, word"
-                 " or double-word accesses respectively"));
-    if (inst.operands[1].reg == REG_PC)
-      as_tsktsk (MVE_BAD_PC);
-
-    switch (size)
-      {
-      case 8:
-       constraint (elsize >= 64, BAD_EL_TYPE);
-       break;
-      case 16:
-       constraint (elsize < 16 || elsize >= 64, BAD_EL_TYPE);
-       break;
-      case 32:
-      case 64:
-       constraint (elsize != size, BAD_EL_TYPE);
-       break;
-      default:
-       break;
-      }
-    constraint (inst.operands[1].writeback || !inst.operands[1].preind,
-               BAD_ADDR_MODE);
-    if (load)
-      {
-       constraint (inst.operands[0].reg == (inst.operands[1].imm & 0x1f),
-                   _("destination register and offset register may not be"
-                   " the same"));
-       constraint (size == elsize && inst.vectype.el[0].type != NT_unsigned,
-                   BAD_EL_TYPE);
-       constraint (inst.vectype.el[0].type != NT_unsigned
-                   && inst.vectype.el[0].type != NT_signed, BAD_EL_TYPE);
-       inst.instruction |= (inst.vectype.el[0].type == NT_unsigned) << 28;
-      }
-    else
-      {
-       constraint (inst.vectype.el[0].type != NT_untyped, BAD_EL_TYPE);
-      }
+  inst.instruction |= HI1 (inst.operands[0].reg) << 22;
+  inst.instruction |= LOW4 (inst.operands[1].reg);
+  inst.instruction |= HI1 (inst.operands[1].reg) << 5;
+  inst.instruction |= (isquad != 0) << 6;
+  inst.instruction |= immbits << 16;
+  inst.instruction |= (size >> 3) << 7;
+  inst.instruction |= (size & 0x7) << 19;
+  if (write_ubit)
+    inst.instruction |= (uval != 0) << 24;
  
-    inst.instruction |= 1 << 23;
-    inst.instruction |= HI1 (inst.operands[0].reg) << 22;
-    inst.instruction |= inst.operands[1].reg << 16;
-    inst.instruction |= LOW4 (inst.operands[0].reg) << 12;
-    inst.instruction |= neon_logbits (elsize) << 7;
-    inst.instruction |= HI1 (inst.operands[1].imm) << 5;
-    inst.instruction |= LOW4 (inst.operands[1].imm);
-    inst.instruction |= !!os;
+  neon_dp_fixup (&inst);
  }
  
  static void
-do_mve_vstr_vldr_RI (int size, int elsize, int load)
+do_neon_shl (void)
  {
-  enum neon_el_type type = inst.vectype.el[0].type;
+  if (!check_simd_pred_availability (FALSE, NEON_CHECK_ARCH | NEON_CHECK_CC))
+   return;
  
-  constraint (size >= 64, BAD_ADDR_MODE);
-  switch (size)
-    {
-    case 16:
-      constraint (elsize < 16 || elsize >= 64, BAD_EL_TYPE);
-      break;
-    case 32:
-      constraint (elsize != size, BAD_EL_TYPE);
-      break;
-    default:
-      break;
-    }
-  if (load)
+  if (!inst.operands[2].isreg)
      {
-      constraint (elsize != size && type != NT_unsigned
-                 && type != NT_signed, BAD_EL_TYPE);
+      enum neon_shape rs;
+      struct neon_type_el et;
+      if (ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext))
+       {
+         rs = neon_select_shape (NS_QQI, NS_NULL);
+         et = neon_check_type (2, rs, N_EQK, N_KEY | N_I_MVE);
+       }
+      else
+       {
+         rs = neon_select_shape (NS_DDI, NS_QQI, NS_NULL);
+         et = neon_check_type (2, rs, N_EQK, N_KEY | N_I_ALL);
+       }
+      int imm = inst.operands[2].imm;
+
+      constraint (imm < 0 || (unsigned)imm >= et.size,
+                 _("immediate out of range for shift"));
+      NEON_ENCODE (IMMED, inst);
+      neon_imm_shift (FALSE, 0, neon_quad (rs), et, imm);
      }
    else
      {
-      constraint (elsize != size && type != NT_untyped, BAD_EL_TYPE);
-    }
+      enum neon_shape rs;
+      struct neon_type_el et;
+      if (ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext))
+       {
+         rs = neon_select_shape (NS_QQQ, NS_QQR, NS_NULL);
+         et = neon_check_type (3, rs, N_EQK, N_SU_MVE | N_KEY, N_EQK | N_EQK);
+       }
+      else
+       {
+         rs = neon_select_shape (NS_DDD, NS_QQQ, NS_NULL);
+         et = neon_check_type (3, rs, N_EQK, N_SU_ALL | N_KEY, N_EQK | N_SGN);
+       }
  
-  int imm = inst.relocs[0].exp.X_add_number;
-  int add = 1;
-  if (imm < 0)
-    {
-      add = 0;
-      imm = -imm;
-    }
  
-  if ((imm % (size / 8) != 0) || imm > (0x7f << neon_logbits (size)))
-    {
-      switch (size)
+      if (rs == NS_QQR)
         {
-       case 8:
-         constraint (1, _("immediate must be in the range of +/-[0,127]"));
-         break;
-       case 16:
-         constraint (1, _("immediate must be a multiple of 2 in the"
-                          " range of +/-[0,254]"));
-         break;
-       case 32:
-         constraint (1, _("immediate must be a multiple of 4 in the"
-                          " range of +/-[0,508]"));
-         break;
+         constraint (inst.operands[0].reg != inst.operands[1].reg,
+                      _("invalid instruction shape"));
+         if (inst.operands[2].reg == REG_SP)
+           as_tsktsk (MVE_BAD_SP);
+         else if (inst.operands[2].reg == REG_PC)
+           as_tsktsk (MVE_BAD_PC);
+
+         inst.instruction = 0xee311e60;
+         inst.instruction |= (et.type == NT_unsigned) << 28;
+         inst.instruction |= HI1 (inst.operands[0].reg) << 22;
+         inst.instruction |= neon_logbits (et.size) << 18;
+         inst.instruction |= LOW4 (inst.operands[0].reg) << 12;
+         inst.instruction |= inst.operands[2].reg;
+         inst.is_neon = 1;
         }
-    }
+      else
+       {
+         unsigned int tmp;
  
-  if (size != elsize)
-    {
-      constraint (inst.operands[1].reg > 7, BAD_HIREG);
-      constraint (inst.operands[0].reg > 14,
-                 _("MVE vector register in the range [Q0..Q7] expected"));
-      inst.instruction |= (load && type == NT_unsigned) << 28;
-      inst.instruction |= (size == 16) << 19;
-      inst.instruction |= neon_logbits (elsize) << 7;
-    }
-  else
-    {
-      if (inst.operands[1].reg == REG_PC)
-       as_tsktsk (MVE_BAD_PC);
-      else if (inst.operands[1].reg == REG_SP && inst.operands[1].writeback)
-       as_tsktsk (MVE_BAD_SP);
-      inst.instruction |= 1 << 12;
-      inst.instruction |= neon_logbits (size) << 7;
+         /* VSHL/VQSHL 3-register variants have syntax such as:
+              vshl.xx Dd, Dm, Dn
+            whereas other 3-register operations encoded by neon_three_same have
+            syntax like:
+              vadd.xx Dd, Dn, Dm
+            (i.e. with Dn & Dm reversed). Swap operands[1].reg and
+            operands[2].reg here.  */
+         tmp = inst.operands[2].reg;
+         inst.operands[2].reg = inst.operands[1].reg;
+         inst.operands[1].reg = tmp;
+         NEON_ENCODE (INTEGER, inst);
+         neon_three_same (neon_quad (rs), et.type == NT_unsigned, et.size);
+       }
      }
-  inst.instruction |= inst.operands[1].preind << 24;
-  inst.instruction |= add << 23;
-  inst.instruction |= HI1 (inst.operands[0].reg) << 22;
-  inst.instruction |= inst.operands[1].writeback << 21;
-  inst.instruction |= inst.operands[1].reg << 16;
-  inst.instruction |= LOW4 (inst.operands[0].reg) << 12;
-  inst.instruction &= 0xffffff80;
-  inst.instruction |= imm >> neon_logbits (size);
-
  }
  
  static void
-do_mve_vstr_vldr (void)
+do_neon_qshl (void)
  {
-  unsigned size;
-  int load = 0;
-
-  if (inst.cond > COND_ALWAYS)
-    inst.pred_insn_type = INSIDE_VPT_INSN;
-  else
-    inst.pred_insn_type = MVE_OUTSIDE_PRED_INSN;
+  if (!check_simd_pred_availability (FALSE, NEON_CHECK_ARCH | NEON_CHECK_CC))
+   return;
  
-  switch (inst.instruction)
+  if (!inst.operands[2].isreg)
      {
-    default:
-      gas_assert (0);
-      break;
-    case M_MNEM_vldrb:
-      load = 1;
-      /* fall through.  */
-    case M_MNEM_vstrb:
-      size = 8;
-      break;
-    case M_MNEM_vldrh:
-      load = 1;
-      /* fall through.  */
-    case M_MNEM_vstrh:
-      size = 16;
-      break;
-    case M_MNEM_vldrw:
-      load = 1;
-      /* fall through.  */
-    case M_MNEM_vstrw:
-      size = 32;
-      break;
-    case M_MNEM_vldrd:
-      load = 1;
-      /* fall through.  */
-    case M_MNEM_vstrd:
-      size = 64;
-      break;
-    }
-  unsigned elsize = inst.vectype.el[0].size;
+      enum neon_shape rs;
+      struct neon_type_el et;
+      if (ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext))
+       {
+         rs = neon_select_shape (NS_QQI, NS_NULL);
+         et = neon_check_type (2, rs, N_EQK, N_KEY | N_SU_MVE);
+       }
+      else
+       {
+         rs = neon_select_shape (NS_DDI, NS_QQI, NS_NULL);
+         et = neon_check_type (2, rs, N_EQK, N_SU_ALL | N_KEY);
+       }
+      int imm = inst.operands[2].imm;
  
-  if (inst.operands[1].isquad)
-    {
-      /* We are dealing with [Q, imm]{!} cases.  */
-      do_mve_vstr_vldr_QI (size, elsize, load);
+      constraint (imm < 0 || (unsigned)imm >= et.size,
+                 _("immediate out of range for shift"));
+      NEON_ENCODE (IMMED, inst);
+      neon_imm_shift (TRUE, et.type == NT_unsigned, neon_quad (rs), et, imm);
      }
    else
      {
-      if (inst.operands[1].immisreg == 2)
+      enum neon_shape rs;
+      struct neon_type_el et;
+
+      if (ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext))
         {
-         /* We are dealing with [R, Q, {UXTW #os}] cases.  */
-         do_mve_vstr_vldr_RQ (size, elsize, load);
+         rs = neon_select_shape (NS_QQQ, NS_QQR, NS_NULL);
+         et = neon_check_type (3, rs, N_EQK, N_SU_MVE | N_KEY, N_EQK | N_EQK);
         }
-      else if (!inst.operands[1].immisreg)
+      else
         {
-         /* We are dealing with [R, Imm]{!}/[R], Imm cases.  */
-         do_mve_vstr_vldr_RI (size, elsize, load);
+         rs = neon_select_shape (NS_DDD, NS_QQQ, NS_NULL);
+         et = neon_check_type (3, rs, N_EQK, N_SU_ALL | N_KEY, N_EQK | N_SGN);
+       }
+
+      if (rs == NS_QQR)
+       {
+         constraint (inst.operands[0].reg != inst.operands[1].reg,
+                      _("invalid instruction shape"));
+         if (inst.operands[2].reg == REG_SP)
+           as_tsktsk (MVE_BAD_SP);
+         else if (inst.operands[2].reg == REG_PC)
+           as_tsktsk (MVE_BAD_PC);
+
+         inst.instruction = 0xee311ee0;
+         inst.instruction |= (et.type == NT_unsigned) << 28;
+         inst.instruction |= HI1 (inst.operands[0].reg) << 22;
+         inst.instruction |= neon_logbits (et.size) << 18;
+         inst.instruction |= LOW4 (inst.operands[0].reg) << 12;
+         inst.instruction |= inst.operands[2].reg;
+         inst.is_neon = 1;
         }
        else
-       constraint (1, BAD_ADDR_MODE);
-    }
+       {
+         unsigned int tmp;
  
-  inst.is_neon = 1;
+         /* See note in do_neon_shl.  */
+         tmp = inst.operands[2].reg;
+         inst.operands[2].reg = inst.operands[1].reg;
+         inst.operands[1].reg = tmp;
+         NEON_ENCODE (INTEGER, inst);
+         neon_three_same (neon_quad (rs), et.type == NT_unsigned, et.size);
+       }
+    }
  }
  
  static void
-do_mve_vst_vld (void)
+do_neon_rshl (void)
  {
-  if (!ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext))
-    return;
+  if (!check_simd_pred_availability (FALSE, NEON_CHECK_ARCH | NEON_CHECK_CC))
+   return;
  
-  constraint (!inst.operands[1].preind || inst.relocs[0].exp.X_add_symbol != 0
-             || inst.relocs[0].exp.X_add_number != 0
-             || inst.operands[1].immisreg != 0,
-             BAD_ADDR_MODE);
-  constraint (inst.vectype.el[0].size > 32, BAD_EL_TYPE);
-  if (inst.operands[1].reg == REG_PC)
-    as_tsktsk (MVE_BAD_PC);
-  else if (inst.operands[1].reg == REG_SP && inst.operands[1].writeback)
-    as_tsktsk (MVE_BAD_SP);
-
-
-  /* These instructions are one of the "exceptions" mentioned in
-     handle_pred_state.  They are MVE instructions that are not VPT compatible
-     and do not accept a VPT code, thus appending such a code is a syntax
-     error.  */
-  if (inst.cond > COND_ALWAYS)
-    first_error (BAD_SYNTAX);
-  /* If we append a scalar condition code we can set this to
-     MVE_OUTSIDE_PRED_INSN as it will also lead to a syntax error.  */
-  else if (inst.cond < COND_ALWAYS)
-    inst.pred_insn_type = MVE_OUTSIDE_PRED_INSN;
+  enum neon_shape rs;
+  struct neon_type_el et;
+  if (ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext))
+    {
+      rs = neon_select_shape (NS_QQR, NS_QQQ, NS_NULL);
+      et = neon_check_type (3, rs, N_EQK, N_EQK, N_SU_MVE | N_KEY);
+    }
    else
-    inst.pred_insn_type = MVE_UNPREDICABLE_INSN;
-
-  inst.instruction |= HI1 (inst.operands[0].reg) << 22;
-  inst.instruction |= inst.operands[1].writeback << 21;
-  inst.instruction |= inst.operands[1].reg << 16;
-  inst.instruction |= LOW4 (inst.operands[0].reg) << 12;
-  inst.instruction |= neon_logbits (inst.vectype.el[0].size) << 7;
-  inst.is_neon = 1;
-}
-
-static void
-do_neon_dyadic_if_su (void)
-{
-  enum neon_shape rs = neon_select_shape (NS_DDD, NS_QQQ, NS_QQR, NS_NULL);
-  struct neon_type_el et = neon_check_type (3, rs, N_EQK , N_EQK,
-                                           N_SUF_32 | N_KEY);
+    {
+      rs = neon_select_shape (NS_DDD, NS_QQQ, NS_NULL);
+      et = neon_check_type (3, rs, N_EQK, N_EQK, N_SU_ALL | N_KEY);
+    }
  
-  if (check_simd_pred_availability (et.type == NT_float,
-                                   NEON_CHECK_ARCH | NEON_CHECK_CC))
-    return;
+  unsigned int tmp;
  
-  neon_dyadic_misc (NT_unsigned, N_SUF_32, 0);
-}
+  if (rs == NS_QQR)
+    {
+      if (inst.operands[2].reg == REG_PC)
+       as_tsktsk (MVE_BAD_PC);
+      else if (inst.operands[2].reg == REG_SP)
+       as_tsktsk (MVE_BAD_SP);
  
-static void
-do_neon_addsub_if_i (void)
-{
-  if (ARM_CPU_HAS_FEATURE (cpu_variant, fpu_vfp_ext_v1xd)
-      && try_vfp_nsyn (3, do_vfp_nsyn_add_sub) == SUCCESS)
-    return;
+      constraint (inst.operands[0].reg != inst.operands[1].reg,
+                 _("invalid instruction shape"));
  
-  enum neon_shape rs = neon_select_shape (NS_DDD, NS_QQQ, NS_QQR, NS_NULL);
-  struct neon_type_el et = neon_check_type (3, rs, N_EQK,
-                                           N_EQK, N_IF_32 | N_I64 | N_KEY);
+      if (inst.instruction == 0x0000510)
+       /* We are dealing with vqrshl.  */
+       inst.instruction = 0xee331ee0;
+      else
+       /* We are dealing with vrshl.  */
+       inst.instruction = 0xee331e60;
  
-  constraint (rs == NS_QQR && et.size == 64, BAD_FPU);
-  /* If we are parsing Q registers and the element types match MVE, which NEON
-     also supports, then we must check whether this is an instruction that can
-     be used by both MVE/NEON.  This distinction can be made based on whether
-     they are predicated or not.  */
-  if ((rs == NS_QQQ || rs == NS_QQR) && et.size != 64)
-    {
-      if (check_simd_pred_availability (et.type == NT_float,
-                                       NEON_CHECK_ARCH | NEON_CHECK_CC))
-       return;
+      inst.instruction |= (et.type == NT_unsigned) << 28;
+      inst.instruction |= HI1 (inst.operands[0].reg) << 22;
+      inst.instruction |= neon_logbits (et.size) << 18;
+      inst.instruction |= LOW4 (inst.operands[0].reg) << 12;
+      inst.instruction |= inst.operands[2].reg;
+      inst.is_neon = 1;
      }
    else
      {
-      /* If they are either in a D register or are using an unsupported.  */
-      if (rs != NS_QQR
-         && vfp_or_neon_is_neon (NEON_CHECK_CC | NEON_CHECK_ARCH) == FAIL)
-       return;
+      tmp = inst.operands[2].reg;
+      inst.operands[2].reg = inst.operands[1].reg;
+      inst.operands[1].reg = tmp;
+      neon_three_same (neon_quad (rs), et.type == NT_unsigned, et.size);
      }
-
-  /* The "untyped" case can't happen. Do this to stop the "U" bit being
-     affected if we specify unsigned args.  */
-  neon_dyadic_misc (NT_untyped, N_IF_32 | N_I64, 0);
  }
  
-/* Swaps operands 1 and 2. If operand 1 (optional arg) was omitted, we want the
-   result to be:
-     V<op> A,B     (A is operand 0, B is operand 2)
-   to mean:
-     V<op> A,B,A
-   not:
-     V<op> A,B,B
-   so handle that case specially.  */
-
-static void
-neon_exchange_operands (void)
+static int
+neon_cmode_for_logic_imm (unsigned immediate, unsigned *immbits, int size)
  {
-  if (inst.operands[1].present)
+  /* Handle .I8 pseudo-instructions.  */
+  if (size == 8)
      {
-      void *scratch = xmalloc (sizeof (inst.operands[0]));
+      /* Unfortunately, this will make everything apart from zero out-of-range.
+        FIXME is this the intended semantics? There doesn't seem much point in
+        accepting .I8 if so.  */
+      immediate |= immediate << 8;
+      size = 16;
+    }
  
-      /* Swap operands[1] and operands[2].  */
-      memcpy (scratch, &inst.operands[1], sizeof (inst.operands[0]));
-      inst.operands[1] = inst.operands[2];
-      memcpy (&inst.operands[2], scratch, sizeof (inst.operands[0]));
-      free (scratch);
+  if (size >= 32)
+    {
+      if (immediate == (immediate & 0x000000ff))
+       {
+         *immbits = immediate;
+         return 0x1;
+       }
+      else if (immediate == (immediate & 0x0000ff00))
+       {
+         *immbits = immediate >> 8;
+         return 0x3;
+       }
+      else if (immediate == (immediate & 0x00ff0000))
+       {
+         *immbits = immediate >> 16;
+         return 0x5;
+       }
+      else if (immediate == (immediate & 0xff000000))
+       {
+         *immbits = immediate >> 24;
+         return 0x7;
+       }
+      if ((immediate & 0xffff) != (immediate >> 16))
+       goto bad_immediate;
+      immediate &= 0xffff;
      }
-  else
+
+  if (immediate == (immediate & 0x000000ff))
      {
-      inst.operands[1] = inst.operands[2];
-      inst.operands[2] = inst.operands[0];
+      *immbits = immediate;
+      return 0x9;
+    }
+  else if (immediate == (immediate & 0x0000ff00))
+    {
+      *immbits = immediate >> 8;
+      return 0xb;
      }
+
+  bad_immediate:
+  first_error (_("immediate value out of range"));
+  return FAIL;
  }
  
  static void
-neon_compare (unsigned regtypes, unsigned immtypes, int invert)
+do_neon_logic (void)
  {
-  if (inst.operands[2].isreg)
+  if (inst.operands[2].present && inst.operands[2].isreg)
      {
-      if (invert)
-       neon_exchange_operands ();
-      neon_dyadic_misc (NT_unsigned, regtypes, N_SIZ);
+      enum neon_shape rs = neon_select_shape (NS_DDD, NS_QQQ, NS_NULL);
+      if (rs == NS_QQQ
+         && !check_simd_pred_availability (FALSE,
+                                           NEON_CHECK_ARCH | NEON_CHECK_CC))
+       return;
+      else if (rs != NS_QQQ
+              && !ARM_CPU_HAS_FEATURE (cpu_variant, fpu_neon_ext_v1))
+       first_error (BAD_FPU);
+
+      neon_check_type (3, rs, N_IGNORE_TYPE);
+      /* U bit and size field were set as part of the bitmask.  */
+      NEON_ENCODE (INTEGER, inst);
+      neon_three_same (neon_quad (rs), 0, -1);
      }
    else
      {
-      enum neon_shape rs = neon_select_shape (NS_DDI, NS_QQI, NS_NULL);
-      struct neon_type_el et = neon_check_type (2, rs,
-       N_EQK | N_SIZ, immtypes | N_KEY);
+      const int three_ops_form = (inst.operands[2].present
+                                 && !inst.operands[2].isreg);
+      const int immoperand = (three_ops_form ? 2 : 1);
+      enum neon_shape rs = (three_ops_form
+                           ? neon_select_shape (NS_DDI, NS_QQI, NS_NULL)
+                           : neon_select_shape (NS_DI, NS_QI, NS_NULL));
+      /* Because neon_select_shape makes the second operand a copy of the first
+        if the second operand is not present.  */
+      if (rs == NS_QQI
+         && !check_simd_pred_availability (FALSE,
+                                           NEON_CHECK_ARCH | NEON_CHECK_CC))
+       return;
+      else if (rs != NS_QQI
+              && !ARM_CPU_HAS_FEATURE (cpu_variant, fpu_neon_ext_v1))
+       first_error (BAD_FPU);
+
+      struct neon_type_el et;
+      if (ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext))
+       et = neon_check_type (2, rs, N_I32 | N_I16 | N_KEY, N_EQK);
+      else
+       et = neon_check_type (2, rs, N_I8 | N_I16 | N_I32 | N_I64 | N_F32
+                             | N_KEY, N_EQK);
+
+      if (et.type == NT_invtype)
+       return;
+      enum neon_opc opcode = (enum neon_opc) inst.instruction & 0x0fffffff;
+      unsigned immbits;
+      int cmode;
+
+
+      if (three_ops_form)
+       constraint (inst.operands[0].reg != inst.operands[1].reg,
+                   _("first and second operands shall be the same register"));
  
        NEON_ENCODE (IMMED, inst);
+
+      immbits = inst.operands[immoperand].imm;
+      if (et.size == 64)
+       {
+         /* .i64 is a pseudo-op, so the immediate must be a repeating
+            pattern.  */
+         if (immbits != (inst.operands[immoperand].regisimm ?
+                         inst.operands[immoperand].reg : 0))
+           {
+             /* Set immbits to an invalid constant.  */
+             immbits = 0xdeadbeef;
+           }
+       }
+
+      switch (opcode)
+       {
+       case N_MNEM_vbic:
+         cmode = neon_cmode_for_logic_imm (immbits, &immbits, et.size);
+         break;
+
+       case N_MNEM_vorr:
+         cmode = neon_cmode_for_logic_imm (immbits, &immbits, et.size);
+         break;
+
+       case N_MNEM_vand:
+         /* Pseudo-instruction for VBIC.  */
+         neon_invert_size (&immbits, 0, et.size);
+         cmode = neon_cmode_for_logic_imm (immbits, &immbits, et.size);
+         break;
+
+       case N_MNEM_vorn:
+         /* Pseudo-instruction for VORR.  */
+         neon_invert_size (&immbits, 0, et.size);
+         cmode = neon_cmode_for_logic_imm (immbits, &immbits, et.size);
+         break;
+
+       default:
+         abort ();
+       }
+
+      if (cmode == FAIL)
+       return;
+
+      inst.instruction |= neon_quad (rs) << 6;
        inst.instruction |= LOW4 (inst.operands[0].reg) << 12;
        inst.instruction |= HI1 (inst.operands[0].reg) << 22;
-      inst.instruction |= LOW4 (inst.operands[1].reg);
-      inst.instruction |= HI1 (inst.operands[1].reg) << 5;
-      inst.instruction |= neon_quad (rs) << 6;
-      inst.instruction |= (et.type == NT_float) << 10;
-      inst.instruction |= neon_logbits (et.size) << 18;
+      inst.instruction |= cmode << 8;
+      neon_write_immbits (immbits);
  
        neon_dp_fixup (&inst);
      }
  }
  
  static void
-do_neon_cmp (void)
+do_neon_bitfield (void)
  {
-  neon_compare (N_SUF_32, N_S_32 | N_F_16_32, FALSE);
+  enum neon_shape rs = neon_select_shape (NS_DDD, NS_QQQ, NS_NULL);
+  neon_check_type (3, rs, N_IGNORE_TYPE);
+  neon_three_same (neon_quad (rs), 0, -1);
  }
  
  static void
-do_neon_cmp_inv (void)
+neon_dyadic_misc (enum neon_el_type ubit_meaning, unsigned types,
+                 unsigned destbits)
  {
-  neon_compare (N_SUF_32, N_S_32 | N_F_16_32, TRUE);
-}
-
-static void
-do_neon_ceq (void)
-{
-  neon_compare (N_IF_32, N_IF_32, FALSE);
-}
-
-/* For multiply instructions, we have the possibility of 16-bit or 32-bit
-   scalars, which are encoded in 5 bits, M : Rm.
-   For 16-bit scalars, the register is encoded in Rm[2:0] and the index in
-   M:Rm[3], and for 32-bit scalars, the register is encoded in Rm[3:0] and the
-   index in M.
-
-   Dot Product instructions are similar to multiply instructions except elsize
-   should always be 32.
-
-   This function translates SCALAR, which is GAS's internal encoding of indexed
-   scalar register, to raw encoding.  There is also register and index range
-   check based on ELSIZE.  */
-
-static unsigned
-neon_scalar_for_mul (unsigned scalar, unsigned elsize)
-{
-  unsigned regno = NEON_SCALAR_REG (scalar);
-  unsigned elno = NEON_SCALAR_INDEX (scalar);
-
-  switch (elsize)
+  enum neon_shape rs = neon_select_shape (NS_DDD, NS_QQQ, NS_QQR, NS_NULL);
+  struct neon_type_el et = neon_check_type (3, rs, N_EQK | destbits, N_EQK,
+                                           types | N_KEY);
+  if (et.type == NT_float)
      {
-    case 16:
-      if (regno > 7 || elno > 3)
-       goto bad_scalar;
-      return regno | (elno << 3);
-
-    case 32:
-      if (regno > 15 || elno > 1)
-       goto bad_scalar;
-      return regno | (elno << 4);
-
-    default:
-    bad_scalar:
-      first_error (_("scalar out of range for multiply instruction"));
+      NEON_ENCODE (FLOAT, inst);
+      if (rs == NS_QQR)
+       mve_encode_qqr (et.size, 0, 1);
+      else
+       neon_three_same (neon_quad (rs), 0, et.size == 16 ? (int) et.size : -1);
+    }
+  else
+    {
+      NEON_ENCODE (INTEGER, inst);
+      if (rs == NS_QQR)
+       mve_encode_qqr (et.size, et.type == ubit_meaning, 0);
+      else
+       neon_three_same (neon_quad (rs), et.type == ubit_meaning, et.size);
      }
-
-  return 0;
  }
  
-/* Encode multiply / multiply-accumulate scalar instructions.  */
-
-static void
-neon_mul_mac (struct neon_type_el et, int ubit)
-{
-  unsigned scalar;
-
-  /* Give a more helpful error message if we have an invalid type.  */
-  if (et.type == NT_invtype)
-    return;
-
-  scalar = neon_scalar_for_mul (inst.operands[2].reg, et.size);
-  inst.instruction |= LOW4 (inst.operands[0].reg) << 12;
-  inst.instruction |= HI1 (inst.operands[0].reg) << 22;
-  inst.instruction |= LOW4 (inst.operands[1].reg) << 16;
-  inst.instruction |= HI1 (inst.operands[1].reg) << 7;
-  inst.instruction |= LOW4 (scalar);
-  inst.instruction |= HI1 (scalar) << 5;
-  inst.instruction |= (et.type == NT_float) << 8;
-  inst.instruction |= neon_logbits (et.size) << 20;
-  inst.instruction |= (ubit != 0) << 24;
-
-  neon_dp_fixup (&inst);
-}
  
  static void
-do_neon_mac_maybe_scalar (void)
+do_neon_dyadic_if_su_d (void)
  {
-  if (try_vfp_nsyn (3, do_vfp_nsyn_mla_mls) == SUCCESS)
-    return;
-
-  if (vfp_or_neon_is_neon (NEON_CHECK_CC | NEON_CHECK_ARCH) == FAIL)
-    return;
-
-  if (inst.operands[2].isscalar)
-    {
-      enum neon_shape rs = neon_select_shape (NS_DDS, NS_QQS, NS_NULL);
-      struct neon_type_el et = neon_check_type (3, rs,
-       N_EQK, N_EQK, N_I16 | N_I32 | N_F_16_32 | N_KEY);
-      NEON_ENCODE (SCALAR, inst);
-      neon_mul_mac (et, neon_quad (rs));
-    }
-  else
-    {
-      /* The "untyped" case can't happen.  Do this to stop the "U" bit being
-        affected if we specify unsigned args.  */
-      neon_dyadic_misc (NT_untyped, N_IF_32, 0);
-    }
+  /* This version only allow D registers, but that constraint is enforced during
+     operand parsing so we don't need to do anything extra here.  */
+  neon_dyadic_misc (NT_unsigned, N_SUF_32, 0);
  }
  
  static void
-do_neon_fmac (void)
+do_neon_dyadic_if_i_d (void)
  {
-  if (try_vfp_nsyn (3, do_vfp_nsyn_fma_fms) == SUCCESS)
-    return;
-
-  if (vfp_or_neon_is_neon (NEON_CHECK_CC | NEON_CHECK_ARCH) == FAIL)
-    return;
-
+  /* The "untyped" case can't happen. Do this to stop the "U" bit being
+     affected if we specify unsigned args.  */
    neon_dyadic_misc (NT_untyped, N_IF_32, 0);
  }
  
  static void
-do_neon_tst (void)
+do_mve_vstr_vldr_QI (int size, int elsize, int load)
  {
-  enum neon_shape rs = neon_select_shape (NS_DDD, NS_QQQ, NS_NULL);
-  struct neon_type_el et = neon_check_type (3, rs,
-    N_EQK, N_EQK, N_8 | N_16 | N_32 | N_KEY);
-  neon_three_same (neon_quad (rs), 0, et.size);
-}
+  constraint (size < 32, BAD_ADDR_MODE);
+  constraint (size != elsize, BAD_EL_TYPE);
+  constraint (inst.operands[1].immisreg, BAD_ADDR_MODE);
+  constraint (!inst.operands[1].preind, BAD_ADDR_MODE);
+  constraint (load && inst.operands[0].reg == inst.operands[1].reg,
+             _("destination register and offset register may not be the"
+               " same"));
  
-/* VMUL with 3 registers allows the P8 type. The scalar version supports the
-   same types as the MAC equivalents. The polynomial type for this instruction
-   is encoded the same as the integer type.  */
+  int imm = inst.relocs[0].exp.X_add_number;
+  int add = 1;
+  if (imm < 0)
+    {
+      add = 0;
+      imm = -imm;
+    }
+  constraint ((imm % (size / 8) != 0)
+             || imm > (0x7f << neon_logbits (size)),
+             (size == 32) ? _("immediate must be a multiple of 4 in the"
+                              " range of +/-[0,508]")
+                          : _("immediate must be a multiple of 8 in the"
+                              " range of +/-[0,1016]"));
+  inst.instruction |= 0x11 << 24;
+  inst.instruction |= add << 23;
+  inst.instruction |= HI1 (inst.operands[0].reg) << 22;
+  inst.instruction |= inst.operands[1].writeback << 21;
+  inst.instruction |= LOW4 (inst.operands[1].reg) << 16;
+  inst.instruction |= LOW4 (inst.operands[0].reg) << 12;
+  inst.instruction |= 1 << 12;
+  inst.instruction |= (size == 64) << 8;
+  inst.instruction &= 0xffffff00;
+  inst.instruction |= HI1 (inst.operands[1].reg) << 7;
+  inst.instruction |= imm >> neon_logbits (size);
+}
  
  static void
-do_neon_mul (void)
+do_mve_vstr_vldr_RQ (int size, int elsize, int load)
  {
-  if (try_vfp_nsyn (3, do_vfp_nsyn_mul) == SUCCESS)
-    return;
+    unsigned os = inst.operands[1].imm >> 5;
+    unsigned type = inst.vectype.el[0].type;
+    constraint (os != 0 && size == 8,
+               _("can not shift offsets when accessing less than half-word"));
+    constraint (os && os != neon_logbits (size),
+               _("shift immediate must be 1, 2 or 3 for half-word, word"
+                 " or double-word accesses respectively"));
+    if (inst.operands[1].reg == REG_PC)
+      as_tsktsk (MVE_BAD_PC);
  
-  if (vfp_or_neon_is_neon (NEON_CHECK_CC | NEON_CHECK_ARCH) == FAIL)
-    return;
+    switch (size)
+      {
+      case 8:
+       constraint (elsize >= 64, BAD_EL_TYPE);
+       break;
+      case 16:
+       constraint (elsize < 16 || elsize >= 64, BAD_EL_TYPE);
+       break;
+      case 32:
+      case 64:
+       constraint (elsize != size, BAD_EL_TYPE);
+       break;
+      default:
+       break;
+      }
+    constraint (inst.operands[1].writeback || !inst.operands[1].preind,
+               BAD_ADDR_MODE);
+    if (load)
+      {
+       constraint (inst.operands[0].reg == (inst.operands[1].imm & 0x1f),
+                   _("destination register and offset register may not be"
+                   " the same"));
+       constraint (size == elsize && type == NT_signed, BAD_EL_TYPE);
+       constraint (size != elsize && type != NT_unsigned && type != NT_signed,
+                   BAD_EL_TYPE);
+       inst.instruction |= ((size == elsize) || (type == NT_unsigned)) << 28;
+      }
+    else
+      {
+       constraint (type != NT_untyped, BAD_EL_TYPE);
+      }
  
-  if (inst.operands[2].isscalar)
-    do_neon_mac_maybe_scalar ();
-  else
-    neon_dyadic_misc (NT_poly, N_I8 | N_I16 | N_I32 | N_F16 | N_F32 | N_P8, 0);
+    inst.instruction |= 1 << 23;
+    inst.instruction |= HI1 (inst.operands[0].reg) << 22;
+    inst.instruction |= inst.operands[1].reg << 16;
+    inst.instruction |= LOW4 (inst.operands[0].reg) << 12;
+    inst.instruction |= neon_logbits (elsize) << 7;
+    inst.instruction |= HI1 (inst.operands[1].imm) << 5;
+    inst.instruction |= LOW4 (inst.operands[1].imm);
+    inst.instruction |= !!os;
  }
  
  static void
-do_neon_qdmulh (void)
+do_mve_vstr_vldr_RI (int size, int elsize, int load)
  {
-  if (inst.operands[2].isscalar)
+  enum neon_el_type type = inst.vectype.el[0].type;
+
+  constraint (size >= 64, BAD_ADDR_MODE);
+  switch (size)
      {
-      enum neon_shape rs = neon_select_shape (NS_DDS, NS_QQS, NS_NULL);
-      struct neon_type_el et = neon_check_type (3, rs,
-       N_EQK, N_EQK, N_S16 | N_S32 | N_KEY);
-      NEON_ENCODE (SCALAR, inst);
-      neon_mul_mac (et, neon_quad (rs));
+    case 16:
+      constraint (elsize < 16 || elsize >= 64, BAD_EL_TYPE);
+      break;
+    case 32:
+      constraint (elsize != size, BAD_EL_TYPE);
+      break;
+    default:
+      break;
+    }
+  if (load)
+    {
+      constraint (elsize != size && type != NT_unsigned
+                 && type != NT_signed, BAD_EL_TYPE);
      }
    else
      {
-      enum neon_shape rs = neon_select_shape (NS_DDD, NS_QQQ, NS_NULL);
-      struct neon_type_el et = neon_check_type (3, rs,
-       N_EQK, N_EQK, N_S16 | N_S32 | N_KEY);
-      NEON_ENCODE (INTEGER, inst);
-      /* The U bit (rounding) comes from bit mask.  */
-      neon_three_same (neon_quad (rs), 0, et.size);
+      constraint (elsize != size && type != NT_untyped, BAD_EL_TYPE);
      }
-}
-
-static void
-do_mve_vmull (void)
-{
  
-  enum neon_shape rs = neon_select_shape (NS_HHH, NS_FFF, NS_DDD, NS_DDS,
-                                         NS_QQS, NS_QQQ, NS_QQR, NS_NULL);
-  if (!ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext)
-      && inst.cond == COND_ALWAYS
-      && ((unsigned)inst.instruction) == M_MNEM_vmullt)
+  int imm = inst.relocs[0].exp.X_add_number;
+  int add = 1;
+  if (imm < 0)
      {
-      if (rs == NS_QQQ)
-       {
-
-         struct neon_type_el et = neon_check_type (3, rs, N_EQK , N_EQK,
-                                                   N_SUF_32 | N_F64 | N_P8
-                                                   | N_P16 | N_I_MVE | N_KEY);
-         if (((et.type == NT_poly) && et.size == 8
-              && ARM_CPU_IS_ANY (cpu_variant))
-             || (et.type == NT_integer) || (et.type == NT_float))
-           goto neon_vmul;
-       }
-      else
-       goto neon_vmul;
+      add = 0;
+      imm = -imm;
      }
  
-  constraint (rs != NS_QQQ, BAD_FPU);
-  struct neon_type_el et = neon_check_type (3, rs, N_EQK , N_EQK,
-                                           N_SU_32 | N_P8 | N_P16 | N_KEY);
+  if ((imm % (size / 8) != 0) || imm > (0x7f << neon_logbits (size)))
+    {
+      switch (size)
+       {
+       case 8:
+         constraint (1, _("immediate must be in the range of +/-[0,127]"));
+         break;
+       case 16:
+         constraint (1, _("immediate must be a multiple of 2 in the"
+                          " range of +/-[0,254]"));
+         break;
+       case 32:
+         constraint (1, _("immediate must be a multiple of 4 in the"
+                          " range of +/-[0,508]"));
+         break;
+       }
+    }
  
-  /* We are dealing with MVE's vmullt.  */
-  if (et.size == 32
-      && (inst.operands[0].reg == inst.operands[1].reg
-         || inst.operands[0].reg == inst.operands[2].reg))
-    as_tsktsk (BAD_MVE_SRCDEST);
+  if (size != elsize)
+    {
+      constraint (inst.operands[1].reg > 7, BAD_HIREG);
+      constraint (inst.operands[0].reg > 14,
+                 _("MVE vector register in the range [Q0..Q7] expected"));
+      inst.instruction |= (load && type == NT_unsigned) << 28;
+      inst.instruction |= (size == 16) << 19;
+      inst.instruction |= neon_logbits (elsize) << 7;
+    }
+  else
+    {
+      if (inst.operands[1].reg == REG_PC)
+       as_tsktsk (MVE_BAD_PC);
+      else if (inst.operands[1].reg == REG_SP && inst.operands[1].writeback)
+       as_tsktsk (MVE_BAD_SP);
+      inst.instruction |= 1 << 12;
+      inst.instruction |= neon_logbits (size) << 7;
+    }
+  inst.instruction |= inst.operands[1].preind << 24;
+  inst.instruction |= add << 23;
+  inst.instruction |= HI1 (inst.operands[0].reg) << 22;
+  inst.instruction |= inst.operands[1].writeback << 21;
+  inst.instruction |= inst.operands[1].reg << 16;
+  inst.instruction |= LOW4 (inst.operands[0].reg) << 12;
+  inst.instruction &= 0xffffff80;
+  inst.instruction |= imm >> neon_logbits (size);
+
+}
+
+static void
+do_mve_vstr_vldr (void)
+{
+  unsigned size;
+  int load = 0;
  
    if (inst.cond > COND_ALWAYS)
      inst.pred_insn_type = INSIDE_VPT_INSN;
    else
      inst.pred_insn_type = MVE_OUTSIDE_PRED_INSN;
  
-  if (et.type == NT_poly)
-    mve_encode_qqq (neon_logbits (et.size), 64);
-  else
-    mve_encode_qqq (et.type == NT_unsigned, et.size);
+  switch (inst.instruction)
+    {
+    default:
+      gas_assert (0);
+      break;
+    case M_MNEM_vldrb:
+      load = 1;
+      /* fall through.  */
+    case M_MNEM_vstrb:
+      size = 8;
+      break;
+    case M_MNEM_vldrh:
+      load = 1;
+      /* fall through.  */
+    case M_MNEM_vstrh:
+      size = 16;
+      break;
+    case M_MNEM_vldrw:
+      load = 1;
+      /* fall through.  */
+    case M_MNEM_vstrw:
+      size = 32;
+      break;
+    case M_MNEM_vldrd:
+      load = 1;
+      /* fall through.  */
+    case M_MNEM_vstrd:
+      size = 64;
+      break;
+    }
+  unsigned elsize = inst.vectype.el[0].size;
  
-  return;
+  if (inst.operands[1].isquad)
+    {
+      /* We are dealing with [Q, imm]{!} cases.  */
+      do_mve_vstr_vldr_QI (size, elsize, load);
+    }
+  else
+    {
+      if (inst.operands[1].immisreg == 2)
+       {
+         /* We are dealing with [R, Q, {UXTW #os}] cases.  */
+         do_mve_vstr_vldr_RQ (size, elsize, load);
+       }
+      else if (!inst.operands[1].immisreg)
+       {
+         /* We are dealing with [R, Imm]{!}/[R], Imm cases.  */
+         do_mve_vstr_vldr_RI (size, elsize, load);
+       }
+      else
+       constraint (1, BAD_ADDR_MODE);
+    }
  
-neon_vmul:
-  inst.instruction = N_MNEM_vmul;
-  inst.cond = 0xb;
-  if (thumb_mode)
-    inst.pred_insn_type = INSIDE_IT_INSN;
-  do_neon_mul ();
+  inst.is_neon = 1;
  }
  
  static void
-do_mve_vabav (void)
+do_mve_vst_vld (void)
  {
-  enum neon_shape rs = neon_select_shape (NS_RQQ, NS_NULL);
-
-  if (rs == NS_NULL)
-    return;
-
    if (!ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext))
      return;
  
-  struct neon_type_el et = neon_check_type (2, NS_NULL, N_EQK, N_KEY | N_S8
-                                           | N_S16 | N_S32 | N_U8 | N_U16
-                                           | N_U32);
+  constraint (!inst.operands[1].preind || inst.relocs[0].exp.X_add_symbol != 0
+             || inst.relocs[0].exp.X_add_number != 0
+             || inst.operands[1].immisreg != 0,
+             BAD_ADDR_MODE);
+  constraint (inst.vectype.el[0].size > 32, BAD_EL_TYPE);
+  if (inst.operands[1].reg == REG_PC)
+    as_tsktsk (MVE_BAD_PC);
+  else if (inst.operands[1].reg == REG_SP && inst.operands[1].writeback)
+    as_tsktsk (MVE_BAD_SP);
+
  
+  /* These instructions are one of the "exceptions" mentioned in
+     handle_pred_state.  They are MVE instructions that are not VPT compatible
+     and do not accept a VPT code, thus appending such a code is a syntax
+     error.  */
    if (inst.cond > COND_ALWAYS)
-    inst.pred_insn_type = INSIDE_VPT_INSN;
-  else
+    first_error (BAD_SYNTAX);
+  /* If we append a scalar condition code we can set this to
+     MVE_OUTSIDE_PRED_INSN as it will also lead to a syntax error.  */
+  else if (inst.cond < COND_ALWAYS)
      inst.pred_insn_type = MVE_OUTSIDE_PRED_INSN;
+  else
+    inst.pred_insn_type = MVE_UNPREDICABLE_INSN;
  
-  mve_encode_rqq (et.type == NT_unsigned, et.size);
+  inst.instruction |= HI1 (inst.operands[0].reg) << 22;
+  inst.instruction |= inst.operands[1].writeback << 21;
+  inst.instruction |= inst.operands[1].reg << 16;
+  inst.instruction |= LOW4 (inst.operands[0].reg) << 12;
+  inst.instruction |= neon_logbits (inst.vectype.el[0].size) << 7;
+  inst.is_neon = 1;
  }
  
  static void
-do_mve_vmladav (void)
+do_mve_vaddlv (void)
  {
-  enum neon_shape rs = neon_select_shape (NS_RQQ, NS_NULL);
-  struct neon_type_el et = neon_check_type (3, rs,
-                                           N_EQK, N_EQK, N_SU_MVE | N_KEY);
-
-  if (et.type == NT_unsigned
-      && (inst.instruction == M_MNEM_vmladavx
-         || inst.instruction == M_MNEM_vmladavax
-         || inst.instruction == M_MNEM_vmlsdav
-         || inst.instruction == M_MNEM_vmlsdava
-         || inst.instruction == M_MNEM_vmlsdavx
-         || inst.instruction == M_MNEM_vmlsdavax))
-    first_error (BAD_SIMD_TYPE);
+  enum neon_shape rs = neon_select_shape (NS_RRQ, NS_NULL);
+  struct neon_type_el et
+    = neon_check_type (3, rs, N_EQK, N_EQK, N_S32 | N_U32 | N_KEY);
  
-  constraint (inst.operands[2].reg > 14,
-             _("MVE vector register in the range [Q0..Q7] expected"));
+  if (et.type == NT_invtype)
+    first_error (BAD_EL_TYPE);
  
    if (inst.cond > COND_ALWAYS)
      inst.pred_insn_type = INSIDE_VPT_INSN;
    else
      inst.pred_insn_type = MVE_OUTSIDE_PRED_INSN;
  
-  if (inst.instruction == M_MNEM_vmlsdav
-      || inst.instruction == M_MNEM_vmlsdava
-      || inst.instruction == M_MNEM_vmlsdavx
-      || inst.instruction == M_MNEM_vmlsdavax)
-    inst.instruction |= (et.size == 8) << 28;
-  else
-    inst.instruction |= (et.size == 8) << 8;
+  constraint (inst.operands[1].reg > 14, MVE_BAD_QREG);
  
-  mve_encode_rqq (et.type == NT_unsigned, 64);
-  inst.instruction |= (et.size == 32) << 16;
+  inst.instruction |= (et.type == NT_unsigned) << 28;
+  inst.instruction |= inst.operands[1].reg << 19;
+  inst.instruction |= inst.operands[0].reg << 12;
+  inst.instruction |= inst.operands[2].reg;
+  inst.is_neon = 1;
  }
  
  static void
-do_neon_qrdmlah (void)
+do_neon_dyadic_if_su (void)
  {
-  /* Check we're on the correct architecture.  */
-  if (!mark_feature_used (&fpu_neon_ext_armv8))
-    inst.error =
-      _("instruction form not available on this architecture.");
-  else if (!mark_feature_used (&fpu_neon_ext_v8_1))
-    {
-      as_warn (_("this instruction implies use of ARMv8.1 AdvSIMD."));
-      record_feature_use (&fpu_neon_ext_v8_1);
-    }
+  enum neon_shape rs = neon_select_shape (NS_DDD, NS_QQQ, NS_QQR, NS_NULL);
+  struct neon_type_el et = neon_check_type (3, rs, N_EQK , N_EQK,
+                                           N_SUF_32 | N_KEY);
  
-  if (inst.operands[2].isscalar)
+  constraint ((inst.instruction == ((unsigned) N_MNEM_vmax)
+              || inst.instruction == ((unsigned) N_MNEM_vmin))
+             && et.type == NT_float
+             && !ARM_CPU_HAS_FEATURE (cpu_variant,fpu_neon_ext_v1), BAD_FPU);
+
+  if (!check_simd_pred_availability (et.type == NT_float,
+                                    NEON_CHECK_ARCH | NEON_CHECK_CC))
+    return;
+
+  neon_dyadic_misc (NT_unsigned, N_SUF_32, 0);
+}
+
+static void
+do_neon_addsub_if_i (void)
+{
+  if (ARM_CPU_HAS_FEATURE (cpu_variant, fpu_vfp_ext_v1xd)
+      && try_vfp_nsyn (3, do_vfp_nsyn_add_sub) == SUCCESS)
+    return;
+
+  enum neon_shape rs = neon_select_shape (NS_DDD, NS_QQQ, NS_QQR, NS_NULL);
+  struct neon_type_el et = neon_check_type (3, rs, N_EQK,
+                                           N_EQK, N_IF_32 | N_I64 | N_KEY);
+
+  constraint (rs == NS_QQR && et.size == 64, BAD_FPU);
+  /* If we are parsing Q registers and the element types match MVE, which NEON
+     also supports, then we must check whether this is an instruction that can
+     be used by both MVE/NEON.  This distinction can be made based on whether
+     they are predicated or not.  */
+  if ((rs == NS_QQQ || rs == NS_QQR) && et.size != 64)
      {
-      enum neon_shape rs = neon_select_shape (NS_DDS, NS_QQS, NS_NULL);
-      struct neon_type_el et = neon_check_type (3, rs,
-       N_EQK, N_EQK, N_S16 | N_S32 | N_KEY);
-      NEON_ENCODE (SCALAR, inst);
-      neon_mul_mac (et, neon_quad (rs));
+      if (!check_simd_pred_availability (et.type == NT_float,
+                                        NEON_CHECK_ARCH | NEON_CHECK_CC))
+       return;
      }
    else
      {
-      enum neon_shape rs = neon_select_shape (NS_DDD, NS_QQQ, NS_NULL);
-      struct neon_type_el et = neon_check_type (3, rs,
-       N_EQK, N_EQK, N_S16 | N_S32 | N_KEY);
-      NEON_ENCODE (INTEGER, inst);
-      /* The U bit (rounding) comes from bit mask.  */
-      neon_three_same (neon_quad (rs), 0, et.size);
+      /* Otherwise we have D registers or 64-bit elements: check for NEON.  */
+      if (rs != NS_QQR
+         && vfp_or_neon_is_neon (NEON_CHECK_CC | NEON_CHECK_ARCH) == FAIL)
+       return;
      }
-}
  
-static void
-do_neon_fcmp_absolute (void)
-{
-  enum neon_shape rs = neon_select_shape (NS_DDD, NS_QQQ, NS_NULL);
-  struct neon_type_el et = neon_check_type (3, rs, N_EQK, N_EQK,
-                                           N_F_16_32 | N_KEY);
-  /* Size field comes from bit mask.  */
-  neon_three_same (neon_quad (rs), 1, et.size == 16 ? (int) et.size : -1);
+  /* The "untyped" case can't happen. Do this to stop the "U" bit being
+     affected if we specify unsigned args.  */
+  neon_dyadic_misc (NT_untyped, N_IF_32 | N_I64, 0);
  }
  
+/* Swap operands 1 and 2.  If operand 1 (the optional argument) was omitted,
+   we want the two-operand form:
+     V<op> A,B     (A is operand 0, B is operand 2)
+   to mean:
+     V<op> A,B,A
+   not:
+     V<op> A,B,B
+   so handle that case specially.  */
+
  static void
-do_neon_fcmp_absolute_inv (void)
+neon_exchange_operands (void)
  {
-  neon_exchange_operands ();
-  do_neon_fcmp_absolute ();
-}
+  if (inst.operands[1].present)
+    {
+      void *scratch = xmalloc (sizeof (inst.operands[0]));
  
-static void
-do_neon_step (void)
-{
-  enum neon_shape rs = neon_select_shape (NS_DDD, NS_QQQ, NS_NULL);
-  struct neon_type_el et = neon_check_type (3, rs, N_EQK, N_EQK,
-                                           N_F_16_32 | N_KEY);
-  neon_three_same (neon_quad (rs), 0, et.size == 16 ? (int) et.size : -1);
+      /* Swap operands[1] and operands[2].  */
+      memcpy (scratch, &inst.operands[1], sizeof (inst.operands[0]));
+      inst.operands[1] = inst.operands[2];
+      memcpy (&inst.operands[2], scratch, sizeof (inst.operands[0]));
+      free (scratch);
+    }
+  else
+    {
+      inst.operands[1] = inst.operands[2];
+      inst.operands[2] = inst.operands[0];
+    }
  }
  
  static void
-do_neon_abs_neg (void)
+neon_compare (unsigned regtypes, unsigned immtypes, int invert)
  {
-  enum neon_shape rs;
-  struct neon_type_el et;
-
-  if (try_vfp_nsyn (2, do_vfp_nsyn_abs_neg) == SUCCESS)
-    return;
-
-  rs = neon_select_shape (NS_DD, NS_QQ, NS_NULL);
-  et = neon_check_type (2, rs, N_EQK, N_S_32 | N_F_16_32 | N_KEY);
-
-  if (check_simd_pred_availability (et.type == NT_float,
-                                   NEON_CHECK_ARCH | NEON_CHECK_CC))
-    return;
+  if (inst.operands[2].isreg)
+    {
+      if (invert)
+       neon_exchange_operands ();
+      neon_dyadic_misc (NT_unsigned, regtypes, N_SIZ);
+    }
+  else
+    {
+      enum neon_shape rs = neon_select_shape (NS_DDI, NS_QQI, NS_NULL);
+      struct neon_type_el et = neon_check_type (2, rs,
+       N_EQK | N_SIZ, immtypes | N_KEY);
  
-  inst.instruction |= LOW4 (inst.operands[0].reg) << 12;
-  inst.instruction |= HI1 (inst.operands[0].reg) << 22;
-  inst.instruction |= LOW4 (inst.operands[1].reg);
-  inst.instruction |= HI1 (inst.operands[1].reg) << 5;
-  inst.instruction |= neon_quad (rs) << 6;
-  inst.instruction |= (et.type == NT_float) << 10;
-  inst.instruction |= neon_logbits (et.size) << 18;
+      NEON_ENCODE (IMMED, inst);
+      inst.instruction |= LOW4 (inst.operands[0].reg) << 12;
+      inst.instruction |= HI1 (inst.operands[0].reg) << 22;
+      inst.instruction |= LOW4 (inst.operands[1].reg);
+      inst.instruction |= HI1 (inst.operands[1].reg) << 5;
+      inst.instruction |= neon_quad (rs) << 6;
+      inst.instruction |= (et.type == NT_float) << 10;
+      inst.instruction |= neon_logbits (et.size) << 18;
  
-  neon_dp_fixup (&inst);
+      neon_dp_fixup (&inst);
+    }
  }
  
  static void
-do_neon_sli (void)
+do_neon_cmp (void)
  {
-  enum neon_shape rs = neon_select_shape (NS_DDI, NS_QQI, NS_NULL);
-  struct neon_type_el et = neon_check_type (2, rs,
-    N_EQK, N_8 | N_16 | N_32 | N_64 | N_KEY);
-  int imm = inst.operands[2].imm;
-  constraint (imm < 0 || (unsigned)imm >= et.size,
-             _("immediate out of range for insert"));
-  neon_imm_shift (FALSE, 0, neon_quad (rs), et, imm);
+  neon_compare (N_SUF_32, N_S_32 | N_F_16_32, FALSE);
  }
  
  static void
-do_neon_sri (void)
+do_neon_cmp_inv (void)
  {
-  enum neon_shape rs = neon_select_shape (NS_DDI, NS_QQI, NS_NULL);
-  struct neon_type_el et = neon_check_type (2, rs,
-    N_EQK, N_8 | N_16 | N_32 | N_64 | N_KEY);
-  int imm = inst.operands[2].imm;
-  constraint (imm < 1 || (unsigned)imm > et.size,
-             _("immediate out of range for insert"));
-  neon_imm_shift (FALSE, 0, neon_quad (rs), et, et.size - imm);
+  neon_compare (N_SUF_32, N_S_32 | N_F_16_32, TRUE);
  }
  
  static void
-do_neon_qshlu_imm (void)
+do_neon_ceq (void)
  {
-  enum neon_shape rs = neon_select_shape (NS_DDI, NS_QQI, NS_NULL);
-  struct neon_type_el et = neon_check_type (2, rs,
-    N_EQK | N_UNS, N_S8 | N_S16 | N_S32 | N_S64 | N_KEY);
-  int imm = inst.operands[2].imm;
-  constraint (imm < 0 || (unsigned)imm >= et.size,
-             _("immediate out of range for shift"));
-  /* Only encodes the 'U present' variant of the instruction.
-     In this case, signed types have OP (bit 8) set to 0.
-     Unsigned types have OP set to 1.  */
-  inst.instruction |= (et.type == NT_unsigned) << 8;
-  /* The rest of the bits are the same as other immediate shifts.  */
-  neon_imm_shift (FALSE, 0, neon_quad (rs), et, imm);
+  neon_compare (N_IF_32, N_IF_32, FALSE);
  }
  
-static void
-do_neon_qmovn (void)
+/* For multiply instructions, we have the possibility of 16-bit or 32-bit
+   scalars, which are encoded in 5 bits, M : Rm.
+   For 16-bit scalars, the register is encoded in Rm[2:0] and the index in
+   M:Rm[3], and for 32-bit scalars, the register is encoded in Rm[3:0] and the
+   index in M.
+
+   Dot Product instructions are similar to multiply instructions except elsize
+   should always be 32.
+
+   This function translates SCALAR, which is GAS's internal encoding of an
+   indexed scalar register, to the raw encoding.  The register and index are
+   also range-checked based on ELSIZE.  */
+
+static unsigned
+neon_scalar_for_mul (unsigned scalar, unsigned elsize)
  {
-  struct neon_type_el et = neon_check_type (2, NS_DQ,
-    N_EQK | N_HLF, N_SU_16_64 | N_KEY);
-  /* Saturating move where operands can be signed or unsigned, and the
-     destination has the same signedness.  */
-  NEON_ENCODE (INTEGER, inst);
-  if (et.type == NT_unsigned)
-    inst.instruction |= 0xc0;
-  else
-    inst.instruction |= 0x80;
-  neon_two_same (0, 1, et.size / 2);
+  unsigned regno = NEON_SCALAR_REG (scalar);
+  unsigned elno = NEON_SCALAR_INDEX (scalar);
+
+  switch (elsize)
+    {
+    case 16:
+      if (regno > 7 || elno > 3)
+       goto bad_scalar;
+      return regno | (elno << 3);
+
+    case 32:
+      if (regno > 15 || elno > 1)
+       goto bad_scalar;
+      return regno | (elno << 4);
+
+    default:
+    bad_scalar:
+      first_error (_("scalar out of range for multiply instruction"));
+    }
+
+  return 0;
  }
  
+/* Encode multiply / multiply-accumulate scalar instructions.  */
+
  static void
-do_neon_qmovun (void)
+neon_mul_mac (struct neon_type_el et, int ubit)
  {
-  struct neon_type_el et = neon_check_type (2, NS_DQ,
-    N_EQK | N_HLF | N_UNS, N_S16 | N_S32 | N_S64 | N_KEY);
-  /* Saturating move with unsigned results. Operands must be signed.  */
-  NEON_ENCODE (INTEGER, inst);
-  neon_two_same (0, 1, et.size / 2);
+  unsigned scalar;
+
+  /* Give a more helpful error message if we have an invalid type.  */
+  if (et.type == NT_invtype)
+    return;
+
+  scalar = neon_scalar_for_mul (inst.operands[2].reg, et.size);
+  inst.instruction |= LOW4 (inst.operands[0].reg) << 12;
+  inst.instruction |= HI1 (inst.operands[0].reg) << 22;
+  inst.instruction |= LOW4 (inst.operands[1].reg) << 16;
+  inst.instruction |= HI1 (inst.operands[1].reg) << 7;
+  inst.instruction |= LOW4 (scalar);
+  inst.instruction |= HI1 (scalar) << 5;
+  inst.instruction |= (et.type == NT_float) << 8;
+  inst.instruction |= neon_logbits (et.size) << 20;
+  inst.instruction |= (ubit != 0) << 24;
+
+  neon_dp_fixup (&inst);
  }
  
  static void
-do_neon_rshift_sat_narrow (void)
+do_neon_mac_maybe_scalar (void)
  {
-  /* FIXME: Types for narrowing. If operands are signed, results can be signed
-     or unsigned. If operands are unsigned, results must also be unsigned.  */
-  struct neon_type_el et = neon_check_type (2, NS_DQI,
-    N_EQK | N_HLF, N_SU_16_64 | N_KEY);
-  int imm = inst.operands[2].imm;
-  /* This gets the bounds check, size encoding and immediate bits calculation
-     right.  */
-  et.size /= 2;
+  if (try_vfp_nsyn (3, do_vfp_nsyn_mla_mls) == SUCCESS)
+    return;
  
-  /* VQ{R}SHRN.I<size> <Dd>, <Qm>, #0 is a synonym for
-     VQMOVN.I<size> <Dd>, <Qm>.  */
-  if (imm == 0)
+  if (!check_simd_pred_availability (FALSE, NEON_CHECK_CC | NEON_CHECK_ARCH))
+    return;
+
+  if (inst.operands[2].isscalar)
      {
-      inst.operands[2].present = 0;
-      inst.instruction = N_MNEM_vqmovn;
-      do_neon_qmovn ();
-      return;
+      constraint (ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext), BAD_FPU);
+      enum neon_shape rs = neon_select_shape (NS_DDS, NS_QQS, NS_NULL);
+      struct neon_type_el et = neon_check_type (3, rs,
+       N_EQK, N_EQK, N_I16 | N_I32 | N_F_16_32 | N_KEY);
+      NEON_ENCODE (SCALAR, inst);
+      neon_mul_mac (et, neon_quad (rs));
      }
+  else if (!inst.operands[2].isvec)
+    {
+      constraint (!ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext), BAD_FPU);
  
-  constraint (imm < 1 || (unsigned)imm > et.size,
-             _("immediate out of range"));
-  neon_imm_shift (TRUE, et.type == NT_unsigned, 0, et, et.size - imm);
+      enum neon_shape rs = neon_select_shape (NS_QQR, NS_NULL);
+      neon_check_type (3, rs, N_EQK, N_EQK, N_SU_MVE | N_KEY);
+
+      neon_dyadic_misc (NT_unsigned, N_SU_MVE, 0);
+    }
+  else
+    {
+      constraint (ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext), BAD_FPU);
+      /* The "untyped" case can't happen.  Do this to stop the "U" bit being
+        affected if we specify unsigned args.  */
+      neon_dyadic_misc (NT_untyped, N_IF_32, 0);
+    }
  }
  
  static void
-do_neon_rshift_sat_narrow_u (void)
+do_bfloat_vfma (void)
  {
-  /* FIXME: Types for narrowing. If operands are signed, results can be signed
-     or unsigned. If operands are unsigned, results must also be unsigned.  */
-  struct neon_type_el et = neon_check_type (2, NS_DQI,
-    N_EQK | N_HLF | N_UNS, N_S16 | N_S32 | N_S64 | N_KEY);
-  int imm = inst.operands[2].imm;
-  /* This gets the bounds check, size encoding and immediate bits calculation
-     right.  */
-  et.size /= 2;
+  constraint (!mark_feature_used (&fpu_neon_ext_armv8), _(BAD_FPU));
+  constraint (!mark_feature_used (&arm_ext_bf16), _(BAD_BF16));
+  enum neon_shape rs;
+  int t_bit = 0;
  
-  /* VQSHRUN.I<size> <Dd>, <Qm>, #0 is a synonym for
-     VQMOVUN.I<size> <Dd>, <Qm>.  */
-  if (imm == 0)
+  if (inst.instruction != B_MNEM_vfmab)
+  {
+      t_bit = 1;
+      inst.instruction = B_MNEM_vfmat;
+  }
+
+  if (inst.operands[2].isscalar)
      {
-      inst.operands[2].present = 0;
-      inst.instruction = N_MNEM_vqmovun;
-      do_neon_qmovun ();
-      return;
+      rs = neon_select_shape (NS_QQS, NS_NULL);
+      neon_check_type (3, rs, N_EQK, N_EQK, N_BF16 | N_KEY);
+
+      inst.instruction |= (1 << 25);
+      int index = inst.operands[2].reg & 0xf;
+      constraint (!(index < 4), _("index must be in the range 0 to 3"));
+      inst.operands[2].reg >>= 4;
+      constraint (!(inst.operands[2].reg < 8),
+                 _("indexed register must be less than 8"));
+      neon_three_args (t_bit);
+      inst.instruction |= ((index & 1) << 3);
+      inst.instruction |= ((index & 2) << 4);
+    }
+  else
+    {
+      rs = neon_select_shape (NS_QQQ, NS_NULL);
+      neon_check_type (3, rs, N_EQK, N_EQK, N_BF16 | N_KEY);
+      neon_three_args (t_bit);
      }
  
-  constraint (imm < 1 || (unsigned)imm > et.size,
-             _("immediate out of range"));
-  /* FIXME: The manual is kind of unclear about what value U should have in
-     VQ{R}SHRUN instructions, but U=0, op=0 definitely encodes VRSHR, so it
-     must be 1.  */
-  neon_imm_shift (TRUE, 1, 0, et, et.size - imm);
  }
  
  static void
-do_neon_movn (void)
+do_neon_fmac (void)
  {
-  struct neon_type_el et = neon_check_type (2, NS_DQ,
-    N_EQK | N_HLF, N_I16 | N_I32 | N_I64 | N_KEY);
-  NEON_ENCODE (INTEGER, inst);
-  neon_two_same (0, 1, et.size / 2);
-}
+  if (ARM_CPU_HAS_FEATURE (cpu_variant, fpu_vfp_ext_fma)
+      && try_vfp_nsyn (3, do_vfp_nsyn_fma_fms) == SUCCESS)
+    return;
  
-static void
-do_neon_rshift_narrow (void)
-{
-  struct neon_type_el et = neon_check_type (2, NS_DQI,
-    N_EQK | N_HLF, N_I16 | N_I32 | N_I64 | N_KEY);
-  int imm = inst.operands[2].imm;
-  /* This gets the bounds check, size encoding and immediate bits calculation
-     right.  */
-  et.size /= 2;
+  if (!check_simd_pred_availability (TRUE, NEON_CHECK_CC | NEON_CHECK_ARCH))
+    return;
  
-  /* If immediate is zero then we are a pseudo-instruction for
-     VMOVN.I<size> <Dd>, <Qm>  */
-  if (imm == 0)
+  if (ARM_CPU_HAS_FEATURE (cpu_variant, mve_fp_ext))
      {
-      inst.operands[2].present = 0;
-      inst.instruction = N_MNEM_vmovn;
-      do_neon_movn ();
-      return;
+      enum neon_shape rs = neon_select_shape (NS_QQQ, NS_QQR, NS_NULL);
+      struct neon_type_el et = neon_check_type (3, rs, N_F_MVE | N_KEY, N_EQK,
+                                               N_EQK);
+
+      if (rs == NS_QQR)
+       {
+
+         if (inst.operands[2].reg == REG_SP)
+           as_tsktsk (MVE_BAD_SP);
+         else if (inst.operands[2].reg == REG_PC)
+           as_tsktsk (MVE_BAD_PC);
+
+         inst.instruction = 0xee310e40;
+         inst.instruction |= (et.size == 16) << 28;
+         inst.instruction |= HI1 (inst.operands[0].reg) << 22;
+         inst.instruction |= LOW4 (inst.operands[1].reg) << 16;
+         inst.instruction |= LOW4 (inst.operands[0].reg) << 12;
+         inst.instruction |= HI1 (inst.operands[1].reg) << 6;
+         inst.instruction |= inst.operands[2].reg;
+         inst.is_neon = 1;
+         return;
+       }
+    }
+  else
+    {
+      constraint (!inst.operands[2].isvec, BAD_FPU);
      }
  
-  constraint (imm < 1 || (unsigned)imm > et.size,
-             _("immediate out of range for narrowing operation"));
-  neon_imm_shift (FALSE, 0, 0, et, et.size - imm);
+  neon_dyadic_misc (NT_untyped, N_IF_32, 0);
  }
  
  static void
-do_neon_shll (void)
+do_mve_vfma (void)
  {
-  /* FIXME: Type checking when lengthening.  */
-  struct neon_type_el et = neon_check_type (2, NS_QDI,
-    N_EQK | N_DBL, N_I8 | N_I16 | N_I32 | N_KEY);
-  unsigned imm = inst.operands[2].imm;
-
-  if (imm == et.size)
+  if (!ARM_CPU_HAS_FEATURE (cpu_variant, arm_ext_bf16) &&
+      inst.cond == COND_ALWAYS)
      {
-      /* Maximum shift variant.  */
-      NEON_ENCODE (INTEGER, inst);
-      inst.instruction |= LOW4 (inst.operands[0].reg) << 12;
-      inst.instruction |= HI1 (inst.operands[0].reg) << 22;
-      inst.instruction |= LOW4 (inst.operands[1].reg);
-      inst.instruction |= HI1 (inst.operands[1].reg) << 5;
-      inst.instruction |= neon_logbits (et.size) << 18;
-
-      neon_dp_fixup (&inst);
+      constraint (!ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext), BAD_FPU);
+      inst.instruction = N_MNEM_vfma;
+      inst.pred_insn_type = INSIDE_VPT_INSN;
+      inst.cond = 0xf;
+      return do_neon_fmac();
      }
    else
      {
-      /* A more-specific type check for non-max versions.  */
-      et = neon_check_type (2, NS_QDI,
-       N_EQK | N_DBL, N_SU_32 | N_KEY);
-      NEON_ENCODE (IMMED, inst);
-      neon_imm_shift (TRUE, et.type == NT_unsigned, 0, et, imm);
+      do_bfloat_vfma();
      }
  }
  
-/* Check the various types for the VCVT instruction, and return which version
-   the current instruction is.  */
-
-#define CVT_FLAVOUR_VAR                                                              \
-  CVT_VAR (s32_f32, N_S32, N_F32, whole_reg,   "ftosls", "ftosis", "ftosizs") \
-  CVT_VAR (u32_f32, N_U32, N_F32, whole_reg,   "ftouls", "ftouis", "ftouizs") \
-  CVT_VAR (f32_s32, N_F32, N_S32, whole_reg,   "fsltos", "fsitos", NULL)      \
-  CVT_VAR (f32_u32, N_F32, N_U32, whole_reg,   "fultos", "fuitos", NULL)      \
-  /* Half-precision conversions.  */                                         \
-  CVT_VAR (s16_f16, N_S16, N_F16 | N_KEY, whole_reg, NULL, NULL, NULL)       \
-  CVT_VAR (u16_f16, N_U16, N_F16 | N_KEY, whole_reg, NULL, NULL, NULL)       \
-  CVT_VAR (f16_s16, N_F16 | N_KEY, N_S16, whole_reg, NULL, NULL, NULL)       \
-  CVT_VAR (f16_u16, N_F16 | N_KEY, N_U16, whole_reg, NULL, NULL, NULL)       \
-  CVT_VAR (f32_f16, N_F32, N_F16, whole_reg,   NULL,     NULL,     NULL)      \
-  CVT_VAR (f16_f32, N_F16, N_F32, whole_reg,   NULL,     NULL,     NULL)      \
-  /* New VCVT instructions introduced by ARMv8.2 fp16 extension.             \
-     Compared with single/double precision variants, only the co-processor    \
-     field is different, so the encoding flow is reused here.  */            \
-  CVT_VAR (f16_s32, N_F16 | N_KEY, N_S32, N_VFP, "fsltos", "fsitos", NULL)    \
-  CVT_VAR (f16_u32, N_F16 | N_KEY, N_U32, N_VFP, "fultos", "fuitos", NULL)    \
-  CVT_VAR (u32_f16, N_U32, N_F16 | N_KEY, N_VFP, "ftouls", "ftouis", "ftouizs")\
-  CVT_VAR (s32_f16, N_S32, N_F16 | N_KEY, N_VFP, "ftosls", "ftosis", "ftosizs")\
-  /* VFP instructions.  */                                                   \
-  CVT_VAR (f32_f64, N_F32, N_F64, N_VFP,       NULL,     "fcvtsd", NULL)      \
-  CVT_VAR (f64_f32, N_F64, N_F32, N_VFP,       NULL,     "fcvtds", NULL)      \
-  CVT_VAR (s32_f64, N_S32, N_F64 | key, N_VFP, "ftosld", "ftosid", "ftosizd") \
-  CVT_VAR (u32_f64, N_U32, N_F64 | key, N_VFP, "ftould", "ftouid", "ftouizd") \
-  CVT_VAR (f64_s32, N_F64 | key, N_S32, N_VFP, "fsltod", "fsitod", NULL)      \
-  CVT_VAR (f64_u32, N_F64 | key, N_U32, N_VFP, "fultod", "fuitod", NULL)      \
-  /* VFP instructions with bitshift.  */                                     \
-  CVT_VAR (f32_s16, N_F32 | key, N_S16, N_VFP, "fshtos", NULL,     NULL)      \
-  CVT_VAR (f32_u16, N_F32 | key, N_U16, N_VFP, "fuhtos", NULL,     NULL)      \
-  CVT_VAR (f64_s16, N_F64 | key, N_S16, N_VFP, "fshtod", NULL,     NULL)      \
-  CVT_VAR (f64_u16, N_F64 | key, N_U16, N_VFP, "fuhtod", NULL,     NULL)      \
-  CVT_VAR (s16_f32, N_S16, N_F32 | key, N_VFP, "ftoshs", NULL,     NULL)      \
-  CVT_VAR (u16_f32, N_U16, N_F32 | key, N_VFP, "ftouhs", NULL,     NULL)      \
-  CVT_VAR (s16_f64, N_S16, N_F64 | key, N_VFP, "ftoshd", NULL,     NULL)      \
-  CVT_VAR (u16_f64, N_U16, N_F64 | key, N_VFP, "ftouhd", NULL,     NULL)
-
-#define CVT_VAR(C, X, Y, R, BSN, CN, ZN) \
-  neon_cvt_flavour_##C,
-
-/* The different types of conversions we can do.  */
-enum neon_cvt_flavour
+static void
+do_neon_tst (void)
  {
-  CVT_FLAVOUR_VAR
-  neon_cvt_flavour_invalid,
-  neon_cvt_flavour_first_fp = neon_cvt_flavour_f32_f64
-};
+  enum neon_shape rs = neon_select_shape (NS_DDD, NS_QQQ, NS_NULL);
+  struct neon_type_el et = neon_check_type (3, rs,
+    N_EQK, N_EQK, N_8 | N_16 | N_32 | N_KEY);
+  neon_three_same (neon_quad (rs), 0, et.size);
+}
  
-#undef CVT_VAR
+/* VMUL with 3 registers allows the P8 type. The scalar version supports the
+   same types as the MAC equivalents. The polynomial type for this instruction
+   is encoded the same as the integer type.  */
  
-static enum neon_cvt_flavour
-get_neon_cvt_flavour (enum neon_shape rs)
+static void
+do_neon_mul (void)
  {
-#define CVT_VAR(C,X,Y,R,BSN,CN,ZN)                     \
-  et = neon_check_type (2, rs, (R) | (X), (R) | (Y));  \
-  if (et.type != NT_invtype)                           \
-    {                                                  \
-      inst.error = NULL;                               \
-      return (neon_cvt_flavour_##C);                   \
-    }
+  /* Nothing more to do when try_vfp_nsyn reports SUCCESS.  */
+  if (try_vfp_nsyn (3, do_vfp_nsyn_mul) == SUCCESS)
+    return;
  
-  struct neon_type_el et;
-  unsigned whole_reg = (rs == NS_FFI || rs == NS_FD || rs == NS_DF
-                       || rs == NS_FF) ? N_VFP : 0;
-  /* The instruction versions which take an immediate take one register
-     argument, which is extended to the width of the full register. Thus the
-     "source" and "destination" registers must have the same width.  Hack that
-     here by making the size equal to the key (wider, in this case) operand.  */
-  unsigned key = (rs == NS_QQI || rs == NS_DDI || rs == NS_FFI) ? N_KEY : 0;
+  if (!check_simd_pred_availability (FALSE, NEON_CHECK_CC | NEON_CHECK_ARCH))
+    return;
  
-  CVT_FLAVOUR_VAR;
+  /* A scalar third operand is flagged with BAD_FPU when MVE is enabled;
+     otherwise the multiply-accumulate scalar encoder is reused.  */
+  if (inst.operands[2].isscalar)
+    {
+      constraint (ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext), BAD_FPU);
+      do_neon_mac_maybe_scalar ();
+    }
+  else
+    {
+      if (ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext))
+       {
+         enum neon_shape rs = neon_select_shape (NS_QQR, NS_QQQ, NS_NULL);
+         struct neon_type_el et
+           = neon_check_type (3, rs, N_EQK, N_EQK, N_I_MVE | N_F_MVE | N_KEY);
+         /* Float element types additionally need the MVE floating-point
+            extension.  */
+         if (et.type == NT_float)
+           constraint (!ARM_CPU_HAS_FEATURE (cpu_variant, mve_fp_ext),
+                       BAD_FPU);
  
-  return neon_cvt_flavour_invalid;
-#undef CVT_VAR
+         neon_dyadic_misc (NT_float, N_I_MVE | N_F_MVE, 0);
+       }
+      else
+       {
+         /* Without MVE the third operand has to be a vector register.  */
+         constraint (!inst.operands[2].isvec, BAD_FPU);
+         neon_dyadic_misc (NT_poly,
+                           N_I8 | N_I16 | N_I32 | N_F16 | N_F32 | N_P8, 0);
+       }
+    }
  }
  
-enum neon_cvt_mode
-{
-  neon_cvt_mode_a,
-  neon_cvt_mode_n,
-  neon_cvt_mode_p,
-  neon_cvt_mode_m,
-  neon_cvt_mode_z,
-  neon_cvt_mode_x,
-  neon_cvt_mode_r
-};
-
-/* Neon-syntax VFP conversions.  */
-
  static void
-do_vfp_nsyn_cvt (enum neon_shape rs, enum neon_cvt_flavour flavour)
+do_neon_qdmulh (void)
  {
-  const char *opname = 0;
+  /* Give up when check_simd_pred_availability rejects the instruction.  */
+  if (!check_simd_pred_availability (FALSE, NEON_CHECK_ARCH | NEON_CHECK_CC))
+   return;
  
-  if (rs == NS_DDI || rs == NS_QQI || rs == NS_FFI
-      || rs == NS_FHI || rs == NS_HFI)
+  if (inst.operands[2].isscalar)
      {
-      /* Conversions with immediate bitshift.  */
-      const char *enc[] =
-       {
-#define CVT_VAR(C,A,B,R,BSN,CN,ZN) BSN,
-         CVT_FLAVOUR_VAR
-         NULL
-#undef CVT_VAR
-       };
-
-      if (flavour < (int) ARRAY_SIZE (enc))
-       {
-         opname = enc[flavour];
-         constraint (inst.operands[0].reg != inst.operands[1].reg,
-                     _("operands 0 and 1 must be the same register"));
-         inst.operands[1] = inst.operands[2];
-         memset (&inst.operands[2], '\0', sizeof (inst.operands[2]));
-       }
+      /* The scalar form is flagged with BAD_FPU when MVE is enabled.  */
+      constraint (ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext), BAD_FPU);
+      enum neon_shape rs = neon_select_shape (NS_DDS, NS_QQS, NS_NULL);
+      struct neon_type_el et = neon_check_type (3, rs,
+       N_EQK, N_EQK, N_S16 | N_S32 | N_KEY);
+      NEON_ENCODE (SCALAR, inst);
+      neon_mul_mac (et, neon_quad (rs));
      }
    else
      {
-      /* Conversions without bitshift.  */
-      const char *enc[] =
+      enum neon_shape rs;
+      struct neon_type_el et;
+      /* With MVE the shapes are NS_QQR/NS_QQQ and S8 elements are allowed
+        too; without it the shapes are NS_DDD/NS_QQQ with S16 or S32.  */
+      if (ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext))
         {
-#define CVT_VAR(C,A,B,R,BSN,CN,ZN) CN,
-         CVT_FLAVOUR_VAR
-         NULL
-#undef CVT_VAR
-       };
+         rs = neon_select_shape (NS_QQR, NS_QQQ, NS_NULL);
+         et = neon_check_type (3, rs,
+           N_EQK, N_EQK, N_S8 | N_S16 | N_S32 | N_KEY);
+       }
+      else
+       {
+         rs = neon_select_shape (NS_DDD, NS_QQQ, NS_NULL);
+         et = neon_check_type (3, rs,
+           N_EQK, N_EQK, N_S16 | N_S32 | N_KEY);
+       }
  
-      if (flavour < (int) ARRAY_SIZE (enc))
-       opname = enc[flavour];
+      NEON_ENCODE (INTEGER, inst);
+      if (rs == NS_QQR)
+       mve_encode_qqr (et.size, 0, 0);
+      else
+       /* The U bit (rounding) comes from bit mask.  */
+       neon_three_same (neon_quad (rs), 0, et.size);
      }
-
-  if (opname)
-    do_vfp_nsyn_opcode (opname);
-
-  /* ARMv8.2 fp16 VCVT instruction.  */
-  if (flavour == neon_cvt_flavour_s32_f16
-      || flavour == neon_cvt_flavour_u32_f16
-      || flavour == neon_cvt_flavour_f16_u32
-      || flavour == neon_cvt_flavour_f16_s32)
-    do_scalar_fp16_v82_encode ();
  }
  
  static void
-do_vfp_nsyn_cvtz (void)
+do_mve_vaddv (void)
  {
-  enum neon_shape rs = neon_select_shape (NS_FH, NS_FF, NS_FD, NS_NULL);
-  enum neon_cvt_flavour flavour = get_neon_cvt_flavour (rs);
-  const char *enc[] =
-    {
-#define CVT_VAR(C,A,B,R,BSN,CN,ZN) ZN,
-      CVT_FLAVOUR_VAR
-      NULL
-#undef CVT_VAR
-    };
+  /* Only the NS_RQ shape is accepted, keyed on the type of operand 1.  */
+  enum neon_shape rs = neon_select_shape (NS_RQ, NS_NULL);
+  struct neon_type_el et
+    = neon_check_type (2, rs, N_EQK,  N_SU_32 | N_KEY);
  
-  if (flavour < (int) ARRAY_SIZE (enc) && enc[flavour])
-    do_vfp_nsyn_opcode (enc[flavour]);
+  if (et.type == NT_invtype)
+    first_error (BAD_EL_TYPE);
+
+  /* A condition value above COND_ALWAYS marks the instruction as
+     INSIDE_VPT_INSN.  */
+  if (inst.cond > COND_ALWAYS)
+    inst.pred_insn_type = INSIDE_VPT_INSN;
+  else
+    inst.pred_insn_type = MVE_OUTSIDE_PRED_INSN;
+
+  /* Operand 1 may not name a register above 14.  */
+  constraint (inst.operands[1].reg > 14, MVE_BAD_QREG);
+
+  mve_encode_rq (et.type == NT_unsigned, et.size);
  }
  
  static void
-do_vfp_nsyn_cvt_fpv8 (enum neon_cvt_flavour flavour,
-                     enum neon_cvt_mode mode)
+do_mve_vhcadd (void)
  {
-  int sz, op;
-  int rm;
+  enum neon_shape rs = neon_select_shape (NS_QQQI, NS_NULL);
+  struct neon_type_el et
+    = neon_check_type (3, rs, N_EQK, N_EQK, N_S8 | N_S16 | N_S32 | N_KEY);
  
-  /* Targets like FPv5-SP-D16 don't support FP v8 instructions with
-     D register operands.  */
-  if (flavour == neon_cvt_flavour_s32_f64
-      || flavour == neon_cvt_flavour_u32_f64)
-    constraint (!ARM_CPU_HAS_FEATURE (cpu_variant, fpu_vfp_ext_armv8),
-               _(BAD_FPU));
+  if (inst.cond > COND_ALWAYS)
+    inst.pred_insn_type = INSIDE_VPT_INSN;
+  else
+    inst.pred_insn_type = MVE_OUTSIDE_PRED_INSN;
  
-  if (flavour == neon_cvt_flavour_s32_f16
-      || flavour == neon_cvt_flavour_u32_f16)
-    constraint (!ARM_CPU_HAS_FEATURE (cpu_variant, arm_ext_fp16),
-               _(BAD_FP16));
+  /* The immediate is read from the first relocation expression; only the
+     values 90 and 270 are accepted.  */
+  unsigned rot = inst.relocs[0].exp.X_add_number;
+  constraint (rot != 90 && rot != 270, _("immediate out of range"));
  
-  set_pred_insn_type (OUTSIDE_PRED_INSN);
+  if (et.size == 32 && inst.operands[0].reg == inst.operands[2].reg)
+    as_tsktsk (_("Warning: 32-bit element size and same first and third "
+                "operand makes instruction UNPREDICTABLE"));
  
-  switch (flavour)
-    {
-    case neon_cvt_flavour_s32_f64:
-      sz = 1;
-      op = 1;
+  mve_encode_qqq (0, et.size);
+  /* Bit 12 is set when the immediate is 270.  */
+  inst.instruction |= (rot == 270) << 12;
+  inst.is_neon = 1;
+}
+
+/* Encode an MVE vqdmull of shape NS_QQQ or NS_QQR with S16 or S32 elements.
+   BAD_MVE_SRCDEST is reported through as_tsktsk when the elements are 32
+   bits wide and operand 0 is also used as a source operand.  */
+static void
+do_mve_vqdmull (void)
+{
+  enum neon_shape shape = neon_select_shape (NS_QQQ, NS_QQR, NS_NULL);
+  struct neon_type_el el
+    = neon_check_type (3, shape, N_EQK, N_EQK, N_S16 | N_S32 | N_KEY);
+  int wide = el.size == 32;
+
+  if (wide)
+    {
+      int clash = inst.operands[0].reg == inst.operands[1].reg;
+
+      /* Operand 2 is only compared for the NS_QQQ shape.  */
+      if (!clash && shape == NS_QQQ)
+       clash = inst.operands[0].reg == inst.operands[2].reg;
+
+      if (clash)
+       as_tsktsk (BAD_MVE_SRCDEST);
+    }
+
+  inst.pred_insn_type = (inst.cond > COND_ALWAYS
+                        ? INSIDE_VPT_INSN : MVE_OUTSIDE_PRED_INSN);
+
+  if (shape != NS_QQQ)
+    {
+      mve_encode_qqr (64, wide, 0);
+      inst.instruction |= 0x3 << 5;
+      return;
+    }
+
+  mve_encode_qqq (wide, 64);
+  inst.instruction |= 1;
+}
+
+/* Encode MVE vadc: shape NS_QQQ keyed on an I32 first operand.  An invalid
+   element type is reported as BAD_EL_TYPE.  */
+static void
+do_mve_vadc (void)
+{
+  enum neon_shape shape = neon_select_shape (NS_QQQ, NS_NULL);
+  struct neon_type_el el
+    = neon_check_type (3, shape, N_I32 | N_KEY, N_EQK, N_EQK);
+
+  if (el.type == NT_invtype)
+    first_error (BAD_EL_TYPE);
+
+  inst.pred_insn_type = (inst.cond > COND_ALWAYS
+                        ? INSIDE_VPT_INSN : MVE_OUTSIDE_PRED_INSN);
+
+  mve_encode_qqq (0, 64);
+}
+
+/* Encode MVE vbrsr: shape NS_QQR with 8-, 16- or 32-bit elements.  */
+static void
+do_mve_vbrsr (void)
+{
+  enum neon_shape shape = neon_select_shape (NS_QQR, NS_NULL);
+  struct neon_type_el el
+    = neon_check_type (3, shape, N_EQK, N_EQK, N_8 | N_16 | N_32 | N_KEY);
+
+  inst.pred_insn_type = (inst.cond > COND_ALWAYS
+                        ? INSIDE_VPT_INSN : MVE_OUTSIDE_PRED_INSN);
+
+  mve_encode_qqr (el.size, 0, 0);
+}
+
+/* Encode MVE vsbc: shape NS_QQQ with an I32 key type.  Only the side effects
+   of the type check are wanted; its result is not used.  */
+static void
+do_mve_vsbc (void)
+{
+  (void) neon_check_type (3, NS_QQQ, N_EQK, N_EQK, N_I32 | N_KEY);
+
+  inst.pred_insn_type = (inst.cond > COND_ALWAYS
+                        ? INSIDE_VPT_INSN : MVE_OUTSIDE_PRED_INSN);
+
+  mve_encode_qqq (1, 64);
+}
+
+/* Encode MVE vmulh: shape NS_QQQ with an N_SU_MVE key type.  The first
+   encoder argument is set for unsigned element types.  */
+static void
+do_mve_vmulh (void)
+{
+  enum neon_shape shape = neon_select_shape (NS_QQQ, NS_NULL);
+  struct neon_type_el el
+    = neon_check_type (3, shape, N_EQK, N_EQK, N_SU_MVE | N_KEY);
+  int is_unsigned = el.type == NT_unsigned;
+
+  inst.pred_insn_type = (inst.cond > COND_ALWAYS
+                        ? INSIDE_VPT_INSN : MVE_OUTSIDE_PRED_INSN);
+
+  mve_encode_qqq (is_unsigned, el.size);
+}
+
+/* Encode MVE vqdmlah: shape NS_QQR with an N_S_32 key type.  */
+static void
+do_mve_vqdmlah (void)
+{
+  enum neon_shape shape = neon_select_shape (NS_QQR, NS_NULL);
+  struct neon_type_el el
+    = neon_check_type (3, shape, N_EQK, N_EQK, N_S_32 | N_KEY);
+  int is_unsigned = el.type == NT_unsigned;
+
+  inst.pred_insn_type = (inst.cond > COND_ALWAYS
+                        ? INSIDE_VPT_INSN : MVE_OUTSIDE_PRED_INSN);
+
+  mve_encode_qqr (el.size, is_unsigned, 0);
+}
+
+/* Encode MVE vqdmladh: shape NS_QQQ with S8, S16 or S32 elements.  */
+static void
+do_mve_vqdmladh (void)
+{
+  enum neon_shape shape = neon_select_shape (NS_QQQ, NS_NULL);
+  struct neon_type_el el
+    = neon_check_type (3, shape, N_EQK, N_EQK, N_S8 | N_S16 | N_S32 | N_KEY);
+
+  inst.pred_insn_type = (inst.cond > COND_ALWAYS
+                        ? INSIDE_VPT_INSN : MVE_OUTSIDE_PRED_INSN);
+
+  mve_encode_qqq (0, el.size);
+}
+
+
+/* Encode MVE vmull, or fall back to Neon vmul.  When inst.cond is
+   COND_ALWAYS and inst.instruction is M_MNEM_vmullt, the instruction is
+   re-encoded as N_MNEM_vmul with condition 0xb through do_neon_mul unless
+   the operands have shape NS_QQQ and MVE is available.  Every other case is
+   encoded as an MVE instruction, which requires shape NS_QQQ.  */
+static void
+do_mve_vmull (void)
+{
+  enum neon_shape shape = neon_select_shape (NS_HHH, NS_FFF, NS_DDD, NS_DDS,
+                                            NS_QQS, NS_QQQ, NS_QQR, NS_NULL);
+  struct neon_type_el el;
+
+  if (inst.cond == COND_ALWAYS
+      && ((unsigned) inst.instruction) == M_MNEM_vmullt
+      && (shape != NS_QQQ || !ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext)))
+    {
+      /* Neon vmul carrying condition code 0xb.  */
+      inst.instruction = N_MNEM_vmul;
+      inst.cond = 0xb;
+      if (thumb_mode)
+       inst.pred_insn_type = INSIDE_IT_INSN;
+      do_neon_mul ();
+      return;
+    }
+
+  constraint (shape != NS_QQQ, BAD_FPU);
+  el = neon_check_type (3, shape, N_EQK, N_EQK,
+                       N_SU_32 | N_P8 | N_P16 | N_KEY);
+
+  /* We are dealing with MVE's vmullt.  */
+  if (el.size == 32)
+    {
+      if (inst.operands[0].reg == inst.operands[1].reg
+         || inst.operands[0].reg == inst.operands[2].reg)
+       as_tsktsk (BAD_MVE_SRCDEST);
+    }
+
+  inst.pred_insn_type = (inst.cond > COND_ALWAYS
+                        ? INSIDE_VPT_INSN : MVE_OUTSIDE_PRED_INSN);
+
+  if (el.type != NT_poly)
+    mve_encode_qqq (el.type == NT_unsigned, el.size);
+  else
+    mve_encode_qqq (neon_logbits (el.size), 64);
+}
+
+/* Encode MVE vabav.  Nothing is encoded when the operands do not have shape
+   NS_RQQ or when MVE is not available.  */
+static void
+do_mve_vabav (void)
+{
+  struct neon_type_el el;
+
+  if (neon_select_shape (NS_RQQ, NS_NULL) == NS_NULL
+      || !ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext))
+    return;
+
+  el = neon_check_type (2, NS_NULL, N_EQK, N_KEY | N_S8
+                       | N_S16 | N_S32 | N_U8 | N_U16
+                       | N_U32);
+
+  inst.pred_insn_type = (inst.cond > COND_ALWAYS
+                        ? INSIDE_VPT_INSN : MVE_OUTSIDE_PRED_INSN);
+
+  mve_encode_rqq (el.type == NT_unsigned, el.size);
+}
+
+/* Encode the MVE vmladav/vmlsdav family (shape NS_RQQ).  Unsigned element
+   types are reported as BAD_SIMD_TYPE for vmladavx, vmladavax and all of the
+   vmlsdav mnemonics.  */
+static void
+do_mve_vmladav (void)
+{
+  enum neon_shape shape = neon_select_shape (NS_RQQ, NS_NULL);
+  struct neon_type_el el
+    = neon_check_type (3, shape, N_EQK, N_EQK, N_SU_MVE | N_KEY);
+  int is_unsigned = el.type == NT_unsigned;
+  int is_vmlsdav = (inst.instruction == M_MNEM_vmlsdav
+                   || inst.instruction == M_MNEM_vmlsdava
+                   || inst.instruction == M_MNEM_vmlsdavx
+                   || inst.instruction == M_MNEM_vmlsdavax);
+
+  if (is_unsigned
+      && (inst.instruction == M_MNEM_vmladavx
+         || inst.instruction == M_MNEM_vmladavax
+         || is_vmlsdav))
+    first_error (BAD_SIMD_TYPE);
+
+  constraint (inst.operands[2].reg > 14,
+             _("MVE vector register in the range [Q0..Q7] expected"));
+
+  inst.pred_insn_type = (inst.cond > COND_ALWAYS
+                        ? INSIDE_VPT_INSN : MVE_OUTSIDE_PRED_INSN);
+
+  /* The 8-bit element flag goes in bit 28 for the vmlsdav mnemonics and in
+     bit 8 for the others.  */
+  inst.instruction |= (el.size == 8) << (is_vmlsdav ? 28 : 8);
+
+  mve_encode_rqq (is_unsigned, 64);
+  inst.instruction |= (el.size == 32) << 16;
+}
+
+/* Encode the MVE vmlaldav/vmlsldav family (shape NS_RRQQ, 16- or 32-bit
+   signed or unsigned elements).  Unsigned element types are reported as
+   BAD_SIMD_TYPE for the vmlsldav mnemonics.  */
+static void
+do_mve_vmlaldav (void)
+{
+  enum neon_shape shape = neon_select_shape (NS_RRQQ, NS_NULL);
+  struct neon_type_el el
+    = neon_check_type (4, shape, N_EQK, N_EQK, N_EQK,
+                      N_S16 | N_S32 | N_U16 | N_U32 | N_KEY);
+  int is_unsigned = el.type == NT_unsigned;
+
+  if (is_unsigned)
+    {
+      if (inst.instruction == M_MNEM_vmlsldav
+         || inst.instruction == M_MNEM_vmlsldava
+         || inst.instruction == M_MNEM_vmlsldavx
+         || inst.instruction == M_MNEM_vmlsldavax)
+       first_error (BAD_SIMD_TYPE);
+    }
+
+  inst.pred_insn_type = (inst.cond > COND_ALWAYS
+                        ? INSIDE_VPT_INSN : MVE_OUTSIDE_PRED_INSN);
+
+  mve_encode_rrqq (is_unsigned, el.size);
+}
+
+/* Encode the MVE vrmlaldavh/vrmlsldavh family (shape NS_RRQQ).  Only plain
+   vrmlaldavh/vrmlaldavha accept U32 as well as S32; all other mnemonics
+   handled here take S32 only.  */
+static void
+do_mve_vrmlaldavh (void)
+{
+  struct neon_type_el el;
+  int is_vrmlsldavh = (inst.instruction == M_MNEM_vrmlsldavh
+                      || inst.instruction == M_MNEM_vrmlsldavha
+                      || inst.instruction == M_MNEM_vrmlsldavhx
+                      || inst.instruction == M_MNEM_vrmlsldavhax);
+
+  if (is_vrmlsldavh
+      || inst.instruction == M_MNEM_vrmlaldavhx
+      || inst.instruction == M_MNEM_vrmlaldavhax)
+    el = neon_check_type (4, NS_RRQQ, N_EQK, N_EQK, N_EQK, N_S32 | N_KEY);
+  else
+    el = neon_check_type (4, NS_RRQQ, N_EQK, N_EQK, N_EQK,
+                         N_U32 | N_S32 | N_KEY);
+
+  if (is_vrmlsldavh)
+    {
+      if (inst.operands[1].reg == REG_SP)
+       as_tsktsk (MVE_BAD_SP);
+    }
+  else
+    /* vrmlaldavh's encoding with SP as the second, odd, GPR operand may alias
+       with vmax/min instructions, making the use of SP in assembly really
+       nonsensical, so instead of issuing a warning like we do for other uses
+       of SP for the odd register operand we error out.  */
+    constraint (inst.operands[1].reg == REG_SP, BAD_SP);
+
+  /* Make sure we still check the second operand is an odd one and that PC is
+     disallowed.  This because we are parsing for any GPR operand, to be able
+     to distinguish between giving a warning or an error for SP as described
+     above.  */
+  constraint ((inst.operands[1].reg % 2) != 1, BAD_EVEN);
+  constraint (inst.operands[1].reg == REG_PC, BAD_PC);
+
+  inst.pred_insn_type = (inst.cond > COND_ALWAYS
+                        ? INSIDE_VPT_INSN : MVE_OUTSIDE_PRED_INSN);
+
+  mve_encode_rrqq (el.type == NT_unsigned, 0);
+}
+
+
+/* Encode MVE vmaxnmv-style instructions: shape NS_RQ with an N_F_MVE key
+   type.  SP or PC as operand 0 is reported through as_tsktsk.  */
+static void
+do_mve_vmaxnmv (void)
+{
+  enum neon_shape shape = neon_select_shape (NS_RQ, NS_NULL);
+  struct neon_type_el el
+    = neon_check_type (2, shape, N_EQK, N_F_MVE | N_KEY);
+
+  inst.pred_insn_type = (inst.cond > COND_ALWAYS
+                        ? INSIDE_VPT_INSN : MVE_OUTSIDE_PRED_INSN);
+
+  switch (inst.operands[0].reg)
+    {
+    case REG_SP:
+      as_tsktsk (MVE_BAD_SP);
+      break;
+    case REG_PC:
+      as_tsktsk (MVE_BAD_PC);
+      break;
+    default:
+      break;
+    }
+
+  mve_encode_rq (el.size == 16, 64);
+}
+
+/* Encode MVE vmaxv/vminv-style instructions (shape NS_RQ).  vmaxv and vminv
+   take the N_SU_MVE types; any other mnemonic handled here takes S8, S16 or
+   S32 only.  SP or PC as operand 0 is reported through as_tsktsk.  */
+static void
+do_mve_vmaxv (void)
+{
+  enum neon_shape shape = neon_select_shape (NS_RQ, NS_NULL);
+  struct neon_type_el el;
+
+  if (inst.instruction != M_MNEM_vmaxv && inst.instruction != M_MNEM_vminv)
+    el = neon_check_type (2, shape, N_EQK, N_S8 | N_S16 | N_S32 | N_KEY);
+  else
+    el = neon_check_type (2, shape, N_EQK, N_SU_MVE | N_KEY);
+
+  inst.pred_insn_type = (inst.cond > COND_ALWAYS
+                        ? INSIDE_VPT_INSN : MVE_OUTSIDE_PRED_INSN);
+
+  switch (inst.operands[0].reg)
+    {
+    case REG_SP:
+      as_tsktsk (MVE_BAD_SP);
+      break;
+    case REG_PC:
+      as_tsktsk (MVE_BAD_PC);
+      break;
+    default:
+      break;
+    }
+
+  mve_encode_rq (el.type == NT_unsigned, el.size);
+}
+
+
+/* Encode vqrdmlah-style instructions.  With MVE only the NS_QQR shape is
+   accepted.  Without MVE the Neon scalar or three-register form is encoded,
+   after checking the ARMv8 and ARMv8.1 AdvSIMD features.  */
+static void
+do_neon_qrdmlah (void)
+{
+  enum neon_shape shape;
+  struct neon_type_el el;
+
+  if (!check_simd_pred_availability (FALSE, NEON_CHECK_ARCH | NEON_CHECK_CC))
+    return;
+
+  if (ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext))
+    {
+      shape = neon_select_shape (NS_QQR, NS_NULL);
+      el = neon_check_type (3, shape, N_EQK, N_EQK, N_S_32 | N_KEY);
+
+      NEON_ENCODE (INTEGER, inst);
+      mve_encode_qqr (el.size, el.type == NT_unsigned, 0);
+      return;
+    }
+
+  /* Check we're on the correct architecture.  */
+  if (!mark_feature_used (&fpu_neon_ext_armv8))
+    inst.error
+      = _("instruction form not available on this architecture.");
+  else if (!mark_feature_used (&fpu_neon_ext_v8_1))
+    {
+      as_warn (_("this instruction implies use of ARMv8.1 AdvSIMD."));
+      record_feature_use (&fpu_neon_ext_v8_1);
+    }
+
+  /* Encoding proceeds whatever the outcome of the feature checks above.  */
+  if (!inst.operands[2].isscalar)
+    {
+      shape = neon_select_shape (NS_DDD, NS_QQQ, NS_NULL);
+      el = neon_check_type (3, shape,
+                           N_EQK, N_EQK, N_S16 | N_S32 | N_KEY);
+      NEON_ENCODE (INTEGER, inst);
+      /* The U bit (rounding) comes from bit mask.  */
+      neon_three_same (neon_quad (shape), 0, el.size);
+    }
+  else
+    {
+      shape = neon_select_shape (NS_DDS, NS_QQS, NS_NULL);
+      el = neon_check_type (3, shape,
+                           N_EQK, N_EQK, N_S16 | N_S32 | N_KEY);
+      NEON_ENCODE (SCALAR, inst);
+      neon_mul_mac (el, neon_quad (shape));
+    }
+}
+
+/* Encode a three-register (NS_DDD or NS_QQQ) floating-point instruction with
+   an N_F_16_32 key type.  neon_three_same is given the element size only
+   when it is 16, and -1 otherwise.  */
+static void
+do_neon_fcmp_absolute (void)
+{
+  enum neon_shape shape = neon_select_shape (NS_DDD, NS_QQQ, NS_NULL);
+  struct neon_type_el el = neon_check_type (3, shape, N_EQK, N_EQK,
+                                           N_F_16_32 | N_KEY);
+  /* Size field comes from bit mask.  */
+  int size_arg = -1;
+
+  if (el.size == 16)
+    size_arg = (int) el.size;
+
+  neon_three_same (neon_quad (shape), 1, size_arg);
+}
+
+/* Same as do_neon_fcmp_absolute, but neon_exchange_operands is called
+   first.  */
+static void
+do_neon_fcmp_absolute_inv (void)
+{
+  neon_exchange_operands ();
+  do_neon_fcmp_absolute ();
+}
+
+/* Encode a three-register (NS_DDD or NS_QQQ) instruction with an N_F_16_32
+   key type.  neon_three_same is given the element size only when it is 16,
+   and -1 otherwise.  */
+static void
+do_neon_step (void)
+{
+  enum neon_shape shape = neon_select_shape (NS_DDD, NS_QQQ, NS_NULL);
+  struct neon_type_el el = neon_check_type (3, shape, N_EQK, N_EQK,
+                                           N_F_16_32 | N_KEY);
+  int size_arg = -1;
+
+  if (el.size == 16)
+    size_arg = (int) el.size;
+
+  neon_three_same (neon_quad (shape), 0, size_arg);
+}
+
+/* Encode a two-register (NS_DD or NS_QQ) instruction with an N_S_32 or
+   N_F_16_32 key type, unless try_vfp_nsyn has already handled it.  */
+static void
+do_neon_abs_neg (void)
+{
+  enum neon_shape shape;
+  struct neon_type_el el;
+
+  if (try_vfp_nsyn (2, do_vfp_nsyn_abs_neg) == SUCCESS)
+    return;
+
+  shape = neon_select_shape (NS_DD, NS_QQ, NS_NULL);
+  el = neon_check_type (2, shape, N_EQK, N_S_32 | N_F_16_32 | N_KEY);
+
+  if (!check_simd_pred_availability (el.type == NT_float,
+                                    NEON_CHECK_ARCH | NEON_CHECK_CC))
+    return;
+
+  /* Operand 0: low four bits at bit 12, high bit at bit 22.  */
+  inst.instruction |= HI1 (inst.operands[0].reg) << 22;
+  inst.instruction |= LOW4 (inst.operands[0].reg) << 12;
+  /* Operand 1: low four bits at bit 0, high bit at bit 5.  */
+  inst.instruction |= HI1 (inst.operands[1].reg) << 5;
+  inst.instruction |= LOW4 (inst.operands[1].reg);
+  /* Element size, float flag and quad flag.  */
+  inst.instruction |= neon_logbits (el.size) << 18;
+  inst.instruction |= (el.type == NT_float) << 10;
+  inst.instruction |= neon_quad (shape) << 6;
+
+  neon_dp_fixup (&inst);
+}
+
+/* Encode an insert-style shift by immediate.  With MVE only the NS_QQI shape
+   and 8-, 16- or 32-bit elements are accepted; otherwise NS_DDI and 64-bit
+   elements are allowed too.  The immediate must lie in [0, size - 1].  */
+static void
+do_neon_sli (void)
+{
+  enum neon_shape shape;
+  struct neon_type_el el;
+  int shift;
+
+  if (!check_simd_pred_availability (FALSE, NEON_CHECK_ARCH | NEON_CHECK_CC))
+    return;
+
+  if (!ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext))
+    {
+      shape = neon_select_shape (NS_DDI, NS_QQI, NS_NULL);
+      el = neon_check_type (2, shape, N_EQK,
+                           N_8 | N_16 | N_32 | N_64 | N_KEY);
+    }
+  else
+    {
+      shape = neon_select_shape (NS_QQI, NS_NULL);
+      el = neon_check_type (2, shape, N_EQK, N_8 | N_16 | N_32 | N_KEY);
+    }
+
+  shift = inst.operands[2].imm;
+  constraint (!(shift >= 0 && (unsigned) shift < el.size),
+             _("immediate out of range for insert"));
+  neon_imm_shift (FALSE, 0, neon_quad (shape), el, shift);
+}
+
+/* Encode an insert-style shift by immediate whose immediate must lie in
+   [1, size]; the value handed to neon_imm_shift is size - immediate.  With
+   MVE only the NS_QQI shape and 8-, 16- or 32-bit elements are accepted;
+   otherwise NS_DDI and 64-bit elements are allowed too.  */
+static void
+do_neon_sri (void)
+{
+  enum neon_shape shape;
+  struct neon_type_el el;
+  int shift;
+
+  if (!check_simd_pred_availability (FALSE, NEON_CHECK_ARCH | NEON_CHECK_CC))
+    return;
+
+  if (!ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext))
+    {
+      shape = neon_select_shape (NS_DDI, NS_QQI, NS_NULL);
+      el = neon_check_type (2, shape, N_EQK,
+                           N_8 | N_16 | N_32 | N_64 | N_KEY);
+    }
+  else
+    {
+      shape = neon_select_shape (NS_QQI, NS_NULL);
+      el = neon_check_type (2, shape, N_EQK, N_8 | N_16 | N_32 | N_KEY);
+    }
+
+  shift = inst.operands[2].imm;
+  constraint (!(shift >= 1 && (unsigned) shift <= el.size),
+             _("immediate out of range for insert"));
+  neon_imm_shift (FALSE, 0, neon_quad (shape), el, el.size - shift);
+}
+
+/* Encode a shift by immediate with signed source elements.  With MVE only
+   the NS_QQI shape and S8, S16 or S32 are accepted; otherwise NS_DDI and S64
+   are allowed too.  The immediate must lie in [0, size - 1].  */
+static void
+do_neon_qshlu_imm (void)
+{
+  enum neon_shape shape;
+  struct neon_type_el el;
+  int shift;
+
+  if (!check_simd_pred_availability (FALSE, NEON_CHECK_ARCH | NEON_CHECK_CC))
+    return;
+
+  if (!ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext))
+    {
+      shape = neon_select_shape (NS_DDI, NS_QQI, NS_NULL);
+      el = neon_check_type (2, shape, N_EQK | N_UNS,
+                           N_S8 | N_S16 | N_S32 | N_S64 | N_KEY);
+    }
+  else
+    {
+      shape = neon_select_shape (NS_QQI, NS_NULL);
+      el = neon_check_type (2, shape, N_EQK, N_S8 | N_S16 | N_S32 | N_KEY);
+    }
+
+  shift = inst.operands[2].imm;
+  constraint (!(shift >= 0 && (unsigned) shift < el.size),
+             _("immediate out of range for shift"));
+
+  /* Only encodes the 'U present' variant of the instruction.
+     In this case, signed types have OP (bit 8) set to 0.
+     Unsigned types have OP set to 1.  */
+  inst.instruction |= (el.type == NT_unsigned) << 8;
+
+  /* The rest of the bits are the same as other immediate shifts.  */
+  neon_imm_shift (FALSE, 0, neon_quad (shape), el, shift);
+}
+
+/* Saturating move where operands can be signed or unsigned, and the
+   destination has the same signedness.  The shape is NS_DQ and the size
+   passed to the encoder is half the key element size.  */
+static void
+do_neon_qmovn (void)
+{
+  struct neon_type_el el
+    = neon_check_type (2, NS_DQ, N_EQK | N_HLF, N_SU_16_64 | N_KEY);
+
+  NEON_ENCODE (INTEGER, inst);
+  /* 0xc0 is OR-ed in for unsigned types, 0x80 otherwise.  */
+  inst.instruction |= (el.type == NT_unsigned) ? 0xc0 : 0x80;
+  neon_two_same (0, 1, el.size / 2);
+}
+
+/* Saturating move with unsigned results.  Operands must be signed.  The
+   shape is NS_DQ and the size passed to the encoder is half the key element
+   size.  */
+static void
+do_neon_qmovun (void)
+{
+  struct neon_type_el el
+    = neon_check_type (2, NS_DQ, N_EQK | N_HLF | N_UNS,
+                      N_S16 | N_S32 | N_S64 | N_KEY);
+
+  NEON_ENCODE (INTEGER, inst);
+  neon_two_same (0, 1, el.size / 2);
+}
+
+/* Encode a saturating narrowing right shift by immediate (shape NS_DQI).
+   An immediate of zero is re-dispatched to do_neon_qmovn.  */
+static void
+do_neon_rshift_sat_narrow (void)
+{
+  /* FIXME: Types for narrowing. If operands are signed, results can be signed
+     or unsigned. If operands are unsigned, results must also be unsigned.  */
+  struct neon_type_el el
+    = neon_check_type (2, NS_DQI, N_EQK | N_HLF, N_SU_16_64 | N_KEY);
+  int shift = inst.operands[2].imm;
+
+  /* VQ{R}SHRN.I<size> <Dd>, <Qm>, #0 is a synonym for
+     VQMOVN.I<size> <Dd>, <Qm>.  */
+  if (shift == 0)
+    {
+      inst.operands[2].present = 0;
+      inst.instruction = N_MNEM_vqmovn;
+      do_neon_qmovn ();
+      return;
+    }
+
+  /* This gets the bounds check, size encoding and immediate bits calculation
+     right.  */
+  el.size /= 2;
+
+  constraint (!(shift >= 1 && (unsigned) shift <= el.size),
+             _("immediate out of range"));
+  neon_imm_shift (TRUE, el.type == NT_unsigned, 0, el, el.size - shift);
+}
+
+/* Encode a saturating narrowing right shift by immediate with signed
+   operands and unsigned results (shape NS_DQI).  An immediate of zero is
+   re-dispatched to do_neon_qmovun.  */
+static void
+do_neon_rshift_sat_narrow_u (void)
+{
+  /* FIXME: Types for narrowing. If operands are signed, results can be signed
+     or unsigned. If operands are unsigned, results must also be unsigned.  */
+  struct neon_type_el el
+    = neon_check_type (2, NS_DQI, N_EQK | N_HLF | N_UNS,
+                      N_S16 | N_S32 | N_S64 | N_KEY);
+  int shift = inst.operands[2].imm;
+
+  /* VQSHRUN.I<size> <Dd>, <Qm>, #0 is a synonym for
+     VQMOVUN.I<size> <Dd>, <Qm>.  */
+  if (shift == 0)
+    {
+      inst.operands[2].present = 0;
+      inst.instruction = N_MNEM_vqmovun;
+      do_neon_qmovun ();
+      return;
+    }
+
+  /* This gets the bounds check, size encoding and immediate bits calculation
+     right.  */
+  el.size /= 2;
+
+  constraint (!(shift >= 1 && (unsigned) shift <= el.size),
+             _("immediate out of range"));
+  /* FIXME: The manual is kind of unclear about what value U should have in
+     VQ{R}SHRUN instructions, but U=0, op=0 definitely encodes VRSHR, so it
+     must be 1.  */
+  neon_imm_shift (TRUE, 1, 0, el, el.size - shift);
+}
+
+/* Encode a narrowing move (shape NS_DQ) with an I16, I32 or I64 key type.
+   The size passed to the encoder is half the key element size.  */
+static void
+do_neon_movn (void)
+{
+  struct neon_type_el el
+    = neon_check_type (2, NS_DQ, N_EQK | N_HLF,
+                      N_I16 | N_I32 | N_I64 | N_KEY);
+
+  NEON_ENCODE (INTEGER, inst);
+  neon_two_same (0, 1, el.size / 2);
+}
+
+/* Encode a narrowing right shift by immediate (shape NS_DQI).  An immediate
+   of zero is re-dispatched to do_neon_movn.  */
+static void
+do_neon_rshift_narrow (void)
+{
+  struct neon_type_el el
+    = neon_check_type (2, NS_DQI, N_EQK | N_HLF,
+                      N_I16 | N_I32 | N_I64 | N_KEY);
+  int shift = inst.operands[2].imm;
+
+  /* If immediate is zero then we are a pseudo-instruction for
+     VMOVN.I<size> <Dd>, <Qm>  */
+  if (shift == 0)
+    {
+      inst.operands[2].present = 0;
+      inst.instruction = N_MNEM_vmovn;
+      do_neon_movn ();
+      return;
+    }
+
+  /* This gets the bounds check, size encoding and immediate bits calculation
+     right.  */
+  el.size /= 2;
+
+  constraint (!(shift >= 1 && (unsigned) shift <= el.size),
+             _("immediate out of range for narrowing operation"));
+  neon_imm_shift (FALSE, 0, 0, el, el.size - shift);
+}
+
+/* Encode a lengthening shift left by immediate (shape NS_QDI).  An immediate
+   equal to the element size selects the maximum-shift encoding; any other
+   value is re-checked against N_SU_32 and encoded as an immediate shift.  */
+static void
+do_neon_shll (void)
+{
+  /* FIXME: Type checking when lengthening.  */
+  struct neon_type_el el
+    = neon_check_type (2, NS_QDI, N_EQK | N_DBL,
+                      N_I8 | N_I16 | N_I32 | N_KEY);
+  unsigned shift = inst.operands[2].imm;
+
+  if (shift != el.size)
+    {
+      /* A more-specific type check for non-max versions.  */
+      el = neon_check_type (2, NS_QDI, N_EQK | N_DBL, N_SU_32 | N_KEY);
+      NEON_ENCODE (IMMED, inst);
+      neon_imm_shift (TRUE, el.type == NT_unsigned, 0, el, shift);
+      return;
+    }
+
+  /* Maximum shift variant.  */
+  NEON_ENCODE (INTEGER, inst);
+  inst.instruction |= HI1 (inst.operands[0].reg) << 22;
+  inst.instruction |= LOW4 (inst.operands[0].reg) << 12;
+  inst.instruction |= HI1 (inst.operands[1].reg) << 5;
+  inst.instruction |= LOW4 (inst.operands[1].reg);
+  inst.instruction |= neon_logbits (el.size) << 18;
+
+  neon_dp_fixup (&inst);
+}
+
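+/* Check the various types for the VCVT instruction, and return which version
+   the current instruction is.
+
+   CVT_FLAVOUR_VAR lists every flavour as CVT_VAR (C, X, Y, R, BSN, CN, ZN).
+   C names the flavour.  X and Y are the first and second type masks handed
+   to neon_check_type, and R is OR-ed into both of them.  BSN, CN and ZN are
+   the opcode names used for the immediate-bitshift form, the form without
+   bitshift and do_vfp_nsyn_cvtz respectively (NULL when there is none).  */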
+
+#define CVT_FLAVOUR_VAR                                                              \
+  CVT_VAR (s32_f32, N_S32, N_F32, whole_reg,   "ftosls", "ftosis", "ftosizs") \
+  CVT_VAR (u32_f32, N_U32, N_F32, whole_reg,   "ftouls", "ftouis", "ftouizs") \
+  CVT_VAR (f32_s32, N_F32, N_S32, whole_reg,   "fsltos", "fsitos", NULL)      \
+  CVT_VAR (f32_u32, N_F32, N_U32, whole_reg,   "fultos", "fuitos", NULL)      \
+  /* Half-precision conversions.  */                                         \
+  CVT_VAR (s16_f16, N_S16, N_F16 | N_KEY, whole_reg, NULL, NULL, NULL)       \
+  CVT_VAR (u16_f16, N_U16, N_F16 | N_KEY, whole_reg, NULL, NULL, NULL)       \
+  CVT_VAR (f16_s16, N_F16 | N_KEY, N_S16, whole_reg, NULL, NULL, NULL)       \
+  CVT_VAR (f16_u16, N_F16 | N_KEY, N_U16, whole_reg, NULL, NULL, NULL)       \
+  CVT_VAR (f32_f16, N_F32, N_F16, whole_reg,   NULL,     NULL,     NULL)      \
+  CVT_VAR (f16_f32, N_F16, N_F32, whole_reg,   NULL,     NULL,     NULL)      \
+  /* New VCVT instructions introduced by ARMv8.2 fp16 extension.             \
+     Compared with single/double precision variants, only the co-processor    \
+     field is different, so the encoding flow is reused here.  */            \
+  CVT_VAR (f16_s32, N_F16 | N_KEY, N_S32, N_VFP, "fsltos", "fsitos", NULL)    \
+  CVT_VAR (f16_u32, N_F16 | N_KEY, N_U32, N_VFP, "fultos", "fuitos", NULL)    \
+  CVT_VAR (u32_f16, N_U32, N_F16 | N_KEY, N_VFP, "ftouls", "ftouis", "ftouizs")\
+  CVT_VAR (s32_f16, N_S32, N_F16 | N_KEY, N_VFP, "ftosls", "ftosis", "ftosizs")\
+  CVT_VAR (bf16_f32, N_BF16, N_F32, whole_reg,   NULL, NULL, NULL)           \
+  /* VFP instructions.  */                                                   \
+  CVT_VAR (f32_f64, N_F32, N_F64, N_VFP,       NULL,     "fcvtsd", NULL)      \
+  CVT_VAR (f64_f32, N_F64, N_F32, N_VFP,       NULL,     "fcvtds", NULL)      \
+  CVT_VAR (s32_f64, N_S32, N_F64 | key, N_VFP, "ftosld", "ftosid", "ftosizd") \
+  CVT_VAR (u32_f64, N_U32, N_F64 | key, N_VFP, "ftould", "ftouid", "ftouizd") \
+  CVT_VAR (f64_s32, N_F64 | key, N_S32, N_VFP, "fsltod", "fsitod", NULL)      \
+  CVT_VAR (f64_u32, N_F64 | key, N_U32, N_VFP, "fultod", "fuitod", NULL)      \
+  /* VFP instructions with bitshift.  */                                     \
+  CVT_VAR (f32_s16, N_F32 | key, N_S16, N_VFP, "fshtos", NULL,     NULL)      \
+  CVT_VAR (f32_u16, N_F32 | key, N_U16, N_VFP, "fuhtos", NULL,     NULL)      \
+  CVT_VAR (f64_s16, N_F64 | key, N_S16, N_VFP, "fshtod", NULL,     NULL)      \
+  CVT_VAR (f64_u16, N_F64 | key, N_U16, N_VFP, "fuhtod", NULL,     NULL)      \
+  CVT_VAR (s16_f32, N_S16, N_F32 | key, N_VFP, "ftoshs", NULL,     NULL)      \
+  CVT_VAR (u16_f32, N_U16, N_F32 | key, N_VFP, "ftouhs", NULL,     NULL)      \
+  CVT_VAR (s16_f64, N_S16, N_F64 | key, N_VFP, "ftoshd", NULL,     NULL)      \
+  CVT_VAR (u16_f64, N_U16, N_F64 | key, N_VFP, "ftouhd", NULL,     NULL)
+
+/* Expand each CVT_FLAVOUR_VAR entry to one enumerator named after its first
+   column.  */
+#define CVT_VAR(C, X, Y, R, BSN, CN, ZN) \
+  neon_cvt_flavour_##C,
+
+/* The different types of conversions we can do.  */
+enum neon_cvt_flavour
+{
+  CVT_FLAVOUR_VAR
+  /* Follows the last table entry; get_neon_cvt_flavour returns it when no
+     entry matches.  */
+  neon_cvt_flavour_invalid,
+  /* Alias of f32_f64; flavours at or above it are handled as VFP rather
+     than Neon conversions by do_neon_cvt_1.  */
+  neon_cvt_flavour_first_fp = neon_cvt_flavour_f32_f64
+};
+
+#undef CVT_VAR
+
+static enum neon_cvt_flavour
+get_neon_cvt_flavour (enum neon_shape rs)
+{
+#define CVT_VAR(C,X,Y,R,BSN,CN,ZN)                     \
+  et = neon_check_type (2, rs, (R) | (X), (R) | (Y));  \
+  if (et.type != NT_invtype)                           \
+    {                                                  \
+      inst.error = NULL;                               \
+      return (neon_cvt_flavour_##C);                   \
+    }
+
+  struct neon_type_el et;
+  unsigned whole_reg = (rs == NS_FFI || rs == NS_FD || rs == NS_DF
+                       || rs == NS_FF) ? N_VFP : 0;
+  /* The instruction versions which take an immediate take one register
+     argument, which is extended to the width of the full register. Thus the
+     "source" and "destination" registers must have the same width.  Hack that
+     here by making the size equal to the key (wider, in this case) operand.  */
+  unsigned key = (rs == NS_QQI || rs == NS_DDI || rs == NS_FFI) ? N_KEY : 0;
+
+  CVT_FLAVOUR_VAR;
+
+  return neon_cvt_flavour_invalid;
+#undef CVT_VAR
+}
+
+/* Conversion mode handed to do_neon_cvt_1 and do_vfp_nsyn_cvt_fpv8.
+   NOTE(review): the one-letter suffixes presumably mirror the VCVT mnemonic
+   suffixes (a, n, p, m, r, ...) -- confirm against the callers.  */
+enum neon_cvt_mode
+{
+  neon_cvt_mode_a,
+  neon_cvt_mode_n,
+  neon_cvt_mode_p,
+  neon_cvt_mode_m,
+  neon_cvt_mode_z,
+  neon_cvt_mode_x,
+  neon_cvt_mode_r
+};
+
+/* Neon-syntax VFP conversions.  Look up the opcode name for FLAVOUR in the
+   CVT_FLAVOUR_VAR table (the bitshift column for the immediate shapes, the
+   plain column otherwise) and encode it.  The ARMv8.2 fp16 flavours are then
+   passed through do_scalar_fp16_v82_encode.  */
+
+static void
+do_vfp_nsyn_cvt (enum neon_shape rs, enum neon_cvt_flavour flavour)
+{
+  /* Conversions with immediate bitshift.  */
+  const char *shift_names[] =
+    {
+#define CVT_VAR(C,A,B,R,BSN,CN,ZN) BSN,
+      CVT_FLAVOUR_VAR
+      NULL
+#undef CVT_VAR
+    };
+  /* Conversions without bitshift.  */
+  const char *plain_names[] =
+    {
+#define CVT_VAR(C,A,B,R,BSN,CN,ZN) CN,
+      CVT_FLAVOUR_VAR
+      NULL
+#undef CVT_VAR
+    };
+  const char *mnemonic = NULL;
+  int with_shift = (rs == NS_DDI || rs == NS_QQI || rs == NS_FFI
+                   || rs == NS_FHI || rs == NS_HFI);
+
+  if (with_shift)
+    {
+      if (flavour < (int) ARRAY_SIZE (shift_names))
+       {
+         mnemonic = shift_names[flavour];
+         constraint (inst.operands[0].reg != inst.operands[1].reg,
+                     _("operands 0 and 1 must be the same register"));
+         /* Move the immediate into operand 1 and clear operand 2.  */
+         inst.operands[1] = inst.operands[2];
+         memset (&inst.operands[2], '\0', sizeof (inst.operands[2]));
+       }
+    }
+  else if (flavour < (int) ARRAY_SIZE (plain_names))
+    mnemonic = plain_names[flavour];
+
+  if (mnemonic)
+    do_vfp_nsyn_opcode (mnemonic);
+
+  /* ARMv8.2 fp16 VCVT instruction.  */
+  switch (flavour)
+    {
+    case neon_cvt_flavour_s32_f16:
+    case neon_cvt_flavour_u32_f16:
+    case neon_cvt_flavour_f16_u32:
+    case neon_cvt_flavour_f16_s32:
+      do_scalar_fp16_v82_encode ();
+      break;
+    default:
+      break;
+    }
+}
+
+/* Encode the conversion named in the ZN column of CVT_FLAVOUR_VAR for the
+   flavour matching the current operands (shapes NS_FH, NS_FF or NS_FD).
+   Nothing is encoded when the flavour has no such name.  */
+static void
+do_vfp_nsyn_cvtz (void)
+{
+  const char *names[] =
+    {
+#define CVT_VAR(C,A,B,R,BSN,CN,ZN) ZN,
+      CVT_FLAVOUR_VAR
+      NULL
+#undef CVT_VAR
+    };
+  enum neon_shape shape = neon_select_shape (NS_FH, NS_FF, NS_FD, NS_NULL);
+  enum neon_cvt_flavour flavour = get_neon_cvt_flavour (shape);
+
+  if (!(flavour < (int) ARRAY_SIZE (names)))
+    return;
+
+  if (names[flavour])
+    do_vfp_nsyn_opcode (names[flavour]);
+}
+
+static void
+do_vfp_nsyn_cvt_fpv8 (enum neon_cvt_flavour flavour,
+                     enum neon_cvt_mode mode)
+{
+  int sz, op;
+  int rm;
+
+  /* Targets like FPv5-SP-D16 don't support FP v8 instructions with
+     D register operands.  */
+  if (flavour == neon_cvt_flavour_s32_f64
+      || flavour == neon_cvt_flavour_u32_f64)
+    constraint (!ARM_CPU_HAS_FEATURE (cpu_variant, fpu_vfp_ext_armv8),
+               _(BAD_FPU));
+
+  if (flavour == neon_cvt_flavour_s32_f16
+      || flavour == neon_cvt_flavour_u32_f16)
+    constraint (!ARM_CPU_HAS_FEATURE (cpu_variant, arm_ext_fp16),
+               _(BAD_FP16));
+
+  set_pred_insn_type (OUTSIDE_PRED_INSN);
+
+  switch (flavour)
+    {
+    case neon_cvt_flavour_s32_f64:
+      sz = 1;
+      op = 1;
        break;
      case neon_cvt_flavour_s32_f32:
        sz = 0;
@@ -17019,6 +19086,16 @@ do_neon_cvt_1 (enum neon_cvt_mode mode)
        return;
      }
  
+  if ((rs == NS_FD || rs == NS_QQI) && mode == neon_cvt_mode_n
+      && ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext))
+    {
+      /* We are dealing with vcvt with the 'ne' condition.  */
+      inst.cond = 0x1;
+      inst.instruction = N_MNEM_vcvt;
+      do_neon_cvt_1 (neon_cvt_mode_z);
+      return;
+    }
+
    /* VFP rather than Neon conversions.  */
    if (flavour >= neon_cvt_flavour_first_fp)
      {
@@ -17032,15 +19109,57 @@ do_neon_cvt_1 (enum neon_cvt_mode mode)
  
    switch (rs)
      {
-    case NS_DDI:
      case NS_QQI:
+      if (mode == neon_cvt_mode_z
+         && (flavour == neon_cvt_flavour_f16_s16
+             || flavour == neon_cvt_flavour_f16_u16
+             || flavour == neon_cvt_flavour_s16_f16
+             || flavour == neon_cvt_flavour_u16_f16
+             || flavour == neon_cvt_flavour_f32_u32
+             || flavour == neon_cvt_flavour_f32_s32
+             || flavour == neon_cvt_flavour_s32_f32
+             || flavour == neon_cvt_flavour_u32_f32))
+       {
+         if (!check_simd_pred_availability (TRUE,
+                                            NEON_CHECK_CC | NEON_CHECK_ARCH))
+           return;
+       }
+      /* fall through.  */
+    case NS_DDI:
        {
         unsigned immbits;
         unsigned enctab[] = {0x0000100, 0x1000100, 0x0, 0x1000000,
                              0x0000100, 0x1000100, 0x0, 0x1000000};
  
-       if (vfp_or_neon_is_neon (NEON_CHECK_CC | NEON_CHECK_ARCH) == FAIL)
-         return;
+       if ((rs != NS_QQI || !ARM_CPU_HAS_FEATURE (cpu_variant, mve_fp_ext))
+           && vfp_or_neon_is_neon (NEON_CHECK_CC | NEON_CHECK_ARCH) == FAIL)
+           return;
+
+       if (ARM_CPU_HAS_FEATURE (cpu_variant, mve_fp_ext))
+         {
+           constraint (inst.operands[2].present && inst.operands[2].imm == 0,
+                       _("immediate value out of range"));
+           switch (flavour)
+             {
+               case neon_cvt_flavour_f16_s16:
+               case neon_cvt_flavour_f16_u16:
+               case neon_cvt_flavour_s16_f16:
+               case neon_cvt_flavour_u16_f16:
+                 constraint (inst.operands[2].imm > 16,
+                             _("immediate value out of range"));
+                 break;
+               case neon_cvt_flavour_f32_u32:
+               case neon_cvt_flavour_f32_s32:
+               case neon_cvt_flavour_s32_f32:
+               case neon_cvt_flavour_u32_f32:
+                 constraint (inst.operands[2].imm > 32,
+                             _("immediate value out of range"));
+                 break;
+               default:
+                 inst.error = BAD_FPU;
+                 return;
+             }
+         }
  
         /* Fixed-point conversion with #0 immediate is encoded as an
            integer conversion.  */
@@ -17073,14 +19192,40 @@ do_neon_cvt_1 (enum neon_cvt_mode mode)
        }
        break;
  
-    case NS_DD:
      case NS_QQ:
+      if ((mode == neon_cvt_mode_a || mode == neon_cvt_mode_n
+          || mode == neon_cvt_mode_m || mode == neon_cvt_mode_p)
+         && (flavour == neon_cvt_flavour_s16_f16
+             || flavour == neon_cvt_flavour_u16_f16
+             || flavour == neon_cvt_flavour_s32_f32
+             || flavour == neon_cvt_flavour_u32_f32))
+       {
+         if (!check_simd_pred_availability (TRUE,
+                                            NEON_CHECK_CC | NEON_CHECK_ARCH8))
+           return;
+       }
+      else if (mode == neon_cvt_mode_z
+              && (flavour == neon_cvt_flavour_f16_s16
+                  || flavour == neon_cvt_flavour_f16_u16
+                  || flavour == neon_cvt_flavour_s16_f16
+                  || flavour == neon_cvt_flavour_u16_f16
+                  || flavour == neon_cvt_flavour_f32_u32
+                  || flavour == neon_cvt_flavour_f32_s32
+                  || flavour == neon_cvt_flavour_s32_f32
+                  || flavour == neon_cvt_flavour_u32_f32))
+       {
+         if (!check_simd_pred_availability (TRUE,
+                                            NEON_CHECK_CC | NEON_CHECK_ARCH))
+           return;
+       }
+      /* fall through.  */
+    case NS_DD:
        if (mode != neon_cvt_mode_x && mode != neon_cvt_mode_z)
         {
-         NEON_ENCODE (FLOAT, inst);
-         set_pred_insn_type (OUTSIDE_PRED_INSN);
  
-         if (vfp_or_neon_is_neon (NEON_CHECK_CC | NEON_CHECK_ARCH8) == FAIL)
+         NEON_ENCODE (FLOAT, inst);
+         if (!check_simd_pred_availability (TRUE,
+                                            NEON_CHECK_CC | NEON_CHECK_ARCH8))
             return;
  
           inst.instruction |= LOW4 (inst.operands[0].reg) << 12;
@@ -17110,8 +19255,11 @@ do_neon_cvt_1 (enum neon_cvt_mode mode)
  
             NEON_ENCODE (INTEGER, inst);
  
-           if (vfp_or_neon_is_neon (NEON_CHECK_CC | NEON_CHECK_ARCH) == FAIL)
-             return;
+         if (!ARM_CPU_HAS_FEATURE (cpu_variant, mve_fp_ext))
+           {
+             if (vfp_or_neon_is_neon (NEON_CHECK_CC | NEON_CHECK_ARCH) == FAIL)
+               return;
+           }
  
             if (flavour != neon_cvt_flavour_invalid)
               inst.instruction |= enctab[flavour];
@@ -17154,8 +19302,21 @@ do_neon_cvt_1 (enum neon_cvt_mode mode)
           }
  
        if (rs == NS_DQ)
-       inst.instruction = 0x3b60600;
+       {
+         if (flavour == neon_cvt_flavour_bf16_f32)
+           {
+             if (vfp_or_neon_is_neon (NEON_CHECK_ARCH8) == FAIL)
+               return;
+             constraint (!mark_feature_used (&arm_ext_bf16), _(BAD_BF16));
+             /* VCVT.bf16.f32.  */
+             inst.instruction = 0x11b60640;
+           }
+         else
+           /* VCVT.f16.f32.  */
+           inst.instruction = 0x3b60600;
+       }
        else
+       /* VCVT.f32.f16.  */
         inst.instruction = 0x3b60700;
  
        inst.instruction |= LOW4 (inst.operands[0].reg) << 12;
@@ -17230,10 +19391,51 @@ static void
  do_neon_cvttb_1 (bfd_boolean t)
  {
    enum neon_shape rs = neon_select_shape (NS_HF, NS_HD, NS_FH, NS_FF, NS_FD,
-                                         NS_DF, NS_DH, NS_NULL);
+                                         NS_DF, NS_DH, NS_QQ, NS_QQI, NS_NULL);
  
    if (rs == NS_NULL)
      return;
+  else if (rs == NS_QQ || rs == NS_QQI)
+    {
+      int single_to_half = 0;
+      if (!check_simd_pred_availability (TRUE, NEON_CHECK_ARCH))
+       return;
+
+      enum neon_cvt_flavour flavour = get_neon_cvt_flavour (rs);
+
+      if (ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext)
+         && (flavour ==  neon_cvt_flavour_u16_f16
+             || flavour ==  neon_cvt_flavour_s16_f16
+             || flavour ==  neon_cvt_flavour_f16_s16
+             || flavour ==  neon_cvt_flavour_f16_u16
+             || flavour ==  neon_cvt_flavour_u32_f32
+             || flavour ==  neon_cvt_flavour_s32_f32
+             || flavour ==  neon_cvt_flavour_f32_s32
+             || flavour ==  neon_cvt_flavour_f32_u32))
+       {
+         inst.cond = 0xf;
+         inst.instruction = N_MNEM_vcvt;
+         set_pred_insn_type (INSIDE_VPT_INSN);
+         do_neon_cvt_1 (neon_cvt_mode_z);
+         return;
+       }
+      else if (rs == NS_QQ && flavour == neon_cvt_flavour_f32_f16)
+       single_to_half = 1;
+      else if (rs == NS_QQ && flavour != neon_cvt_flavour_f16_f32)
+       {
+         first_error (BAD_FPU);
+         return;
+       }
+
+      inst.instruction = 0xee3f0e01;
+      inst.instruction |= single_to_half << 28;
+      inst.instruction |= HI1 (inst.operands[0].reg) << 22;
+      inst.instruction |= LOW4 (inst.operands[0].reg) << 13;
+      inst.instruction |= t << 12;
+      inst.instruction |= HI1 (inst.operands[1].reg) << 5;
+      inst.instruction |= LOW4 (inst.operands[1].reg) << 1;
+      inst.is_neon = 1;
+    }
    else if (neon_check_type (2, rs, N_F16, N_F32 | N_VFP).type != NT_invtype)
      {
        inst.error = NULL;
@@ -17264,6 +19466,14 @@ do_neon_cvttb_1 (bfd_boolean t)
        inst.error = NULL;
        do_neon_cvttb_2 (t, /*to=*/FALSE, /*is_double=*/TRUE);
      }
+  else if (neon_check_type (2, rs, N_BF16 | N_VFP, N_F32).type != NT_invtype)
+    {
+      constraint (!mark_feature_used (&arm_ext_bf16), _(BAD_BF16));
+      inst.error = NULL;
+      inst.instruction |= (1 << 8);
+      inst.instruction &= ~(1 << 9);
+      do_neon_cvttb_2 (t, /*to=*/TRUE, /*is_double=*/FALSE);
+    }
    else
      return;
  }
@@ -17336,9 +19546,16 @@ neon_move_immediate (void)
  static void
  do_neon_mvn (void)
  {
+  if (!check_simd_pred_availability (FALSE, NEON_CHECK_CC | NEON_CHECK_ARCH))
+    return;
+
    if (inst.operands[1].isreg)
      {
-      enum neon_shape rs = neon_select_shape (NS_DD, NS_QQ, NS_NULL);
+      enum neon_shape rs;
+      if (ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext))
+       rs = neon_select_shape (NS_QQ, NS_NULL);
+      else
+       rs = neon_select_shape (NS_DD, NS_QQ, NS_NULL);
  
        NEON_ENCODE (INTEGER, inst);
        inst.instruction |= LOW4 (inst.operands[0].reg) << 12;
@@ -17354,6 +19571,11 @@ do_neon_mvn (void)
      }
  
    neon_dp_fixup (&inst);
+
+  if (ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext))
+    {
+      constraint (!inst.operands[1].isreg && !inst.operands[0].isquad, BAD_FPU);
+    }
  }
  
  /* Encode instructions of form:
@@ -17379,7 +19601,7 @@ neon_mixed_length (struct neon_type_el et, unsigned size)
  static void
  do_neon_dyadic_long (void)
  {
-  enum neon_shape rs = neon_select_shape (NS_QDD, NS_QQQ, NS_QQR, NS_NULL);
+  enum neon_shape rs = neon_select_shape (NS_QDD, NS_HHH, NS_FFF, NS_DDD, NS_NULL);
    if (rs == NS_QDD)
      {
        if (vfp_or_neon_is_neon (NEON_CHECK_ARCH | NEON_CHECK_CC) == FAIL)
@@ -17485,7 +19707,7 @@ neon_scalar_for_fmac_fp16_long (unsigned scalar, unsigned quad_p)
               | ((elno & 0x1) << 3));
      }
  
-bad_scalar:
+ bad_scalar:
    first_error (_("scalar out of range for multiply instruction"));
    return 0;
  }
@@ -17501,16 +19723,6 @@ do_neon_fmac_maybe_scalar_long (int subtype)
       0x2.  */
    int size = -1;
  
-  if (inst.cond != COND_ALWAYS)
-    as_warn (_("vfmal/vfmsl with FP16 type cannot be conditional, the "
-              "behaviour is UNPREDICTABLE"));
-
-  constraint (!ARM_CPU_HAS_FEATURE (cpu_variant, arm_ext_fp16_fml),
-             _(BAD_FP16));
-
-  constraint (!ARM_CPU_HAS_FEATURE (cpu_variant, fpu_neon_ext_armv8),
-             _(BAD_FPU));
-
    /* vfmal/vfmsl are in three-same D/Q register format or the third operand can
       be a scalar index register.  */
    if (inst.operands[2].isscalar)
@@ -17529,7 +19741,16 @@ do_neon_fmac_maybe_scalar_long (int subtype)
        rs = neon_select_shape (NS_DHH, NS_QDD, NS_NULL);
      }
  
-  neon_check_type (3, rs, N_EQK, N_EQK, N_KEY | N_F16);
+
+  if (inst.cond != COND_ALWAYS)
+    as_warn (_("vfmal/vfmsl with FP16 type cannot be conditional, the "
+              "behaviour is UNPREDICTABLE"));
+
+  constraint (!ARM_CPU_HAS_FEATURE (cpu_variant, arm_ext_fp16_fml),
+             _(BAD_FP16));
+
+  constraint (!ARM_CPU_HAS_FEATURE (cpu_variant, fpu_neon_ext_armv8),
+             _(BAD_FPU));
  
    /* "opcode" from template has included "ubit", so simply pass 0 here.  Also,
       the "S" bit in size field has been reused to differentiate vfmal and vfmsl,
@@ -17541,8 +19762,6 @@ do_neon_fmac_maybe_scalar_long (int subtype)
    inst.instruction &= 0x00ffffff;
    inst.instruction |= high8;
  
-#define LOW1(R) ((R) & 0x1)
-#define HI4(R) (((R) >> 1) & 0xf)
    /* Unlike usually NEON three-same, encoding for Vn and Vm will depend on
       whether the instruction is in Q form and whether Vm is a scalar indexed
       operand.  */
@@ -17666,14 +19885,29 @@ do_neon_ext (void)
  static void
  do_neon_rev (void)
  {
-  enum neon_shape rs = neon_select_shape (NS_DD, NS_QQ, NS_NULL);
+  if (!check_simd_pred_availability (FALSE, NEON_CHECK_ARCH | NEON_CHECK_CC))
+   return;
+
+  enum neon_shape rs;
+  if (ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext))
+    rs = neon_select_shape (NS_QQ, NS_NULL);
+  else
+    rs = neon_select_shape (NS_DD, NS_QQ, NS_NULL);
+
    struct neon_type_el et = neon_check_type (2, rs,
      N_EQK, N_8 | N_16 | N_32 | N_KEY);
+
    unsigned op = (inst.instruction >> 7) & 3;
    /* N (width of reversed regions) is encoded as part of the bitmask. We
       extract it here to check the elements to be reversed are smaller.
       Otherwise we'd get a reserved instruction.  */
    unsigned elsize = (op == 2) ? 16 : (op == 1) ? 32 : (op == 0) ? 64 : 0;
+
+  if (ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext) && elsize == 64
+      && inst.operands[0].reg == inst.operands[1].reg)
+    as_tsktsk (_("Warning: 64-bit element size and same destination and source"
+                " operands makes instruction UNPREDICTABLE"));
+
    gas_assert (elsize != 0);
    constraint (et.size >= elsize,
               _("elements must be smaller than reversal region"));
@@ -17685,6 +19919,8 @@ do_neon_dup (void)
  {
    if (inst.operands[1].isscalar)
      {
+      constraint (!ARM_CPU_HAS_FEATURE (cpu_variant, fpu_neon_ext_v1),
+                 BAD_FPU);
        enum neon_shape rs = neon_select_shape (NS_DS, NS_QS, NS_NULL);
        struct neon_type_el et = neon_check_type (2, rs,
         N_EQK, N_8 | N_16 | N_32 | N_KEY);
@@ -17712,6 +19948,23 @@ do_neon_dup (void)
        enum neon_shape rs = neon_select_shape (NS_DR, NS_QR, NS_NULL);
        struct neon_type_el et = neon_check_type (2, rs,
         N_8 | N_16 | N_32 | N_KEY, N_EQK);
+      if (rs == NS_QR)
+       {
+         if (!check_simd_pred_availability (FALSE, NEON_CHECK_ARCH))
+           return;
+       }
+      else
+       constraint (!ARM_CPU_HAS_FEATURE (cpu_variant, fpu_neon_ext_v1),
+                   BAD_FPU);
+
+      if (ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext))
+       {
+         if (inst.operands[1].reg == REG_SP)
+           as_tsktsk (MVE_BAD_SP);
+         else if (inst.operands[1].reg == REG_PC)
+           as_tsktsk (MVE_BAD_PC);
+       }
+
        /* Duplicate ARM register to lanes of vector.  */
        NEON_ENCODE (ARMREG, inst);
        switch (et.size)
@@ -17731,6 +19984,67 @@ do_neon_dup (void)
      }
  }
  
+/* Encode the four-operand MVE move that do_neon_mov dispatches to for the
+   NS_RRSS (TOQ zero) and NS_SSRR (TOQ non-zero) shapes.  With TOQ non-zero
+   operands 0-1 are the two indexed vector operands and operands 2-3 the two
+   general purpose registers; with TOQ zero the roles are swapped.  Nothing is
+   encoded when the selected CPU lacks mve_ext.  */
+
+static void
+do_mve_mov (int toQ)
+{
+  unsigned gpr_first, gpr_second, vec_first, vec_second;
+  unsigned rt, rt2, vec;
+
+  if (!ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext))
+    return;
+
+  if (inst.cond > COND_ALWAYS)
+    inst.pred_insn_type = MVE_UNPREDICABLE_INSN;
+
+  /* Work out which operand slots hold the general purpose registers and
+     which hold the vector operands.  */
+  gpr_first = toQ ? 2 : 0;
+  gpr_second = gpr_first + 1;
+  vec_first = toQ ? 0 : 2;
+  vec_second = vec_first + 1;
+
+  rt = inst.operands[gpr_first].reg;
+  rt2 = inst.operands[gpr_second].reg;
+  /* Only the second vector operand is encoded; the first is merely required
+     to be exactly two above it.  */
+  vec = inst.operands[vec_second].reg;
+
+  constraint (inst.operands[vec_first].reg != vec + 2,
+              _("Index one must be [2,3] and index two must be two less than"
+                " index one."));
+  constraint (rt == rt2,
+              _("General purpose registers may not be the same"));
+  constraint (rt == REG_SP || rt2 == REG_SP, BAD_SP);
+  constraint (rt == REG_PC || rt2 == REG_PC, BAD_PC);
+
+  /* The whole encoding is rebuilt from scratch rather than OR-ed into the
+     template value.  */
+  inst.instruction = (0xec000f00
+                      | (HI1 (vec / 32) << 23)
+                      | (!!toQ << 20)
+                      | (rt2 << 16)
+                      | (LOW4 (vec / 32) << 13)
+                      | ((vec % 4) << 4)
+                      | rt);
+}
+
+/* Fill in the operand fields of an MVE instruction that takes two Q registers
+   and a 16- or 32-bit integer element type (N_I16 | N_I32 on the second
+   operand).  Nothing is encoded when the selected CPU lacks mve_ext.  */
+
+static void
+do_mve_movn (void)
+{
+  struct neon_type_el et;
+  unsigned qd, qm;
+
+  if (!ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext))
+    return;
+
+  /* A condition value above COND_ALWAYS marks the instruction as sitting
+     inside a VPT block; anything else is treated as outside predication.  */
+  inst.pred_insn_type = (inst.cond > COND_ALWAYS
+                         ? INSIDE_VPT_INSN
+                         : MVE_OUTSIDE_PRED_INSN);
+
+  et = neon_check_type (2, NS_QQ, N_EQK, N_I16 | N_I32 | N_KEY);
+
+  qd = inst.operands[0].reg;
+  qm = inst.operands[1].reg;
+
+  /* Source register: low four bits at bit 0, fifth bit at bit 5.  */
+  inst.instruction |= LOW4 (qm);
+  inst.instruction |= HI1 (qm) << 5;
+  /* Destination register: low four bits at bit 12, fifth bit at bit 22.  */
+  inst.instruction |= LOW4 (qd) << 12;
+  inst.instruction |= HI1 (qd) << 22;
+  /* Element size field, derived from the checked type's size.  */
+  inst.instruction |= (neon_logbits (et.size) - 1) << 18;
+  inst.is_neon = 1;
+}
+
  /* VMOV has particularly many variations. It can be one of:
       0. VMOV<c><q> <Qd>, <Qm>
       1. VMOV<c><q> <Dd>, <Dm>
@@ -17760,6 +20074,10 @@ do_neon_dup (void)
     (Two ARM regs to two VFP singles.)
      15. VMOV <Sd>, <Se>, <Rn>, <Rm>
     (Two VFP singles to two ARM regs.)
+   16. VMOV<c> <Rt>, <Rt2>, <Qd[idx]>, <Qd[idx2]>
+   17. VMOV<c> <Qd[idx]>, <Qd[idx2]>, <Rt>, <Rt2>
+   18. VMOV<c>.<dt> <Rt>, <Qn[idx]>
+   19. VMOV<c>.<dt> <Qd[idx]>, <Rt>
  
     These cases can be disambiguated using neon_select_shape, except cases 1/9
     and 3/11 which depend on the operand type too.
@@ -17775,10 +20093,11 @@ do_neon_dup (void)
  static void
  do_neon_mov (void)
  {
-  enum neon_shape rs = neon_select_shape (NS_RRFF, NS_FFRR, NS_DRR, NS_RRD,
-                                         NS_QQ, NS_DD, NS_QI, NS_DI, NS_SR,
-                                         NS_RS, NS_FF, NS_FI, NS_RF, NS_FR,
-                                         NS_HR, NS_RH, NS_HI, NS_NULL);
+  enum neon_shape rs = neon_select_shape (NS_RRSS, NS_SSRR, NS_RRFF, NS_FFRR,
+                                         NS_DRR, NS_RRD, NS_QQ, NS_DD, NS_QI,
+                                         NS_DI, NS_SR, NS_RS, NS_FF, NS_FI,
+                                         NS_RF, NS_FR, NS_HR, NS_RH, NS_HI,
+                                         NS_NULL);
    struct neon_type_el et;
    const char *ldconst = 0;
  
@@ -17788,7 +20107,13 @@ do_neon_mov (void)
        et = neon_check_type (2, rs, N_EQK, N_F64 | N_KEY);
        /* It is not an error here if no type is given.  */
        inst.error = NULL;
-      if (et.type == NT_float && et.size == 64)
+
+      /* In MVE we interpret the following instructions as same, so ignoring
+        the following type (float) and size (64) checks.
+        a: VMOV<c><q> <Dd>, <Dm>
+        b: VMOV<c><q>.F64 <Dd>, <Dm>.  */
+      if ((et.type == NT_float && et.size == 64)
+         || (ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext)))
         {
           do_vfp_nsyn_opcode ("fcpyd");
           break;
@@ -17797,7 +20122,8 @@ do_neon_mov (void)
  
      case NS_QQ:  /* case 0/1.  */
        {
-       if (vfp_or_neon_is_neon (NEON_CHECK_CC | NEON_CHECK_ARCH) == FAIL)
+       if (!check_simd_pred_availability (FALSE,
+                                          NEON_CHECK_CC | NEON_CHECK_ARCH))
           return;
         /* The architecture manual I have doesn't explicitly state which
            value the U bit should have for register->register moves, but
@@ -17827,7 +20153,8 @@ do_neon_mov (void)
        /* fall through.  */
  
      case NS_QI:  /* case 2/3.  */
-      if (vfp_or_neon_is_neon (NEON_CHECK_CC | NEON_CHECK_ARCH) == FAIL)
+      if (!check_simd_pred_availability (FALSE,
+                                        NEON_CHECK_CC | NEON_CHECK_ARCH))
         return;
        inst.instruction = 0x0800010;
        neon_move_immediate ();
@@ -17854,12 +20181,31 @@ do_neon_mov (void)
         et = neon_check_type (2, NS_NULL, N_8 | N_16 | N_32 | N_KEY, N_EQK);
         logsize = neon_logbits (et.size);
  
-       constraint (!ARM_CPU_HAS_FEATURE (cpu_variant, fpu_vfp_ext_v1),
-                   _(BAD_FPU));
-       constraint (!ARM_CPU_HAS_FEATURE (cpu_variant, fpu_neon_ext_v1)
-                   && et.size != 32, _(BAD_FPU));
+       if (et.size != 32)
+         {
+           if (!ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext)
+               && vfp_or_neon_is_neon (NEON_CHECK_ARCH) == FAIL)
+             return;
+         }
+       else
+         {
+           constraint (!ARM_CPU_HAS_FEATURE (cpu_variant, fpu_vfp_ext_v1)
+                       && !ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext),
+                       _(BAD_FPU));
+         }
+
+       if (ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext))
+         {
+           if (inst.operands[1].reg == REG_SP)
+             as_tsktsk (MVE_BAD_SP);
+           else if (inst.operands[1].reg == REG_PC)
+             as_tsktsk (MVE_BAD_PC);
+         }
+       unsigned size = inst.operands[0].isscalar == 1 ? 64 : 128;
+
         constraint (et.type == NT_invtype, _("bad type for scalar"));
-       constraint (x >= 64 / et.size, _("scalar index out of range"));
+       constraint (x >= size / et.size, _("scalar index out of range"));
+
  
         switch (et.size)
           {
@@ -17869,7 +20215,7 @@ do_neon_mov (void)
           default: ;
           }
  
-       bcdebits |= x << logsize;
+       bcdebits |= (x & ((1 << (3-logsize)) - 1)) << logsize;
  
         inst.instruction = 0xe000b10;
         do_vfp_cond_or_thumb ();
@@ -17877,12 +20223,14 @@ do_neon_mov (void)
         inst.instruction |= HI1 (dn) << 7;
         inst.instruction |= inst.operands[1].reg << 12;
         inst.instruction |= (bcdebits & 3) << 5;
-       inst.instruction |= (bcdebits >> 2) << 21;
+       inst.instruction |= ((bcdebits >> 2) & 3) << 21;
+       inst.instruction |= (x >> (3-logsize)) << 16;
        }
        break;
  
      case NS_DRR:  /* case 5 (fmdrr).  */
-      constraint (!ARM_CPU_HAS_FEATURE (cpu_variant, fpu_vfp_ext_v2),
+      constraint (!ARM_CPU_HAS_FEATURE (cpu_variant, fpu_vfp_ext_v2)
+                 && !ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext),
                   _(BAD_FPU));
  
        inst.instruction = 0xc400b10;
@@ -17914,12 +20262,32 @@ do_neon_mov (void)
                               N_EQK, N_S8 | N_S16 | N_U8 | N_U16 | N_32 | N_KEY);
         logsize = neon_logbits (et.size);
  
-       constraint (!ARM_CPU_HAS_FEATURE (cpu_variant, fpu_vfp_ext_v1),
-                   _(BAD_FPU));
-       constraint (!ARM_CPU_HAS_FEATURE (cpu_variant, fpu_neon_ext_v1)
-                   && et.size != 32, _(BAD_FPU));
+       if (et.size != 32)
+         {
+           if (!ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext)
+               && vfp_or_neon_is_neon (NEON_CHECK_CC
+                                       | NEON_CHECK_ARCH) == FAIL)
+             return;
+         }
+       else
+         {
+           constraint (!ARM_CPU_HAS_FEATURE (cpu_variant, fpu_vfp_ext_v1)
+                       && !ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext),
+                       _(BAD_FPU));
+         }
+
+       if (ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext))
+         {
+           if (inst.operands[0].reg == REG_SP)
+             as_tsktsk (MVE_BAD_SP);
+           else if (inst.operands[0].reg == REG_PC)
+             as_tsktsk (MVE_BAD_PC);
+         }
+
+       unsigned size = inst.operands[1].isscalar == 1 ? 64 : 128;
+
         constraint (et.type == NT_invtype, _("bad type for scalar"));
-       constraint (x >= 64 / et.size, _("scalar index out of range"));
+       constraint (x >= size / et.size, _("scalar index out of range"));
  
         switch (et.size)
           {
@@ -17929,7 +20297,7 @@ do_neon_mov (void)
           default: ;
           }
  
-       abcdebits |= x << logsize;
+       abcdebits |= (x & ((1 << (3-logsize)) - 1)) << logsize;
         inst.instruction = 0xe100b10;
         do_vfp_cond_or_thumb ();
         inst.instruction |= LOW4 (dn) << 16;
@@ -17937,11 +20305,13 @@ do_neon_mov (void)
         inst.instruction |= inst.operands[0].reg << 12;
         inst.instruction |= (abcdebits & 3) << 5;
         inst.instruction |= (abcdebits >> 2) << 21;
+       inst.instruction |= (x >> (3-logsize)) << 16;
        }
        break;
  
      case NS_RRD:  /* case 7 (fmrrd).  */
-      constraint (!ARM_CPU_HAS_FEATURE (cpu_variant, fpu_vfp_ext_v2),
+      constraint (!ARM_CPU_HAS_FEATURE (cpu_variant, fpu_vfp_ext_v2)
+                 && !ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext),
                   _(BAD_FPU));
  
        inst.instruction = 0xc500b10;
@@ -18008,11 +20378,21 @@ do_neon_mov (void)
         do_scalar_fp16_v82_encode ();
        break;
  
+    case NS_RRSS:
+      do_mve_mov (0);
+      break;
+    case NS_SSRR:
+      do_mve_mov (1);
+      break;
+
      /* The encoders for the fmrrs and fmsrr instructions expect three operands
         (one of which is a list), but we have parsed four.  Do some fiddling to
         make the operands what do_vfp_reg2_from_sp2 and do_vfp_sp2_from_reg2
         expect.  */
      case NS_RRFF:  /* case 14 (fmrrs).  */
+      constraint (!ARM_CPU_HAS_FEATURE (cpu_variant, fpu_vfp_ext_v2)
+                 && !ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext),
+                 _(BAD_FPU));
        constraint (inst.operands[3].reg != inst.operands[2].reg + 1,
                   _("VFP registers must be adjacent"));
        inst.operands[2].imm = 2;
@@ -18021,6 +20401,9 @@ do_neon_mov (void)
        break;
  
      case NS_FFRR:  /* case 15 (fmsrr).  */
+      constraint (!ARM_CPU_HAS_FEATURE (cpu_variant, fpu_vfp_ext_v2)
+                 && !ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext),
+                 _(BAD_FPU));
        constraint (inst.operands[1].reg != inst.operands[0].reg + 1,
                   _("VFP registers must be adjacent"));
        inst.operands[1] = inst.operands[2];
@@ -18040,11 +20423,58 @@ do_neon_mov (void)
      }
  }
  
+/* Encode an MVE instruction taking exactly two Q registers whose element
+   type is one of S8, U8, S16 or U16.  Any other operand combination is
+   passed on to do_neon_mov instead.  */
+
+static void
+do_mve_movl (void)
+{
+  /* Anything other than "exactly two operands, both quad registers" is not
+     handled here.  */
+  if (!(inst.operands[0].present && inst.operands[0].isquad
+      && inst.operands[1].present && inst.operands[1].isquad
+      && !inst.operands[2].present))
+    {
+      /* Discard the template encoding and force condition 0xb before handing
+        over to the generic VMOV encoder.
+        NOTE(review): presumably this re-reads the mnemonic as VMOV with an
+        "lt" condition suffix -- confirm against the opcode table.  */
+      inst.instruction = 0;
+      inst.cond = 0xb;
+      if (thumb_mode)
+       set_pred_insn_type (INSIDE_IT_INSN);
+      do_neon_mov ();
+      return;
+    }
+
+  /* The two-Q-register form is only encoded when MVE is available.  */
+  if (!ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext))
+    return;
+
+  if (inst.cond != COND_ALWAYS)
+    inst.pred_insn_type = INSIDE_VPT_INSN;
+
+  struct neon_type_el et = neon_check_type (2, NS_QQ, N_EQK, N_S8 | N_U8
+                                           | N_S16 | N_U16 | N_KEY);
+
+  /* Bit 28 is set for an unsigned element type.  */
+  inst.instruction |= (et.type == NT_unsigned) << 28;
+  inst.instruction |= HI1 (inst.operands[0].reg) << 22;
+  /* Element size field, derived from the checked type's size.  */
+  inst.instruction |= (neon_logbits (et.size) + 1) << 19;
+  inst.instruction |= LOW4 (inst.operands[0].reg) << 12;
+  inst.instruction |= HI1 (inst.operands[1].reg) << 5;
+  inst.instruction |= LOW4 (inst.operands[1].reg);
+  inst.is_neon = 1;
+}
+
  static void
  do_neon_rshift_round_imm (void)
  {
-  enum neon_shape rs = neon_select_shape (NS_DDI, NS_QQI, NS_NULL);
-  struct neon_type_el et = neon_check_type (2, rs, N_EQK, N_SU_ALL | N_KEY);
+  if (!check_simd_pred_availability (FALSE, NEON_CHECK_ARCH | NEON_CHECK_CC))
+   return;
+
+  enum neon_shape rs;
+  struct neon_type_el et;
+
+  if (ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext))
+    {
+      rs = neon_select_shape (NS_QQI, NS_NULL);
+      et = neon_check_type (2, rs, N_EQK, N_SU_MVE | N_KEY);
+    }
+  else
+    {
+      rs = neon_select_shape (NS_DDI, NS_QQI, NS_NULL);
+      et = neon_check_type (2, rs, N_EQK, N_SU_ALL | N_KEY);
+    }
    int imm = inst.operands[2].imm;
  
    /* imm == 0 case is encoded as VMOV for V{R}SHR.  */
@@ -18126,7 +20556,14 @@ do_neon_zip_uzp (void)
  static void
  do_neon_sat_abs_neg (void)
  {
-  enum neon_shape rs = neon_select_shape (NS_DD, NS_QQ, NS_NULL);
+  if (!check_simd_pred_availability (FALSE, NEON_CHECK_CC | NEON_CHECK_ARCH))
+    return;
+
+  enum neon_shape rs;
+  if (ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext))
+    rs = neon_select_shape (NS_QQ, NS_NULL);
+  else
+    rs = neon_select_shape (NS_DD, NS_QQ, NS_NULL);
    struct neon_type_el et = neon_check_type (2, rs,
      N_EQK, N_S8 | N_S16 | N_S32 | N_KEY);
    neon_two_same (neon_quad (rs), 1, et.size);
@@ -18155,7 +20592,15 @@ do_neon_recip_est (void)
  static void
  do_neon_cls (void)
  {
-  enum neon_shape rs = neon_select_shape (NS_DD, NS_QQ, NS_NULL);
+  if (!check_simd_pred_availability (FALSE, NEON_CHECK_ARCH | NEON_CHECK_CC))
+    return;
+
+  enum neon_shape rs;
+  if (ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext))
+   rs = neon_select_shape (NS_QQ, NS_NULL);
+  else
+   rs = neon_select_shape (NS_DD, NS_QQ, NS_NULL);
+
    struct neon_type_el et = neon_check_type (2, rs,
      N_EQK, N_S8 | N_S16 | N_S32 | N_KEY);
    neon_two_same (neon_quad (rs), 1, et.size);
@@ -18164,7 +20609,15 @@ do_neon_cls (void)
  static void
  do_neon_clz (void)
  {
-  enum neon_shape rs = neon_select_shape (NS_DD, NS_QQ, NS_NULL);
+  if (!check_simd_pred_availability (FALSE, NEON_CHECK_ARCH | NEON_CHECK_CC))
+    return;
+
+  enum neon_shape rs;
+  if (ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext))
+   rs = neon_select_shape (NS_QQ, NS_NULL);
+  else
+   rs = neon_select_shape (NS_DD, NS_QQ, NS_NULL);
+
    struct neon_type_el et = neon_check_type (2, rs,
      N_EQK, N_I8 | N_I16 | N_I32 | N_KEY);
    neon_two_same (neon_quad (rs), 1, et.size);
@@ -18213,6 +20666,9 @@ do_neon_tbl_tbx (void)
  static void
  do_neon_ldm_stm (void)
  {
+  constraint (!ARM_CPU_HAS_FEATURE (cpu_variant, fpu_vfp_ext_v1xd)
+             && !ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext),
+             _(BAD_FPU));
    /* P, U and L bits are part of bitmask.  */
    int is_dbmode = (inst.instruction & (1 << 24)) != 0;
    unsigned offsetbits = inst.operands[1].imm * 2;
@@ -18235,11 +20691,54 @@ do_neon_ldm_stm (void)
    inst.instruction |= LOW4 (inst.operands[1].reg) << 12;
    inst.instruction |= HI1 (inst.operands[1].reg) << 22;
  
-  inst.instruction |= offsetbits;
+  inst.instruction |= offsetbits;
+
+  do_vfp_cond_or_thumb ();
+}
+
+/* Handle the pop form of the VFP load-multiple: call nsyn_insert_sp and then
+   re-dispatch to a named opcode.  With mve_ext the opcode is "vldm" and no
+   further checks are made here.  Otherwise fpu_vfp_ext_v1xd is required, the
+   register list count in operand 1 must be between 1 and 16, and the opcode
+   is "fldmias" for a single-precision list or "fldmiad" otherwise.  */
+
+static void
+do_vfp_nsyn_pop (void)
+{
+  nsyn_insert_sp ();
+
+  if (ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext))
+    {
+      /* Call and return as separate statements: ISO C does not allow a
+         return statement with an expression in a function returning
+         void.  */
+      do_vfp_nsyn_opcode ("vldm");
+      return;
+    }
+
+  constraint (!ARM_CPU_HAS_FEATURE (cpu_variant, fpu_vfp_ext_v1xd),
+              _(BAD_FPU));
+
+  constraint (inst.operands[1].imm < 1 || inst.operands[1].imm > 16,
+              _("register list must contain at least 1 and at most 16 "
+                "registers"));
+
+  if (inst.operands[1].issingle)
+    do_vfp_nsyn_opcode ("fldmias");
+  else
+    do_vfp_nsyn_opcode ("fldmiad");
+}
+
+/* Handle the push form of the VFP store-multiple: call nsyn_insert_sp and
+   then re-dispatch to a named opcode.  With mve_ext the opcode is "vstmdb"
+   and no further checks are made here.  Otherwise fpu_vfp_ext_v1xd is
+   required, the register list count in operand 1 must be between 1 and 16,
+   and the opcode is "fstmdbs" for a single-precision list or "fstmdbd"
+   otherwise.  */
+
+static void
+do_vfp_nsyn_push (void)
+{
+  nsyn_insert_sp ();
+
+  if (ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext))
+    {
+      /* Call and return as separate statements: ISO C does not allow a
+         return statement with an expression in a function returning
+         void.  */
+      do_vfp_nsyn_opcode ("vstmdb");
+      return;
+    }
+
+  constraint (!ARM_CPU_HAS_FEATURE (cpu_variant, fpu_vfp_ext_v1xd),
+              _(BAD_FPU));
+
+  constraint (inst.operands[1].imm < 1 || inst.operands[1].imm > 16,
+              _("register list must contain at least 1 and at most 16 "
+                "registers"));
  
-  do_vfp_cond_or_thumb ();
+  if (inst.operands[1].issingle)
+    do_vfp_nsyn_opcode ("fstmdbs");
+  else
+    do_vfp_nsyn_opcode ("fstmdbd");
  }
  
+
  static void
  do_neon_ldr_str (void)
  {
@@ -18320,7 +20819,8 @@ do_vldr_vstr (void)
    /* VLDR/VSTR.  */
    else
      {
-      if (!mark_feature_used (&fpu_vfp_ext_v1xd))
+      if (!mark_feature_used (&fpu_vfp_ext_v1xd)
+         && !ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext))
         as_bad (_("Instruction not permitted on this architecture"));
        do_neon_ldr_str ();
      }
@@ -18385,616 +20885,1195 @@ do_neon_ld_st_interleave (void)
    idx = ((inst.operands[0].imm >> 4) & 7)
         | (((inst.instruction >> 8) & 3) << 3);
  
-  typebits = typetable[idx];
+  typebits = typetable[idx];
+
+  constraint (typebits == -1, _("bad list type for instruction"));
+  constraint (((inst.instruction >> 8) & 3) && et.size == 64,
+             BAD_EL_TYPE);
+
+  inst.instruction &= ~0xf00;
+  inst.instruction |= typebits << 8;
+}
+
+/* Check alignment is valid for do_neon_ld_st_lane and do_neon_ld_dup.
+   *DO_ALIGNMENT is set to 1 if the relevant alignment bit should be set, 0 if
+   inst.operands[1].immisalign is clear; on FAIL it is left unwritten.
+   The variable arguments are (size, align) pairs of ints, ending with -1.  */
+
+static int
+neon_alignment_bit (int size, int align, int *do_alignment, ...)
+{
+  va_list ap;
+  int result = FAIL, thissize, thisalign;
+
+  if (!inst.operands[1].immisalign)
+    {
+      *do_alignment = 0;
+      return SUCCESS;
+    }
+
+  va_start (ap, do_alignment);
+
+  do
+    {
+      thissize = va_arg (ap, int);
+      if (thissize == -1)  /* Terminator reached without a match.  */
+       break;
+      thisalign = va_arg (ap, int);
+
+      if (size == thissize && align == thisalign)
+       result = SUCCESS;
+    }
+  while (result != SUCCESS);  /* Stop at the first legal pair that matches.  */
+
+  va_end (ap);
+
+  if (result == SUCCESS)
+    *do_alignment = 1;
+  else
+    first_error (_("unsupported alignment for instruction"));
+
+  return result;
+}
+
+static void
+do_neon_ld_st_lane (void)
+{
+  struct neon_type_el et = neon_check_type (1, NS_NULL, N_8 | N_16 | N_32);
+  int align_good, do_alignment = 0;
+  int logsize = neon_logbits (et.size);
+  int align = inst.operands[1].imm >> 8;  /* Alignment sits above bit 7.  */
+  int n = (inst.instruction >> 8) & 3;  /* Selects VLD<n+1> / VST<n+1>.  */
+  int max_el = 64 / et.size;
+  /* Encode the per-lane (LANE) VLD<n> / VST<n> forms into inst.instruction.  */
+  if (et.type == NT_invtype)
+    return;
+  /* Operand 0 holds the register list: check length, lane and stride.  */
+  constraint (NEON_REGLIST_LENGTH (inst.operands[0].imm) != n + 1,
+             _("bad list length"));
+  constraint (NEON_LANE (inst.operands[0].imm) >= max_el,
+             _("scalar index out of range"));
+  constraint (n != 0 && NEON_REG_STRIDE (inst.operands[0].imm) == 2
+             && et.size == 8,
+             _("stride of 2 unavailable when element size is 8"));
+
+  switch (n)
+    {
+    case 0:  /* VLD1 / VST1.  */
+      align_good = neon_alignment_bit (et.size, align, &do_alignment, 16, 16,
+                                      32, 32, -1);
+      if (align_good == FAIL)
+       return;
+      if (do_alignment)
+       {
+         unsigned alignbits = 0;
+         switch (et.size)
+           {
+           case 16: alignbits = 0x1; break;
+           case 32: alignbits = 0x3; break;
+           default: ;
+           }
+         inst.instruction |= alignbits << 4;
+       }
+      break;
+
+    case 1:  /* VLD2 / VST2.  */
+      align_good = neon_alignment_bit (et.size, align, &do_alignment, 8, 16,
+                     16, 32, 32, 64, -1);
+      if (align_good == FAIL)
+       return;
+      if (do_alignment)
+       inst.instruction |= 1 << 4;
+      break;
+
+    case 2:  /* VLD3 / VST3.  */
+      constraint (inst.operands[1].immisalign,
+                 _("can't use alignment with this instruction"));
+      break;
+
+    case 3:  /* VLD4 / VST4.  */
+      align_good = neon_alignment_bit (et.size, align, &do_alignment, 8, 32,
+                                      16, 64, 32, 64, 32, 128, -1);
+      if (align_good == FAIL)
+       return;
+      if (do_alignment)
+       {
+         unsigned alignbits = 0;
+         switch (et.size)
+           {
+           case 8:  alignbits = 0x1; break;
+           case 16: alignbits = 0x1; break;
+           case 32: alignbits = (align == 64) ? 0x1 : 0x2; break;
+           default: ;
+           }
+         inst.instruction |= alignbits << 4;
+       }
+      break;
+
+    default: ;
+    }
+
+  /* Reg stride of 2 is encoded in bit 5 when size==16, bit 6 when size==32.  */
+  if (n != 0 && NEON_REG_STRIDE (inst.operands[0].imm) == 2)
+    inst.instruction |= 1 << (4 + logsize);
+  /* Lane index goes in at bit (logsize + 5), logsize itself at bit 10.  */
+  inst.instruction |= NEON_LANE (inst.operands[0].imm) << (logsize + 5);
+  inst.instruction |= logsize << 10;
+}
+
+/* Encode single n-element structure to all lanes VLD<n> instructions.
+   ORs the size, list-length/stride and alignment bits into inst.instruction.  */
+static void
+do_neon_ld_dup (void)
+{
+  struct neon_type_el et = neon_check_type (1, NS_NULL, N_8 | N_16 | N_32);
+  int align_good, do_alignment = 0;
+
+  if (et.type == NT_invtype)
+    return;
+
+  switch ((inst.instruction >> 8) & 3)
+    {
+    case 0:  /* VLD1.  */
+      gas_assert (NEON_REG_STRIDE (inst.operands[0].imm) != 2);
+      align_good = neon_alignment_bit (et.size, inst.operands[1].imm >> 8,
+                                      &do_alignment, 16, 16, 32, 32, -1);
+      if (align_good == FAIL)
+       return;
+      switch (NEON_REGLIST_LENGTH (inst.operands[0].imm))
+       {
+       case 1: break;
+       case 2: inst.instruction |= 1 << 5; break;  /* Two-register list.  */
+       default: first_error (_("bad list length")); return;
+       }
+      inst.instruction |= neon_logbits (et.size) << 6;
+      break;
+
+    case 1:  /* VLD2.  */
+      align_good = neon_alignment_bit (et.size, inst.operands[1].imm >> 8,
+                                      &do_alignment, 8, 16, 16, 32, 32, 64,
+                                      -1);
+      if (align_good == FAIL)
+       return;
+      constraint (NEON_REGLIST_LENGTH (inst.operands[0].imm) != 2,
+                 _("bad list length"));
+      if (NEON_REG_STRIDE (inst.operands[0].imm) == 2)
+       inst.instruction |= 1 << 5;
+      inst.instruction |= neon_logbits (et.size) << 6;
+      break;
+
+    case 2:  /* VLD3.  */
+      constraint (inst.operands[1].immisalign,
+                 _("can't use alignment with this instruction"));
+      constraint (NEON_REGLIST_LENGTH (inst.operands[0].imm) != 3,
+                 _("bad list length"));
+      if (NEON_REG_STRIDE (inst.operands[0].imm) == 2)
+       inst.instruction |= 1 << 5;
+      inst.instruction |= neon_logbits (et.size) << 6;
+      break;
+
+    case 3:  /* VLD4.  */
+      {
+       int align = inst.operands[1].imm >> 8;
+       align_good = neon_alignment_bit (et.size, align, &do_alignment, 8, 32,
+                                        16, 64, 32, 64, 32, 128, -1);
+       if (align_good == FAIL)
+         return;
+       constraint (NEON_REGLIST_LENGTH (inst.operands[0].imm) != 4,
+                   _("bad list length"));
+       if (NEON_REG_STRIDE (inst.operands[0].imm) == 2)
+         inst.instruction |= 1 << 5;
+       if (et.size == 32 && align == 128)
+         inst.instruction |= 0x3 << 6;
+       else
+         inst.instruction |= neon_logbits (et.size) << 6;
+      }
+      break;
+
+    default: ;
+    }
+  /* Bit 4 is the alignment flag; do_alignment is still 0 for VLD3.  */
+  inst.instruction |= do_alignment << 4;
+}
+
+/* Disambiguate VLD<n> and VST<n> instructions, and fill in common bits (those
+   apart from bits [11:4]).  */
+
+static void
+do_neon_ldx_stx (void)
+{
+  if (inst.operands[1].isreg)
+    constraint (inst.operands[1].reg == REG_PC, BAD_PC);
+
+  switch (NEON_LANE (inst.operands[0].imm))
+    {
+    case NEON_INTERLEAVE_LANES:
+      NEON_ENCODE (INTERLV, inst);
+      do_neon_ld_st_interleave ();
+      break;
+
+    case NEON_ALL_LANES:
+      NEON_ENCODE (DUP, inst);
+      if (inst.instruction == N_INV)
+       {
+         first_error ("only loads support such operands");
+         break;
+       }
+      do_neon_ld_dup ();
+      break;
+
+    default:
+      NEON_ENCODE (LANE, inst);
+      do_neon_ld_st_lane ();
+    }
+
+  /* L bit comes from bit mask.  */
+  inst.instruction |= LOW4 (inst.operands[0].reg) << 12;
+  inst.instruction |= HI1 (inst.operands[0].reg) << 22;
+  inst.instruction |= inst.operands[1].reg << 16;
+
+  if (inst.operands[1].postind)
+    {
+      int postreg = inst.operands[1].imm & 0xf;
+      constraint (!inst.operands[1].immisreg,
+                 _("post-index must be a register"));
+      constraint (postreg == 0xd || postreg == 0xf,
+                 _("bad register for post-index"));
+      inst.instruction |= postreg;
+    }
+  else
+    {
+      constraint (inst.operands[1].immisreg, BAD_ADDR_MODE);
+      constraint (inst.relocs[0].exp.X_op != O_constant
+                 || inst.relocs[0].exp.X_add_number != 0,
+                 BAD_ADDR_MODE);
  
-  constraint (typebits == -1, _("bad list type for instruction"));
-  constraint (((inst.instruction >> 8) & 3) && et.size == 64,
-             BAD_EL_TYPE);
+      if (inst.operands[1].writeback)
+       {
+         inst.instruction |= 0xd;
+       }
+      else
+       inst.instruction |= 0xf;
+    }
  
-  inst.instruction &= ~0xf00;
-  inst.instruction |= typebits << 8;
+  if (thumb_mode)
+    inst.instruction |= 0xf9000000;
+  else
+    inst.instruction |= 0xf4000000;
  }
  
-/* Check alignment is valid for do_neon_ld_st_lane and do_neon_ld_dup.
-   *DO_ALIGN is set to 1 if the relevant alignment bit should be set, 0
-   otherwise. The variable arguments are a list of pairs of legal (size, align)
-   values, terminated with -1.  */
-
-static int
-neon_alignment_bit (int size, int align, int *do_alignment, ...)
+/* FP v8.  */
+static void
+do_vfp_nsyn_fpv8 (enum neon_shape rs)
  {
-  va_list ap;
-  int result = FAIL, thissize, thisalign;
+  /* Targets like FPv5-SP-D16 don't support FP v8 instructions with
+     D register operands.  */
+  if (neon_shape_class[rs] == SC_DOUBLE)
+    constraint (!ARM_CPU_HAS_FEATURE (cpu_variant, fpu_vfp_ext_armv8),
+               _(BAD_FPU));
  
-  if (!inst.operands[1].immisalign)
+  NEON_ENCODE (FPV8, inst);
+
+  if (rs == NS_FFF || rs == NS_HHH)
      {
-      *do_alignment = 0;
-      return SUCCESS;
+      do_vfp_sp_dyadic ();
+
+      /* ARMv8.2 fp16 instruction.  */
+      if (rs == NS_HHH)
+       do_scalar_fp16_v82_encode ();
      }
+  else
+    do_vfp_dp_rd_rn_rm ();
  
-  va_start (ap, do_alignment);
+  if (rs == NS_DDD)
+    inst.instruction |= 0x100;
  
-  do
-    {
-      thissize = va_arg (ap, int);
-      if (thissize == -1)
-       break;
-      thisalign = va_arg (ap, int);
+  inst.instruction |= 0xf0000000;
+}
  
-      if (size == thissize && align == thisalign)
-       result = SUCCESS;
-    }
-  while (result != SUCCESS);
+static void
+do_vsel (void)
+{
+  set_pred_insn_type (OUTSIDE_PRED_INSN);
  
-  va_end (ap);
+  if (try_vfp_nsyn (3, do_vfp_nsyn_fpv8) != SUCCESS)
+    first_error (_("invalid instruction shape"));
+}
  
-  if (result == SUCCESS)
-    *do_alignment = 1;
-  else
-    first_error (_("unsupported alignment for instruction"));
+static void
+do_vmaxnm (void)
+{
+  if (!ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext))
+    set_pred_insn_type (OUTSIDE_PRED_INSN);
  
-  return result;
+  if (try_vfp_nsyn (3, do_vfp_nsyn_fpv8) == SUCCESS)
+    return;
+
+  if (!check_simd_pred_availability (TRUE, NEON_CHECK_CC | NEON_CHECK_ARCH8))
+    return;
+
+  neon_dyadic_misc (NT_untyped, N_F_16_32, 0);
  }
  
  static void
-do_neon_ld_st_lane (void)
+do_vrint_1 (enum neon_cvt_mode mode)
  {
-  struct neon_type_el et = neon_check_type (1, NS_NULL, N_8 | N_16 | N_32);
-  int align_good, do_alignment = 0;
-  int logsize = neon_logbits (et.size);
-  int align = inst.operands[1].imm >> 8;
-  int n = (inst.instruction >> 8) & 3;
-  int max_el = 64 / et.size;
+  enum neon_shape rs = neon_select_shape (NS_HH, NS_FF, NS_DD, NS_QQ, NS_NULL);
+  struct neon_type_el et;
  
-  if (et.type == NT_invtype)
+  if (rs == NS_NULL)
      return;
  
-  constraint (NEON_REGLIST_LENGTH (inst.operands[0].imm) != n + 1,
-             _("bad list length"));
-  constraint (NEON_LANE (inst.operands[0].imm) >= max_el,
-             _("scalar index out of range"));
-  constraint (n != 0 && NEON_REG_STRIDE (inst.operands[0].imm) == 2
-             && et.size == 8,
-             _("stride of 2 unavailable when element size is 8"));
+  /* Targets like FPv5-SP-D16 don't support FP v8 instructions with
+     D register operands.  */
+  if (neon_shape_class[rs] == SC_DOUBLE)
+    constraint (!ARM_CPU_HAS_FEATURE (cpu_variant, fpu_vfp_ext_armv8),
+               _(BAD_FPU));
  
-  switch (n)
+  et = neon_check_type (2, rs, N_EQK | N_VFP, N_F_ALL | N_KEY
+                       | N_VFP);
+  if (et.type != NT_invtype)
      {
-    case 0:  /* VLD1 / VST1.  */
-      align_good = neon_alignment_bit (et.size, align, &do_alignment, 16, 16,
-                                      32, 32, -1);
-      if (align_good == FAIL)
-       return;
-      if (do_alignment)
+      /* VFP encodings.  */
+      if (mode == neon_cvt_mode_a || mode == neon_cvt_mode_n
+         || mode == neon_cvt_mode_p || mode == neon_cvt_mode_m)
+       set_pred_insn_type (OUTSIDE_PRED_INSN);
+
+      NEON_ENCODE (FPV8, inst);
+      if (rs == NS_FF || rs == NS_HH)
+       do_vfp_sp_monadic ();
+      else
+       do_vfp_dp_rd_rm ();
+
+      switch (mode)
         {
-         unsigned alignbits = 0;
-         switch (et.size)
-           {
-           case 16: alignbits = 0x1; break;
-           case 32: alignbits = 0x3; break;
-           default: ;
-           }
-         inst.instruction |= alignbits << 4;
+       case neon_cvt_mode_r: inst.instruction |= 0x00000000; break;
+       case neon_cvt_mode_z: inst.instruction |= 0x00000080; break;
+       case neon_cvt_mode_x: inst.instruction |= 0x00010000; break;
+       case neon_cvt_mode_a: inst.instruction |= 0xf0000000; break;
+       case neon_cvt_mode_n: inst.instruction |= 0xf0010000; break;
+       case neon_cvt_mode_p: inst.instruction |= 0xf0020000; break;
+       case neon_cvt_mode_m: inst.instruction |= 0xf0030000; break;
+       default: abort ();
         }
-      break;
  
-    case 1:  /* VLD2 / VST2.  */
-      align_good = neon_alignment_bit (et.size, align, &do_alignment, 8, 16,
-                     16, 32, 32, 64, -1);
-      if (align_good == FAIL)
-       return;
-      if (do_alignment)
-       inst.instruction |= 1 << 4;
-      break;
+      inst.instruction |= (rs == NS_DD) << 8;
+      do_vfp_cond_or_thumb ();
  
-    case 2:  /* VLD3 / VST3.  */
-      constraint (inst.operands[1].immisalign,
-                 _("can't use alignment with this instruction"));
-      break;
+      /* ARMv8.2 fp16 vrint instruction.  */
+      if (rs == NS_HH)
+      do_scalar_fp16_v82_encode ();
+    }
+  else
+    {
+      /* Neon encodings (or something broken...).  */
+      inst.error = NULL;
+      et = neon_check_type (2, rs, N_EQK, N_F_16_32 | N_KEY);
  
-    case 3:  /* VLD4 / VST4.  */
-      align_good = neon_alignment_bit (et.size, align, &do_alignment, 8, 32,
-                                      16, 64, 32, 64, 32, 128, -1);
-      if (align_good == FAIL)
+      if (et.type == NT_invtype)
         return;
-      if (do_alignment)
+
+      if (!check_simd_pred_availability (TRUE,
+                                        NEON_CHECK_CC | NEON_CHECK_ARCH8))
+       return;
+
+      NEON_ENCODE (FLOAT, inst);
+
+      inst.instruction |= LOW4 (inst.operands[0].reg) << 12;
+      inst.instruction |= HI1 (inst.operands[0].reg) << 22;
+      inst.instruction |= LOW4 (inst.operands[1].reg);
+      inst.instruction |= HI1 (inst.operands[1].reg) << 5;
+      inst.instruction |= neon_quad (rs) << 6;
+      /* Mask off the original size bits and reencode them.  */
+      inst.instruction = ((inst.instruction & 0xfff3ffff)
+                         | neon_logbits (et.size) << 18);
+
+      switch (mode)
         {
-         unsigned alignbits = 0;
-         switch (et.size)
-           {
-           case 8:  alignbits = 0x1; break;
-           case 16: alignbits = 0x1; break;
-           case 32: alignbits = (align == 64) ? 0x1 : 0x2; break;
-           default: ;
-           }
-         inst.instruction |= alignbits << 4;
+       case neon_cvt_mode_z: inst.instruction |= 3 << 7; break;
+       case neon_cvt_mode_x: inst.instruction |= 1 << 7; break;
+       case neon_cvt_mode_a: inst.instruction |= 2 << 7; break;
+       case neon_cvt_mode_n: inst.instruction |= 0 << 7; break;
+       case neon_cvt_mode_p: inst.instruction |= 7 << 7; break;
+       case neon_cvt_mode_m: inst.instruction |= 5 << 7; break;
+       case neon_cvt_mode_r: inst.error = _("invalid rounding mode"); break;
+       default: abort ();
         }
-      break;
  
-    default: ;
-    }
+      if (thumb_mode)
+       inst.instruction |= 0xfc000000;
+      else
+       inst.instruction |= 0xf0000000;
+    }
+}
+
+static void
+do_vrintx (void)
+{
+  do_vrint_1 (neon_cvt_mode_x);  /* Shared encoder, rounding mode x.  */
+}
+
+static void
+do_vrintz (void)
+{
+  do_vrint_1 (neon_cvt_mode_z);  /* Shared encoder, rounding mode z.  */
+}
+
+static void
+do_vrintr (void)
+{
+  do_vrint_1 (neon_cvt_mode_r);  /* Mode r is rejected on the Neon path.  */
+}
+
+static void
+do_vrinta (void)
+{
+  do_vrint_1 (neon_cvt_mode_a);  /* Shared encoder, rounding mode a.  */
+}
+
+static void
+do_vrintn (void)
+{
+  do_vrint_1 (neon_cvt_mode_n);  /* Shared encoder, rounding mode n.  */
+}
  
-  /* Reg stride of 2 is encoded in bit 5 when size==16, bit 6 when size==32.  */
-  if (n != 0 && NEON_REG_STRIDE (inst.operands[0].imm) == 2)
-    inst.instruction |= 1 << (4 + logsize);
+static void
+do_vrintp (void)
+{
+  do_vrint_1 (neon_cvt_mode_p);  /* Shared encoder, rounding mode p.  */
+}
  
-  inst.instruction |= NEON_LANE (inst.operands[0].imm) << (logsize + 5);
-  inst.instruction |= logsize << 10;
+static void
+do_vrintm (void)
+{
+  do_vrint_1 (neon_cvt_mode_m);
  }
  
-/* Encode single n-element structure to all lanes VLD<n> instructions.  */
+static unsigned
+neon_scalar_for_vcmla (unsigned opnd, unsigned elsize)
+{
+  unsigned regno = NEON_SCALAR_REG (opnd);
+  unsigned elno = NEON_SCALAR_INDEX (opnd);
+  /* Valid scalars: 16-bit with index 0-1 and register < 16, 32-bit index 0.  */
+  if (elsize == 16 && elno < 2 && regno < 16)
+    return regno | (elno << 4);  /* Index is returned in bit 4.  */
+  else if (elsize == 32 && elno == 0)
+    return regno;
+  /* Anything else, including other element sizes: report it and return 0.  */
+  first_error (_("scalar out of range"));
+  return 0;
+}
  
  static void
-do_neon_ld_dup (void)
+do_vcmla (void)
  {
-  struct neon_type_el et = neon_check_type (1, NS_NULL, N_8 | N_16 | N_32);
-  int align_good, do_alignment = 0;
+  constraint (!ARM_CPU_HAS_FEATURE (cpu_variant, mve_fp_ext)
+             && (!ARM_CPU_HAS_FEATURE (cpu_variant, fpu_neon_ext_armv8)
+                 || !mark_feature_used (&arm_ext_v8_3)), (BAD_FPU));
+  constraint (inst.relocs[0].exp.X_op != O_constant,
+             _("expression too complex"));
+  unsigned rot = inst.relocs[0].exp.X_add_number;
+  constraint (rot != 0 && rot != 90 && rot != 180 && rot != 270,
+             _("immediate out of range"));
+  rot /= 90;
  
-  if (et.type == NT_invtype)
+  if (!check_simd_pred_availability (TRUE,
+                                    NEON_CHECK_ARCH8 | NEON_CHECK_CC))
      return;
  
-  switch ((inst.instruction >> 8) & 3)
+  if (inst.operands[2].isscalar)
      {
-    case 0:  /* VLD1.  */
-      gas_assert (NEON_REG_STRIDE (inst.operands[0].imm) != 2);
-      align_good = neon_alignment_bit (et.size, inst.operands[1].imm >> 8,
-                                      &do_alignment, 16, 16, 32, 32, -1);
-      if (align_good == FAIL)
-       return;
-      switch (NEON_REGLIST_LENGTH (inst.operands[0].imm))
-       {
-       case 1: break;
-       case 2: inst.instruction |= 1 << 5; break;
-       default: first_error (_("bad list length")); return;
-       }
-      inst.instruction |= neon_logbits (et.size) << 6;
-      break;
-
-    case 1:  /* VLD2.  */
-      align_good = neon_alignment_bit (et.size, inst.operands[1].imm >> 8,
-                                      &do_alignment, 8, 16, 16, 32, 32, 64,
-                                      -1);
-      if (align_good == FAIL)
-       return;
-      constraint (NEON_REGLIST_LENGTH (inst.operands[0].imm) != 2,
-                 _("bad list length"));
-      if (NEON_REG_STRIDE (inst.operands[0].imm) == 2)
-       inst.instruction |= 1 << 5;
-      inst.instruction |= neon_logbits (et.size) << 6;
-      break;
-
-    case 2:  /* VLD3.  */
-      constraint (inst.operands[1].immisalign,
-                 _("can't use alignment with this instruction"));
-      constraint (NEON_REGLIST_LENGTH (inst.operands[0].imm) != 3,
-                 _("bad list length"));
-      if (NEON_REG_STRIDE (inst.operands[0].imm) == 2)
-       inst.instruction |= 1 << 5;
-      inst.instruction |= neon_logbits (et.size) << 6;
-      break;
+      if (ARM_CPU_HAS_FEATURE (cpu_variant, mve_fp_ext))
+       first_error (_("invalid instruction shape"));
+      enum neon_shape rs = neon_select_shape (NS_DDSI, NS_QQSI, NS_NULL);
+      unsigned size = neon_check_type (3, rs, N_EQK, N_EQK,
+                                      N_KEY | N_F16 | N_F32).size;
+      unsigned m = neon_scalar_for_vcmla (inst.operands[2].reg, size);
+      inst.is_neon = 1;
+      inst.instruction = 0xfe000800;
+      inst.instruction |= LOW4 (inst.operands[0].reg) << 12;
+      inst.instruction |= HI1 (inst.operands[0].reg) << 22;
+      inst.instruction |= LOW4 (inst.operands[1].reg) << 16;
+      inst.instruction |= HI1 (inst.operands[1].reg) << 7;
+      inst.instruction |= LOW4 (m);
+      inst.instruction |= HI1 (m) << 5;
+      inst.instruction |= neon_quad (rs) << 6;
+      inst.instruction |= rot << 20;
+      inst.instruction |= (size == 32) << 23;
+    }
+  else
+    {
+      enum neon_shape rs;
+      if (ARM_CPU_HAS_FEATURE (cpu_variant, mve_fp_ext))
+       rs = neon_select_shape (NS_QQQI, NS_NULL);
+      else
+       rs = neon_select_shape (NS_DDDI, NS_QQQI, NS_NULL);
  
-    case 3:  /* VLD4.  */
-      {
-       int align = inst.operands[1].imm >> 8;
-       align_good = neon_alignment_bit (et.size, align, &do_alignment, 8, 32,
-                                        16, 64, 32, 64, 32, 128, -1);
-       if (align_good == FAIL)
-         return;
-       constraint (NEON_REGLIST_LENGTH (inst.operands[0].imm) != 4,
-                   _("bad list length"));
-       if (NEON_REG_STRIDE (inst.operands[0].imm) == 2)
-         inst.instruction |= 1 << 5;
-       if (et.size == 32 && align == 128)
-         inst.instruction |= 0x3 << 6;
-       else
-         inst.instruction |= neon_logbits (et.size) << 6;
-      }
-      break;
+      unsigned size = neon_check_type (3, rs, N_EQK, N_EQK,
+                                      N_KEY | N_F16 | N_F32).size;
+      if (ARM_CPU_HAS_FEATURE (cpu_variant, mve_fp_ext) && size == 32
+         && (inst.operands[0].reg == inst.operands[1].reg
+             || inst.operands[0].reg == inst.operands[2].reg))
+       as_tsktsk (BAD_MVE_SRCDEST);
  
-    default: ;
+      neon_three_same (neon_quad (rs), 0, -1);
+      inst.instruction &= 0x00ffffff; /* Undo neon_dp_fixup.  */
+      inst.instruction |= 0xfc200800;
+      inst.instruction |= rot << 23;
+      inst.instruction |= (size == 32) << 20;
      }
-
-  inst.instruction |= do_alignment << 4;
  }
  
-/* Disambiguate VLD<n> and VST<n> instructions, and fill in common bits (those
-   apart from bits [11:4].  */
-
  static void
-do_neon_ldx_stx (void)
+do_vcadd (void)
  {
-  if (inst.operands[1].isreg)
-    constraint (inst.operands[1].reg == REG_PC, BAD_PC);
+  constraint (!ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext)
+             && (!ARM_CPU_HAS_FEATURE (cpu_variant, fpu_neon_ext_armv8)
+                 || !mark_feature_used (&arm_ext_v8_3)), (BAD_FPU));
+  constraint (inst.relocs[0].exp.X_op != O_constant,
+             _("expression too complex"));
  
-  switch (NEON_LANE (inst.operands[0].imm))
+  unsigned rot = inst.relocs[0].exp.X_add_number;
+  constraint (rot != 90 && rot != 270, _("immediate out of range"));
+  enum neon_shape rs;
+  struct neon_type_el et;
+  if (!ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext))
      {
-    case NEON_INTERLEAVE_LANES:
-      NEON_ENCODE (INTERLV, inst);
-      do_neon_ld_st_interleave ();
-      break;
+      rs = neon_select_shape (NS_DDDI, NS_QQQI, NS_NULL);
+      et = neon_check_type (3, rs, N_EQK, N_EQK, N_KEY | N_F16 | N_F32);
+    }
+  else
+    {
+      rs = neon_select_shape (NS_QQQI, NS_NULL);
+      et = neon_check_type (3, rs, N_EQK, N_EQK, N_KEY | N_F16 | N_F32 | N_I8
+                           | N_I16 | N_I32);
+      if (et.size == 32 && inst.operands[0].reg == inst.operands[2].reg)
+       as_tsktsk (_("Warning: 32-bit element size and same first and third "
+                    "operand makes instruction UNPREDICTABLE"));
+    }
  
-    case NEON_ALL_LANES:
-      NEON_ENCODE (DUP, inst);
-      if (inst.instruction == N_INV)
-       {
-         first_error ("only loads support such operands");
-         break;
-       }
-      do_neon_ld_dup ();
-      break;
+  if (et.type == NT_invtype)
+    return;
  
-    default:
-      NEON_ENCODE (LANE, inst);
-      do_neon_ld_st_lane ();
+  if (!check_simd_pred_availability (et.type == NT_float,
+                                    NEON_CHECK_ARCH8 | NEON_CHECK_CC))
+    return;
+
+  if (et.type == NT_float)
+    {
+      neon_three_same (neon_quad (rs), 0, -1);
+      inst.instruction &= 0x00ffffff; /* Undo neon_dp_fixup.  */
+      inst.instruction |= 0xfc800800;
+      inst.instruction |= (rot == 270) << 24;
+      inst.instruction |= (et.size == 32) << 20;
+    }
+  else
+    {
+      constraint (!ARM_CPU_HAS_FEATURE (cpu_variant, mve_ext), BAD_FPU);
+      inst.instruction = 0xfe000f00;
+      inst.instruction |= HI1 (inst.operands[0].reg) << 22;
+      inst.instruction |= neon_logbits (et.size) << 20;
+      inst.instruction |= LOW4 (inst.operands[1].reg) << 16;
+      inst.instruction |= LOW4 (inst.operands[0].reg) << 12;
+      inst.instruction |= (rot == 270) << 12;
+      inst.instruction |= HI1 (inst.operands[1].reg) << 7;
+      inst.instruction |= HI1 (inst.operands[2].reg) << 5;
+      inst.instruction |= LOW4 (inst.operands[2].reg);
+      inst.is_neon = 1;
      }
+}
  
-  /* L bit comes from bit mask.  */
-  inst.instruction |= LOW4 (inst.operands[0].reg) << 12;
-  inst.instruction |= HI1 (inst.operands[0].reg) << 22;
-  inst.instruction |= inst.operands[1].reg << 16;
+/* Dot Product instructions encoding support.  */
  
-  if (inst.operands[1].postind)
+static void
+do_neon_dotproduct (int unsigned_p)
+{
+  enum neon_shape rs;
+  unsigned scalar_oprd2 = 0;
+  int high8;
+
+  if (inst.cond != COND_ALWAYS)
+    as_warn (_("Dot Product instructions cannot be conditional,  the behaviour "
+              "is UNPREDICTABLE"));
+
+  constraint (!ARM_CPU_HAS_FEATURE (cpu_variant, fpu_neon_ext_armv8),
+             _(BAD_FPU));
+
+  /* Dot Product instructions are in three-same D/Q register format or the third
+     operand can be a scalar index register.  */
+  if (inst.operands[2].isscalar)
      {
-      int postreg = inst.operands[1].imm & 0xf;
-      constraint (!inst.operands[1].immisreg,
-                 _("post-index must be a register"));
-      constraint (postreg == 0xd || postreg == 0xf,
-                 _("bad register for post-index"));
-      inst.instruction |= postreg;
+      scalar_oprd2 = neon_scalar_for_mul (inst.operands[2].reg, 32);
+      high8 = 0xfe000000;
+      rs = neon_select_shape (NS_DDS, NS_QQS, NS_NULL);
      }
    else
      {
-      constraint (inst.operands[1].immisreg, BAD_ADDR_MODE);
-      constraint (inst.relocs[0].exp.X_op != O_constant
-                 || inst.relocs[0].exp.X_add_number != 0,
-                 BAD_ADDR_MODE);
+      high8 = 0xfc000000;
+      rs = neon_select_shape (NS_DDD, NS_QQQ, NS_NULL);
+    }
  
-      if (inst.operands[1].writeback)
-       {
-         inst.instruction |= 0xd;
-       }
-      else
-       inst.instruction |= 0xf;
+  if (unsigned_p)
+    neon_check_type (3, rs, N_EQK, N_EQK, N_KEY | N_U8);
+  else
+    neon_check_type (3, rs, N_EQK, N_EQK, N_KEY | N_S8);
+
+  /* The "U" bit in the traditional Three Same encoding is fixed to 0 for Dot
+     Product instructions, so we pass 0 as the "ubit" parameter.  The "Size"
+     field is fixed to 0x2, so we pass 32 as the "size" parameter.  */
+  neon_three_same (neon_quad (rs), 0, 32);
+
+  /* Undo neon_dp_fixup.  Dot Product instructions are using a slightly
+     different NEON three-same encoding.  */
+  inst.instruction &= 0x00ffffff;
+  inst.instruction |= high8;
+  /* Encode 'U' bit which indicates signedness.  */
+  inst.instruction |= (unsigned_p ? 1 : 0) << 4;
+  /* Re-encode operand 2 if it is an indexed scalar.  What neon_three_same
+     encoded from inst.operands[2].reg is GAS's internal encoding, not the
+     instruction encoding.  */
+  if (inst.operands[2].isscalar)
+    {
+      inst.instruction &= 0xffffffd0;
+      inst.instruction |= LOW4 (scalar_oprd2);
+      inst.instruction |= HI1 (scalar_oprd2) << 5;
      }
-
-  if (thumb_mode)
-    inst.instruction |= 0xf9000000;
-  else
-    inst.instruction |= 0xf4000000;
  }
  
-/* FP v8.  */
+/* Dot Product instructions for signed integer.  */
+
  static void
-do_vfp_nsyn_fpv8 (enum neon_shape rs)
+do_neon_dotproduct_s (void)
  {
-  /* Targets like FPv5-SP-D16 don't support FP v8 instructions with
-     D register operands.  */
-  if (neon_shape_class[rs] == SC_DOUBLE)
-    constraint (!ARM_CPU_HAS_FEATURE (cpu_variant, fpu_vfp_ext_armv8),
-               _(BAD_FPU));
+  return do_neon_dotproduct (0);
+}
  
-  NEON_ENCODE (FPV8, inst);
+/* Dot Product instructions for unsigned integer.  */
  
-  if (rs == NS_FFF || rs == NS_HHH)
+static void
+do_neon_dotproduct_u (void)
+{
+  do_neon_dotproduct (1);  /* Nonzero selects the unsigned (N_U8) form.  */
+}
+
+static void
+do_vusdot (void)
+{
+  enum neon_shape rs;
+  set_pred_insn_type (OUTSIDE_PRED_INSN);
+  if (inst.operands[2].isscalar)
      {
-      do_vfp_sp_dyadic ();
+      rs = neon_select_shape (NS_DDS, NS_QQS, NS_NULL);
+      neon_check_type (3, rs, N_EQK, N_EQK, N_S8 | N_KEY);
  
-      /* ARMv8.2 fp16 instruction.  */
-      if (rs == NS_HHH)
-       do_scalar_fp16_v82_encode ();
+      inst.instruction |= (1 << 25);
+      int index = inst.operands[2].reg & 0xf;
+      constraint ((index != 1 && index != 0), _("index must be 0 or 1"));
+      inst.operands[2].reg >>= 4;
+      constraint (!(inst.operands[2].reg < 16),
+                 _("indexed register must be less than 16"));
+      neon_three_args (rs == NS_QQS);
+      inst.instruction |= (index << 5);
      }
    else
-    do_vfp_dp_rd_rn_rm ();
+    {
+      inst.instruction |= (1 << 21);
+      rs = neon_select_shape (NS_DDD, NS_QQQ, NS_NULL);
+      neon_check_type (3, rs, N_EQK, N_EQK, N_S8 | N_KEY);
+      neon_three_args (rs == NS_QQQ);
+    }
+}
  
-  if (rs == NS_DDD)
-    inst.instruction |= 0x100;
+static void
+do_vsudot (void)
+{
+  enum neon_shape rs;
+  set_pred_insn_type (OUTSIDE_PRED_INSN);
+  if (inst.operands[2].isscalar)
+    {
+      rs = neon_select_shape (NS_DDS, NS_QQS, NS_NULL);
+      neon_check_type (3, rs, N_EQK, N_EQK, N_U8 | N_KEY);
  
-  inst.instruction |= 0xf0000000;
+      inst.instruction |= (1 << 25);
+      int index = inst.operands[2].reg & 0xf;
+      constraint ((index != 1 && index != 0), _("index must be 0 or 1"));
+      inst.operands[2].reg >>= 4;
+      constraint (!(inst.operands[2].reg < 16),
+                 _("indexed register must be less than 16"));
+      neon_three_args (rs == NS_QQS);
+      inst.instruction |= (index << 5);
+    }
  }
  
  static void
-do_vsel (void)
+do_vsmmla (void)
  {
+  enum neon_shape rs = neon_select_shape (NS_QQQ, NS_NULL);
+  neon_check_type (3, rs, N_EQK, N_EQK, N_S8 | N_KEY);
+
    set_pred_insn_type (OUTSIDE_PRED_INSN);
  
-  if (try_vfp_nsyn (3, do_vfp_nsyn_fpv8) != SUCCESS)
-    first_error (_("invalid instruction shape"));
+  neon_three_args (1);
+
  }
  
  static void
-do_vmaxnm (void)
+do_vummla (void)
  {
-  set_pred_insn_type (OUTSIDE_PRED_INSN);
+  enum neon_shape rs = neon_select_shape (NS_QQQ, NS_NULL);
+  neon_check_type (3, rs, N_EQK, N_EQK, N_U8 | N_KEY);
  
-  if (try_vfp_nsyn (3, do_vfp_nsyn_fpv8) == SUCCESS)
-    return;
+  set_pred_insn_type (OUTSIDE_PRED_INSN);
  
-  if (vfp_or_neon_is_neon (NEON_CHECK_CC | NEON_CHECK_ARCH8) == FAIL)
-    return;
+  neon_three_args (1);
  
-  neon_dyadic_misc (NT_untyped, N_F_16_32, 0);
  }
  
  static void
-do_vrint_1 (enum neon_cvt_mode mode)
+check_cde_operand (size_t index, int is_dual) /* Range-check operand INDEX's register.  */
  {
-  enum neon_shape rs = neon_select_shape (NS_HH, NS_FF, NS_DD, NS_QQ, NS_NULL);
-  struct neon_type_el et;
+  unsigned Rx = inst.operands[index].reg;
+  bfd_boolean isvec = inst.operands[index].isvec;
+  if (is_dual == 0 && thumb_mode) /* Single-register operand in Thumb mode.  */
+    constraint (
+               !((Rx <= 14 && Rx != 13) || (Rx == REG_PC && isvec)), /* REG_PC + isvec presumably denotes APSR_nzcv (see message).  */
+               _("Register must be r0-r14 except r13, or APSR_nzcv."));
+  else /* Dual operand (or not Thumb): must be an even register no higher than r10.  */
+    constraint ( !((Rx <= 10 && Rx % 2 == 0 )),
+      _("Register must be an even register between r0-r10."));
+}
  
-  if (rs == NS_NULL)
-    return;
+static bfd_boolean
+cde_coproc_enabled (unsigned coproc) /* Result of mark_feature_used for coprocessor COPROC's CDE feature.  */
+{
+  switch (coproc)
+  {
+    case 0: return mark_feature_used (&arm_ext_cde0);
+    case 1: return mark_feature_used (&arm_ext_cde1);
+    case 2: return mark_feature_used (&arm_ext_cde2);
+    case 3: return mark_feature_used (&arm_ext_cde3);
+    case 4: return mark_feature_used (&arm_ext_cde4);
+    case 5: return mark_feature_used (&arm_ext_cde5);
+    case 6: return mark_feature_used (&arm_ext_cde6);
+    case 7: return mark_feature_used (&arm_ext_cde7);
+    default: return FALSE; /* Anything above 7 is reported as not enabled.  */
+  }
+}
  
-  /* Targets like FPv5-SP-D16 don't support FP v8 instructions with
-     D register operands.  */
-  if (neon_shape_class[rs] == SC_DOUBLE)
-    constraint (!ARM_CPU_HAS_FEATURE (cpu_variant, fpu_vfp_ext_armv8),
-               _(BAD_FPU));
+#define cde_coproc_pos 8 /* Shift applied to the coprocessor number below.  */
+static void
+cde_handle_coproc (void) /* Validate operand 0 as a CDE coprocessor number and encode it.  */
+{
+  unsigned coproc = inst.operands[0].reg;
+  constraint (coproc > 7, _("CDE Coprocessor must be in range 0-7"));
+  constraint (!(cde_coproc_enabled (coproc)), BAD_CDE_COPROC); /* Rejected unless cde_coproc_enabled accepts it.  */
+  inst.instruction |= coproc << cde_coproc_pos;
+}
+#undef cde_coproc_pos
  
-  et = neon_check_type (2, rs, N_EQK | N_VFP, N_F_ALL | N_KEY
-                       | N_VFP);
-  if (et.type != NT_invtype)
-    {
-      /* VFP encodings.  */
-      if (mode == neon_cvt_mode_a || mode == neon_cvt_mode_n
-         || mode == neon_cvt_mode_p || mode == neon_cvt_mode_m)
-       set_pred_insn_type (OUTSIDE_PRED_INSN);
+static void
+cxn_handle_predication (bfd_boolean is_accum) /* Pick the predication type, or flag a syntax error.  */
+{
+  if (is_accum && conditional_insn ()) /* Accumulator form carrying a suffix.  */
+    set_pred_insn_type (INSIDE_IT_INSN);
+  else if (conditional_insn ())
+  /* conditional_insn essentially checks for a suffix, not whether the
+     instruction is inside an IT block or not.
+     The non-accumulator versions should not have suffixes.  */
+    inst.error = BAD_SYNTAX;
+  else /* No suffix at all, for either form.  */
+    set_pred_insn_type (OUTSIDE_PRED_INSN);
+}
  
-      NEON_ENCODE (FPV8, inst);
-      if (rs == NS_FF || rs == NS_HH)
-       do_vfp_sp_monadic ();
-      else
-       do_vfp_dp_rd_rm ();
+static void
+do_custom_instruction_1 (int is_dual, bfd_boolean is_accum) /* Shared encoder behind do_cx1, do_cx1a, do_cx1d and do_cx1da.  */
+{
  
-      switch (mode)
-       {
-       case neon_cvt_mode_r: inst.instruction |= 0x00000000; break;
-       case neon_cvt_mode_z: inst.instruction |= 0x00000080; break;
-       case neon_cvt_mode_x: inst.instruction |= 0x00010000; break;
-       case neon_cvt_mode_a: inst.instruction |= 0xf0000000; break;
-       case neon_cvt_mode_n: inst.instruction |= 0xf0010000; break;
-       case neon_cvt_mode_p: inst.instruction |= 0xf0020000; break;
-       case neon_cvt_mode_m: inst.instruction |= 0xf0030000; break;
-       default: abort ();
-       }
+  constraint (!mark_feature_used (&arm_ext_cde), _(BAD_CDE));
  
-      inst.instruction |= (rs == NS_DD) << 8;
-      do_vfp_cond_or_thumb ();
+  unsigned imm, Rd;
  
-      /* ARMv8.2 fp16 vrint instruction.  */
-      if (rs == NS_HH)
-      do_scalar_fp16_v82_encode ();
+  Rd = inst.operands[1].reg;
+  check_cde_operand (1, is_dual);
+
+  if (is_dual == 1) /* Dual form: operand 2 is the second destination, operand 3 the immediate.  */
+    {
+      constraint (inst.operands[2].reg != Rd + 1,
+                 _("cx1d requires consecutive destination registers."));
+      imm = inst.operands[3].imm;
      }
+  else if (is_dual == 0) /* Single form: operand 2 is the immediate.  */
+    imm = inst.operands[2].imm;
    else
-    {
-      /* Neon encodings (or something broken...).  */
-      inst.error = NULL;
-      et = neon_check_type (2, rs, N_EQK, N_F_16_32 | N_KEY);
+    abort (); /* is_dual must be exactly 0 or 1.  */
  
-      if (et.type == NT_invtype)
-       return;
+  inst.instruction |= Rd << 12;
+  inst.instruction |= (imm & 0x1F80) << 9; /* imm[12:7] -> insn[21:16].  */
+  inst.instruction |= (imm & 0x0040) << 1; /* imm[6] -> insn[7].  */
+  inst.instruction |= (imm & 0x003f); /* imm[5:0] -> insn[5:0].  */
  
-      set_pred_insn_type (OUTSIDE_PRED_INSN);
-      NEON_ENCODE (FLOAT, inst);
+  cde_handle_coproc ();
+  cxn_handle_predication (is_accum);
+}
  
-      if (vfp_or_neon_is_neon (NEON_CHECK_CC | NEON_CHECK_ARCH8) == FAIL)
-       return;
+static void
+do_custom_instruction_2 (int is_dual, bfd_boolean is_accum) /* Shared encoder behind do_cx2, do_cx2a, do_cx2d and do_cx2da.  */
+{
  
-      inst.instruction |= LOW4 (inst.operands[0].reg) << 12;
-      inst.instruction |= HI1 (inst.operands[0].reg) << 22;
-      inst.instruction |= LOW4 (inst.operands[1].reg);
-      inst.instruction |= HI1 (inst.operands[1].reg) << 5;
-      inst.instruction |= neon_quad (rs) << 6;
-      /* Mask off the original size bits and reencode them.  */
-      inst.instruction = ((inst.instruction & 0xfff3ffff)
-                         | neon_logbits (et.size) << 18);
+  constraint (!mark_feature_used (&arm_ext_cde), _(BAD_CDE));
  
-      switch (mode)
-       {
-       case neon_cvt_mode_z: inst.instruction |= 3 << 7; break;
-       case neon_cvt_mode_x: inst.instruction |= 1 << 7; break;
-       case neon_cvt_mode_a: inst.instruction |= 2 << 7; break;
-       case neon_cvt_mode_n: inst.instruction |= 0 << 7; break;
-       case neon_cvt_mode_p: inst.instruction |= 7 << 7; break;
-       case neon_cvt_mode_m: inst.instruction |= 5 << 7; break;
-       case neon_cvt_mode_r: inst.error = _("invalid rounding mode"); break;
-       default: abort ();
-       }
+  unsigned imm, Rd, Rn;
  
-      if (thumb_mode)
-       inst.instruction |= 0xfc000000;
-      else
-       inst.instruction |= 0xf0000000;
+  Rd = inst.operands[1].reg;
+
+  if (is_dual == 1) /* Dual form: the extra destination shifts Rn and imm up by one operand.  */
+    {
+      constraint (inst.operands[2].reg != Rd + 1,
+                 _("cx2d requires consecutive destination registers."));
+      imm = inst.operands[4].imm;
+      Rn = inst.operands[3].reg;
      }
-}
+  else if (is_dual == 0)
+  {
+    imm = inst.operands[3].imm;
+    Rn = inst.operands[2].reg;
+  }
+  else
+    abort (); /* is_dual must be exactly 0 or 1.  */
  
-static void
-do_vrintx (void)
-{
-  do_vrint_1 (neon_cvt_mode_x);
-}
+  check_cde_operand (2 + is_dual, /* is_dual = */0); /* Rn is always checked as a single register.  */
+  check_cde_operand (1, is_dual);
  
-static void
-do_vrintz (void)
-{
-  do_vrint_1 (neon_cvt_mode_z);
-}
+  inst.instruction |= Rd << 12;
+  inst.instruction |= Rn << 16;
  
-static void
-do_vrintr (void)
-{
-  do_vrint_1 (neon_cvt_mode_r);
+  inst.instruction |= (imm & 0x0380) << 13; /* imm[9:7] -> insn[22:20].  */
+  inst.instruction |= (imm & 0x0040) << 1; /* imm[6] -> insn[7].  */
+  inst.instruction |= (imm & 0x003f); /* imm[5:0] -> insn[5:0].  */
+
+  cde_handle_coproc ();
+  cxn_handle_predication (is_accum);
  }
  
  static void
-do_vrinta (void)
+do_custom_instruction_3 (int is_dual, bfd_boolean is_accum) /* Shared encoder behind do_cx3, do_cx3a, do_cx3d and do_cx3da.  */
  {
-  do_vrint_1 (neon_cvt_mode_a);
+
+  constraint (!mark_feature_used (&arm_ext_cde), _(BAD_CDE));
+
+  unsigned imm, Rd, Rn, Rm;
+
+  Rd = inst.operands[1].reg;
+
+  if (is_dual == 1) /* Dual form: the extra destination shifts Rn, Rm and imm up by one operand.  */
+    {
+      constraint (inst.operands[2].reg != Rd + 1,
+                 _("cx3d requires consecutive destination registers."));
+      imm = inst.operands[5].imm;
+      Rn = inst.operands[3].reg;
+      Rm = inst.operands[4].reg;
+    }
+  else if (is_dual == 0)
+  {
+    imm = inst.operands[4].imm;
+    Rn = inst.operands[2].reg;
+    Rm = inst.operands[3].reg;
+  }
+  else
+    abort (); /* is_dual must be exactly 0 or 1.  */
+
+  check_cde_operand (1, is_dual);
+  check_cde_operand (2 + is_dual, /* is_dual = */0); /* Rn: always a single register.  */
+  check_cde_operand (3 + is_dual, /* is_dual = */0); /* Rm: always a single register.  */
+
+  inst.instruction |= Rd; /* Unlike the 1- and 2-operand encoders, Rd goes in the low bits here.  */
+  inst.instruction |= Rn << 16;
+  inst.instruction |= Rm << 12;
+
+  inst.instruction |= (imm & 0x0038) << 17; /* imm[5:3] -> insn[22:20].  */
+  inst.instruction |= (imm & 0x0004) << 5; /* imm[2] -> insn[7].  */
+  inst.instruction |= (imm & 0x0003) << 4; /* imm[1:0] -> insn[5:4].  */
+
+  cde_handle_coproc ();
+  cxn_handle_predication (is_accum);
  }
  
  static void
-do_vrintn (void)
+do_cx1 (void)
  {
-  do_vrint_1 (neon_cvt_mode_n);
+  return do_custom_instruction_1 (0, 0);
  }
  
  static void
-do_vrintp (void)
+do_cx1a (void)
  {
-  do_vrint_1 (neon_cvt_mode_p);
+  return do_custom_instruction_1 (0, 1);
  }
  
  static void
-do_vrintm (void)
+do_cx1d (void)
  {
-  do_vrint_1 (neon_cvt_mode_m);
+  return do_custom_instruction_1 (1, 0);
  }
  
-static unsigned
-neon_scalar_for_vcmla (unsigned opnd, unsigned elsize)
+static void
+do_cx1da (void) /* Dual destination, accumulating form.  */
  {
-  unsigned regno = NEON_SCALAR_REG (opnd);
-  unsigned elno = NEON_SCALAR_INDEX (opnd);
-
-  if (elsize == 16 && elno < 2 && regno < 16)
-    return regno | (elno << 4);
-  else if (elsize == 32 && elno == 0)
-    return regno;
-
-  first_error (_("scalar out of range"));
-  return 0;
+  do_custom_instruction_1 (/* is_dual = */1, /* is_accum = */TRUE);
  }
  
  static void
-do_vcmla (void)
+do_cx2 (void)
  {
-  constraint (!ARM_CPU_HAS_FEATURE (cpu_variant, fpu_neon_ext_armv8),
-             _(BAD_FPU));
-  constraint (inst.relocs[0].exp.X_op != O_constant,
-             _("expression too complex"));
-  unsigned rot = inst.relocs[0].exp.X_add_number;
-  constraint (rot != 0 && rot != 90 && rot != 180 && rot != 270,
-             _("immediate out of range"));
-  rot /= 90;
-  if (inst.operands[2].isscalar)
-    {
-      enum neon_shape rs = neon_select_shape (NS_DDSI, NS_QQSI, NS_NULL);
-      unsigned size = neon_check_type (3, rs, N_EQK, N_EQK,
-                                      N_KEY | N_F16 | N_F32).size;
-      unsigned m = neon_scalar_for_vcmla (inst.operands[2].reg, size);
-      inst.is_neon = 1;
-      inst.instruction = 0xfe000800;
-      inst.instruction |= LOW4 (inst.operands[0].reg) << 12;
-      inst.instruction |= HI1 (inst.operands[0].reg) << 22;
-      inst.instruction |= LOW4 (inst.operands[1].reg) << 16;
-      inst.instruction |= HI1 (inst.operands[1].reg) << 7;
-      inst.instruction |= LOW4 (m);
-      inst.instruction |= HI1 (m) << 5;
-      inst.instruction |= neon_quad (rs) << 6;
-      inst.instruction |= rot << 20;
-      inst.instruction |= (size == 32) << 23;
-    }
-  else
-    {
-      enum neon_shape rs = neon_select_shape (NS_DDDI, NS_QQQI, NS_NULL);
-      unsigned size = neon_check_type (3, rs, N_EQK, N_EQK,
-                                      N_KEY | N_F16 | N_F32).size;
-      neon_three_same (neon_quad (rs), 0, -1);
-      inst.instruction &= 0x00ffffff; /* Undo neon_dp_fixup.  */
-      inst.instruction |= 0xfc200800;
-      inst.instruction |= rot << 23;
-      inst.instruction |= (size == 32) << 20;
-    }
+  return do_custom_instruction_2 (0, 0);
+}
+
+static void
+do_cx2a (void) /* Single destination, accumulating form.  */
+{
+  do_custom_instruction_2 (/* is_dual = */0, /* is_accum = */TRUE);
  }
  
  static void
-do_vcadd (void)
+do_cx2d (void)
  {
-  constraint (!ARM_CPU_HAS_FEATURE (cpu_variant, fpu_neon_ext_armv8),
-             _(BAD_FPU));
-  constraint (inst.relocs[0].exp.X_op != O_constant,
-             _("expression too complex"));
-  unsigned rot = inst.relocs[0].exp.X_add_number;
-  constraint (rot != 90 && rot != 270, _("immediate out of range"));
-  enum neon_shape rs = neon_select_shape (NS_DDDI, NS_QQQI, NS_NULL);
-  unsigned size = neon_check_type (3, rs, N_EQK, N_EQK,
-                                  N_KEY | N_F16 | N_F32).size;
-  neon_three_same (neon_quad (rs), 0, -1);
-  inst.instruction &= 0x00ffffff; /* Undo neon_dp_fixup.  */
-  inst.instruction |= 0xfc800800;
-  inst.instruction |= (rot == 270) << 24;
-  inst.instruction |= (size == 32) << 20;
+  return do_custom_instruction_2 (1, 0);
  }
  
-/* Dot Product instructions encoding support.  */
+static void
+do_cx2da (void) /* Dual destination, accumulating form.  */
+{
+  do_custom_instruction_2 (/* is_dual = */1, /* is_accum = */TRUE);
+}
  
  static void
-do_neon_dotproduct (int unsigned_p)
+do_cx3 (void)
  {
-  enum neon_shape rs;
-  unsigned scalar_oprd2 = 0;
-  int high8;
+  return do_custom_instruction_3 (0, 0);
+}
  
-  if (inst.cond != COND_ALWAYS)
-    as_warn (_("Dot Product instructions cannot be conditional,  the behaviour "
-              "is UNPREDICTABLE"));
+static void
+do_cx3a (void) /* Single destination, accumulating form.  */
+{
+  do_custom_instruction_3 (/* is_dual = */0, /* is_accum = */TRUE);
+}
  
-  constraint (!ARM_CPU_HAS_FEATURE (cpu_variant, fpu_neon_ext_armv8),
-             _(BAD_FPU));
+static void
+do_cx3d (void) /* Dual destination, non-accumulating form.  */
+{
+  do_custom_instruction_3 (/* is_dual = */1, /* is_accum = */FALSE);
+}
  
-  /* Dot Product instructions are in three-same D/Q register format or the third
-     operand can be a scalar index register.  */
-  if (inst.operands[2].isscalar)
-    {
-      scalar_oprd2 = neon_scalar_for_mul (inst.operands[2].reg, 32);
-      high8 = 0xfe000000;
-      rs = neon_select_shape (NS_DDS, NS_QQS, NS_NULL);
-    }
+static void
+do_cx3da (void) /* Dual destination, accumulating form.  */
+{
+  do_custom_instruction_3 (/* is_dual = */1, /* is_accum = */TRUE);
+}
+
+static void
+vcx_assign_vec_d (unsigned regnum) /* Place REGNUM in the destination-register field.  */
+{
+  inst.instruction |= HI4 (regnum) << 12; /* HI4 part at bit 12 (presumably the upper four bits of REGNUM).  */
+  inst.instruction |= LOW1 (regnum) << 22; /* LOW1 part at bit 22 (presumably the lowest bit of REGNUM).  */
+}
+
+static void
+vcx_assign_vec_m (unsigned regnum) /* Place REGNUM in the 'm' register field.  */
+{
+  inst.instruction |= HI4 (regnum); /* HI4 part in the lowest bits, unshifted.  */
+  inst.instruction |= LOW1 (regnum) << 5; /* LOW1 part at bit 5.  */
+}
+
+static void
+vcx_assign_vec_n (unsigned regnum) /* Place REGNUM in the 'n' register field.  */
+{
+  inst.instruction |= HI4 (regnum) << 16; /* HI4 part at bit 16.  */
+  inst.instruction |= LOW1 (regnum) << 7; /* LOW1 part at bit 7.  */
+}
+
+enum vcx_reg_type { /* Register class used by a vcx instruction's operands.  */
+    q_reg, /* 'q' registers.  */
+    d_reg, /* 'd' registers.  */
+    s_reg /* 's' registers.  */
+};
+
+static enum vcx_reg_type
+vcx_get_reg_type (enum neon_shape ns) /* Map a vcx operand shape onto its register class.  */
+{
+  gas_assert (ns == NS_PQI /* Only the nine vcx shapes are expected here.  */
+             || ns == NS_PDI
+             || ns == NS_PFI
+             || ns == NS_PQQI
+             || ns == NS_PDDI
+             || ns == NS_PFFI
+             || ns == NS_PQQQI
+             || ns == NS_PDDDI
+             || ns == NS_PFFFI);
+  if (ns == NS_PQI || ns == NS_PQQI || ns == NS_PQQQI)
+    return q_reg;
+  if (ns == NS_PDI || ns == NS_PDDI || ns == NS_PDDDI)
+    return d_reg;
+  return s_reg; /* The remaining shapes are the three NS_PF* ones.  */
+}
+
+#define vcx_size_pos 24 /* Bit set for the D-register form.  */
+#define vcx_vec_pos 6 /* Bit set for the Q-register form.  */
+static unsigned
+vcx_handle_shape (enum vcx_reg_type reg_type) /* Set the shape bit; return the register-number multiplier.  */
+{
+  unsigned mult = 2; /* Used for both Q and D; see the note below.  */
+  if (reg_type == q_reg)
+    inst.instruction |= 1 << vcx_vec_pos;
+  else if (reg_type == d_reg)
+    inst.instruction |= 1 << vcx_size_pos;
    else
+    mult = 1; /* S registers: no shape bit is set and the number is used as is.  */
+  /* NOTE:
+     The documentation says that the Q registers are encoded as 2*N in the D:Vd
+     bits (or equivalent for N and M registers).
+     Similarly the D registers are encoded as N in D:Vd bits.
+     While the S registers are encoded as N in the Vd:D bits.
+
+     Taking into account the maximum values of these registers we can see a
+     nicer pattern for calculation:
+       Q -> 7, D -> 15, S -> 31
+
+     If we say that everything is encoded in the Vd:D bits, then we can say
+     that Q is encoded as 4*N, and D is encoded as 2*N.
+     This way the bits will end up the same, and calculation is simpler.
+     (calculation is now:
+       1. Multiply by a number determined by the register letter.
+       2. Encode resulting number in Vd:D bits.)
+
+     This is made a little more complicated by automatic handling of 'Q'
+     registers elsewhere, which means the register number is already 2*N where
+     N is the number the user wrote after the register letter.
+     */
+  return mult;
+}
+#undef vcx_vec_pos
+#undef vcx_size_pos
+
+static void
+vcx_ensure_register_in_range (unsigned R, enum vcx_reg_type reg_type) /* Reject register numbers too large for the class.  */
+{
+  if (reg_type == q_reg)
      {
-      high8 = 0xfc000000;
-      rs = neon_select_shape (NS_DDD, NS_QQQ, NS_NULL);
+      gas_assert (R % 2 == 0); /* Q numbers are expected to be even here.  */
+      constraint (R >= 16, _("'q' register must be in range 0-7")); /* The limit of 16 corresponds to q7 in the message.  */
      }
-
-  if (unsigned_p)
-    neon_check_type (3, rs, N_EQK, N_EQK, N_KEY | N_U8);
+  else if (reg_type == d_reg)
+    constraint (R >= 16, _("'d' register must be in range 0-15"));
    else
-    neon_check_type (3, rs, N_EQK, N_EQK, N_KEY | N_S8);
+    constraint (R >= 32, _("'s' register must be in range 0-31"));
+}
  
-  /* The "U" bit in traditional Three Same encoding is fixed to 0 for Dot
-     Product instruction, so we pass 0 as the "ubit" parameter.  And the
-     "Size" field are fixed to 0x2, so we pass 32 as the "size" parameter.  */
-  neon_three_same (neon_quad (rs), 0, 32);
+static void (*vcx_assign_vec[3]) (unsigned) = { /* Field writers, indexed 0 = d, 1 = m, 2 = n.  */
+    vcx_assign_vec_d,
+    vcx_assign_vec_m,
+    vcx_assign_vec_n
+};
  
-  /* Undo neon_dp_fixup.  Dot Product instructions are using a slightly
-     different NEON three-same encoding.  */
-  inst.instruction &= 0x00ffffff;
-  inst.instruction |= high8;
-  /* Encode 'U' bit which indicates signedness.  */
-  inst.instruction |= (unsigned_p ? 1 : 0) << 4;
-  /* Re-encode operand2 if it's indexed scalar operand.  What has been encoded
-     from inst.operand[2].reg in neon_three_same is GAS's internal encoding, not
-     the instruction encoding.  */
-  if (inst.operands[2].isscalar)
+static void
+vcx_handle_register_arguments (unsigned num_registers, /* Check and encode operands 1..NUM_REGISTERS.  */
+                              enum vcx_reg_type reg_type)
+{
+  unsigned R, i;
+  unsigned reg_mult = vcx_handle_shape (reg_type);
+  for (i = 0; i < num_registers; i++)
      {
-      inst.instruction &= 0xffffffd0;
-      inst.instruction |= LOW4 (scalar_oprd2);
-      inst.instruction |= HI1 (scalar_oprd2) << 5;
+      R = inst.operands[i+1].reg; /* Operand 0 is skipped; registers start at operand 1.  */
+      vcx_ensure_register_in_range (R, reg_type);
+      if (num_registers == 3 && i > 0) /* With three registers the field order is d, n, m.  */
+       {
+         if (i == 2)
+           vcx_assign_vec[1] (R * reg_mult); /* Third register -> m field.  */
+         else
+           vcx_assign_vec[2] (R * reg_mult); /* Second register -> n field.  */
+         continue;
+       }
+      vcx_assign_vec[i](R * reg_mult); /* Otherwise operand order matches table order: d, then m.  */
      }
  }
  
-/* Dot Product instructions for signed integer.  */
+static void
+vcx_handle_insn_block (enum vcx_reg_type reg_type) /* Set inst.pred_insn_type, or inst.error, from inst.cond.  */
+{
+  if (reg_type == q_reg)
+    if (inst.cond > COND_ALWAYS)
+      inst.pred_insn_type = INSIDE_VPT_INSN;
+    else /* Binds to the inner if (the inst.cond test).  */
+      inst.pred_insn_type = MVE_OUTSIDE_PRED_INSN;
+  else if (inst.cond == COND_ALWAYS) /* Binds to the outer if: S and D register forms.  */
+    inst.pred_insn_type = OUTSIDE_PRED_INSN;
+  else /* S or D form with any other inst.cond value is rejected.  */
+    inst.error = BAD_NOT_IT;
+}
  
  static void
-do_neon_dotproduct_s (void)
+vcx_handle_common_checks (unsigned num_args, enum neon_shape rs) /* Checks and encoding shared by do_vcx1/2/3.  */
  {
-  return do_neon_dotproduct (0);
+  constraint (!mark_feature_used (&arm_ext_cde), _(BAD_CDE));
+  cde_handle_coproc ();
+  enum vcx_reg_type reg_type = vcx_get_reg_type (rs);
+  vcx_handle_register_arguments (num_args, reg_type);
+  vcx_handle_insn_block (reg_type);
+  if (reg_type == q_reg)
+    constraint (!mark_feature_used (&mve_ext),
+               _("vcx instructions with Q registers require MVE"));
+  else /* S or D: the armv8m_fp subset test or, failing that, MVE must succeed.  */
+    constraint (!(ARM_FSET_CPU_SUBSET (armv8m_fp, cpu_variant)
+                 && mark_feature_used (&armv8m_fp))
+               && !mark_feature_used (&mve_ext),
+               _("vcx instructions with S or D registers require either MVE"
+                 " or Armv8-M floating point extension."));
  }
  
-/* Dot Product instructions for unsigned integer.  */
+static void
+do_vcx1 (void) /* Encoder: one vector register plus an immediate.  */
+{
+  enum neon_shape rs = neon_select_shape (NS_PQI, NS_PDI, NS_PFI, NS_NULL);
+  vcx_handle_common_checks (1, rs);
+
+  unsigned imm = inst.operands[2].imm;
+  inst.instruction |= (imm & 0x03f); /* imm[5:0] -> insn[5:0].  */
+  inst.instruction |= (imm & 0x040) << 1; /* imm[6] -> insn[7].  */
+  inst.instruction |= (imm & 0x780) << 9; /* imm[10:7] -> insn[19:16].  */
+  if (rs != NS_PQI) /* imm[11] is rejected for the S and D shapes.  */
+    constraint (imm >= 2048,
+               _("vcx1 with S or D registers takes immediate within 0-2047"));
+  inst.instruction |= (imm & 0x800) << 13; /* imm[11] -> insn[24].  */
+}
  
  static void
-do_neon_dotproduct_u (void)
+do_vcx2 (void) /* Encoder: two vector registers plus an immediate.  */
  {
-  return do_neon_dotproduct (1);
+  enum neon_shape rs = neon_select_shape (NS_PQQI, NS_PDDI, NS_PFFI, NS_NULL);
+  vcx_handle_common_checks (2, rs);
+
+  unsigned imm = inst.operands[3].imm;
+  inst.instruction |= (imm & 0x01) << 4; /* imm[0] -> insn[4].  */
+  inst.instruction |= (imm & 0x02) << 6; /* imm[1] -> insn[7].  */
+  inst.instruction |= (imm & 0x3c) << 14; /* imm[5:2] -> insn[19:16].  */
+  if (rs != NS_PQQI) /* imm[6] is rejected for the S and D shapes.  */
+    constraint (imm >= 64,
+               _("vcx2 with S or D registers takes immediate within 0-63"));
+  inst.instruction |= (imm & 0x40) << 18; /* imm[6] -> insn[24].  */
+}
+
+static void
+do_vcx3 (void) /* Encoder: three vector registers plus an immediate.  */
+{
+  enum neon_shape rs = neon_select_shape (NS_PQQQI, NS_PDDDI, NS_PFFFI, NS_NULL);
+  vcx_handle_common_checks (3, rs);
+
+  unsigned imm = inst.operands[4].imm;
+  inst.instruction |= (imm & 0x1) << 4; /* imm[0] -> insn[4].  */
+  inst.instruction |= (imm & 0x6) << 19; /* imm[2:1] -> insn[21:20].  */
+  if (rs != NS_PQQQI) /* imm[3] is rejected for the S and D shapes.  */
+    constraint (imm >= 8,
+               _("vcx3 with S or D registers takes immediate within 0-7"));
+  inst.instruction |= (imm & 0x8) << 21; /* imm[3] -> insn[24].  */
  }
  
  /* Crypto v1 instructions.  */
@@ -19186,6 +22265,46 @@ do_vjcvt (void)
    do_vfp_cond_or_thumb ();
  }
  
+static void
+do_vdot (void) /* Encoder: BF16 key type, scalar-indexed or three-register form.  */
+{
+  enum neon_shape rs;
+  constraint (!mark_feature_used (&fpu_neon_ext_armv8), _(BAD_FPU));
+  set_pred_insn_type (OUTSIDE_PRED_INSN);
+  if (inst.operands[2].isscalar)
+    {
+      rs = neon_select_shape (NS_DDS, NS_QQS, NS_NULL);
+      neon_check_type (3, rs, N_EQK, N_EQK, N_BF16 | N_KEY);
+
+      inst.instruction |= (1 << 25); /* Bit 25 is set only in the scalar form.  */
+      int index = inst.operands[2].reg & 0xf; /* Low nibble: element index.  */
+      constraint ((index != 1 && index != 0), _("index must be 0 or 1"));
+      inst.operands[2].reg >>= 4; /* What remains is the register number.  */
+      constraint (!(inst.operands[2].reg < 16),
+                 _("indexed register must be less than 16"));
+      neon_three_args (rs == NS_QQS); /* Argument is nonzero for the Q shape.  */
+      inst.instruction |= (index << 5); /* Element index lands in bit 5.  */
+    }
+  else
+    {
+      rs = neon_select_shape (NS_DDD, NS_QQQ, NS_NULL);
+      neon_check_type (3, rs, N_EQK, N_EQK, N_BF16 | N_KEY);
+      neon_three_args (rs == NS_QQQ); /* Argument is nonzero for the Q shape.  */
+    }
+}
+
+static void
+do_vmmla (void) /* Encoder: only the Q,Q,Q shape with a BF16 key type.  */
+{
+  enum neon_shape rs = neon_select_shape (NS_QQQ, NS_NULL);
+  neon_check_type (3, rs, N_EQK, N_EQK, N_BF16 | N_KEY);
+
+  constraint (!mark_feature_used (&fpu_neon_ext_armv8), _(BAD_FPU));
+  set_pred_insn_type (OUTSIDE_PRED_INSN);
+
+  neon_three_args (1); /* Always 1: NS_QQQ is the only shape accepted.  */
+}
+
  \f
  /* Overall per-instruction processing. */
  
@@ -19500,7 +22619,7 @@ opcode_lookup (char **str)
      *str = end;
  
    /* Look for unaffixed or special-case affixed mnemonic.  */
-  opcode = (const struct asm_opcode *) hash_find_n (arm_ops_hsh, base,
+  opcode = (const struct asm_opcode *) str_hash_find_n (arm_ops_hsh, base,
                                                     end - base);
    if (opcode)
      {
@@ -19514,7 +22633,7 @@ opcode_lookup (char **str)
        if (warn_on_deprecated && unified_syntax)
         as_tsktsk (_("conditional infixes are deprecated in unified syntax"));
        affix = base + (opcode->tag - OT_odd_infix_0);
-      cond = (const struct asm_cond *) hash_find_n (arm_cond_hsh, affix, 2);
+      cond = (const struct asm_cond *) str_hash_find_n (arm_cond_hsh, affix, 2);
        gas_assert (cond);
  
        inst.cond = cond->value;
@@ -19527,8 +22646,8 @@ opcode_lookup (char **str)
      if (end - base < 2)
        return NULL;
       affix = end - 1;
-     cond = (const struct asm_cond *) hash_find_n (arm_vcond_hsh, affix, 1);
-     opcode = (const struct asm_opcode *) hash_find_n (arm_ops_hsh, base,
+     cond = (const struct asm_cond *) str_hash_find_n (arm_vcond_hsh, affix, 1);
+     opcode = (const struct asm_opcode *) str_hash_find_n (arm_ops_hsh, base,
                                                       affix - base);
       /* If this opcode can not be vector predicated then don't accept it with a
         vector predication code.  */
@@ -19544,8 +22663,8 @@ opcode_lookup (char **str)
  
        /* Look for suffixed mnemonic.  */
        affix = end - 2;
-      cond = (const struct asm_cond *) hash_find_n (arm_cond_hsh, affix, 2);
-      opcode = (const struct asm_opcode *) hash_find_n (arm_ops_hsh, base,
+      cond = (const struct asm_cond *) str_hash_find_n (arm_cond_hsh, affix, 2);
+      opcode = (const struct asm_opcode *) str_hash_find_n (arm_ops_hsh, base,
                                                         affix - base);
      }
  
@@ -19595,13 +22714,13 @@ opcode_lookup (char **str)
  
    /* Look for infixed mnemonic in the usual position.  */
    affix = base + 3;
-  cond = (const struct asm_cond *) hash_find_n (arm_cond_hsh, affix, 2);
+  cond = (const struct asm_cond *) str_hash_find_n (arm_cond_hsh, affix, 2);
    if (!cond)
      return NULL;
  
    memcpy (save, affix, 2);
    memmove (affix, affix + 2, (end - affix) - 2);
-  opcode = (const struct asm_opcode *) hash_find_n (arm_ops_hsh, base,
+  opcode = (const struct asm_opcode *) str_hash_find_n (arm_ops_hsh, base,
                                                     (end - base) - 2);
    memmove (affix + 2, affix, (end - affix) - 2);
    memcpy (affix, save, 2);
@@ -19922,6 +23041,7 @@ handle_pred_state (void)
             close_automatic_it_block ();
           break;
  
+         /* Fallthrough.  */
         case NEUTRAL_IT_INSN:
           now_pred.block_length++;
           now_pred.insn_cond = TRUE;
@@ -20174,9 +23294,11 @@ it_fsm_post_encode (void)
      handle_pred_state ();
  
    if (now_pred.insn_cond
+      && warn_on_restrict_it
        && !now_pred.warn_deprecated
        && warn_on_deprecated
-      && ARM_CPU_HAS_FEATURE (cpu_variant, arm_ext_v8)
+      && (ARM_CPU_HAS_FEATURE (cpu_variant, arm_ext_v8)
+          || ARM_CPU_HAS_FEATURE (cpu_variant, arm_ext_v8r))
        && !ARM_CPU_HAS_FEATURE (cpu_variant, arm_ext_m))
      {
        if (inst.instruction >= 0x10000)
@@ -20574,7 +23696,7 @@ arm_frob_label (symbolS * sym)
       out of the jump table, and chaos would ensue.  */
    if (label_is_thumb_function_name
        && (S_GET_NAME (sym)[0] != '.' || S_GET_NAME (sym)[1] != 'L')
-      && (bfd_get_section_flags (stdoutput, now_seg) & SEC_CODE) != 0)
+      && (bfd_section_flags (now_seg) & SEC_CODE) != 0)
      {
        /* When the address of a Thumb function is taken the bottom
          bit of that address should be set.  This will allow
@@ -20667,6 +23789,10 @@ static const struct reg_entry reg_names[] =
    REGDEF(WR, 7,RN), REGDEF(SB, 9,RN), REGDEF(SL,10,RN), REGDEF(FP,11,RN),
    REGDEF(IP,12,RN), REGDEF(SP,13,RN), REGDEF(LR,14,RN), REGDEF(PC,15,RN),
  
+  /* Defining the new Zero register from ARMv8.1-M.  */
+  REGDEF(zr,15,ZR),
+  REGDEF(ZR,15,ZR),
+
    /* Coprocessor numbers.  */
    REGSET(p, CP), REGSET(P, CP),
  
@@ -20730,6 +23856,10 @@ static const struct reg_entry reg_names[] =
    REGDEF(mvfr0,7,VFC), REGDEF(mvfr1,6,VFC),
    REGDEF(MVFR0,7,VFC), REGDEF(MVFR1,6,VFC),
    REGDEF(mvfr2,5,VFC), REGDEF(MVFR2,5,VFC),
+  REGDEF(fpscr_nzcvqc,2,VFC), REGDEF(FPSCR_nzcvqc,2,VFC),
+  REGDEF(vpr,12,VFC), REGDEF(VPR,12,VFC),
+  REGDEF(fpcxt_ns,14,VFC), REGDEF(FPCXT_NS,14,VFC),
+  REGDEF(fpcxt_s,15,VFC), REGDEF(FPCXT_S,15,VFC),
  
    /* Maverick DSP coprocessor registers.  */
    REGSET(mvf,MVF),  REGSET(mvd,MVD),  REGSET(mvfx,MVFX),  REGSET(mvdx,MVDX),
@@ -21085,6 +24215,10 @@ static struct asm_barrier_opt barrier_opt_names[] =
  #define cCE(mnem,  op, nops, ops, ae)  \
    { mnem, OPS##nops ops, OT_csuffix, 0x##op, 0xe##op, ARM_VARIANT, ARM_VARIANT, do_##ae, do_##ae, 0 }
  
+/* mov instructions that are shared between coprocessor and MVE.  */
+#define mcCE(mnem,  op, nops, ops, ae) \
+  { #mnem, OPS##nops ops, OT_csuffix, 0x##op, 0xe##op, ARM_VARIANT, THUMB_VARIANT, do_##ae, do_##ae, 0 }
+
  /* Legacy coprocessor instructions where conditional infix and conditional
     suffix are ambiguous.  For consistency this includes all FPA instructions,
     not just the potentially ambiguous ones.  */
@@ -21817,6 +24951,13 @@ static const struct asm_opcode insns[] =
                                                         ldrexd, t_ldrexd),
   TCE("stlexd", 1a00e90, e8c000f0, 4, (RRnpc, RRnpc, oRRnpc, RRnpcb),
                                                         strexd, t_strexd),
+#undef THUMB_VARIANT
+#define THUMB_VARIANT & arm_ext_v8r
+#undef ARM_VARIANT
+#define ARM_VARIANT & arm_ext_v8r
+
+/* ARMv8-R instructions.  */
+ TUF("dfb",    57ff04c, f3bf8f4c, 0, (), noargs, noargs),
  
  /* Defined in V8 but is in undefined encoding space for earlier
     architectures.  However earlier architectures are required to treat
@@ -21846,19 +24987,13 @@ static const struct asm_opcode insns[] =
    nUF(vselvs, _vselvs, 3, (RVSD, RVSD, RVSD),          vsel),
    nUF(vselge, _vselge, 3, (RVSD, RVSD, RVSD),          vsel),
    nUF(vselgt, _vselgt, 3, (RVSD, RVSD, RVSD),          vsel),
-  nUF(vmaxnm, _vmaxnm, 3, (RNSDQ, oRNSDQ, RNSDQ),      vmaxnm),
-  nUF(vminnm, _vminnm, 3, (RNSDQ, oRNSDQ, RNSDQ),      vmaxnm),
-  nUF(vcvta,  _vcvta,  2, (RNSDQ, oRNSDQ),             neon_cvta),
-  nUF(vcvtn,  _vcvta,  2, (RNSDQ, oRNSDQ),             neon_cvtn),
-  nUF(vcvtp,  _vcvta,  2, (RNSDQ, oRNSDQ),             neon_cvtp),
-  nUF(vcvtm,  _vcvta,  2, (RNSDQ, oRNSDQ),             neon_cvtm),
    nCE(vrintr, _vrintr, 2, (RNSDQ, oRNSDQ),             vrintr),
-  nCE(vrintz, _vrintr, 2, (RNSDQ, oRNSDQ),             vrintz),
-  nCE(vrintx, _vrintr, 2, (RNSDQ, oRNSDQ),             vrintx),
-  nUF(vrinta, _vrinta, 2, (RNSDQ, oRNSDQ),             vrinta),
-  nUF(vrintn, _vrinta, 2, (RNSDQ, oRNSDQ),             vrintn),
-  nUF(vrintp, _vrinta, 2, (RNSDQ, oRNSDQ),             vrintp),
-  nUF(vrintm, _vrinta, 2, (RNSDQ, oRNSDQ),             vrintm),
+  mnCE(vrintz, _vrintr, 2, (RNSDQMQ, oRNSDQMQ),                vrintz),
+  mnCE(vrintx, _vrintr, 2, (RNSDQMQ, oRNSDQMQ),                vrintx),
+  mnUF(vrinta, _vrinta, 2, (RNSDQMQ, oRNSDQMQ),                vrinta),
+  mnUF(vrintn, _vrinta, 2, (RNSDQMQ, oRNSDQMQ),                vrintn),
+  mnUF(vrintp, _vrinta, 2, (RNSDQMQ, oRNSDQMQ),                vrintp),
+  mnUF(vrintm, _vrinta, 2, (RNSDQMQ, oRNSDQMQ),                vrintm),
  
    /* Crypto v1 extensions.  */
  #undef  ARM_VARIANT
@@ -21882,9 +25017,9 @@ static const struct asm_opcode insns[] =
    nUF(sha256su0, _sha2op, 2, (RNQ, RNQ), sha256su0),
  
  #undef  ARM_VARIANT
-#define ARM_VARIANT   & crc_ext_armv8
+#define ARM_VARIANT   & arm_ext_crc
  #undef  THUMB_VARIANT
-#define THUMB_VARIANT & crc_ext_armv8
+#define THUMB_VARIANT & arm_ext_crc
    TUEc("crc32b", 1000040, fac0f080, 3, (RR, oRR, RR), crc32b),
    TUEc("crc32h", 1200040, fac0f090, 3, (RR, oRR, RR), crc32h),
    TUEc("crc32w", 1400040, fac0f0a0, 3, (RR, oRR, RR), crc32w),
@@ -21904,8 +25039,6 @@ static const struct asm_opcode insns[] =
  #undef  THUMB_VARIANT
  #define THUMB_VARIANT & arm_ext_v8_3
   NCE (vjcvt, eb90bc0, 2, (RVS, RVD), vjcvt),
- NUF (vcmla, 0, 4, (RNDQ, RNDQ, RNDQ_RNSC, EXPi), vcmla),
- NUF (vcadd, 0, 4, (RNDQ, RNDQ, RNDQ, EXPi), vcadd),
  
  #undef  ARM_VARIANT
  #define ARM_VARIANT   & fpu_neon_ext_dotprod
@@ -22361,14 +25494,24 @@ static const struct asm_opcode insns[] =
  
  #undef  ARM_VARIANT
  #define ARM_VARIANT  & fpu_vfp_ext_v1xd  /* VFP V1xD (single precision).  */
+#undef THUMB_VARIANT
+#define THUMB_VARIANT  & arm_ext_v6t2
+ mcCE(vmrs,    ef00a10, 2, (APSR_RR, RVC),   vmrs),
+ mcCE(vmsr,    ee00a10, 2, (RVC, RR),        vmsr),
+ mcCE(fldd,    d100b00, 2, (RVD, ADDRGLDC),  vfp_dp_ldst),
+ mcCE(fstd,    d000b00, 2, (RVD, ADDRGLDC),  vfp_dp_ldst),
+ mcCE(flds,    d100a00, 2, (RVS, ADDRGLDC),  vfp_sp_ldst),
+ mcCE(fsts,    d000a00, 2, (RVS, ADDRGLDC),  vfp_sp_ldst),
+
+  /* Memory operations.         */
+ mcCE(fldmias, c900a00, 2, (RRnpctw, VRSLST),    vfp_sp_ldstmia),
+ mcCE(fldmdbs, d300a00, 2, (RRnpctw, VRSLST),    vfp_sp_ldstmdb),
+ mcCE(fstmias, c800a00, 2, (RRnpctw, VRSLST),    vfp_sp_ldstmia),
+ mcCE(fstmdbs, d200a00, 2, (RRnpctw, VRSLST),    vfp_sp_ldstmdb),
+#undef THUMB_VARIANT
  
    /* Moves and type conversions.  */
- cCE("fcpys",  eb00a40, 2, (RVS, RVS),       vfp_sp_monadic),
- cCE("fmrs",   e100a10, 2, (RR, RVS),        vfp_reg_from_sp),
- cCE("fmsr",   e000a10, 2, (RVS, RR),        vfp_sp_from_reg),
   cCE("fmstat", ef1fa10, 0, (),               noargs),
- cCE("vmrs",   ef00a10, 2, (APSR_RR, RVC),   vmrs),
- cCE("vmsr",   ee00a10, 2, (RVC, RR),        vmsr),
   cCE("fsitos", eb80ac0, 2, (RVS, RVS),       vfp_sp_monadic),
   cCE("fuitos", eb80a40, 2, (RVS, RVS),       vfp_sp_monadic),
   cCE("ftosis", ebd0a40, 2, (RVS, RVS),       vfp_sp_monadic),
@@ -22379,19 +25522,13 @@ static const struct asm_opcode insns[] =
   cCE("fmxr",   ee00a10, 2, (RVC, RR),        rn_rd),
  
    /* Memory operations.         */
- cCE("flds",   d100a00, 2, (RVS, ADDRGLDC),  vfp_sp_ldst),
- cCE("fsts",   d000a00, 2, (RVS, ADDRGLDC),  vfp_sp_ldst),
- cCE("fldmias",        c900a00, 2, (RRnpctw, VRSLST),    vfp_sp_ldstmia),
   cCE("fldmfds",        c900a00, 2, (RRnpctw, VRSLST),    vfp_sp_ldstmia),
- cCE("fldmdbs",        d300a00, 2, (RRnpctw, VRSLST),    vfp_sp_ldstmdb),
   cCE("fldmeas",        d300a00, 2, (RRnpctw, VRSLST),    vfp_sp_ldstmdb),
   cCE("fldmiax",        c900b00, 2, (RRnpctw, VRDLST),    vfp_xp_ldstmia),
   cCE("fldmfdx",        c900b00, 2, (RRnpctw, VRDLST),    vfp_xp_ldstmia),
   cCE("fldmdbx",        d300b00, 2, (RRnpctw, VRDLST),    vfp_xp_ldstmdb),
   cCE("fldmeax",        d300b00, 2, (RRnpctw, VRDLST),    vfp_xp_ldstmdb),
- cCE("fstmias",        c800a00, 2, (RRnpctw, VRSLST),    vfp_sp_ldstmia),
   cCE("fstmeas",        c800a00, 2, (RRnpctw, VRSLST),    vfp_sp_ldstmia),
- cCE("fstmdbs",        d200a00, 2, (RRnpctw, VRSLST),    vfp_sp_ldstmdb),
   cCE("fstmfds",        d200a00, 2, (RRnpctw, VRSLST),    vfp_sp_ldstmdb),
   cCE("fstmiax",        c800b00, 2, (RRnpctw, VRDLST),    vfp_xp_ldstmia),
   cCE("fstmeax",        c800b00, 2, (RRnpctw, VRDLST),    vfp_xp_ldstmia),
@@ -22422,8 +25559,6 @@ static const struct asm_opcode insns[] =
  
   /* Double precision load/store are still present on single precision
      implementations.  */
- cCE("fldd",   d100b00, 2, (RVD, ADDRGLDC),  vfp_dp_ldst),
- cCE("fstd",   d000b00, 2, (RVD, ADDRGLDC),  vfp_dp_ldst),
   cCE("fldmiad",        c900b00, 2, (RRnpctw, VRDLST),    vfp_dp_ldstmia),
   cCE("fldmfdd",        c900b00, 2, (RRnpctw, VRDLST),    vfp_dp_ldstmia),
   cCE("fldmdbd",        d300b00, 2, (RRnpctw, VRDLST),    vfp_dp_ldstmdb),
@@ -22437,7 +25572,6 @@ static const struct asm_opcode insns[] =
  #define ARM_VARIANT  & fpu_vfp_ext_v1 /* VFP V1 (Double precision).  */
  
    /* Moves and type conversions.  */
- cCE("fcpyd",  eb00b40, 2, (RVD, RVD),       vfp_dp_rd_rm),
   cCE("fcvtds", eb70ac0, 2, (RVD, RVS),       vfp_dp_sp_cvt),
   cCE("fcvtsd", eb70bc0, 2, (RVS, RVD),       vfp_sp_dp_cvt),
   cCE("fmdhr",  e200b10, 2, (RVD, RR),        vfp_dp_rn_rd),
@@ -22473,18 +25607,23 @@ static const struct asm_opcode insns[] =
   cCE("fcmped", eb40bc0, 2, (RVD, RVD),       vfp_dp_rd_rm),
   cCE("fcmpezd",        eb50bc0, 1, (RVD),            vfp_dp_rd),
  
-#undef  ARM_VARIANT
-#define ARM_VARIANT  & fpu_vfp_ext_v2
-
- cCE("fmsrr",  c400a10, 3, (VRSLST, RR, RR), vfp_sp2_from_reg2),
- cCE("fmrrs",  c500a10, 3, (RR, RR, VRSLST), vfp_reg2_from_sp2),
- cCE("fmdrr",  c400b10, 3, (RVD, RR, RR),    vfp_dp_rm_rd_rn),
- cCE("fmrrd",  c500b10, 3, (RR, RR, RVD),    vfp_dp_rd_rn_rm),
-
  /* Instructions which may belong to either the Neon or VFP instruction sets.
     Individual encoder functions perform additional architecture checks.  */
  #undef  ARM_VARIANT
  #define ARM_VARIANT    & fpu_vfp_ext_v1xd
+#undef  THUMB_VARIANT
+#define THUMB_VARIANT  & arm_ext_v6t2
+
+ NCE(vldm,      c900b00, 2, (RRnpctw, VRSDLST), neon_ldm_stm),
+ NCE(vldmia,    c900b00, 2, (RRnpctw, VRSDLST), neon_ldm_stm),
+ NCE(vldmdb,    d100b00, 2, (RRnpctw, VRSDLST), neon_ldm_stm),
+ NCE(vstm,      c800b00, 2, (RRnpctw, VRSDLST), neon_ldm_stm),
+ NCE(vstmia,    c800b00, 2, (RRnpctw, VRSDLST), neon_ldm_stm),
+ NCE(vstmdb,    d000b00, 2, (RRnpctw, VRSDLST), neon_ldm_stm),
+
+ NCE(vpop,      0,       1, (VRSDLST),          vfp_nsyn_pop),
+ NCE(vpush,     0,       1, (VRSDLST),          vfp_nsyn_push),
+
  #undef  THUMB_VARIANT
  #define THUMB_VARIANT  & fpu_vfp_ext_v1xd
  
@@ -22494,32 +25633,18 @@ static const struct asm_opcode insns[] =
   nCE(vnmul,     _vnmul,   3, (RVSD, RVSD, RVSD), vfp_nsyn_nmul),
   nCE(vnmla,     _vnmla,   3, (RVSD, RVSD, RVSD), vfp_nsyn_nmul),
   nCE(vnmls,     _vnmls,   3, (RVSD, RVSD, RVSD), vfp_nsyn_nmul),
- nCE(vcmp,      _vcmp,    2, (RVSD, RSVD_FI0),    vfp_nsyn_cmp),
- nCE(vcmpe,     _vcmpe,   2, (RVSD, RSVD_FI0),    vfp_nsyn_cmp),
- NCE(vpush,     0,       1, (VRSDLST),          vfp_nsyn_push),
- NCE(vpop,      0,       1, (VRSDLST),          vfp_nsyn_pop),
   NCE(vcvtz,     0,       2, (RVSD, RVSD),       vfp_nsyn_cvtz),
  
    /* Mnemonics shared by Neon and VFP.  */
- nCEF(vmul,     _vmul,    3, (RNSDQ, oRNSDQ, RNSDQ_RNSC), neon_mul),
- nCEF(vmla,     _vmla,    3, (RNSDQ, oRNSDQ, RNSDQ_RNSC), neon_mac_maybe_scalar),
   nCEF(vmls,     _vmls,    3, (RNSDQ, oRNSDQ, RNSDQ_RNSC), neon_mac_maybe_scalar),
  
- NCE(vldm,      c900b00, 2, (RRnpctw, VRSDLST), neon_ldm_stm),
- NCE(vldmia,    c900b00, 2, (RRnpctw, VRSDLST), neon_ldm_stm),
- NCE(vldmdb,    d100b00, 2, (RRnpctw, VRSDLST), neon_ldm_stm),
- NCE(vstm,      c800b00, 2, (RRnpctw, VRSDLST), neon_ldm_stm),
- NCE(vstmia,    c800b00, 2, (RRnpctw, VRSDLST), neon_ldm_stm),
- NCE(vstmdb,    d000b00, 2, (RRnpctw, VRSDLST), neon_ldm_stm),
-
- nCEF(vcvt,     _vcvt,   3, (RNSDQ, RNSDQ, oI32z), neon_cvt),
+ mnCEF(vcvt,     _vcvt,   3, (RNSDQMQ, RNSDQMQ, oI32z), neon_cvt),
   nCEF(vcvtr,    _vcvt,   2, (RNSDQ, RNSDQ), neon_cvtr),
- NCEF(vcvtb,   eb20a40, 2, (RVSD, RVSD), neon_cvtb),
- NCEF(vcvtt,   eb20a40, 2, (RVSD, RVSD), neon_cvtt),
+ MNCEF(vcvtb,  eb20a40, 3, (RVSDMQ, RVSDMQ, oI32b), neon_cvtb),
+ MNCEF(vcvtt,  eb20a40, 3, (RVSDMQ, RVSDMQ, oI32b), neon_cvtt),
  
  
    /* NOTE: All VMOV encoding is special-cased!  */
- NCE(vmov,      0,       1, (VMOV), neon_mov),
   NCE(vmovq,     0,       1, (VMOV), neon_mov),
  
  #undef  THUMB_VARIANT
@@ -22541,8 +25666,8 @@ static const struct asm_opcode insns[] =
   NCE (vins,      eb00ac0,       2, (RVS, RVS), neon_movhf),
  
   /* New backported fma/fms instructions optional in v8.2.  */
- NCE (vfmal, 810, 3, (RNDQ, RNSD, RNSD_RNSC), neon_vfmal),
- NCE (vfmsl, 810, 3, (RNDQ, RNSD, RNSD_RNSC), neon_vfmsl),
+ NUF (vfmsl, 810, 3, (RNDQ, RNSD, RNSD_RNSC), neon_vfmsl),
+ NUF (vfmal, 810, 3, (RNDQ, RNSD, RNSD_RNSC), neon_vfmal),
  
  #undef  THUMB_VARIANT
  #define THUMB_VARIANT  & fpu_neon_ext_v1
@@ -22553,38 +25678,24 @@ static const struct asm_opcode insns[] =
    /* integer ops, valid types S8 S16 S32 U8 U16 U32.  */
   NUF(vaba,      0000710, 3, (RNDQ, RNDQ,  RNDQ), neon_dyadic_i_su),
   NUF(vabaq,     0000710, 3, (RNQ,  RNQ,   RNQ),  neon_dyadic_i_su),
- NUF(vhadd,     0000000, 3, (RNDQ, oRNDQ, RNDQ), neon_dyadic_i_su),
   NUF(vhaddq,    0000000, 3, (RNQ,  oRNQ,  RNQ),  neon_dyadic_i_su),
- NUF(vrhadd,    0000100, 3, (RNDQ, oRNDQ, RNDQ), neon_dyadic_i_su),
   NUF(vrhaddq,   0000100, 3, (RNQ,  oRNQ,  RNQ),  neon_dyadic_i_su),
- NUF(vhsub,     0000200, 3, (RNDQ, oRNDQ, RNDQ), neon_dyadic_i_su),
   NUF(vhsubq,    0000200, 3, (RNQ,  oRNQ,  RNQ),  neon_dyadic_i_su),
    /* integer ops, valid types S8 S16 S32 S64 U8 U16 U32 U64.  */
- NUF(vqadd,     0000010, 3, (RNDQ, oRNDQ, RNDQ), neon_dyadic_i64_su),
   NUF(vqaddq,    0000010, 3, (RNQ,  oRNQ,  RNQ),  neon_dyadic_i64_su),
- NUF(vqsub,     0000210, 3, (RNDQ, oRNDQ, RNDQ), neon_dyadic_i64_su),
   NUF(vqsubq,    0000210, 3, (RNQ,  oRNQ,  RNQ),  neon_dyadic_i64_su),
- NUF(vrshl,     0000500, 3, (RNDQ, oRNDQ, RNDQ), neon_rshl),
   NUF(vrshlq,    0000500, 3, (RNQ,  oRNQ,  RNQ),  neon_rshl),
- NUF(vqrshl,    0000510, 3, (RNDQ, oRNDQ, RNDQ), neon_rshl),
   NUF(vqrshlq,   0000510, 3, (RNQ,  oRNQ,  RNQ),  neon_rshl),
    /* If not immediate, fall back to neon_dyadic_i64_su.
-     shl_imm should accept I8 I16 I32 I64,
-     qshl_imm should accept S8 S16 S32 S64 U8 U16 U32 U64.  */
- nUF(vshl,      _vshl,    3, (RNDQ, oRNDQ, RNDQ_I63b), neon_shl_imm),
- nUF(vshlq,     _vshl,    3, (RNQ,  oRNQ,  RNDQ_I63b), neon_shl_imm),
- nUF(vqshl,     _vqshl,   3, (RNDQ, oRNDQ, RNDQ_I63b), neon_qshl_imm),
- nUF(vqshlq,    _vqshl,   3, (RNQ,  oRNQ,  RNDQ_I63b), neon_qshl_imm),
+     shl should accept I8 I16 I32 I64,
+     qshl should accept S8 S16 S32 S64 U8 U16 U32 U64.  */
+ nUF(vshlq,     _vshl,    3, (RNQ,  oRNQ,  RNDQ_I63b), neon_shl),
+ nUF(vqshlq,    _vqshl,   3, (RNQ,  oRNQ,  RNDQ_I63b), neon_qshl),
    /* Logic ops, types optional & ignored.  */
- nUF(vand,      _vand,    3, (RNDQ, oRNDQ, RNDQ_Ibig), neon_logic),
   nUF(vandq,     _vand,    3, (RNQ,  oRNQ,  RNDQ_Ibig), neon_logic),
- nUF(vbic,      _vbic,    3, (RNDQ, oRNDQ, RNDQ_Ibig), neon_logic),
   nUF(vbicq,     _vbic,    3, (RNQ,  oRNQ,  RNDQ_Ibig), neon_logic),
- nUF(vorr,      _vorr,    3, (RNDQ, oRNDQ, RNDQ_Ibig), neon_logic),
   nUF(vorrq,     _vorr,    3, (RNQ,  oRNQ,  RNDQ_Ibig), neon_logic),
- nUF(vorn,      _vorn,    3, (RNDQ, oRNDQ, RNDQ_Ibig), neon_logic),
   nUF(vornq,     _vorn,    3, (RNQ,  oRNQ,  RNDQ_Ibig), neon_logic),
- nUF(veor,      _veor,    3, (RNDQ, oRNDQ, RNDQ),      neon_logic),
   nUF(veorq,     _veor,    3, (RNQ,  oRNQ,  RNQ),       neon_logic),
    /* Bitfield ops, untyped.  */
   NUF(vbsl,      1100110, 3, (RNDQ, RNDQ, RNDQ), neon_bitfield),
@@ -22595,9 +25706,7 @@ static const struct asm_opcode insns[] =
   NUF(vbifq,     1300110, 3, (RNQ,  RNQ,  RNQ),  neon_bitfield),
    /* Int and float variants, types S8 S16 S32 U8 U16 U32 F16 F32.  */
   nUF(vabdq,     _vabd,    3, (RNQ,  oRNQ,  RNQ),  neon_dyadic_if_su),
- nUF(vmax,      _vmax,    3, (RNDQ, oRNDQ, RNDQ), neon_dyadic_if_su),
   nUF(vmaxq,     _vmax,    3, (RNQ,  oRNQ,  RNQ),  neon_dyadic_if_su),
- nUF(vmin,      _vmin,    3, (RNDQ, oRNDQ, RNDQ), neon_dyadic_if_su),
   nUF(vminq,     _vmin,    3, (RNQ,  oRNQ,  RNQ),  neon_dyadic_if_su),
    /* Comparisons. Types S8 S16 S32 U8 U16 U32 F32. Non-immediate versions fall
       back to neon_dyadic_if_su.  */
@@ -22628,9 +25737,7 @@ static const struct asm_opcode insns[] =
    /* VMUL takes I8 I16 I32 F32 P8.  */
   nUF(vmulq,     _vmul,     3, (RNQ,  oRNQ,  RNDQ_RNSC), neon_mul),
    /* VQD{R}MULH takes S16 S32.  */
- nUF(vqdmulh,   _vqdmulh,  3, (RNDQ, oRNDQ, RNDQ_RNSC), neon_qdmulh),
   nUF(vqdmulhq,  _vqdmulh,  3, (RNQ,  oRNQ,  RNDQ_RNSC), neon_qdmulh),
- nUF(vqrdmulh,  _vqrdmulh, 3, (RNDQ, oRNDQ, RNDQ_RNSC), neon_qdmulh),
   nUF(vqrdmulhq, _vqrdmulh, 3, (RNQ,  oRNQ,  RNDQ_RNSC), neon_qdmulh),
   NUF(vacge,     0000e10,  3, (RNDQ, oRNDQ, RNDQ), neon_fcmp_absolute),
   NUF(vacgeq,    0000e10,  3, (RNQ,  oRNQ,  RNQ),  neon_fcmp_absolute),
@@ -22645,7 +25752,6 @@ static const struct asm_opcode insns[] =
   NUF(vrsqrts,   0200f10,  3, (RNDQ, oRNDQ, RNDQ), neon_step),
   NUF(vrsqrtsq,  0200f10,  3, (RNQ,  oRNQ,  RNQ),  neon_step),
   /* ARM v8.1 extension.  */
- nUF (vqrdmlah,  _vqrdmlah, 3, (RNDQ, oRNDQ, RNDQ_RNSC), neon_qrdmlah),
   nUF (vqrdmlahq, _vqrdmlah, 3, (RNQ,  oRNQ,  RNDQ_RNSC), neon_qrdmlah),
   nUF (vqrdmlsh,  _vqrdmlsh, 3, (RNDQ, oRNDQ, RNDQ_RNSC), neon_qrdmlah),
   nUF (vqrdmlshq, _vqrdmlsh, 3, (RNQ,  oRNQ,  RNDQ_RNSC), neon_qrdmlah),
@@ -22657,21 +25763,16 @@ static const struct asm_opcode insns[] =
    /* Data processing with two registers and a shift amount.  */
    /* Right shifts, and variants with rounding.
       Types accepted S8 S16 S32 S64 U8 U16 U32 U64.  */
- NUF(vshr,      0800010, 3, (RNDQ, oRNDQ, I64z), neon_rshift_round_imm),
   NUF(vshrq,     0800010, 3, (RNQ,  oRNQ,  I64z), neon_rshift_round_imm),
- NUF(vrshr,     0800210, 3, (RNDQ, oRNDQ, I64z), neon_rshift_round_imm),
   NUF(vrshrq,    0800210, 3, (RNQ,  oRNQ,  I64z), neon_rshift_round_imm),
   NUF(vsra,      0800110, 3, (RNDQ, oRNDQ, I64),  neon_rshift_round_imm),
   NUF(vsraq,     0800110, 3, (RNQ,  oRNQ,  I64),  neon_rshift_round_imm),
   NUF(vrsra,     0800310, 3, (RNDQ, oRNDQ, I64),  neon_rshift_round_imm),
   NUF(vrsraq,    0800310, 3, (RNQ,  oRNQ,  I64),  neon_rshift_round_imm),
    /* Shift and insert. Sizes accepted 8 16 32 64.  */
- NUF(vsli,      1800510, 3, (RNDQ, oRNDQ, I63), neon_sli),
   NUF(vsliq,     1800510, 3, (RNQ,  oRNQ,  I63), neon_sli),
- NUF(vsri,      1800410, 3, (RNDQ, oRNDQ, I64), neon_sri),
   NUF(vsriq,     1800410, 3, (RNQ,  oRNQ,  I64), neon_sri),
    /* QSHL{U} immediate accepts S8 S16 S32 S64 U8 U16 U32 U64.  */
- NUF(vqshlu,    1800610, 3, (RNDQ, oRNDQ, I63), neon_qshlu_imm),
   NUF(vqshluq,   1800610, 3, (RNQ,  oRNQ,  I63), neon_qshlu_imm),
    /* Right shift immediate, saturating & narrowing, with rounding variants.
       Types accepted S16 S32 S64 U16 U32 U64.  */
@@ -22688,7 +25789,6 @@ static const struct asm_opcode insns[] =
    /* CVT with optional immediate for fixed-point variant.  */
   nUF(vcvtq,     _vcvt,    3, (RNQ, RNQ, oI32b), neon_cvt),
  
- nUF(vmvn,      _vmvn,    2, (RNDQ, RNDQ_Ibig), neon_mvn),
   nUF(vmvnq,     _vmvn,    2, (RNQ,  RNDQ_Ibig), neon_mvn),
  
    /* Data processing, three registers of different lengths.  */
@@ -22720,14 +25820,10 @@ static const struct asm_opcode insns[] =
  
    /* Two registers, miscellaneous.  */
    /* Reverse. Sizes 8 16 32 (must be < size in opcode).  */
- NUF(vrev64,    1b00000, 2, (RNDQ, RNDQ),     neon_rev),
   NUF(vrev64q,   1b00000, 2, (RNQ,  RNQ),      neon_rev),
- NUF(vrev32,    1b00080, 2, (RNDQ, RNDQ),     neon_rev),
   NUF(vrev32q,   1b00080, 2, (RNQ,  RNQ),      neon_rev),
- NUF(vrev16,    1b00100, 2, (RNDQ, RNDQ),     neon_rev),
   NUF(vrev16q,   1b00100, 2, (RNQ,  RNQ),      neon_rev),
    /* Vector replicate. Sizes 8 16 32.  */
- nCE(vdup,      _vdup,    2, (RNDQ, RR_RNSC),  neon_dup),
   nCE(vdupq,     _vdup,    2, (RNQ,  RR_RNSC),  neon_dup),
    /* VMOVL. Types S8 S16 S32 U8 U16 U32.  */
   NUF(vmovl,     0800a10, 2, (RNQ, RND),       neon_movl),
@@ -22743,9 +25839,7 @@ static const struct asm_opcode insns[] =
   NUF(vuzp,      1b20100, 2, (RNDQ, RNDQ),     neon_zip_uzp),
   NUF(vuzpq,     1b20100, 2, (RNQ,  RNQ),      neon_zip_uzp),
    /* VQABS / VQNEG. Types S8 S16 S32.  */
- NUF(vqabs,     1b00700, 2, (RNDQ, RNDQ),     neon_sat_abs_neg),
   NUF(vqabsq,    1b00700, 2, (RNQ,  RNQ),      neon_sat_abs_neg),
- NUF(vqneg,     1b00780, 2, (RNDQ, RNDQ),     neon_sat_abs_neg),
   NUF(vqnegq,    1b00780, 2, (RNQ,  RNQ),      neon_sat_abs_neg),
    /* Pairwise, lengthening. Types S8 S16 S32 U8 U16 U32.  */
   NUF(vpadal,    1b00600, 2, (RNDQ, RNDQ),     neon_pair_long),
@@ -22758,10 +25852,8 @@ static const struct asm_opcode insns[] =
   NUF(vrsqrte,   1b30480, 2, (RNDQ, RNDQ),     neon_recip_est),
   NUF(vrsqrteq,  1b30480, 2, (RNQ,  RNQ),      neon_recip_est),
    /* VCLS. Types S8 S16 S32.  */
- NUF(vcls,      1b00400, 2, (RNDQ, RNDQ),     neon_cls),
   NUF(vclsq,     1b00400, 2, (RNQ,  RNQ),      neon_cls),
    /* VCLZ. Types I8 I16 I32.  */
- NUF(vclz,      1b00480, 2, (RNDQ, RNDQ),     neon_clz),
   NUF(vclzq,     1b00480, 2, (RNQ,  RNQ),      neon_clz),
    /* VCNT. Size 8.  */
   NUF(vcnt,      1b00500, 2, (RNDQ, RNDQ),     neon_cnt),
@@ -22825,11 +25917,13 @@ static const struct asm_opcode insns[] =
  #define ARM_VARIANT    & fpu_vfp_ext_fma
  #undef  THUMB_VARIANT
  #define THUMB_VARIANT  & fpu_vfp_ext_fma
- /* Mnemonics shared by Neon and VFP.  These are included in the
+ /* Mnemonics shared by Neon, VFP, MVE and BF16.  These are included in the
      VFP FMA variant; NEON and VFP FMA always includes the NEON
      FMA instructions.  */
- nCEF(vfma,     _vfma,    3, (RNSDQ, oRNSDQ, RNSDQ), neon_fmac),
- nCEF(vfms,     _vfms,    3, (RNSDQ, oRNSDQ, RNSDQ), neon_fmac),
+ mnCEF(vfma,     _vfma,    3, (RNSDQMQ, oRNSDQMQ, RNSDQMQR), neon_fmac),
+ TUF ("vfmat",    c300850,    fc300850,  3, (RNSDQMQ, oRNSDQMQ, RNSDQ_RNSC_MQ_RR), mve_vfma, mve_vfma),
+ mnCEF(vfms,     _vfms,    3, (RNSDQMQ, oRNSDQMQ, RNSDQMQ),  neon_fmac),
+
   /* ffmas/ffmad/ffmss/ffmsd are dummy mnemonics to satisfy gas;
      the v form should always be used.  */
   cCE("ffmas",  ea00a00, 3, (RVS, RVS, RVS),  vfp_sp_dyadic),
@@ -23197,6 +26291,16 @@ static const struct asm_opcode insns[] =
   /* Armv8.1-M Mainline instructions.  */
  #undef  THUMB_VARIANT
  #define THUMB_VARIANT & arm_ext_v8_1m_main
+ toU("cinc",  _cinc,  3, (RRnpcsp, RR_ZR, COND),       t_cond),
+ toU("cinv",  _cinv,  3, (RRnpcsp, RR_ZR, COND),       t_cond),
+ toU("cneg",  _cneg,  3, (RRnpcsp, RR_ZR, COND),       t_cond),
+ toU("csel",  _csel,  4, (RRnpcsp, RR_ZR, RR_ZR, COND),        t_cond),
+ toU("csetm", _csetm, 2, (RRnpcsp, COND),              t_cond),
+ toU("cset",  _cset,  2, (RRnpcsp, COND),              t_cond),
+ toU("csinc", _csinc, 4, (RRnpcsp, RR_ZR, RR_ZR, COND),        t_cond),
+ toU("csinv", _csinv, 4, (RRnpcsp, RR_ZR, RR_ZR, COND),        t_cond),
+ toU("csneg", _csneg, 4, (RRnpcsp, RR_ZR, RR_ZR, COND),        t_cond),
+
   toC("bf",     _bf,    2, (EXPs, EXPs),             t_branch_future),
   toU("bfcsel", _bfcsel,        4, (EXPs, EXPs, EXPs, COND), t_branch_future),
   toC("bfx",    _bfx,   2, (EXPs, RRnpcsp),          t_branch_future),
@@ -23212,6 +26316,38 @@ static const struct asm_opcode insns[] =
  
  #undef  THUMB_VARIANT
  #define THUMB_VARIANT & mve_ext
+ ToC("lsll",   ea50010d, 3, (RRe, RRo, RRnpcsp_I32), mve_scalar_shift),
+ ToC("lsrl",   ea50011f, 3, (RRe, RRo, I32),         mve_scalar_shift),
+ ToC("asrl",   ea50012d, 3, (RRe, RRo, RRnpcsp_I32), mve_scalar_shift),
+ ToC("uqrshll",        ea51010d, 4, (RRe, RRo, I48_I64, RRnpcsp), mve_scalar_shift1),
+ ToC("sqrshrl",        ea51012d, 4, (RRe, RRo, I48_I64, RRnpcsp), mve_scalar_shift1),
+ ToC("uqshll", ea51010f, 3, (RRe, RRo, I32),         mve_scalar_shift),
+ ToC("urshrl", ea51011f, 3, (RRe, RRo, I32),         mve_scalar_shift),
+ ToC("srshrl", ea51012f, 3, (RRe, RRo, I32),         mve_scalar_shift),
+ ToC("sqshll", ea51013f, 3, (RRe, RRo, I32),         mve_scalar_shift),
+ ToC("uqrshl", ea500f0d, 2, (RRnpcsp, RRnpcsp),      mve_scalar_shift),
+ ToC("sqrshr", ea500f2d, 2, (RRnpcsp, RRnpcsp),      mve_scalar_shift),
+ ToC("uqshl",  ea500f0f, 2, (RRnpcsp, I32),          mve_scalar_shift),
+ ToC("urshr",  ea500f1f, 2, (RRnpcsp, I32),          mve_scalar_shift),
+ ToC("srshr",  ea500f2f, 2, (RRnpcsp, I32),          mve_scalar_shift),
+ ToC("sqshl",  ea500f3f, 2, (RRnpcsp, I32),          mve_scalar_shift),
+
+ ToC("vpt",    ee410f00, 3, (COND, RMQ, RMQRZ), mve_vpt),
+ ToC("vptt",   ee018f00, 3, (COND, RMQ, RMQRZ), mve_vpt),
+ ToC("vpte",   ee418f00, 3, (COND, RMQ, RMQRZ), mve_vpt),
+ ToC("vpttt",  ee014f00, 3, (COND, RMQ, RMQRZ), mve_vpt),
+ ToC("vptte",  ee01cf00, 3, (COND, RMQ, RMQRZ), mve_vpt),
+ ToC("vptet",  ee41cf00, 3, (COND, RMQ, RMQRZ), mve_vpt),
+ ToC("vptee",  ee414f00, 3, (COND, RMQ, RMQRZ), mve_vpt),
+ ToC("vptttt", ee012f00, 3, (COND, RMQ, RMQRZ), mve_vpt),
+ ToC("vpttte", ee016f00, 3, (COND, RMQ, RMQRZ), mve_vpt),
+ ToC("vpttet", ee01ef00, 3, (COND, RMQ, RMQRZ), mve_vpt),
+ ToC("vpttee", ee01af00, 3, (COND, RMQ, RMQRZ), mve_vpt),
+ ToC("vptett", ee41af00, 3, (COND, RMQ, RMQRZ), mve_vpt),
+ ToC("vptete", ee41ef00, 3, (COND, RMQ, RMQRZ), mve_vpt),
+ ToC("vpteet", ee416f00, 3, (COND, RMQ, RMQRZ), mve_vpt),
+ ToC("vpteee", ee412f00, 3, (COND, RMQ, RMQRZ), mve_vpt),
+
   ToC("vpst",   fe710f4d, 0, (), mve_vpt),
   ToC("vpstt",  fe318f4d, 0, (), mve_vpt),
   ToC("vpste",  fe718f4d, 0, (), mve_vpt),
@@ -23229,6 +26365,12 @@ static const struct asm_opcode insns[] =
   ToC("vpsteee",        fe712f4d, 0, (), mve_vpt),
  
   /* MVE and MVE FP only.  */
+ mToC("vhcadd",        ee000f00,   4, (RMQ, RMQ, RMQ, EXPi),             mve_vhcadd),
+ mCEF(vctp,    _vctp,      1, (RRnpc),                           mve_vctp),
+ mCEF(vadc,    _vadc,      3, (RMQ, RMQ, RMQ),                   mve_vadc),
+ mCEF(vadci,   _vadci,     3, (RMQ, RMQ, RMQ),                   mve_vadc),
+ mToC("vsbc",  fe300f00,   3, (RMQ, RMQ, RMQ),                   mve_vsbc),
+ mToC("vsbci", fe301f00,   3, (RMQ, RMQ, RMQ),                   mve_vsbc),
   mCEF(vmullb,  _vmullb,    3, (RMQ, RMQ, RMQ),                   mve_vmull),
   mCEF(vabav,   _vabav,     3, (RRnpcsp, RMQ, RMQ),               mve_vabav),
   mCEF(vmladav,   _vmladav,     3, (RRe, RMQ, RMQ),             mve_vmladav),
@@ -23263,10 +26405,118 @@ static const struct asm_opcode insns[] =
   mCEF(vldrw,   _vldrw,     2, (RMQ, ADDRMVE),                  mve_vstr_vldr),
   mCEF(vldrd,   _vldrd,     2, (RMQ, ADDRMVE),                  mve_vstr_vldr),
  
+ mCEF(vmovnt,  _vmovnt,    2, (RMQ, RMQ),                        mve_movn),
+ mCEF(vmovnb,  _vmovnb,    2, (RMQ, RMQ),                        mve_movn),
+ mCEF(vbrsr,   _vbrsr,     3, (RMQ, RMQ, RR),                    mve_vbrsr),
+ mCEF(vaddlv,  _vaddlv,    3, (RRe, RRo, RMQ),                   mve_vaddlv),
+ mCEF(vaddlva, _vaddlva,   3, (RRe, RRo, RMQ),                   mve_vaddlv),
+ mCEF(vaddv,   _vaddv,     2, (RRe, RMQ),                        mve_vaddv),
+ mCEF(vaddva,  _vaddva,    2, (RRe, RMQ),                        mve_vaddv),
+ mCEF(vddup,   _vddup,     3, (RMQ, RRe, EXPi),                  mve_viddup),
+ mCEF(vdwdup,  _vdwdup,    4, (RMQ, RRe, RR, EXPi),              mve_viddup),
+ mCEF(vidup,   _vidup,     3, (RMQ, RRe, EXPi),                  mve_viddup),
+ mCEF(viwdup,  _viwdup,    4, (RMQ, RRe, RR, EXPi),              mve_viddup),
+ mToC("vmaxa", ee330e81,   2, (RMQ, RMQ),                        mve_vmaxa_vmina),
+ mToC("vmina", ee331e81,   2, (RMQ, RMQ),                        mve_vmaxa_vmina),
+ mCEF(vmaxv,   _vmaxv,   2, (RR, RMQ),                           mve_vmaxv),
+ mCEF(vmaxav,  _vmaxav,  2, (RR, RMQ),                           mve_vmaxv),
+ mCEF(vminv,   _vminv,   2, (RR, RMQ),                           mve_vmaxv),
+ mCEF(vminav,  _vminav,  2, (RR, RMQ),                           mve_vmaxv),
+
+ mCEF(vmlaldav,          _vmlaldav,    4, (RRe, RRo, RMQ, RMQ),        mve_vmlaldav),
+ mCEF(vmlaldava,  _vmlaldava,  4, (RRe, RRo, RMQ, RMQ),        mve_vmlaldav),
+ mCEF(vmlaldavx,  _vmlaldavx,  4, (RRe, RRo, RMQ, RMQ),        mve_vmlaldav),
+ mCEF(vmlaldavax, _vmlaldavax, 4, (RRe, RRo, RMQ, RMQ),        mve_vmlaldav),
+ mCEF(vmlalv,    _vmlaldav,    4, (RRe, RRo, RMQ, RMQ),        mve_vmlaldav),
+ mCEF(vmlalva,   _vmlaldava,   4, (RRe, RRo, RMQ, RMQ),        mve_vmlaldav),
+ mCEF(vmlsldav,          _vmlsldav,    4, (RRe, RRo, RMQ, RMQ),        mve_vmlaldav),
+ mCEF(vmlsldava,  _vmlsldava,  4, (RRe, RRo, RMQ, RMQ),        mve_vmlaldav),
+ mCEF(vmlsldavx,  _vmlsldavx,  4, (RRe, RRo, RMQ, RMQ),        mve_vmlaldav),
+ mCEF(vmlsldavax, _vmlsldavax, 4, (RRe, RRo, RMQ, RMQ),        mve_vmlaldav),
+ mToC("vrmlaldavh", ee800f00,     4, (RRe, RR, RMQ, RMQ),  mve_vrmlaldavh),
+ mToC("vrmlaldavha",ee800f20,     4, (RRe, RR, RMQ, RMQ),  mve_vrmlaldavh),
+ mCEF(vrmlaldavhx,  _vrmlaldavhx,  4, (RRe, RR, RMQ, RMQ),  mve_vrmlaldavh),
+ mCEF(vrmlaldavhax, _vrmlaldavhax, 4, (RRe, RR, RMQ, RMQ),  mve_vrmlaldavh),
+ mToC("vrmlalvh",   ee800f00,     4, (RRe, RR, RMQ, RMQ),  mve_vrmlaldavh),
+ mToC("vrmlalvha",  ee800f20,     4, (RRe, RR, RMQ, RMQ),  mve_vrmlaldavh),
+ mCEF(vrmlsldavh,   _vrmlsldavh,   4, (RRe, RR, RMQ, RMQ),  mve_vrmlaldavh),
+ mCEF(vrmlsldavha,  _vrmlsldavha,  4, (RRe, RR, RMQ, RMQ),  mve_vrmlaldavh),
+ mCEF(vrmlsldavhx,  _vrmlsldavhx,  4, (RRe, RR, RMQ, RMQ),  mve_vrmlaldavh),
+ mCEF(vrmlsldavhax, _vrmlsldavhax, 4, (RRe, RR, RMQ, RMQ),  mve_vrmlaldavh),
+
+ mToC("vmlas",   ee011e40,     3, (RMQ, RMQ, RR),              mve_vmlas),
+ mToC("vmulh",   ee010e01,     3, (RMQ, RMQ, RMQ),             mve_vmulh),
+ mToC("vrmulh",          ee011e01,     3, (RMQ, RMQ, RMQ),             mve_vmulh),
+ mToC("vpnot",   fe310f4d,     0, (),                          mve_vpnot),
+ mToC("vpsel",   fe310f01,     3, (RMQ, RMQ, RMQ),             mve_vpsel),
+
+ mToC("vqdmladh",  ee000e00,   3, (RMQ, RMQ, RMQ),             mve_vqdmladh),
+ mToC("vqdmladhx", ee001e00,   3, (RMQ, RMQ, RMQ),             mve_vqdmladh),
+ mToC("vqrdmladh", ee000e01,   3, (RMQ, RMQ, RMQ),             mve_vqdmladh),
+ mToC("vqrdmladhx",ee001e01,   3, (RMQ, RMQ, RMQ),             mve_vqdmladh),
+ mToC("vqdmlsdh",  fe000e00,   3, (RMQ, RMQ, RMQ),             mve_vqdmladh),
+ mToC("vqdmlsdhx", fe001e00,   3, (RMQ, RMQ, RMQ),             mve_vqdmladh),
+ mToC("vqrdmlsdh", fe000e01,   3, (RMQ, RMQ, RMQ),             mve_vqdmladh),
+ mToC("vqrdmlsdhx",fe001e01,   3, (RMQ, RMQ, RMQ),             mve_vqdmladh),
+ mToC("vqdmlah",   ee000e60,   3, (RMQ, RMQ, RR),              mve_vqdmlah),
+ mToC("vqdmlash",  ee001e60,   3, (RMQ, RMQ, RR),              mve_vqdmlah),
+ mToC("vqrdmlash", ee001e40,   3, (RMQ, RMQ, RR),              mve_vqdmlah),
+ mToC("vqdmullt",  ee301f00,   3, (RMQ, RMQ, RMQRR),           mve_vqdmull),
+ mToC("vqdmullb",  ee300f00,   3, (RMQ, RMQ, RMQRR),           mve_vqdmull),
+ mCEF(vqmovnt,   _vqmovnt,     2, (RMQ, RMQ),                  mve_vqmovn),
+ mCEF(vqmovnb,   _vqmovnb,     2, (RMQ, RMQ),                  mve_vqmovn),
+ mCEF(vqmovunt,          _vqmovunt,    2, (RMQ, RMQ),                  mve_vqmovn),
+ mCEF(vqmovunb,          _vqmovunb,    2, (RMQ, RMQ),                  mve_vqmovn),
+
+ mCEF(vshrnt,    _vshrnt,      3, (RMQ, RMQ, I32z),    mve_vshrn),
+ mCEF(vshrnb,    _vshrnb,      3, (RMQ, RMQ, I32z),    mve_vshrn),
+ mCEF(vrshrnt,   _vrshrnt,     3, (RMQ, RMQ, I32z),    mve_vshrn),
+ mCEF(vrshrnb,   _vrshrnb,     3, (RMQ, RMQ, I32z),    mve_vshrn),
+ mCEF(vqshrnt,   _vqrshrnt,    3, (RMQ, RMQ, I32z),    mve_vshrn),
+ mCEF(vqshrnb,   _vqrshrnb,    3, (RMQ, RMQ, I32z),    mve_vshrn),
+ mCEF(vqshrunt,          _vqrshrunt,   3, (RMQ, RMQ, I32z),    mve_vshrn),
+ mCEF(vqshrunb,          _vqrshrunb,   3, (RMQ, RMQ, I32z),    mve_vshrn),
+ mCEF(vqrshrnt,          _vqrshrnt,    3, (RMQ, RMQ, I32z),    mve_vshrn),
+ mCEF(vqrshrnb,          _vqrshrnb,    3, (RMQ, RMQ, I32z),    mve_vshrn),
+ mCEF(vqrshrunt,  _vqrshrunt,  3, (RMQ, RMQ, I32z),    mve_vshrn),
+ mCEF(vqrshrunb,  _vqrshrunb,  3, (RMQ, RMQ, I32z),    mve_vshrn),
+
+ mToC("vshlc",     eea00fc0,      3, (RMQ, RR, I32z),      mve_vshlc),
+ mToC("vshllt",            ee201e00,      3, (RMQ, RMQ, I32),      mve_vshll),
+ mToC("vshllb",            ee200e00,      3, (RMQ, RMQ, I32),      mve_vshll),
+
+ toU("dlstp",  _dlstp, 2, (LR, RR),      t_loloop),
+ toU("wlstp",  _wlstp, 3, (LR, RR, EXP), t_loloop),
+ toU("letp",   _letp,  2, (LR, EXP),     t_loloop),
+ toU("lctp",   _lctp,  0, (),            t_loloop),
+
+#undef THUMB_VARIANT
+#define THUMB_VARIANT & mve_fp_ext
+ mToC("vcmul", ee300e00,   4, (RMQ, RMQ, RMQ, EXPi),             mve_vcmul),
+ mToC("vfmas", ee311e40,   3, (RMQ, RMQ, RR),                    mve_vfmas),
+ mToC("vmaxnma", ee3f0e81, 2, (RMQ, RMQ),                        mve_vmaxnma_vminnma),
+ mToC("vminnma", ee3f1e81, 2, (RMQ, RMQ),                        mve_vmaxnma_vminnma),
+ mToC("vmaxnmv", eeee0f00, 2, (RR, RMQ),                         mve_vmaxnmv),
+ mToC("vmaxnmav",eeec0f00, 2, (RR, RMQ),                         mve_vmaxnmv),
+ mToC("vminnmv", eeee0f80, 2, (RR, RMQ),                         mve_vmaxnmv),
+ mToC("vminnmav",eeec0f80, 2, (RR, RMQ),                         mve_vmaxnmv),
+
  #undef  ARM_VARIANT
-#define ARM_VARIANT    & fpu_vfp_ext_v1xd
+#define ARM_VARIANT  & fpu_vfp_ext_v1
  #undef  THUMB_VARIANT
  #define THUMB_VARIANT  & arm_ext_v6t2
+ mnCEF(vmla,     _vmla,    3, (RNSDQMQ, oRNSDQMQ, RNSDQ_RNSC_MQ_RR), neon_mac_maybe_scalar),
+ mnCEF(vmul,     _vmul,    3, (RNSDQMQ, oRNSDQMQ, RNSDQ_RNSC_MQ_RR), neon_mul),
+
+ mcCE(fcpyd,   eb00b40, 2, (RVD, RVD),       vfp_dp_rd_rm),
+
+#undef  ARM_VARIANT
+#define ARM_VARIANT  & fpu_vfp_ext_v1xd
+
+ MNCE(vmov,   0,       1, (VMOV),            neon_mov),
+ mcCE(fmrs,    e100a10, 2, (RR, RVS),        vfp_reg_from_sp),
+ mcCE(fmsr,    e000a10, 2, (RVS, RR),        vfp_sp_from_reg),
+ mcCE(fcpys,   eb00a40, 2, (RVS, RVS),       vfp_sp_monadic),
  
   mCEF(vmullt, _vmullt, 3, (RNSDQMQ, oRNSDQMQ, RNSDQ_RNSC_MQ),  mve_vmull),
   mnCEF(vadd,  _vadd,   3, (RNSDQMQ, oRNSDQMQ, RNSDQMQR),       neon_addsub_if_i),
@@ -23275,13 +26525,122 @@ static const struct asm_opcode insns[] =
   MNCEF(vabs,  1b10300, 2, (RNSDQMQ, RNSDQMQ),  neon_abs_neg),
   MNCEF(vneg,  1b10380, 2, (RNSDQMQ, RNSDQMQ),  neon_abs_neg),
  
-#undef ARM_VARIANT
+ mCEF(vmovlt, _vmovlt, 1, (VMOV),              mve_movl),
+ mCEF(vmovlb, _vmovlb, 1, (VMOV),              mve_movl),
+
+ mnCE(vcmp,      _vcmp,    3, (RVSD_COND, RSVDMQ_FI0, oRMQRZ),    vfp_nsyn_cmp),
+ mnCE(vcmpe,     _vcmpe,   3, (RVSD_COND, RSVDMQ_FI0, oRMQRZ),    vfp_nsyn_cmp),
+
+#undef  ARM_VARIANT
+#define ARM_VARIANT  & fpu_vfp_ext_v2
+
+ mcCE(fmsrr,   c400a10, 3, (VRSLST, RR, RR), vfp_sp2_from_reg2),
+ mcCE(fmrrs,   c500a10, 3, (RR, RR, VRSLST), vfp_reg2_from_sp2),
+ mcCE(fmdrr,   c400b10, 3, (RVD, RR, RR),    vfp_dp_rm_rd_rn),
+ mcCE(fmrrd,   c500b10, 3, (RR, RR, RVD),    vfp_dp_rd_rn_rm),
+
+#undef  ARM_VARIANT
+#define ARM_VARIANT    & fpu_vfp_ext_armv8xd
+ mnUF(vcvta,  _vcvta,  2, (RNSDQMQ, oRNSDQMQ),         neon_cvta),
+ mnUF(vcvtp,  _vcvta,  2, (RNSDQMQ, oRNSDQMQ),         neon_cvtp),
+ mnUF(vcvtn,  _vcvta,  3, (RNSDQMQ, oRNSDQMQ, oI32z),  neon_cvtn),
+ mnUF(vcvtm,  _vcvta,  2, (RNSDQMQ, oRNSDQMQ),         neon_cvtm),
+ mnUF(vmaxnm, _vmaxnm, 3, (RNSDQMQ, oRNSDQMQ, RNSDQMQ),        vmaxnm),
+ mnUF(vminnm, _vminnm, 3, (RNSDQMQ, oRNSDQMQ, RNSDQMQ),        vmaxnm),
+
+#undef ARM_VARIANT
  #define ARM_VARIANT & fpu_neon_ext_v1
- mnUF(vabd,      _vabd,    3, (RNDQMQ, oRNDQMQ, RNDQMQ), neon_dyadic_if_su),
+ mnUF(vabd,      _vabd,                  3, (RNDQMQ, oRNDQMQ, RNDQMQ), neon_dyadic_if_su),
   mnUF(vabdl,     _vabdl,         3, (RNQMQ, RNDMQ, RNDMQ),   neon_dyadic_long),
- mnUF(vaddl,     _vaddl,         3, (RNQMQ, RNDMQ, RNDMQR),  neon_dyadic_long),
- mnUF(vsubl,     _vsubl,         3, (RNQMQ, RNDMQ, RNDMQR),  neon_dyadic_long),
+ mnUF(vaddl,     _vaddl,         3, (RNSDQMQ, oRNSDMQ, RNSDMQR),  neon_dyadic_long),
+ mnUF(vsubl,     _vsubl,         3, (RNSDQMQ, oRNSDMQ, RNSDMQR),  neon_dyadic_long),
+ mnUF(vand,      _vand,                  3, (RNDQMQ, oRNDQMQ, RNDQMQ_Ibig), neon_logic),
+ mnUF(vbic,      _vbic,                  3, (RNDQMQ, oRNDQMQ, RNDQMQ_Ibig), neon_logic),
+ mnUF(vorr,      _vorr,                  3, (RNDQMQ, oRNDQMQ, RNDQMQ_Ibig), neon_logic),
+ mnUF(vorn,      _vorn,                  3, (RNDQMQ, oRNDQMQ, RNDQMQ_Ibig), neon_logic),
+ mnUF(veor,      _veor,                  3, (RNDQMQ, oRNDQMQ, RNDQMQ),      neon_logic),
+ MNUF(vcls,      1b00400,        2, (RNDQMQ, RNDQMQ),               neon_cls),
+ MNUF(vclz,      1b00480,        2, (RNDQMQ, RNDQMQ),               neon_clz),
+ mnCE(vdup,      _vdup,                  2, (RNDQMQ, RR_RNSC),              neon_dup),
+ MNUF(vhadd,     00000000,       3, (RNDQMQ, oRNDQMQ, RNDQMQR),  neon_dyadic_i_su),
+ MNUF(vrhadd,    00000100,       3, (RNDQMQ, oRNDQMQ, RNDQMQ),   neon_dyadic_i_su),
+ MNUF(vhsub,     00000200,       3, (RNDQMQ, oRNDQMQ, RNDQMQR),  neon_dyadic_i_su),
+ mnUF(vmin,      _vmin,    3, (RNDQMQ, oRNDQMQ, RNDQMQ), neon_dyadic_if_su),
+ mnUF(vmax,      _vmax,    3, (RNDQMQ, oRNDQMQ, RNDQMQ), neon_dyadic_if_su),
+ MNUF(vqadd,     0000010,  3, (RNDQMQ, oRNDQMQ, RNDQMQR), neon_dyadic_i64_su),
+ MNUF(vqsub,     0000210,  3, (RNDQMQ, oRNDQMQ, RNDQMQR), neon_dyadic_i64_su),
+ mnUF(vmvn,      _vmvn,    2, (RNDQMQ, RNDQMQ_Ibig), neon_mvn),
+ MNUF(vqabs,     1b00700,  2, (RNDQMQ, RNDQMQ),     neon_sat_abs_neg),
+ MNUF(vqneg,     1b00780,  2, (RNDQMQ, RNDQMQ),     neon_sat_abs_neg),
+ mnUF(vqrdmlah,  _vqrdmlah,3, (RNDQMQ, oRNDQMQ, RNDQ_RNSC_RR), neon_qrdmlah),
+ mnUF(vqdmulh,   _vqdmulh, 3, (RNDQMQ, oRNDQMQ, RNDQMQ_RNSC_RR), neon_qdmulh),
+ mnUF(vqrdmulh,  _vqrdmulh,3, (RNDQMQ, oRNDQMQ, RNDQMQ_RNSC_RR), neon_qdmulh),
+ MNUF(vqrshl,    0000510,  3, (RNDQMQ, oRNDQMQ, RNDQMQR), neon_rshl),
+ MNUF(vrshl,     0000500,  3, (RNDQMQ, oRNDQMQ, RNDQMQR), neon_rshl),
+ MNUF(vshr,      0800010,  3, (RNDQMQ, oRNDQMQ, I64z), neon_rshift_round_imm),
+ MNUF(vrshr,     0800210,  3, (RNDQMQ, oRNDQMQ, I64z), neon_rshift_round_imm),
+ MNUF(vsli,      1800510,  3, (RNDQMQ, oRNDQMQ, I63),  neon_sli),
+ MNUF(vsri,      1800410,  3, (RNDQMQ, oRNDQMQ, I64z), neon_sri),
+ MNUF(vrev64,    1b00000,  2, (RNDQMQ, RNDQMQ),     neon_rev),
+ MNUF(vrev32,    1b00080,  2, (RNDQMQ, RNDQMQ),     neon_rev),
+ MNUF(vrev16,    1b00100,  2, (RNDQMQ, RNDQMQ),     neon_rev),
+ mnUF(vshl,     _vshl,    3, (RNDQMQ, oRNDQMQ, RNDQMQ_I63b_RR), neon_shl),
+ mnUF(vqshl,     _vqshl,   3, (RNDQMQ, oRNDQMQ, RNDQMQ_I63b_RR), neon_qshl),
+ MNUF(vqshlu,    1800610,  3, (RNDQMQ, oRNDQMQ, I63),           neon_qshlu_imm),
+
+#undef ARM_VARIANT
+#define ARM_VARIANT & arm_ext_v8_3
+#undef THUMB_VARIANT
+#define        THUMB_VARIANT & arm_ext_v6t2_v8m
+ MNUF (vcadd, 0, 4, (RNDQMQ, RNDQMQ, RNDQMQ, EXPi), vcadd),
+ MNUF (vcmla, 0, 4, (RNDQMQ, RNDQMQ, RNDQMQ_RNSC, EXPi), vcmla),
+
+#undef ARM_VARIANT
+#define ARM_VARIANT &arm_ext_bf16
+#undef THUMB_VARIANT
+#define        THUMB_VARIANT &arm_ext_bf16
+ TUF ("vdot", c000d00, fc000d00, 3, (RNDQ, RNDQ, RNDQ_RNSC), vdot, vdot),
+ TUF ("vmmla", c000c40, fc000c40, 3, (RNQ, RNQ, RNQ), vmmla, vmmla),
+ TUF ("vfmab", c300810, fc300810, 3, (RNDQ, RNDQ, RNDQ_RNSC), bfloat_vfma, bfloat_vfma),
+
+#undef ARM_VARIANT
+#define ARM_VARIANT &arm_ext_i8mm
+#undef THUMB_VARIANT
+#define        THUMB_VARIANT &arm_ext_i8mm
+ TUF ("vsmmla", c200c40, fc200c40, 3, (RNQ, RNQ, RNQ), vsmmla, vsmmla),
+ TUF ("vummla", c200c50, fc200c50, 3, (RNQ, RNQ, RNQ), vummla, vummla),
+ TUF ("vusmmla", ca00c40, fca00c40, 3, (RNQ, RNQ, RNQ), vsmmla, vsmmla),
+ TUF ("vusdot", c800d00, fc800d00, 3, (RNDQ, RNDQ, RNDQ_RNSC), vusdot, vusdot),
+ TUF ("vsudot", c800d10, fc800d10, 3, (RNDQ, RNDQ, RNSC), vsudot, vsudot),
+
+#undef ARM_VARIANT
+#undef THUMB_VARIANT
+#define        THUMB_VARIANT &arm_ext_cde
+ ToC ("cx1", ee000000, 3, (RCP, APSR_RR, I8191), cx1),
+ ToC ("cx1a", fe000000, 3, (RCP, APSR_RR, I8191), cx1a),
+ ToC ("cx1d", ee000040, 4, (RCP, RR, APSR_RR, I8191), cx1d),
+ ToC ("cx1da", fe000040, 4, (RCP, RR, APSR_RR, I8191), cx1da),
+
+ ToC ("cx2", ee400000, 4, (RCP, APSR_RR, APSR_RR, I511), cx2),
+ ToC ("cx2a", fe400000, 4, (RCP, APSR_RR, APSR_RR, I511), cx2a),
+ ToC ("cx2d", ee400040, 5, (RCP, RR, APSR_RR, APSR_RR, I511), cx2d),
+ ToC ("cx2da", fe400040, 5, (RCP, RR, APSR_RR, APSR_RR, I511), cx2da),
+
+ ToC ("cx3", ee800000, 5, (RCP, APSR_RR, APSR_RR, APSR_RR, I63), cx3),
+ ToC ("cx3a", fe800000, 5, (RCP, APSR_RR, APSR_RR, APSR_RR, I63), cx3a),
+ ToC ("cx3d", ee800040, 6, (RCP, RR, APSR_RR, APSR_RR, APSR_RR, I63), cx3d),
+ ToC ("cx3da", fe800040, 6, (RCP, RR, APSR_RR, APSR_RR, APSR_RR, I63), cx3da),
+
+ mToC ("vcx1", ec200000, 3, (RCP, RNSDMQ, I4095), vcx1),
+ mToC ("vcx1a", fc200000, 3, (RCP, RNSDMQ, I4095), vcx1),
+
+ mToC ("vcx2", ec300000, 4, (RCP, RNSDMQ, RNSDMQ, I127), vcx2),
+ mToC ("vcx2a", fc300000, 4, (RCP, RNSDMQ, RNSDMQ, I127), vcx2),
+
+ mToC ("vcx3", ec800000, 5, (RCP, RNSDMQ, RNSDMQ, RNSDMQ, I15), vcx3),
+ mToC ("vcx3a", fc800000, 5, (RCP, RNSDMQ, RNSDMQ, RNSDMQ, I15), vcx3),
  };
+
  #undef ARM_VARIANT
  #undef THUMB_VARIANT
  #undef TCE
@@ -24010,7 +27369,7 @@ arm_init_frag (fragS * fragP, int max_chars)
  
    /* PR 21809: Do not set a mapping state for debug sections
       - it just confuses other tools.  */
-  if (bfd_get_section_flags (NULL, now_seg) & SEC_DEBUGGING)
+  if (bfd_section_flags (now_seg) & SEC_DEBUGGING)
      return;
  
    frag_thumb_mode = fragP->tc_frag_data.thumb_mode ^ MODE_RECORDED;
@@ -24193,7 +27552,7 @@ start_unwind_section (const segT text_seg, int idx)
    const char * text_name;
    const char * prefix;
    const char * prefix_once;
-  const char * group_name;
+  struct elf_section_match match;
    char * sec_name;
    int type;
    int flags;
@@ -24227,13 +27586,13 @@ start_unwind_section (const segT text_seg, int idx)
  
    flags = SHF_ALLOC;
    linkonce = 0;
-  group_name = 0;
+  memset (&match, 0, sizeof (match));
  
    /* Handle COMDAT group.  */
    if (prefix != prefix_once && (text_seg->flags & SEC_LINK_ONCE) != 0)
      {
-      group_name = elf_group_name (text_seg);
-      if (group_name == NULL)
+      match.group_name = elf_group_name (text_seg);
+      if (match.group_name == NULL)
         {
           as_bad (_("Group section `%s' has no group signature"),
                   segment_name (text_seg));
@@ -24244,7 +27603,7 @@ start_unwind_section (const segT text_seg, int idx)
        linkonce = 1;
      }
  
-  obj_elf_change_section (sec_name, type, 0, flags, 0, group_name,
+  obj_elf_change_section (sec_name, type, flags, 0, &match,
                           linkonce, 0);
  
    /* Set the section link for index tables.  */
@@ -24629,16 +27988,19 @@ arm_tc_equal_in_insn (int c ATTRIBUTE_UNUSED, char * name)
  
        for (p = nbuf; *p; p++)
         *p = TOLOWER (*p);
-      if (hash_find (arm_ops_hsh, nbuf) != NULL)
+      if (str_hash_find (arm_ops_hsh, nbuf) != NULL)
         {
-         static struct hash_control * already_warned = NULL;
+         static htab_t  already_warned = NULL;
  
           if (already_warned == NULL)
-           already_warned = hash_new ();
+           already_warned = str_htab_create ();
           /* Only warn about the symbol once.  To keep the code
-            simple we let hash_insert do the lookup for us.  */
-         if (hash_insert (already_warned, nbuf, NULL) == NULL)
-           as_warn (_("[-mwarn-syms]: Assignment makes a symbol match an ARM instruction: %s"), name);
+            simple we look the name up with str_hash_find before inserting it.  */
+         if (str_hash_find (already_warned, nbuf) == NULL)
+           {
+             as_warn (_("[-mwarn-syms]: Assignment makes a symbol match an ARM instruction: %s"), name);
+             str_hash_insert (already_warned, nbuf, NULL);
+           }
         }
        else
         free (nbuf);
@@ -25433,11 +28795,12 @@ md_apply_fix (fixS *  fixP,
        break;
  
      case BFD_RELOC_ARM_SMC:
-      if (((unsigned long) value) > 0xffff)
+      if (((unsigned long) value) > 0xf)
         as_bad_where (fixP->fx_file, fixP->fx_line,
                       _("invalid smc expression"));
+
        newval = md_chars_to_number (buf, INSN_SIZE);
-      newval |= (value & 0xf) | ((value & 0xfff0) << 4);
+      newval |= (value & 0xf);
        md_number_to_chars (buf, newval, INSN_SIZE);
        break;
  
@@ -25606,7 +28969,7 @@ md_apply_fix (fixS *    fixP,
        break;
  
      case BFD_RELOC_THUMB_PCREL_BRANCH9: /* Conditional branch. */
-      if ((value & ~0xff) && ((value & ~0xff) != ~0xff))
+      if (out_of_range_p (value, 8))
         as_bad_where (fixP->fx_file, fixP->fx_line, BAD_RANGE);
  
        if (fixP->fx_done || !seg->use_rela_p)
@@ -25618,7 +28981,7 @@ md_apply_fix (fixS *    fixP,
        break;
  
      case BFD_RELOC_THUMB_PCREL_BRANCH12: /* Unconditional branch.  */
-      if ((value & ~0x7ff) && ((value & ~0x7ff) != ~0x7ff))
+      if (out_of_range_p (value, 11))
         as_bad_where (fixP->fx_file, fixP->fx_line, BAD_RANGE);
  
        if (fixP->fx_done || !seg->use_rela_p)
@@ -25629,6 +28992,7 @@ md_apply_fix (fixS *    fixP,
         }
        break;
  
+    /* This relocation is misnamed, it should be BRANCH21.  */
      case BFD_RELOC_THUMB_PCREL_BRANCH20:
        if (fixP->fx_addsy
           && (S_GET_SEGMENT (fixP->fx_addsy) == seg)
@@ -25639,7 +29003,7 @@ md_apply_fix (fixS *    fixP,
           /* Force a relocation for a branch 20 bits wide.  */
           fixP->fx_done = 0;
         }
-      if ((value & ~0x1fffff) && ((value & ~0x0fffff) != ~0x0fffff))
+      if (out_of_range_p (value, 20))
         as_bad_where (fixP->fx_file, fixP->fx_line,
                       _("conditional branch out of range"));
  
@@ -25718,12 +29082,11 @@ md_apply_fix (fixS *  fixP,
          fixP->fx_r_type = BFD_RELOC_THUMB_PCREL_BRANCH23;
  #endif
  
-      if ((value & ~0x3fffff) && ((value & ~0x3fffff) != ~0x3fffff))
+      if (out_of_range_p (value, 22))
         {
           if (!(ARM_CPU_HAS_FEATURE (cpu_variant, arm_ext_v6t2)))
             as_bad_where (fixP->fx_file, fixP->fx_line, BAD_RANGE);
-         else if ((value & ~0x1ffffff)
-                  && ((value & ~0x1ffffff) != ~0x1ffffff))
+         else if (out_of_range_p (value, 24))
             as_bad_where (fixP->fx_file, fixP->fx_line,
                           _("Thumb2 branch out of range"));
         }
@@ -25734,7 +29097,7 @@ md_apply_fix (fixS *    fixP,
        break;
  
      case BFD_RELOC_THUMB_PCREL_BRANCH25:
-      if ((value & ~0x0ffffff) && ((value & ~0x0ffffff) != ~0x0ffffff))
+      if (out_of_range_p (value, 24))
         as_bad_where (fixP->fx_file, fixP->fx_line, BAD_RANGE);
  
        if (fixP->fx_done || !seg->use_rela_p)
@@ -25940,6 +29303,9 @@ md_apply_fix (fixS *    fixP,
                           (((unsigned long) fixP->fx_frag->fr_address
                             + (unsigned long) fixP->fx_where) & ~3)
                           + (unsigned long) value);
+         else if (get_recorded_alignment (seg) < 2)
+           as_warn_where (fixP->fx_file, fixP->fx_line,
+                          _("section does not have enough alignment to ensure safe PC-relative loads"));
  
           if (value & ~0x3fc)
             as_bad_where (fixP->fx_file, fixP->fx_line,
@@ -26578,9 +29944,10 @@ md_apply_fix (fixS *   fixP,
         }
  
        bfd_vma insn = get_thumb32_insn (buf);
-      /* le lr, <label> or le <label> */
+      /* le lr, <label>, le <label> or letp lr, <label> */
        if (((insn & 0xffffffff) == 0xf00fc001)
-         || ((insn & 0xffffffff) == 0xf02fc001))
+         || ((insn & 0xffffffff) == 0xf02fc001)
+         || ((insn & 0xffffffff) == 0xf01fc001))
         value = -value;
  
        if (v8_1_branch_value_check (value, 12, FALSE) == FAIL)
@@ -27006,6 +30373,7 @@ arm_force_relocation (struct fix * fixp)
        || fixp->fx_r_type == BFD_RELOC_ARM_CP_OFF_IMM
        || fixp->fx_r_type == BFD_RELOC_ARM_CP_OFF_IMM_S2
        || fixp->fx_r_type == BFD_RELOC_ARM_THUMB_OFFSET
+      || fixp->fx_r_type == BFD_RELOC_THUMB_PCREL_BRANCH12
        || fixp->fx_r_type == BFD_RELOC_ARM_T32_ADD_IMM
        || fixp->fx_r_type == BFD_RELOC_ARM_T32_IMMEDIATE
        || fixp->fx_r_type == BFD_RELOC_ARM_T32_IMM12
@@ -27350,36 +30718,37 @@ md_begin (void)
    unsigned mach;
    unsigned int i;
  
-  if (  (arm_ops_hsh = hash_new ()) == NULL
-      || (arm_cond_hsh = hash_new ()) == NULL
-      || (arm_vcond_hsh = hash_new ()) == NULL
-      || (arm_shift_hsh = hash_new ()) == NULL
-      || (arm_psr_hsh = hash_new ()) == NULL
-      || (arm_v7m_psr_hsh = hash_new ()) == NULL
-      || (arm_reg_hsh = hash_new ()) == NULL
-      || (arm_reloc_hsh = hash_new ()) == NULL
-      || (arm_barrier_opt_hsh = hash_new ()) == NULL)
+  if (  (arm_ops_hsh = str_htab_create ()) == NULL
+      || (arm_cond_hsh = str_htab_create ()) == NULL
+      || (arm_vcond_hsh = str_htab_create ()) == NULL
+      || (arm_shift_hsh = str_htab_create ()) == NULL
+      || (arm_psr_hsh = str_htab_create ()) == NULL
+      || (arm_v7m_psr_hsh = str_htab_create ()) == NULL
+      || (arm_reg_hsh = str_htab_create ()) == NULL
+      || (arm_reloc_hsh = str_htab_create ()) == NULL
+      || (arm_barrier_opt_hsh = str_htab_create ()) == NULL)
      as_fatal (_("virtual memory exhausted"));
  
    for (i = 0; i < sizeof (insns) / sizeof (struct asm_opcode); i++)
-    hash_insert (arm_ops_hsh, insns[i].template_name, (void *) (insns + i));
+    if (str_hash_find (arm_ops_hsh, insns[i].template_name) == NULL)
+      str_hash_insert (arm_ops_hsh, insns[i].template_name, (void *) (insns + i));
    for (i = 0; i < sizeof (conds) / sizeof (struct asm_cond); i++)
-    hash_insert (arm_cond_hsh, conds[i].template_name, (void *) (conds + i));
+    str_hash_insert (arm_cond_hsh, conds[i].template_name, (void *) (conds + i));
    for (i = 0; i < sizeof (vconds) / sizeof (struct asm_cond); i++)
-    hash_insert (arm_vcond_hsh, vconds[i].template_name, (void *) (vconds + i));
+    str_hash_insert (arm_vcond_hsh, vconds[i].template_name, (void *) (vconds + i));
    for (i = 0; i < sizeof (shift_names) / sizeof (struct asm_shift_name); i++)
-    hash_insert (arm_shift_hsh, shift_names[i].name, (void *) (shift_names + i));
+    str_hash_insert (arm_shift_hsh, shift_names[i].name, (void *) (shift_names + i));
    for (i = 0; i < sizeof (psrs) / sizeof (struct asm_psr); i++)
-    hash_insert (arm_psr_hsh, psrs[i].template_name, (void *) (psrs + i));
+    str_hash_insert (arm_psr_hsh, psrs[i].template_name, (void *) (psrs + i));
    for (i = 0; i < sizeof (v7m_psrs) / sizeof (struct asm_psr); i++)
-    hash_insert (arm_v7m_psr_hsh, v7m_psrs[i].template_name,
+    str_hash_insert (arm_v7m_psr_hsh, v7m_psrs[i].template_name,
                  (void *) (v7m_psrs + i));
    for (i = 0; i < sizeof (reg_names) / sizeof (struct reg_entry); i++)
-    hash_insert (arm_reg_hsh, reg_names[i].name, (void *) (reg_names + i));
+    str_hash_insert (arm_reg_hsh, reg_names[i].name, (void *) (reg_names + i));
    for (i = 0;
         i < sizeof (barrier_opt_names) / sizeof (struct asm_barrier_opt);
         i++)
-    hash_insert (arm_barrier_opt_hsh, barrier_opt_names[i].template_name,
+    str_hash_insert (arm_barrier_opt_hsh, barrier_opt_names[i].template_name,
                  (void *) (barrier_opt_names + i));
  #ifdef OBJ_ELF
    for (i = 0; i < ARRAY_SIZE (reloc_names); i++)
@@ -27390,7 +30759,7 @@ md_begin (void)
         /* This makes encode_branch() use the EABI versions of this relocation.  */
         entry->reloc = BFD_RELOC_UNUSED;
  
-      hash_insert (arm_reloc_hsh, entry->name, (void *) entry);
+      str_hash_insert (arm_reloc_hsh, entry->name, (void *) entry);
      }
  #endif
  
@@ -27536,9 +30905,8 @@ md_begin (void)
  
         if (sec != NULL)
           {
-           bfd_set_section_flags
-             (stdoutput, sec, SEC_READONLY | SEC_DEBUGGING /* | SEC_HAS_CONTENTS */);
-           bfd_set_section_size (stdoutput, sec, 0);
+           bfd_set_section_flags (sec, SEC_READONLY | SEC_DEBUGGING);
+           bfd_set_section_size (sec, 0);
             bfd_set_section_contents (stdoutput, sec, NULL, 0, 0);
           }
        }
@@ -27714,6 +31082,11 @@ struct arm_option_table arm_opts[] =
    {"mwarn-deprecated", NULL, &warn_on_deprecated, 1, NULL},
    {"mno-warn-deprecated", N_("do not warn on use of deprecated feature"),
     &warn_on_deprecated, 0, NULL},
+
+  {"mwarn-restrict-it", N_("warn about performance deprecated IT instructions"
+   " in ARMv8-A and ARMv8-R"), &warn_on_restrict_it, 1, NULL},
+  {"mno-warn-restrict-it", NULL, &warn_on_restrict_it, 0, NULL},
+
    {"mwarn-syms", N_("warn about symbols that match instruction names [default]"), (int *) (& flag_warn_syms), TRUE, NULL},
    {"mno-warn-syms", N_("disable warnings about symobls that match instructions"), (int *) (& flag_warn_syms), FALSE, NULL},
    {NULL, NULL, NULL, 0, NULL}
@@ -28131,25 +31504,25 @@ static const struct arm_cpu_option_table arm_cpus[] =
                ARM_ARCH_NONE,
                FPU_ARCH_NEON_VFP_V4),
    ARM_CPU_OPT ("cortex-a32",     "Cortex-A32",        ARM_ARCH_V8A,
-              ARM_FEATURE_COPROC (CRC_EXT_ARMV8),
+              ARM_FEATURE_CORE_HIGH (ARM_EXT2_CRC),
                FPU_ARCH_CRYPTO_NEON_VFP_ARMV8),
    ARM_CPU_OPT ("cortex-a35",     "Cortex-A35",        ARM_ARCH_V8A,
-              ARM_FEATURE_COPROC (CRC_EXT_ARMV8),
+              ARM_FEATURE_CORE_HIGH (ARM_EXT2_CRC),
                FPU_ARCH_CRYPTO_NEON_VFP_ARMV8),
    ARM_CPU_OPT ("cortex-a53",     "Cortex-A53",        ARM_ARCH_V8A,
-              ARM_FEATURE_COPROC (CRC_EXT_ARMV8),
+              ARM_FEATURE_CORE_HIGH (ARM_EXT2_CRC),
                FPU_ARCH_CRYPTO_NEON_VFP_ARMV8),
    ARM_CPU_OPT ("cortex-a55",    "Cortex-A55",         ARM_ARCH_V8_2A,
                ARM_FEATURE_CORE_HIGH (ARM_EXT2_FP16_INST),
                FPU_ARCH_CRYPTO_NEON_VFP_ARMV8_DOTPROD),
    ARM_CPU_OPT ("cortex-a57",     "Cortex-A57",        ARM_ARCH_V8A,
-              ARM_FEATURE_COPROC (CRC_EXT_ARMV8),
+              ARM_FEATURE_CORE_HIGH (ARM_EXT2_CRC),
                FPU_ARCH_CRYPTO_NEON_VFP_ARMV8),
    ARM_CPU_OPT ("cortex-a72",     "Cortex-A72",        ARM_ARCH_V8A,
-             ARM_FEATURE_COPROC (CRC_EXT_ARMV8),
+              ARM_FEATURE_CORE_HIGH (ARM_EXT2_CRC),
               FPU_ARCH_CRYPTO_NEON_VFP_ARMV8),
    ARM_CPU_OPT ("cortex-a73",     "Cortex-A73",        ARM_ARCH_V8A,
-             ARM_FEATURE_COPROC (CRC_EXT_ARMV8),
+              ARM_FEATURE_CORE_HIGH (ARM_EXT2_CRC),
               FPU_ARCH_CRYPTO_NEON_VFP_ARMV8),
    ARM_CPU_OPT ("cortex-a75",    "Cortex-A75",         ARM_ARCH_V8_2A,
                ARM_FEATURE_CORE_HIGH (ARM_EXT2_FP16_INST),
@@ -28157,6 +31530,12 @@ static const struct arm_cpu_option_table arm_cpus[] =
    ARM_CPU_OPT ("cortex-a76",    "Cortex-A76",         ARM_ARCH_V8_2A,
                ARM_FEATURE_CORE_HIGH (ARM_EXT2_FP16_INST),
                FPU_ARCH_CRYPTO_NEON_VFP_ARMV8_DOTPROD),
+  ARM_CPU_OPT ("cortex-a76ae",    "Cortex-A76AE",      ARM_ARCH_V8_2A,
+              ARM_FEATURE_CORE_HIGH (ARM_EXT2_FP16_INST),
+              FPU_ARCH_CRYPTO_NEON_VFP_ARMV8_DOTPROD),
+  ARM_CPU_OPT ("cortex-a77",    "Cortex-A77",         ARM_ARCH_V8_2A,
+              ARM_FEATURE_CORE_HIGH (ARM_EXT2_FP16_INST),
+              FPU_ARCH_CRYPTO_NEON_VFP_ARMV8_DOTPROD),
    ARM_CPU_OPT ("ares",    "Ares",             ARM_ARCH_V8_2A,
                ARM_FEATURE_CORE_HIGH (ARM_EXT2_FP16_INST),
                FPU_ARCH_CRYPTO_NEON_VFP_ARMV8_DOTPROD),
@@ -28176,8 +31555,11 @@ static const struct arm_cpu_option_table arm_cpus[] =
                ARM_FEATURE_CORE_LOW (ARM_EXT_ADIV),
                FPU_ARCH_VFP_V3D16),
    ARM_CPU_OPT ("cortex-r52",     "Cortex-R52",        ARM_ARCH_V8R,
-             ARM_FEATURE_COPROC (CRC_EXT_ARMV8),
+              ARM_FEATURE_CORE_HIGH (ARM_EXT2_CRC),
               FPU_ARCH_NEON_VFP_ARMV8),
+  ARM_CPU_OPT ("cortex-m35p",    "Cortex-M35P",       ARM_ARCH_V8M_MAIN,
+              ARM_FEATURE_CORE_LOW (ARM_EXT_V5ExP | ARM_EXT_V6_DSP),
+              FPU_NONE),
    ARM_CPU_OPT ("cortex-m33",     "Cortex-M33",        ARM_ARCH_V8M_MAIN,
                ARM_FEATURE_CORE_LOW (ARM_EXT_V5ExP | ARM_EXT_V6_DSP),
                FPU_NONE),
@@ -28203,7 +31585,7 @@ static const struct arm_cpu_option_table arm_cpus[] =
                ARM_ARCH_NONE,
                FPU_NONE),
    ARM_CPU_OPT ("exynos-m1",      "Samsung Exynos M1", ARM_ARCH_V8A,
-              ARM_FEATURE_COPROC (CRC_EXT_ARMV8),
+              ARM_FEATURE_CORE_HIGH (ARM_EXT2_CRC),
                FPU_ARCH_CRYPTO_NEON_VFP_ARMV8),
    ARM_CPU_OPT ("neoverse-n1",    "Neoverse N1",               ARM_ARCH_V8_2A,
                ARM_FEATURE_CORE_HIGH (ARM_EXT2_FP16_INST),
@@ -28242,7 +31624,7 @@ static const struct arm_cpu_option_table arm_cpus[] =
                ARM_ARCH_NONE,
                FPU_ARCH_CRYPTO_NEON_VFP_ARMV8),
    ARM_CPU_OPT ("xgene2",         "APM X-Gene 2",      ARM_ARCH_V8A,
-              ARM_FEATURE_COPROC (CRC_EXT_ARMV8),
+              ARM_FEATURE_CORE_HIGH (ARM_EXT2_CRC),
                FPU_ARCH_CRYPTO_NEON_VFP_ARMV8),
  
    { NULL, 0, ARM_ARCH_NONE, ARM_ARCH_NONE, ARM_ARCH_NONE, NULL }
@@ -28362,7 +31744,7 @@ static const struct arm_ext_table armv7em_ext_table[] =
  
  static const struct arm_ext_table armv8a_ext_table[] =
  {
-  ARM_ADD ("crc", ARCH_CRC_ARMV8),
+  ARM_ADD ("crc", ARM_FEATURE_CORE_HIGH (ARM_EXT2_CRC)),
    ARM_ADD ("simd", FPU_ARCH_NEON_VFP_ARMV8),
    ARM_EXT ("crypto", FPU_ARCH_CRYPTO_NEON_VFP_ARMV8,
            ARM_FEATURE_COPROC (FPU_CRYPTO_ARMV8)),
@@ -28395,6 +31777,8 @@ static const struct arm_ext_table armv82a_ext_table[] =
    ARM_ADD ("simd", FPU_ARCH_NEON_VFP_ARMV8_1),
    ARM_ADD ("fp16", FPU_ARCH_NEON_VFP_ARMV8_2_FP16),
    ARM_ADD ("fp16fml", FPU_ARCH_NEON_VFP_ARMV8_2_FP16FML),
+  ARM_ADD ("bf16", ARM_FEATURE_CORE_HIGH (ARM_EXT2_BF16)),
+  ARM_ADD ("i8mm", ARM_FEATURE_CORE_HIGH (ARM_EXT2_I8MM)),
    ARM_EXT ("crypto", FPU_ARCH_CRYPTO_NEON_VFP_ARMV8_1,
            ARM_FEATURE_COPROC (FPU_CRYPTO_ARMV8)),
    ARM_ADD ("dotprod", FPU_ARCH_DOTPROD_NEON_VFP_ARMV8),
@@ -28411,6 +31795,8 @@ static const struct arm_ext_table armv84a_ext_table[] =
  {
    ARM_ADD ("simd", FPU_ARCH_DOTPROD_NEON_VFP_ARMV8),
    ARM_ADD ("fp16", FPU_ARCH_NEON_VFP_ARMV8_4_FP16FML),
+  ARM_ADD ("bf16", ARM_FEATURE_CORE_HIGH (ARM_EXT2_BF16)),
+  ARM_ADD ("i8mm", ARM_FEATURE_CORE_HIGH (ARM_EXT2_I8MM)),
    ARM_EXT ("crypto", FPU_ARCH_CRYPTO_NEON_VFP_ARMV8_4,
            ARM_FEATURE_COPROC (FPU_CRYPTO_ARMV8)),
  
@@ -28426,6 +31812,8 @@ static const struct arm_ext_table armv85a_ext_table[] =
  {
    ARM_ADD ("simd", FPU_ARCH_DOTPROD_NEON_VFP_ARMV8),
    ARM_ADD ("fp16", FPU_ARCH_NEON_VFP_ARMV8_4_FP16FML),
+  ARM_ADD ("bf16", ARM_FEATURE_CORE_HIGH (ARM_EXT2_BF16)),
+  ARM_ADD ("i8mm", ARM_FEATURE_CORE_HIGH (ARM_EXT2_I8MM)),
    ARM_EXT ("crypto", FPU_ARCH_CRYPTO_NEON_VFP_ARMV8_4,
            ARM_FEATURE_COPROC (FPU_CRYPTO_ARMV8)),
  
@@ -28435,19 +31823,37 @@ static const struct arm_ext_table armv85a_ext_table[] =
    { NULL, 0, ARM_ARCH_NONE, ARM_ARCH_NONE }
  };
  
+static const struct arm_ext_table armv86a_ext_table[] =
+{
+  ARM_ADD ("i8mm", ARM_FEATURE_CORE_HIGH (ARM_EXT2_I8MM)),
+  { NULL, 0, ARM_ARCH_NONE, ARM_ARCH_NONE }
+};
+
+#define CDE_EXTENSIONS \
+  ARM_ADD ("cdecp0", ARM_FEATURE_CORE_HIGH (ARM_EXT2_CDE | ARM_EXT2_CDE0)), \
+  ARM_ADD ("cdecp1", ARM_FEATURE_CORE_HIGH (ARM_EXT2_CDE | ARM_EXT2_CDE1)), \
+  ARM_ADD ("cdecp2", ARM_FEATURE_CORE_HIGH (ARM_EXT2_CDE | ARM_EXT2_CDE2)), \
+  ARM_ADD ("cdecp3", ARM_FEATURE_CORE_HIGH (ARM_EXT2_CDE | ARM_EXT2_CDE3)), \
+  ARM_ADD ("cdecp4", ARM_FEATURE_CORE_HIGH (ARM_EXT2_CDE | ARM_EXT2_CDE4)), \
+  ARM_ADD ("cdecp5", ARM_FEATURE_CORE_HIGH (ARM_EXT2_CDE | ARM_EXT2_CDE5)), \
+  ARM_ADD ("cdecp6", ARM_FEATURE_CORE_HIGH (ARM_EXT2_CDE | ARM_EXT2_CDE6)), \
+  ARM_ADD ("cdecp7", ARM_FEATURE_CORE_HIGH (ARM_EXT2_CDE | ARM_EXT2_CDE7))
+
  static const struct arm_ext_table armv8m_main_ext_table[] =
  {
-  ARM_EXT ("dsp", ARM_FEATURE_CORE_LOW (ARM_EXT_V5ExP | ARM_EXT_V6_DSP),
-                 ARM_FEATURE_CORE_LOW (ARM_EXT_V5ExP | ARM_EXT_V6_DSP)),
+  ARM_EXT ("dsp", ARM_FEATURE_CORE_LOW (ARM_AEXT_V8M_MAIN_DSP),
+                 ARM_FEATURE_CORE_LOW (ARM_AEXT_V8M_MAIN_DSP)),
    ARM_EXT ("fp", FPU_ARCH_VFP_V5_SP_D16, ALL_FP),
    ARM_ADD ("fp.dp", FPU_ARCH_VFP_V5D16),
+  CDE_EXTENSIONS,
    { NULL, 0, ARM_ARCH_NONE, ARM_ARCH_NONE }
  };
  
+
  static const struct arm_ext_table armv8_1m_main_ext_table[] =
  {
-  ARM_EXT ("dsp", ARM_FEATURE_CORE_LOW (ARM_EXT_V5ExP | ARM_EXT_V6_DSP),
-                 ARM_FEATURE_CORE_LOW (ARM_EXT_V5ExP | ARM_EXT_V6_DSP)),
+  ARM_EXT ("dsp", ARM_FEATURE_CORE_LOW (ARM_AEXT_V8M_MAIN_DSP),
+                 ARM_FEATURE_CORE_LOW (ARM_AEXT_V8M_MAIN_DSP)),
    ARM_EXT ("fp",
            ARM_FEATURE (0, ARM_EXT2_FP16_INST,
                         FPU_VFP_V5_SP_D16 | FPU_VFP_EXT_FP16 | FPU_VFP_EXT_FMA),
@@ -28455,18 +31861,21 @@ static const struct arm_ext_table armv8_1m_main_ext_table[] =
    ARM_ADD ("fp.dp",
            ARM_FEATURE (0, ARM_EXT2_FP16_INST,
                         FPU_VFP_V5D16 | FPU_VFP_EXT_FP16 | FPU_VFP_EXT_FMA)),
-  ARM_EXT ("mve", ARM_FEATURE_COPROC (FPU_MVE),
-          ARM_FEATURE_COPROC (FPU_MVE | FPU_MVE_FP)),
+  ARM_EXT ("mve", ARM_FEATURE (ARM_AEXT_V8M_MAIN_DSP, ARM_EXT2_MVE, 0),
+          ARM_FEATURE_CORE_HIGH (ARM_EXT2_MVE | ARM_EXT2_MVE_FP)),
    ARM_ADD ("mve.fp",
-          ARM_FEATURE (0, ARM_EXT2_FP16_INST,
-                       FPU_MVE | FPU_MVE_FP | FPU_VFP_V5_SP_D16 |
-                       FPU_VFP_EXT_FP16 | FPU_VFP_EXT_FMA)),
+          ARM_FEATURE (ARM_AEXT_V8M_MAIN_DSP,
+                       ARM_EXT2_FP16_INST | ARM_EXT2_MVE | ARM_EXT2_MVE_FP,
+                       FPU_VFP_V5_SP_D16 | FPU_VFP_EXT_FP16 | FPU_VFP_EXT_FMA)),
+  CDE_EXTENSIONS,
    { NULL, 0, ARM_ARCH_NONE, ARM_ARCH_NONE }
  };
  
+#undef CDE_EXTENSIONS
+
  static const struct arm_ext_table armv8r_ext_table[] =
  {
-  ARM_ADD ("crc", ARCH_CRC_ARMV8),
+  ARM_ADD ("crc", ARM_FEATURE_CORE_HIGH (ARM_EXT2_CRC)),
    ARM_ADD ("simd", FPU_ARCH_NEON_VFP_ARMV8),
    ARM_EXT ("crypto", FPU_ARCH_CRYPTO_NEON_VFP_ARMV8,
            ARM_FEATURE_COPROC (FPU_CRYPTO_ARMV8)),
@@ -28540,6 +31949,7 @@ static const struct arm_arch_option_table arm_archs[] =
    ARM_ARCH_OPT2 ("armv8-r",      ARM_ARCH_V8R,         FPU_ARCH_VFP, armv8r),
    ARM_ARCH_OPT2 ("armv8.4-a",    ARM_ARCH_V8_4A,       FPU_ARCH_VFP, armv84a),
    ARM_ARCH_OPT2 ("armv8.5-a",    ARM_ARCH_V8_5A,       FPU_ARCH_VFP, armv85a),
+  ARM_ARCH_OPT2 ("armv8.6-a",    ARM_ARCH_V8_6A,       FPU_ARCH_VFP, armv86a),
    ARM_ARCH_OPT ("xscale",        ARM_ARCH_XSCALE,      FPU_ARCH_VFP),
    ARM_ARCH_OPT ("iwmmxt",        ARM_ARCH_IWMMXT,      FPU_ARCH_VFP),
    ARM_ARCH_OPT ("iwmmxt2",       ARM_ARCH_IWMMXT2,     FPU_ARCH_VFP),
@@ -28570,7 +31980,8 @@ struct arm_option_extension_value_table
     use the context sensitive approach using arm_ext_table's.  */
  static const struct arm_option_extension_value_table arm_extensions[] =
  {
-  ARM_EXT_OPT ("crc",  ARCH_CRC_ARMV8, ARM_FEATURE_COPROC (CRC_EXT_ARMV8),
+  ARM_EXT_OPT ("crc",   ARM_FEATURE_CORE_HIGH(ARM_EXT2_CRC),
+                        ARM_FEATURE_CORE_HIGH(ARM_EXT2_CRC),
                          ARM_FEATURE_CORE_LOW (ARM_EXT_V8)),
    ARM_EXT_OPT ("crypto", FPU_ARCH_CRYPTO_NEON_VFP_ARMV8,
                          ARM_FEATURE_COPROC (FPU_CRYPTO_ARMV8),
@@ -28906,6 +32317,22 @@ arm_parse_extension (const char *str, const arm_feature_set *opt_set,
    return TRUE;
  }
  
+static bfd_boolean
+arm_parse_fp16_opt (const char *str)
+{
+  if (strcasecmp (str, "ieee") == 0)
+    fp16_format = ARM_FP16_FORMAT_IEEE;
+  else if (strcasecmp (str, "alternative") == 0)
+    fp16_format = ARM_FP16_FORMAT_ALTERNATIVE;
+  else
+    {
+      as_bad (_("unrecognised float16 format \"%s\""), str);
+      return FALSE;
+    }
+
+  return TRUE;
+}
+
  static bfd_boolean
  arm_parse_cpu (const char *str)
  {
@@ -28985,6 +32412,7 @@ arm_parse_arch (const char *str)
           march_ext_opt = XNEW (arm_feature_set);
         *march_ext_opt = arm_arch_none;
         march_fpu_opt = &opt->default_fpu;
+       selected_ctx_ext_table = opt->ext_table;
         strcpy (selected_cpu_name, opt->name);
  
         if (ext != NULL)
@@ -29097,6 +32525,12 @@ struct arm_long_option_table arm_long_opts[] =
     arm_parse_it_mode, NULL},
    {"mccs", N_("\t\t\t  TI CodeComposer Studio syntax compatibility mode"),
     arm_ccs_mode, NULL},
+  {"mfp16-format=",
+   N_("[ieee|alternative]\n\
+                          set the encoding for half precision floating point "
+                         "numbers to IEEE\n\
+                          or Arm alternative format."),
+   arm_parse_fp16_opt, NULL },
    {NULL, NULL, 0, NULL}
  };
  
@@ -29298,7 +32732,8 @@ static const cpu_arch_ver_table cpu_arch_ver[] =
      {TAG_CPU_ARCH_V8,        ARM_ARCH_V8_4A},
      {TAG_CPU_ARCH_V8,        ARM_ARCH_V8_5A},
      {TAG_CPU_ARCH_V8_1M_MAIN, ARM_ARCH_V8_1M_MAIN},
-    {-1,                     ARM_ARCH_NONE}
+    {TAG_CPU_ARCH_V8,      ARM_ARCH_V8_6A},
+    {-1,                   ARM_ARCH_NONE}
  };
  
  /* Set an attribute if it has not already been set by the user.  */
@@ -29453,14 +32888,16 @@ get_aeabi_cpu_arch_from_fset (const arm_feature_set *arch_ext_fset,
    if (p_ver_ret == NULL)
      return -1;
  
-found:
+ found:
    /* Tag_CPU_arch_profile.  */
-  if (ARM_CPU_HAS_FEATURE (p_ver_ret->flags, arm_ext_v7a)
-      || ARM_CPU_HAS_FEATURE (p_ver_ret->flags, arm_ext_v8)
-      || (ARM_CPU_HAS_FEATURE (p_ver_ret->flags, arm_ext_atomics)
-         && !ARM_CPU_HAS_FEATURE (p_ver_ret->flags, arm_ext_v8m_m_only)))
+  if (!ARM_CPU_HAS_FEATURE (p_ver_ret->flags, arm_ext_v8r)
+      && (ARM_CPU_HAS_FEATURE (p_ver_ret->flags, arm_ext_v7a)
+          || ARM_CPU_HAS_FEATURE (p_ver_ret->flags, arm_ext_v8)
+          || (ARM_CPU_HAS_FEATURE (p_ver_ret->flags, arm_ext_atomics)
+              && !ARM_CPU_HAS_FEATURE (p_ver_ret->flags, arm_ext_v8m_m_only))))
      *profile = 'A';
-  else if (ARM_CPU_HAS_FEATURE (p_ver_ret->flags, arm_ext_v7r))
+  else if (ARM_CPU_HAS_FEATURE (p_ver_ret->flags, arm_ext_v7r)
+      || ARM_CPU_HAS_FEATURE (p_ver_ret->flags, arm_ext_v8r))
      *profile = 'R';
    else if (ARM_CPU_HAS_FEATURE (p_ver_ret->flags, arm_ext_m))
      *profile = 'M';
@@ -29678,6 +33115,9 @@ aeabi_set_public_attributes (void)
      virt_sec |= 2;
    if (virt_sec != 0)
      aeabi_set_attribute_int (Tag_Virtualization_use, virt_sec);
+
+  if (fp16_format != ARM_FP16_FORMAT_DEFAULT)
+    aeabi_set_attribute_int (Tag_ABI_FP_16bit_format, fp16_format);
  }
  
  /* Post relaxation hook.  Recompute ARM attributes now that relaxation is
@@ -29768,6 +33208,7 @@ s_arm_arch (int ignored ATTRIBUTE_UNUSED)
      if (streq (opt->name, name))
        {
         selected_arch = opt->value;
+       selected_ctx_ext_table = opt->ext_table;
         selected_ext = arm_arch_none;
         selected_cpu = selected_arch;
         strcpy (selected_cpu_name, opt->name);
@@ -29835,6 +33276,35 @@ s_arm_arch_extension (int ignored ATTRIBUTE_UNUSED)
        name += 2;
      }
  
+  /* Check the context-specific extension table first.  */
+  if (selected_ctx_ext_table)
+    {
+      const struct arm_ext_table * ext_opt;
+      for (ext_opt = selected_ctx_ext_table; ext_opt->name != NULL; ext_opt++)
+        {
+          if (streq (ext_opt->name, name))
+           {
+             if (adding_value)
+               {
+                 if (ARM_FEATURE_ZERO (ext_opt->merge))
+                   /* TODO: Option not supported.  When we remove the
+                   legacy table this case should error out.  */
+                   continue;
+                 ARM_MERGE_FEATURE_SETS (selected_ext, selected_ext,
+                                         ext_opt->merge);
+               }
+             else
+               ARM_CLEAR_FEATURE (selected_ext, selected_ext, ext_opt->clear);
+
+             ARM_MERGE_FEATURE_SETS (selected_cpu, selected_arch, selected_ext);
+             ARM_MERGE_FEATURE_SETS (cpu_variant, selected_cpu, selected_fpu);
+             *input_line_pointer = saved_char;
+             demand_empty_rest_of_line ();
+             return;
+           }
+       }
+    }
+
    for (opt = arm_extensions; opt->name != NULL; opt++)
      if (streq (opt->name, name))
        {
@@ -29899,6 +33369,7 @@ s_arm_fpu (int ignored ATTRIBUTE_UNUSED)
      if (streq (opt->name, name))
        {
         selected_fpu = opt->value;
+       ARM_CLEAR_FEATURE (selected_cpu, selected_cpu, fpu_any);
  #ifndef CPU_DEFAULT
         if (no_cpu_selected ())
           ARM_MERGE_FEATURE_SETS (cpu_variant, arm_arch_any, selected_fpu);