gas/

[deliverable/binutils-gdb.git] / gas / config / tc-i386.c
diff --git a/gas/config/tc-i386.c b/gas/config/tc-i386.c

index 26684c8e9847fa7b8fd1ed035d483cbba01715ef..3d935c1bd8d90a3db88e95897a409438d55dafb5 100644 (file)
--- a/gas/config/tc-i386.c
+++ b/gas/config/tc-i386.c
@@ -7,7 +7,7 @@
  
     GAS is free software; you can redistribute it and/or modify
     it under the terms of the GNU General Public License as published by
-   the Free Software Foundation; either version 2, or (at your option)
+   the Free Software Foundation; either version 3, or (at your option)
     any later version.
  
     GAS is distributed in the hope that it will be useful,
@@ -32,7 +32,6 @@
  #include "subsegs.h"
  #include "dwarf2dbg.h"
  #include "dw2gencfi.h"
-#include "opcodes/i386-opc.h"
  #include "elf/x86-64.h"
  
  #ifndef REGISTER_WARNINGS
@@ -432,7 +431,7 @@ static const arch_entry cpu_arch[] =
     Cpu186},
    {"i286", PROCESSOR_UNKNOWN,
     Cpu186|Cpu286},
-  {"i386", PROCESSOR_GENERIC32,
+  {"i386", PROCESSOR_I386,
     Cpu186|Cpu286|Cpu386},
    {"i486", PROCESSOR_I486,
     Cpu186|Cpu286|Cpu386|Cpu486},
@@ -499,6 +498,12 @@ static const arch_entry cpu_arch[] =
     CpuMMX|CpuMMX2|CpuSSE|CpuSSE2|CpuSSE3},
    {".ssse3", PROCESSOR_UNKNOWN,
     CpuMMX|CpuMMX2|CpuSSE|CpuSSE2|CpuSSE3|CpuSSSE3},
+  {".sse4.1", PROCESSOR_UNKNOWN,
+   CpuMMX|CpuMMX2|CpuSSE|CpuSSE2|CpuSSE3|CpuSSSE3|CpuSSE4_1},
+  {".sse4.2", PROCESSOR_UNKNOWN,
+   CpuMMX|CpuMMX2|CpuSSE|CpuSSE2|CpuSSE3|CpuSSSE3|CpuSSE4},
+  {".sse4", PROCESSOR_UNKNOWN,
+   CpuMMX|CpuMMX2|CpuSSE|CpuSSE2|CpuSSE3|CpuSSSE3|CpuSSE4},
    {".3dnow", PROCESSOR_UNKNOWN,
     CpuMMX|Cpu3dnow},
    {".3dnowa", PROCESSOR_UNKNOWN,
@@ -603,9 +608,6 @@ i386_align_code (fragS *fragP, int count)
    static const char f32_14[] =
      {0x8d,0xb4,0x26,0x00,0x00,0x00,0x00,       /* leal 0L(%esi,1),%esi */
       0x8d,0xbc,0x27,0x00,0x00,0x00,0x00};      /* leal 0L(%edi,1),%edi */
-  static const char f32_15[] =
-    {0xeb,0x0d,0x90,0x90,0x90,0x90,0x90,       /* jmp .+15; lotsa nops */
-     0x90,0x90,0x90,0x90,0x90,0x90,0x90,0x90};
    static const char f16_3[] =
      {0x8d,0x74,0x00};                          /* lea 0(%esi),%esi     */
    static const char f16_4[] =
@@ -622,13 +624,17 @@ i386_align_code (fragS *fragP, int count)
    static const char f16_8[] =
      {0x8d,0xb4,0x00,0x00,                      /* lea 0w(%si),%si      */
       0x8d,0xbd,0x00,0x00};                     /* lea 0w(%di),%di      */
+  static const char jump_31[] =
+    {0xeb,0x1d,0x90,0x90,0x90,0x90,0x90,       /* jmp .+31; lotsa nops */
+     0x90,0x90,0x90,0x90,0x90,0x90,0x90,0x90,
+     0x90,0x90,0x90,0x90,0x90,0x90,0x90,0x90,
+     0x90,0x90,0x90,0x90,0x90,0x90,0x90,0x90};
    static const char *const f32_patt[] = {
      f32_1, f32_2, f32_3, f32_4, f32_5, f32_6, f32_7, f32_8,
-    f32_9, f32_10, f32_11, f32_12, f32_13, f32_14, f32_15
+    f32_9, f32_10, f32_11, f32_12, f32_13, f32_14
    };
    static const char *const f16_patt[] = {
-    f32_1, f32_2, f16_3, f16_4, f16_5, f16_6, f16_7, f16_8,
-    f32_15, f32_15, f32_15, f32_15, f32_15, f32_15, f32_15
+    f32_1, f32_2, f16_3, f16_4, f16_5, f16_6, f16_7, f16_8
    };
    /* nopl (%[re]ax) */
    static const char alt_3[] =
@@ -735,57 +741,40 @@ i386_align_code (fragS *fragP, int count)
      alt_long_14, alt_long_15
    };
  
-  if (count <= 0 || count > 15)
+  /* Only align for at least a positive non-zero boundary. */
+  if (count <= 0 || count > MAX_MEM_FOR_RS_ALIGN_CODE)
      return;
  
    /* We need to decide which NOP sequence to use for 32bit and
       64bit. When -mtune= is used:
  
-     1. For PROCESSOR_I486, PROCESSOR_PENTIUM and PROCESSOR_GENERIC32,
-     f32_patt will be used.
-     2. For PROCESSOR_K8 and PROCESSOR_AMDFAM10 in 64bit, NOPs with
-     0x66 prefix will be used.
-     3. For PROCESSOR_CORE2, alt_long_patt will be used.
-     4. For PROCESSOR_PENTIUMPRO, PROCESSOR_PENTIUM4, PROCESSOR_NOCONA,
-     PROCESSOR_CORE, PROCESSOR_CORE2, PROCESSOR_K6, PROCESSOR_ATHLON
-     and PROCESSOR_GENERIC64, alt_short_patt will be used.
+     1. For PROCESSOR_I386, PROCESSOR_I486, PROCESSOR_PENTIUM and
+     PROCESSOR_GENERIC32, f32_patt will be used.
+     2. For PROCESSOR_PENTIUMPRO, PROCESSOR_PENTIUM4, PROCESSOR_NOCONA,
+     PROCESSOR_CORE, PROCESSOR_CORE2, and PROCESSOR_GENERIC64,
+     alt_long_patt will be used.
+     3. For PROCESSOR_ATHLON, PROCESSOR_K6, PROCESSOR_K8 and
+     PROCESSOR_AMDFAM10, alt_short_patt will be used.
  
-     When -mtune= isn't used, alt_short_patt will be used if
-     cpu_arch_isa_flags has Cpu686. Otherwise, f32_patt will be used.
+     When -mtune= isn't used, alt_long_patt will be used if
+     cpu_arch_isa_flags has Cpu686. Otherwise, f32_patt will
+     be used.
  
       When -march= or .arch is used, we can't use anything beyond
       cpu_arch_isa_flags.   */
  
    if (flag_code == CODE_16BIT)
      {
-      memcpy (fragP->fr_literal + fragP->fr_fix,
-             f16_patt[count - 1], count);
        if (count > 8)
-       /* Adjust jump offset.  */
-       fragP->fr_literal[fragP->fr_fix + 1] = count - 2;
-    }
-  else if (flag_code == CODE_64BIT && cpu_arch_tune == PROCESSOR_K8)
-    {
-      int i;
-      int nnops = (count + 3) / 4;
-      int len = count / nnops;
-      int remains = count - nnops * len;
-      int pos = 0;
-
-      /* The recommended way to pad 64bit code is to use NOPs preceded
-        by maximally four 0x66 prefixes.  Balance the size of nops.  */
-      for (i = 0; i < remains; i++)
-       {
-         memset (fragP->fr_literal + fragP->fr_fix + pos, 0x66, len);
-         fragP->fr_literal[fragP->fr_fix + pos + len] = 0x90;
-         pos += len + 1;
-       }
-      for (; i < nnops; i++)
         {
-         memset (fragP->fr_literal + fragP->fr_fix + pos, 0x66, len - 1);
-         fragP->fr_literal[fragP->fr_fix + pos + len - 1] = 0x90;
-         pos += len;
+         memcpy (fragP->fr_literal + fragP->fr_fix,
+                 jump_31, count);
+         /* Adjust jump offset.  */
+         fragP->fr_literal[fragP->fr_fix + 1] = count - 2;
         }
+      else
+       memcpy (fragP->fr_literal + fragP->fr_fix,
+               f16_patt[count - 1], count);
      }
    else
      {
@@ -800,24 +789,25 @@ i386_align_code (fragS *fragP, int count)
               /* We use cpu_arch_isa_flags to check if we SHOULD
                  optimize for Cpu686.  */
               if ((cpu_arch_isa_flags & Cpu686) != 0)
-               patt = alt_short_patt;
+               patt = alt_long_patt;
               else
                 patt = f32_patt;
               break;
-           case PROCESSOR_CORE2:
-             patt = alt_long_patt;
-             break;
             case PROCESSOR_PENTIUMPRO:
             case PROCESSOR_PENTIUM4:
             case PROCESSOR_NOCONA:
             case PROCESSOR_CORE:
+           case PROCESSOR_CORE2:
+           case PROCESSOR_GENERIC64:
+             patt = alt_long_patt;
+             break;
             case PROCESSOR_K6:
             case PROCESSOR_ATHLON:
             case PROCESSOR_K8:
-           case PROCESSOR_GENERIC64:
             case PROCESSOR_AMDFAM10:
               patt = alt_short_patt;
               break;
+           case PROCESSOR_I386:
             case PROCESSOR_I486:
             case PROCESSOR_PENTIUM:
             case PROCESSOR_GENERIC32:
@@ -835,12 +825,9 @@ i386_align_code (fragS *fragP, int count)
               abort ();
               break;
  
+           case PROCESSOR_I386:
             case PROCESSOR_I486:
             case PROCESSOR_PENTIUM:
-           case PROCESSOR_PENTIUMPRO:
-           case PROCESSOR_PENTIUM4:
-           case PROCESSOR_NOCONA:
-           case PROCESSOR_CORE:
             case PROCESSOR_K6:
             case PROCESSOR_ATHLON:
             case PROCESSOR_K8:
@@ -853,6 +840,10 @@ i386_align_code (fragS *fragP, int count)
               else
                 patt = f32_patt;
               break;
+           case PROCESSOR_PENTIUMPRO:
+           case PROCESSOR_PENTIUM4:
+           case PROCESSOR_NOCONA:
+           case PROCESSOR_CORE:
             case PROCESSOR_CORE2:
               if ((cpu_arch_isa_flags & Cpu686) != 0)
                 patt = alt_long_patt;
@@ -860,13 +851,44 @@ i386_align_code (fragS *fragP, int count)
                 patt = f32_patt;
               break;
             case PROCESSOR_GENERIC64:
-             patt = alt_short_patt;
+             patt = alt_long_patt;
               break;
             }
         }
  
-      memcpy (fragP->fr_literal + fragP->fr_fix,
-             patt[count - 1], count);
+      if (patt == f32_patt)
+       {
+         /* If the padding is less than 15 bytes, we use the normal
+            ones.  Otherwise, we use a jump instruction and adjust
+            its offset.  */
+         if (count < 15)
+           memcpy (fragP->fr_literal + fragP->fr_fix,
+                   patt[count - 1], count);
+         else
+           {
+             memcpy (fragP->fr_literal + fragP->fr_fix,
+                     jump_31, count);
+             /* Adjust jump offset.  */
+             fragP->fr_literal[fragP->fr_fix + 1] = count - 2;
+           }
+       }
+      else
+       {
+         /* Maximum length of an instruction is 15 bytes.  If the
+            padding is greater than 15 bytes and we don't use jump,
+            we have to break it into smaller pieces.  */
+         int padding = count;
+         while (padding > 15)
+           {
+             padding -= 15;
+             memcpy (fragP->fr_literal + fragP->fr_fix + padding,
+                     patt [14], 15);
+           }
+
+         if (padding)
+           memcpy (fragP->fr_literal + fragP->fr_fix,
+                   patt [padding - 1], padding);
+       }
      }
    fragP->fr_var = count;
  }
@@ -992,9 +1014,9 @@ add_prefix (unsigned int prefix)
    if (prefix >= REX_OPCODE && prefix < REX_OPCODE + 16
        && flag_code == CODE_64BIT)
      {
-      if ((i.prefix[REX_PREFIX] & prefix & REX_MODE64)
-         || ((i.prefix[REX_PREFIX] & (REX_EXTX | REX_EXTY | REX_EXTZ))
-             && (prefix & (REX_EXTX | REX_EXTY | REX_EXTZ))))
+      if ((i.prefix[REX_PREFIX] & prefix & REX_W)
+         || ((i.prefix[REX_PREFIX] & (REX_R | REX_X | REX_B))
+             && (prefix & (REX_R | REX_X | REX_B))))
         ret = 0;
        q = REX_PREFIX;
      }
@@ -1238,8 +1260,9 @@ md_begin ()
    reg_hash = hash_new ();
    {
      const reg_entry *regtab;
+    unsigned int regtab_size = i386_regtab_size;
  
-    for (regtab = i386_regtab; regtab->reg_name != NULL; regtab++)
+    for (regtab = i386_regtab; regtab_size--; regtab++)
        {
         hash_err = hash_insert (reg_hash, regtab->reg_name, (PTR) regtab);
         if (hash_err)
@@ -1294,6 +1317,7 @@ md_begin ()
  #endif
      digit_chars['-'] = '-';
      mnemonic_chars['-'] = '-';
+    mnemonic_chars['.'] = '.';
      identifier_chars['_'] = '_';
      identifier_chars['.'] = '.';
  
@@ -1353,10 +1377,10 @@ pi (char *line, i386_insn *x)
    fprintf (stdout, "  sib:  base %x  index %x  scale %x\n",
            x->sib.base, x->sib.index, x->sib.scale);
    fprintf (stdout, "  rex: 64bit %x  extX %x  extY %x  extZ %x\n",
-          (x->rex & REX_MODE64) != 0,
-          (x->rex & REX_EXTX) != 0,
-          (x->rex & REX_EXTY) != 0,
-          (x->rex & REX_EXTZ) != 0);
+          (x->rex & REX_W) != 0,
+          (x->rex & REX_R) != 0,
+          (x->rex & REX_X) != 0,
+          (x->rex & REX_B) != 0);
    for (i = 0; i < x->operands; i++)
      {
        fprintf (stdout, "    #%d:  ", i + 1);
@@ -1828,8 +1852,11 @@ md_assemble (line)
  
           for (x = 0; x < i.operands; x++)
             if (i.op[x].regs->reg_num != x)
-             as_bad (_("can't use register '%%%s' as operand %d in '%s'."),
-                     i.op[x].regs->reg_name, x + 1, i.tm.name);
+             as_bad (_("can't use register '%s%s' as operand %d in '%s'."),
+                     register_prefix,
+                     i.op[x].regs->reg_name,
+                     x + 1,
+                     i.tm.name);
           i.operands = 0;
         }
  
@@ -1878,7 +1905,7 @@ md_assemble (line)
      }
  
    if ((i.tm.opcode_modifier & Rex64) != 0)
-    i.rex |= REX_MODE64;
+    i.rex |= REX_W;
  
    /* For 8 bit registers we need an empty rex prefix.  Also if the
       instruction already has a prefix, we need to convert old
@@ -1902,9 +1929,9 @@ md_assemble (line)
             {
               /* In case it is "hi" register, give up.  */
               if (i.op[x].regs->reg_num > 3)
-               as_bad (_("can't encode register '%%%s' in an "
+               as_bad (_("can't encode register '%s%s' in an "
                           "instruction requiring REX prefix."),
-                       i.op[x].regs->reg_name);
+                       register_prefix, i.op[x].regs->reg_name);
  
               /* Otherwise it is equivalent to the extended register.
                  Since the encoding doesn't change this is merely
@@ -2621,6 +2648,15 @@ match_template (void)
             continue;
           break;
         case 2:
+         /* xchg %eax, %eax is a special case. It is an alias for nop
+            only in 32bit mode and we can use opcode 0x90.  In 64bit
+            mode, we can't use 0x90 for xchg %eax, %eax since it should
+            zero-extend %eax to %rax.  */
+         if (flag_code == CODE_64BIT
+             && t->base_opcode == 0x90
+             && i.types [0] == (Acc | Reg32)
+             && i.types [1] == (Acc | Reg32))
+           continue;
         case 3:
         case 4:
           overlap1 = i.types[1] & operand_types[1];
@@ -2628,9 +2664,10 @@ match_template (void)
               || !MATCH (overlap1, i.types[1], operand_types[1])
               /* monitor in SSE3 is a very special case.  The first
                  register and the second register may have different
-                sizes.  */
+                sizes.  The same applies to crc32 in SSE4.2.  */
               || !((t->base_opcode == 0x0f01
                     && t->extension_opcode == 0xc8)
+                  || t->base_opcode == 0xf20f38f1
                    || CONSISTENT_REGISTER_MATCH (overlap0, i.types[0],
                                                  operand_types[0],
                                                  overlap1, i.types[1],
@@ -2817,19 +2854,44 @@ process_suffix (void)
         {
           /* We take i.suffix from the last register operand specified,
              Destination register type is more significant than source
-            register type.  */
-         int op;
-
-         for (op = i.operands; --op >= 0;)
-           if ((i.types[op] & Reg)
-               && !(i.tm.operand_types[op] & InOutPortReg))
-             {
-               i.suffix = ((i.types[op] & Reg8) ? BYTE_MNEM_SUFFIX :
-                           (i.types[op] & Reg16) ? WORD_MNEM_SUFFIX :
-                           (i.types[op] & Reg64) ? QWORD_MNEM_SUFFIX :
+            register type.  crc32 in SSE4.2 prefers source register
+            type. */
+         if (i.tm.base_opcode == 0xf20f38f1)
+           {
+             if ((i.types[0] & Reg))
+               i.suffix = ((i.types[0] & Reg16) ? WORD_MNEM_SUFFIX :
                             LONG_MNEM_SUFFIX);
-               break;
-             }
+           }
+         else if (i.tm.base_opcode == 0xf20f38f0)
+           {
+             if ((i.types[0] & Reg8))
+               i.suffix = BYTE_MNEM_SUFFIX;
+           }
+
+         if (!i.suffix)
+           {
+             int op;
+
+             if (i.tm.base_opcode == 0xf20f38f1
+                 || i.tm.base_opcode == 0xf20f38f0)
+               {
+                 /* We have to know the operand size for crc32.  */
+                 as_bad (_("ambiguous memory operand size for `%s`"),
+                         i.tm.name);
+                 return 0;
+               }
+
+             for (op = i.operands; --op >= 0;)
+               if ((i.types[op] & Reg)
+                   && !(i.tm.operand_types[op] & InOutPortReg))
+                 {
+                   i.suffix = ((i.types[op] & Reg8) ? BYTE_MNEM_SUFFIX :
+                               (i.types[op] & Reg16) ? WORD_MNEM_SUFFIX :
+                               (i.types[op] & Reg64) ? QWORD_MNEM_SUFFIX :
+                               LONG_MNEM_SUFFIX);
+                   break;
+                 }
+           }
         }
        else if (i.suffix == BYTE_MNEM_SUFFIX)
         {
@@ -2974,8 +3036,8 @@ process_suffix (void)
           if (i.operands != 2
               || i.types [0] != (Acc | Reg64)
               || i.types [1] != (Acc | Reg64)
-             || strcmp (i.tm.name, "xchg") != 0)
-         i.rex |= REX_MODE64;
+             || i.tm.base_opcode != 0x90)
+           i.rex |= REX_W;
         }
  
        /* Size floating point instruction.  */
@@ -3009,6 +3071,10 @@ check_byte_reg (void)
               || i.tm.base_opcode == 0xfbf))
         continue;
  
+      /* crc32 doesn't generate this warning.  */
+      if (i.tm.base_opcode == 0xf20f38f0)
+       continue;
+
        if ((i.types[op] & WordReg) && i.op[op].regs->reg_num < 4)
         {
           /* Prohibit these changes in the 64bit mode, since the
@@ -3024,10 +3090,12 @@ check_byte_reg (void)
  #if REGISTER_WARNINGS
           if (!quiet_warnings
               && (i.tm.operand_types[op] & InOutPortReg) == 0)
-           as_warn (_("using `%%%s' instead of `%%%s' due to `%c' suffix"),
+           as_warn (_("using `%s%s' instead of `%s%s' due to `%c' suffix"),
+                    register_prefix,
                      (i.op[op].regs + (i.types[op] & Reg16
                                        ? REGNAM_AL - REGNAM_AX
                                        : REGNAM_AL - REGNAM_EAX))->reg_name,
+                    register_prefix,
                      i.op[op].regs->reg_name,
                      i.suffix);
  #endif
@@ -3039,7 +3107,8 @@ check_byte_reg (void)
                          | Control | Debug | Test
                          | FloatReg | FloatAcc))
         {
-         as_bad (_("`%%%s' not allowed with `%s%c'"),
+         as_bad (_("`%s%s' not allowed with `%s%c'"),
+                 register_prefix,
                   i.op[op].regs->reg_name,
                   i.tm.name,
                   i.suffix);
@@ -3060,7 +3129,8 @@ check_long_reg (void)
      if ((i.types[op] & Reg8) != 0
         && (i.tm.operand_types[op] & (Reg16 | Reg32 | Acc)) != 0)
        {
-       as_bad (_("`%%%s' not allowed with `%s%c'"),
+       as_bad (_("`%s%s' not allowed with `%s%c'"),
+               register_prefix,
                 i.op[op].regs->reg_name,
                 i.tm.name,
                 i.suffix);
@@ -3082,8 +3152,10 @@ check_long_reg (void)
           }
  #if REGISTER_WARNINGS
         else
-         as_warn (_("using `%%%s' instead of `%%%s' due to `%c' suffix"),
+         as_warn (_("using `%s%s' instead of `%s%s' due to `%c' suffix"),
+                  register_prefix,
                    (i.op[op].regs + REGNAM_EAX - REGNAM_AX)->reg_name,
+                  register_prefix,
                    i.op[op].regs->reg_name,
                    i.suffix);
  #endif
@@ -3111,7 +3183,8 @@ check_qword_reg (void)
      if ((i.types[op] & Reg8) != 0
         && (i.tm.operand_types[op] & (Reg16 | Reg32 | Acc)) != 0)
        {
-       as_bad (_("`%%%s' not allowed with `%s%c'"),
+       as_bad (_("`%s%s' not allowed with `%s%c'"),
+               register_prefix,
                 i.op[op].regs->reg_name,
                 i.tm.name,
                 i.suffix);
@@ -3142,7 +3215,8 @@ check_word_reg (void)
      if ((i.types[op] & Reg8) != 0
         && (i.tm.operand_types[op] & (Reg16 | Reg32 | Acc)) != 0)
        {
-       as_bad (_("`%%%s' not allowed with `%s%c'"),
+       as_bad (_("`%s%s' not allowed with `%s%c'"),
+               register_prefix,
                 i.op[op].regs->reg_name,
                 i.tm.name,
                 i.suffix);
@@ -3164,8 +3238,10 @@ check_word_reg (void)
           }
         else
  #if REGISTER_WARNINGS
-         as_warn (_("using `%%%s' instead of `%%%s' due to `%c' suffix"),
+         as_warn (_("using `%s%s' instead of `%s%s' due to `%c' suffix"),
+                  register_prefix,
                    (i.op[op].regs + REGNAM_AX - REGNAM_EAX)->reg_name,
+                  register_prefix,
                    i.op[op].regs->reg_name,
                    i.suffix);
  #endif
@@ -3265,16 +3341,49 @@ process_operands (void)
    /* The imul $imm, %reg instruction is converted into
       imul $imm, %reg, %reg, and the clr %reg instruction
       is converted into xor %reg, %reg.  */
-  if (i.tm.opcode_modifier & regKludge)
-    {
-      unsigned int first_reg_op = (i.types[0] & Reg) ? 0 : 1;
-      /* Pretend we saw the extra register operand.  */
-      assert (i.reg_operands == 1
-             && i.op[first_reg_op + 1].regs == 0);
-      i.op[first_reg_op + 1].regs = i.op[first_reg_op].regs;
-      i.types[first_reg_op + 1] = i.types[first_reg_op];
-      i.operands++;
-      i.reg_operands++;
+  if (i.tm.opcode_modifier & RegKludge)
+    {
+       if ((i.tm.cpu_flags & CpuSSE4_1))
+        {
+          /* The first operand in instruction blendvpd, blendvps and
+             pblendvb in SSE4.1 is implicit and must be xmm0.  */
+          assert (i.operands == 3
+                  && i.reg_operands >= 2
+                  && i.types[0] == RegXMM);
+          if (i.op[0].regs->reg_num != 0)
+            {
+              if (intel_syntax)
+                as_bad (_("the last operand of `%s' must be `%sxmm0'"),
+                        i.tm.name, register_prefix);
+              else
+                as_bad (_("the first operand of `%s' must be `%sxmm0'"),
+                        i.tm.name, register_prefix);
+              return 0;
+            }
+          i.op[0] = i.op[1];
+          i.op[1] = i.op[2];
+          i.types[0] = i.types[1];
+          i.types[1] = i.types[2];
+          i.operands--;
+          i.reg_operands--;
+
+          /* We need to adjust fields in i.tm since they are used by
+             build_modrm_byte.  */
+          i.tm.operand_types [0] = i.tm.operand_types [1];
+          i.tm.operand_types [1] = i.tm.operand_types [2];
+          i.tm.operands--;
+        }
+       else
+        {
+          unsigned int first_reg_op = (i.types[0] & Reg) ? 0 : 1;
+          /* Pretend we saw the extra register operand.  */
+          assert (i.reg_operands == 1
+                  && i.op[first_reg_op + 1].regs == 0);
+          i.op[first_reg_op + 1].regs = i.op[first_reg_op].regs;
+          i.types[first_reg_op + 1] = i.types[first_reg_op];
+          i.operands++;
+          i.reg_operands++;
+        }
      }
  
    if (i.tm.opcode_modifier & ShortForm)
@@ -3289,7 +3398,7 @@ process_operands (void)
             }
           i.tm.base_opcode |= (i.op[0].regs->reg_num << 3);
           if ((i.op[0].regs->reg_flags & RegRex) != 0)
-           i.rex |= REX_EXTZ;
+           i.rex |= REX_B;
         }
        else
         {
@@ -3298,7 +3407,7 @@ process_operands (void)
           /* Register goes in low 3 bits of opcode.  */
           i.tm.base_opcode |= i.op[op].regs->reg_num;
           if ((i.op[op].regs->reg_flags & RegRex) != 0)
-           i.rex |= REX_EXTZ;
+           i.rex |= REX_B;
           if (!quiet_warnings && (i.tm.opcode_modifier & Ugh) != 0)
             {
               /* Warn about some common errors, but press on regardless.
@@ -3306,15 +3415,15 @@ process_operands (void)
               if (i.operands == 2)
                 {
                   /* Reversed arguments on faddp, fsubp, etc.  */
-                 as_warn (_("translating to `%s %%%s,%%%s'"), i.tm.name,
-                          i.op[1].regs->reg_name,
-                          i.op[0].regs->reg_name);
+                 as_warn (_("translating to `%s %s%s,%s%s'"), i.tm.name,
+                          register_prefix, i.op[1].regs->reg_name,
+                          register_prefix, i.op[0].regs->reg_name);
                 }
               else
                 {
                   /* Extraneous `l' suffix on fp insn.  */
-                 as_warn (_("translating to `%s %%%s'"), i.tm.name,
-                          i.op[0].regs->reg_name);
+                 as_warn (_("translating to `%s %s%s'"), i.tm.name,
+                          register_prefix, i.op[0].regs->reg_name);
                 }
             }
         }
@@ -3402,29 +3511,29 @@ build_modrm_byte (void)
          destination operand, then we assume the source operand may
          sometimes be a memory operand and so we need to store the
          destination in the i.rm.reg field.  */
-      if ((i.tm.operand_types[dest] & AnyMem) == 0)
+      if ((i.tm.operand_types[dest] & (AnyMem | RegMem)) == 0)
         {
           i.rm.reg = i.op[dest].regs->reg_num;
           i.rm.regmem = i.op[source].regs->reg_num;
           if ((i.op[dest].regs->reg_flags & RegRex) != 0)
-           i.rex |= REX_EXTX;
+           i.rex |= REX_R;
           if ((i.op[source].regs->reg_flags & RegRex) != 0)
-           i.rex |= REX_EXTZ;
+           i.rex |= REX_B;
         }
        else
         {
           i.rm.reg = i.op[source].regs->reg_num;
           i.rm.regmem = i.op[dest].regs->reg_num;
           if ((i.op[dest].regs->reg_flags & RegRex) != 0)
-           i.rex |= REX_EXTZ;
+           i.rex |= REX_B;
           if ((i.op[source].regs->reg_flags & RegRex) != 0)
-           i.rex |= REX_EXTX;
+           i.rex |= REX_R;
         }
-      if (flag_code != CODE_64BIT && (i.rex & (REX_EXTX | REX_EXTZ)))
+      if (flag_code != CODE_64BIT && (i.rex & (REX_R | REX_B)))
         {
           if (!((i.types[0] | i.types[1]) & Control))
             abort ();
-         i.rex &= ~(REX_EXTX | REX_EXTZ);
+         i.rex &= ~(REX_R | REX_B);
           add_prefix (LOCK_PREFIX_OPCODE);
         }
      }
@@ -3486,7 +3595,7 @@ build_modrm_byte (void)
                   else
                     i.types[op] |= Disp32S;
                   if ((i.index_reg->reg_flags & RegRex) != 0)
-                   i.rex |= REX_EXTY;
+                   i.rex |= REX_X;
                 }
             }
           /* RIP addressing for 64bit mode.  */
@@ -3539,7 +3648,7 @@ build_modrm_byte (void)
  
               i.rm.regmem = i.base_reg->reg_num;
               if ((i.base_reg->reg_flags & RegRex) != 0)
-               i.rex |= REX_EXTZ;
+               i.rex |= REX_B;
               i.sib.base = i.base_reg->reg_num;
               /* x86-64 ignores REX prefix bit here to avoid decoder
                  complications.  */
@@ -3576,7 +3685,7 @@ build_modrm_byte (void)
                   i.sib.index = i.index_reg->reg_num;
                   i.rm.regmem = ESCAPE_TO_TWO_BYTE_ADDRESSING;
                   if ((i.index_reg->reg_flags & RegRex) != 0)
-                   i.rex |= REX_EXTY;
+                   i.rex |= REX_X;
                 }
  
               if (i.disp_operands
@@ -3624,13 +3733,13 @@ build_modrm_byte (void)
             {
               i.rm.regmem = i.op[op].regs->reg_num;
               if ((i.op[op].regs->reg_flags & RegRex) != 0)
-               i.rex |= REX_EXTZ;
+               i.rex |= REX_B;
             }
           else
             {
               i.rm.reg = i.op[op].regs->reg_num;
               if ((i.op[op].regs->reg_flags & RegRex) != 0)
-               i.rex |= REX_EXTX;
+               i.rex |= REX_R;
             }
  
           /* Now, if no memory operand has set i.rm.mode = 0, 1, 2 we
@@ -3883,11 +3992,12 @@ output_insn (void)
        unsigned char *q;
        unsigned int prefix;
  
-      /* All opcodes on i386 have either 1 or 2 bytes.  Supplemental
-        Streaming SIMD extensions 3 Instructions have 3 bytes.  We may
-        use one more higher byte to specify a prefix the instruction
-        requires.  */
-      if ((i.tm.cpu_flags & CpuSSSE3) != 0)
+      /* All opcodes on i386 have either 1 or 2 bytes.  SSSE3 and
+        SSE4 instructions have 3 bytes.  We may use one more higher
+        byte to specify a prefix the instruction requires.  Exclude
+        instructions which are in both SSE4 and ABM.  */
+      if ((i.tm.cpu_flags & (CpuSSSE3 | CpuSSE4)) != 0
+         && (i.tm.cpu_flags & CpuABM) == 0)
         {
           if (i.tm.base_opcode & 0xff000000)
             {
@@ -3928,7 +4038,8 @@ output_insn (void)
         }
        else
         {
-         if ((i.tm.cpu_flags & CpuSSSE3) != 0)
+         if ((i.tm.cpu_flags & (CpuSSSE3 | CpuSSE4)) != 0
+             && (i.tm.cpu_flags & CpuABM) == 0)
             {
               p = frag_more (3);
               *p++ = (i.tm.base_opcode >> 16) & 0xff;
@@ -3982,6 +4093,40 @@ output_insn (void)
  #endif /* DEBUG386  */
  }
  
+/* Return the size of the displacement operand N.
+   The size is in bytes and is chosen from the flags in i.types[N]
+   of the global instruction `i': Disp8 gives 1, Disp64 gives 8
+   (it is tested last, so it wins if set together with Disp8),
+   Disp16 on its own gives 2, and 4 is returned when none of
+   Disp8, Disp16 or Disp64 is set.  */
+
+static int
+disp_size (unsigned int n)
+{
+  int size = 4;
+  if (i.types[n] & (Disp8 | Disp16 | Disp64))
+    {
+      size = 2;
+      if (i.types[n] & Disp8)
+       size = 1;
+      if (i.types[n] & Disp64)
+       size = 8;
+    }
+  return size;
+}
+
+/* Return the size of the immediate operand N.
+   The size is in bytes and is chosen from the flags in i.types[N]
+   of the global instruction `i': Imm8 or Imm8S gives 1, Imm64
+   gives 8 (it is tested last, so it wins if set together with an
+   8 bit flag), Imm16 on its own gives 2, and 4 is returned when
+   none of Imm8, Imm8S, Imm16 or Imm64 is set.  */
+
+static int
+imm_size (unsigned int n)
+{
+  int size = 4;
+  if (i.types[n] & (Imm8 | Imm8S | Imm16 | Imm64))
+    {
+      size = 2;
+      if (i.types[n] & (Imm8 | Imm8S))
+       size = 1;
+      if (i.types[n] & Imm64)
+       size = 8;
+    }
+  return size;
+}
+
  static void
  output_disp (fragS *insn_start_frag, offsetT insn_start_off)
  {
@@ -3994,18 +4139,9 @@ output_disp (fragS *insn_start_frag, offsetT insn_start_off)
         {
           if (i.op[n].disps->X_op == O_constant)
             {
-             int size;
+             int size = disp_size (n);
               offsetT val;
  
-             size = 4;
-             if (i.types[n] & (Disp8 | Disp16 | Disp64))
-               {
-                 size = 2;
-                 if (i.types[n] & Disp8)
-                   size = 1;
-                 if (i.types[n] & Disp64)
-                   size = 8;
-               }
               val = offset_in_range (i.op[n].disps->X_add_number,
                                      size);
               p = frag_more (size);
@@ -4014,45 +4150,32 @@ output_disp (fragS *insn_start_frag, offsetT insn_start_off)
           else
             {
               enum bfd_reloc_code_real reloc_type;
-             int size = 4;
-             int sign = 0;
+             int size = disp_size (n);
+             int sign = (i.types[n] & Disp32S) != 0;
               int pcrel = (i.flags[n] & Operand_PCrel) != 0;
  
+             /* We can't have 8 bit displacement here.  */
+             assert ((i.types[n] & Disp8) == 0);
+
               /* The PC relative address is computed relative
                  to the instruction boundary, so in case immediate
                  fields follows, we need to adjust the value.  */
               if (pcrel && i.imm_operands)
                 {
-                 int imm_size = 4;
                   unsigned int n1;
+                 int sz = 0;
  
                   for (n1 = 0; n1 < i.operands; n1++)
                     if (i.types[n1] & Imm)
                       {
-                       if (i.types[n1] & (Imm8 | Imm8S | Imm16 | Imm64))
-                         {
-                           imm_size = 2;
-                           if (i.types[n1] & (Imm8 | Imm8S))
-                             imm_size = 1;
-                           if (i.types[n1] & Imm64)
-                             imm_size = 8;
-                         }
-                       break;
+                       /* Only one immediate is allowed for PC
+                          relative address.  */
+                       assert (sz == 0);
+                       sz = imm_size (n1);
+                       i.op[n].disps->X_add_number -= sz;
                       }
                   /* We should find the immediate.  */
-                 if (n1 == i.operands)
-                   abort ();
-                 i.op[n].disps->X_add_number -= imm_size;
-               }
-
-             if (i.types[n] & Disp32S)
-               sign = 1;
-
-             if (i.types[n] & (Disp16 | Disp64))
-               {
-                 size = 2;
-                 if (i.types[n] & Disp64)
-                   size = 8;
+                 assert (sz != 0);
                 }
  
               p = frag_more (size);
@@ -4117,18 +4240,9 @@ output_imm (fragS *insn_start_frag, offsetT insn_start_off)
         {
           if (i.op[n].imms->X_op == O_constant)
             {
-             int size;
+             int size = imm_size (n);
               offsetT val;
  
-             size = 4;
-             if (i.types[n] & (Imm8 | Imm8S | Imm16 | Imm64))
-               {
-                 size = 2;
-                 if (i.types[n] & (Imm8 | Imm8S))
-                   size = 1;
-                 else if (i.types[n] & Imm64)
-                   size = 8;
-               }
               val = offset_in_range (i.op[n].imms->X_add_number,
                                      size);
               p = frag_more (size);
@@ -4141,21 +4255,15 @@ output_imm (fragS *insn_start_frag, offsetT insn_start_off)
                  non-absolute imms).  Try to support other
                  sizes ...  */
               enum bfd_reloc_code_real reloc_type;
-             int size = 4;
-             int sign = 0;
+             int size = imm_size (n);
+             int sign;
  
               if ((i.types[n] & (Imm32S))
                   && (i.suffix == QWORD_MNEM_SUFFIX
                       || (!i.suffix && (i.tm.opcode_modifier & No_lSuf))))
                 sign = 1;
-             if (i.types[n] & (Imm8 | Imm8S | Imm16 | Imm64))
-               {
-                 size = 2;
-                 if (i.types[n] & (Imm8 | Imm8S))
-                   size = 1;
-                 if (i.types[n] & Imm64)
-                   size = 8;
-               }
+             else
+               sign = 0;
  
               p = frag_more (size);
               reloc_type = reloc (size, 0, sign, i.reloc[n]);
@@ -4383,9 +4491,6 @@ lex_got (enum bfd_reloc_code_real *reloc,
               if (GOT_symbol == NULL)
                 GOT_symbol = symbol_find_or_make (GLOBAL_OFFSET_TABLE_NAME);
  
-             /* Replace the relocation token with ' ', so that
-                errors like foo@GOTOFF1 will be detected.  */
-
               /* The length of the first part of our input line.  */
               first = cp - input_line_pointer;
  
@@ -4401,9 +4506,12 @@ lex_got (enum bfd_reloc_code_real *reloc,
                  be necessary, but be safe.  */
               tmpbuf = xmalloc (first + second + 2);
               memcpy (tmpbuf, input_line_pointer, first);
-             tmpbuf[first] = ' ';
-             memcpy (tmpbuf + first + 1, past_reloc, second);
-             tmpbuf[first + second + 1] = '\0';
+             if (second != 0 && *past_reloc != ' ')
+               /* Replace the relocation token with ' ', so that
+                  errors like foo@GOTOFF1 will be detected.  */
+               tmpbuf[first++] = ' ';
+             memcpy (tmpbuf + first, past_reloc, second);
+             tmpbuf[first + second] = '\0';
               return tmpbuf;
             }
  
@@ -5741,14 +5849,16 @@ parse_real_register (char *reg_string, char **end_op)
             ++s;
           if (*s >= '0' && *s <= '7')
             {
-             r = &i386_float_regtab[*s - '0'];
+             int fpr = *s - '0';
               ++s;
               if (is_space_char (*s))
                 ++s;
               if (*s == ')')
                 {
                   *end_op = s + 1;
-                 return r;
+                 r = hash_find (reg_hash, "st(0)");
+                 know (r);
+                 return r + fpr;
                 }
             }
           /* We have "%st(" then garbage.  */
@@ -5791,7 +5901,7 @@ parse_register (char *reg_string, char **end_op)
  
           know (e->X_op == O_register);
           know (e->X_add_number >= 0
-               && (valueT) e->X_add_number < ARRAY_SIZE (i386_regtab));
+               && (valueT) e->X_add_number < i386_regtab_size);
           r = i386_regtab + e->X_add_number;
           *end_op = input_line_pointer;
         }
@@ -6022,28 +6132,8 @@ md_show_usage (stream)
  
  }
  
-#if defined(TE_PEP)
-const char *
-x86_64_target_format (void)
-{
-  if (strcmp (default_arch, "x86_64") == 0)
-    {
-      set_code_flag (CODE_64BIT);
-      return COFF_TARGET_FORMAT;
-    }
-  else if (strcmp (default_arch, "i386") == 0)
-    {
-      set_code_flag (CODE_32BIT);
-      return "coff-i386";
-    }
-
-  as_fatal (_("Unknown architecture"));
-  return NULL;
-}
-#endif
-
  #if ((defined (OBJ_MAYBE_COFF) && defined (OBJ_MAYBE_AOUT)) \
-     || defined (OBJ_ELF) || defined (OBJ_MAYBE_ELF))
+     || defined (OBJ_ELF) || defined (OBJ_MAYBE_ELF) || defined (TE_PEP))
  
  /* Pick the target format to use.  */
  
@@ -6074,6 +6164,11 @@ i386_target_format (void)
      as_fatal (_("Unknown architecture"));
    switch (OUTPUT_FLAVOR)
      {
+#ifdef TE_PEP
+    case bfd_target_coff_flavour:
+      return flag_code == CODE_64BIT ? COFF_TARGET_FORMAT : "coff-i386";
+      break;
+#endif
  #ifdef OBJ_MAYBE_AOUT
      case bfd_target_aout_flavour:
        return AOUT_TARGET_FORMAT;