gas/app.c

   1 /* This is the Assembler Pre-Processor
   2    Copyright (C) 1987-2020 Free Software Foundation, Inc.
   3
   4    This file is part of GAS, the GNU Assembler.
   5
   6    GAS is free software; you can redistribute it and/or modify
   7    it under the terms of the GNU General Public License as published by
   8    the Free Software Foundation; either version 3, or (at your option)
   9    any later version.
  10
  11    GAS is distributed in the hope that it will be useful, but WITHOUT
  12    ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
  13    or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
  14    License for more details.
  15
  16    You should have received a copy of the GNU General Public License
  17    along with GAS; see the file COPYING.  If not, write to the Free
  18    Software Foundation, 51 Franklin Street - Fifth Floor, Boston, MA
  19    02110-1301, USA.  */
  20
  21 /* Modified by Allen Wirfs-Brock, Instantiations Inc 2/90.  */
  22 /* App, the assembler pre-processor.  This pre-processor strips out
  23    excess spaces, turns single-quoted characters into a decimal
  24    constant, and turns the # in # <number> <filename> <garbage> into a
  25    .linefile.  This needs better error-handling.  */
  26
  27 #include "as.h"
  28
  29 #if (__STDC__ != 1)
  30 #ifndef const
  31 #define const  /* empty */
  32 #endif
  33 #endif
  34
#ifdef H_TICK_HEX
/* When nonzero, do_scrub_begin classifies 'h' and 'H' as LEX_IS_H.  */
int enable_h_tick_hex = 0;
#endif

#ifdef TC_M68K
/* Whether we are scrubbing in m68k MRI mode.  This is different from
   flag_m68k_mri, because the two flags will be affected by the .mri
   pseudo-op at different times.  */
static int scrub_m68k_mri;

/* The pseudo-op which switches in and out of MRI mode.  See the
   comment in do_scrub_chars.  The recognizer there also accepts a '1'
   where this string has its '0', so the one string serves for both
   ".mri 0" and ".mri 1".  */
static const char mri_pseudo[] = ".mri 0";
#else
/* Without TC_M68K there is no MRI mode: every test of scrub_m68k_mri
   is then a test of the constant 0.  */
#define scrub_m68k_mri 0
#endif

#if defined TC_ARM && defined OBJ_ELF
/* The pseudo-op for which we need to special-case `@' characters.
   See the comment in do_scrub_chars.  */
static const char   symver_pseudo[] = ".symver";
/* Progress of the .symver recognizer in do_scrub_chars: NULL when no
   match is under way, otherwise a pointer to the next character of
   symver_pseudo that is expected.  */
static const char * symver_state;
#endif
#ifdef TC_ARM
/* ARM-only scrubber state; saved and restored by app_push and
   app_pop along with the rest of the scrubber state.  */
static char last_char;
#endif
  61
/* Character classification table, indexed by character value (as an
   unsigned char).  It is filled in by do_scrub_begin with the
   LEX_IS_* codes below; an entry of 0 means that no class has been
   assigned to that character.  */
static char lex[256];
/* The characters that do_scrub_begin marks as
   LEX_IS_SYMBOL_COMPONENT on every target.  */
static const char symbol_chars[] =
"$._ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789";

/* Class codes stored in lex[].  The value 7 is not used.  Some codes
   exist only for particular targets or configurations.  */
#define LEX_IS_SYMBOL_COMPONENT         1
#define LEX_IS_WHITESPACE               2
#define LEX_IS_LINE_SEPARATOR           3
#define LEX_IS_COMMENT_START            4
#define LEX_IS_LINE_COMMENT_START       5
#define LEX_IS_TWOCHAR_COMMENT_1ST      6
#define LEX_IS_STRINGQUOTE              8
#define LEX_IS_COLON                    9
#define LEX_IS_NEWLINE                  10
#define LEX_IS_ONECHAR_QUOTE            11
#ifdef TC_V850
#define LEX_IS_DOUBLEDASH_1ST           12
#endif
#ifdef TC_M32R
#define DOUBLEBAR_PARALLEL
#endif
#ifdef DOUBLEBAR_PARALLEL
#define LEX_IS_DOUBLEBAR_1ST            13
#endif
#define LEX_IS_PARALLEL_SEPARATOR       14
#ifdef H_TICK_HEX
#define LEX_IS_H                        15
#endif
/* Classification predicates.  Each one indexes lex[] directly with
   its argument, so C must be a valid index (0..255); in particular
   the caller has to rule out EOF before using them.  */
#define IS_SYMBOL_COMPONENT(c)          (lex[c] == LEX_IS_SYMBOL_COMPONENT)
#define IS_WHITESPACE(c)                (lex[c] == LEX_IS_WHITESPACE)
#define IS_LINE_SEPARATOR(c)            (lex[c] == LEX_IS_LINE_SEPARATOR)
#define IS_PARALLEL_SEPARATOR(c)        (lex[c] == LEX_IS_PARALLEL_SEPARATOR)
#define IS_COMMENT(c)                   (lex[c] == LEX_IS_COMMENT_START)
#define IS_LINE_COMMENT(c)              (lex[c] == LEX_IS_LINE_COMMENT_START)
#define IS_NEWLINE(c)                   (lex[c] == LEX_IS_NEWLINE)

/* Forward declaration; the definition appears later in this file.  */
static int process_escape (int);
  98
  99 /* FIXME-soon: The entire lexer/parser thingy should be
 100    built statically at compile time rather than dynamically
 101    each and every time the assembler is run.  xoxorich.  */
 102
 103 void
 104 do_scrub_begin (int m68k_mri ATTRIBUTE_UNUSED)
 105 {
 106   const char *p;
 107   int c;
 108
 109   lex[' '] = LEX_IS_WHITESPACE;
 110   lex['\t'] = LEX_IS_WHITESPACE;
 111   lex['\r'] = LEX_IS_WHITESPACE;
 112   lex['\n'] = LEX_IS_NEWLINE;
 113   lex[':'] = LEX_IS_COLON;
 114
 115 #ifdef TC_M68K
 116   scrub_m68k_mri = m68k_mri;
 117
 118   if (! m68k_mri)
 119 #endif
 120     {
 121       lex['"'] = LEX_IS_STRINGQUOTE;
 122
 123 #if ! defined (TC_HPPA)
 124       lex['\''] = LEX_IS_ONECHAR_QUOTE;
 125 #endif
 126
 127 #ifdef SINGLE_QUOTE_STRINGS
 128       lex['\''] = LEX_IS_STRINGQUOTE;
 129 #endif
 130     }
 131
 132   /* Note: if any other character can be LEX_IS_STRINGQUOTE, the loop
 133      in state 5 of do_scrub_chars must be changed.  */
 134
 135   /* Note that these override the previous defaults, e.g. if ';' is a
 136      comment char, then it isn't a line separator.  */
 137   for (p = symbol_chars; *p; ++p)
 138     lex[(unsigned char) *p] = LEX_IS_SYMBOL_COMPONENT;
 139
 140   for (c = 128; c < 256; ++c)
 141     lex[c] = LEX_IS_SYMBOL_COMPONENT;
 142
 143 #ifdef tc_symbol_chars
 144   /* This macro permits the processor to specify all characters which
 145      may appears in an operand.  This will prevent the scrubber from
 146      discarding meaningful whitespace in certain cases.  The i386
 147      backend uses this to support prefixes, which can confuse the
 148      scrubber as to whether it is parsing operands or opcodes.  */
 149   for (p = tc_symbol_chars; *p; ++p)
 150     lex[(unsigned char) *p] = LEX_IS_SYMBOL_COMPONENT;
 151 #endif
 152
 153   /* The m68k backend wants to be able to change comment_chars.  */
 154 #ifndef tc_comment_chars
 155 #define tc_comment_chars comment_chars
 156 #endif
 157   for (p = tc_comment_chars; *p; p++)
 158     lex[(unsigned char) *p] = LEX_IS_COMMENT_START;
 159
 160   for (p = line_comment_chars; *p; p++)
 161     lex[(unsigned char) *p] = LEX_IS_LINE_COMMENT_START;
 162
 163 #ifndef tc_line_separator_chars
 164 #define tc_line_separator_chars line_separator_chars
 165 #endif
 166   for (p = tc_line_separator_chars; *p; p++)
 167     lex[(unsigned char) *p] = LEX_IS_LINE_SEPARATOR;
 168
 169 #ifdef tc_parallel_separator_chars
 170   /* This macro permits the processor to specify all characters which
 171      separate parallel insns on the same line.  */
 172   for (p = tc_parallel_separator_chars; *p; p++)
 173     lex[(unsigned char) *p] = LEX_IS_PARALLEL_SEPARATOR;
 174 #endif
 175
 176   /* Only allow slash-star comments if slash is not in use.
 177      FIXME: This isn't right.  We should always permit them.  */
 178   if (lex['/'] == 0)
 179     lex['/'] = LEX_IS_TWOCHAR_COMMENT_1ST;
 180
 181 #ifdef TC_M68K
 182   if (m68k_mri)
 183     {
 184       lex['\''] = LEX_IS_STRINGQUOTE;
 185       lex[';'] = LEX_IS_COMMENT_START;
 186       lex['*'] = LEX_IS_LINE_COMMENT_START;
 187       /* The MRI documentation says '!' is LEX_IS_COMMENT_START, but
 188          then it can't be used in an expression.  */
 189       lex['!'] = LEX_IS_LINE_COMMENT_START;
 190     }
 191 #endif
 192
 193 #ifdef TC_V850
 194   lex['-'] = LEX_IS_DOUBLEDASH_1ST;
 195 #endif
 196 #ifdef DOUBLEBAR_PARALLEL
 197   lex['|'] = LEX_IS_DOUBLEBAR_1ST;
 198 #endif
 199 #ifdef TC_D30V
 200   /* Must do this is we want VLIW instruction with "->" or "<-".  */
 201   lex['-'] = LEX_IS_SYMBOL_COMPONENT;
 202 #endif
 203
 204 #ifdef H_TICK_HEX
 205   if (enable_h_tick_hex)
 206     {
 207       lex['h'] = LEX_IS_H;
 208       lex['H'] = LEX_IS_H;
 209     }
 210 #endif
 211 }
 212
/* Saved state of the scrubber.  These persist between calls to
   do_scrub_chars so that it can return at any point and pick up
   where it left off; app_push and app_pop save and restore them.  */

/* Current state of the do_scrub_chars state machine (see the table
   of states in that function).  */
static int state;
/* State to go back to once a temporary state (such as -1, -2 or 5)
   has finished.  */
static int old_state;
/* In state -1, the next character to be output.  */
static const char *out_string;
/* Small scratch buffer; presumably the backing storage for
   out_string -- confirm against the rest of do_scrub_chars.  */
static char out_buf[20];
/* Incremented each time a backslash-newline inside a string is
   rewritten by state 6 of do_scrub_chars.  */
static int add_newlines;
/* Input not yet consumed when do_scrub_chars last returned, or NULL
   if there is none, and its length.  */
static char *saved_input;
static size_t saved_input_len;
/* The buffer that do_scrub_chars hands to its GET callback to be
   filled with raw input.  */
static char input_buffer[32 * 1024];
/* Progress of the .mri recognizer in do_scrub_chars: NULL when no
   match is under way, otherwise a pointer to the next character of
   mri_pseudo that is expected.  */
static const char *mri_state;
/* The last input character accepted by the .mri recognizer.  */
static char mri_last_ch;
 224
/* Data structure for saving the state of app across #include's.  Note that
   app is called asynchronously to the parsing of the .include's, so our
   state at the time .include is interpreted is completely unrelated.
   That's why we have to save it all.

   Each member holds the value of the file-scope variable of the same
   name.  app_push allocates and fills one of these; app_pop copies it
   back and frees it.  */

struct app_save
{
  int          state;
  int          old_state;
  const char * out_string;
  char         out_buf[sizeof (out_buf)];
  int          add_newlines;
  /* A separately allocated copy of the pending input made by
     app_push, or NULL if there was none.  saved_input_len is only
     set when saved_input is not NULL.  */
  char *       saved_input;
  size_t       saved_input_len;
#ifdef TC_M68K
  int          scrub_m68k_mri;
#endif
  const char * mri_state;
  char         mri_last_ch;
#if defined TC_ARM && defined OBJ_ELF
  const char * symver_state;
#endif
#ifdef TC_ARM
  char last_char;
#endif
};
 251
 252 char *
 253 app_push (void)
 254 {
 255   struct app_save *saved;
 256
 257   saved = XNEW (struct app_save);
 258   saved->state = state;
 259   saved->old_state = old_state;
 260   saved->out_string = out_string;
 261   memcpy (saved->out_buf, out_buf, sizeof (out_buf));
 262   saved->add_newlines = add_newlines;
 263   if (saved_input == NULL)
 264     saved->saved_input = NULL;
 265   else
 266     {
 267       saved->saved_input = XNEWVEC (char, saved_input_len);
 268       memcpy (saved->saved_input, saved_input, saved_input_len);
 269       saved->saved_input_len = saved_input_len;
 270     }
 271 #ifdef TC_M68K
 272   saved->scrub_m68k_mri = scrub_m68k_mri;
 273 #endif
 274   saved->mri_state = mri_state;
 275   saved->mri_last_ch = mri_last_ch;
 276 #if defined TC_ARM && defined OBJ_ELF
 277   saved->symver_state = symver_state;
 278 #endif
 279 #ifdef TC_ARM
 280   saved->last_char = last_char;
 281 #endif
 282
 283   /* do_scrub_begin() is not useful, just wastes time.  */
 284
 285   state = 0;
 286   saved_input = NULL;
 287   add_newlines = 0;
 288
 289   return (char *) saved;
 290 }
 291
 292 void
 293 app_pop (char *arg)
 294 {
 295   struct app_save *saved = (struct app_save *) arg;
 296
 297   /* There is no do_scrub_end ().  */
 298   state = saved->state;
 299   old_state = saved->old_state;
 300   out_string = saved->out_string;
 301   memcpy (out_buf, saved->out_buf, sizeof (out_buf));
 302   add_newlines = saved->add_newlines;
 303   if (saved->saved_input == NULL)
 304     saved_input = NULL;
 305   else
 306     {
 307       gas_assert (saved->saved_input_len <= sizeof (input_buffer));
 308       memcpy (input_buffer, saved->saved_input, saved->saved_input_len);
 309       saved_input = input_buffer;
 310       saved_input_len = saved->saved_input_len;
 311       free (saved->saved_input);
 312     }
 313 #ifdef TC_M68K
 314   scrub_m68k_mri = saved->scrub_m68k_mri;
 315 #endif
 316   mri_state = saved->mri_state;
 317   mri_last_ch = saved->mri_last_ch;
 318 #if defined TC_ARM && defined OBJ_ELF
 319   symver_state = saved->symver_state;
 320 #endif
 321 #ifdef TC_ARM
 322   last_char = saved->last_char;
 323 #endif
 324
 325   free (arg);
 326 }
 327
 328 /* @@ This assumes that \n &c are the same on host and target.  This is not
 329    necessarily true.  */
 330
 331 static int
 332 process_escape (int ch)
 333 {
 334   switch (ch)
 335     {
 336     case 'b':
 337       return '\b';
 338     case 'f':
 339       return '\f';
 340     case 'n':
 341       return '\n';
 342     case 'r':
 343       return '\r';
 344     case 't':
 345       return '\t';
 346     case '\'':
 347       return '\'';
 348     case '"':
 349       return '\"';
 350     default:
 351       return ch;
 352     }
 353 }
 354
  355 /* This function is called to process input characters.  The GET
  356    parameter is used to retrieve more input characters.  GET is called
  357    with a buffer and its size; it should fill the buffer and return the
  358    number of characters stored, or 0 at end of file.  The scrubbed output
  359    characters are put into the buffer starting at TOSTART; the TOSTART
  360    buffer is TOLEN bytes in length.  The function returns the number
  361    of scrubbed characters put into TOSTART.  This will be TOLEN unless
  362    end of file was seen.  This function is arranged as a state
  363    machine, and saves its state so that it may return at any point.
  364    This is the way the old code used to work.  */
 365
 366 size_t
 367 do_scrub_chars (size_t (*get) (char *, size_t), char *tostart, size_t tolen)
 368 {
 369   char *to = tostart;
 370   char *toend = tostart + tolen;
 371   char *from;
 372   char *fromend;
 373   size_t fromlen;
 374   int ch, ch2 = 0;
 375   /* Character that started the string we're working on.  */
 376   static char quotechar;
 377
 378   /*State 0: beginning of normal line
 379           1: After first whitespace on line (flush more white)
 380           2: After first non-white (opcode) on line (keep 1white)
 381           3: after second white on line (into operands) (flush white)
 382           4: after putting out a .linefile, put out digits
 383           5: parsing a string, then go to old-state
 384           6: putting out \ escape in a "d string.
 385           7: no longer used
 386           8: no longer used
 387           9: After seeing symbol char in state 3 (keep 1white after symchar)
 388          10: After seeing whitespace in state 9 (keep white before symchar)
 389          11: After seeing a symbol character in state 0 (eg a label definition)
 390          -1: output string in out_string and go to the state in old_state
 391          -2: flush text until a '*' '/' is seen, then go to state old_state
 392 #ifdef TC_V850
 393          12: After seeing a dash, looking for a second dash as a start
 394              of comment.
 395 #endif
 396 #ifdef DOUBLEBAR_PARALLEL
 397          13: After seeing a vertical bar, looking for a second
 398              vertical bar as a parallel expression separator.
 399 #endif
 400 #ifdef TC_PREDICATE_START_CHAR
 401          14: After seeing a predicate start character at state 0, looking
 402              for a predicate end character as predicate.
 403          15: After seeing a predicate start character at state 1, looking
 404              for a predicate end character as predicate.
 405 #endif
 406 #ifdef TC_Z80
 407          16: After seeing an 'a' or an 'A' at the start of a symbol
 408          17: After seeing an 'f' or an 'F' in state 16
 409 #endif
 410           */
 411
 412   /* I added states 9 and 10 because the MIPS ECOFF assembler uses
 413      constructs like ``.loc 1 20''.  This was turning into ``.loc
 414      120''.  States 9 and 10 ensure that a space is never dropped in
 415      between characters which could appear in an identifier.  Ian
 416      Taylor, ian@cygnus.com.
 417
 418      I added state 11 so that something like "Lfoo add %r25,%r26,%r27" works
 419      correctly on the PA (and any other target where colons are optional).
 420      Jeff Law, law@cs.utah.edu.
 421
 422      I added state 13 so that something like "cmp r1, r2 || trap #1" does not
 423      get squashed into "cmp r1,r2||trap#1", with the all important space
 424      between the 'trap' and the '#1' being eliminated.  nickc@cygnus.com  */
 425
 426   /* This macro gets the next input character.  */
 427
 428 #define GET()                                                   \
 429   (from < fromend                                               \
 430    ? * (unsigned char *) (from++)                               \
 431    : (saved_input = NULL,                                       \
 432       fromlen = (*get) (input_buffer, sizeof input_buffer),     \
 433       from = input_buffer,                                      \
 434       fromend = from + fromlen,                                 \
 435       (fromlen == 0                                             \
 436        ? EOF                                                    \
 437        : * (unsigned char *) (from++))))
 438
 439   /* This macro pushes a character back on the input stream.  */
 440
 441 #define UNGET(uch) (*--from = (uch))
 442
 443   /* This macro puts a character into the output buffer.  If this
 444      character fills the output buffer, this macro jumps to the label
 445      TOFULL.  We use this rather ugly approach because we need to
 446      handle two different termination conditions: EOF on the input
 447      stream, and a full output buffer.  It would be simpler if we
 448      always read in the entire input stream before processing it, but
 449      I don't want to make such a significant change to the assembler's
 450      memory usage.  */
 451
 452 #define PUT(pch)                                \
 453   do                                            \
 454     {                                           \
 455       *to++ = (pch);                            \
 456       if (to >= toend)                          \
 457         goto tofull;                            \
 458     }                                           \
 459   while (0)
 460
 461   if (saved_input != NULL)
 462     {
 463       from = saved_input;
 464       fromend = from + saved_input_len;
 465     }
 466   else
 467     {
 468       fromlen = (*get) (input_buffer, sizeof input_buffer);
 469       if (fromlen == 0)
 470         return 0;
 471       from = input_buffer;
 472       fromend = from + fromlen;
 473     }
 474
 475   while (1)
 476     {
 477       /* The cases in this switch end with continue, in order to
 478          branch back to the top of this while loop and generate the
 479          next output character in the appropriate state.  */
 480       switch (state)
 481         {
 482         case -1:
 483           ch = *out_string++;
 484           if (*out_string == '\0')
 485             {
 486               state = old_state;
 487               old_state = 3;
 488             }
 489           PUT (ch);
 490           continue;
 491
 492         case -2:
 493           for (;;)
 494             {
 495               do
 496                 {
 497                   ch = GET ();
 498
 499                   if (ch == EOF)
 500                     {
 501                       as_warn (_("end of file in comment"));
 502                       goto fromeof;
 503                     }
 504
 505                   if (ch == '\n')
 506                     PUT ('\n');
 507                 }
 508               while (ch != '*');
 509
 510               while ((ch = GET ()) == '*')
 511                 ;
 512
 513               if (ch == EOF)
 514                 {
 515                   as_warn (_("end of file in comment"));
 516                   goto fromeof;
 517                 }
 518
 519               if (ch == '/')
 520                 break;
 521
 522               UNGET (ch);
 523             }
 524
 525           state = old_state;
 526           UNGET (' ');
 527           continue;
 528
 529         case 4:
 530           ch = GET ();
 531           if (ch == EOF)
 532             goto fromeof;
 533           else if (ch >= '0' && ch <= '9')
 534             PUT (ch);
 535           else
 536             {
 537               while (ch != EOF && IS_WHITESPACE (ch))
 538                 ch = GET ();
 539               if (ch == '"')
 540                 {
 541                   quotechar = ch;
 542                   state = 5;
 543                   old_state = 3;
 544                   PUT (ch);
 545                 }
 546               else
 547                 {
 548                   while (ch != EOF && ch != '\n')
 549                     ch = GET ();
 550                   state = 0;
 551                   PUT (ch);
 552                 }
 553             }
 554           continue;
 555
 556         case 5:
 557           /* We are going to copy everything up to a quote character,
 558              with special handling for a backslash.  We try to
 559              optimize the copying in the simple case without using the
 560              GET and PUT macros.  */
 561           {
 562             char *s;
 563             ptrdiff_t len;
 564
 565             for (s = from; s < fromend; s++)
 566               {
 567                 ch = *s;
 568                 if (ch == '\\'
 569                     || ch == quotechar
 570                     || ch == '\n')
 571                   break;
 572               }
 573             len = s - from;
 574             if (len > toend - to)
 575               len = toend - to;
 576             if (len > 0)
 577               {
 578                 memcpy (to, from, len);
 579                 to += len;
 580                 from += len;
 581                 if (to >= toend)
 582                   goto tofull;
 583               }
 584           }
 585
 586           ch = GET ();
 587           if (ch == EOF)
 588             {
 589               /* This buffer is here specifically so
 590                  that the UNGET below will work.  */
 591               static char one_char_buf[1];
 592
 593               as_warn (_("end of file in string; '%c' inserted"), quotechar);
 594               state = old_state;
 595               from = fromend = one_char_buf + 1;
 596               fromlen = 1;
 597               UNGET ('\n');
 598               PUT (quotechar);
 599             }
 600           else if (ch == quotechar)
 601             {
 602               state = old_state;
 603               PUT (ch);
 604             }
 605 #ifndef NO_STRING_ESCAPES
 606           else if (ch == '\\')
 607             {
 608               state = 6;
 609               PUT (ch);
 610             }
 611 #endif
 612           else if (scrub_m68k_mri && ch == '\n')
 613             {
 614               /* Just quietly terminate the string.  This permits lines like
 615                    bne  label   loop if we haven't reach end yet.  */
 616               state = old_state;
 617               UNGET (ch);
 618               PUT ('\'');
 619             }
 620           else
 621             {
 622               PUT (ch);
 623             }
 624           continue;
 625
 626         case 6:
 627           state = 5;
 628           ch = GET ();
 629           switch (ch)
 630             {
 631               /* Handle strings broken across lines, by turning '\n' into
 632                  '\\' and 'n'.  */
 633             case '\n':
 634               UNGET ('n');
 635               add_newlines++;
 636               PUT ('\\');
 637               continue;
 638
 639             case EOF:
 640               as_warn (_("end of file in string; '%c' inserted"), quotechar);
 641               PUT (quotechar);
 642               continue;
 643
 644             case '"':
 645             case '\\':
 646             case 'b':
 647             case 'f':
 648             case 'n':
 649             case 'r':
 650             case 't':
 651             case 'v':
 652             case 'x':
 653             case 'X':
 654             case '0':
 655             case '1':
 656             case '2':
 657             case '3':
 658             case '4':
 659             case '5':
 660             case '6':
 661             case '7':
 662               break;
 663
 664             default:
 665 #ifdef ONLY_STANDARD_ESCAPES
 666               as_warn (_("unknown escape '\\%c' in string; ignored"), ch);
 667 #endif
 668               break;
 669             }
 670           PUT (ch);
 671           continue;
 672
 673 #ifdef DOUBLEBAR_PARALLEL
 674         case 13:
 675           ch = GET ();
 676           if (ch != '|')
 677             abort ();
 678
 679           /* Reset back to state 1 and pretend that we are parsing a
 680              line from just after the first white space.  */
 681           state = 1;
 682           PUT ('|');
 683 #ifdef TC_TIC6X
 684           /* "||^" is used for SPMASKed instructions.  */
 685           ch = GET ();
 686           if (ch == EOF)
 687             goto fromeof;
 688           else if (ch == '^')
 689             PUT ('^');
 690           else
 691             UNGET (ch);
 692 #endif
 693           continue;
 694 #endif
 695 #ifdef TC_Z80
 696         case 16:
 697           /* We have seen an 'a' at the start of a symbol, look for an 'f'.  */
 698           ch = GET ();
 699           if (ch == 'f' || ch == 'F')
 700             {
 701               state = 17;
 702               PUT (ch);
 703             }
 704           else
 705             {
 706               state = 9;
 707               break;
 708             }
 709           /* Fall through.  */
 710         case 17:
 711           /* We have seen "af" at the start of a symbol,
 712              a ' here is a part of that symbol.  */
 713           ch = GET ();
 714           state = 9;
 715           if (ch == '\'')
 716             /* Change to avoid warning about unclosed string.  */
 717             PUT ('`');
 718           else if (ch != EOF)
 719             UNGET (ch);
 720           break;
 721 #endif
 722         }
 723
 724       /* OK, we are somewhere in states 0 through 4 or 9 through 11.  */
 725
 726       /* flushchar: */
 727       ch = GET ();
 728
 729 #ifdef TC_PREDICATE_START_CHAR
 730       if (ch == TC_PREDICATE_START_CHAR && (state == 0 || state == 1))
 731         {
 732           state += 14;
 733           PUT (ch);
 734           continue;
 735         }
 736       else if (state == 14 || state == 15)
 737         {
 738           if (ch == TC_PREDICATE_END_CHAR)
 739             {
 740               state -= 14;
 741               PUT (ch);
 742               ch = GET ();
 743             }
 744           else
 745             {
 746               PUT (ch);
 747               continue;
 748             }
 749         }
 750 #endif
 751
 752     recycle:
 753
 754 #if defined TC_ARM && defined OBJ_ELF
 755       /* We need to watch out for .symver directives.  See the comment later
 756          in this function.  */
 757       if (symver_state == NULL)
 758         {
 759           if ((state == 0 || state == 1) && ch == symver_pseudo[0])
 760             symver_state = symver_pseudo + 1;
 761         }
 762       else
 763         {
 764           /* We advance to the next state if we find the right
 765              character.  */
 766           if (ch != '\0' && (*symver_state == ch))
 767             ++symver_state;
 768           else if (*symver_state != '\0')
 769             /* We did not get the expected character, or we didn't
 770                get a valid terminating character after seeing the
 771                entire pseudo-op, so we must go back to the beginning.  */
 772             symver_state = NULL;
 773           else
 774             {
 775               /* We've read the entire pseudo-op.  If this is the end
 776                  of the line, go back to the beginning.  */
 777               if (IS_NEWLINE (ch))
 778                 symver_state = NULL;
 779             }
 780         }
 781 #endif /* TC_ARM && OBJ_ELF */
 782
 783 #ifdef TC_M68K
 784       /* We want to have pseudo-ops which control whether we are in
 785          MRI mode or not.  Unfortunately, since m68k MRI mode affects
 786          the scrubber, that means that we need a special purpose
 787          recognizer here.  */
 788       if (mri_state == NULL)
 789         {
 790           if ((state == 0 || state == 1)
 791               && ch == mri_pseudo[0])
 792             mri_state = mri_pseudo + 1;
 793         }
 794       else
 795         {
 796           /* We advance to the next state if we find the right
 797              character, or if we need a space character and we get any
 798              whitespace character, or if we need a '0' and we get a
 799              '1' (this is so that we only need one state to handle
 800              ``.mri 0'' and ``.mri 1'').  */
 801           if (ch != '\0'
 802               && (*mri_state == ch
 803                   || (*mri_state == ' '
 804                       && lex[ch] == LEX_IS_WHITESPACE)
 805                   || (*mri_state == '0'
 806                       && ch == '1')))
 807             {
 808               mri_last_ch = ch;
 809               ++mri_state;
 810             }
 811           else if (*mri_state != '\0'
 812                    || (lex[ch] != LEX_IS_WHITESPACE
 813                        && lex[ch] != LEX_IS_NEWLINE))
 814             {
 815               /* We did not get the expected character, or we didn't
 816                  get a valid terminating character after seeing the
 817                  entire pseudo-op, so we must go back to the
 818                  beginning.  */
 819               mri_state = NULL;
 820             }
 821           else
 822             {
  823               /* We've read the entire pseudo-op.  mri_last_ch is
 824                  either '0' or '1' indicating whether to enter or
 825                  leave MRI mode.  */
 826               do_scrub_begin (mri_last_ch == '1');
 827               mri_state = NULL;
 828
 829               /* We continue handling the character as usual.  The
 830                  main gas reader must also handle the .mri pseudo-op
 831                  to control expression parsing and the like.  */
 832             }
 833         }
 834 #endif
 835
 836       if (ch == EOF)
 837         {
 838           if (state != 0)
 839             {
 840               as_warn (_("end of file not at end of a line; newline inserted"));
 841               state = 0;
 842               PUT ('\n');
 843             }
 844           goto fromeof;
 845         }
 846
 847       switch (lex[ch])
 848         {
 849         case LEX_IS_WHITESPACE:
 850           do
 851             {
 852               ch = GET ();
 853             }
 854           while (ch != EOF && IS_WHITESPACE (ch));
 855           if (ch == EOF)
 856             goto fromeof;
 857
 858           if (state == 0)
 859             {
 860               /* Preserve a single whitespace character at the
 861                  beginning of a line.  */
 862               state = 1;
 863               UNGET (ch);
 864               PUT (' ');
 865               break;
 866             }
 867
 868 #ifdef KEEP_WHITE_AROUND_COLON
 869           if (lex[ch] == LEX_IS_COLON)
 870             {
 871               /* Only keep this white if there's no white *after* the
 872                  colon.  */
 873               ch2 = GET ();
 874               if (ch2 != EOF)
 875                 UNGET (ch2);
 876               if (!IS_WHITESPACE (ch2))
 877                 {
 878                   state = 9;
 879                   UNGET (ch);
 880                   PUT (' ');
 881                   break;
 882                 }
 883             }
 884 #endif
 885           if (IS_COMMENT (ch)
 886               || ch == '/'
 887               || IS_LINE_SEPARATOR (ch)
 888               || IS_PARALLEL_SEPARATOR (ch))
 889             {
 890               if (scrub_m68k_mri)
 891                 {
 892                   /* In MRI mode, we keep these spaces.  */
 893                   UNGET (ch);
 894                   PUT (' ');
 895                   break;
 896                 }
 897               goto recycle;
 898             }
 899
 900           /* If we're in state 2 or 11, we've seen a non-white
 901              character followed by whitespace.  If the next character
 902              is ':', this is whitespace after a label name which we
 903              normally must ignore.  In MRI mode, though, spaces are
 904              not permitted between the label and the colon.  */
 905           if ((state == 2 || state == 11)
 906               && lex[ch] == LEX_IS_COLON
 907               && ! scrub_m68k_mri)
 908             {
 909               state = 1;
 910               PUT (ch);
 911               break;
 912             }
 913
 914           switch (state)
 915             {
 916             case 1:
 917               /* We can arrive here if we leave a leading whitespace
 918                  character at the beginning of a line.  */
 919               goto recycle;
 920             case 2:
 921               state = 3;
 922               if (to + 1 < toend)
 923                 {
 924                   /* Optimize common case by skipping UNGET/GET.  */
 925                   PUT (' ');    /* Sp after opco */
 926                   goto recycle;
 927                 }
 928               UNGET (ch);
 929               PUT (' ');
 930               break;
 931             case 3:
 932 #ifndef TC_KEEP_OPERAND_SPACES
 933               /* For TI C6X, we keep these spaces as they may separate
 934                  functional unit specifiers from operands.  */
 935               if (scrub_m68k_mri)
 936 #endif
 937                 {
 938                   /* In MRI mode, we keep these spaces.  */
 939                   UNGET (ch);
 940                   PUT (' ');
 941                   break;
 942                 }
 943               goto recycle;     /* Sp in operands */
 944             case 9:
 945             case 10:
 946 #ifndef TC_KEEP_OPERAND_SPACES
 947               if (scrub_m68k_mri)
 948 #endif
 949                 {
 950                   /* In MRI mode, we keep these spaces.  */
 951                   state = 3;
 952                   UNGET (ch);
 953                   PUT (' ');
 954                   break;
 955                 }
 956               state = 10;       /* Sp after symbol char */
 957               goto recycle;
 958             case 11:
 959               if (LABELS_WITHOUT_COLONS || flag_m68k_mri)
 960                 state = 1;
 961               else
 962                 {
 963                   /* We know that ch is not ':', since we tested that
 964                      case above.  Therefore this is not a label, so it
 965                      must be the opcode, and we've just seen the
 966                      whitespace after it.  */
 967                   state = 3;
 968                 }
 969               UNGET (ch);
 970               PUT (' ');        /* Sp after label definition.  */
 971               break;
 972             default:
 973               BAD_CASE (state);
 974             }
 975           break;
 976
 977         case LEX_IS_TWOCHAR_COMMENT_1ST:
 978           ch2 = GET ();
 979           if (ch2 == '*')
 980             {
 981               for (;;)
 982                 {
 983                   do
 984                     {
 985                       ch2 = GET ();
 986                       if (ch2 != EOF && IS_NEWLINE (ch2))
 987                         add_newlines++;
 988                     }
 989                   while (ch2 != EOF && ch2 != '*');
 990
 991                   while (ch2 == '*')
 992                     ch2 = GET ();
 993
 994                   if (ch2 == EOF || ch2 == '/')
 995                     break;
 996
 997                   /* This UNGET will ensure that we count newlines
 998                      correctly.  */
 999                   UNGET (ch2);
1000                 }
1001
1002               if (ch2 == EOF)
1003                 as_warn (_("end of file in multiline comment"));
1004
1005               ch = ' ';
1006               goto recycle;
1007             }
1008 #ifdef DOUBLESLASH_LINE_COMMENTS
1009           else if (ch2 == '/')
1010             {
1011               do
1012                 {
1013                   ch = GET ();
1014                 }
1015               while (ch != EOF && !IS_NEWLINE (ch));
1016               if (ch == EOF)
1017                 as_warn ("end of file in comment; newline inserted");
1018               state = 0;
1019               PUT ('\n');
1020               break;
1021             }
1022 #endif
1023           else
1024             {
1025               if (ch2 != EOF)
1026                 UNGET (ch2);
1027               if (state == 9 || state == 10)
1028                 state = 3;
1029               PUT (ch);
1030             }
1031           break;
1032
1033         case LEX_IS_STRINGQUOTE:
1034           quotechar = ch;
1035           if (state == 10)
1036             {
1037               /* Preserve the whitespace in foo "bar".  */
1038               UNGET (ch);
1039               state = 3;
1040               PUT (' ');
1041
1042               /* PUT didn't jump out.  We could just break, but we
1043                  know what will happen, so optimize a bit.  */
1044               ch = GET ();
1045               old_state = 3;
1046             }
1047           else if (state == 9)
1048             old_state = 3;
1049           else
1050             old_state = state;
1051           state = 5;
1052           PUT (ch);
1053           break;
1054
1055         case LEX_IS_ONECHAR_QUOTE:
1056 #ifdef H_TICK_HEX
1057           if (state == 9 && enable_h_tick_hex)
1058             {
1059               char c;
1060
1061               c = GET ();
1062               as_warn ("'%c found after symbol", c);
1063               UNGET (c);
1064             }
1065 #endif
1066           if (state == 10)
1067             {
1068               /* Preserve the whitespace in foo 'b'.  */
1069               UNGET (ch);
1070               state = 3;
1071               PUT (' ');
1072               break;
1073             }
1074           ch = GET ();
1075           if (ch == EOF)
1076             {
1077               as_warn (_("end of file after a one-character quote; \\0 inserted"));
1078               ch = 0;
1079             }
1080           if (ch == '\\')
1081             {
1082               ch = GET ();
1083               if (ch == EOF)
1084                 {
1085                   as_warn (_("end of file in escape character"));
1086                   ch = '\\';
1087                 }
1088               else
1089                 ch = process_escape (ch);
1090             }
1091           sprintf (out_buf, "%d", (int) (unsigned char) ch);
1092
1093           /* None of these 'x constants for us.  We want 'x'.  */
1094           if ((ch = GET ()) != '\'')
1095             {
1096 #ifdef REQUIRE_CHAR_CLOSE_QUOTE
1097               as_warn (_("missing close quote; (assumed)"));
1098 #else
1099               if (ch != EOF)
1100                 UNGET (ch);
1101 #endif
1102             }
1103           if (strlen (out_buf) == 1)
1104             {
1105               PUT (out_buf[0]);
1106               break;
1107             }
1108           if (state == 9)
1109             old_state = 3;
1110           else
1111             old_state = state;
1112           state = -1;
1113           out_string = out_buf;
1114           PUT (*out_string++);
1115           break;
1116
1117         case LEX_IS_COLON:
1118 #ifdef KEEP_WHITE_AROUND_COLON
1119           state = 9;
1120 #else
1121           if (state == 9 || state == 10)
1122             state = 3;
1123           else if (state != 3)
1124             state = 1;
1125 #endif
1126           PUT (ch);
1127           break;
1128
1129         case LEX_IS_NEWLINE:
1130           /* Roll out a bunch of newlines from inside comments, etc.  */
1131           if (add_newlines)
1132             {
1133               --add_newlines;
1134               UNGET (ch);
1135             }
1136           /* Fall through.  */
1137
1138         case LEX_IS_LINE_SEPARATOR:
1139           state = 0;
1140           PUT (ch);
1141           break;
1142
1143         case LEX_IS_PARALLEL_SEPARATOR:
1144           state = 1;
1145           PUT (ch);
1146           break;
1147
1148 #ifdef TC_V850
1149         case LEX_IS_DOUBLEDASH_1ST:
1150           ch2 = GET ();
1151           if (ch2 != '-')
1152             {
1153               if (ch2 != EOF)
1154                 UNGET (ch2);
1155               goto de_fault;
1156             }
1157           /* Read and skip to end of line.  */
1158           do
1159             {
1160               ch = GET ();
1161             }
1162           while (ch != EOF && ch != '\n');
1163
1164           if (ch == EOF)
1165             as_warn (_("end of file in comment; newline inserted"));
1166
1167           state = 0;
1168           PUT ('\n');
1169           break;
1170 #endif
1171 #ifdef DOUBLEBAR_PARALLEL
1172         case LEX_IS_DOUBLEBAR_1ST:
1173           ch2 = GET ();
1174           if (ch2 != EOF)
1175             UNGET (ch2);
1176           if (ch2 != '|')
1177             goto de_fault;
1178
1179           /* Handle '||' in two states as invoking PUT twice might
1180              result in the first one jumping out of this loop.  We'd
1181              then lose track of the state and one '|' char.  */
1182           state = 13;
1183           PUT ('|');
1184           break;
1185 #endif
1186         case LEX_IS_LINE_COMMENT_START:
1187           /* FIXME-someday: The two character comment stuff was badly
1188              thought out.  On i386, we want '/' as line comment start
1189              AND we want C style comments.  hence this hack.  The
1190              whole lexical process should be reworked.  xoxorich.  */
1191           if (ch == '/')
1192             {
1193               ch2 = GET ();
1194               if (ch2 == '*')
1195                 {
1196                   old_state = 3;
1197                   state = -2;
1198                   break;
1199                 }
1200               else if (ch2 != EOF)
1201                 {
1202                   UNGET (ch2);
1203                 }
1204             }
1205
1206           if (state == 0 || state == 1) /* Only comment at start of line.  */
1207             {
1208               int startch;
1209
1210               startch = ch;
1211
1212               do
1213                 {
1214                   ch = GET ();
1215                 }
1216               while (ch != EOF && IS_WHITESPACE (ch));
1217
1218               if (ch == EOF)
1219                 {
1220                   as_warn (_("end of file in comment; newline inserted"));
1221                   PUT ('\n');
1222                   break;
1223                 }
1224
1225               if (ch < '0' || ch > '9' || state != 0 || startch != '#')
1226                 {
1227                   /* Not a cpp line.  */
1228                   while (ch != EOF && !IS_NEWLINE (ch))
1229                     ch = GET ();
1230                   if (ch == EOF)
1231                     {
1232                       as_warn (_("end of file in comment; newline inserted"));
1233                       PUT ('\n');
1234                     }
1235                   else /* IS_NEWLINE (ch) */
1236                     {
1237                       /* To process non-zero add_newlines.  */
1238                       UNGET (ch);
1239                     }
1240                   state = 0;
1241                   break;
1242                 }
1243               /* Looks like `# 123 "filename"' from cpp.  */
1244               UNGET (ch);
1245               old_state = 4;
1246               state = -1;
1247               if (scrub_m68k_mri)
1248                 out_string = "\tlinefile ";
1249               else
1250                 out_string = "\t.linefile ";
1251               PUT (*out_string++);
1252               break;
1253             }
1254
1255 #ifdef TC_D10V
1256           /* All insns end in a char for which LEX_IS_SYMBOL_COMPONENT is true.
1257              Trap is the only short insn that has a first operand that is
1258              neither register nor label.
1259              We must prevent exef0f ||trap #1 to degenerate to exef0f ||trap#1 .
1260              We can't make '#' LEX_IS_SYMBOL_COMPONENT because it is
1261              already LEX_IS_LINE_COMMENT_START.  However, it is the
1262              only character in line_comment_chars for d10v, hence we
1263              can recognize it as such.  */
1264           /* An alternative approach would be to reset the state to 1 when
 1265              we see '||', '<-' or '->', but that seems to be overkill.  */
1266           if (state == 10)
1267             PUT (' ');
1268 #endif
1269           /* We have a line comment character which is not at the
1270              start of a line.  If this is also a normal comment
1271              character, fall through.  Otherwise treat it as a default
1272              character.  */
1273           if (strchr (tc_comment_chars, ch) == NULL
1274               && (! scrub_m68k_mri
1275                   || (ch != '!' && ch != '*')))
1276             goto de_fault;
1277           if (scrub_m68k_mri
1278               && (ch == '!' || ch == '*' || ch == '#')
1279               && state != 1
1280               && state != 10)
1281             goto de_fault;
1282           /* Fall through.  */
1283         case LEX_IS_COMMENT_START:
1284 #if defined TC_ARM && defined OBJ_ELF
1285           /* On the ARM, `@' is the comment character.
1286              Unfortunately this is also a special character in ELF .symver
1287              directives (and .type, though we deal with those another way).
1288              So we check if this line is such a directive, and treat
1289              the character as default if so.  This is a hack.  */
1290           if ((symver_state != NULL) && (*symver_state == 0))
1291             goto de_fault;
1292 #endif
1293
1294 #ifdef TC_ARM
1295           /* For the ARM, care is needed not to damage occurrences of \@
1296              by stripping the @ onwards.  Yuck.  */
1297           if ((to > tostart ? to[-1] : last_char) == '\\')
1298             /* Do not treat the @ as a start-of-comment.  */
1299             goto de_fault;
1300 #endif
1301
1302 #ifdef WARN_COMMENTS
1303           if (!found_comment)
1304             found_comment_file = as_where (&found_comment);
1305 #endif
1306           do
1307             {
1308               ch = GET ();
1309             }
1310           while (ch != EOF && !IS_NEWLINE (ch));
1311           if (ch == EOF)
1312             as_warn (_("end of file in comment; newline inserted"));
1313           state = 0;
1314           PUT ('\n');
1315           break;
1316
1317 #ifdef H_TICK_HEX
1318         case LEX_IS_H:
1319           /* Look for strings like H'[0-9A-Fa-f] and if found, replace
1320              the H' with 0x to make them gas-style hex characters.  */
1321           if (enable_h_tick_hex)
1322             {
1323               char quot;
1324
1325               quot = GET ();
1326               if (quot == '\'')
1327                 {
1328                   UNGET ('x');
1329                   ch = '0';
1330                 }
1331               else
1332                 UNGET (quot);
1333             }
1334 #endif
1335           /* Fall through.  */
1336
1337         case LEX_IS_SYMBOL_COMPONENT:
1338           if (state == 10)
1339             {
1340               /* This is a symbol character following another symbol
1341                  character, with whitespace in between.  We skipped
1342                  the whitespace earlier, so output it now.  */
1343               UNGET (ch);
1344               state = 3;
1345               PUT (' ');
1346               break;
1347             }
1348
1349 #ifdef TC_Z80
1350           /* "af'" is a symbol containing '\''.  */
1351           if (state == 3 && (ch == 'a' || ch == 'A'))
1352             {
1353               state = 16;
1354               PUT (ch);
1355               ch = GET ();
1356               if (ch == 'f' || ch == 'F')
1357                 {
1358                   state = 17;
1359                   PUT (ch);
1360                   break;
1361                 }
1362               else
1363                 {
1364                   state = 9;
1365                   if (ch == EOF || !IS_SYMBOL_COMPONENT (ch))
1366                     {
1367                       if (ch != EOF)
1368                         UNGET (ch);
1369                       break;
1370                     }
1371                 }
1372             }
1373 #endif
1374           if (state == 3)
1375             state = 9;
1376
1377           /* This is a common case.  Quickly copy CH and all the
1378              following symbol component or normal characters.  */
1379           if (to + 1 < toend
1380               && mri_state == NULL
1381 #if defined TC_ARM && defined OBJ_ELF
1382               && symver_state == NULL
1383 #endif
1384               )
1385             {
1386               char *s;
1387               ptrdiff_t len;
1388
1389               for (s = from; s < fromend; s++)
1390                 {
1391                   int type;
1392
1393                   ch2 = *(unsigned char *) s;
1394                   type = lex[ch2];
1395                   if (type != 0
1396                       && type != LEX_IS_SYMBOL_COMPONENT)
1397                     break;
1398                 }
1399
1400               if (s > from)
1401                 /* Handle the last character normally, for
1402                    simplicity.  */
1403                 --s;
1404
1405               len = s - from;
1406
1407               if (len > (toend - to) - 1)
1408                 len = (toend - to) - 1;
1409
1410               if (len > 0)
1411                 {
1412                   PUT (ch);
1413                   memcpy (to, from, len);
1414                   to += len;
1415                   from += len;
1416                   if (to >= toend)
1417                     goto tofull;
1418                   ch = GET ();
1419                 }
1420             }
1421
1422           /* Fall through.  */
1423         default:
1424         de_fault:
1425           /* Some relatively `normal' character.  */
1426           if (state == 0)
1427             {
1428               state = 11;       /* Now seeing label definition.  */
1429             }
1430           else if (state == 1)
1431             {
1432               state = 2;        /* Ditto.  */
1433             }
1434           else if (state == 9)
1435             {
1436               if (!IS_SYMBOL_COMPONENT (ch))
1437                 state = 3;
1438             }
1439           else if (state == 10)
1440             {
1441               if (ch == '\\')
1442                 {
1443                   /* Special handling for backslash: a backslash may
1444                      be the beginning of a formal parameter (of a
1445                      macro) following another symbol character, with
1446                      whitespace in between.  If that is the case, we
1447                      output a space before the parameter.  Strictly
1448                      speaking, correct handling depends upon what the
1449                      macro parameter expands into; if the parameter
1450                      expands into something which does not start with
1451                      an operand character, then we don't want to keep
1452                      the space.  We don't have enough information to
1453                      make the right choice, so here we are making the
1454                      choice which is more likely to be correct.  */
1455                   if (to + 1 >= toend)
1456                     {
1457                       /* If we're near the end of the buffer, save the
1458                          character for the next time round.  Otherwise
1459                          we'll lose our state.  */
1460                       UNGET (ch);
1461                       goto tofull;
1462                     }
1463                   *to++ = ' ';
1464                 }
1465
1466               state = 3;
1467             }
1468           PUT (ch);
1469           break;
1470         }
1471     }
1472
1473   /*NOTREACHED*/
1474
1475  fromeof:
1476   /* We have reached the end of the input.  */
1477 #ifdef TC_ARM
1478   if (to > tostart)
1479     last_char = to[-1];
1480 #endif
1481   return to - tostart;
1482
1483  tofull:
1484   /* The output buffer is full.  Save any input we have not yet
1485      processed.  */
1486   if (fromend > from)
1487     {
1488       saved_input = from;
1489       saved_input_len = fromend - from;
1490     }
1491   else
1492     saved_input = NULL;
1493
1494 #ifdef TC_ARM
1495   if (to > tostart)
1496     last_char = to[-1];
1497 #endif
1498   return to - tostart;
1499 }