gas/app.c

   1 /* This is the Assembler Pre-Processor
   2    Copyright (C) 1987, 1990, 1991, 1992 Free Software Foundation, Inc.
   3
   4    This file is part of GAS, the GNU Assembler.
   5
   6    GAS is free software; you can redistribute it and/or modify
   7    it under the terms of the GNU General Public License as published by
   8    the Free Software Foundation; either version 2, or (at your option)
   9    any later version.
  10
  11    GAS is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14    GNU General Public License for more details.
  15
  16    You should have received a copy of the GNU General Public License
  17    along with GAS; see the file COPYING.  If not, write to
  18    the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.  */
  19
  20 /* Modified by Allen Wirfs-Brock, Instantiations Inc 2/90 */
/* App, the assembler pre-processor.  This pre-processor strips out excess
   spaces, turns single-quoted characters into a decimal constant, and turns
   # <number> <filename> <garbage> into a .appline <number>\n.appfile <filename>
   pair.  This needs better error-handling.
   */
  26
  27 #include <stdio.h>
  28 #include "as.h"                 /* For BAD_CASE() only */
  29
  30 #if (__STDC__ != 1) && !defined(const)
  31 #define const                   /* Nothing */
  32 #endif
  33
  34 static char lex[256];
  35 static const char symbol_chars[] =
  36 "$._ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789";
  37
  38 #define LEX_IS_SYMBOL_COMPONENT         1
  39 #define LEX_IS_WHITESPACE               2
  40 #define LEX_IS_LINE_SEPARATOR           3
  41 #define LEX_IS_COMMENT_START            4
  42 #define LEX_IS_LINE_COMMENT_START       5
  43 #define LEX_IS_TWOCHAR_COMMENT_1ST      6
  44 #define LEX_IS_TWOCHAR_COMMENT_2ND      7
  45 #define LEX_IS_STRINGQUOTE              8
  46 #define LEX_IS_COLON                    9
  47 #define LEX_IS_NEWLINE                  10
  48 #define LEX_IS_ONECHAR_QUOTE            11
  49 #define IS_SYMBOL_COMPONENT(c)          (lex[c] == LEX_IS_SYMBOL_COMPONENT)
  50 #define IS_WHITESPACE(c)                (lex[c] == LEX_IS_WHITESPACE)
  51 #define IS_LINE_SEPARATOR(c)            (lex[c] == LEX_IS_LINE_SEPARATOR)
  52 #define IS_COMMENT(c)                   (lex[c] == LEX_IS_COMMENT_START)
  53 #define IS_LINE_COMMENT(c)              (lex[c] == LEX_IS_LINE_COMMENT_START)
  54 #define IS_NEWLINE(c)                   (lex[c] == LEX_IS_NEWLINE)
  55
  56 static int process_escape PARAMS ((int));
  57
  58 /* FIXME-soon: The entire lexer/parser thingy should be
  59    built statically at compile time rather than dynamically
  60    each and every time the assembler is run.  xoxorich. */
  61
  62 void
  63 do_scrub_begin ()
  64 {
  65   const char *p;
  66
  67   lex[' '] = LEX_IS_WHITESPACE;
  68   lex['\t'] = LEX_IS_WHITESPACE;
  69   lex['\n'] = LEX_IS_NEWLINE;
  70   lex[';'] = LEX_IS_LINE_SEPARATOR;
  71   lex['"'] = LEX_IS_STRINGQUOTE;
  72 #ifndef TC_HPPA
  73   lex['\''] = LEX_IS_ONECHAR_QUOTE;
  74 #endif
  75   lex[':'] = LEX_IS_COLON;
  76
  77
  78
  79 #ifdef SINGLE_QUOTE_STRINGS
  80         lex['\''] = LEX_IS_STRINGQUOTE;
  81 #endif
  82
  83   /* Note that these override the previous defaults, e.g. if ';'
  84
  85            is a comment char, then it isn't a line separator.  */
  86   for (p = symbol_chars; *p; ++p)
  87     {
  88       lex[(unsigned char) *p] = LEX_IS_SYMBOL_COMPONENT;
  89     }                           /* declare symbol characters */
  90
  91   for (p = comment_chars; *p; p++)
  92     {
  93       lex[(unsigned char) *p] = LEX_IS_COMMENT_START;
  94     }                           /* declare comment chars */
  95
  96   for (p = line_comment_chars; *p; p++)
  97     {
  98       lex[(unsigned char) *p] = LEX_IS_LINE_COMMENT_START;
  99     }                           /* declare line comment chars */
 100
 101   for (p = line_separator_chars; *p; p++)
 102     {
 103       lex[(unsigned char) *p] = LEX_IS_LINE_SEPARATOR;
 104     }                           /* declare line separators */
 105
 106   /* Only allow slash-star comments if slash is not in use */
 107   if (lex['/'] == 0)
 108     {
 109       lex['/'] = LEX_IS_TWOCHAR_COMMENT_1ST;
 110     }
 111   /* FIXME-soon.  This is a bad hack but otherwise, we
 112            can't do c-style comments when '/' is a line
 113            comment char. xoxorich. */
 114   if (lex['*'] == 0)
 115     {
 116       lex['*'] = LEX_IS_TWOCHAR_COMMENT_2ND;
 117     }
 118 }                               /* do_scrub_begin() */
 119
 120 FILE *scrub_file;
 121
 122 int
 123 scrub_from_file ()
 124 {
 125   return getc (scrub_file);
 126 }
 127
 128 void
 129 scrub_to_file (ch)
 130      int ch;
 131 {
 132   ungetc (ch, scrub_file);
 133 }                               /* scrub_to_file() */
 134
 135 char *scrub_string;
 136 char *scrub_last_string;
 137
 138 int
 139 scrub_from_string ()
 140 {
 141   return scrub_string == scrub_last_string ? EOF : *scrub_string++;
 142 }                               /* scrub_from_string() */
 143
 144 void
 145 scrub_to_string (ch)
 146      int ch;
 147 {
 148   *--scrub_string = ch;
 149 }                               /* scrub_to_string() */
 150
 151 /* Saved state of the scrubber */
 152 static int state;
 153 static int old_state;
 154 static char *out_string;
 155 static char out_buf[20];
 156 static int add_newlines = 0;
 157
 158 /* Data structure for saving the state of app across #include's.  Note that
 159    app is called asynchronously to the parsing of the .include's, so our
 160    state at the time .include is interpreted is completely unrelated.
 161    That's why we have to save it all.  */
 162
 163 struct app_save
 164   {
 165     int state;
 166     int old_state;
 167     char *out_string;
 168     char out_buf[sizeof (out_buf)];
 169     int add_newlines;
 170     char *scrub_string;
 171     char *scrub_last_string;
 172     FILE *scrub_file;
 173   };
 174
 175 char *
 176 app_push ()
 177 {
 178   register struct app_save *saved;
 179
 180   saved = (struct app_save *) xmalloc (sizeof (*saved));
 181   saved->state = state;
 182   saved->old_state = old_state;
 183   saved->out_string = out_string;
 184   memcpy (saved->out_buf, out_buf, sizeof (out_buf));
 185   saved->add_newlines = add_newlines;
 186   saved->scrub_string = scrub_string;
 187   saved->scrub_last_string = scrub_last_string;
 188   saved->scrub_file = scrub_file;
 189
 190   /* do_scrub_begin() is not useful, just wastes time. */
 191   return (char *) saved;
 192 }
 193
 194 void
 195 app_pop (arg)
 196      char *arg;
 197 {
 198   register struct app_save *saved = (struct app_save *) arg;
 199
 200   /* There is no do_scrub_end (). */
 201   state = saved->state;
 202   old_state = saved->old_state;
 203   out_string = saved->out_string;
 204   memcpy (out_buf, saved->out_buf, sizeof (out_buf));
 205   add_newlines = saved->add_newlines;
 206   scrub_string = saved->scrub_string;
 207   scrub_last_string = saved->scrub_last_string;
 208   scrub_file = saved->scrub_file;
 209
 210   free (arg);
 211 }                               /* app_pop() */
 212
 213 /* @@ This assumes that \n &c are the same on host and target.  This is not
 214    necessarily true.  */
 215 static int
 216 process_escape (ch)
 217      int ch;
 218 {
 219   switch (ch)
 220     {
 221     case 'b':
 222       return '\b';
 223     case 'f':
 224       return '\f';
 225     case 'n':
 226       return '\n';
 227     case 'r':
 228       return '\r';
 229     case 't':
 230       return '\t';
 231     case '\'':
 232       return '\'';
 233     case '"':
 234       return '\"';
 235     default:
 236       return ch;
 237     }
 238 }
 239 int
 240 do_scrub_next_char (get, unget)
 241      int (*get) ();
 242      void (*unget) ();
 243 {
 244   /*State 0: beginning of normal line
 245           1: After first whitespace on line (flush more white)
 246           2: After first non-white (opcode) on line (keep 1white)
 247           3: after second white on line (into operands) (flush white)
 248           4: after putting out a .line, put out digits
 249           5: parsing a string, then go to old-state
 250           6: putting out \ escape in a "d string.
 251           7: After putting out a .appfile, put out string.
 252           8: After putting out a .appfile string, flush until newline.
 253           9: After seeing symbol char in state 3 (keep 1white after symchar)
 254          10: After seeing whitespace in state 9 (keep white before symchar)
 255           -1: output string in out_string and go to the state in old_state
 256           -2: flush text until a '*' '/' is seen, then go to state old_state
 257           */
 258
 259   /* I added states 9 and 10 because the MIPS ECOFF assembler uses
 260      constructs like ``.loc 1 20''.  This was turning into ``.loc
 261      120''.  States 9 and 10 ensure that a space is never dropped in
 262      between characters which could appear in a identifier.  Ian
 263      Taylor, ian@cygnus.com.  */
 264
 265   register int ch, ch2 = 0;
 266   int not_cpp_line = 0;
 267
 268   switch (state)
 269     {
 270     case -1:
 271       ch = *out_string++;
 272       if (*out_string == 0)
 273         {
 274           state = old_state;
 275           old_state = 3;
 276         }
 277       return ch;
 278
 279     case -2:
 280       for (;;)
 281         {
 282           do
 283             {
 284               ch = (*get) ();
 285             }
 286           while (ch != EOF && ch != '\n' && ch != '*');
 287           if (ch == '\n' || ch == EOF)
 288             return ch;
 289
 290           /* At this point, ch must be a '*' */
 291           while ((ch = (*get) ()) == '*')
 292             {
 293               ;
 294             }
 295           if (ch == EOF || ch == '/')
 296             break;
 297           (*unget) (ch);
 298         }
 299       state = old_state;
 300       return ' ';
 301
 302     case 4:
 303       ch = (*get) ();
 304       if (ch == EOF || (ch >= '0' && ch <= '9'))
 305         return ch;
 306       else
 307         {
 308           while (ch != EOF && IS_WHITESPACE (ch))
 309             ch = (*get) ();
 310           if (ch == '"')
 311             {
 312               (*unget) (ch);
 313               out_string = "\n\t.appfile ";
 314               old_state = 7;
 315               state = -1;
 316               return *out_string++;
 317             }
 318           else
 319             {
 320               while (ch != EOF && ch != '\n')
 321                 ch = (*get) ();
 322               state = 0;
 323               return ch;
 324             }
 325         }
 326
 327     case 5:
 328       ch = (*get) ();
 329       if (lex[ch] == LEX_IS_STRINGQUOTE)
 330         {
 331           state = old_state;
 332           return ch;
 333         }
 334       else if (ch == '\\')
 335         {
 336           state = 6;
 337           return ch;
 338         }
 339       else if (ch == EOF)
 340         {
 341           as_warn ("End of file in string: inserted '\"'");
 342           state = old_state;
 343           (*unget) ('\n');
 344           return '"';
 345         }
 346       else
 347         {
 348           return ch;
 349         }
 350
 351     case 6:
 352       state = 5;
 353       ch = (*get) ();
 354       switch (ch)
 355         {
 356           /* Handle strings broken across lines, by turning '\n' into
 357              '\\' and 'n'.  */
 358         case '\n':
 359           (*unget) ('n');
 360           add_newlines++;
 361           return '\\';
 362
 363         case '"':
 364         case '\\':
 365         case 'b':
 366         case 'f':
 367         case 'n':
 368         case 'r':
 369         case 't':
 370 #ifdef BACKSLASH_V
 371         case 'v':
 372 #endif /* BACKSLASH_V */
 373         case 'x':
 374         case 'X':
 375         case '0':
 376         case '1':
 377         case '2':
 378         case '3':
 379         case '4':
 380         case '5':
 381         case '6':
 382         case '7':
 383           break;
 384 #if defined(IGNORE_NONSTANDARD_ESCAPES) | defined(ONLY_STANDARD_ESCAPES)
 385         default:
 386           as_warn ("Unknown escape '\\%c' in string: Ignored", ch);
 387           break;
 388 #else /* ONLY_STANDARD_ESCAPES */
 389         default:
 390           /* Accept \x as x for any x */
 391           break;
 392 #endif /* ONLY_STANDARD_ESCAPES */
 393
 394         case EOF:
 395           as_warn ("End of file in string: '\"' inserted");
 396           return '"';
 397         }
 398       return ch;
 399
 400     case 7:
 401       ch = (*get) ();
 402       state = 5;
 403       old_state = 8;
 404       return ch;
 405
 406     case 8:
 407       do
 408         ch = (*get) ();
 409       while (ch != '\n');
 410       state = 0;
 411       return ch;
 412     }
 413
 414   /* OK, we are somewhere in states 0 through 4 or 9 through 10 */
 415
 416   /* flushchar: */
 417   ch = (*get) ();
 418 recycle:
 419   if (ch == EOF)
 420     {
 421       if (state != 0)
 422         as_warn ("End of file not at end of a line: Newline inserted.");
 423       return ch;
 424     }
 425
 426   switch (lex[ch])
 427     {
 428     case LEX_IS_WHITESPACE:
 429       do
 430         /* Preserve a single whitespace character at the beginning of
 431            a line.  */
 432         if (state == 0)
 433           {
 434             state = 1;
 435             return ch;
 436           }
 437         else
 438           ch = (*get) ();
 439       while (ch != EOF && IS_WHITESPACE (ch));
 440       if (ch == EOF)
 441         return ch;
 442
 443       if (IS_COMMENT (ch) || (state == 0 && IS_LINE_COMMENT (ch)) || ch == '/' || IS_LINE_SEPARATOR (ch))
 444         {
 445           /* cpp never outputs a leading space before the #, so try to
 446              avoid being confused.  */
 447           not_cpp_line = 1;
 448           goto recycle;
 449         }
 450
 451       /* If we're in state 2, we've seen a non-white
 452          character followed by whitespace.  If the next
 453          character is ':', this is whitespace after a label
 454          name which we can ignore.  */
 455       if (state == 2 && lex[ch] == LEX_IS_COLON)
 456         {
 457           state = 0;
 458           return ch;
 459         }
 460
 461 #if defined (LABELS_WITHOUT_COLONS) || defined (MRI)
 462       /* Like above, but handles case where labels are not
 463          required to have colons (and therefore must be identified
 464          by their *position* in the input stream.)  For a testcase
 465          see hppa/more.parse/labelbug.s.
 466
 467          This also has the effect of sometimes leaving a whitespace
 468          before a newline.  Instead of trying to rework this horribly
 469          broken and hairy code I'm just going to zap the extra space here.  */
 470       if (state == 2 && lex[ch] == LEX_IS_SYMBOL_COMPONENT)
 471         {
 472           (*unget) (ch);
 473           return ' ';
 474         }
 475
 476       /* Don't emit a space before a newline.  */
 477       if (state == 2 && lex[ch] == LEX_IS_NEWLINE)
 478         {
 479           state = 0;
 480           return ch;
 481         }
 482 #endif
 483
 484       switch (state)
 485         {
 486         case 0:
 487           state++;
 488           goto recycle;         /* Punted leading sp */
 489         case 1:
 490           /* We can arrive here if we leave a leading whitespace character
 491              at the beginning of a line.  */
 492           goto recycle;
 493         case 2:
 494           state = 3;
 495           (*unget) (ch);
 496           return ' ';           /* Sp after opco */
 497         case 3:
 498           goto recycle;         /* Sp in operands */
 499         case 9:
 500         case 10:
 501           state = 10;           /* Sp after symbol char */
 502           goto recycle;
 503         default:
 504           BAD_CASE (state);
 505         }
 506       break;
 507
 508     case LEX_IS_TWOCHAR_COMMENT_1ST:
 509       ch2 = (*get) ();
 510       if (ch2 != EOF && lex[ch2] == LEX_IS_TWOCHAR_COMMENT_2ND)
 511         {
 512           for (;;)
 513             {
 514               do
 515                 {
 516                   ch2 = (*get) ();
 517                   if (ch2 != EOF && IS_NEWLINE (ch2))
 518                     add_newlines++;
 519                 }
 520               while (ch2 != EOF &&
 521                      (lex[ch2] != LEX_IS_TWOCHAR_COMMENT_2ND));
 522
 523               while (ch2 != EOF &&
 524                      (lex[ch2] == LEX_IS_TWOCHAR_COMMENT_2ND))
 525                 {
 526                   ch2 = (*get) ();
 527                 }
 528
 529               if (ch2 == EOF
 530                   || lex[ch2] == LEX_IS_TWOCHAR_COMMENT_1ST)
 531                 break;
 532               (*unget) (ch);
 533             }
 534           if (ch2 == EOF)
 535             as_warn ("End of file in multiline comment");
 536
 537           ch = ' ';
 538           goto recycle;
 539         }
 540       else
 541         {
 542           if (ch2 != EOF)
 543             (*unget) (ch2);
 544           if (state == 9 || state == 10)
 545             state = 3;
 546           return ch;
 547         }
 548       break;
 549
 550     case LEX_IS_STRINGQUOTE:
 551       if (state == 9 || state == 10)
 552         old_state = 3;
 553       else
 554         old_state = state;
 555       state = 5;
 556       return ch;
 557 #ifndef MRI
 558 #ifndef IEEE_STYLE
 559     case LEX_IS_ONECHAR_QUOTE:
 560       ch = (*get) ();
 561       if (ch == EOF)
 562         {
 563           as_warn ("End-of-file after a one-character quote; \\000 inserted");
 564           ch = 0;
 565         }
 566       if (ch == '\\')
 567         {
 568           ch = (*get) ();
 569           ch = process_escape (ch);
 570         }
 571       sprintf (out_buf, "%d", (int) (unsigned char) ch);
 572
 573
 574       /* None of these 'x constants for us.  We want 'x'.  */
 575       if ((ch = (*get) ()) != '\'')
 576         {
 577 #ifdef REQUIRE_CHAR_CLOSE_QUOTE
 578           as_warn ("Missing close quote: (assumed)");
 579 #else
 580           (*unget) (ch);
 581 #endif
 582         }
 583       if (strlen (out_buf) == 1)
 584         {
 585           return out_buf[0];
 586         }
 587       if (state == 9 || state == 10)
 588         old_state = 3;
 589       else
 590         old_state = state;
 591       state = -1;
 592       out_string = out_buf;
 593       return *out_string++;
 594 #endif
 595 #endif
 596     case LEX_IS_COLON:
 597       if (state == 9 || state == 10)
 598         state = 3;
 599       else if (state != 3)
 600         state = 0;
 601       return ch;
 602
 603     case LEX_IS_NEWLINE:
 604       /* Roll out a bunch of newlines from inside comments, etc.  */
 605       if (add_newlines)
 606         {
 607           --add_newlines;
 608           (*unget) (ch);
 609         }
 610       /* fall thru into... */
 611
 612     case LEX_IS_LINE_SEPARATOR:
 613       state = 0;
 614       return ch;
 615
 616     case LEX_IS_LINE_COMMENT_START:
 617       if (state == 0)           /* Only comment at start of line.  */
 618         {
 619           /* FIXME-someday: The two character comment stuff was badly
 620              thought out.  On i386, we want '/' as line comment start
 621              AND we want C style comments.  hence this hack.  The
 622              whole lexical process should be reworked.  xoxorich.  */
 623           if (ch == '/')
 624             {
 625               ch2 = (*get) ();
 626               if (ch2 == '*')
 627                 {
 628                   state = -2;
 629                   return (do_scrub_next_char (get, unget));
 630                 }
 631               else
 632                 {
 633                   (*unget) (ch2);
 634                 }
 635             }                   /* bad hack */
 636
 637           if (ch != '#')
 638             not_cpp_line = 1;
 639
 640           do
 641             ch = (*get) ();
 642           while (ch != EOF && IS_WHITESPACE (ch));
 643           if (ch == EOF)
 644             {
 645               as_warn ("EOF in comment:  Newline inserted");
 646               return '\n';
 647             }
 648           if (ch < '0' || ch > '9' || not_cpp_line)
 649             {
 650               /* Non-numerics:  Eat whole comment line */
 651               while (ch != EOF && !IS_NEWLINE (ch))
 652                 ch = (*get) ();
 653               if (ch == EOF)
 654                 as_warn ("EOF in Comment: Newline inserted");
 655               state = 0;
 656               return '\n';
 657             }
 658           /* Numerics begin comment.  Perhaps CPP `# 123 "filename"' */
 659           (*unget) (ch);
 660           old_state = 4;
 661           state = -1;
 662           out_string = "\t.appline ";
 663           return *out_string++;
 664         }
 665
 666       /* We have a line comment character which is not at the start of
 667          a line.  If this is also a normal comment character, fall
 668          through.  Otherwise treat it as a default character.  */
 669       if (strchr (comment_chars, ch) == NULL)
 670         goto de_fault;
 671       /* Fall through.  */
 672     case LEX_IS_COMMENT_START:
 673       do
 674         ch = (*get) ();
 675       while (ch != EOF && !IS_NEWLINE (ch));
 676       if (ch == EOF)
 677         as_warn ("EOF in comment:  Newline inserted");
 678       state = 0;
 679       return '\n';
 680
 681     case LEX_IS_SYMBOL_COMPONENT:
 682       if (state == 10)
 683         {
 684           /* This is a symbol character following another symbol
 685              character, with whitespace in between.  We skipped the
 686              whitespace earlier, so output it now.  */
 687           (*unget) (ch);
 688           state = 3;
 689           return ' ';
 690         }
 691       if (state == 3)
 692         state = 9;
 693       /* Fall through.  */
 694     default:
 695     de_fault:
 696       /* Some relatively `normal' character.  */
 697       if (state == 0)
 698         {
 699           state = 2;            /* Now seeing opcode */
 700           return ch;
 701         }
 702       else if (state == 1)
 703         {
 704           state = 2;            /* Ditto */
 705           return ch;
 706         }
 707       else if (state == 9)
 708         {
 709           if (lex[ch] != LEX_IS_SYMBOL_COMPONENT)
 710             state = 3;
 711           return ch;
 712         }
 713       else if (state == 10)
 714         {
 715           state = 3;
 716           return ch;
 717         }
 718       else
 719         {
 720           return ch;            /* Opcode or operands already */
 721         }
 722     }
 723   return -1;
 724 }
 725
 726 #ifdef TEST
 727
 728 const char comment_chars[] = "|";
 729 const char line_comment_chars[] = "#";
 730
 731 main ()
 732 {
 733   int ch;
 734
 735   app_begin ();
 736   while ((ch = do_scrub_next_char (stdin)) != EOF)
 737     putc (ch, stdout);
 738 }
 739
 740 as_warn (str)
 741      char *str;
 742 {
 743   fputs (str, stderr);
 744   putc ('\n', stderr);
 745 }
 746
 747 #endif
 748
 749 /* end of app.c */