gas/app.c

   1 /* This is the Assembler Pre-Processor
   2    Copyright (C) 1987, 1990, 1991, 1992, 1994 Free Software Foundation, Inc.
   3
   4    This file is part of GAS, the GNU Assembler.
   5
   6    GAS is free software; you can redistribute it and/or modify
   7    it under the terms of the GNU General Public License as published by
   8    the Free Software Foundation; either version 2, or (at your option)
   9    any later version.
  10
  11    GAS is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14    GNU General Public License for more details.
  15
  16    You should have received a copy of the GNU General Public License
  17    along with GAS; see the file COPYING.  If not, write to
  18    the Free Software Foundation, 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.  */
  19
  20 /* Modified by Allen Wirfs-Brock, Instantiations Inc 2/90 */
  21 /* App, the assembler pre-processor.  This pre-processor strips out excess
  22    spaces, turns single-quoted characters into a decimal constant, and turns
  23    # <number> "<filename>" <garbage> into a .appline <number>\n.appfile
  24    "<filename>" pair.  This needs better error-handling.  */
  25
  26 #include <stdio.h>
  27 #include "as.h"                 /* For BAD_CASE() only */
  28
  29 #if (__STDC__ != 1)
  30 #ifndef const
  31 #define const  /* empty */
  32 #endif
  33 #endif
  34
/* Character classification table used by the scrubber.  It is indexed by
   character code and holds one of the LEX_IS_* codes below.  Being static
   it starts out all zero; do_scrub_begin () fills it in, and a zero entry
   is handled as an ordinary character by do_scrub_next_char ().  */
static char lex[256];

/* Characters that do_scrub_begin () marks as LEX_IS_SYMBOL_COMPONENT.  */
static const char symbol_chars[] =
"$._ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789";

/* Classification codes stored in lex[].  */
#define LEX_IS_SYMBOL_COMPONENT         1
#define LEX_IS_WHITESPACE               2
#define LEX_IS_LINE_SEPARATOR           3
#define LEX_IS_COMMENT_START            4
#define LEX_IS_LINE_COMMENT_START       5
#define LEX_IS_TWOCHAR_COMMENT_1ST      6
#define LEX_IS_TWOCHAR_COMMENT_2ND      7
#define LEX_IS_STRINGQUOTE              8
#define LEX_IS_COLON                    9
#define LEX_IS_NEWLINE                  10
#define LEX_IS_ONECHAR_QUOTE            11

/* Classification predicates.  Each one indexes lex[] directly with its
   argument, so the argument must be a valid index (0..255); in particular
   callers must rule out EOF before using them.  */
#define IS_SYMBOL_COMPONENT(c)          (lex[c] == LEX_IS_SYMBOL_COMPONENT)
#define IS_WHITESPACE(c)                (lex[c] == LEX_IS_WHITESPACE)
#define IS_LINE_SEPARATOR(c)            (lex[c] == LEX_IS_LINE_SEPARATOR)
#define IS_COMMENT(c)                   (lex[c] == LEX_IS_COMMENT_START)
#define IS_LINE_COMMENT(c)              (lex[c] == LEX_IS_LINE_COMMENT_START)
#define IS_NEWLINE(c)                   (lex[c] == LEX_IS_NEWLINE)

/* Maps the character following a backslash in a 'c constant to the
   character it denotes; defined further down in this file.  */
static int process_escape PARAMS ((int));
  58
  59 /* FIXME-soon: The entire lexer/parser thingy should be
  60    built statically at compile time rather than dynamically
  61    each and every time the assembler is run.  xoxorich. */
  62
  63 void
  64 do_scrub_begin ()
  65 {
  66   const char *p;
  67
  68   lex[' '] = LEX_IS_WHITESPACE;
  69   lex['\t'] = LEX_IS_WHITESPACE;
  70   lex['\n'] = LEX_IS_NEWLINE;
  71   lex[';'] = LEX_IS_LINE_SEPARATOR;
  72   lex['"'] = LEX_IS_STRINGQUOTE;
  73 #ifndef TC_HPPA
  74   lex['\''] = LEX_IS_ONECHAR_QUOTE;
  75 #endif
  76   lex[':'] = LEX_IS_COLON;
  77
  78
  79
  80 #ifdef SINGLE_QUOTE_STRINGS
  81   lex['\''] = LEX_IS_STRINGQUOTE;
  82 #endif
  83
  84   /* Note that these override the previous defaults, e.g. if ';' is a
  85      comment char, then it isn't a line separator.  */
  86   for (p = symbol_chars; *p; ++p)
  87     {
  88       lex[(unsigned char) *p] = LEX_IS_SYMBOL_COMPONENT;
  89     }                           /* declare symbol characters */
  90
  91   for (p = comment_chars; *p; p++)
  92     {
  93       lex[(unsigned char) *p] = LEX_IS_COMMENT_START;
  94     }                           /* declare comment chars */
  95
  96   for (p = line_comment_chars; *p; p++)
  97     {
  98       lex[(unsigned char) *p] = LEX_IS_LINE_COMMENT_START;
  99     }                           /* declare line comment chars */
 100
 101   for (p = line_separator_chars; *p; p++)
 102     {
 103       lex[(unsigned char) *p] = LEX_IS_LINE_SEPARATOR;
 104     }                           /* declare line separators */
 105
 106   /* Only allow slash-star comments if slash is not in use */
 107   if (lex['/'] == 0)
 108     {
 109       lex['/'] = LEX_IS_TWOCHAR_COMMENT_1ST;
 110     }
 111   /* FIXME-soon.  This is a bad hack but otherwise, we can't do
 112      c-style comments when '/' is a line comment char. xoxorich. */
 113   if (lex['*'] == 0)
 114     {
 115       lex['*'] = LEX_IS_TWOCHAR_COMMENT_2ND;
 116     }
 117 }                               /* do_scrub_begin() */
 118
 119 FILE *scrub_file;
 120
 121 int
 122 scrub_from_file ()
 123 {
 124   return getc (scrub_file);
 125 }
 126
 127 void
 128 scrub_to_file (ch)
 129      int ch;
 130 {
 131   ungetc (ch, scrub_file);
 132 }                               /* scrub_to_file() */
 133
 134 char *scrub_string;
 135 char *scrub_last_string;
 136
 137 int
 138 scrub_from_string ()
 139 {
 140   return scrub_string == scrub_last_string ? EOF : *scrub_string++;
 141 }                               /* scrub_from_string() */
 142
 143 void
 144 scrub_to_string (ch)
 145      int ch;
 146 {
 147   *--scrub_string = ch;
 148 }                               /* scrub_to_string() */
 149
/* Saved state of the scrubber.  do_scrub_next_char () produces one
   character per call, so everything it needs to remember between calls
   lives in these file-scope variables.  */

/* Current state of the state machine; the values are listed at the top
   of do_scrub_next_char ().  */
static int state;
/* State to switch to when the current special state (string, pending
   output, comment flush) is finished.  */
static int old_state;
/* While in state -1: the next character of pending output to emit.  */
static char *out_string;
/* Receives the decimal text produced for a 'c character constant.  */
static char out_buf[20];
/* Count of newlines that were consumed inside comments or continued
   strings and are re-emitted when the next newline is processed.  */
static int add_newlines = 0;
 156
/* Data structure for saving the state of app across .include's.  Note that
   app is called asynchronously to the parsing of the .include's, so our
   state at the time .include is interpreted is completely unrelated.
   That's why we have to save it all.

   Each member mirrors the file-scope variable of the same name; app_push ()
   fills one of these in and app_pop () copies it back.  */

struct app_save
  {
    int state;                  /* Copy of `state'.  */
    int old_state;              /* Copy of `old_state'.  */
    char *out_string;           /* Copy of `out_string' (the pointer only).  */
    char out_buf[sizeof (out_buf)];     /* Copy of the contents of `out_buf'.  */
    int add_newlines;           /* Copy of `add_newlines'.  */
    char *scrub_string;         /* Copy of `scrub_string'.  */
    char *scrub_last_string;    /* Copy of `scrub_last_string'.  */
    FILE *scrub_file;           /* Copy of `scrub_file'.  */
  };
 173
 174 char *
 175 app_push ()
 176 {
 177   register struct app_save *saved;
 178
 179   saved = (struct app_save *) xmalloc (sizeof (*saved));
 180   saved->state = state;
 181   saved->old_state = old_state;
 182   saved->out_string = out_string;
 183   memcpy (saved->out_buf, out_buf, sizeof (out_buf));
 184   saved->add_newlines = add_newlines;
 185   saved->scrub_string = scrub_string;
 186   saved->scrub_last_string = scrub_last_string;
 187   saved->scrub_file = scrub_file;
 188
 189   /* do_scrub_begin() is not useful, just wastes time. */
 190   return (char *) saved;
 191 }
 192
 193 void
 194 app_pop (arg)
 195      char *arg;
 196 {
 197   register struct app_save *saved = (struct app_save *) arg;
 198
 199   /* There is no do_scrub_end (). */
 200   state = saved->state;
 201   old_state = saved->old_state;
 202   out_string = saved->out_string;
 203   memcpy (out_buf, saved->out_buf, sizeof (out_buf));
 204   add_newlines = saved->add_newlines;
 205   scrub_string = saved->scrub_string;
 206   scrub_last_string = saved->scrub_last_string;
 207   scrub_file = saved->scrub_file;
 208
 209   free (arg);
 210 }                               /* app_pop() */
 211
 212 /* @@ This assumes that \n &c are the same on host and target.  This is not
 213    necessarily true.  */
 214 static int
 215 process_escape (ch)
 216      int ch;
 217 {
 218   switch (ch)
 219     {
 220     case 'b':
 221       return '\b';
 222     case 'f':
 223       return '\f';
 224     case 'n':
 225       return '\n';
 226     case 'r':
 227       return '\r';
 228     case 't':
 229       return '\t';
 230     case '\'':
 231       return '\'';
 232     case '"':
 233       return '\"';
 234     default:
 235       return ch;
 236     }
 237 }
 238 int
 239 do_scrub_next_char (get, unget)
 240      int (*get) ();
 241      void (*unget) ();
 242 {
 243   /*State 0: beginning of normal line
 244           1: After first whitespace on line (flush more white)
 245           2: After first non-white (opcode) on line (keep 1white)
 246           3: after second white on line (into operands) (flush white)
 247           4: after putting out a .line, put out digits
 248           5: parsing a string, then go to old-state
 249           6: putting out \ escape in a "d string.
 250           7: After putting out a .appfile, put out string.
 251           8: After putting out a .appfile string, flush until newline.
 252           9: After seeing symbol char in state 3 (keep 1white after symchar)
 253          10: After seeing whitespace in state 9 (keep white before symchar)
 254          11: After seeing a symbol character in state 0 (eg a label definition)
 255          -1: output string in out_string and go to the state in old_state
 256          -2: flush text until a '*' '/' is seen, then go to state old_state
 257           */
 258
 259   /* I added states 9 and 10 because the MIPS ECOFF assembler uses
 260      constructs like ``.loc 1 20''.  This was turning into ``.loc
 261      120''.  States 9 and 10 ensure that a space is never dropped in
 262      between characters which could appear in a identifier.  Ian
 263      Taylor, ian@cygnus.com.
 264
 265      I added state 11 so that something like "Lfoo add %r25,%r26,%r27" works
 266      correctly on the PA (and any other target where colons are optional).
 267      Jeff Law, law@cs.utah.edu.  */
 268
 269   /* This is purely an optimization hack, and relies on gcc's inlining
 270      capability.  */
 271 #if defined (__GNUC__) && defined (__OPTIMIZE__)
 272 #define GET()   (get == scrub_from_file ? scrub_from_file () : (*get) ())
 273 #else
 274 #define GET()   ((*get) ())
 275 #endif
 276
 277   register int ch, ch2 = 0;
 278   int not_cpp_line = 0;
 279
 280   switch (state)
 281     {
 282     case -1:
 283       ch = *out_string++;
 284       if (*out_string == 0)
 285         {
 286           state = old_state;
 287           old_state = 3;
 288         }
 289       return ch;
 290
 291     case -2:
 292       for (;;)
 293         {
 294           do
 295             {
 296               ch = GET ();
 297             }
 298           while (ch != EOF && ch != '\n' && ch != '*');
 299           if (ch == '\n' || ch == EOF)
 300             return ch;
 301
 302           /* At this point, ch must be a '*' */
 303           while ((ch = GET ()) == '*')
 304             {
 305               ;
 306             }
 307           if (ch == EOF || ch == '/')
 308             break;
 309           (*unget) (ch);
 310         }
 311       state = old_state;
 312       return ' ';
 313
 314     case 4:
 315       ch = GET ();
 316       if (ch == EOF || (ch >= '0' && ch <= '9'))
 317         return ch;
 318       else
 319         {
 320           while (ch != EOF && IS_WHITESPACE (ch))
 321             ch = GET ();
 322           if (ch == '"')
 323             {
 324               (*unget) (ch);
 325               out_string = "\n\t.appfile ";
 326               old_state = 7;
 327               state = -1;
 328               return *out_string++;
 329             }
 330           else
 331             {
 332               while (ch != EOF && ch != '\n')
 333                 ch = GET ();
 334               state = 0;
 335               return ch;
 336             }
 337         }
 338
 339     case 5:
 340       ch = GET ();
 341       if (lex[ch] == LEX_IS_STRINGQUOTE)
 342         {
 343           state = old_state;
 344           return ch;
 345         }
 346 #ifndef NO_STRING_ESCAPES
 347       else if (ch == '\\')
 348         {
 349           state = 6;
 350           return ch;
 351         }
 352 #endif
 353       else if (ch == EOF)
 354         {
 355           as_warn ("End of file in string: inserted '\"'");
 356           state = old_state;
 357           (*unget) ('\n');
 358           return '"';
 359         }
 360       else
 361         {
 362           return ch;
 363         }
 364
 365     case 6:
 366       state = 5;
 367       ch = GET ();
 368       switch (ch)
 369         {
 370           /* Handle strings broken across lines, by turning '\n' into
 371              '\\' and 'n'.  */
 372         case '\n':
 373           (*unget) ('n');
 374           add_newlines++;
 375           return '\\';
 376
 377         case '"':
 378         case '\\':
 379         case 'b':
 380         case 'f':
 381         case 'n':
 382         case 'r':
 383         case 't':
 384 #ifdef BACKSLASH_V
 385         case 'v':
 386 #endif /* BACKSLASH_V */
 387         case 'x':
 388         case 'X':
 389         case '0':
 390         case '1':
 391         case '2':
 392         case '3':
 393         case '4':
 394         case '5':
 395         case '6':
 396         case '7':
 397           break;
 398 #if defined(IGNORE_NONSTANDARD_ESCAPES) | defined(ONLY_STANDARD_ESCAPES)
 399         default:
 400           as_warn ("Unknown escape '\\%c' in string: Ignored", ch);
 401           break;
 402 #else /* ONLY_STANDARD_ESCAPES */
 403         default:
 404           /* Accept \x as x for any x */
 405           break;
 406 #endif /* ONLY_STANDARD_ESCAPES */
 407
 408         case EOF:
 409           as_warn ("End of file in string: '\"' inserted");
 410           return '"';
 411         }
 412       return ch;
 413
 414     case 7:
 415       ch = GET ();
 416       state = 5;
 417       old_state = 8;
 418       return ch;
 419
 420     case 8:
 421       do
 422         ch = GET ();
 423       while (ch != '\n');
 424       state = 0;
 425       return ch;
 426     }
 427
 428   /* OK, we are somewhere in states 0 through 4 or 9 through 11 */
 429
 430   /* flushchar: */
 431   ch = GET ();
 432 recycle:
 433   if (ch == EOF)
 434     {
 435       if (state != 0)
 436         {
 437           as_warn ("End of file not at end of a line: Newline inserted.");
 438           state = 0;
 439           return '\n';
 440         }
 441       return ch;
 442     }
 443
 444   switch (lex[ch])
 445     {
 446     case LEX_IS_WHITESPACE:
 447       do
 448         /* Preserve a single whitespace character at the beginning of
 449            a line.  */
 450         if (state == 0)
 451           {
 452             state = 1;
 453             return ch;
 454           }
 455         else
 456           ch = GET ();
 457       while (ch != EOF && IS_WHITESPACE (ch));
 458       if (ch == EOF)
 459         return ch;
 460
 461       if (IS_COMMENT (ch)
 462           || (state == 0 && IS_LINE_COMMENT (ch))
 463           || ch == '/'
 464           || IS_LINE_SEPARATOR (ch))
 465         {
 466           /* cpp never outputs a leading space before the #, so try to
 467              avoid being confused.  */
 468           not_cpp_line = 1;
 469           goto recycle;
 470         }
 471 #ifdef MRI
 472       (*unget) (ch);            /* Put back */
 473       return ' ';               /* Always return one space at start of line */
 474 #endif
 475
 476       /* If we're in state 2 or 11, we've seen a non-white character
 477          followed by whitespace.  If the next character is ':', this
 478          is whitespace after a label name which we *must* ignore.  */
 479       if ((state == 2 || state == 11) && lex[ch] == LEX_IS_COLON)
 480         {
 481           state = 1;
 482           return ch;
 483         }
 484
 485       switch (state)
 486         {
 487         case 0:
 488           state++;
 489           goto recycle;         /* Punted leading sp */
 490         case 1:
 491           /* We can arrive here if we leave a leading whitespace character
 492              at the beginning of a line.  */
 493           goto recycle;
 494         case 2:
 495           state = 3;
 496           (*unget) (ch);
 497           return ' ';           /* Sp after opco */
 498         case 3:
 499           goto recycle;         /* Sp in operands */
 500         case 9:
 501         case 10:
 502           state = 10;           /* Sp after symbol char */
 503           goto recycle;
 504         case 11:
 505           state = 1;
 506           (*unget) (ch);
 507           return ' ';           /* Sp after label definition.  */
 508         default:
 509           BAD_CASE (state);
 510         }
 511       break;
 512
 513     case LEX_IS_TWOCHAR_COMMENT_1ST:
 514       ch2 = GET ();
 515       if (ch2 != EOF && lex[ch2] == LEX_IS_TWOCHAR_COMMENT_2ND)
 516         {
 517           for (;;)
 518             {
 519               do
 520                 {
 521                   ch2 = GET ();
 522                   if (ch2 != EOF && IS_NEWLINE (ch2))
 523                     add_newlines++;
 524                 }
 525               while (ch2 != EOF &&
 526                      (lex[ch2] != LEX_IS_TWOCHAR_COMMENT_2ND));
 527
 528               while (ch2 != EOF &&
 529                      (lex[ch2] == LEX_IS_TWOCHAR_COMMENT_2ND))
 530                 {
 531                   ch2 = GET ();
 532                 }
 533
 534               if (ch2 == EOF
 535                   || lex[ch2] == LEX_IS_TWOCHAR_COMMENT_1ST)
 536                 break;
 537               (*unget) (ch);
 538             }
 539           if (ch2 == EOF)
 540             as_warn ("End of file in multiline comment");
 541
 542           ch = ' ';
 543           goto recycle;
 544         }
 545       else
 546         {
 547           if (ch2 != EOF)
 548             (*unget) (ch2);
 549           if (state == 9 || state == 10)
 550             state = 3;
 551           return ch;
 552         }
 553       break;
 554
 555     case LEX_IS_STRINGQUOTE:
 556       if (state == 9 || state == 10)
 557         old_state = 3;
 558       else
 559         old_state = state;
 560       state = 5;
 561       return ch;
 562 #ifndef MRI
 563 #ifndef IEEE_STYLE
 564     case LEX_IS_ONECHAR_QUOTE:
 565       ch = GET ();
 566       if (ch == EOF)
 567         {
 568           as_warn ("End-of-file after a one-character quote; \\000 inserted");
 569           ch = 0;
 570         }
 571       if (ch == '\\')
 572         {
 573           ch = GET ();
 574           ch = process_escape (ch);
 575         }
 576       sprintf (out_buf, "%d", (int) (unsigned char) ch);
 577
 578
 579       /* None of these 'x constants for us.  We want 'x'.  */
 580       if ((ch = GET ()) != '\'')
 581         {
 582 #ifdef REQUIRE_CHAR_CLOSE_QUOTE
 583           as_warn ("Missing close quote: (assumed)");
 584 #else
 585           (*unget) (ch);
 586 #endif
 587         }
 588       if (strlen (out_buf) == 1)
 589         {
 590           return out_buf[0];
 591         }
 592       if (state == 9 || state == 10)
 593         old_state = 3;
 594       else
 595         old_state = state;
 596       state = -1;
 597       out_string = out_buf;
 598       return *out_string++;
 599 #endif
 600 #endif
 601     case LEX_IS_COLON:
 602       if (state == 9 || state == 10)
 603         state = 3;
 604       else if (state != 3)
 605         state = 1;
 606       return ch;
 607
 608     case LEX_IS_NEWLINE:
 609       /* Roll out a bunch of newlines from inside comments, etc.  */
 610       if (add_newlines)
 611         {
 612           --add_newlines;
 613           (*unget) (ch);
 614         }
 615       /* fall thru into... */
 616
 617     case LEX_IS_LINE_SEPARATOR:
 618       state = 0;
 619       return ch;
 620
 621     case LEX_IS_LINE_COMMENT_START:
 622       if (state == 0)           /* Only comment at start of line.  */
 623         {
 624           /* FIXME-someday: The two character comment stuff was badly
 625              thought out.  On i386, we want '/' as line comment start
 626              AND we want C style comments.  hence this hack.  The
 627              whole lexical process should be reworked.  xoxorich.  */
 628           if (ch == '/')
 629             {
 630               ch2 = GET ();
 631               if (ch2 == '*')
 632                 {
 633                   state = -2;
 634                   return (do_scrub_next_char (get, unget));
 635                 }
 636               else
 637                 {
 638                   (*unget) (ch2);
 639                 }
 640             }                   /* bad hack */
 641
 642           if (ch != '#')
 643             not_cpp_line = 1;
 644
 645           do
 646             ch = GET ();
 647           while (ch != EOF && IS_WHITESPACE (ch));
 648           if (ch == EOF)
 649             {
 650               as_warn ("EOF in comment:  Newline inserted");
 651               return '\n';
 652             }
 653           if (ch < '0' || ch > '9' || not_cpp_line)
 654             {
 655               /* Non-numerics:  Eat whole comment line */
 656               while (ch != EOF && !IS_NEWLINE (ch))
 657                 ch = GET ();
 658               if (ch == EOF)
 659                 as_warn ("EOF in Comment: Newline inserted");
 660               state = 0;
 661               return '\n';
 662             }
 663           /* Numerics begin comment.  Perhaps CPP `# 123 "filename"' */
 664           (*unget) (ch);
 665           old_state = 4;
 666           state = -1;
 667           out_string = "\t.appline ";
 668           return *out_string++;
 669         }
 670
 671       /* We have a line comment character which is not at the start of
 672          a line.  If this is also a normal comment character, fall
 673          through.  Otherwise treat it as a default character.  */
 674       if (strchr (comment_chars, ch) == NULL)
 675         goto de_fault;
 676       /* Fall through.  */
 677     case LEX_IS_COMMENT_START:
 678       do
 679         ch = GET ();
 680       while (ch != EOF && !IS_NEWLINE (ch));
 681       if (ch == EOF)
 682         as_warn ("EOF in comment:  Newline inserted");
 683       state = 0;
 684       return '\n';
 685
 686     case LEX_IS_SYMBOL_COMPONENT:
 687       if (state == 10)
 688         {
 689           /* This is a symbol character following another symbol
 690              character, with whitespace in between.  We skipped the
 691              whitespace earlier, so output it now.  */
 692           (*unget) (ch);
 693           state = 3;
 694           return ' ';
 695         }
 696       if (state == 3)
 697         state = 9;
 698       /* Fall through.  */
 699     default:
 700     de_fault:
 701       /* Some relatively `normal' character.  */
 702       if (state == 0)
 703         {
 704           state = 11;           /* Now seeing label definition */
 705           return ch;
 706         }
 707       else if (state == 1)
 708         {
 709           state = 2;            /* Ditto */
 710           return ch;
 711         }
 712       else if (state == 9)
 713         {
 714           if (lex[ch] != LEX_IS_SYMBOL_COMPONENT)
 715             state = 3;
 716           return ch;
 717         }
 718       else if (state == 10)
 719         {
 720           state = 3;
 721           return ch;
 722         }
 723       else
 724         {
 725           return ch;            /* Opcode or operands already */
 726         }
 727     }
 728   return -1;
 729
 730 #undef GET
 731 }
 732
 733 #ifdef TEST
 734
 735 const char comment_chars[] = "|";
 736 const char line_comment_chars[] = "#";
 737
 738 main ()
 739 {
 740   int ch;
 741
 742   app_begin ();
 743   while ((ch = do_scrub_next_char (stdin)) != EOF)
 744     putc (ch, stdout);
 745 }
 746
 747 as_warn (str)
 748      char *str;
 749 {
 750   fputs (str, stderr);
 751   putc ('\n', stderr);
 752 }
 753
 754 #endif
 755
 756 /* end of app.c */