gas/app.c

   1 /* This is the Assembler Pre-Processor
   2    Copyright (C) 1987, 1990, 1991, 1992, 1994 Free Software Foundation, Inc.
   3
   4    This file is part of GAS, the GNU Assembler.
   5
   6    GAS is free software; you can redistribute it and/or modify
   7    it under the terms of the GNU General Public License as published by
   8    the Free Software Foundation; either version 2, or (at your option)
   9    any later version.
  10
  11    GAS is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14    GNU General Public License for more details.
  15
  16    You should have received a copy of the GNU General Public License
  17    along with GAS; see the file COPYING.  If not, write to
  18    the Free Software Foundation, 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.  */
  19
  20 /* Modified by Allen Wirfs-Brock, Instantiations Inc 2/90 */
   21 /* App, the assembler pre-processor.  This pre-processor strips out excess
   22    spaces, turns single-quoted characters into a decimal constant, and turns
   23    # <number> <filename> <garbage> into a .appline <number>\n.appfile <filename>
   24    pair.  This needs better error-handling.  */
  25
  26 #include <stdio.h>
  27 #include "as.h"                 /* For BAD_CASE() only */
  28
  29 #if (__STDC__ != 1)
  30 #ifndef const
  31 #define const  /* empty */
  32 #endif
  33 #endif
  34
  35 static char lex[256];
  36 static const char symbol_chars[] =
  37 "$._ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789";
  38
  39 #define LEX_IS_SYMBOL_COMPONENT         1
  40 #define LEX_IS_WHITESPACE               2
  41 #define LEX_IS_LINE_SEPARATOR           3
  42 #define LEX_IS_COMMENT_START            4
  43 #define LEX_IS_LINE_COMMENT_START       5
  44 #define LEX_IS_TWOCHAR_COMMENT_1ST      6
  45 #define LEX_IS_TWOCHAR_COMMENT_2ND      7
  46 #define LEX_IS_STRINGQUOTE              8
  47 #define LEX_IS_COLON                    9
  48 #define LEX_IS_NEWLINE                  10
  49 #define LEX_IS_ONECHAR_QUOTE            11
  50 #define IS_SYMBOL_COMPONENT(c)          (lex[c] == LEX_IS_SYMBOL_COMPONENT)
  51 #define IS_WHITESPACE(c)                (lex[c] == LEX_IS_WHITESPACE)
  52 #define IS_LINE_SEPARATOR(c)            (lex[c] == LEX_IS_LINE_SEPARATOR)
  53 #define IS_COMMENT(c)                   (lex[c] == LEX_IS_COMMENT_START)
  54 #define IS_LINE_COMMENT(c)              (lex[c] == LEX_IS_LINE_COMMENT_START)
  55 #define IS_NEWLINE(c)                   (lex[c] == LEX_IS_NEWLINE)
  56
  57 static int process_escape PARAMS ((int));
  58
  59 /* FIXME-soon: The entire lexer/parser thingy should be
  60    built statically at compile time rather than dynamically
  61    each and every time the assembler is run.  xoxorich. */
  62
  63 void
  64 do_scrub_begin ()
  65 {
  66   const char *p;
  67
  68   lex[' '] = LEX_IS_WHITESPACE;
  69   lex['\t'] = LEX_IS_WHITESPACE;
  70   lex['\n'] = LEX_IS_NEWLINE;
  71   lex[';'] = LEX_IS_LINE_SEPARATOR;
  72   lex[':'] = LEX_IS_COLON;
  73
  74   if (! flag_mri)
  75     {
  76       lex['"'] = LEX_IS_STRINGQUOTE;
  77
  78 #ifndef TC_HPPA
  79       lex['\''] = LEX_IS_ONECHAR_QUOTE;
  80 #endif
  81
  82 #ifdef SINGLE_QUOTE_STRINGS
  83       lex['\''] = LEX_IS_STRINGQUOTE;
  84 #endif
  85     }
  86
  87   /* Note that these override the previous defaults, e.g. if ';' is a
  88      comment char, then it isn't a line separator.  */
  89   for (p = symbol_chars; *p; ++p)
  90     {
  91       lex[(unsigned char) *p] = LEX_IS_SYMBOL_COMPONENT;
  92     }                           /* declare symbol characters */
  93
  94   for (p = comment_chars; *p; p++)
  95     {
  96       lex[(unsigned char) *p] = LEX_IS_COMMENT_START;
  97     }                           /* declare comment chars */
  98
  99   for (p = line_comment_chars; *p; p++)
 100     {
 101       lex[(unsigned char) *p] = LEX_IS_LINE_COMMENT_START;
 102     }                           /* declare line comment chars */
 103
 104   for (p = line_separator_chars; *p; p++)
 105     {
 106       lex[(unsigned char) *p] = LEX_IS_LINE_SEPARATOR;
 107     }                           /* declare line separators */
 108
 109   /* Only allow slash-star comments if slash is not in use */
 110   if (lex['/'] == 0)
 111     {
 112       lex['/'] = LEX_IS_TWOCHAR_COMMENT_1ST;
 113     }
 114   /* FIXME-soon.  This is a bad hack but otherwise, we can't do
 115      c-style comments when '/' is a line comment char. xoxorich. */
 116   if (lex['*'] == 0)
 117     {
 118       lex['*'] = LEX_IS_TWOCHAR_COMMENT_2ND;
 119     }
 120
 121   if (flag_mri)
 122     {
 123       lex['\''] = LEX_IS_STRINGQUOTE;
 124       lex[';'] = LEX_IS_COMMENT_START;
 125       lex['*'] = LEX_IS_LINE_COMMENT_START;
 126       /* The MRI documentation says '!' is LEX_IS_COMMENT_START, but
 127          then it can't be used in an expression.  */
 128       lex['!'] = LEX_IS_LINE_COMMENT_START;
 129     }
 130 }                               /* do_scrub_begin() */
 131
 132 FILE *scrub_file;
 133
 134 int
 135 scrub_from_file ()
 136 {
 137   return getc (scrub_file);
 138 }
 139
 140 void
 141 scrub_to_file (ch)
 142      int ch;
 143 {
 144   ungetc (ch, scrub_file);
 145 }                               /* scrub_to_file() */
 146
 147 char *scrub_string;
 148 char *scrub_last_string;
 149
 150 int
 151 scrub_from_string ()
 152 {
 153   return scrub_string == scrub_last_string ? EOF : *scrub_string++;
 154 }                               /* scrub_from_string() */
 155
 156 void
 157 scrub_to_string (ch)
 158      int ch;
 159 {
 160   *--scrub_string = ch;
 161 }                               /* scrub_to_string() */
 162
 163 /* Saved state of the scrubber */
 164 static int state;
 165 static int old_state;
 166 static char *out_string;
 167 static char out_buf[20];
 168 static int add_newlines = 0;
 169
 170 /* Data structure for saving the state of app across #include's.  Note that
 171    app is called asynchronously to the parsing of the .include's, so our
 172    state at the time .include is interpreted is completely unrelated.
 173    That's why we have to save it all.  */
 174
 175 struct app_save
 176   {
 177     int state;
 178     int old_state;
 179     char *out_string;
 180     char out_buf[sizeof (out_buf)];
 181     int add_newlines;
 182     char *scrub_string;
 183     char *scrub_last_string;
 184     FILE *scrub_file;
 185   };
 186
 187 char *
 188 app_push ()
 189 {
 190   register struct app_save *saved;
 191
 192   saved = (struct app_save *) xmalloc (sizeof (*saved));
 193   saved->state = state;
 194   saved->old_state = old_state;
 195   saved->out_string = out_string;
 196   memcpy (saved->out_buf, out_buf, sizeof (out_buf));
 197   saved->add_newlines = add_newlines;
 198   saved->scrub_string = scrub_string;
 199   saved->scrub_last_string = scrub_last_string;
 200   saved->scrub_file = scrub_file;
 201
 202   /* do_scrub_begin() is not useful, just wastes time. */
 203   return (char *) saved;
 204 }
 205
 206 void
 207 app_pop (arg)
 208      char *arg;
 209 {
 210   register struct app_save *saved = (struct app_save *) arg;
 211
 212   /* There is no do_scrub_end (). */
 213   state = saved->state;
 214   old_state = saved->old_state;
 215   out_string = saved->out_string;
 216   memcpy (out_buf, saved->out_buf, sizeof (out_buf));
 217   add_newlines = saved->add_newlines;
 218   scrub_string = saved->scrub_string;
 219   scrub_last_string = saved->scrub_last_string;
 220   scrub_file = saved->scrub_file;
 221
 222   free (arg);
 223 }                               /* app_pop() */
 224
 225 /* @@ This assumes that \n &c are the same on host and target.  This is not
 226    necessarily true.  */
 227 static int
 228 process_escape (ch)
 229      int ch;
 230 {
 231   switch (ch)
 232     {
 233     case 'b':
 234       return '\b';
 235     case 'f':
 236       return '\f';
 237     case 'n':
 238       return '\n';
 239     case 'r':
 240       return '\r';
 241     case 't':
 242       return '\t';
 243     case '\'':
 244       return '\'';
 245     case '"':
 246       return '\"';
 247     default:
 248       return ch;
 249     }
 250 }
 251 int
 252 do_scrub_next_char (get, unget)
 253      int (*get) ();
 254      void (*unget) ();
 255 {
 256   /*State 0: beginning of normal line
 257           1: After first whitespace on line (flush more white)
 258           2: After first non-white (opcode) on line (keep 1white)
 259           3: after second white on line (into operands) (flush white)
 260           4: after putting out a .line, put out digits
 261           5: parsing a string, then go to old-state
 262           6: putting out \ escape in a "d string.
 263           7: After putting out a .appfile, put out string.
 264           8: After putting out a .appfile string, flush until newline.
 265           9: After seeing symbol char in state 3 (keep 1white after symchar)
 266          10: After seeing whitespace in state 9 (keep white before symchar)
 267          11: After seeing a symbol character in state 0 (eg a label definition)
 268          -1: output string in out_string and go to the state in old_state
 269          -2: flush text until a '*' '/' is seen, then go to state old_state
 270           */
 271
 272   /* I added states 9 and 10 because the MIPS ECOFF assembler uses
 273      constructs like ``.loc 1 20''.  This was turning into ``.loc
 274      120''.  States 9 and 10 ensure that a space is never dropped in
 275      between characters which could appear in a identifier.  Ian
 276      Taylor, ian@cygnus.com.
 277
 278      I added state 11 so that something like "Lfoo add %r25,%r26,%r27" works
 279      correctly on the PA (and any other target where colons are optional).
 280      Jeff Law, law@cs.utah.edu.  */
 281
 282   /* This is purely an optimization hack, and relies on gcc's inlining
 283      capability.  */
 284 #if defined (__GNUC__) && defined (__OPTIMIZE__)
 285 #define GET()   (get == scrub_from_file ? scrub_from_file () : (*get) ())
 286 #else
 287 #define GET()   ((*get) ())
 288 #endif
 289
 290   register int ch, ch2 = 0;
 291   int not_cpp_line = 0;
 292
 293   switch (state)
 294     {
 295     case -1:
 296       ch = *out_string++;
 297       if (*out_string == 0)
 298         {
 299           state = old_state;
 300           old_state = 3;
 301         }
 302       return ch;
 303
 304     case -2:
 305       for (;;)
 306         {
 307           do
 308             {
 309               ch = GET ();
 310             }
 311           while (ch != EOF && ch != '\n' && ch != '*');
 312           if (ch == '\n' || ch == EOF)
 313             return ch;
 314
 315           /* At this point, ch must be a '*' */
 316           while ((ch = GET ()) == '*')
 317             {
 318               ;
 319             }
 320           if (ch == EOF || ch == '/')
 321             break;
 322           (*unget) (ch);
 323         }
 324       state = old_state;
 325       return ' ';
 326
 327     case 4:
 328       ch = GET ();
 329       if (ch == EOF || (ch >= '0' && ch <= '9'))
 330         return ch;
 331       else
 332         {
 333           while (ch != EOF && IS_WHITESPACE (ch))
 334             ch = GET ();
 335           if (ch == '"')
 336             {
 337               (*unget) (ch);
 338               out_string = "\n\t.appfile ";
 339               old_state = 7;
 340               state = -1;
 341               return *out_string++;
 342             }
 343           else
 344             {
 345               while (ch != EOF && ch != '\n')
 346                 ch = GET ();
 347               state = 0;
 348               return ch;
 349             }
 350         }
 351
 352     case 5:
 353       ch = GET ();
 354       if (lex[ch] == LEX_IS_STRINGQUOTE)
 355         {
 356           state = old_state;
 357           return ch;
 358         }
 359 #ifndef NO_STRING_ESCAPES
 360       else if (ch == '\\')
 361         {
 362           state = 6;
 363           return ch;
 364         }
 365 #endif
 366       else if (ch == EOF)
 367         {
 368           as_warn ("End of file in string: inserted '\"'");
 369           state = old_state;
 370           (*unget) ('\n');
 371           return '"';
 372         }
 373       else
 374         {
 375           return ch;
 376         }
 377
 378     case 6:
 379       state = 5;
 380       ch = GET ();
 381       switch (ch)
 382         {
 383           /* Handle strings broken across lines, by turning '\n' into
 384              '\\' and 'n'.  */
 385         case '\n':
 386           (*unget) ('n');
 387           add_newlines++;
 388           return '\\';
 389
 390         case '"':
 391         case '\\':
 392         case 'b':
 393         case 'f':
 394         case 'n':
 395         case 'r':
 396         case 't':
 397         case 'v':
 398         case 'x':
 399         case 'X':
 400         case '0':
 401         case '1':
 402         case '2':
 403         case '3':
 404         case '4':
 405         case '5':
 406         case '6':
 407         case '7':
 408           break;
 409 #if defined(IGNORE_NONSTANDARD_ESCAPES) | defined(ONLY_STANDARD_ESCAPES)
 410         default:
 411           as_warn ("Unknown escape '\\%c' in string: Ignored", ch);
 412           break;
 413 #else /* ONLY_STANDARD_ESCAPES */
 414         default:
 415           /* Accept \x as x for any x */
 416           break;
 417 #endif /* ONLY_STANDARD_ESCAPES */
 418
 419         case EOF:
 420           as_warn ("End of file in string: '\"' inserted");
 421           return '"';
 422         }
 423       return ch;
 424
 425     case 7:
 426       ch = GET ();
 427       state = 5;
 428       old_state = 8;
 429       return ch;
 430
 431     case 8:
 432       do
 433         ch = GET ();
 434       while (ch != '\n');
 435       state = 0;
 436       return ch;
 437     }
 438
 439   /* OK, we are somewhere in states 0 through 4 or 9 through 11 */
 440
 441   /* flushchar: */
 442   ch = GET ();
 443 recycle:
 444   if (ch == EOF)
 445     {
 446       if (state != 0)
 447         {
 448           as_warn ("End of file not at end of a line: Newline inserted.");
 449           state = 0;
 450           return '\n';
 451         }
 452       return ch;
 453     }
 454
 455   switch (lex[ch])
 456     {
 457     case LEX_IS_WHITESPACE:
 458       do
 459         /* Preserve a single whitespace character at the beginning of
 460            a line.  */
 461         if (state == 0)
 462           {
 463             state = 1;
 464             return ch;
 465           }
 466         else
 467           ch = GET ();
 468       while (ch != EOF && IS_WHITESPACE (ch));
 469       if (ch == EOF)
 470         return ch;
 471
 472       if (IS_COMMENT (ch)
 473           || (state == 0 && IS_LINE_COMMENT (ch))
 474           || ch == '/'
 475           || IS_LINE_SEPARATOR (ch))
 476         {
 477           /* cpp never outputs a leading space before the #, so try to
 478              avoid being confused.  */
 479           not_cpp_line = 1;
 480           goto recycle;
 481         }
 482
 483       /* If we're in state 2 or 11, we've seen a non-white character
 484          followed by whitespace.  If the next character is ':', this
 485          is whitespace after a label name which we normally must
 486          ignore.  In MRI mode, though, spaces are not permitted
 487          between the label and the colon.  */
 488       if ((state == 2 || state == 11)
 489           && lex[ch] == LEX_IS_COLON
 490           && ! flag_mri)
 491         {
 492           state = 1;
 493           return ch;
 494         }
 495
 496       switch (state)
 497         {
 498         case 0:
 499           state++;
 500           goto recycle;         /* Punted leading sp */
 501         case 1:
 502           /* We can arrive here if we leave a leading whitespace character
 503              at the beginning of a line.  */
 504           goto recycle;
 505         case 2:
 506           state = 3;
 507           (*unget) (ch);
 508           return ' ';           /* Sp after opco */
 509         case 3:
 510           goto recycle;         /* Sp in operands */
 511         case 9:
 512         case 10:
 513           state = 10;           /* Sp after symbol char */
 514           goto recycle;
 515         case 11:
 516           state = 1;
 517           (*unget) (ch);
 518           return ' ';           /* Sp after label definition.  */
 519         default:
 520           BAD_CASE (state);
 521         }
 522       break;
 523
 524     case LEX_IS_TWOCHAR_COMMENT_1ST:
 525       ch2 = GET ();
 526       if (ch2 != EOF && lex[ch2] == LEX_IS_TWOCHAR_COMMENT_2ND)
 527         {
 528           for (;;)
 529             {
 530               do
 531                 {
 532                   ch2 = GET ();
 533                   if (ch2 != EOF && IS_NEWLINE (ch2))
 534                     add_newlines++;
 535                 }
 536               while (ch2 != EOF &&
 537                      (lex[ch2] != LEX_IS_TWOCHAR_COMMENT_2ND));
 538
 539               while (ch2 != EOF &&
 540                      (lex[ch2] == LEX_IS_TWOCHAR_COMMENT_2ND))
 541                 {
 542                   ch2 = GET ();
 543                 }
 544
 545               if (ch2 == EOF
 546                   || lex[ch2] == LEX_IS_TWOCHAR_COMMENT_1ST)
 547                 break;
 548               (*unget) (ch);
 549             }
 550           if (ch2 == EOF)
 551             as_warn ("End of file in multiline comment");
 552
 553           ch = ' ';
 554           goto recycle;
 555         }
 556       else
 557         {
 558           if (ch2 != EOF)
 559             (*unget) (ch2);
 560           if (state == 9 || state == 10)
 561             state = 3;
 562           return ch;
 563         }
 564       break;
 565
 566     case LEX_IS_STRINGQUOTE:
 567       if (state == 10)
 568         {
 569           /* Preserve the whitespace in foo "bar" */
 570           (*unget) (ch);
 571           state = 3;
 572           return ' ';
 573         }
 574       else if (state == 9)
 575         old_state = 3;
 576       else
 577         old_state = state;
 578       state = 5;
 579       return ch;
 580 #ifndef IEEE_STYLE
 581     case LEX_IS_ONECHAR_QUOTE:
 582       if (state == 10)
 583         {
 584           /* Preserve the whitespace in foo 'b' */
 585           (*unget) (ch);
 586           state = 3;
 587           return ' ';
 588         }
 589       ch = GET ();
 590       if (ch == EOF)
 591         {
 592           as_warn ("End-of-file after a one-character quote; \\000 inserted");
 593           ch = 0;
 594         }
 595       if (ch == '\\')
 596         {
 597           ch = GET ();
 598           ch = process_escape (ch);
 599         }
 600       sprintf (out_buf, "%d", (int) (unsigned char) ch);
 601
 602
 603       /* None of these 'x constants for us.  We want 'x'.  */
 604       if ((ch = GET ()) != '\'')
 605         {
 606 #ifdef REQUIRE_CHAR_CLOSE_QUOTE
 607           as_warn ("Missing close quote: (assumed)");
 608 #else
 609           (*unget) (ch);
 610 #endif
 611         }
 612       if (strlen (out_buf) == 1)
 613         {
 614           return out_buf[0];
 615         }
 616       if (state == 9)
 617         old_state = 3;
 618       else
 619         old_state = state;
 620       state = -1;
 621       out_string = out_buf;
 622       return *out_string++;
 623 #endif
 624     case LEX_IS_COLON:
 625       if (state == 9 || state == 10)
 626         state = 3;
 627       else if (state != 3)
 628         state = 1;
 629       return ch;
 630
 631     case LEX_IS_NEWLINE:
 632       /* Roll out a bunch of newlines from inside comments, etc.  */
 633       if (add_newlines)
 634         {
 635           --add_newlines;
 636           (*unget) (ch);
 637         }
 638       /* fall thru into... */
 639
 640     case LEX_IS_LINE_SEPARATOR:
 641       state = 0;
 642       return ch;
 643
 644     case LEX_IS_LINE_COMMENT_START:
 645       if (state == 0)           /* Only comment at start of line.  */
 646         {
 647           /* FIXME-someday: The two character comment stuff was badly
 648              thought out.  On i386, we want '/' as line comment start
 649              AND we want C style comments.  hence this hack.  The
 650              whole lexical process should be reworked.  xoxorich.  */
 651           if (ch == '/')
 652             {
 653               ch2 = GET ();
 654               if (ch2 == '*')
 655                 {
 656                   state = -2;
 657                   return (do_scrub_next_char (get, unget));
 658                 }
 659               else
 660                 {
 661                   (*unget) (ch2);
 662                 }
 663             }                   /* bad hack */
 664
 665           if (ch != '#')
 666             not_cpp_line = 1;
 667
 668           do
 669             ch = GET ();
 670           while (ch != EOF && IS_WHITESPACE (ch));
 671           if (ch == EOF)
 672             {
 673               as_warn ("EOF in comment:  Newline inserted");
 674               return '\n';
 675             }
 676           if (ch < '0' || ch > '9' || not_cpp_line)
 677             {
 678               /* Non-numerics:  Eat whole comment line */
 679               while (ch != EOF && !IS_NEWLINE (ch))
 680                 ch = GET ();
 681               if (ch == EOF)
 682                 as_warn ("EOF in Comment: Newline inserted");
 683               state = 0;
 684               return '\n';
 685             }
 686           /* Numerics begin comment.  Perhaps CPP `# 123 "filename"' */
 687           (*unget) (ch);
 688           old_state = 4;
 689           state = -1;
 690           out_string = "\t.appline ";
 691           return *out_string++;
 692         }
 693
 694       /* We have a line comment character which is not at the start of
 695          a line.  If this is also a normal comment character, fall
 696          through.  Otherwise treat it as a default character.  */
 697       if ((flag_mri && (ch == '!' || ch == '*'))
 698           || strchr (comment_chars, ch) == NULL)
 699         goto de_fault;
 700       /* Fall through.  */
 701     case LEX_IS_COMMENT_START:
 702       do
 703         ch = GET ();
 704       while (ch != EOF && !IS_NEWLINE (ch));
 705       if (ch == EOF)
 706         as_warn ("EOF in comment:  Newline inserted");
 707       state = 0;
 708       return '\n';
 709
 710     case LEX_IS_SYMBOL_COMPONENT:
 711       if (state == 10)
 712         {
 713           /* This is a symbol character following another symbol
 714              character, with whitespace in between.  We skipped the
 715              whitespace earlier, so output it now.  */
 716           (*unget) (ch);
 717           state = 3;
 718           return ' ';
 719         }
 720       if (state == 3)
 721         state = 9;
 722       /* Fall through.  */
 723     default:
 724     de_fault:
 725       /* Some relatively `normal' character.  */
 726       if (state == 0)
 727         {
 728           state = 11;           /* Now seeing label definition */
 729           return ch;
 730         }
 731       else if (state == 1)
 732         {
 733           state = 2;            /* Ditto */
 734           return ch;
 735         }
 736       else if (state == 9)
 737         {
 738           if (lex[ch] != LEX_IS_SYMBOL_COMPONENT)
 739             state = 3;
 740           return ch;
 741         }
 742       else if (state == 10)
 743         {
 744           state = 3;
 745           return ch;
 746         }
 747       else
 748         {
 749           return ch;            /* Opcode or operands already */
 750         }
 751     }
 752   return -1;
 753
 754 #undef GET
 755 }
 756
 757 #ifdef TEST
 758
 759 const char comment_chars[] = "|";
 760 const char line_comment_chars[] = "#";
 761
 762 main ()
 763 {
 764   int ch;
 765
 766   app_begin ();
 767   while ((ch = do_scrub_next_char (stdin)) != EOF)
 768     putc (ch, stdout);
 769 }
 770
 771 as_warn (str)
 772      char *str;
 773 {
 774   fputs (str, stderr);
 775   putc ('\n', stderr);
 776 }
 777
 778 #endif
 779
 780 /* end of app.c */