Wed Jun 10 11:34:07 1998 Jason Molenda (crash@bugshack.cygnus.com)
[deliverable/binutils-gdb.git] / gdb / gnu-regex.c
1 /* Extended regular expression matching and search library,
2 version 0.12.
3 (Implements POSIX draft P1003.2/D11.2, except for some of the
4 internationalization features.)
5 Copyright (C) 1993, 94, 95, 96, 97, 98 Free Software Foundation, Inc.
6
7 NOTE: The canonical source of this file is maintained with the
8 GNU C Library. Bugs can be reported to bug-glibc@prep.ai.mit.edu.
9
10 This program is free software; you can redistribute it and/or modify it
11 under the terms of the GNU General Public License as published by the
12 Free Software Foundation; either version 2, or (at your option) any
13 later version.
14
15 This program is distributed in the hope that it will be useful,
16 but WITHOUT ANY WARRANTY; without even the implied warranty of
17 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 GNU General Public License for more details.
19
20 You should have received a copy of the GNU General Public License
21 along with this program; if not, write to the Free Software Foundation,
22 Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */
23
24 /* AIX requires this to be the first thing in the file. */
25 #if defined _AIX && !defined REGEX_MALLOC
26 #pragma alloca
27 #endif
28
29 #undef _GNU_SOURCE
30 #define _GNU_SOURCE
31
32 #ifdef HAVE_CONFIG_H
33 # include <config.h>
34 #endif
35
36 #ifndef PARAMS
37 # if defined __GNUC__ || (defined __STDC__ && __STDC__)
38 # define PARAMS(args) args
39 # else
40 # define PARAMS(args) ()
41 # endif /* GCC. */
42 #endif /* Not PARAMS. */
43
44 #if defined STDC_HEADERS && !defined emacs
45 # include <stddef.h>
46 #else
47 /* We need this for `regex.h', and perhaps for the Emacs include files. */
48 # include <sys/types.h>
49 #endif
50
51 /* For platform which support the ISO C amendement 1 functionality we
52 support user defined character classes. */
53 #if defined _LIBC || (defined HAVE_WCTYPE_H && defined HAVE_WCHAR_H)
54 # include <wctype.h>
55 # include <wchar.h>
56
57 /* We have to keep the namespace clean. */
58 # define regfree(preg) __regfree (preg)
59 # define regexec(pr, st, nm, pm, ef) __regexec (pr, st, nm, pm, ef)
60 # define regcomp(preg, pattern, cflags) __regcomp (preg, pattern, cflags)
61 # define regerror(errcode, preg, errbuf, errbuf_size) \
62 __regerror(errcode, preg, errbuf, errbuf_size)
63 # define re_set_registers(bu, re, nu, st, en) \
64 __re_set_registers (bu, re, nu, st, en)
65 # define re_match_2(bufp, string1, size1, string2, size2, pos, regs, stop) \
66 __re_match_2 (bufp, string1, size1, string2, size2, pos, regs, stop)
67 # define re_match(bufp, string, size, pos, regs) \
68 __re_match (bufp, string, size, pos, regs)
69 # define re_search(bufp, string, size, startpos, range, regs) \
70 __re_search (bufp, string, size, startpos, range, regs)
71 # define re_compile_pattern(pattern, length, bufp) \
72 __re_compile_pattern (pattern, length, bufp)
73 # define re_set_syntax(syntax) __re_set_syntax (syntax)
74 # define re_search_2(bufp, st1, s1, st2, s2, startpos, range, regs, stop) \
75 __re_search_2 (bufp, st1, s1, st2, s2, startpos, range, regs, stop)
76 # define re_compile_fastmap(bufp) __re_compile_fastmap (bufp)
77
78 #define btowc __btowc
79 #endif
80
81 /* This is for other GNU distributions with internationalized messages. */
82 #if HAVE_LIBINTL_H || defined _LIBC
83 # include <libintl.h>
84 #else
85 # define gettext(msgid) (msgid)
86 #endif
87
88 #ifndef gettext_noop
89 /* This define is so xgettext can find the internationalizable
90 strings. */
91 # define gettext_noop(String) String
92 #endif
93
94 /* The `emacs' switch turns on certain matching commands
95 that make sense only in Emacs. */
96 #ifdef emacs
97
98 # include "lisp.h"
99 # include "buffer.h"
100 # include "syntax.h"
101
102 #else /* not emacs */
103
104 /* If we are not linking with Emacs proper,
105 we can't use the relocating allocator
106 even if config.h says that we can. */
107 # undef REL_ALLOC
108
109 # if defined STDC_HEADERS || defined _LIBC
110 # include <stdlib.h>
111 # else
112 char *malloc ();
113 char *realloc ();
114 # endif
115
116 /* When used in Emacs's lib-src, we need to get bzero and bcopy somehow.
117 If nothing else has been done, use the method below. */
118 # ifdef INHIBIT_STRING_HEADER
119 # if !(defined HAVE_BZERO && defined HAVE_BCOPY)
120 # if !defined bzero && !defined bcopy
121 # undef INHIBIT_STRING_HEADER
122 # endif
123 # endif
124 # endif
125
126 /* This is the normal way of making sure we have a bcopy and a bzero.
127 This is used in most programs--a few other programs avoid this
128 by defining INHIBIT_STRING_HEADER. */
129 # ifndef INHIBIT_STRING_HEADER
130 # if defined HAVE_STRING_H || defined STDC_HEADERS || defined _LIBC
131 # include <string.h>
132 # ifndef bzero
133 # ifndef _LIBC
134 # define bzero(s, n) (memset (s, '\0', n), (s))
135 # else
136 # define bzero(s, n) __bzero (s, n)
137 # endif
138 # endif
139 # else
140 # include <strings.h>
141 # ifndef memcmp
142 # define memcmp(s1, s2, n) bcmp (s1, s2, n)
143 # endif
144 # ifndef memcpy
145 # define memcpy(d, s, n) (bcopy (s, d, n), (d))
146 # endif
147 # endif
148 # endif
149
150 /* Define the syntax stuff for \<, \>, etc. */
151
152 /* This must be nonzero for the wordchar and notwordchar pattern
153 commands in re_match_2. */
154 # ifndef Sword
155 # define Sword 1
156 # endif
157
158 # ifdef SWITCH_ENUM_BUG
159 # define SWITCH_ENUM_CAST(x) ((int)(x))
160 # else
161 # define SWITCH_ENUM_CAST(x) (x)
162 # endif
163
164 /* How many characters in the character set. */
165 # define CHAR_SET_SIZE 256
166
167 # ifdef SYNTAX_TABLE
168
169 extern char *re_syntax_table;
170
171 # else /* not SYNTAX_TABLE */
172
173 static char re_syntax_table[CHAR_SET_SIZE];
174
175 static void
176 init_syntax_once ()
177 {
178 register int c;
179 static int done = 0;
180
181 if (done)
182 return;
183
184 bzero (re_syntax_table, sizeof re_syntax_table);
185
186 for (c = 'a'; c <= 'z'; c++)
187 re_syntax_table[c] = Sword;
188
189 for (c = 'A'; c <= 'Z'; c++)
190 re_syntax_table[c] = Sword;
191
192 for (c = '0'; c <= '9'; c++)
193 re_syntax_table[c] = Sword;
194
195 re_syntax_table['_'] = Sword;
196
197 done = 1;
198 }
199
200 # endif /* not SYNTAX_TABLE */
201
202 # define SYNTAX(c) re_syntax_table[c]
203
204 #endif /* not emacs */
205 \f
206 /* Get the interface, including the syntax bits. */
207 #include "regex.h"
208
209 /* isalpha etc. are used for the character classes. */
210 #include <ctype.h>
211
212 /* Jim Meyering writes:
213
214 "... Some ctype macros are valid only for character codes that
215 isascii says are ASCII (SGI's IRIX-4.0.5 is one such system --when
216 using /bin/cc or gcc but without giving an ansi option). So, all
217 ctype uses should be through macros like ISPRINT... If
218 STDC_HEADERS is defined, then autoconf has verified that the ctype
219 macros don't need to be guarded with references to isascii. ...
220 Defining isascii to 1 should let any compiler worth its salt
221 eliminate the && through constant folding."
222 Solaris defines some of these symbols so we must undefine them first. */
223
224 #undef ISASCII
225 #if defined STDC_HEADERS || (!defined isascii && !defined HAVE_ISASCII)
226 # define ISASCII(c) 1
227 #else
228 # define ISASCII(c) isascii(c)
229 #endif
230
231 #ifdef isblank
232 # define ISBLANK(c) (ISASCII (c) && isblank (c))
233 #else
234 # define ISBLANK(c) ((c) == ' ' || (c) == '\t')
235 #endif
236 #ifdef isgraph
237 # define ISGRAPH(c) (ISASCII (c) && isgraph (c))
238 #else
239 # define ISGRAPH(c) (ISASCII (c) && isprint (c) && !isspace (c))
240 #endif
241
242 #undef ISPRINT
243 #define ISPRINT(c) (ISASCII (c) && isprint (c))
244 #define ISDIGIT(c) (ISASCII (c) && isdigit (c))
245 #define ISALNUM(c) (ISASCII (c) && isalnum (c))
246 #define ISALPHA(c) (ISASCII (c) && isalpha (c))
247 #define ISCNTRL(c) (ISASCII (c) && iscntrl (c))
248 #define ISLOWER(c) (ISASCII (c) && islower (c))
249 #define ISPUNCT(c) (ISASCII (c) && ispunct (c))
250 #define ISSPACE(c) (ISASCII (c) && isspace (c))
251 #define ISUPPER(c) (ISASCII (c) && isupper (c))
252 #define ISXDIGIT(c) (ISASCII (c) && isxdigit (c))
253
254 #ifndef NULL
255 # define NULL (void *)0
256 #endif
257
258 /* We remove any previous definition of `SIGN_EXTEND_CHAR',
259 since ours (we hope) works properly with all combinations of
260 machines, compilers, `char' and `unsigned char' argument types.
261 (Per Bothner suggested the basic approach.) */
262 #undef SIGN_EXTEND_CHAR
263 #if __STDC__
264 # define SIGN_EXTEND_CHAR(c) ((signed char) (c))
265 #else /* not __STDC__ */
266 /* As in Harbison and Steele. */
267 # define SIGN_EXTEND_CHAR(c) ((((unsigned char) (c)) ^ 128) - 128)
268 #endif
269 \f
270 /* Should we use malloc or alloca? If REGEX_MALLOC is not defined, we
271 use `alloca' instead of `malloc'. This is because using malloc in
272 re_search* or re_match* could cause memory leaks when C-g is used in
273 Emacs; also, malloc is slower and causes storage fragmentation. On
274 the other hand, malloc is more portable, and easier to debug.
275
276 Because we sometimes use alloca, some routines have to be macros,
277 not functions -- `alloca'-allocated space disappears at the end of the
278 function it is called in. */
279
280 #ifdef REGEX_MALLOC
281
282 # define REGEX_ALLOCATE malloc
283 # define REGEX_REALLOCATE(source, osize, nsize) realloc (source, nsize)
284 # define REGEX_FREE free
285
286 #else /* not REGEX_MALLOC */
287
288 /* Emacs already defines alloca, sometimes. */
289 # ifndef alloca
290
291 /* Make alloca work the best possible way. */
292 # ifdef __GNUC__
293 # define alloca __builtin_alloca
294 # else /* not __GNUC__ */
295 # if HAVE_ALLOCA_H
296 # include <alloca.h>
297 # endif /* HAVE_ALLOCA_H */
298 # endif /* not __GNUC__ */
299
300 # endif /* not alloca */
301
302 # define REGEX_ALLOCATE alloca
303
304 /* Assumes a `char *destination' variable. */
305 # define REGEX_REALLOCATE(source, osize, nsize) \
306 (destination = (char *) alloca (nsize), \
307 memcpy (destination, source, osize))
308
309 /* No need to do anything to free, after alloca. */
310 # define REGEX_FREE(arg) ((void)0) /* Do nothing! But inhibit gcc warning. */
311
312 #endif /* not REGEX_MALLOC */
313
314 /* Define how to allocate the failure stack. */
315
316 #if defined REL_ALLOC && defined REGEX_MALLOC
317
318 # define REGEX_ALLOCATE_STACK(size) \
319 r_alloc (&failure_stack_ptr, (size))
320 # define REGEX_REALLOCATE_STACK(source, osize, nsize) \
321 r_re_alloc (&failure_stack_ptr, (nsize))
322 # define REGEX_FREE_STACK(ptr) \
323 r_alloc_free (&failure_stack_ptr)
324
325 #else /* not using relocating allocator */
326
327 # ifdef REGEX_MALLOC
328
329 # define REGEX_ALLOCATE_STACK malloc
330 # define REGEX_REALLOCATE_STACK(source, osize, nsize) realloc (source, nsize)
331 # define REGEX_FREE_STACK free
332
333 # else /* not REGEX_MALLOC */
334
335 # define REGEX_ALLOCATE_STACK alloca
336
337 # define REGEX_REALLOCATE_STACK(source, osize, nsize) \
338 REGEX_REALLOCATE (source, osize, nsize)
339 /* No need to explicitly free anything. */
340 # define REGEX_FREE_STACK(arg)
341
342 # endif /* not REGEX_MALLOC */
343 #endif /* not using relocating allocator */
344
345
346 /* True if `size1' is non-NULL and PTR is pointing anywhere inside
347 `string1' or just past its end. This works if PTR is NULL, which is
348 a good thing. */
349 #define FIRST_STRING_P(ptr) \
350 (size1 && string1 <= (ptr) && (ptr) <= string1 + size1)
351
352 /* (Re)Allocate N items of type T using malloc, or fail. */
353 #define TALLOC(n, t) ((t *) malloc ((n) * sizeof (t)))
354 #define RETALLOC(addr, n, t) ((addr) = (t *) realloc (addr, (n) * sizeof (t)))
355 #define RETALLOC_IF(addr, n, t) \
356 if (addr) RETALLOC((addr), (n), t); else (addr) = TALLOC ((n), t)
357 #define REGEX_TALLOC(n, t) ((t *) REGEX_ALLOCATE ((n) * sizeof (t)))
358
359 #define BYTEWIDTH 8 /* In bits. */
360
361 #define STREQ(s1, s2) ((strcmp (s1, s2) == 0))
362
363 #undef MAX
364 #undef MIN
365 #define MAX(a, b) ((a) > (b) ? (a) : (b))
366 #define MIN(a, b) ((a) < (b) ? (a) : (b))
367
368 typedef char boolean;
369 #define false 0
370 #define true 1
371
372 static int re_match_2_internal PARAMS ((struct re_pattern_buffer *bufp,
373 const char *string1, int size1,
374 const char *string2, int size2,
375 int pos,
376 struct re_registers *regs,
377 int stop));
378 \f
379 /* These are the command codes that appear in compiled regular
380 expressions. Some opcodes are followed by argument bytes. A
381 command code can specify any interpretation whatsoever for its
382 arguments. Zero bytes may appear in the compiled regular expression. */
383
384 typedef enum
385 {
386 no_op = 0,
387
388 /* Succeed right away--no more backtracking. */
389 succeed,
390
391 /* Followed by one byte giving n, then by n literal bytes. */
392 exactn,
393
394 /* Matches any (more or less) character. */
395 anychar,
396
397 /* Matches any one char belonging to specified set. First
398 following byte is number of bitmap bytes. Then come bytes
399 for a bitmap saying which chars are in. Bits in each byte
400 are ordered low-bit-first. A character is in the set if its
401 bit is 1. A character too large to have a bit in the map is
402 automatically not in the set. */
403 charset,
404
405 /* Same parameters as charset, but match any character that is
406 not one of those specified. */
407 charset_not,
408
409 /* Start remembering the text that is matched, for storing in a
410 register. Followed by one byte with the register number, in
411 the range 0 to one less than the pattern buffer's re_nsub
412 field. Then followed by one byte with the number of groups
413 inner to this one. (This last has to be part of the
414 start_memory only because we need it in the on_failure_jump
415 of re_match_2.) */
416 start_memory,
417
418 /* Stop remembering the text that is matched and store it in a
419 memory register. Followed by one byte with the register
420 number, in the range 0 to one less than `re_nsub' in the
421 pattern buffer, and one byte with the number of inner groups,
422 just like `start_memory'. (We need the number of inner
423 groups here because we don't have any easy way of finding the
424 corresponding start_memory when we're at a stop_memory.) */
425 stop_memory,
426
427 /* Match a duplicate of something remembered. Followed by one
428 byte containing the register number. */
429 duplicate,
430
431 /* Fail unless at beginning of line. */
432 begline,
433
434 /* Fail unless at end of line. */
435 endline,
436
437 /* Succeeds if at beginning of buffer (if emacs) or at beginning
438 of string to be matched (if not). */
439 begbuf,
440
441 /* Analogously, for end of buffer/string. */
442 endbuf,
443
444 /* Followed by two byte relative address to which to jump. */
445 jump,
446
447 /* Same as jump, but marks the end of an alternative. */
448 jump_past_alt,
449
450 /* Followed by two-byte relative address of place to resume at
451 in case of failure. */
452 on_failure_jump,
453
454 /* Like on_failure_jump, but pushes a placeholder instead of the
455 current string position when executed. */
456 on_failure_keep_string_jump,
457
458 /* Throw away latest failure point and then jump to following
459 two-byte relative address. */
460 pop_failure_jump,
461
462 /* Change to pop_failure_jump if know won't have to backtrack to
463 match; otherwise change to jump. This is used to jump
464 back to the beginning of a repeat. If what follows this jump
465 clearly won't match what the repeat does, such that we can be
466 sure that there is no use backtracking out of repetitions
467 already matched, then we change it to a pop_failure_jump.
468 Followed by two-byte address. */
469 maybe_pop_jump,
470
471 /* Jump to following two-byte address, and push a dummy failure
472 point. This failure point will be thrown away if an attempt
473 is made to use it for a failure. A `+' construct makes this
474 before the first repeat. Also used as an intermediary kind
475 of jump when compiling an alternative. */
476 dummy_failure_jump,
477
478 /* Push a dummy failure point and continue. Used at the end of
479 alternatives. */
480 push_dummy_failure,
481
482 /* Followed by two-byte relative address and two-byte number n.
483 After matching N times, jump to the address upon failure. */
484 succeed_n,
485
486 /* Followed by two-byte relative address, and two-byte number n.
487 Jump to the address N times, then fail. */
488 jump_n,
489
490 /* Set the following two-byte relative address to the
491 subsequent two-byte number. The address *includes* the two
492 bytes of number. */
493 set_number_at,
494
495 wordchar, /* Matches any word-constituent character. */
496 notwordchar, /* Matches any char that is not a word-constituent. */
497
498 wordbeg, /* Succeeds if at word beginning. */
499 wordend, /* Succeeds if at word end. */
500
501 wordbound, /* Succeeds if at a word boundary. */
502 notwordbound /* Succeeds if not at a word boundary. */
503
504 #ifdef emacs
505 ,before_dot, /* Succeeds if before point. */
506 at_dot, /* Succeeds if at point. */
507 after_dot, /* Succeeds if after point. */
508
509 /* Matches any character whose syntax is specified. Followed by
510 a byte which contains a syntax code, e.g., Sword. */
511 syntaxspec,
512
513 /* Matches any character whose syntax is not that specified. */
514 notsyntaxspec
515 #endif /* emacs */
516 } re_opcode_t;
517 \f
518 /* Common operations on the compiled pattern. */
519
520 /* Store NUMBER in two contiguous bytes starting at DESTINATION. */
521
522 #define STORE_NUMBER(destination, number) \
523 do { \
524 (destination)[0] = (number) & 0377; \
525 (destination)[1] = (number) >> 8; \
526 } while (0)
527
528 /* Same as STORE_NUMBER, except increment DESTINATION to
529 the byte after where the number is stored. Therefore, DESTINATION
530 must be an lvalue. */
531
532 #define STORE_NUMBER_AND_INCR(destination, number) \
533 do { \
534 STORE_NUMBER (destination, number); \
535 (destination) += 2; \
536 } while (0)
537
538 /* Put into DESTINATION a number stored in two contiguous bytes starting
539 at SOURCE. */
540
541 #define EXTRACT_NUMBER(destination, source) \
542 do { \
543 (destination) = *(source) & 0377; \
544 (destination) += SIGN_EXTEND_CHAR (*((source) + 1)) << 8; \
545 } while (0)
546
547 #ifdef DEBUG
548 static void extract_number _RE_ARGS ((int *dest, unsigned char *source));
549 static void
550 extract_number (dest, source)
551 int *dest;
552 unsigned char *source;
553 {
554 int temp = SIGN_EXTEND_CHAR (*(source + 1));
555 *dest = *source & 0377;
556 *dest += temp << 8;
557 }
558
559 # ifndef EXTRACT_MACROS /* To debug the macros. */
560 # undef EXTRACT_NUMBER
561 # define EXTRACT_NUMBER(dest, src) extract_number (&dest, src)
562 # endif /* not EXTRACT_MACROS */
563
564 #endif /* DEBUG */
565
566 /* Same as EXTRACT_NUMBER, except increment SOURCE to after the number.
567 SOURCE must be an lvalue. */
568
569 #define EXTRACT_NUMBER_AND_INCR(destination, source) \
570 do { \
571 EXTRACT_NUMBER (destination, source); \
572 (source) += 2; \
573 } while (0)
574
575 #ifdef DEBUG
576 static void extract_number_and_incr _RE_ARGS ((int *destination,
577 unsigned char **source));
578 static void
579 extract_number_and_incr (destination, source)
580 int *destination;
581 unsigned char **source;
582 {
583 extract_number (destination, *source);
584 *source += 2;
585 }
586
587 # ifndef EXTRACT_MACROS
588 # undef EXTRACT_NUMBER_AND_INCR
589 # define EXTRACT_NUMBER_AND_INCR(dest, src) \
590 extract_number_and_incr (&dest, &src)
591 # endif /* not EXTRACT_MACROS */
592
593 #endif /* DEBUG */
594 \f
595 /* If DEBUG is defined, Regex prints many voluminous messages about what
596 it is doing (if the variable `debug' is nonzero). If linked with the
597 main program in `iregex.c', you can enter patterns and strings
598 interactively. And if linked with the main program in `main.c' and
599 the other test files, you can run the already-written tests. */
600
601 #ifdef DEBUG
602
603 /* We use standard I/O for debugging. */
604 # include <stdio.h>
605
606 /* It is useful to test things that ``must'' be true when debugging. */
607 # include <assert.h>
608
609 static int debug = 0;
610
611 # define DEBUG_STATEMENT(e) e
612 # define DEBUG_PRINT1(x) if (debug) printf (x)
613 # define DEBUG_PRINT2(x1, x2) if (debug) printf (x1, x2)
614 # define DEBUG_PRINT3(x1, x2, x3) if (debug) printf (x1, x2, x3)
615 # define DEBUG_PRINT4(x1, x2, x3, x4) if (debug) printf (x1, x2, x3, x4)
616 # define DEBUG_PRINT_COMPILED_PATTERN(p, s, e) \
617 if (debug) print_partial_compiled_pattern (s, e)
618 # define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2) \
619 if (debug) print_double_string (w, s1, sz1, s2, sz2)
620
621
622 /* Print the fastmap in human-readable form. */
623
624 void
625 print_fastmap (fastmap)
626 char *fastmap;
627 {
628 unsigned was_a_range = 0;
629 unsigned i = 0;
630
631 while (i < (1 << BYTEWIDTH))
632 {
633 if (fastmap[i++])
634 {
635 was_a_range = 0;
636 putchar (i - 1);
637 while (i < (1 << BYTEWIDTH) && fastmap[i])
638 {
639 was_a_range = 1;
640 i++;
641 }
642 if (was_a_range)
643 {
644 printf ("-");
645 putchar (i - 1);
646 }
647 }
648 }
649 putchar ('\n');
650 }
651
652
653 /* Print a compiled pattern string in human-readable form, starting at
654 the START pointer into it and ending just before the pointer END. */
655
656 void
657 print_partial_compiled_pattern (start, end)
658 unsigned char *start;
659 unsigned char *end;
660 {
661 int mcnt, mcnt2;
662 unsigned char *p1;
663 unsigned char *p = start;
664 unsigned char *pend = end;
665
666 if (start == NULL)
667 {
668 printf ("(null)\n");
669 return;
670 }
671
672 /* Loop over pattern commands. */
673 while (p < pend)
674 {
675 printf ("%d:\t", p - start);
676
677 switch ((re_opcode_t) *p++)
678 {
679 case no_op:
680 printf ("/no_op");
681 break;
682
683 case exactn:
684 mcnt = *p++;
685 printf ("/exactn/%d", mcnt);
686 do
687 {
688 putchar ('/');
689 putchar (*p++);
690 }
691 while (--mcnt);
692 break;
693
694 case start_memory:
695 mcnt = *p++;
696 printf ("/start_memory/%d/%d", mcnt, *p++);
697 break;
698
699 case stop_memory:
700 mcnt = *p++;
701 printf ("/stop_memory/%d/%d", mcnt, *p++);
702 break;
703
704 case duplicate:
705 printf ("/duplicate/%d", *p++);
706 break;
707
708 case anychar:
709 printf ("/anychar");
710 break;
711
712 case charset:
713 case charset_not:
714 {
715 register int c, last = -100;
716 register int in_range = 0;
717
718 printf ("/charset [%s",
719 (re_opcode_t) *(p - 1) == charset_not ? "^" : "");
720
721 assert (p + *p < pend);
722
723 for (c = 0; c < 256; c++)
724 if (c / 8 < *p
725 && (p[1 + (c/8)] & (1 << (c % 8))))
726 {
727 /* Are we starting a range? */
728 if (last + 1 == c && ! in_range)
729 {
730 putchar ('-');
731 in_range = 1;
732 }
733 /* Have we broken a range? */
734 else if (last + 1 != c && in_range)
735 {
736 putchar (last);
737 in_range = 0;
738 }
739
740 if (! in_range)
741 putchar (c);
742
743 last = c;
744 }
745
746 if (in_range)
747 putchar (last);
748
749 putchar (']');
750
751 p += 1 + *p;
752 }
753 break;
754
755 case begline:
756 printf ("/begline");
757 break;
758
759 case endline:
760 printf ("/endline");
761 break;
762
763 case on_failure_jump:
764 extract_number_and_incr (&mcnt, &p);
765 printf ("/on_failure_jump to %d", p + mcnt - start);
766 break;
767
768 case on_failure_keep_string_jump:
769 extract_number_and_incr (&mcnt, &p);
770 printf ("/on_failure_keep_string_jump to %d", p + mcnt - start);
771 break;
772
773 case dummy_failure_jump:
774 extract_number_and_incr (&mcnt, &p);
775 printf ("/dummy_failure_jump to %d", p + mcnt - start);
776 break;
777
778 case push_dummy_failure:
779 printf ("/push_dummy_failure");
780 break;
781
782 case maybe_pop_jump:
783 extract_number_and_incr (&mcnt, &p);
784 printf ("/maybe_pop_jump to %d", p + mcnt - start);
785 break;
786
787 case pop_failure_jump:
788 extract_number_and_incr (&mcnt, &p);
789 printf ("/pop_failure_jump to %d", p + mcnt - start);
790 break;
791
792 case jump_past_alt:
793 extract_number_and_incr (&mcnt, &p);
794 printf ("/jump_past_alt to %d", p + mcnt - start);
795 break;
796
797 case jump:
798 extract_number_and_incr (&mcnt, &p);
799 printf ("/jump to %d", p + mcnt - start);
800 break;
801
802 case succeed_n:
803 extract_number_and_incr (&mcnt, &p);
804 p1 = p + mcnt;
805 extract_number_and_incr (&mcnt2, &p);
806 printf ("/succeed_n to %d, %d times", p1 - start, mcnt2);
807 break;
808
809 case jump_n:
810 extract_number_and_incr (&mcnt, &p);
811 p1 = p + mcnt;
812 extract_number_and_incr (&mcnt2, &p);
813 printf ("/jump_n to %d, %d times", p1 - start, mcnt2);
814 break;
815
816 case set_number_at:
817 extract_number_and_incr (&mcnt, &p);
818 p1 = p + mcnt;
819 extract_number_and_incr (&mcnt2, &p);
820 printf ("/set_number_at location %d to %d", p1 - start, mcnt2);
821 break;
822
823 case wordbound:
824 printf ("/wordbound");
825 break;
826
827 case notwordbound:
828 printf ("/notwordbound");
829 break;
830
831 case wordbeg:
832 printf ("/wordbeg");
833 break;
834
835 case wordend:
836 printf ("/wordend");
837
838 # ifdef emacs
839 case before_dot:
840 printf ("/before_dot");
841 break;
842
843 case at_dot:
844 printf ("/at_dot");
845 break;
846
847 case after_dot:
848 printf ("/after_dot");
849 break;
850
851 case syntaxspec:
852 printf ("/syntaxspec");
853 mcnt = *p++;
854 printf ("/%d", mcnt);
855 break;
856
857 case notsyntaxspec:
858 printf ("/notsyntaxspec");
859 mcnt = *p++;
860 printf ("/%d", mcnt);
861 break;
862 # endif /* emacs */
863
864 case wordchar:
865 printf ("/wordchar");
866 break;
867
868 case notwordchar:
869 printf ("/notwordchar");
870 break;
871
872 case begbuf:
873 printf ("/begbuf");
874 break;
875
876 case endbuf:
877 printf ("/endbuf");
878 break;
879
880 default:
881 printf ("?%d", *(p-1));
882 }
883
884 putchar ('\n');
885 }
886
887 printf ("%d:\tend of pattern.\n", p - start);
888 }
889
890
891 void
892 print_compiled_pattern (bufp)
893 struct re_pattern_buffer *bufp;
894 {
895 unsigned char *buffer = bufp->buffer;
896
897 print_partial_compiled_pattern (buffer, buffer + bufp->used);
898 printf ("%ld bytes used/%ld bytes allocated.\n",
899 bufp->used, bufp->allocated);
900
901 if (bufp->fastmap_accurate && bufp->fastmap)
902 {
903 printf ("fastmap: ");
904 print_fastmap (bufp->fastmap);
905 }
906
907 printf ("re_nsub: %d\t", bufp->re_nsub);
908 printf ("regs_alloc: %d\t", bufp->regs_allocated);
909 printf ("can_be_null: %d\t", bufp->can_be_null);
910 printf ("newline_anchor: %d\n", bufp->newline_anchor);
911 printf ("no_sub: %d\t", bufp->no_sub);
912 printf ("not_bol: %d\t", bufp->not_bol);
913 printf ("not_eol: %d\t", bufp->not_eol);
914 printf ("syntax: %lx\n", bufp->syntax);
915 /* Perhaps we should print the translate table? */
916 }
917
918
919 void
920 print_double_string (where, string1, size1, string2, size2)
921 const char *where;
922 const char *string1;
923 const char *string2;
924 int size1;
925 int size2;
926 {
927 int this_char;
928
929 if (where == NULL)
930 printf ("(null)");
931 else
932 {
933 if (FIRST_STRING_P (where))
934 {
935 for (this_char = where - string1; this_char < size1; this_char++)
936 putchar (string1[this_char]);
937
938 where = string2;
939 }
940
941 for (this_char = where - string2; this_char < size2; this_char++)
942 putchar (string2[this_char]);
943 }
944 }
945
946 void
947 printchar (c)
948 int c;
949 {
950 putc (c, stderr);
951 }
952
953 #else /* not DEBUG */
954
955 # undef assert
956 # define assert(e)
957
958 # define DEBUG_STATEMENT(e)
959 # define DEBUG_PRINT1(x)
960 # define DEBUG_PRINT2(x1, x2)
961 # define DEBUG_PRINT3(x1, x2, x3)
962 # define DEBUG_PRINT4(x1, x2, x3, x4)
963 # define DEBUG_PRINT_COMPILED_PATTERN(p, s, e)
964 # define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2)
965
966 #endif /* not DEBUG */
967 \f
968 /* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
969 also be assigned to arbitrarily: each pattern buffer stores its own
970 syntax, so it can be changed between regex compilations. */
971 /* This has no initializer because initialized variables in Emacs
972 become read-only after dumping. */
973 reg_syntax_t re_syntax_options;
974
975
976 /* Specify the precise syntax of regexps for compilation. This provides
977 for compatibility for various utilities which historically have
978 different, incompatible syntaxes.
979
980 The argument SYNTAX is a bit mask comprised of the various bits
981 defined in regex.h. We return the old syntax. */
982
983 reg_syntax_t
984 re_set_syntax (syntax)
985 reg_syntax_t syntax;
986 {
987 reg_syntax_t ret = re_syntax_options;
988
989 re_syntax_options = syntax;
990 #ifdef DEBUG
991 if (syntax & RE_DEBUG)
992 debug = 1;
993 else if (debug) /* was on but now is not */
994 debug = 0;
995 #endif /* DEBUG */
996 return ret;
997 }
998 #ifdef _LIBC
999 weak_alias (__re_set_syntax, re_set_syntax)
1000 #endif
1001 \f
1002 /* This table gives an error message for each of the error codes listed
1003 in regex.h. Obviously the order here has to be same as there.
1004 POSIX doesn't require that we do anything for REG_NOERROR,
1005 but why not be nice? */
1006
1007 static const char *re_error_msgid[] =
1008 {
1009 gettext_noop ("Success"), /* REG_NOERROR */
1010 gettext_noop ("No match"), /* REG_NOMATCH */
1011 gettext_noop ("Invalid regular expression"), /* REG_BADPAT */
1012 gettext_noop ("Invalid collation character"), /* REG_ECOLLATE */
1013 gettext_noop ("Invalid character class name"), /* REG_ECTYPE */
1014 gettext_noop ("Trailing backslash"), /* REG_EESCAPE */
1015 gettext_noop ("Invalid back reference"), /* REG_ESUBREG */
1016 gettext_noop ("Unmatched [ or [^"), /* REG_EBRACK */
1017 gettext_noop ("Unmatched ( or \\("), /* REG_EPAREN */
1018 gettext_noop ("Unmatched \\{"), /* REG_EBRACE */
1019 gettext_noop ("Invalid content of \\{\\}"), /* REG_BADBR */
1020 gettext_noop ("Invalid range end"), /* REG_ERANGE */
1021 gettext_noop ("Memory exhausted"), /* REG_ESPACE */
1022 gettext_noop ("Invalid preceding regular expression"), /* REG_BADRPT */
1023 gettext_noop ("Premature end of regular expression"), /* REG_EEND */
1024 gettext_noop ("Regular expression too big"), /* REG_ESIZE */
1025 gettext_noop ("Unmatched ) or \\)"), /* REG_ERPAREN */
1026 };
1027 \f
1028 /* Avoiding alloca during matching, to placate r_alloc. */
1029
1030 /* Define MATCH_MAY_ALLOCATE unless we need to make sure that the
1031 searching and matching functions should not call alloca. On some
1032 systems, alloca is implemented in terms of malloc, and if we're
1033 using the relocating allocator routines, then malloc could cause a
1034 relocation, which might (if the strings being searched are in the
1035 ralloc heap) shift the data out from underneath the regexp
1036 routines.
1037
1038 Here's another reason to avoid allocation: Emacs
1039 processes input from X in a signal handler; processing X input may
1040 call malloc; if input arrives while a matching routine is calling
1041 malloc, then we're scrod. But Emacs can't just block input while
1042 calling matching routines; then we don't notice interrupts when
1043 they come in. So, Emacs blocks input around all regexp calls
1044 except the matching calls, which it leaves unprotected, in the
1045 faith that they will not malloc. */
1046
1047 /* Normally, this is fine. */
1048 #define MATCH_MAY_ALLOCATE
1049
1050 /* When using GNU C, we are not REALLY using the C alloca, no matter
1051 what config.h may say. So don't take precautions for it. */
1052 #ifdef __GNUC__
1053 # undef C_ALLOCA
1054 #endif
1055
1056 /* The match routines may not allocate if (1) they would do it with malloc
1057 and (2) it's not safe for them to use malloc.
1058 Note that if REL_ALLOC is defined, matching would not use malloc for the
1059 failure stack, but we would still use it for the register vectors;
1060 so REL_ALLOC should not affect this. */
1061 #if (defined C_ALLOCA || defined REGEX_MALLOC) && defined emacs
1062 # undef MATCH_MAY_ALLOCATE
1063 #endif
1064
1065 \f
1066 /* Failure stack declarations and macros; both re_compile_fastmap and
1067 re_match_2 use a failure stack. These have to be macros because of
1068 REGEX_ALLOCATE_STACK. */
1069
1070
1071 /* Number of failure points for which to initially allocate space
1072 when matching. If this number is exceeded, we allocate more
1073 space, so it is not a hard limit. */
1074 #ifndef INIT_FAILURE_ALLOC
1075 # define INIT_FAILURE_ALLOC 5
1076 #endif
1077
1078 /* Roughly the maximum number of failure points on the stack. Would be
1079 exactly that if always used MAX_FAILURE_ITEMS items each time we failed.
1080 This is a variable only so users of regex can assign to it; we never
1081 change it ourselves. */
1082
1083 #ifdef INT_IS_16BIT
1084
1085 # if defined MATCH_MAY_ALLOCATE
1086 /* 4400 was enough to cause a crash on Alpha OSF/1,
1087 whose default stack limit is 2mb. */
1088 long int re_max_failures = 4000;
1089 # else
1090 long int re_max_failures = 2000;
1091 # endif
1092
1093 union fail_stack_elt
1094 {
1095 unsigned char *pointer;
1096 long int integer;
1097 };
1098
1099 typedef union fail_stack_elt fail_stack_elt_t;
1100
1101 typedef struct
1102 {
1103 fail_stack_elt_t *stack;
1104 unsigned long int size;
1105 unsigned long int avail; /* Offset of next open position. */
1106 } fail_stack_type;
1107
1108 #else /* not INT_IS_16BIT */
1109
1110 # if defined MATCH_MAY_ALLOCATE
1111 /* 4400 was enough to cause a crash on Alpha OSF/1,
1112 whose default stack limit is 2mb. */
1113 int re_max_failures = 20000;
1114 # else
1115 int re_max_failures = 2000;
1116 # endif
1117
1118 union fail_stack_elt
1119 {
1120 unsigned char *pointer;
1121 int integer;
1122 };
1123
1124 typedef union fail_stack_elt fail_stack_elt_t;
1125
1126 typedef struct
1127 {
1128 fail_stack_elt_t *stack;
1129 unsigned size;
1130 unsigned avail; /* Offset of next open position. */
1131 } fail_stack_type;
1132
1133 #endif /* INT_IS_16BIT */
1134
1135 #define FAIL_STACK_EMPTY() (fail_stack.avail == 0)
1136 #define FAIL_STACK_PTR_EMPTY() (fail_stack_ptr->avail == 0)
1137 #define FAIL_STACK_FULL() (fail_stack.avail == fail_stack.size)
1138
1139
1140 /* Define macros to initialize and free the failure stack.
1141 Do `return -2' if the alloc fails. */
1142
1143 #ifdef MATCH_MAY_ALLOCATE
1144 # define INIT_FAIL_STACK() \
1145 do { \
1146 fail_stack.stack = (fail_stack_elt_t *) \
1147 REGEX_ALLOCATE_STACK (INIT_FAILURE_ALLOC * sizeof (fail_stack_elt_t)); \
1148 \
1149 if (fail_stack.stack == NULL) \
1150 return -2; \
1151 \
1152 fail_stack.size = INIT_FAILURE_ALLOC; \
1153 fail_stack.avail = 0; \
1154 } while (0)
1155
1156 # define RESET_FAIL_STACK() REGEX_FREE_STACK (fail_stack.stack)
1157 #else
1158 # define INIT_FAIL_STACK() \
1159 do { \
1160 fail_stack.avail = 0; \
1161 } while (0)
1162
1163 # define RESET_FAIL_STACK()
1164 #endif
1165
1166
1167 /* Double the size of FAIL_STACK, up to approximately `re_max_failures' items.
1168
1169 Return 1 if succeeds, and 0 if either ran out of memory
1170 allocating space for it or it was already too large.
1171
1172 REGEX_REALLOCATE_STACK requires `destination' be declared. */
1173
1174 #define DOUBLE_FAIL_STACK(fail_stack) \
1175 ((fail_stack).size > (unsigned) (re_max_failures * MAX_FAILURE_ITEMS) \
1176 ? 0 \
1177 : ((fail_stack).stack = (fail_stack_elt_t *) \
1178 REGEX_REALLOCATE_STACK ((fail_stack).stack, \
1179 (fail_stack).size * sizeof (fail_stack_elt_t), \
1180 ((fail_stack).size << 1) * sizeof (fail_stack_elt_t)), \
1181 \
1182 (fail_stack).stack == NULL \
1183 ? 0 \
1184 : ((fail_stack).size <<= 1, \
1185 1)))
1186
1187
1188 /* Push pointer POINTER on FAIL_STACK.
1189 Return 1 if was able to do so and 0 if ran out of memory allocating
1190 space to do so. */
1191 #define PUSH_PATTERN_OP(POINTER, FAIL_STACK) \
1192 ((FAIL_STACK_FULL () \
1193 && !DOUBLE_FAIL_STACK (FAIL_STACK)) \
1194 ? 0 \
1195 : ((FAIL_STACK).stack[(FAIL_STACK).avail++].pointer = POINTER, \
1196 1))
1197
1198 /* Push a pointer value onto the failure stack.
1199 Assumes the variable `fail_stack'. Probably should only
1200 be called from within `PUSH_FAILURE_POINT'. */
1201 #define PUSH_FAILURE_POINTER(item) \
1202 fail_stack.stack[fail_stack.avail++].pointer = (unsigned char *) (item)
1203
1204 /* This pushes an integer-valued item onto the failure stack.
1205 Assumes the variable `fail_stack'. Probably should only
1206 be called from within `PUSH_FAILURE_POINT'. */
1207 #define PUSH_FAILURE_INT(item) \
1208 fail_stack.stack[fail_stack.avail++].integer = (item)
1209
1210 /* Push a fail_stack_elt_t value onto the failure stack.
1211 Assumes the variable `fail_stack'. Probably should only
1212 be called from within `PUSH_FAILURE_POINT'. */
1213 #define PUSH_FAILURE_ELT(item) \
1214 fail_stack.stack[fail_stack.avail++] = (item)
1215
1216 /* These three POP... operations complement the three PUSH... operations.
1217 All assume that `fail_stack' is nonempty. */
1218 #define POP_FAILURE_POINTER() fail_stack.stack[--fail_stack.avail].pointer
1219 #define POP_FAILURE_INT() fail_stack.stack[--fail_stack.avail].integer
1220 #define POP_FAILURE_ELT() fail_stack.stack[--fail_stack.avail]
1221
1222 /* Used to omit pushing failure point id's when we're not debugging. */
1223 #ifdef DEBUG
1224 # define DEBUG_PUSH PUSH_FAILURE_INT
1225 # define DEBUG_POP(item_addr) *(item_addr) = POP_FAILURE_INT ()
1226 #else
1227 # define DEBUG_PUSH(item)
1228 # define DEBUG_POP(item_addr)
1229 #endif
1230
1231
1232 /* Push the information about the state we will need
1233 if we ever fail back to it.
1234
1235 Requires variables fail_stack, regstart, regend, reg_info, and
1236 num_regs_pushed be declared. DOUBLE_FAIL_STACK requires `destination'
1237 be declared.
1238
1239 Does `return FAILURE_CODE' if runs out of memory. */
1240
1241 #define PUSH_FAILURE_POINT(pattern_place, string_place, failure_code) \
1242 do { \
1243 char *destination; \
1244 /* Must be int, so when we don't save any registers, the arithmetic \
1245 of 0 + -1 isn't done as unsigned. */ \
1246 /* Can't be int, since there is not a shred of a guarantee that int \
1247 is wide enough to hold a value of something to which pointer can \
1248 be assigned */ \
1249 active_reg_t this_reg; \
1250 \
1251 DEBUG_STATEMENT (failure_id++); \
1252 DEBUG_STATEMENT (nfailure_points_pushed++); \
1253 DEBUG_PRINT2 ("\nPUSH_FAILURE_POINT #%u:\n", failure_id); \
1254 DEBUG_PRINT2 (" Before push, next avail: %d\n", (fail_stack).avail);\
1255 DEBUG_PRINT2 (" size: %d\n", (fail_stack).size);\
1256 \
1257 DEBUG_PRINT2 (" slots needed: %ld\n", NUM_FAILURE_ITEMS); \
1258 DEBUG_PRINT2 (" available: %d\n", REMAINING_AVAIL_SLOTS); \
1259 \
1260 /* Ensure we have enough space allocated for what we will push. */ \
1261 while (REMAINING_AVAIL_SLOTS < NUM_FAILURE_ITEMS) \
1262 { \
1263 if (!DOUBLE_FAIL_STACK (fail_stack)) \
1264 return failure_code; \
1265 \
1266 DEBUG_PRINT2 ("\n Doubled stack; size now: %d\n", \
1267 (fail_stack).size); \
1268 DEBUG_PRINT2 (" slots available: %d\n", REMAINING_AVAIL_SLOTS);\
1269 } \
1270 \
1271 /* Push the info, starting with the registers. */ \
1272 DEBUG_PRINT1 ("\n"); \
1273 \
1274 if (1) \
1275 for (this_reg = lowest_active_reg; this_reg <= highest_active_reg; \
1276 this_reg++) \
1277 { \
1278 DEBUG_PRINT2 (" Pushing reg: %lu\n", this_reg); \
1279 DEBUG_STATEMENT (num_regs_pushed++); \
1280 \
1281 DEBUG_PRINT2 (" start: %p\n", regstart[this_reg]); \
1282 PUSH_FAILURE_POINTER (regstart[this_reg]); \
1283 \
1284 DEBUG_PRINT2 (" end: %p\n", regend[this_reg]); \
1285 PUSH_FAILURE_POINTER (regend[this_reg]); \
1286 \
1287 DEBUG_PRINT2 (" info: %p\n ", \
1288 reg_info[this_reg].word.pointer); \
1289 DEBUG_PRINT2 (" match_null=%d", \
1290 REG_MATCH_NULL_STRING_P (reg_info[this_reg])); \
1291 DEBUG_PRINT2 (" active=%d", IS_ACTIVE (reg_info[this_reg])); \
1292 DEBUG_PRINT2 (" matched_something=%d", \
1293 MATCHED_SOMETHING (reg_info[this_reg])); \
1294 DEBUG_PRINT2 (" ever_matched=%d", \
1295 EVER_MATCHED_SOMETHING (reg_info[this_reg])); \
1296 DEBUG_PRINT1 ("\n"); \
1297 PUSH_FAILURE_ELT (reg_info[this_reg].word); \
1298 } \
1299 \
1300 DEBUG_PRINT2 (" Pushing low active reg: %ld\n", lowest_active_reg);\
1301 PUSH_FAILURE_INT (lowest_active_reg); \
1302 \
1303 DEBUG_PRINT2 (" Pushing high active reg: %ld\n", highest_active_reg);\
1304 PUSH_FAILURE_INT (highest_active_reg); \
1305 \
1306 DEBUG_PRINT2 (" Pushing pattern %p:\n", pattern_place); \
1307 DEBUG_PRINT_COMPILED_PATTERN (bufp, pattern_place, pend); \
1308 PUSH_FAILURE_POINTER (pattern_place); \
1309 \
1310 DEBUG_PRINT2 (" Pushing string %p: `", string_place); \
1311 DEBUG_PRINT_DOUBLE_STRING (string_place, string1, size1, string2, \
1312 size2); \
1313 DEBUG_PRINT1 ("'\n"); \
1314 PUSH_FAILURE_POINTER (string_place); \
1315 \
1316 DEBUG_PRINT2 (" Pushing failure id: %u\n", failure_id); \
1317 DEBUG_PUSH (failure_id); \
1318 } while (0)
1319
1320 /* This is the number of items that are pushed and popped on the stack
1321 for each register. */
1322 #define NUM_REG_ITEMS 3
1323
1324 /* Individual items aside from the registers. */
1325 #ifdef DEBUG
1326 # define NUM_NONREG_ITEMS 5 /* Includes failure point id. */
1327 #else
1328 # define NUM_NONREG_ITEMS 4
1329 #endif
1330
1331 /* We push at most this many items on the stack. */
1332 /* We used to use (num_regs - 1), which is the number of registers
1333 this regexp will save; but that was changed to 5
1334 to avoid stack overflow for a regexp with lots of parens. */
1335 #define MAX_FAILURE_ITEMS (5 * NUM_REG_ITEMS + NUM_NONREG_ITEMS)
1336
1337 /* We actually push this many items. */
1338 #define NUM_FAILURE_ITEMS \
1339 (((0 \
1340 ? 0 : highest_active_reg - lowest_active_reg + 1) \
1341 * NUM_REG_ITEMS) \
1342 + NUM_NONREG_ITEMS)
1343
1344 /* How many items can still be added to the stack without overflowing it. */
1345 #define REMAINING_AVAIL_SLOTS ((fail_stack).size - (fail_stack).avail)
1346
1347
1348 /* Pops what PUSH_FAIL_STACK pushes.
1349
1350 We restore into the parameters, all of which should be lvalues:
1351 STR -- the saved data position.
1352 PAT -- the saved pattern position.
1353 LOW_REG, HIGH_REG -- the highest and lowest active registers.
1354 REGSTART, REGEND -- arrays of string positions.
1355 REG_INFO -- array of information about each subexpression.
1356
1357 Also assumes the variables `fail_stack' and (if debugging), `bufp',
1358 `pend', `string1', `size1', `string2', and `size2'. */
1359
1360 #define POP_FAILURE_POINT(str, pat, low_reg, high_reg, regstart, regend, reg_info)\
1361 { \
1362 DEBUG_STATEMENT (unsigned failure_id;) \
1363 active_reg_t this_reg; \
1364 const unsigned char *string_temp; \
1365 \
1366 assert (!FAIL_STACK_EMPTY ()); \
1367 \
1368 /* Remove failure points and point to how many regs pushed. */ \
1369 DEBUG_PRINT1 ("POP_FAILURE_POINT:\n"); \
1370 DEBUG_PRINT2 (" Before pop, next avail: %d\n", fail_stack.avail); \
1371 DEBUG_PRINT2 (" size: %d\n", fail_stack.size); \
1372 \
1373 assert (fail_stack.avail >= NUM_NONREG_ITEMS); \
1374 \
1375 DEBUG_POP (&failure_id); \
1376 DEBUG_PRINT2 (" Popping failure id: %u\n", failure_id); \
1377 \
1378 /* If the saved string location is NULL, it came from an \
1379 on_failure_keep_string_jump opcode, and we want to throw away the \
1380 saved NULL, thus retaining our current position in the string. */ \
1381 string_temp = POP_FAILURE_POINTER (); \
1382 if (string_temp != NULL) \
1383 str = (const char *) string_temp; \
1384 \
1385 DEBUG_PRINT2 (" Popping string %p: `", str); \
1386 DEBUG_PRINT_DOUBLE_STRING (str, string1, size1, string2, size2); \
1387 DEBUG_PRINT1 ("'\n"); \
1388 \
1389 pat = (unsigned char *) POP_FAILURE_POINTER (); \
1390 DEBUG_PRINT2 (" Popping pattern %p:\n", pat); \
1391 DEBUG_PRINT_COMPILED_PATTERN (bufp, pat, pend); \
1392 \
1393 /* Restore register info. */ \
1394 high_reg = (active_reg_t) POP_FAILURE_INT (); \
1395 DEBUG_PRINT2 (" Popping high active reg: %ld\n", high_reg); \
1396 \
1397 low_reg = (active_reg_t) POP_FAILURE_INT (); \
1398 DEBUG_PRINT2 (" Popping low active reg: %ld\n", low_reg); \
1399 \
1400 if (1) \
1401 for (this_reg = high_reg; this_reg >= low_reg; this_reg--) \
1402 { \
1403 DEBUG_PRINT2 (" Popping reg: %ld\n", this_reg); \
1404 \
1405 reg_info[this_reg].word = POP_FAILURE_ELT (); \
1406 DEBUG_PRINT2 (" info: %p\n", \
1407 reg_info[this_reg].word.pointer); \
1408 \
1409 regend[this_reg] = (const char *) POP_FAILURE_POINTER (); \
1410 DEBUG_PRINT2 (" end: %p\n", regend[this_reg]); \
1411 \
1412 regstart[this_reg] = (const char *) POP_FAILURE_POINTER (); \
1413 DEBUG_PRINT2 (" start: %p\n", regstart[this_reg]); \
1414 } \
1415 else \
1416 { \
1417 for (this_reg = highest_active_reg; this_reg > high_reg; this_reg--) \
1418 { \
1419 reg_info[this_reg].word.integer = 0; \
1420 regend[this_reg] = 0; \
1421 regstart[this_reg] = 0; \
1422 } \
1423 highest_active_reg = high_reg; \
1424 } \
1425 \
1426 set_regs_matched_done = 0; \
1427 DEBUG_STATEMENT (nfailure_points_popped++); \
1428 } /* POP_FAILURE_POINT */
1429
1430
1431 \f
1432 /* Structure for per-register (a.k.a. per-group) information.
1433 Other register information, such as the
1434 starting and ending positions (which are addresses), and the list of
1435 inner groups (which is a bits list) are maintained in separate
1436 variables.
1437
1438 We are making a (strictly speaking) nonportable assumption here: that
1439 the compiler will pack our bit fields into something that fits into
1440 the type of `word', i.e., is something that fits into one item on the
1441 failure stack. */
1442
1443
1444 /* Declarations and macros for re_match_2. */
1445
1446 typedef union
1447 {
1448 fail_stack_elt_t word;
1449 struct
1450 {
1451 /* This field is one if this group can match the empty string,
1452 zero if not. If not yet determined, `MATCH_NULL_UNSET_VALUE'. */
1453 #define MATCH_NULL_UNSET_VALUE 3
1454 unsigned match_null_string_p : 2;
1455 unsigned is_active : 1;
1456 unsigned matched_something : 1;
1457 unsigned ever_matched_something : 1;
1458 } bits;
1459 } register_info_type;
1460
1461 #define REG_MATCH_NULL_STRING_P(R) ((R).bits.match_null_string_p)
1462 #define IS_ACTIVE(R) ((R).bits.is_active)
1463 #define MATCHED_SOMETHING(R) ((R).bits.matched_something)
1464 #define EVER_MATCHED_SOMETHING(R) ((R).bits.ever_matched_something)
1465
1466
1467 /* Call this when have matched a real character; it sets `matched' flags
1468 for the subexpressions which we are currently inside. Also records
1469 that those subexprs have matched. */
1470 #define SET_REGS_MATCHED() \
1471 do \
1472 { \
1473 if (!set_regs_matched_done) \
1474 { \
1475 active_reg_t r; \
1476 set_regs_matched_done = 1; \
1477 for (r = lowest_active_reg; r <= highest_active_reg; r++) \
1478 { \
1479 MATCHED_SOMETHING (reg_info[r]) \
1480 = EVER_MATCHED_SOMETHING (reg_info[r]) \
1481 = 1; \
1482 } \
1483 } \
1484 } \
1485 while (0)
1486
1487 /* Registers are set to a sentinel when they haven't yet matched. */
1488 static char reg_unset_dummy;
1489 #define REG_UNSET_VALUE (&reg_unset_dummy)
1490 #define REG_UNSET(e) ((e) == REG_UNSET_VALUE)
1491 \f
1492 /* Subroutine declarations and macros for regex_compile. */
1493
1494 static reg_errcode_t regex_compile _RE_ARGS ((const char *pattern, size_t size,
1495 reg_syntax_t syntax,
1496 struct re_pattern_buffer *bufp));
1497 static void store_op1 _RE_ARGS ((re_opcode_t op, unsigned char *loc, int arg));
1498 static void store_op2 _RE_ARGS ((re_opcode_t op, unsigned char *loc,
1499 int arg1, int arg2));
1500 static void insert_op1 _RE_ARGS ((re_opcode_t op, unsigned char *loc,
1501 int arg, unsigned char *end));
1502 static void insert_op2 _RE_ARGS ((re_opcode_t op, unsigned char *loc,
1503 int arg1, int arg2, unsigned char *end));
1504 static boolean at_begline_loc_p _RE_ARGS ((const char *pattern, const char *p,
1505 reg_syntax_t syntax));
1506 static boolean at_endline_loc_p _RE_ARGS ((const char *p, const char *pend,
1507 reg_syntax_t syntax));
1508 static reg_errcode_t compile_range _RE_ARGS ((const char **p_ptr,
1509 const char *pend,
1510 char *translate,
1511 reg_syntax_t syntax,
1512 unsigned char *b));
1513
1514 /* Fetch the next character in the uncompiled pattern---translating it
1515 if necessary. Also cast from a signed character in the constant
1516 string passed to us by the user to an unsigned char that we can use
1517 as an array index (in, e.g., `translate'). */
1518 #ifndef PATFETCH
1519 # define PATFETCH(c) \
1520 do {if (p == pend) return REG_EEND; \
1521 c = (unsigned char) *p++; \
1522 if (translate) c = (unsigned char) translate[c]; \
1523 } while (0)
1524 #endif
1525
1526 /* Fetch the next character in the uncompiled pattern, with no
1527 translation. */
1528 #define PATFETCH_RAW(c) \
1529 do {if (p == pend) return REG_EEND; \
1530 c = (unsigned char) *p++; \
1531 } while (0)
1532
1533 /* Go backwards one character in the pattern. */
1534 #define PATUNFETCH p--
1535
1536
1537 /* If `translate' is non-null, return translate[D], else just D. We
1538 cast the subscript to translate because some data is declared as
1539 `char *', to avoid warnings when a string constant is passed. But
1540 when we use a character as a subscript we must make it unsigned. */
1541 #ifndef TRANSLATE
1542 # define TRANSLATE(d) \
1543 (translate ? (char) translate[(unsigned char) (d)] : (d))
1544 #endif
1545
1546
1547 /* Macros for outputting the compiled pattern into `buffer'. */
1548
1549 /* If the buffer isn't allocated when it comes in, use this. */
1550 #define INIT_BUF_SIZE 32
1551
1552 /* Make sure we have at least N more bytes of space in buffer. */
1553 #define GET_BUFFER_SPACE(n) \
1554 while ((unsigned long) (b - bufp->buffer + (n)) > bufp->allocated) \
1555 EXTEND_BUFFER ()
1556
1557 /* Make sure we have one more byte of buffer space and then add C to it. */
1558 #define BUF_PUSH(c) \
1559 do { \
1560 GET_BUFFER_SPACE (1); \
1561 *b++ = (unsigned char) (c); \
1562 } while (0)
1563
1564
1565 /* Ensure we have two more bytes of buffer space and then append C1 and C2. */
1566 #define BUF_PUSH_2(c1, c2) \
1567 do { \
1568 GET_BUFFER_SPACE (2); \
1569 *b++ = (unsigned char) (c1); \
1570 *b++ = (unsigned char) (c2); \
1571 } while (0)
1572
1573
1574 /* As with BUF_PUSH_2, except for three bytes. */
1575 #define BUF_PUSH_3(c1, c2, c3) \
1576 do { \
1577 GET_BUFFER_SPACE (3); \
1578 *b++ = (unsigned char) (c1); \
1579 *b++ = (unsigned char) (c2); \
1580 *b++ = (unsigned char) (c3); \
1581 } while (0)
1582
1583
1584 /* Store a jump with opcode OP at LOC to location TO. We store a
1585 relative address offset by the three bytes the jump itself occupies. */
1586 #define STORE_JUMP(op, loc, to) \
1587 store_op1 (op, loc, (int) ((to) - (loc) - 3))
1588
1589 /* Likewise, for a two-argument jump. */
1590 #define STORE_JUMP2(op, loc, to, arg) \
1591 store_op2 (op, loc, (int) ((to) - (loc) - 3), arg)
1592
1593 /* Like `STORE_JUMP', but for inserting. Assume `b' is the buffer end. */
1594 #define INSERT_JUMP(op, loc, to) \
1595 insert_op1 (op, loc, (int) ((to) - (loc) - 3), b)
1596
1597 /* Like `STORE_JUMP2', but for inserting. Assume `b' is the buffer end. */
1598 #define INSERT_JUMP2(op, loc, to, arg) \
1599 insert_op2 (op, loc, (int) ((to) - (loc) - 3), arg, b)
1600
1601
1602 /* This is not an arbitrary limit: the arguments which represent offsets
1603 into the pattern are two bytes long. So if 2^16 bytes turns out to
1604 be too small, many things would have to change. */
1605 /* Any other compiler which, like MSC, has allocation limit below 2^16
1606 bytes will have to use approach similar to what was done below for
1607 MSC and drop MAX_BUF_SIZE a bit. Otherwise you may end up
1608 reallocating to 0 bytes. Such thing is not going to work too well.
1609 You have been warned!! */
1610 #if defined _MSC_VER && !defined WIN32
1611 /* Microsoft C 16-bit versions limit malloc to approx 65512 bytes.
1612 The REALLOC define eliminates a flurry of conversion warnings,
1613 but is not required. */
1614 # define MAX_BUF_SIZE 65500L
1615 # define REALLOC(p,s) realloc ((p), (size_t) (s))
1616 #else
1617 # define MAX_BUF_SIZE (1L << 16)
1618 # define REALLOC(p,s) realloc ((p), (s))
1619 #endif
1620
1621 /* Extend the buffer by twice its current size via realloc and
1622 reset the pointers that pointed into the old block to point to the
1623 correct places in the new one. If extending the buffer results in it
1624 being larger than MAX_BUF_SIZE, then flag memory exhausted. */
1625 #define EXTEND_BUFFER() \
1626 do { \
1627 unsigned char *old_buffer = bufp->buffer; \
1628 if (bufp->allocated == MAX_BUF_SIZE) \
1629 return REG_ESIZE; \
1630 bufp->allocated <<= 1; \
1631 if (bufp->allocated > MAX_BUF_SIZE) \
1632 bufp->allocated = MAX_BUF_SIZE; \
1633 bufp->buffer = (unsigned char *) REALLOC (bufp->buffer, bufp->allocated);\
1634 if (bufp->buffer == NULL) \
1635 return REG_ESPACE; \
1636 /* If the buffer moved, move all the pointers into it. */ \
1637 if (old_buffer != bufp->buffer) \
1638 { \
1639 b = (b - old_buffer) + bufp->buffer; \
1640 begalt = (begalt - old_buffer) + bufp->buffer; \
1641 if (fixup_alt_jump) \
1642 fixup_alt_jump = (fixup_alt_jump - old_buffer) + bufp->buffer;\
1643 if (laststart) \
1644 laststart = (laststart - old_buffer) + bufp->buffer; \
1645 if (pending_exact) \
1646 pending_exact = (pending_exact - old_buffer) + bufp->buffer; \
1647 } \
1648 } while (0)
1649
1650
1651 /* Since we have one byte reserved for the register number argument to
1652 {start,stop}_memory, the maximum number of groups we can report
1653 things about is what fits in that byte. */
1654 #define MAX_REGNUM 255
1655
1656 /* But patterns can have more than `MAX_REGNUM' registers. We just
1657 ignore the excess. */
1658 typedef unsigned regnum_t;
1659
1660
1661 /* Macros for the compile stack. */
1662
1663 /* Since offsets can go either forwards or backwards, this type needs to
1664 be able to hold values from -(MAX_BUF_SIZE - 1) to MAX_BUF_SIZE - 1. */
1665 /* int may be not enough when sizeof(int) == 2. */
1666 typedef long pattern_offset_t;
1667
1668 typedef struct
1669 {
1670 pattern_offset_t begalt_offset;
1671 pattern_offset_t fixup_alt_jump;
1672 pattern_offset_t inner_group_offset;
1673 pattern_offset_t laststart_offset;
1674 regnum_t regnum;
1675 } compile_stack_elt_t;
1676
1677
1678 typedef struct
1679 {
1680 compile_stack_elt_t *stack;
1681 unsigned size;
1682 unsigned avail; /* Offset of next open position. */
1683 } compile_stack_type;
1684
1685
1686 #define INIT_COMPILE_STACK_SIZE 32
1687
1688 #define COMPILE_STACK_EMPTY (compile_stack.avail == 0)
1689 #define COMPILE_STACK_FULL (compile_stack.avail == compile_stack.size)
1690
1691 /* The next available element. */
1692 #define COMPILE_STACK_TOP (compile_stack.stack[compile_stack.avail])
1693
1694
1695 /* Set the bit for character C in a list. */
1696 #define SET_LIST_BIT(c) \
1697 (b[((unsigned char) (c)) / BYTEWIDTH] \
1698 |= 1 << (((unsigned char) c) % BYTEWIDTH))
1699
1700
1701 /* Get the next unsigned number in the uncompiled pattern. */
1702 #define GET_UNSIGNED_NUMBER(num) \
1703 { if (p != pend) \
1704 { \
1705 PATFETCH (c); \
1706 while (ISDIGIT (c)) \
1707 { \
1708 if (num < 0) \
1709 num = 0; \
1710 num = num * 10 + c - '0'; \
1711 if (p == pend) \
1712 break; \
1713 PATFETCH (c); \
1714 } \
1715 } \
1716 }
1717
1718 #if defined _LIBC || (defined HAVE_WCTYPE_H && defined HAVE_WCHAR_H)
1719 /* The GNU C library provides support for user-defined character classes
1720 and the functions from ISO C amendement 1. */
1721 # ifdef CHARCLASS_NAME_MAX
1722 # define CHAR_CLASS_MAX_LENGTH CHARCLASS_NAME_MAX
1723 # else
1724 /* This shouldn't happen but some implementation might still have this
1725 problem. Use a reasonable default value. */
1726 # define CHAR_CLASS_MAX_LENGTH 256
1727 # endif
1728
1729 # ifdef _LIBC
1730 # define IS_CHAR_CLASS(string) __wctype (string)
1731 # else
1732 # define IS_CHAR_CLASS(string) wctype (string)
1733 # endif
1734 #else
1735 # define CHAR_CLASS_MAX_LENGTH 6 /* Namely, `xdigit'. */
1736
1737 # define IS_CHAR_CLASS(string) \
1738 (STREQ (string, "alpha") || STREQ (string, "upper") \
1739 || STREQ (string, "lower") || STREQ (string, "digit") \
1740 || STREQ (string, "alnum") || STREQ (string, "xdigit") \
1741 || STREQ (string, "space") || STREQ (string, "print") \
1742 || STREQ (string, "punct") || STREQ (string, "graph") \
1743 || STREQ (string, "cntrl") || STREQ (string, "blank"))
1744 #endif
1745 \f
1746 #ifndef MATCH_MAY_ALLOCATE
1747
1748 /* If we cannot allocate large objects within re_match_2_internal,
1749 we make the fail stack and register vectors global.
1750 The fail stack, we grow to the maximum size when a regexp
1751 is compiled.
1752 The register vectors, we adjust in size each time we
1753 compile a regexp, according to the number of registers it needs. */
1754
1755 static fail_stack_type fail_stack;
1756
1757 /* Size with which the following vectors are currently allocated.
1758 That is so we can make them bigger as needed,
1759 but never make them smaller. */
1760 static int regs_allocated_size;
1761
1762 static const char ** regstart, ** regend;
1763 static const char ** old_regstart, ** old_regend;
1764 static const char **best_regstart, **best_regend;
1765 static register_info_type *reg_info;
1766 static const char **reg_dummy;
1767 static register_info_type *reg_info_dummy;
1768
1769 /* Make the register vectors big enough for NUM_REGS registers,
1770 but don't make them smaller. */
1771
1772 static
1773 regex_grow_registers (num_regs)
1774 int num_regs;
1775 {
1776 if (num_regs > regs_allocated_size)
1777 {
1778 RETALLOC_IF (regstart, num_regs, const char *);
1779 RETALLOC_IF (regend, num_regs, const char *);
1780 RETALLOC_IF (old_regstart, num_regs, const char *);
1781 RETALLOC_IF (old_regend, num_regs, const char *);
1782 RETALLOC_IF (best_regstart, num_regs, const char *);
1783 RETALLOC_IF (best_regend, num_regs, const char *);
1784 RETALLOC_IF (reg_info, num_regs, register_info_type);
1785 RETALLOC_IF (reg_dummy, num_regs, const char *);
1786 RETALLOC_IF (reg_info_dummy, num_regs, register_info_type);
1787
1788 regs_allocated_size = num_regs;
1789 }
1790 }
1791
1792 #endif /* not MATCH_MAY_ALLOCATE */
1793 \f
1794 static boolean group_in_compile_stack _RE_ARGS ((compile_stack_type
1795 compile_stack,
1796 regnum_t regnum));
1797
1798 /* `regex_compile' compiles PATTERN (of length SIZE) according to SYNTAX.
1799 Returns one of error codes defined in `regex.h', or zero for success.
1800
1801 Assumes the `allocated' (and perhaps `buffer') and `translate'
1802 fields are set in BUFP on entry.
1803
1804 If it succeeds, results are put in BUFP (if it returns an error, the
1805 contents of BUFP are undefined):
1806 `buffer' is the compiled pattern;
1807 `syntax' is set to SYNTAX;
1808 `used' is set to the length of the compiled pattern;
1809 `fastmap_accurate' is zero;
1810 `re_nsub' is the number of subexpressions in PATTERN;
1811 `not_bol' and `not_eol' are zero;
1812
1813 The `fastmap' and `newline_anchor' fields are neither
1814 examined nor set. */
1815
1816 /* Return, freeing storage we allocated. */
1817 #define FREE_STACK_RETURN(value) \
1818 return (free (compile_stack.stack), value)
1819
1820 static reg_errcode_t
1821 regex_compile (pattern, size, syntax, bufp)
1822 const char *pattern;
1823 size_t size;
1824 reg_syntax_t syntax;
1825 struct re_pattern_buffer *bufp;
1826 {
1827 /* We fetch characters from PATTERN here. Even though PATTERN is
1828 `char *' (i.e., signed), we declare these variables as unsigned, so
1829 they can be reliably used as array indices. */
1830 register unsigned char c, c1;
1831
1832 /* A random temporary spot in PATTERN. */
1833 const char *p1;
1834
1835 /* Points to the end of the buffer, where we should append. */
1836 register unsigned char *b;
1837
1838 /* Keeps track of unclosed groups. */
1839 compile_stack_type compile_stack;
1840
1841 /* Points to the current (ending) position in the pattern. */
1842 const char *p = pattern;
1843 const char *pend = pattern + size;
1844
1845 /* How to translate the characters in the pattern. */
1846 RE_TRANSLATE_TYPE translate = bufp->translate;
1847
1848 /* Address of the count-byte of the most recently inserted `exactn'
1849 command. This makes it possible to tell if a new exact-match
1850 character can be added to that command or if the character requires
1851 a new `exactn' command. */
1852 unsigned char *pending_exact = 0;
1853
1854 /* Address of start of the most recently finished expression.
1855 This tells, e.g., postfix * where to find the start of its
1856 operand. Reset at the beginning of groups and alternatives. */
1857 unsigned char *laststart = 0;
1858
1859 /* Address of beginning of regexp, or inside of last group. */
1860 unsigned char *begalt;
1861
1862 /* Place in the uncompiled pattern (i.e., the {) to
1863 which to go back if the interval is invalid. */
1864 const char *beg_interval;
1865
1866 /* Address of the place where a forward jump should go to the end of
1867 the containing expression. Each alternative of an `or' -- except the
1868 last -- ends with a forward jump of this sort. */
1869 unsigned char *fixup_alt_jump = 0;
1870
1871 /* Counts open-groups as they are encountered. Remembered for the
1872 matching close-group on the compile stack, so the same register
1873 number is put in the stop_memory as the start_memory. */
1874 regnum_t regnum = 0;
1875
1876 #ifdef DEBUG
1877 DEBUG_PRINT1 ("\nCompiling pattern: ");
1878 if (debug)
1879 {
1880 unsigned debug_count;
1881
1882 for (debug_count = 0; debug_count < size; debug_count++)
1883 putchar (pattern[debug_count]);
1884 putchar ('\n');
1885 }
1886 #endif /* DEBUG */
1887
1888 /* Initialize the compile stack. */
1889 compile_stack.stack = TALLOC (INIT_COMPILE_STACK_SIZE, compile_stack_elt_t);
1890 if (compile_stack.stack == NULL)
1891 return REG_ESPACE;
1892
1893 compile_stack.size = INIT_COMPILE_STACK_SIZE;
1894 compile_stack.avail = 0;
1895
1896 /* Initialize the pattern buffer. */
1897 bufp->syntax = syntax;
1898 bufp->fastmap_accurate = 0;
1899 bufp->not_bol = bufp->not_eol = 0;
1900
1901 /* Set `used' to zero, so that if we return an error, the pattern
1902 printer (for debugging) will think there's no pattern. We reset it
1903 at the end. */
1904 bufp->used = 0;
1905
1906 /* Always count groups, whether or not bufp->no_sub is set. */
1907 bufp->re_nsub = 0;
1908
1909 #if !defined emacs && !defined SYNTAX_TABLE
1910 /* Initialize the syntax table. */
1911 init_syntax_once ();
1912 #endif
1913
1914 if (bufp->allocated == 0)
1915 {
1916 if (bufp->buffer)
1917 { /* If zero allocated, but buffer is non-null, try to realloc
1918 enough space. This loses if buffer's address is bogus, but
1919 that is the user's responsibility. */
1920 RETALLOC (bufp->buffer, INIT_BUF_SIZE, unsigned char);
1921 }
1922 else
1923 { /* Caller did not allocate a buffer. Do it for them. */
1924 bufp->buffer = TALLOC (INIT_BUF_SIZE, unsigned char);
1925 }
1926 if (!bufp->buffer) FREE_STACK_RETURN (REG_ESPACE);
1927
1928 bufp->allocated = INIT_BUF_SIZE;
1929 }
1930
1931 begalt = b = bufp->buffer;
1932
1933 /* Loop through the uncompiled pattern until we're at the end. */
1934 while (p != pend)
1935 {
1936 PATFETCH (c);
1937
1938 switch (c)
1939 {
1940 case '^':
1941 {
1942 if ( /* If at start of pattern, it's an operator. */
1943 p == pattern + 1
1944 /* If context independent, it's an operator. */
1945 || syntax & RE_CONTEXT_INDEP_ANCHORS
1946 /* Otherwise, depends on what's come before. */
1947 || at_begline_loc_p (pattern, p, syntax))
1948 BUF_PUSH (begline);
1949 else
1950 goto normal_char;
1951 }
1952 break;
1953
1954
1955 case '$':
1956 {
1957 if ( /* If at end of pattern, it's an operator. */
1958 p == pend
1959 /* If context independent, it's an operator. */
1960 || syntax & RE_CONTEXT_INDEP_ANCHORS
1961 /* Otherwise, depends on what's next. */
1962 || at_endline_loc_p (p, pend, syntax))
1963 BUF_PUSH (endline);
1964 else
1965 goto normal_char;
1966 }
1967 break;
1968
1969
1970 case '+':
1971 case '?':
1972 if ((syntax & RE_BK_PLUS_QM)
1973 || (syntax & RE_LIMITED_OPS))
1974 goto normal_char;
1975 handle_plus:
1976 case '*':
1977 /* If there is no previous pattern... */
1978 if (!laststart)
1979 {
1980 if (syntax & RE_CONTEXT_INVALID_OPS)
1981 FREE_STACK_RETURN (REG_BADRPT);
1982 else if (!(syntax & RE_CONTEXT_INDEP_OPS))
1983 goto normal_char;
1984 }
1985
1986 {
1987 /* Are we optimizing this jump? */
1988 boolean keep_string_p = false;
1989
1990 /* 1 means zero (many) matches is allowed. */
1991 char zero_times_ok = 0, many_times_ok = 0;
1992
1993 /* If there is a sequence of repetition chars, collapse it
1994 down to just one (the right one). We can't combine
1995 interval operators with these because of, e.g., `a{2}*',
1996 which should only match an even number of `a's. */
1997
1998 for (;;)
1999 {
2000 zero_times_ok |= c != '+';
2001 many_times_ok |= c != '?';
2002
2003 if (p == pend)
2004 break;
2005
2006 PATFETCH (c);
2007
2008 if (c == '*'
2009 || (!(syntax & RE_BK_PLUS_QM) && (c == '+' || c == '?')))
2010 ;
2011
2012 else if (syntax & RE_BK_PLUS_QM && c == '\\')
2013 {
2014 if (p == pend) FREE_STACK_RETURN (REG_EESCAPE);
2015
2016 PATFETCH (c1);
2017 if (!(c1 == '+' || c1 == '?'))
2018 {
2019 PATUNFETCH;
2020 PATUNFETCH;
2021 break;
2022 }
2023
2024 c = c1;
2025 }
2026 else
2027 {
2028 PATUNFETCH;
2029 break;
2030 }
2031
2032 /* If we get here, we found another repeat character. */
2033 }
2034
2035 /* Star, etc. applied to an empty pattern is equivalent
2036 to an empty pattern. */
2037 if (!laststart)
2038 break;
2039
2040 /* Now we know whether or not zero matches is allowed
2041 and also whether or not two or more matches is allowed. */
2042 if (many_times_ok)
2043 { /* More than one repetition is allowed, so put in at the
2044 end a backward relative jump from `b' to before the next
2045 jump we're going to put in below (which jumps from
2046 laststart to after this jump).
2047
2048 But if we are at the `*' in the exact sequence `.*\n',
2049 insert an unconditional jump backwards to the .,
2050 instead of the beginning of the loop. This way we only
2051 push a failure point once, instead of every time
2052 through the loop. */
2053 assert (p - 1 > pattern);
2054
2055 /* Allocate the space for the jump. */
2056 GET_BUFFER_SPACE (3);
2057
2058 /* We know we are not at the first character of the pattern,
2059 because laststart was nonzero. And we've already
2060 incremented `p', by the way, to be the character after
2061 the `*'. Do we have to do something analogous here
2062 for null bytes, because of RE_DOT_NOT_NULL? */
2063 if (TRANSLATE (*(p - 2)) == TRANSLATE ('.')
2064 && zero_times_ok
2065 && p < pend && TRANSLATE (*p) == TRANSLATE ('\n')
2066 && !(syntax & RE_DOT_NEWLINE))
2067 { /* We have .*\n. */
2068 STORE_JUMP (jump, b, laststart);
2069 keep_string_p = true;
2070 }
2071 else
2072 /* Anything else. */
2073 STORE_JUMP (maybe_pop_jump, b, laststart - 3);
2074
2075 /* We've added more stuff to the buffer. */
2076 b += 3;
2077 }
2078
2079 /* On failure, jump from laststart to b + 3, which will be the
2080 end of the buffer after this jump is inserted. */
2081 GET_BUFFER_SPACE (3);
2082 INSERT_JUMP (keep_string_p ? on_failure_keep_string_jump
2083 : on_failure_jump,
2084 laststart, b + 3);
2085 pending_exact = 0;
2086 b += 3;
2087
2088 if (!zero_times_ok)
2089 {
2090 /* At least one repetition is required, so insert a
2091 `dummy_failure_jump' before the initial
2092 `on_failure_jump' instruction of the loop. This
2093 effects a skip over that instruction the first time
2094 we hit that loop. */
2095 GET_BUFFER_SPACE (3);
2096 INSERT_JUMP (dummy_failure_jump, laststart, laststart + 6);
2097 b += 3;
2098 }
2099 }
2100 break;
2101
2102
2103 case '.':
2104 laststart = b;
2105 BUF_PUSH (anychar);
2106 break;
2107
2108
2109 case '[':
2110 {
2111 boolean had_char_class = false;
2112
2113 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
2114
2115 /* Ensure that we have enough space to push a charset: the
2116 opcode, the length count, and the bitset; 34 bytes in all. */
2117 GET_BUFFER_SPACE (34);
2118
2119 laststart = b;
2120
2121 /* We test `*p == '^' twice, instead of using an if
2122 statement, so we only need one BUF_PUSH. */
2123 BUF_PUSH (*p == '^' ? charset_not : charset);
2124 if (*p == '^')
2125 p++;
2126
2127 /* Remember the first position in the bracket expression. */
2128 p1 = p;
2129
2130 /* Push the number of bytes in the bitmap. */
2131 BUF_PUSH ((1 << BYTEWIDTH) / BYTEWIDTH);
2132
2133 /* Clear the whole map. */
2134 bzero (b, (1 << BYTEWIDTH) / BYTEWIDTH);
2135
2136 /* charset_not matches newline according to a syntax bit. */
2137 if ((re_opcode_t) b[-2] == charset_not
2138 && (syntax & RE_HAT_LISTS_NOT_NEWLINE))
2139 SET_LIST_BIT ('\n');
2140
2141 /* Read in characters and ranges, setting map bits. */
2142 for (;;)
2143 {
2144 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
2145
2146 PATFETCH (c);
2147
2148 /* \ might escape characters inside [...] and [^...]. */
2149 if ((syntax & RE_BACKSLASH_ESCAPE_IN_LISTS) && c == '\\')
2150 {
2151 if (p == pend) FREE_STACK_RETURN (REG_EESCAPE);
2152
2153 PATFETCH (c1);
2154 SET_LIST_BIT (c1);
2155 continue;
2156 }
2157
2158 /* Could be the end of the bracket expression. If it's
2159 not (i.e., when the bracket expression is `[]' so
2160 far), the ']' character bit gets set way below. */
2161 if (c == ']' && p != p1 + 1)
2162 break;
2163
2164 /* Look ahead to see if it's a range when the last thing
2165 was a character class. */
2166 if (had_char_class && c == '-' && *p != ']')
2167 FREE_STACK_RETURN (REG_ERANGE);
2168
2169 /* Look ahead to see if it's a range when the last thing
2170 was a character: if this is a hyphen not at the
2171 beginning or the end of a list, then it's the range
2172 operator. */
2173 if (c == '-'
2174 && !(p - 2 >= pattern && p[-2] == '[')
2175 && !(p - 3 >= pattern && p[-3] == '[' && p[-2] == '^')
2176 && *p != ']')
2177 {
2178 reg_errcode_t ret
2179 = compile_range (&p, pend, translate, syntax, b);
2180 if (ret != REG_NOERROR) FREE_STACK_RETURN (ret);
2181 }
2182
2183 else if (p[0] == '-' && p[1] != ']')
2184 { /* This handles ranges made up of characters only. */
2185 reg_errcode_t ret;
2186
2187 /* Move past the `-'. */
2188 PATFETCH (c1);
2189
2190 ret = compile_range (&p, pend, translate, syntax, b);
2191 if (ret != REG_NOERROR) FREE_STACK_RETURN (ret);
2192 }
2193
2194 /* See if we're at the beginning of a possible character
2195 class. */
2196
2197 else if (syntax & RE_CHAR_CLASSES && c == '[' && *p == ':')
2198 { /* Leave room for the null. */
2199 char str[CHAR_CLASS_MAX_LENGTH + 1];
2200
2201 PATFETCH (c);
2202 c1 = 0;
2203
2204 /* If pattern is `[[:'. */
2205 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
2206
2207 for (;;)
2208 {
2209 PATFETCH (c);
2210 if ((c == ':' && *p == ']') || p == pend
2211 || c1 == CHAR_CLASS_MAX_LENGTH)
2212 break;
2213 str[c1++] = c;
2214 }
2215 str[c1] = '\0';
2216
2217 /* If isn't a word bracketed by `[:' and `:]':
2218 undo the ending character, the letters, and leave
2219 the leading `:' and `[' (but set bits for them). */
2220 if (c == ':' && *p == ']')
2221 {
2222 #if defined _LIBC || (defined HAVE_WCTYPE_H && defined HAVE_WCHAR_H)
2223 boolean is_lower = STREQ (str, "lower");
2224 boolean is_upper = STREQ (str, "upper");
2225 wctype_t wt;
2226 int ch;
2227
2228 wt = IS_CHAR_CLASS (str);
2229 if (wt == 0)
2230 FREE_STACK_RETURN (REG_ECTYPE);
2231
2232 /* Throw away the ] at the end of the character
2233 class. */
2234 PATFETCH (c);
2235
2236 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
2237
2238 for (ch = 0; ch < 1 << BYTEWIDTH; ++ch)
2239 {
2240 # ifdef _LIBC
2241 if (__iswctype (__btowc (ch), wt))
2242 SET_LIST_BIT (ch);
2243 #else
2244 if (iswctype (btowc (ch), wt))
2245 SET_LIST_BIT (ch);
2246 #endif
2247
2248 if (translate && (is_upper || is_lower)
2249 && (ISUPPER (ch) || ISLOWER (ch)))
2250 SET_LIST_BIT (ch);
2251 }
2252
2253 had_char_class = true;
2254 #else
2255 int ch;
2256 boolean is_alnum = STREQ (str, "alnum");
2257 boolean is_alpha = STREQ (str, "alpha");
2258 boolean is_blank = STREQ (str, "blank");
2259 boolean is_cntrl = STREQ (str, "cntrl");
2260 boolean is_digit = STREQ (str, "digit");
2261 boolean is_graph = STREQ (str, "graph");
2262 boolean is_lower = STREQ (str, "lower");
2263 boolean is_print = STREQ (str, "print");
2264 boolean is_punct = STREQ (str, "punct");
2265 boolean is_space = STREQ (str, "space");
2266 boolean is_upper = STREQ (str, "upper");
2267 boolean is_xdigit = STREQ (str, "xdigit");
2268
2269 if (!IS_CHAR_CLASS (str))
2270 FREE_STACK_RETURN (REG_ECTYPE);
2271
2272 /* Throw away the ] at the end of the character
2273 class. */
2274 PATFETCH (c);
2275
2276 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
2277
2278 for (ch = 0; ch < 1 << BYTEWIDTH; ch++)
2279 {
2280 /* This was split into 3 if's to
2281 avoid an arbitrary limit in some compiler. */
2282 if ( (is_alnum && ISALNUM (ch))
2283 || (is_alpha && ISALPHA (ch))
2284 || (is_blank && ISBLANK (ch))
2285 || (is_cntrl && ISCNTRL (ch)))
2286 SET_LIST_BIT (ch);
2287 if ( (is_digit && ISDIGIT (ch))
2288 || (is_graph && ISGRAPH (ch))
2289 || (is_lower && ISLOWER (ch))
2290 || (is_print && ISPRINT (ch)))
2291 SET_LIST_BIT (ch);
2292 if ( (is_punct && ISPUNCT (ch))
2293 || (is_space && ISSPACE (ch))
2294 || (is_upper && ISUPPER (ch))
2295 || (is_xdigit && ISXDIGIT (ch)))
2296 SET_LIST_BIT (ch);
2297 if ( translate && (is_upper || is_lower)
2298 && (ISUPPER (ch) || ISLOWER (ch)))
2299 SET_LIST_BIT (ch);
2300 }
2301 had_char_class = true;
2302 #endif /* libc || wctype.h */
2303 }
2304 else
2305 {
2306 c1++;
2307 while (c1--)
2308 PATUNFETCH;
2309 SET_LIST_BIT ('[');
2310 SET_LIST_BIT (':');
2311 had_char_class = false;
2312 }
2313 }
2314 else
2315 {
2316 had_char_class = false;
2317 SET_LIST_BIT (c);
2318 }
2319 }
2320
2321 /* Discard any (non)matching list bytes that are all 0 at the
2322 end of the map. Decrease the map-length byte too. */
2323 while ((int) b[-1] > 0 && b[b[-1] - 1] == 0)
2324 b[-1]--;
2325 b += b[-1];
2326 }
2327 break;
2328
2329
2330 case '(':
2331 if (syntax & RE_NO_BK_PARENS)
2332 goto handle_open;
2333 else
2334 goto normal_char;
2335
2336
2337 case ')':
2338 if (syntax & RE_NO_BK_PARENS)
2339 goto handle_close;
2340 else
2341 goto normal_char;
2342
2343
2344 case '\n':
2345 if (syntax & RE_NEWLINE_ALT)
2346 goto handle_alt;
2347 else
2348 goto normal_char;
2349
2350
2351 case '|':
2352 if (syntax & RE_NO_BK_VBAR)
2353 goto handle_alt;
2354 else
2355 goto normal_char;
2356
2357
2358 case '{':
2359 if (syntax & RE_INTERVALS && syntax & RE_NO_BK_BRACES)
2360 goto handle_interval;
2361 else
2362 goto normal_char;
2363
2364
2365 case '\\':
2366 if (p == pend) FREE_STACK_RETURN (REG_EESCAPE);
2367
2368 /* Do not translate the character after the \, so that we can
2369 distinguish, e.g., \B from \b, even if we normally would
2370 translate, e.g., B to b. */
2371 PATFETCH_RAW (c);
2372
2373 switch (c)
2374 {
2375 case '(':
2376 if (syntax & RE_NO_BK_PARENS)
2377 goto normal_backslash;
2378
2379 handle_open:
2380 bufp->re_nsub++;
2381 regnum++;
2382
2383 if (COMPILE_STACK_FULL)
2384 {
2385 RETALLOC (compile_stack.stack, compile_stack.size << 1,
2386 compile_stack_elt_t);
2387 if (compile_stack.stack == NULL) return REG_ESPACE;
2388
2389 compile_stack.size <<= 1;
2390 }
2391
2392 /* These are the values to restore when we hit end of this
2393 group. They are all relative offsets, so that if the
2394 whole pattern moves because of realloc, they will still
2395 be valid. */
2396 COMPILE_STACK_TOP.begalt_offset = begalt - bufp->buffer;
2397 COMPILE_STACK_TOP.fixup_alt_jump
2398 = fixup_alt_jump ? fixup_alt_jump - bufp->buffer + 1 : 0;
2399 COMPILE_STACK_TOP.laststart_offset = b - bufp->buffer;
2400 COMPILE_STACK_TOP.regnum = regnum;
2401
2402 /* We will eventually replace the 0 with the number of
2403 groups inner to this one. But do not push a
2404 start_memory for groups beyond the last one we can
2405 represent in the compiled pattern. */
2406 if (regnum <= MAX_REGNUM)
2407 {
2408 COMPILE_STACK_TOP.inner_group_offset = b - bufp->buffer + 2;
2409 BUF_PUSH_3 (start_memory, regnum, 0);
2410 }
2411
2412 compile_stack.avail++;
2413
2414 fixup_alt_jump = 0;
2415 laststart = 0;
2416 begalt = b;
2417 /* If we've reached MAX_REGNUM groups, then this open
2418 won't actually generate any code, so we'll have to
2419 clear pending_exact explicitly. */
2420 pending_exact = 0;
2421 break;
2422
2423
2424 case ')':
2425 if (syntax & RE_NO_BK_PARENS) goto normal_backslash;
2426
2427 if (COMPILE_STACK_EMPTY)
2428 {
2429 if (syntax & RE_UNMATCHED_RIGHT_PAREN_ORD)
2430 goto normal_backslash;
2431 else
2432 FREE_STACK_RETURN (REG_ERPAREN);
2433 }
2434
2435 handle_close:
2436 if (fixup_alt_jump)
2437 { /* Push a dummy failure point at the end of the
2438 alternative for a possible future
2439 `pop_failure_jump' to pop. See comments at
2440 `push_dummy_failure' in `re_match_2'. */
2441 BUF_PUSH (push_dummy_failure);
2442
2443 /* We allocated space for this jump when we assigned
2444 to `fixup_alt_jump', in the `handle_alt' case below. */
2445 STORE_JUMP (jump_past_alt, fixup_alt_jump, b - 1);
2446 }
2447
2448 /* See similar code for backslashed left paren above. */
2449 if (COMPILE_STACK_EMPTY)
2450 {
2451 if (syntax & RE_UNMATCHED_RIGHT_PAREN_ORD)
2452 goto normal_char;
2453 else
2454 FREE_STACK_RETURN (REG_ERPAREN);
2455 }
2456
2457 /* Since we just checked for an empty stack above, this
2458 ``can't happen''. */
2459 assert (compile_stack.avail != 0);
2460 {
2461 /* We don't just want to restore into `regnum', because
2462 later groups should continue to be numbered higher,
2463 as in `(ab)c(de)' -- the second group is #2. */
2464 regnum_t this_group_regnum;
2465
2466 compile_stack.avail--;
2467 begalt = bufp->buffer + COMPILE_STACK_TOP.begalt_offset;
2468 fixup_alt_jump
2469 = COMPILE_STACK_TOP.fixup_alt_jump
2470 ? bufp->buffer + COMPILE_STACK_TOP.fixup_alt_jump - 1
2471 : 0;
2472 laststart = bufp->buffer + COMPILE_STACK_TOP.laststart_offset;
2473 this_group_regnum = COMPILE_STACK_TOP.regnum;
2474 /* If we've reached MAX_REGNUM groups, then this open
2475 won't actually generate any code, so we'll have to
2476 clear pending_exact explicitly. */
2477 pending_exact = 0;
2478
2479 /* We're at the end of the group, so now we know how many
2480 groups were inside this one. */
2481 if (this_group_regnum <= MAX_REGNUM)
2482 {
2483 unsigned char *inner_group_loc
2484 = bufp->buffer + COMPILE_STACK_TOP.inner_group_offset;
2485
2486 *inner_group_loc = regnum - this_group_regnum;
2487 BUF_PUSH_3 (stop_memory, this_group_regnum,
2488 regnum - this_group_regnum);
2489 }
2490 }
2491 break;
2492
2493
2494 case '|': /* `\|'. */
2495 if (syntax & RE_LIMITED_OPS || syntax & RE_NO_BK_VBAR)
2496 goto normal_backslash;
2497 handle_alt:
2498 if (syntax & RE_LIMITED_OPS)
2499 goto normal_char;
2500
2501 /* Insert before the previous alternative a jump which
2502 jumps to this alternative if the former fails. */
2503 GET_BUFFER_SPACE (3);
2504 INSERT_JUMP (on_failure_jump, begalt, b + 6);
2505 pending_exact = 0;
2506 b += 3;
2507
2508 /* The alternative before this one has a jump after it
2509 which gets executed if it gets matched. Adjust that
2510 jump so it will jump to this alternative's analogous
2511 jump (put in below, which in turn will jump to the next
2512 (if any) alternative's such jump, etc.). The last such
2513 jump jumps to the correct final destination. A picture:
2514 _____ _____
2515 | | | |
2516 | v | v
2517 a | b | c
2518
2519 If we are at `b', then fixup_alt_jump right now points to a
2520 three-byte space after `a'. We'll put in the jump, set
2521 fixup_alt_jump to right after `b', and leave behind three
2522 bytes which we'll fill in when we get to after `c'. */
2523
2524 if (fixup_alt_jump)
2525 STORE_JUMP (jump_past_alt, fixup_alt_jump, b);
2526
2527 /* Mark and leave space for a jump after this alternative,
2528 to be filled in later either by next alternative or
2529 when know we're at the end of a series of alternatives. */
2530 fixup_alt_jump = b;
2531 GET_BUFFER_SPACE (3);
2532 b += 3;
2533
2534 laststart = 0;
2535 begalt = b;
2536 break;
2537
2538
2539 case '{':
2540 /* If \{ is a literal. */
2541 if (!(syntax & RE_INTERVALS)
2542 /* If we're at `\{' and it's not the open-interval
2543 operator. */
2544 || ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
2545 || (p - 2 == pattern && p == pend))
2546 goto normal_backslash;
2547
2548 handle_interval:
2549 {
2550 /* If got here, then the syntax allows intervals. */
2551
2552 /* At least (most) this many matches must be made. */
2553 int lower_bound = -1, upper_bound = -1;
2554
2555 beg_interval = p - 1;
2556
2557 if (p == pend)
2558 {
2559 if (syntax & RE_NO_BK_BRACES)
2560 goto unfetch_interval;
2561 else
2562 FREE_STACK_RETURN (REG_EBRACE);
2563 }
2564
2565 GET_UNSIGNED_NUMBER (lower_bound);
2566
2567 if (c == ',')
2568 {
2569 GET_UNSIGNED_NUMBER (upper_bound);
2570 if (upper_bound < 0) upper_bound = RE_DUP_MAX;
2571 }
2572 else
2573 /* Interval such as `{1}' => match exactly once. */
2574 upper_bound = lower_bound;
2575
2576 if (lower_bound < 0 || upper_bound > RE_DUP_MAX
2577 || lower_bound > upper_bound)
2578 {
2579 if (syntax & RE_NO_BK_BRACES)
2580 goto unfetch_interval;
2581 else
2582 FREE_STACK_RETURN (REG_BADBR);
2583 }
2584
2585 if (!(syntax & RE_NO_BK_BRACES))
2586 {
2587 if (c != '\\') FREE_STACK_RETURN (REG_EBRACE);
2588
2589 PATFETCH (c);
2590 }
2591
2592 if (c != '}')
2593 {
2594 if (syntax & RE_NO_BK_BRACES)
2595 goto unfetch_interval;
2596 else
2597 FREE_STACK_RETURN (REG_BADBR);
2598 }
2599
2600 /* We just parsed a valid interval. */
2601
2602 /* If it's invalid to have no preceding re. */
2603 if (!laststart)
2604 {
2605 if (syntax & RE_CONTEXT_INVALID_OPS)
2606 FREE_STACK_RETURN (REG_BADRPT);
2607 else if (syntax & RE_CONTEXT_INDEP_OPS)
2608 laststart = b;
2609 else
2610 goto unfetch_interval;
2611 }
2612
2613 /* If the upper bound is zero, don't want to succeed at
2614 all; jump from `laststart' to `b + 3', which will be
2615 the end of the buffer after we insert the jump. */
2616 if (upper_bound == 0)
2617 {
2618 GET_BUFFER_SPACE (3);
2619 INSERT_JUMP (jump, laststart, b + 3);
2620 b += 3;
2621 }
2622
2623 /* Otherwise, we have a nontrivial interval. When
2624 we're all done, the pattern will look like:
2625 set_number_at <jump count> <upper bound>
2626 set_number_at <succeed_n count> <lower bound>
2627 succeed_n <after jump addr> <succeed_n count>
2628 <body of loop>
2629 jump_n <succeed_n addr> <jump count>
2630 (The upper bound and `jump_n' are omitted if
2631 `upper_bound' is 1, though.) */
2632 else
2633 { /* If the upper bound is > 1, we need to insert
2634 more at the end of the loop. */
2635 unsigned nbytes = 10 + (upper_bound > 1) * 10;
2636
2637 GET_BUFFER_SPACE (nbytes);
2638
2639 /* Initialize lower bound of the `succeed_n', even
2640 though it will be set during matching by its
2641 attendant `set_number_at' (inserted next),
2642 because `re_compile_fastmap' needs to know.
2643 Jump to the `jump_n' we might insert below. */
2644 INSERT_JUMP2 (succeed_n, laststart,
2645 b + 5 + (upper_bound > 1) * 5,
2646 lower_bound);
2647 b += 5;
2648
2649 /* Code to initialize the lower bound. Insert
2650 before the `succeed_n'. The `5' is the last two
2651 bytes of this `set_number_at', plus 3 bytes of
2652 the following `succeed_n'. */
2653 insert_op2 (set_number_at, laststart, 5, lower_bound, b);
2654 b += 5;
2655
2656 if (upper_bound > 1)
2657 { /* More than one repetition is allowed, so
2658 append a backward jump to the `succeed_n'
2659 that starts this interval.
2660
2661 When we've reached this during matching,
2662 we'll have matched the interval once, so
2663 jump back only `upper_bound - 1' times. */
2664 STORE_JUMP2 (jump_n, b, laststart + 5,
2665 upper_bound - 1);
2666 b += 5;
2667
2668 /* The location we want to set is the second
2669 parameter of the `jump_n'; that is `b-2' as
2670 an absolute address. `laststart' will be
2671 the `set_number_at' we're about to insert;
2672 `laststart+3' the number to set, the source
2673 for the relative address. But we are
2674 inserting into the middle of the pattern --
2675 so everything is getting moved up by 5.
2676 Conclusion: (b - 2) - (laststart + 3) + 5,
2677 i.e., b - laststart.
2678
2679 We insert this at the beginning of the loop
2680 so that if we fail during matching, we'll
2681 reinitialize the bounds. */
2682 insert_op2 (set_number_at, laststart, b - laststart,
2683 upper_bound - 1, b);
2684 b += 5;
2685 }
2686 }
2687 pending_exact = 0;
2688 beg_interval = NULL;
2689 }
2690 break;
2691
2692 unfetch_interval:
2693 /* If an invalid interval, match the characters as literals. */
2694 assert (beg_interval);
2695 p = beg_interval;
2696 beg_interval = NULL;
2697
2698 /* normal_char and normal_backslash need `c'. */
2699 PATFETCH (c);
2700
2701 if (!(syntax & RE_NO_BK_BRACES))
2702 {
2703 if (p > pattern && p[-1] == '\\')
2704 goto normal_backslash;
2705 }
2706 goto normal_char;
2707
2708 #ifdef emacs
2709 /* There is no way to specify the before_dot and after_dot
2710 operators. rms says this is ok. --karl */
2711 case '=':
2712 BUF_PUSH (at_dot);
2713 break;
2714
2715 case 's':
2716 laststart = b;
2717 PATFETCH (c);
2718 BUF_PUSH_2 (syntaxspec, syntax_spec_code[c]);
2719 break;
2720
2721 case 'S':
2722 laststart = b;
2723 PATFETCH (c);
2724 BUF_PUSH_2 (notsyntaxspec, syntax_spec_code[c]);
2725 break;
2726 #endif /* emacs */
2727
2728
2729 case 'w':
2730 if (syntax & RE_NO_GNU_OPS)
2731 goto normal_char;
2732 laststart = b;
2733 BUF_PUSH (wordchar);
2734 break;
2735
2736
2737 case 'W':
2738 if (syntax & RE_NO_GNU_OPS)
2739 goto normal_char;
2740 laststart = b;
2741 BUF_PUSH (notwordchar);
2742 break;
2743
2744
2745 case '<':
2746 if (syntax & RE_NO_GNU_OPS)
2747 goto normal_char;
2748 BUF_PUSH (wordbeg);
2749 break;
2750
2751 case '>':
2752 if (syntax & RE_NO_GNU_OPS)
2753 goto normal_char;
2754 BUF_PUSH (wordend);
2755 break;
2756
2757 case 'b':
2758 if (syntax & RE_NO_GNU_OPS)
2759 goto normal_char;
2760 BUF_PUSH (wordbound);
2761 break;
2762
2763 case 'B':
2764 if (syntax & RE_NO_GNU_OPS)
2765 goto normal_char;
2766 BUF_PUSH (notwordbound);
2767 break;
2768
2769 case '`':
2770 if (syntax & RE_NO_GNU_OPS)
2771 goto normal_char;
2772 BUF_PUSH (begbuf);
2773 break;
2774
2775 case '\'':
2776 if (syntax & RE_NO_GNU_OPS)
2777 goto normal_char;
2778 BUF_PUSH (endbuf);
2779 break;
2780
2781 case '1': case '2': case '3': case '4': case '5':
2782 case '6': case '7': case '8': case '9':
2783 if (syntax & RE_NO_BK_REFS)
2784 goto normal_char;
2785
2786 c1 = c - '0';
2787
2788 if (c1 > regnum)
2789 FREE_STACK_RETURN (REG_ESUBREG);
2790
2791 /* Can't back reference to a subexpression if inside of it. */
2792 if (group_in_compile_stack (compile_stack, (regnum_t) c1))
2793 goto normal_char;
2794
2795 laststart = b;
2796 BUF_PUSH_2 (duplicate, c1);
2797 break;
2798
2799
2800 case '+':
2801 case '?':
2802 if (syntax & RE_BK_PLUS_QM)
2803 goto handle_plus;
2804 else
2805 goto normal_backslash;
2806
2807 default:
2808 normal_backslash:
2809 /* You might think it would be useful for \ to mean
2810 not to translate; but if we don't translate it
2811 it will never match anything. */
2812 c = TRANSLATE (c);
2813 goto normal_char;
2814 }
2815 break;
2816
2817
2818 default:
2819 /* Expects the character in `c'. */
2820 normal_char:
2821 /* If no exactn currently being built. */
2822 if (!pending_exact
2823
2824 /* If last exactn not at current position. */
2825 || pending_exact + *pending_exact + 1 != b
2826
2827 /* We have only one byte following the exactn for the count. */
2828 || *pending_exact == (1 << BYTEWIDTH) - 1
2829
2830 /* If followed by a repetition operator. */
2831 || *p == '*' || *p == '^'
2832 || ((syntax & RE_BK_PLUS_QM)
2833 ? *p == '\\' && (p[1] == '+' || p[1] == '?')
2834 : (*p == '+' || *p == '?'))
2835 || ((syntax & RE_INTERVALS)
2836 && ((syntax & RE_NO_BK_BRACES)
2837 ? *p == '{'
2838 : (p[0] == '\\' && p[1] == '{'))))
2839 {
2840 /* Start building a new exactn. */
2841
2842 laststart = b;
2843
2844 BUF_PUSH_2 (exactn, 0);
2845 pending_exact = b - 1;
2846 }
2847
2848 BUF_PUSH (c);
2849 (*pending_exact)++;
2850 break;
2851 } /* switch (c) */
2852 } /* while p != pend */
2853
2854
2855 /* Through the pattern now. */
2856
2857 if (fixup_alt_jump)
2858 STORE_JUMP (jump_past_alt, fixup_alt_jump, b);
2859
2860 if (!COMPILE_STACK_EMPTY)
2861 FREE_STACK_RETURN (REG_EPAREN);
2862
2863 /* If we don't want backtracking, force success
2864 the first time we reach the end of the compiled pattern. */
2865 if (syntax & RE_NO_POSIX_BACKTRACKING)
2866 BUF_PUSH (succeed);
2867
2868 free (compile_stack.stack);
2869
2870 /* We have succeeded; set the length of the buffer. */
2871 bufp->used = b - bufp->buffer;
2872
2873 #ifdef DEBUG
2874 if (debug)
2875 {
2876 DEBUG_PRINT1 ("\nCompiled pattern: \n");
2877 print_compiled_pattern (bufp);
2878 }
2879 #endif /* DEBUG */
2880
2881 #ifndef MATCH_MAY_ALLOCATE
2882 /* Initialize the failure stack to the largest possible stack. This
2883 isn't necessary unless we're trying to avoid calling alloca in
2884 the search and match routines. */
2885 {
2886 int num_regs = bufp->re_nsub + 1;
2887
2888 /* Since DOUBLE_FAIL_STACK refuses to double only if the current size
2889 is strictly greater than re_max_failures, the largest possible stack
2890 is 2 * re_max_failures failure points. */
2891 if (fail_stack.size < (2 * re_max_failures * MAX_FAILURE_ITEMS))
2892 {
2893 fail_stack.size = (2 * re_max_failures * MAX_FAILURE_ITEMS);
2894
2895 # ifdef emacs
2896 if (! fail_stack.stack)
2897 fail_stack.stack
2898 = (fail_stack_elt_t *) xmalloc (fail_stack.size
2899 * sizeof (fail_stack_elt_t));
2900 else
2901 fail_stack.stack
2902 = (fail_stack_elt_t *) xrealloc (fail_stack.stack,
2903 (fail_stack.size
2904 * sizeof (fail_stack_elt_t)));
2905 # else /* not emacs */
2906 if (! fail_stack.stack)
2907 fail_stack.stack
2908 = (fail_stack_elt_t *) malloc (fail_stack.size
2909 * sizeof (fail_stack_elt_t));
2910 else
2911 fail_stack.stack
2912 = (fail_stack_elt_t *) realloc (fail_stack.stack,
2913 (fail_stack.size
2914 * sizeof (fail_stack_elt_t)));
2915 # endif /* not emacs */
2916 }
2917
2918 regex_grow_registers (num_regs);
2919 }
2920 #endif /* not MATCH_MAY_ALLOCATE */
2921
2922 return REG_NOERROR;
2923 } /* regex_compile */
2924 \f
2925 /* Subroutines for `regex_compile'. */
2926
2927 /* Store OP at LOC followed by two-byte integer parameter ARG. */
2928
2929 static void
2930 store_op1 (op, loc, arg)
2931 re_opcode_t op;
2932 unsigned char *loc;
2933 int arg;
2934 {
2935 *loc = (unsigned char) op;
2936 STORE_NUMBER (loc + 1, arg);
2937 }
2938
2939
2940 /* Like `store_op1', but for two two-byte parameters ARG1 and ARG2. */
2941
2942 static void
2943 store_op2 (op, loc, arg1, arg2)
2944 re_opcode_t op;
2945 unsigned char *loc;
2946 int arg1, arg2;
2947 {
2948 *loc = (unsigned char) op;
2949 STORE_NUMBER (loc + 1, arg1);
2950 STORE_NUMBER (loc + 3, arg2);
2951 }
2952
2953
2954 /* Copy the bytes from LOC to END to open up three bytes of space at LOC
2955 for OP followed by two-byte integer parameter ARG. */
2956
2957 static void
2958 insert_op1 (op, loc, arg, end)
2959 re_opcode_t op;
2960 unsigned char *loc;
2961 int arg;
2962 unsigned char *end;
2963 {
2964 register unsigned char *pfrom = end;
2965 register unsigned char *pto = end + 3;
2966
2967 while (pfrom != loc)
2968 *--pto = *--pfrom;
2969
2970 store_op1 (op, loc, arg);
2971 }
2972
2973
2974 /* Like `insert_op1', but for two two-byte parameters ARG1 and ARG2. */
2975
2976 static void
2977 insert_op2 (op, loc, arg1, arg2, end)
2978 re_opcode_t op;
2979 unsigned char *loc;
2980 int arg1, arg2;
2981 unsigned char *end;
2982 {
2983 register unsigned char *pfrom = end;
2984 register unsigned char *pto = end + 5;
2985
2986 while (pfrom != loc)
2987 *--pto = *--pfrom;
2988
2989 store_op2 (op, loc, arg1, arg2);
2990 }
2991
2992
2993 /* P points to just after a ^ in PATTERN. Return true if that ^ comes
2994 after an alternative or a begin-subexpression. We assume there is at
2995 least one character before the ^. */
2996
2997 static boolean
2998 at_begline_loc_p (pattern, p, syntax)
2999 const char *pattern, *p;
3000 reg_syntax_t syntax;
3001 {
3002 const char *prev = p - 2;
3003 boolean prev_prev_backslash = prev > pattern && prev[-1] == '\\';
3004
3005 return
3006 /* After a subexpression? */
3007 (*prev == '(' && (syntax & RE_NO_BK_PARENS || prev_prev_backslash))
3008 /* After an alternative? */
3009 || (*prev == '|' && (syntax & RE_NO_BK_VBAR || prev_prev_backslash));
3010 }
3011
3012
3013 /* The dual of at_begline_loc_p. This one is for $. We assume there is
3014 at least one character after the $, i.e., `P < PEND'. */
3015
3016 static boolean
3017 at_endline_loc_p (p, pend, syntax)
3018 const char *p, *pend;
3019 reg_syntax_t syntax;
3020 {
3021 const char *next = p;
3022 boolean next_backslash = *next == '\\';
3023 const char *next_next = p + 1 < pend ? p + 1 : 0;
3024
3025 return
3026 /* Before a subexpression? */
3027 (syntax & RE_NO_BK_PARENS ? *next == ')'
3028 : next_backslash && next_next && *next_next == ')')
3029 /* Before an alternative? */
3030 || (syntax & RE_NO_BK_VBAR ? *next == '|'
3031 : next_backslash && next_next && *next_next == '|');
3032 }
3033
3034
3035 /* Returns true if REGNUM is in one of COMPILE_STACK's elements and
3036 false if it's not. */
3037
3038 static boolean
3039 group_in_compile_stack (compile_stack, regnum)
3040 compile_stack_type compile_stack;
3041 regnum_t regnum;
3042 {
3043 int this_element;
3044
3045 for (this_element = compile_stack.avail - 1;
3046 this_element >= 0;
3047 this_element--)
3048 if (compile_stack.stack[this_element].regnum == regnum)
3049 return true;
3050
3051 return false;
3052 }
3053
3054
3055 /* Read the ending character of a range (in a bracket expression) from the
3056 uncompiled pattern *P_PTR (which ends at PEND). We assume the
3057 starting character is in `P[-2]'. (`P[-1]' is the character `-'.)
3058 Then we set the translation of all bits between the starting and
3059 ending characters (inclusive) in the compiled pattern B.
3060
3061 Return an error code.
3062
3063 We use these short variable names so we can use the same macros as
3064 `regex_compile' itself. */
3065
3066 static reg_errcode_t
3067 compile_range (p_ptr, pend, translate, syntax, b)
3068 const char **p_ptr, *pend;
3069 RE_TRANSLATE_TYPE translate;
3070 reg_syntax_t syntax;
3071 unsigned char *b;
3072 {
3073 unsigned this_char;
3074
3075 const char *p = *p_ptr;
3076 unsigned int range_start, range_end;
3077
3078 if (p == pend)
3079 return REG_ERANGE;
3080
3081 /* Even though the pattern is a signed `char *', we need to fetch
3082 with unsigned char *'s; if the high bit of the pattern character
3083 is set, the range endpoints will be negative if we fetch using a
3084 signed char *.
3085
3086 We also want to fetch the endpoints without translating them; the
3087 appropriate translation is done in the bit-setting loop below. */
3088 /* The SVR4 compiler on the 3B2 had trouble with unsigned const char *. */
3089 range_start = ((const unsigned char *) p)[-2];
3090 range_end = ((const unsigned char *) p)[0];
3091
3092 /* Have to increment the pointer into the pattern string, so the
3093 caller isn't still at the ending character. */
3094 (*p_ptr)++;
3095
3096 /* If the start is after the end, the range is empty. */
3097 if (range_start > range_end)
3098 return syntax & RE_NO_EMPTY_RANGES ? REG_ERANGE : REG_NOERROR;
3099
3100 /* Here we see why `this_char' has to be larger than an `unsigned
3101 char' -- the range is inclusive, so if `range_end' == 0xff
3102 (assuming 8-bit characters), we would otherwise go into an infinite
3103 loop, since all characters <= 0xff. */
3104 for (this_char = range_start; this_char <= range_end; this_char++)
3105 {
3106 SET_LIST_BIT (TRANSLATE (this_char));
3107 }
3108
3109 return REG_NOERROR;
3110 }
3111 \f
3112 /* re_compile_fastmap computes a ``fastmap'' for the compiled pattern in
3113 BUFP. A fastmap records which of the (1 << BYTEWIDTH) possible
3114 characters can start a string that matches the pattern. This fastmap
3115 is used by re_search to skip quickly over impossible starting points.
3116
3117 The caller must supply the address of a (1 << BYTEWIDTH)-byte data
3118 area as BUFP->fastmap.
3119
3120 We set the `fastmap', `fastmap_accurate', and `can_be_null' fields in
3121 the pattern buffer.
3122
3123 Returns 0 if we succeed, -2 if an internal error. */
3124
3125 int
3126 re_compile_fastmap (bufp)
3127 struct re_pattern_buffer *bufp;
3128 {
3129 int j, k;
3130 #ifdef MATCH_MAY_ALLOCATE
3131 fail_stack_type fail_stack;
3132 #endif
3133 #ifndef REGEX_MALLOC
3134 char *destination;
3135 #endif
3136
3137 register char *fastmap = bufp->fastmap;
3138 unsigned char *pattern = bufp->buffer;
3139 unsigned char *p = pattern;
3140 register unsigned char *pend = pattern + bufp->used;
3141
3142 #ifdef REL_ALLOC
3143 /* This holds the pointer to the failure stack, when
3144 it is allocated relocatably. */
3145 fail_stack_elt_t *failure_stack_ptr;
3146 #endif
3147
3148 /* Assume that each path through the pattern can be null until
3149 proven otherwise. We set this false at the bottom of switch
3150 statement, to which we get only if a particular path doesn't
3151 match the empty string. */
3152 boolean path_can_be_null = true;
3153
3154 /* We aren't doing a `succeed_n' to begin with. */
3155 boolean succeed_n_p = false;
3156
3157 assert (fastmap != NULL && p != NULL);
3158
3159 INIT_FAIL_STACK ();
3160 bzero (fastmap, 1 << BYTEWIDTH); /* Assume nothing's valid. */
3161 bufp->fastmap_accurate = 1; /* It will be when we're done. */
3162 bufp->can_be_null = 0;
3163
3164 while (1)
3165 {
3166 if (p == pend || *p == succeed)
3167 {
3168 /* We have reached the (effective) end of pattern. */
3169 if (!FAIL_STACK_EMPTY ())
3170 {
3171 bufp->can_be_null |= path_can_be_null;
3172
3173 /* Reset for next path. */
3174 path_can_be_null = true;
3175
3176 p = fail_stack.stack[--fail_stack.avail].pointer;
3177
3178 continue;
3179 }
3180 else
3181 break;
3182 }
3183
3184 /* We should never be about to go beyond the end of the pattern. */
3185 assert (p < pend);
3186
3187 switch (SWITCH_ENUM_CAST ((re_opcode_t) *p++))
3188 {
3189
3190 /* I guess the idea here is to simply not bother with a fastmap
3191 if a backreference is used, since it's too hard to figure out
3192 the fastmap for the corresponding group. Setting
3193 `can_be_null' stops `re_search_2' from using the fastmap, so
3194 that is all we do. */
3195 case duplicate:
3196 bufp->can_be_null = 1;
3197 goto done;
3198
3199
3200 /* Following are the cases which match a character. These end
3201 with `break'. */
3202
3203 case exactn:
3204 fastmap[p[1]] = 1;
3205 break;
3206
3207
3208 case charset:
3209 for (j = *p++ * BYTEWIDTH - 1; j >= 0; j--)
3210 if (p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH)))
3211 fastmap[j] = 1;
3212 break;
3213
3214
3215 case charset_not:
3216 /* Chars beyond end of map must be allowed. */
3217 for (j = *p * BYTEWIDTH; j < (1 << BYTEWIDTH); j++)
3218 fastmap[j] = 1;
3219
3220 for (j = *p++ * BYTEWIDTH - 1; j >= 0; j--)
3221 if (!(p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH))))
3222 fastmap[j] = 1;
3223 break;
3224
3225
3226 case wordchar:
3227 for (j = 0; j < (1 << BYTEWIDTH); j++)
3228 if (SYNTAX (j) == Sword)
3229 fastmap[j] = 1;
3230 break;
3231
3232
3233 case notwordchar:
3234 for (j = 0; j < (1 << BYTEWIDTH); j++)
3235 if (SYNTAX (j) != Sword)
3236 fastmap[j] = 1;
3237 break;
3238
3239
3240 case anychar:
3241 {
3242 int fastmap_newline = fastmap['\n'];
3243
3244 /* `.' matches anything ... */
3245 for (j = 0; j < (1 << BYTEWIDTH); j++)
3246 fastmap[j] = 1;
3247
3248 /* ... except perhaps newline. */
3249 if (!(bufp->syntax & RE_DOT_NEWLINE))
3250 fastmap['\n'] = fastmap_newline;
3251
3252 /* Return if we have already set `can_be_null'; if we have,
3253 then the fastmap is irrelevant. Something's wrong here. */
3254 else if (bufp->can_be_null)
3255 goto done;
3256
3257 /* Otherwise, have to check alternative paths. */
3258 break;
3259 }
3260
3261 #ifdef emacs
3262 case syntaxspec:
3263 k = *p++;
3264 for (j = 0; j < (1 << BYTEWIDTH); j++)
3265 if (SYNTAX (j) == (enum syntaxcode) k)
3266 fastmap[j] = 1;
3267 break;
3268
3269
3270 case notsyntaxspec:
3271 k = *p++;
3272 for (j = 0; j < (1 << BYTEWIDTH); j++)
3273 if (SYNTAX (j) != (enum syntaxcode) k)
3274 fastmap[j] = 1;
3275 break;
3276
3277
3278 /* All cases after this match the empty string. These end with
3279 `continue'. */
3280
3281
3282 case before_dot:
3283 case at_dot:
3284 case after_dot:
3285 continue;
3286 #endif /* emacs */
3287
3288
3289 case no_op:
3290 case begline:
3291 case endline:
3292 case begbuf:
3293 case endbuf:
3294 case wordbound:
3295 case notwordbound:
3296 case wordbeg:
3297 case wordend:
3298 case push_dummy_failure:
3299 continue;
3300
3301
3302 case jump_n:
3303 case pop_failure_jump:
3304 case maybe_pop_jump:
3305 case jump:
3306 case jump_past_alt:
3307 case dummy_failure_jump:
3308 EXTRACT_NUMBER_AND_INCR (j, p);
3309 p += j;
3310 if (j > 0)
3311 continue;
3312
3313 /* Jump backward implies we just went through the body of a
3314 loop and matched nothing. Opcode jumped to should be
3315 `on_failure_jump' or `succeed_n'. Just treat it like an
3316 ordinary jump. For a * loop, it has pushed its failure
3317 point already; if so, discard that as redundant. */
3318 if ((re_opcode_t) *p != on_failure_jump
3319 && (re_opcode_t) *p != succeed_n)
3320 continue;
3321
3322 p++;
3323 EXTRACT_NUMBER_AND_INCR (j, p);
3324 p += j;
3325
3326 /* If what's on the stack is where we are now, pop it. */
3327 if (!FAIL_STACK_EMPTY ()
3328 && fail_stack.stack[fail_stack.avail - 1].pointer == p)
3329 fail_stack.avail--;
3330
3331 continue;
3332
3333
3334 case on_failure_jump:
3335 case on_failure_keep_string_jump:
3336 handle_on_failure_jump:
3337 EXTRACT_NUMBER_AND_INCR (j, p);
3338
3339 /* For some patterns, e.g., `(a?)?', `p+j' here points to the
3340 end of the pattern. We don't want to push such a point,
3341 since when we restore it above, entering the switch will
3342 increment `p' past the end of the pattern. We don't need
3343 to push such a point since we obviously won't find any more
3344 fastmap entries beyond `pend'. Such a pattern can match
3345 the null string, though. */
3346 if (p + j < pend)
3347 {
3348 if (!PUSH_PATTERN_OP (p + j, fail_stack))
3349 {
3350 RESET_FAIL_STACK ();
3351 return -2;
3352 }
3353 }
3354 else
3355 bufp->can_be_null = 1;
3356
3357 if (succeed_n_p)
3358 {
3359 EXTRACT_NUMBER_AND_INCR (k, p); /* Skip the n. */
3360 succeed_n_p = false;
3361 }
3362
3363 continue;
3364
3365
3366 case succeed_n:
3367 /* Get to the number of times to succeed. */
3368 p += 2;
3369
3370 /* Increment p past the n for when k != 0. */
3371 EXTRACT_NUMBER_AND_INCR (k, p);
3372 if (k == 0)
3373 {
3374 p -= 4;
3375 succeed_n_p = true; /* Spaghetti code alert. */
3376 goto handle_on_failure_jump;
3377 }
3378 continue;
3379
3380
3381 case set_number_at:
3382 p += 4;
3383 continue;
3384
3385
3386 case start_memory:
3387 case stop_memory:
3388 p += 2;
3389 continue;
3390
3391
3392 default:
3393 abort (); /* We have listed all the cases. */
3394 } /* switch *p++ */
3395
3396 /* Getting here means we have found the possible starting
3397 characters for one path of the pattern -- and that the empty
3398 string does not match. We need not follow this path further.
3399 Instead, look at the next alternative (remembered on the
3400 stack), or quit if no more. The test at the top of the loop
3401 does these things. */
3402 path_can_be_null = false;
3403 p = pend;
3404 } /* while p */
3405
3406 /* Set `can_be_null' for the last path (also the first path, if the
3407 pattern is empty). */
3408 bufp->can_be_null |= path_can_be_null;
3409
3410 done:
3411 RESET_FAIL_STACK ();
3412 return 0;
3413 } /* re_compile_fastmap */
3414 #ifdef _LIBC
3415 weak_alias (__re_compile_fastmap, re_compile_fastmap)
3416 #endif
3417 \f
3418 /* Set REGS to hold NUM_REGS registers, storing them in STARTS and
3419 ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use
3420 this memory for recording register information. STARTS and ENDS
3421 must be allocated using the malloc library routine, and must each
3422 be at least NUM_REGS * sizeof (regoff_t) bytes long.
3423
3424 If NUM_REGS == 0, then subsequent matches should allocate their own
3425 register data.
3426
3427 Unless this function is called, the first search or match using
3428 PATTERN_BUFFER will allocate its own register data, without
3429 freeing the old data. */
3430
3431 void
3432 re_set_registers (bufp, regs, num_regs, starts, ends)
3433 struct re_pattern_buffer *bufp;
3434 struct re_registers *regs;
3435 unsigned num_regs;
3436 regoff_t *starts, *ends;
3437 {
3438 if (num_regs)
3439 {
3440 bufp->regs_allocated = REGS_REALLOCATE;
3441 regs->num_regs = num_regs;
3442 regs->start = starts;
3443 regs->end = ends;
3444 }
3445 else
3446 {
3447 bufp->regs_allocated = REGS_UNALLOCATED;
3448 regs->num_regs = 0;
3449 regs->start = regs->end = (regoff_t *) 0;
3450 }
3451 }
3452 #ifdef _LIBC
3453 weak_alias (__re_set_registers, re_set_registers)
3454 #endif
3455 \f
3456 /* Searching routines. */
3457
3458 /* Like re_search_2, below, but only one string is specified, and
3459 doesn't let you say where to stop matching. */
3460
3461 int
3462 re_search (bufp, string, size, startpos, range, regs)
3463 struct re_pattern_buffer *bufp;
3464 const char *string;
3465 int size, startpos, range;
3466 struct re_registers *regs;
3467 {
3468 return re_search_2 (bufp, NULL, 0, string, size, startpos, range,
3469 regs, size);
3470 }
3471 #ifdef _LIBC
3472 weak_alias (__re_search, re_search)
3473 #endif
3474
3475
3476 /* Using the compiled pattern in BUFP->buffer, first tries to match the
3477 virtual concatenation of STRING1 and STRING2, starting first at index
3478 STARTPOS, then at STARTPOS + 1, and so on.
3479
3480 STRING1 and STRING2 have length SIZE1 and SIZE2, respectively.
3481
3482 RANGE is how far to scan while trying to match. RANGE = 0 means try
3483 only at STARTPOS; in general, the last start tried is STARTPOS +
3484 RANGE.
3485
3486 In REGS, return the indices of the virtual concatenation of STRING1
3487 and STRING2 that matched the entire BUFP->buffer and its contained
3488 subexpressions.
3489
3490 Do not consider matching one past the index STOP in the virtual
3491 concatenation of STRING1 and STRING2.
3492
3493 We return either the position in the strings at which the match was
3494 found, -1 if no match, or -2 if error (such as failure
3495 stack overflow). */
3496
3497 int
3498 re_search_2 (bufp, string1, size1, string2, size2, startpos, range, regs, stop)
3499 struct re_pattern_buffer *bufp;
3500 const char *string1, *string2;
3501 int size1, size2;
3502 int startpos;
3503 int range;
3504 struct re_registers *regs;
3505 int stop;
3506 {
3507 int val;
3508 register char *fastmap = bufp->fastmap;
3509 register RE_TRANSLATE_TYPE translate = bufp->translate;
3510 int total_size = size1 + size2;
3511 int endpos = startpos + range;
3512
3513 /* Check for out-of-range STARTPOS. */
3514 if (startpos < 0 || startpos > total_size)
3515 return -1;
3516
3517 /* Fix up RANGE if it might eventually take us outside
3518 the virtual concatenation of STRING1 and STRING2.
3519 Make sure we won't move STARTPOS below 0 or above TOTAL_SIZE. */
3520 if (endpos < 0)
3521 range = 0 - startpos;
3522 else if (endpos > total_size)
3523 range = total_size - startpos;
3524
3525 /* If the search isn't to be a backwards one, don't waste time in a
3526 search for a pattern that must be anchored. */
3527 if (bufp->used > 0 && (re_opcode_t) bufp->buffer[0] == begbuf && range > 0)
3528 {
3529 if (startpos > 0)
3530 return -1;
3531 else
3532 range = 1;
3533 }
3534
3535 #ifdef emacs
3536 /* In a forward search for something that starts with \=.
3537 don't keep searching past point. */
3538 if (bufp->used > 0 && (re_opcode_t) bufp->buffer[0] == at_dot && range > 0)
3539 {
3540 range = PT - startpos;
3541 if (range <= 0)
3542 return -1;
3543 }
3544 #endif /* emacs */
3545
3546 /* Update the fastmap now if not correct already. */
3547 if (fastmap && !bufp->fastmap_accurate)
3548 if (re_compile_fastmap (bufp) == -2)
3549 return -2;
3550
3551 /* Loop through the string, looking for a place to start matching. */
3552 for (;;)
3553 {
3554 /* If a fastmap is supplied, skip quickly over characters that
3555 cannot be the start of a match. If the pattern can match the
3556 null string, however, we don't need to skip characters; we want
3557 the first null string. */
3558 if (fastmap && startpos < total_size && !bufp->can_be_null)
3559 {
3560 if (range > 0) /* Searching forwards. */
3561 {
3562 register const char *d;
3563 register int lim = 0;
3564 int irange = range;
3565
3566 if (startpos < size1 && startpos + range >= size1)
3567 lim = range - (size1 - startpos);
3568
3569 d = (startpos >= size1 ? string2 - size1 : string1) + startpos;
3570
3571 /* Written out as an if-else to avoid testing `translate'
3572 inside the loop. */
3573 if (translate)
3574 while (range > lim
3575 && !fastmap[(unsigned char)
3576 translate[(unsigned char) *d++]])
3577 range--;
3578 else
3579 while (range > lim && !fastmap[(unsigned char) *d++])
3580 range--;
3581
3582 startpos += irange - range;
3583 }
3584 else /* Searching backwards. */
3585 {
3586 register char c = (size1 == 0 || startpos >= size1
3587 ? string2[startpos - size1]
3588 : string1[startpos]);
3589
3590 if (!fastmap[(unsigned char) TRANSLATE (c)])
3591 goto advance;
3592 }
3593 }
3594
3595 /* If can't match the null string, and that's all we have left, fail. */
3596 if (range >= 0 && startpos == total_size && fastmap
3597 && !bufp->can_be_null)
3598 return -1;
3599
3600 val = re_match_2_internal (bufp, string1, size1, string2, size2,
3601 startpos, regs, stop);
3602 #ifndef REGEX_MALLOC
3603 # ifdef C_ALLOCA
3604 alloca (0);
3605 # endif
3606 #endif
3607
3608 if (val >= 0)
3609 return startpos;
3610
3611 if (val == -2)
3612 return -2;
3613
3614 advance:
3615 if (!range)
3616 break;
3617 else if (range > 0)
3618 {
3619 range--;
3620 startpos++;
3621 }
3622 else
3623 {
3624 range++;
3625 startpos--;
3626 }
3627 }
3628 return -1;
3629 } /* re_search_2 */
3630 #ifdef _LIBC
3631 weak_alias (__re_search_2, re_search_2)
3632 #endif
3633 \f
3634 /* This converts PTR, a pointer into one of the search strings `string1'
3635 and `string2' into an offset from the beginning of that string. */
3636 #define POINTER_TO_OFFSET(ptr) \
3637 (FIRST_STRING_P (ptr) \
3638 ? ((regoff_t) ((ptr) - string1)) \
3639 : ((regoff_t) ((ptr) - string2 + size1)))
3640
3641 /* Macros for dealing with the split strings in re_match_2. */
3642
3643 #define MATCHING_IN_FIRST_STRING (dend == end_match_1)
3644
3645 /* Call before fetching a character with *d. This switches over to
3646 string2 if necessary. */
3647 #define PREFETCH() \
3648 while (d == dend) \
3649 { \
3650 /* End of string2 => fail. */ \
3651 if (dend == end_match_2) \
3652 goto fail; \
3653 /* End of string1 => advance to string2. */ \
3654 d = string2; \
3655 dend = end_match_2; \
3656 }
3657
3658
3659 /* Test if at very beginning or at very end of the virtual concatenation
3660 of `string1' and `string2'. If only one string, it's `string2'. */
3661 #define AT_STRINGS_BEG(d) ((d) == (size1 ? string1 : string2) || !size2)
3662 #define AT_STRINGS_END(d) ((d) == end2)
3663
3664
3665 /* Test if D points to a character which is word-constituent. We have
3666 two special cases to check for: if past the end of string1, look at
3667 the first character in string2; and if before the beginning of
3668 string2, look at the last character in string1. */
3669 #define WORDCHAR_P(d) \
3670 (SYNTAX ((d) == end1 ? *string2 \
3671 : (d) == string2 - 1 ? *(end1 - 1) : *(d)) \
3672 == Sword)
3673
3674 /* Disabled due to a compiler bug -- see comment at case wordbound */
3675 #if 0
3676 /* Test if the character before D and the one at D differ with respect
3677 to being word-constituent. */
3678 #define AT_WORD_BOUNDARY(d) \
3679 (AT_STRINGS_BEG (d) || AT_STRINGS_END (d) \
3680 || WORDCHAR_P (d - 1) != WORDCHAR_P (d))
3681 #endif
3682
3683 /* Free everything we malloc. */
3684 #ifdef MATCH_MAY_ALLOCATE
3685 # define FREE_VAR(var) if (var) REGEX_FREE (var); var = NULL
3686 # define FREE_VARIABLES() \
3687 do { \
3688 REGEX_FREE_STACK (fail_stack.stack); \
3689 FREE_VAR (regstart); \
3690 FREE_VAR (regend); \
3691 FREE_VAR (old_regstart); \
3692 FREE_VAR (old_regend); \
3693 FREE_VAR (best_regstart); \
3694 FREE_VAR (best_regend); \
3695 FREE_VAR (reg_info); \
3696 FREE_VAR (reg_dummy); \
3697 FREE_VAR (reg_info_dummy); \
3698 } while (0)
3699 #else
3700 # define FREE_VARIABLES() ((void)0) /* Do nothing! But inhibit gcc warning. */
3701 #endif /* not MATCH_MAY_ALLOCATE */
3702
3703 /* These values must meet several constraints. They must not be valid
3704 register values; since we have a limit of 255 registers (because
3705 we use only one byte in the pattern for the register number), we can
3706 use numbers larger than 255. They must differ by 1, because of
3707 NUM_FAILURE_ITEMS above. And the value for the lowest register must
3708 be larger than the value for the highest register, so we do not try
3709 to actually save any registers when none are active. */
3710 #define NO_HIGHEST_ACTIVE_REG (1 << BYTEWIDTH)
3711 #define NO_LOWEST_ACTIVE_REG (NO_HIGHEST_ACTIVE_REG + 1)
3712 \f
3713 /* Matching routines. */
3714
3715 #ifndef emacs /* Emacs never uses this. */
3716 /* re_match is like re_match_2 except it takes only a single string. */
3717
3718 int
3719 re_match (bufp, string, size, pos, regs)
3720 struct re_pattern_buffer *bufp;
3721 const char *string;
3722 int size, pos;
3723 struct re_registers *regs;
3724 {
3725 int result = re_match_2_internal (bufp, NULL, 0, string, size,
3726 pos, regs, size);
3727 # ifndef REGEX_MALLOC
3728 # ifdef C_ALLOCA
3729 alloca (0);
3730 # endif
3731 # endif
3732 return result;
3733 }
3734 # ifdef _LIBC
3735 weak_alias (__re_match, re_match)
3736 # endif
3737 #endif /* not emacs */
3738
3739 static boolean group_match_null_string_p _RE_ARGS ((unsigned char **p,
3740 unsigned char *end,
3741 register_info_type *reg_info));
3742 static boolean alt_match_null_string_p _RE_ARGS ((unsigned char *p,
3743 unsigned char *end,
3744 register_info_type *reg_info));
3745 static boolean common_op_match_null_string_p _RE_ARGS ((unsigned char **p,
3746 unsigned char *end,
3747 register_info_type *reg_info));
3748 static int bcmp_translate _RE_ARGS ((const char *s1, const char *s2,
3749 int len, char *translate));
3750
3751 /* re_match_2 matches the compiled pattern in BUFP against the
3752 the (virtual) concatenation of STRING1 and STRING2 (of length SIZE1
3753 and SIZE2, respectively). We start matching at POS, and stop
3754 matching at STOP.
3755
3756 If REGS is non-null and the `no_sub' field of BUFP is nonzero, we
3757 store offsets for the substring each group matched in REGS. See the
3758 documentation for exactly how many groups we fill.
3759
3760 We return -1 if no match, -2 if an internal error (such as the
3761 failure stack overflowing). Otherwise, we return the length of the
3762 matched substring. */
3763
3764 int
3765 re_match_2 (bufp, string1, size1, string2, size2, pos, regs, stop)
3766 struct re_pattern_buffer *bufp;
3767 const char *string1, *string2;
3768 int size1, size2;
3769 int pos;
3770 struct re_registers *regs;
3771 int stop;
3772 {
3773 int result = re_match_2_internal (bufp, string1, size1, string2, size2,
3774 pos, regs, stop);
3775 #ifndef REGEX_MALLOC
3776 # ifdef C_ALLOCA
3777 alloca (0);
3778 # endif
3779 #endif
3780 return result;
3781 }
3782 #ifdef _LIBC
3783 weak_alias (__re_match_2, re_match_2)
3784 #endif
3785
3786 /* This is a separate function so that we can force an alloca cleanup
3787 afterwards. */
3788 static int
3789 re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
3790 struct re_pattern_buffer *bufp;
3791 const char *string1, *string2;
3792 int size1, size2;
3793 int pos;
3794 struct re_registers *regs;
3795 int stop;
3796 {
3797 /* General temporaries. */
3798 int mcnt;
3799 unsigned char *p1;
3800
3801 /* Just past the end of the corresponding string. */
3802 const char *end1, *end2;
3803
3804 /* Pointers into string1 and string2, just past the last characters in
3805 each to consider matching. */
3806 const char *end_match_1, *end_match_2;
3807
3808 /* Where we are in the data, and the end of the current string. */
3809 const char *d, *dend;
3810
3811 /* Where we are in the pattern, and the end of the pattern. */
3812 unsigned char *p = bufp->buffer;
3813 register unsigned char *pend = p + bufp->used;
3814
3815 /* Mark the opcode just after a start_memory, so we can test for an
3816 empty subpattern when we get to the stop_memory. */
3817 unsigned char *just_past_start_mem = 0;
3818
3819 /* We use this to map every character in the string. */
3820 RE_TRANSLATE_TYPE translate = bufp->translate;
3821
3822 /* Failure point stack. Each place that can handle a failure further
3823 down the line pushes a failure point on this stack. It consists of
3824 restart, regend, and reg_info for all registers corresponding to
3825 the subexpressions we're currently inside, plus the number of such
3826 registers, and, finally, two char *'s. The first char * is where
3827 to resume scanning the pattern; the second one is where to resume
3828 scanning the strings. If the latter is zero, the failure point is
3829 a ``dummy''; if a failure happens and the failure point is a dummy,
3830 it gets discarded and the next next one is tried. */
3831 #ifdef MATCH_MAY_ALLOCATE /* otherwise, this is global. */
3832 fail_stack_type fail_stack;
3833 #endif
3834 #ifdef DEBUG
3835 static unsigned failure_id = 0;
3836 unsigned nfailure_points_pushed = 0, nfailure_points_popped = 0;
3837 #endif
3838
3839 #ifdef REL_ALLOC
3840 /* This holds the pointer to the failure stack, when
3841 it is allocated relocatably. */
3842 fail_stack_elt_t *failure_stack_ptr;
3843 #endif
3844
3845 /* We fill all the registers internally, independent of what we
3846 return, for use in backreferences. The number here includes
3847 an element for register zero. */
3848 size_t num_regs = bufp->re_nsub + 1;
3849
3850 /* The currently active registers. */
3851 active_reg_t lowest_active_reg = NO_LOWEST_ACTIVE_REG;
3852 active_reg_t highest_active_reg = NO_HIGHEST_ACTIVE_REG;
3853
3854 /* Information on the contents of registers. These are pointers into
3855 the input strings; they record just what was matched (on this
3856 attempt) by a subexpression part of the pattern, that is, the
3857 regnum-th regstart pointer points to where in the pattern we began
3858 matching and the regnum-th regend points to right after where we
3859 stopped matching the regnum-th subexpression. (The zeroth register
3860 keeps track of what the whole pattern matches.) */
3861 #ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
3862 const char **regstart, **regend;
3863 #endif
3864
3865 /* If a group that's operated upon by a repetition operator fails to
3866 match anything, then the register for its start will need to be
3867 restored because it will have been set to wherever in the string we
3868 are when we last see its open-group operator. Similarly for a
3869 register's end. */
3870 #ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
3871 const char **old_regstart, **old_regend;
3872 #endif
3873
3874 /* The is_active field of reg_info helps us keep track of which (possibly
3875 nested) subexpressions we are currently in. The matched_something
3876 field of reg_info[reg_num] helps us tell whether or not we have
3877 matched any of the pattern so far this time through the reg_num-th
3878 subexpression. These two fields get reset each time through any
3879 loop their register is in. */
3880 #ifdef MATCH_MAY_ALLOCATE /* otherwise, this is global. */
3881 register_info_type *reg_info;
3882 #endif
3883
3884 /* The following record the register info as found in the above
3885 variables when we find a match better than any we've seen before.
3886 This happens as we backtrack through the failure points, which in
3887 turn happens only if we have not yet matched the entire string. */
3888 unsigned best_regs_set = false;
3889 #ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
3890 const char **best_regstart, **best_regend;
3891 #endif
3892
3893 /* Logically, this is `best_regend[0]'. But we don't want to have to
3894 allocate space for that if we're not allocating space for anything
3895 else (see below). Also, we never need info about register 0 for
3896 any of the other register vectors, and it seems rather a kludge to
3897 treat `best_regend' differently than the rest. So we keep track of
3898 the end of the best match so far in a separate variable. We
3899 initialize this to NULL so that when we backtrack the first time
3900 and need to test it, it's not garbage. */
3901 const char *match_end = NULL;
3902
3903 /* This helps SET_REGS_MATCHED avoid doing redundant work. */
3904 int set_regs_matched_done = 0;
3905
3906 /* Used when we pop values we don't care about. */
3907 #ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
3908 const char **reg_dummy;
3909 register_info_type *reg_info_dummy;
3910 #endif
3911
3912 #ifdef DEBUG
3913 /* Counts the total number of registers pushed. */
3914 unsigned num_regs_pushed = 0;
3915 #endif
3916
3917 DEBUG_PRINT1 ("\n\nEntering re_match_2.\n");
3918
3919 INIT_FAIL_STACK ();
3920
3921 #ifdef MATCH_MAY_ALLOCATE
3922 /* Do not bother to initialize all the register variables if there are
3923 no groups in the pattern, as it takes a fair amount of time. If
3924 there are groups, we include space for register 0 (the whole
3925 pattern), even though we never use it, since it simplifies the
3926 array indexing. We should fix this. */
3927 if (bufp->re_nsub)
3928 {
3929 regstart = REGEX_TALLOC (num_regs, const char *);
3930 regend = REGEX_TALLOC (num_regs, const char *);
3931 old_regstart = REGEX_TALLOC (num_regs, const char *);
3932 old_regend = REGEX_TALLOC (num_regs, const char *);
3933 best_regstart = REGEX_TALLOC (num_regs, const char *);
3934 best_regend = REGEX_TALLOC (num_regs, const char *);
3935 reg_info = REGEX_TALLOC (num_regs, register_info_type);
3936 reg_dummy = REGEX_TALLOC (num_regs, const char *);
3937 reg_info_dummy = REGEX_TALLOC (num_regs, register_info_type);
3938
3939 if (!(regstart && regend && old_regstart && old_regend && reg_info
3940 && best_regstart && best_regend && reg_dummy && reg_info_dummy))
3941 {
3942 FREE_VARIABLES ();
3943 return -2;
3944 }
3945 }
3946 else
3947 {
3948 /* We must initialize all our variables to NULL, so that
3949 `FREE_VARIABLES' doesn't try to free them. */
3950 regstart = regend = old_regstart = old_regend = best_regstart
3951 = best_regend = reg_dummy = NULL;
3952 reg_info = reg_info_dummy = (register_info_type *) NULL;
3953 }
3954 #endif /* MATCH_MAY_ALLOCATE */
3955
3956 /* The starting position is bogus. */
3957 if (pos < 0 || pos > size1 + size2)
3958 {
3959 FREE_VARIABLES ();
3960 return -1;
3961 }
3962
3963 /* Initialize subexpression text positions to -1 to mark ones that no
3964 start_memory/stop_memory has been seen for. Also initialize the
3965 register information struct. */
3966 for (mcnt = 1; (unsigned) mcnt < num_regs; mcnt++)
3967 {
3968 regstart[mcnt] = regend[mcnt]
3969 = old_regstart[mcnt] = old_regend[mcnt] = REG_UNSET_VALUE;
3970
3971 REG_MATCH_NULL_STRING_P (reg_info[mcnt]) = MATCH_NULL_UNSET_VALUE;
3972 IS_ACTIVE (reg_info[mcnt]) = 0;
3973 MATCHED_SOMETHING (reg_info[mcnt]) = 0;
3974 EVER_MATCHED_SOMETHING (reg_info[mcnt]) = 0;
3975 }
3976
3977 /* We move `string1' into `string2' if the latter's empty -- but not if
3978 `string1' is null. */
3979 if (size2 == 0 && string1 != NULL)
3980 {
3981 string2 = string1;
3982 size2 = size1;
3983 string1 = 0;
3984 size1 = 0;
3985 }
3986 end1 = string1 + size1;
3987 end2 = string2 + size2;
3988
3989 /* Compute where to stop matching, within the two strings. */
3990 if (stop <= size1)
3991 {
3992 end_match_1 = string1 + stop;
3993 end_match_2 = string2;
3994 }
3995 else
3996 {
3997 end_match_1 = end1;
3998 end_match_2 = string2 + stop - size1;
3999 }
4000
4001 /* `p' scans through the pattern as `d' scans through the data.
4002 `dend' is the end of the input string that `d' points within. `d'
4003 is advanced into the following input string whenever necessary, but
4004 this happens before fetching; therefore, at the beginning of the
4005 loop, `d' can be pointing at the end of a string, but it cannot
4006 equal `string2'. */
4007 if (size1 > 0 && pos <= size1)
4008 {
4009 d = string1 + pos;
4010 dend = end_match_1;
4011 }
4012 else
4013 {
4014 d = string2 + pos - size1;
4015 dend = end_match_2;
4016 }
4017
4018 DEBUG_PRINT1 ("The compiled pattern is:\n");
4019 DEBUG_PRINT_COMPILED_PATTERN (bufp, p, pend);
4020 DEBUG_PRINT1 ("The string to match is: `");
4021 DEBUG_PRINT_DOUBLE_STRING (d, string1, size1, string2, size2);
4022 DEBUG_PRINT1 ("'\n");
4023
4024 /* This loops over pattern commands. It exits by returning from the
4025 function if the match is complete, or it drops through if the match
4026 fails at this starting point in the input data. */
4027 for (;;)
4028 {
4029 #ifdef _LIBC
4030 DEBUG_PRINT2 ("\n%p: ", p);
4031 #else
4032 DEBUG_PRINT2 ("\n0x%x: ", p);
4033 #endif
4034
4035 if (p == pend)
4036 { /* End of pattern means we might have succeeded. */
4037 DEBUG_PRINT1 ("end of pattern ... ");
4038
4039 /* If we haven't matched the entire string, and we want the
4040 longest match, try backtracking. */
4041 if (d != end_match_2)
4042 {
4043 /* 1 if this match ends in the same string (string1 or string2)
4044 as the best previous match. */
4045 boolean same_str_p = (FIRST_STRING_P (match_end)
4046 == MATCHING_IN_FIRST_STRING);
4047 /* 1 if this match is the best seen so far. */
4048 boolean best_match_p;
4049
4050 /* AIX compiler got confused when this was combined
4051 with the previous declaration. */
4052 if (same_str_p)
4053 best_match_p = d > match_end;
4054 else
4055 best_match_p = !MATCHING_IN_FIRST_STRING;
4056
4057 DEBUG_PRINT1 ("backtracking.\n");
4058
4059 if (!FAIL_STACK_EMPTY ())
4060 { /* More failure points to try. */
4061
4062 /* If exceeds best match so far, save it. */
4063 if (!best_regs_set || best_match_p)
4064 {
4065 best_regs_set = true;
4066 match_end = d;
4067
4068 DEBUG_PRINT1 ("\nSAVING match as best so far.\n");
4069
4070 for (mcnt = 1; (unsigned) mcnt < num_regs; mcnt++)
4071 {
4072 best_regstart[mcnt] = regstart[mcnt];
4073 best_regend[mcnt] = regend[mcnt];
4074 }
4075 }
4076 goto fail;
4077 }
4078
4079 /* If no failure points, don't restore garbage. And if
4080 last match is real best match, don't restore second
4081 best one. */
4082 else if (best_regs_set && !best_match_p)
4083 {
4084 restore_best_regs:
4085 /* Restore best match. It may happen that `dend ==
4086 end_match_1' while the restored d is in string2.
4087 For example, the pattern `x.*y.*z' against the
4088 strings `x-' and `y-z-', if the two strings are
4089 not consecutive in memory. */
4090 DEBUG_PRINT1 ("Restoring best registers.\n");
4091
4092 d = match_end;
4093 dend = ((d >= string1 && d <= end1)
4094 ? end_match_1 : end_match_2);
4095
4096 for (mcnt = 1; (unsigned) mcnt < num_regs; mcnt++)
4097 {
4098 regstart[mcnt] = best_regstart[mcnt];
4099 regend[mcnt] = best_regend[mcnt];
4100 }
4101 }
4102 } /* d != end_match_2 */
4103
4104 succeed_label:
4105 DEBUG_PRINT1 ("Accepting match.\n");
4106
4107 /* If caller wants register contents data back, do it. */
4108 if (regs && !bufp->no_sub)
4109 {
4110 /* Have the register data arrays been allocated? */
4111 if (bufp->regs_allocated == REGS_UNALLOCATED)
4112 { /* No. So allocate them with malloc. We need one
4113 extra element beyond `num_regs' for the `-1' marker
4114 GNU code uses. */
4115 regs->num_regs = MAX (RE_NREGS, num_regs + 1);
4116 regs->start = TALLOC (regs->num_regs, regoff_t);
4117 regs->end = TALLOC (regs->num_regs, regoff_t);
4118 if (regs->start == NULL || regs->end == NULL)
4119 {
4120 FREE_VARIABLES ();
4121 return -2;
4122 }
4123 bufp->regs_allocated = REGS_REALLOCATE;
4124 }
4125 else if (bufp->regs_allocated == REGS_REALLOCATE)
4126 { /* Yes. If we need more elements than were already
4127 allocated, reallocate them. If we need fewer, just
4128 leave it alone. */
4129 if (regs->num_regs < num_regs + 1)
4130 {
4131 regs->num_regs = num_regs + 1;
4132 RETALLOC (regs->start, regs->num_regs, regoff_t);
4133 RETALLOC (regs->end, regs->num_regs, regoff_t);
4134 if (regs->start == NULL || regs->end == NULL)
4135 {
4136 FREE_VARIABLES ();
4137 return -2;
4138 }
4139 }
4140 }
4141 else
4142 {
4143 /* These braces fend off a "empty body in an else-statement"
4144 warning under GCC when assert expands to nothing. */
4145 assert (bufp->regs_allocated == REGS_FIXED);
4146 }
4147
4148 /* Convert the pointer data in `regstart' and `regend' to
4149 indices. Register zero has to be set differently,
4150 since we haven't kept track of any info for it. */
4151 if (regs->num_regs > 0)
4152 {
4153 regs->start[0] = pos;
4154 regs->end[0] = (MATCHING_IN_FIRST_STRING
4155 ? ((regoff_t) (d - string1))
4156 : ((regoff_t) (d - string2 + size1)));
4157 }
4158
4159 /* Go through the first `min (num_regs, regs->num_regs)'
4160 registers, since that is all we initialized. */
4161 for (mcnt = 1; (unsigned) mcnt < MIN (num_regs, regs->num_regs);
4162 mcnt++)
4163 {
4164 if (REG_UNSET (regstart[mcnt]) || REG_UNSET (regend[mcnt]))
4165 regs->start[mcnt] = regs->end[mcnt] = -1;
4166 else
4167 {
4168 regs->start[mcnt]
4169 = (regoff_t) POINTER_TO_OFFSET (regstart[mcnt]);
4170 regs->end[mcnt]
4171 = (regoff_t) POINTER_TO_OFFSET (regend[mcnt]);
4172 }
4173 }
4174
4175 /* If the regs structure we return has more elements than
4176 were in the pattern, set the extra elements to -1. If
4177 we (re)allocated the registers, this is the case,
4178 because we always allocate enough to have at least one
4179 -1 at the end. */
4180 for (mcnt = num_regs; (unsigned) mcnt < regs->num_regs; mcnt++)
4181 regs->start[mcnt] = regs->end[mcnt] = -1;
4182 } /* regs && !bufp->no_sub */
4183
4184 DEBUG_PRINT4 ("%u failure points pushed, %u popped (%u remain).\n",
4185 nfailure_points_pushed, nfailure_points_popped,
4186 nfailure_points_pushed - nfailure_points_popped);
4187 DEBUG_PRINT2 ("%u registers pushed.\n", num_regs_pushed);
4188
4189 mcnt = d - pos - (MATCHING_IN_FIRST_STRING
4190 ? string1
4191 : string2 - size1);
4192
4193 DEBUG_PRINT2 ("Returning %d from re_match_2.\n", mcnt);
4194
4195 FREE_VARIABLES ();
4196 return mcnt;
4197 }
4198
4199 /* Otherwise match next pattern command. */
4200 switch (SWITCH_ENUM_CAST ((re_opcode_t) *p++))
4201 {
4202 /* Ignore these. Used to ignore the n of succeed_n's which
4203 currently have n == 0. */
4204 case no_op:
4205 DEBUG_PRINT1 ("EXECUTING no_op.\n");
4206 break;
4207
4208 case succeed:
4209 DEBUG_PRINT1 ("EXECUTING succeed.\n");
4210 goto succeed_label;
4211
4212 /* Match the next n pattern characters exactly. The following
4213 byte in the pattern defines n, and the n bytes after that
4214 are the characters to match. */
4215 case exactn:
4216 mcnt = *p++;
4217 DEBUG_PRINT2 ("EXECUTING exactn %d.\n", mcnt);
4218
4219 /* This is written out as an if-else so we don't waste time
4220 testing `translate' inside the loop. */
4221 if (translate)
4222 {
4223 do
4224 {
4225 PREFETCH ();
4226 if ((unsigned char) translate[(unsigned char) *d++]
4227 != (unsigned char) *p++)
4228 goto fail;
4229 }
4230 while (--mcnt);
4231 }
4232 else
4233 {
4234 do
4235 {
4236 PREFETCH ();
4237 if (*d++ != (char) *p++) goto fail;
4238 }
4239 while (--mcnt);
4240 }
4241 SET_REGS_MATCHED ();
4242 break;
4243
4244
4245 /* Match any character except possibly a newline or a null. */
4246 case anychar:
4247 DEBUG_PRINT1 ("EXECUTING anychar.\n");
4248
4249 PREFETCH ();
4250
4251 if ((!(bufp->syntax & RE_DOT_NEWLINE) && TRANSLATE (*d) == '\n')
4252 || (bufp->syntax & RE_DOT_NOT_NULL && TRANSLATE (*d) == '\000'))
4253 goto fail;
4254
4255 SET_REGS_MATCHED ();
4256 DEBUG_PRINT2 (" Matched `%d'.\n", *d);
4257 d++;
4258 break;
4259
4260
4261 case charset:
4262 case charset_not:
4263 {
4264 register unsigned char c;
4265 boolean not = (re_opcode_t) *(p - 1) == charset_not;
4266
4267 DEBUG_PRINT2 ("EXECUTING charset%s.\n", not ? "_not" : "");
4268
4269 PREFETCH ();
4270 c = TRANSLATE (*d); /* The character to match. */
4271
4272 /* Cast to `unsigned' instead of `unsigned char' in case the
4273 bit list is a full 32 bytes long. */
4274 if (c < (unsigned) (*p * BYTEWIDTH)
4275 && p[1 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH)))
4276 not = !not;
4277
4278 p += 1 + *p;
4279
4280 if (!not) goto fail;
4281
4282 SET_REGS_MATCHED ();
4283 d++;
4284 break;
4285 }
4286
4287
4288 /* The beginning of a group is represented by start_memory.
4289 The arguments are the register number in the next byte, and the
4290 number of groups inner to this one in the next. The text
4291 matched within the group is recorded (in the internal
4292 registers data structure) under the register number. */
4293 case start_memory:
4294 DEBUG_PRINT3 ("EXECUTING start_memory %d (%d):\n", *p, p[1]);
4295
4296 /* Find out if this group can match the empty string. */
4297 p1 = p; /* To send to group_match_null_string_p. */
4298
4299 if (REG_MATCH_NULL_STRING_P (reg_info[*p]) == MATCH_NULL_UNSET_VALUE)
4300 REG_MATCH_NULL_STRING_P (reg_info[*p])
4301 = group_match_null_string_p (&p1, pend, reg_info);
4302
4303 /* Save the position in the string where we were the last time
4304 we were at this open-group operator in case the group is
4305 operated upon by a repetition operator, e.g., with `(a*)*b'
4306 against `ab'; then we want to ignore where we are now in
4307 the string in case this attempt to match fails. */
4308 old_regstart[*p] = REG_MATCH_NULL_STRING_P (reg_info[*p])
4309 ? REG_UNSET (regstart[*p]) ? d : regstart[*p]
4310 : regstart[*p];
4311 DEBUG_PRINT2 (" old_regstart: %d\n",
4312 POINTER_TO_OFFSET (old_regstart[*p]));
4313
4314 regstart[*p] = d;
4315 DEBUG_PRINT2 (" regstart: %d\n", POINTER_TO_OFFSET (regstart[*p]));
4316
4317 IS_ACTIVE (reg_info[*p]) = 1;
4318 MATCHED_SOMETHING (reg_info[*p]) = 0;
4319
4320 /* Clear this whenever we change the register activity status. */
4321 set_regs_matched_done = 0;
4322
4323 /* This is the new highest active register. */
4324 highest_active_reg = *p;
4325
4326 /* If nothing was active before, this is the new lowest active
4327 register. */
4328 if (lowest_active_reg == NO_LOWEST_ACTIVE_REG)
4329 lowest_active_reg = *p;
4330
4331 /* Move past the register number and inner group count. */
4332 p += 2;
4333 just_past_start_mem = p;
4334
4335 break;
4336
4337
4338 /* The stop_memory opcode represents the end of a group. Its
4339 arguments are the same as start_memory's: the register
4340 number, and the number of inner groups. */
4341 case stop_memory:
4342 DEBUG_PRINT3 ("EXECUTING stop_memory %d (%d):\n", *p, p[1]);
4343
4344 /* We need to save the string position the last time we were at
4345 this close-group operator in case the group is operated
4346 upon by a repetition operator, e.g., with `((a*)*(b*)*)*'
4347 against `aba'; then we want to ignore where we are now in
4348 the string in case this attempt to match fails. */
4349 old_regend[*p] = REG_MATCH_NULL_STRING_P (reg_info[*p])
4350 ? REG_UNSET (regend[*p]) ? d : regend[*p]
4351 : regend[*p];
4352 DEBUG_PRINT2 (" old_regend: %d\n",
4353 POINTER_TO_OFFSET (old_regend[*p]));
4354
4355 regend[*p] = d;
4356 DEBUG_PRINT2 (" regend: %d\n", POINTER_TO_OFFSET (regend[*p]));
4357
4358 /* This register isn't active anymore. */
4359 IS_ACTIVE (reg_info[*p]) = 0;
4360
4361 /* Clear this whenever we change the register activity status. */
4362 set_regs_matched_done = 0;
4363
4364 /* If this was the only register active, nothing is active
4365 anymore. */
4366 if (lowest_active_reg == highest_active_reg)
4367 {
4368 lowest_active_reg = NO_LOWEST_ACTIVE_REG;
4369 highest_active_reg = NO_HIGHEST_ACTIVE_REG;
4370 }
4371 else
4372 { /* We must scan for the new highest active register, since
4373 it isn't necessarily one less than now: consider
4374 (a(b)c(d(e)f)g). When group 3 ends, after the f), the
4375 new highest active register is 1. */
4376 unsigned char r = *p - 1;
4377 while (r > 0 && !IS_ACTIVE (reg_info[r]))
4378 r--;
4379
4380 /* If we end up at register zero, that means that we saved
4381 the registers as the result of an `on_failure_jump', not
4382 a `start_memory', and we jumped to past the innermost
4383 `stop_memory'. For example, in ((.)*) we save
4384 registers 1 and 2 as a result of the *, but when we pop
4385 back to the second ), we are at the stop_memory 1.
4386 Thus, nothing is active. */
4387 if (r == 0)
4388 {
4389 lowest_active_reg = NO_LOWEST_ACTIVE_REG;
4390 highest_active_reg = NO_HIGHEST_ACTIVE_REG;
4391 }
4392 else
4393 highest_active_reg = r;
4394 }
4395
4396 /* If just failed to match something this time around with a
4397 group that's operated on by a repetition operator, try to
4398 force exit from the ``loop'', and restore the register
4399 information for this group that we had before trying this
4400 last match. */
4401 if ((!MATCHED_SOMETHING (reg_info[*p])
4402 || just_past_start_mem == p - 1)
4403 && (p + 2) < pend)
4404 {
4405 boolean is_a_jump_n = false;
4406
4407 p1 = p + 2;
4408 mcnt = 0;
4409 switch ((re_opcode_t) *p1++)
4410 {
4411 case jump_n:
4412 is_a_jump_n = true;
4413 case pop_failure_jump:
4414 case maybe_pop_jump:
4415 case jump:
4416 case dummy_failure_jump:
4417 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
4418 if (is_a_jump_n)
4419 p1 += 2;
4420 break;
4421
4422 default:
4423 /* do nothing */ ;
4424 }
4425 p1 += mcnt;
4426
4427 /* If the next operation is a jump backwards in the pattern
4428 to an on_failure_jump right before the start_memory
4429 corresponding to this stop_memory, exit from the loop
4430 by forcing a failure after pushing on the stack the
4431 on_failure_jump's jump in the pattern, and d. */
4432 if (mcnt < 0 && (re_opcode_t) *p1 == on_failure_jump
4433 && (re_opcode_t) p1[3] == start_memory && p1[4] == *p)
4434 {
4435 /* If this group ever matched anything, then restore
4436 what its registers were before trying this last
4437 failed match, e.g., with `(a*)*b' against `ab' for
4438 regstart[1], and, e.g., with `((a*)*(b*)*)*'
4439 against `aba' for regend[3].
4440
4441 Also restore the registers for inner groups for,
4442 e.g., `((a*)(b*))*' against `aba' (register 3 would
4443 otherwise get trashed). */
4444
4445 if (EVER_MATCHED_SOMETHING (reg_info[*p]))
4446 {
4447 unsigned r;
4448
4449 EVER_MATCHED_SOMETHING (reg_info[*p]) = 0;
4450
4451 /* Restore this and inner groups' (if any) registers. */
4452 for (r = *p; r < (unsigned) *p + (unsigned) *(p + 1);
4453 r++)
4454 {
4455 regstart[r] = old_regstart[r];
4456
4457 /* xx why this test? */
4458 if (old_regend[r] >= regstart[r])
4459 regend[r] = old_regend[r];
4460 }
4461 }
4462 p1++;
4463 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
4464 PUSH_FAILURE_POINT (p1 + mcnt, d, -2);
4465
4466 goto fail;
4467 }
4468 }
4469
4470 /* Move past the register number and the inner group count. */
4471 p += 2;
4472 break;
4473
4474
4475 /* \<digit> has been turned into a `duplicate' command which is
4476 followed by the numeric value of <digit> as the register number. */
4477 case duplicate:
4478 {
4479 register const char *d2, *dend2;
4480 int regno = *p++; /* Get which register to match against. */
4481 DEBUG_PRINT2 ("EXECUTING duplicate %d.\n", regno);
4482
4483 /* Can't back reference a group which we've never matched. */
4484 if (REG_UNSET (regstart[regno]) || REG_UNSET (regend[regno]))
4485 goto fail;
4486
4487 /* Where in input to try to start matching. */
4488 d2 = regstart[regno];
4489
4490 /* Where to stop matching; if both the place to start and
4491 the place to stop matching are in the same string, then
4492 set to the place to stop, otherwise, for now have to use
4493 the end of the first string. */
4494
4495 dend2 = ((FIRST_STRING_P (regstart[regno])
4496 == FIRST_STRING_P (regend[regno]))
4497 ? regend[regno] : end_match_1);
4498 for (;;)
4499 {
4500 /* If necessary, advance to next segment in register
4501 contents. */
4502 while (d2 == dend2)
4503 {
4504 if (dend2 == end_match_2) break;
4505 if (dend2 == regend[regno]) break;
4506
4507 /* End of string1 => advance to string2. */
4508 d2 = string2;
4509 dend2 = regend[regno];
4510 }
4511 /* At end of register contents => success */
4512 if (d2 == dend2) break;
4513
4514 /* If necessary, advance to next segment in data. */
4515 PREFETCH ();
4516
4517 /* How many characters left in this segment to match. */
4518 mcnt = dend - d;
4519
4520 /* Want how many consecutive characters we can match in
4521 one shot, so, if necessary, adjust the count. */
4522 if (mcnt > dend2 - d2)
4523 mcnt = dend2 - d2;
4524
4525 /* Compare that many; failure if mismatch, else move
4526 past them. */
4527 if (translate
4528 ? bcmp_translate (d, d2, mcnt, translate)
4529 : memcmp (d, d2, mcnt))
4530 goto fail;
4531 d += mcnt, d2 += mcnt;
4532
4533 /* Do this because we've match some characters. */
4534 SET_REGS_MATCHED ();
4535 }
4536 }
4537 break;
4538
4539
4540 /* begline matches the empty string at the beginning of the string
4541 (unless `not_bol' is set in `bufp'), and, if
4542 `newline_anchor' is set, after newlines. */
4543 case begline:
4544 DEBUG_PRINT1 ("EXECUTING begline.\n");
4545
4546 if (AT_STRINGS_BEG (d))
4547 {
4548 if (!bufp->not_bol) break;
4549 }
4550 else if (d[-1] == '\n' && bufp->newline_anchor)
4551 {
4552 break;
4553 }
4554 /* In all other cases, we fail. */
4555 goto fail;
4556
4557
4558 /* endline is the dual of begline. */
4559 case endline:
4560 DEBUG_PRINT1 ("EXECUTING endline.\n");
4561
4562 if (AT_STRINGS_END (d))
4563 {
4564 if (!bufp->not_eol) break;
4565 }
4566
4567 /* We have to ``prefetch'' the next character. */
4568 else if ((d == end1 ? *string2 : *d) == '\n'
4569 && bufp->newline_anchor)
4570 {
4571 break;
4572 }
4573 goto fail;
4574
4575
4576 /* Match at the very beginning of the data. */
4577 case begbuf:
4578 DEBUG_PRINT1 ("EXECUTING begbuf.\n");
4579 if (AT_STRINGS_BEG (d))
4580 break;
4581 goto fail;
4582
4583
4584 /* Match at the very end of the data. */
4585 case endbuf:
4586 DEBUG_PRINT1 ("EXECUTING endbuf.\n");
4587 if (AT_STRINGS_END (d))
4588 break;
4589 goto fail;
4590
4591
4592 /* on_failure_keep_string_jump is used to optimize `.*\n'. It
4593 pushes NULL as the value for the string on the stack. Then
4594 `pop_failure_point' will keep the current value for the
4595 string, instead of restoring it. To see why, consider
4596 matching `foo\nbar' against `.*\n'. The .* matches the foo;
4597 then the . fails against the \n. But the next thing we want
4598 to do is match the \n against the \n; if we restored the
4599 string value, we would be back at the foo.
4600
4601 Because this is used only in specific cases, we don't need to
4602 check all the things that `on_failure_jump' does, to make
4603 sure the right things get saved on the stack. Hence we don't
4604 share its code. The only reason to push anything on the
4605 stack at all is that otherwise we would have to change
4606 `anychar's code to do something besides goto fail in this
4607 case; that seems worse than this. */
4608 case on_failure_keep_string_jump:
4609 DEBUG_PRINT1 ("EXECUTING on_failure_keep_string_jump");
4610
4611 EXTRACT_NUMBER_AND_INCR (mcnt, p);
4612 #ifdef _LIBC
4613 DEBUG_PRINT3 (" %d (to %p):\n", mcnt, p + mcnt);
4614 #else
4615 DEBUG_PRINT3 (" %d (to 0x%x):\n", mcnt, p + mcnt);
4616 #endif
4617
4618 PUSH_FAILURE_POINT (p + mcnt, NULL, -2);
4619 break;
4620
4621
4622 /* Uses of on_failure_jump:
4623
4624 Each alternative starts with an on_failure_jump that points
4625 to the beginning of the next alternative. Each alternative
4626 except the last ends with a jump that in effect jumps past
4627 the rest of the alternatives. (They really jump to the
4628 ending jump of the following alternative, because tensioning
4629 these jumps is a hassle.)
4630
4631 Repeats start with an on_failure_jump that points past both
4632 the repetition text and either the following jump or
4633 pop_failure_jump back to this on_failure_jump. */
4634 case on_failure_jump:
4635 on_failure:
4636 DEBUG_PRINT1 ("EXECUTING on_failure_jump");
4637
4638 EXTRACT_NUMBER_AND_INCR (mcnt, p);
4639 #ifdef _LIBC
4640 DEBUG_PRINT3 (" %d (to %p)", mcnt, p + mcnt);
4641 #else
4642 DEBUG_PRINT3 (" %d (to 0x%x)", mcnt, p + mcnt);
4643 #endif
4644
4645 /* If this on_failure_jump comes right before a group (i.e.,
4646 the original * applied to a group), save the information
4647 for that group and all inner ones, so that if we fail back
4648 to this point, the group's information will be correct.
4649 For example, in \(a*\)*\1, we need the preceding group,
4650 and in \(zz\(a*\)b*\)\2, we need the inner group. */
4651
4652 /* We can't use `p' to check ahead because we push
4653 a failure point to `p + mcnt' after we do this. */
4654 p1 = p;
4655
4656 /* We need to skip no_op's before we look for the
4657 start_memory in case this on_failure_jump is happening as
4658 the result of a completed succeed_n, as in \(a\)\{1,3\}b\1
4659 against aba. */
4660 while (p1 < pend && (re_opcode_t) *p1 == no_op)
4661 p1++;
4662
4663 if (p1 < pend && (re_opcode_t) *p1 == start_memory)
4664 {
4665 /* We have a new highest active register now. This will
4666 get reset at the start_memory we are about to get to,
4667 but we will have saved all the registers relevant to
4668 this repetition op, as described above. */
4669 highest_active_reg = *(p1 + 1) + *(p1 + 2);
4670 if (lowest_active_reg == NO_LOWEST_ACTIVE_REG)
4671 lowest_active_reg = *(p1 + 1);
4672 }
4673
4674 DEBUG_PRINT1 (":\n");
4675 PUSH_FAILURE_POINT (p + mcnt, d, -2);
4676 break;
4677
4678
4679 /* A smart repeat ends with `maybe_pop_jump'.
4680 We change it to either `pop_failure_jump' or `jump'. */
4681 case maybe_pop_jump:
4682 EXTRACT_NUMBER_AND_INCR (mcnt, p);
4683 DEBUG_PRINT2 ("EXECUTING maybe_pop_jump %d.\n", mcnt);
4684 {
4685 register unsigned char *p2 = p;
4686
4687 /* Compare the beginning of the repeat with what in the
4688 pattern follows its end. If we can establish that there
4689 is nothing that they would both match, i.e., that we
4690 would have to backtrack because of (as in, e.g., `a*a')
4691 then we can change to pop_failure_jump, because we'll
4692 never have to backtrack.
4693
4694 This is not true in the case of alternatives: in
4695 `(a|ab)*' we do need to backtrack to the `ab' alternative
4696 (e.g., if the string was `ab'). But instead of trying to
4697 detect that here, the alternative has put on a dummy
4698 failure point which is what we will end up popping. */
4699
4700 /* Skip over open/close-group commands.
4701 If what follows this loop is a ...+ construct,
4702 look at what begins its body, since we will have to
4703 match at least one of that. */
4704 while (1)
4705 {
4706 if (p2 + 2 < pend
4707 && ((re_opcode_t) *p2 == stop_memory
4708 || (re_opcode_t) *p2 == start_memory))
4709 p2 += 3;
4710 else if (p2 + 6 < pend
4711 && (re_opcode_t) *p2 == dummy_failure_jump)
4712 p2 += 6;
4713 else
4714 break;
4715 }
4716
4717 p1 = p + mcnt;
4718 /* p1[0] ... p1[2] are the `on_failure_jump' corresponding
4719 to the `maybe_finalize_jump' of this case. Examine what
4720 follows. */
4721
4722 /* If we're at the end of the pattern, we can change. */
4723 if (p2 == pend)
4724 {
4725 /* Consider what happens when matching ":\(.*\)"
4726 against ":/". I don't really understand this code
4727 yet. */
4728 p[-3] = (unsigned char) pop_failure_jump;
4729 DEBUG_PRINT1
4730 (" End of pattern: change to `pop_failure_jump'.\n");
4731 }
4732
4733 else if ((re_opcode_t) *p2 == exactn
4734 || (bufp->newline_anchor && (re_opcode_t) *p2 == endline))
4735 {
4736 register unsigned char c
4737 = *p2 == (unsigned char) endline ? '\n' : p2[2];
4738
4739 if ((re_opcode_t) p1[3] == exactn && p1[5] != c)
4740 {
4741 p[-3] = (unsigned char) pop_failure_jump;
4742 DEBUG_PRINT3 (" %c != %c => pop_failure_jump.\n",
4743 c, p1[5]);
4744 }
4745
4746 else if ((re_opcode_t) p1[3] == charset
4747 || (re_opcode_t) p1[3] == charset_not)
4748 {
4749 int not = (re_opcode_t) p1[3] == charset_not;
4750
4751 if (c < (unsigned char) (p1[4] * BYTEWIDTH)
4752 && p1[5 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH)))
4753 not = !not;
4754
4755 /* `not' is equal to 1 if c would match, which means
4756 that we can't change to pop_failure_jump. */
4757 if (!not)
4758 {
4759 p[-3] = (unsigned char) pop_failure_jump;
4760 DEBUG_PRINT1 (" No match => pop_failure_jump.\n");
4761 }
4762 }
4763 }
4764 else if ((re_opcode_t) *p2 == charset)
4765 {
4766 #ifdef DEBUG
4767 register unsigned char c
4768 = *p2 == (unsigned char) endline ? '\n' : p2[2];
4769 #endif
4770
4771 #if 0
4772 if ((re_opcode_t) p1[3] == exactn
4773 && ! ((int) p2[1] * BYTEWIDTH > (int) p1[5]
4774 && (p2[2 + p1[5] / BYTEWIDTH]
4775 & (1 << (p1[5] % BYTEWIDTH)))))
4776 #else
4777 if ((re_opcode_t) p1[3] == exactn
4778 && ! ((int) p2[1] * BYTEWIDTH > (int) p1[4]
4779 && (p2[2 + p1[4] / BYTEWIDTH]
4780 & (1 << (p1[4] % BYTEWIDTH)))))
4781 #endif
4782 {
4783 p[-3] = (unsigned char) pop_failure_jump;
4784 DEBUG_PRINT3 (" %c != %c => pop_failure_jump.\n",
4785 c, p1[5]);
4786 }
4787
4788 else if ((re_opcode_t) p1[3] == charset_not)
4789 {
4790 int idx;
4791 /* We win if the charset_not inside the loop
4792 lists every character listed in the charset after. */
4793 for (idx = 0; idx < (int) p2[1]; idx++)
4794 if (! (p2[2 + idx] == 0
4795 || (idx < (int) p1[4]
4796 && ((p2[2 + idx] & ~ p1[5 + idx]) == 0))))
4797 break;
4798
4799 if (idx == p2[1])
4800 {
4801 p[-3] = (unsigned char) pop_failure_jump;
4802 DEBUG_PRINT1 (" No match => pop_failure_jump.\n");
4803 }
4804 }
4805 else if ((re_opcode_t) p1[3] == charset)
4806 {
4807 int idx;
4808 /* We win if the charset inside the loop
4809 has no overlap with the one after the loop. */
4810 for (idx = 0;
4811 idx < (int) p2[1] && idx < (int) p1[4];
4812 idx++)
4813 if ((p2[2 + idx] & p1[5 + idx]) != 0)
4814 break;
4815
4816 if (idx == p2[1] || idx == p1[4])
4817 {
4818 p[-3] = (unsigned char) pop_failure_jump;
4819 DEBUG_PRINT1 (" No match => pop_failure_jump.\n");
4820 }
4821 }
4822 }
4823 }
4824 p -= 2; /* Point at relative address again. */
4825 if ((re_opcode_t) p[-1] != pop_failure_jump)
4826 {
4827 p[-1] = (unsigned char) jump;
4828 DEBUG_PRINT1 (" Match => jump.\n");
4829 goto unconditional_jump;
4830 }
4831 /* Note fall through. */
4832
4833
4834 /* The end of a simple repeat has a pop_failure_jump back to
4835 its matching on_failure_jump, where the latter will push a
4836 failure point. The pop_failure_jump takes off failure
4837 points put on by this pop_failure_jump's matching
4838 on_failure_jump; we got through the pattern to here from the
4839 matching on_failure_jump, so didn't fail. */
4840 case pop_failure_jump:
4841 {
4842 /* We need to pass separate storage for the lowest and
4843 highest registers, even though we don't care about the
4844 actual values. Otherwise, we will restore only one
4845 register from the stack, since lowest will == highest in
4846 `pop_failure_point'. */
4847 active_reg_t dummy_low_reg, dummy_high_reg;
4848 unsigned char *pdummy;
4849 const char *sdummy;
4850
4851 DEBUG_PRINT1 ("EXECUTING pop_failure_jump.\n");
4852 POP_FAILURE_POINT (sdummy, pdummy,
4853 dummy_low_reg, dummy_high_reg,
4854 reg_dummy, reg_dummy, reg_info_dummy);
4855 }
4856 /* Note fall through. */
4857
4858 unconditional_jump:
4859 #ifdef _LIBC
4860 DEBUG_PRINT2 ("\n%p: ", p);
4861 #else
4862 DEBUG_PRINT2 ("\n0x%x: ", p);
4863 #endif
4864 /* Note fall through. */
4865
4866 /* Unconditionally jump (without popping any failure points). */
4867 case jump:
4868 EXTRACT_NUMBER_AND_INCR (mcnt, p); /* Get the amount to jump. */
4869 DEBUG_PRINT2 ("EXECUTING jump %d ", mcnt);
4870 p += mcnt; /* Do the jump. */
4871 #ifdef _LIBC
4872 DEBUG_PRINT2 ("(to %p).\n", p);
4873 #else
4874 DEBUG_PRINT2 ("(to 0x%x).\n", p);
4875 #endif
4876 break;
4877
4878
4879 /* We need this opcode so we can detect where alternatives end
4880 in `group_match_null_string_p' et al. */
4881 case jump_past_alt:
4882 DEBUG_PRINT1 ("EXECUTING jump_past_alt.\n");
4883 goto unconditional_jump;
4884
4885
4886 /* Normally, the on_failure_jump pushes a failure point, which
4887 then gets popped at pop_failure_jump. We will end up at
4888 pop_failure_jump, also, and with a pattern of, say, `a+', we
4889 are skipping over the on_failure_jump, so we have to push
4890 something meaningless for pop_failure_jump to pop. */
4891 case dummy_failure_jump:
4892 DEBUG_PRINT1 ("EXECUTING dummy_failure_jump.\n");
4893 /* It doesn't matter what we push for the string here. What
4894 the code at `fail' tests is the value for the pattern. */
4895 PUSH_FAILURE_POINT (NULL, NULL, -2);
4896 goto unconditional_jump;
4897
4898
4899 /* At the end of an alternative, we need to push a dummy failure
4900 point in case we are followed by a `pop_failure_jump', because
4901 we don't want the failure point for the alternative to be
4902 popped. For example, matching `(a|ab)*' against `aab'
4903 requires that we match the `ab' alternative. */
4904 case push_dummy_failure:
4905 DEBUG_PRINT1 ("EXECUTING push_dummy_failure.\n");
4906 /* See comments just above at `dummy_failure_jump' about the
4907 two zeroes. */
4908 PUSH_FAILURE_POINT (NULL, NULL, -2);
4909 break;
4910
4911 /* Have to succeed matching what follows at least n times.
4912 After that, handle like `on_failure_jump'. */
4913 case succeed_n:
4914 EXTRACT_NUMBER (mcnt, p + 2);
4915 DEBUG_PRINT2 ("EXECUTING succeed_n %d.\n", mcnt);
4916
4917 assert (mcnt >= 0);
4918 /* Originally, this is how many times we HAVE to succeed. */
4919 if (mcnt > 0)
4920 {
4921 mcnt--;
4922 p += 2;
4923 STORE_NUMBER_AND_INCR (p, mcnt);
4924 #ifdef _LIBC
4925 DEBUG_PRINT3 (" Setting %p to %d.\n", p - 2, mcnt);
4926 #else
4927 DEBUG_PRINT3 (" Setting 0x%x to %d.\n", p - 2, mcnt);
4928 #endif
4929 }
4930 else if (mcnt == 0)
4931 {
4932 #ifdef _LIBC
4933 DEBUG_PRINT2 (" Setting two bytes from %p to no_op.\n", p+2);
4934 #else
4935 DEBUG_PRINT2 (" Setting two bytes from 0x%x to no_op.\n", p+2);
4936 #endif
4937 p[2] = (unsigned char) no_op;
4938 p[3] = (unsigned char) no_op;
4939 goto on_failure;
4940 }
4941 break;
4942
4943 case jump_n:
4944 EXTRACT_NUMBER (mcnt, p + 2);
4945 DEBUG_PRINT2 ("EXECUTING jump_n %d.\n", mcnt);
4946
4947 /* Originally, this is how many times we CAN jump. */
4948 if (mcnt)
4949 {
4950 mcnt--;
4951 STORE_NUMBER (p + 2, mcnt);
4952 #ifdef _LIBC
4953 DEBUG_PRINT3 (" Setting %p to %d.\n", p + 2, mcnt);
4954 #else
4955 DEBUG_PRINT3 (" Setting 0x%x to %d.\n", p + 2, mcnt);
4956 #endif
4957 goto unconditional_jump;
4958 }
4959 /* If don't have to jump any more, skip over the rest of command. */
4960 else
4961 p += 4;
4962 break;
4963
4964 case set_number_at:
4965 {
4966 DEBUG_PRINT1 ("EXECUTING set_number_at.\n");
4967
4968 EXTRACT_NUMBER_AND_INCR (mcnt, p);
4969 p1 = p + mcnt;
4970 EXTRACT_NUMBER_AND_INCR (mcnt, p);
4971 #ifdef _LIBC
4972 DEBUG_PRINT3 (" Setting %p to %d.\n", p1, mcnt);
4973 #else
4974 DEBUG_PRINT3 (" Setting 0x%x to %d.\n", p1, mcnt);
4975 #endif
4976 STORE_NUMBER (p1, mcnt);
4977 break;
4978 }
4979
4980 #if 0
4981 /* The DEC Alpha C compiler 3.x generates incorrect code for the
4982 test WORDCHAR_P (d - 1) != WORDCHAR_P (d) in the expansion of
4983 AT_WORD_BOUNDARY, so this code is disabled. Expanding the
4984 macro and introducing temporary variables works around the bug. */
4985
4986 case wordbound:
4987 DEBUG_PRINT1 ("EXECUTING wordbound.\n");
4988 if (AT_WORD_BOUNDARY (d))
4989 break;
4990 goto fail;
4991
4992 case notwordbound:
4993 DEBUG_PRINT1 ("EXECUTING notwordbound.\n");
4994 if (AT_WORD_BOUNDARY (d))
4995 goto fail;
4996 break;
4997 #else
4998 case wordbound:
4999 {
5000 boolean prevchar, thischar;
5001
5002 DEBUG_PRINT1 ("EXECUTING wordbound.\n");
5003 if (AT_STRINGS_BEG (d) || AT_STRINGS_END (d))
5004 break;
5005
5006 prevchar = WORDCHAR_P (d - 1);
5007 thischar = WORDCHAR_P (d);
5008 if (prevchar != thischar)
5009 break;
5010 goto fail;
5011 }
5012
5013 case notwordbound:
5014 {
5015 boolean prevchar, thischar;
5016
5017 DEBUG_PRINT1 ("EXECUTING notwordbound.\n");
5018 if (AT_STRINGS_BEG (d) || AT_STRINGS_END (d))
5019 goto fail;
5020
5021 prevchar = WORDCHAR_P (d - 1);
5022 thischar = WORDCHAR_P (d);
5023 if (prevchar != thischar)
5024 goto fail;
5025 break;
5026 }
5027 #endif
5028
5029 case wordbeg:
5030 DEBUG_PRINT1 ("EXECUTING wordbeg.\n");
5031 if (WORDCHAR_P (d) && (AT_STRINGS_BEG (d) || !WORDCHAR_P (d - 1)))
5032 break;
5033 goto fail;
5034
5035 case wordend:
5036 DEBUG_PRINT1 ("EXECUTING wordend.\n");
5037 if (!AT_STRINGS_BEG (d) && WORDCHAR_P (d - 1)
5038 && (!WORDCHAR_P (d) || AT_STRINGS_END (d)))
5039 break;
5040 goto fail;
5041
5042 #ifdef emacs
5043 case before_dot:
5044 DEBUG_PRINT1 ("EXECUTING before_dot.\n");
5045 if (PTR_CHAR_POS ((unsigned char *) d) >= point)
5046 goto fail;
5047 break;
5048
5049 case at_dot:
5050 DEBUG_PRINT1 ("EXECUTING at_dot.\n");
5051 if (PTR_CHAR_POS ((unsigned char *) d) != point)
5052 goto fail;
5053 break;
5054
5055 case after_dot:
5056 DEBUG_PRINT1 ("EXECUTING after_dot.\n");
5057 if (PTR_CHAR_POS ((unsigned char *) d) <= point)
5058 goto fail;
5059 break;
5060
5061 case syntaxspec:
5062 DEBUG_PRINT2 ("EXECUTING syntaxspec %d.\n", mcnt);
5063 mcnt = *p++;
5064 goto matchsyntax;
5065
5066 case wordchar:
5067 DEBUG_PRINT1 ("EXECUTING Emacs wordchar.\n");
5068 mcnt = (int) Sword;
5069 matchsyntax:
5070 PREFETCH ();
5071 /* Can't use *d++ here; SYNTAX may be an unsafe macro. */
5072 d++;
5073 if (SYNTAX (d[-1]) != (enum syntaxcode) mcnt)
5074 goto fail;
5075 SET_REGS_MATCHED ();
5076 break;
5077
5078 case notsyntaxspec:
5079 DEBUG_PRINT2 ("EXECUTING notsyntaxspec %d.\n", mcnt);
5080 mcnt = *p++;
5081 goto matchnotsyntax;
5082
5083 case notwordchar:
5084 DEBUG_PRINT1 ("EXECUTING Emacs notwordchar.\n");
5085 mcnt = (int) Sword;
5086 matchnotsyntax:
5087 PREFETCH ();
5088 /* Can't use *d++ here; SYNTAX may be an unsafe macro. */
5089 d++;
5090 if (SYNTAX (d[-1]) == (enum syntaxcode) mcnt)
5091 goto fail;
5092 SET_REGS_MATCHED ();
5093 break;
5094
5095 #else /* not emacs */
5096 case wordchar:
5097 DEBUG_PRINT1 ("EXECUTING non-Emacs wordchar.\n");
5098 PREFETCH ();
5099 if (!WORDCHAR_P (d))
5100 goto fail;
5101 SET_REGS_MATCHED ();
5102 d++;
5103 break;
5104
5105 case notwordchar:
5106 DEBUG_PRINT1 ("EXECUTING non-Emacs notwordchar.\n");
5107 PREFETCH ();
5108 if (WORDCHAR_P (d))
5109 goto fail;
5110 SET_REGS_MATCHED ();
5111 d++;
5112 break;
5113 #endif /* not emacs */
5114
5115 default:
5116 abort ();
5117 }
5118 continue; /* Successfully executed one pattern command; keep going. */
5119
5120
5121 /* We goto here if a matching operation fails. */
5122 fail:
5123 if (!FAIL_STACK_EMPTY ())
5124 { /* A restart point is known. Restore to that state. */
5125 DEBUG_PRINT1 ("\nFAIL:\n");
5126 POP_FAILURE_POINT (d, p,
5127 lowest_active_reg, highest_active_reg,
5128 regstart, regend, reg_info);
5129
5130 /* If this failure point is a dummy, try the next one. */
5131 if (!p)
5132 goto fail;
5133
5134 /* If we failed to the end of the pattern, don't examine *p. */
5135 assert (p <= pend);
5136 if (p < pend)
5137 {
5138 boolean is_a_jump_n = false;
5139
5140 /* If failed to a backwards jump that's part of a repetition
5141 loop, need to pop this failure point and use the next one. */
5142 switch ((re_opcode_t) *p)
5143 {
5144 case jump_n:
5145 is_a_jump_n = true;
5146 case maybe_pop_jump:
5147 case pop_failure_jump:
5148 case jump:
5149 p1 = p + 1;
5150 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
5151 p1 += mcnt;
5152
5153 if ((is_a_jump_n && (re_opcode_t) *p1 == succeed_n)
5154 || (!is_a_jump_n
5155 && (re_opcode_t) *p1 == on_failure_jump))
5156 goto fail;
5157 break;
5158 default:
5159 /* do nothing */ ;
5160 }
5161 }
5162
5163 if (d >= string1 && d <= end1)
5164 dend = end_match_1;
5165 }
5166 else
5167 break; /* Matching at this starting point really fails. */
5168 } /* for (;;) */
5169
5170 if (best_regs_set)
5171 goto restore_best_regs;
5172
5173 FREE_VARIABLES ();
5174
5175 return -1; /* Failure to match. */
5176 } /* re_match_2 */
5177 \f
5178 /* Subroutine definitions for re_match_2. */
5179
5180
5181 /* We are passed P pointing to a register number after a start_memory.
5182
5183 Return true if the pattern up to the corresponding stop_memory can
5184 match the empty string, and false otherwise.
5185
5186 If we find the matching stop_memory, sets P to point to one past its number.
5187 Otherwise, sets P to an undefined byte less than or equal to END.
5188
5189 We don't handle duplicates properly (yet). */
5190
5191 static boolean
5192 group_match_null_string_p (p, end, reg_info)
5193 unsigned char **p, *end;
5194 register_info_type *reg_info;
5195 {
5196 int mcnt;
5197 /* Point to after the args to the start_memory. */
5198 unsigned char *p1 = *p + 2;
5199
5200 while (p1 < end)
5201 {
5202 /* Skip over opcodes that can match nothing, and return true or
5203 false, as appropriate, when we get to one that can't, or to the
5204 matching stop_memory. */
5205
5206 switch ((re_opcode_t) *p1)
5207 {
5208 /* Could be either a loop or a series of alternatives. */
5209 case on_failure_jump:
5210 p1++;
5211 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
5212
5213 /* If the next operation is not a jump backwards in the
5214 pattern. */
5215
5216 if (mcnt >= 0)
5217 {
5218 /* Go through the on_failure_jumps of the alternatives,
5219 seeing if any of the alternatives cannot match nothing.
5220 The last alternative starts with only a jump,
5221 whereas the rest start with on_failure_jump and end
5222 with a jump, e.g., here is the pattern for `a|b|c':
5223
5224 /on_failure_jump/0/6/exactn/1/a/jump_past_alt/0/6
5225 /on_failure_jump/0/6/exactn/1/b/jump_past_alt/0/3
5226 /exactn/1/c
5227
5228 So, we have to first go through the first (n-1)
5229 alternatives and then deal with the last one separately. */
5230
5231
5232 /* Deal with the first (n-1) alternatives, which start
5233 with an on_failure_jump (see above) that jumps to right
5234 past a jump_past_alt. */
5235
5236 while ((re_opcode_t) p1[mcnt-3] == jump_past_alt)
5237 {
5238 /* `mcnt' holds how many bytes long the alternative
5239 is, including the ending `jump_past_alt' and
5240 its number. */
5241
5242 if (!alt_match_null_string_p (p1, p1 + mcnt - 3,
5243 reg_info))
5244 return false;
5245
5246 /* Move to right after this alternative, including the
5247 jump_past_alt. */
5248 p1 += mcnt;
5249
5250 /* Break if it's the beginning of an n-th alternative
5251 that doesn't begin with an on_failure_jump. */
5252 if ((re_opcode_t) *p1 != on_failure_jump)
5253 break;
5254
5255 /* Still have to check that it's not an n-th
5256 alternative that starts with an on_failure_jump. */
5257 p1++;
5258 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
5259 if ((re_opcode_t) p1[mcnt-3] != jump_past_alt)
5260 {
5261 /* Get to the beginning of the n-th alternative. */
5262 p1 -= 3;
5263 break;
5264 }
5265 }
5266
5267 /* Deal with the last alternative: go back and get number
5268 of the `jump_past_alt' just before it. `mcnt' contains
5269 the length of the alternative. */
5270 EXTRACT_NUMBER (mcnt, p1 - 2);
5271
5272 if (!alt_match_null_string_p (p1, p1 + mcnt, reg_info))
5273 return false;
5274
5275 p1 += mcnt; /* Get past the n-th alternative. */
5276 } /* if mcnt > 0 */
5277 break;
5278
5279
5280 case stop_memory:
5281 assert (p1[1] == **p);
5282 *p = p1 + 2;
5283 return true;
5284
5285
5286 default:
5287 if (!common_op_match_null_string_p (&p1, end, reg_info))
5288 return false;
5289 }
5290 } /* while p1 < end */
5291
5292 return false;
5293 } /* group_match_null_string_p */
5294
5295
5296 /* Similar to group_match_null_string_p, but doesn't deal with alternatives:
5297 It expects P to be the first byte of a single alternative and END one
5298 byte past the last. The alternative can contain groups. */
5299
5300 static boolean
5301 alt_match_null_string_p (p, end, reg_info)
5302 unsigned char *p, *end;
5303 register_info_type *reg_info;
5304 {
5305 int mcnt;
5306 unsigned char *p1 = p;
5307
5308 while (p1 < end)
5309 {
5310 /* Skip over opcodes that can match nothing, and break when we get
5311 to one that can't. */
5312
5313 switch ((re_opcode_t) *p1)
5314 {
5315 /* It's a loop. */
5316 case on_failure_jump:
5317 p1++;
5318 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
5319 p1 += mcnt;
5320 break;
5321
5322 default:
5323 if (!common_op_match_null_string_p (&p1, end, reg_info))
5324 return false;
5325 }
5326 } /* while p1 < end */
5327
5328 return true;
5329 } /* alt_match_null_string_p */
5330
5331
5332 /* Deals with the ops common to group_match_null_string_p and
5333 alt_match_null_string_p.
5334
5335 Sets P to one after the op and its arguments, if any. */
5336
5337 static boolean
5338 common_op_match_null_string_p (p, end, reg_info)
5339 unsigned char **p, *end;
5340 register_info_type *reg_info;
5341 {
5342 int mcnt;
5343 boolean ret;
5344 int reg_no;
5345 unsigned char *p1 = *p;
5346
5347 switch ((re_opcode_t) *p1++)
5348 {
5349 case no_op:
5350 case begline:
5351 case endline:
5352 case begbuf:
5353 case endbuf:
5354 case wordbeg:
5355 case wordend:
5356 case wordbound:
5357 case notwordbound:
5358 #ifdef emacs
5359 case before_dot:
5360 case at_dot:
5361 case after_dot:
5362 #endif
5363 break;
5364
5365 case start_memory:
5366 reg_no = *p1;
5367 assert (reg_no > 0 && reg_no <= MAX_REGNUM);
5368 ret = group_match_null_string_p (&p1, end, reg_info);
5369
5370 /* Have to set this here in case we're checking a group which
5371 contains a group and a back reference to it. */
5372
5373 if (REG_MATCH_NULL_STRING_P (reg_info[reg_no]) == MATCH_NULL_UNSET_VALUE)
5374 REG_MATCH_NULL_STRING_P (reg_info[reg_no]) = ret;
5375
5376 if (!ret)
5377 return false;
5378 break;
5379
5380 /* If this is an optimized succeed_n for zero times, make the jump. */
5381 case jump:
5382 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
5383 if (mcnt >= 0)
5384 p1 += mcnt;
5385 else
5386 return false;
5387 break;
5388
5389 case succeed_n:
5390 /* Get to the number of times to succeed. */
5391 p1 += 2;
5392 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
5393
5394 if (mcnt == 0)
5395 {
5396 p1 -= 4;
5397 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
5398 p1 += mcnt;
5399 }
5400 else
5401 return false;
5402 break;
5403
5404 case duplicate:
5405 if (!REG_MATCH_NULL_STRING_P (reg_info[*p1]))
5406 return false;
5407 break;
5408
5409 case set_number_at:
5410 p1 += 4;
5411
5412 default:
5413 /* All other opcodes mean we cannot match the empty string. */
5414 return false;
5415 }
5416
5417 *p = p1;
5418 return true;
5419 } /* common_op_match_null_string_p */
5420
5421
5422 /* Return zero if TRANSLATE[S1] and TRANSLATE[S2] are identical for LEN
5423 bytes; nonzero otherwise. */
5424
5425 static int
5426 bcmp_translate (s1, s2, len, translate)
5427 const char *s1, *s2;
5428 register int len;
5429 RE_TRANSLATE_TYPE translate;
5430 {
5431 register const unsigned char *p1 = (const unsigned char *) s1;
5432 register const unsigned char *p2 = (const unsigned char *) s2;
5433 while (len)
5434 {
5435 if (translate[*p1++] != translate[*p2++]) return 1;
5436 len--;
5437 }
5438 return 0;
5439 }
5440 \f
5441 /* Entry points for GNU code. */
5442
5443 /* re_compile_pattern is the GNU regular expression compiler: it
5444 compiles PATTERN (of length SIZE) and puts the result in BUFP.
5445 Returns 0 if the pattern was valid, otherwise an error string.
5446
5447 Assumes the `allocated' (and perhaps `buffer') and `translate' fields
5448 are set in BUFP on entry.
5449
5450 We call regex_compile to do the actual compilation. */
5451
5452 const char *
5453 re_compile_pattern (pattern, length, bufp)
5454 const char *pattern;
5455 size_t length;
5456 struct re_pattern_buffer *bufp;
5457 {
5458 reg_errcode_t ret;
5459
5460 /* GNU code is written to assume at least RE_NREGS registers will be set
5461 (and at least one extra will be -1). */
5462 bufp->regs_allocated = REGS_UNALLOCATED;
5463
5464 /* And GNU code determines whether or not to get register information
5465 by passing null for the REGS argument to re_match, etc., not by
5466 setting no_sub. */
5467 bufp->no_sub = 0;
5468
5469 /* Match anchors at newline. */
5470 bufp->newline_anchor = 1;
5471
5472 ret = regex_compile (pattern, length, re_syntax_options, bufp);
5473
5474 if (!ret)
5475 return NULL;
5476 return gettext (re_error_msgid[(int) ret]);
5477 }
5478 #ifdef _LIBC
5479 weak_alias (__re_compile_pattern, re_compile_pattern)
5480 #endif
5481 \f
5482 /* Entry points compatible with 4.2 BSD regex library. We don't define
5483 them unless specifically requested. */
5484
5485 #if defined _REGEX_RE_COMP || defined _LIBC
5486
5487 /* BSD has one and only one pattern buffer. */
5488 static struct re_pattern_buffer re_comp_buf;
5489
5490 char *
5491 #ifdef _LIBC
5492 /* Make these definitions weak in libc, so POSIX programs can redefine
5493 these names if they don't use our functions, and still use
5494 regcomp/regexec below without link errors. */
5495 weak_function
5496 #endif
5497 re_comp (s)
5498 const char *s;
5499 {
5500 reg_errcode_t ret;
5501
5502 if (!s)
5503 {
5504 if (!re_comp_buf.buffer)
5505 return gettext ("No previous regular expression");
5506 return 0;
5507 }
5508
5509 if (!re_comp_buf.buffer)
5510 {
5511 re_comp_buf.buffer = (unsigned char *) malloc (200);
5512 if (re_comp_buf.buffer == NULL)
5513 return gettext (re_error_msgid[(int) REG_ESPACE]);
5514 re_comp_buf.allocated = 200;
5515
5516 re_comp_buf.fastmap = (char *) malloc (1 << BYTEWIDTH);
5517 if (re_comp_buf.fastmap == NULL)
5518 return gettext (re_error_msgid[(int) REG_ESPACE]);
5519 }
5520
5521 /* Since `re_exec' always passes NULL for the `regs' argument, we
5522 don't need to initialize the pattern buffer fields which affect it. */
5523
5524 /* Match anchors at newlines. */
5525 re_comp_buf.newline_anchor = 1;
5526
5527 ret = regex_compile (s, strlen (s), re_syntax_options, &re_comp_buf);
5528
5529 if (!ret)
5530 return NULL;
5531
5532 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
5533 return (char *) gettext (re_error_msgid[(int) ret]);
5534 }
5535
5536
5537 int
5538 #ifdef _LIBC
5539 weak_function
5540 #endif
5541 re_exec (s)
5542 const char *s;
5543 {
5544 const int len = strlen (s);
5545 return
5546 0 <= re_search (&re_comp_buf, s, len, 0, len, (struct re_registers *) 0);
5547 }
5548
5549 #endif /* _REGEX_RE_COMP */
5550 \f
5551 /* POSIX.2 functions. Don't define these for Emacs. */
5552
5553 #ifndef emacs
5554
5555 /* regcomp takes a regular expression as a string and compiles it.
5556
5557 PREG is a regex_t *. We do not expect any fields to be initialized,
5558 since POSIX says we shouldn't. Thus, we set
5559
5560 `buffer' to the compiled pattern;
5561 `used' to the length of the compiled pattern;
5562 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
5563 REG_EXTENDED bit in CFLAGS is set; otherwise, to
5564 RE_SYNTAX_POSIX_BASIC;
5565 `newline_anchor' to REG_NEWLINE being set in CFLAGS;
5566 `fastmap' and `fastmap_accurate' to zero;
5567 `re_nsub' to the number of subexpressions in PATTERN.
5568
5569 PATTERN is the address of the pattern string.
5570
5571 CFLAGS is a series of bits which affect compilation.
5572
5573 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
5574 use POSIX basic syntax.
5575
5576 If REG_NEWLINE is set, then . and [^...] don't match newline.
5577 Also, regexec will try a match beginning after every newline.
5578
5579 If REG_ICASE is set, then we considers upper- and lowercase
5580 versions of letters to be equivalent when matching.
5581
5582 If REG_NOSUB is set, then when PREG is passed to regexec, that
5583 routine will report only success or failure, and nothing about the
5584 registers.
5585
5586 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
5587 the return codes and their meanings.) */
5588
5589 int
5590 regcomp (preg, pattern, cflags)
5591 regex_t *preg;
5592 const char *pattern;
5593 int cflags;
5594 {
5595 reg_errcode_t ret;
5596 reg_syntax_t syntax
5597 = (cflags & REG_EXTENDED) ?
5598 RE_SYNTAX_POSIX_EXTENDED : RE_SYNTAX_POSIX_BASIC;
5599
5600 /* regex_compile will allocate the space for the compiled pattern. */
5601 preg->buffer = 0;
5602 preg->allocated = 0;
5603 preg->used = 0;
5604
5605 /* Don't bother to use a fastmap when searching. This simplifies the
5606 REG_NEWLINE case: if we used a fastmap, we'd have to put all the
5607 characters after newlines into the fastmap. This way, we just try
5608 every character. */
5609 preg->fastmap = 0;
5610
5611 if (cflags & REG_ICASE)
5612 {
5613 unsigned i;
5614
5615 preg->translate
5616 = (RE_TRANSLATE_TYPE) malloc (CHAR_SET_SIZE
5617 * sizeof (*(RE_TRANSLATE_TYPE)0));
5618 if (preg->translate == NULL)
5619 return (int) REG_ESPACE;
5620
5621 /* Map uppercase characters to corresponding lowercase ones. */
5622 for (i = 0; i < CHAR_SET_SIZE; i++)
5623 preg->translate[i] = ISUPPER (i) ? tolower (i) : i;
5624 }
5625 else
5626 preg->translate = NULL;
5627
5628 /* If REG_NEWLINE is set, newlines are treated differently. */
5629 if (cflags & REG_NEWLINE)
5630 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
5631 syntax &= ~RE_DOT_NEWLINE;
5632 syntax |= RE_HAT_LISTS_NOT_NEWLINE;
5633 /* It also changes the matching behavior. */
5634 preg->newline_anchor = 1;
5635 }
5636 else
5637 preg->newline_anchor = 0;
5638
5639 preg->no_sub = !!(cflags & REG_NOSUB);
5640
5641 /* POSIX says a null character in the pattern terminates it, so we
5642 can use strlen here in compiling the pattern. */
5643 ret = regex_compile (pattern, strlen (pattern), syntax, preg);
5644
5645 /* POSIX doesn't distinguish between an unmatched open-group and an
5646 unmatched close-group: both are REG_EPAREN. */
5647 if (ret == REG_ERPAREN) ret = REG_EPAREN;
5648
5649 return (int) ret;
5650 }
5651 #ifdef _LIBC
5652 weak_alias (__regcomp, regcomp)
5653 #endif
5654
5655
5656 /* regexec searches for a given pattern, specified by PREG, in the
5657 string STRING.
5658
5659 If NMATCH is zero or REG_NOSUB was set in the cflags argument to
5660 `regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at
5661 least NMATCH elements, and we set them to the offsets of the
5662 corresponding matched substrings.
5663
5664 EFLAGS specifies `execution flags' which affect matching: if
5665 REG_NOTBOL is set, then ^ does not match at the beginning of the
5666 string; if REG_NOTEOL is set, then $ does not match at the end.
5667
5668 We return 0 if we find a match and REG_NOMATCH if not. */
5669
5670 int
5671 regexec (preg, string, nmatch, pmatch, eflags)
5672 const regex_t *preg;
5673 const char *string;
5674 size_t nmatch;
5675 regmatch_t pmatch[];
5676 int eflags;
5677 {
5678 int ret;
5679 struct re_registers regs;
5680 regex_t private_preg;
5681 int len = strlen (string);
5682 boolean want_reg_info = !preg->no_sub && nmatch > 0;
5683
5684 private_preg = *preg;
5685
5686 private_preg.not_bol = !!(eflags & REG_NOTBOL);
5687 private_preg.not_eol = !!(eflags & REG_NOTEOL);
5688
5689 /* The user has told us exactly how many registers to return
5690 information about, via `nmatch'. We have to pass that on to the
5691 matching routines. */
5692 private_preg.regs_allocated = REGS_FIXED;
5693
5694 if (want_reg_info)
5695 {
5696 regs.num_regs = nmatch;
5697 regs.start = TALLOC (nmatch, regoff_t);
5698 regs.end = TALLOC (nmatch, regoff_t);
5699 if (regs.start == NULL || regs.end == NULL)
5700 return (int) REG_NOMATCH;
5701 }
5702
5703 /* Perform the searching operation. */
5704 ret = re_search (&private_preg, string, len,
5705 /* start: */ 0, /* range: */ len,
5706 want_reg_info ? &regs : (struct re_registers *) 0);
5707
5708 /* Copy the register information to the POSIX structure. */
5709 if (want_reg_info)
5710 {
5711 if (ret >= 0)
5712 {
5713 unsigned r;
5714
5715 for (r = 0; r < nmatch; r++)
5716 {
5717 pmatch[r].rm_so = regs.start[r];
5718 pmatch[r].rm_eo = regs.end[r];
5719 }
5720 }
5721
5722 /* If we needed the temporary register info, free the space now. */
5723 free (regs.start);
5724 free (regs.end);
5725 }
5726
5727 /* We want zero return to mean success, unlike `re_search'. */
5728 return ret >= 0 ? (int) REG_NOERROR : (int) REG_NOMATCH;
5729 }
5730 #ifdef _LIBC
5731 weak_alias (__regexec, regexec)
5732 #endif
5733
5734
5735 /* Returns a message corresponding to an error code, ERRCODE, returned
5736 from either regcomp or regexec. We don't use PREG here. */
5737
5738 size_t
5739 __regerror (errcode, preg, errbuf, errbuf_size)
5740 int errcode;
5741 const regex_t *preg;
5742 char *errbuf;
5743 size_t errbuf_size;
5744 {
5745 const char *msg;
5746 size_t msg_size;
5747
5748 if (errcode < 0
5749 || errcode >= (int) (sizeof (re_error_msgid)
5750 / sizeof (re_error_msgid[0])))
5751 /* Only error codes returned by the rest of the code should be passed
5752 to this routine. If we are given anything else, or if other regex
5753 code generates an invalid error code, then the program has a bug.
5754 Dump core so we can fix it. */
5755 abort ();
5756
5757 msg = gettext (re_error_msgid[errcode]);
5758
5759 msg_size = strlen (msg) + 1; /* Includes the null. */
5760
5761 if (errbuf_size != 0)
5762 {
5763 if (msg_size > errbuf_size)
5764 {
5765 #if defined HAVE_MEMPCPY || defined _LIBC
5766 *((char *) __mempcpy (errbuf, msg, errbuf_size - 1)) = '\0';
5767 #else
5768 memcpy (errbuf, msg, errbuf_size - 1);
5769 errbuf[errbuf_size - 1] = 0;
5770 #endif
5771 }
5772 else
5773 memcpy (errbuf, msg, msg_size);
5774 }
5775
5776 return msg_size;
5777 }
5778 #ifdef _LIBC
5779 weak_alias (__regerror, regerror)
5780 #endif
5781
5782
5783 /* Free dynamically allocated space used by PREG. */
5784
5785 void
5786 regfree (preg)
5787 regex_t *preg;
5788 {
5789 if (preg->buffer != NULL)
5790 free (preg->buffer);
5791 preg->buffer = NULL;
5792
5793 preg->allocated = 0;
5794 preg->used = 0;
5795
5796 if (preg->fastmap != NULL)
5797 free (preg->fastmap);
5798 preg->fastmap = NULL;
5799 preg->fastmap_accurate = 0;
5800
5801 if (preg->translate != NULL)
5802 free (preg->translate);
5803 preg->translate = NULL;
5804 }
5805 #ifdef _LIBC
5806 weak_alias (__regfree, regfree)
5807 #endif
5808
5809 #endif /* not emacs */
This page took 0.139588 seconds and 5 git commands to generate.