f06775237913328cb8afea7d0e37b0e97d61d53b
[deliverable/binutils-gdb.git] / libiberty / regex.c
1 /* Extended regular expression matching and search library,
2 version 0.12.
3 (Implements POSIX draft P1003.2/D11.2, except for some of the
4 internationalization features.)
5 Copyright (C) 1993-1999, 2000, 2001 Free Software Foundation, Inc.
6 This file is part of the GNU C Library.
7
8 The GNU C Library is free software; you can redistribute it and/or
9 modify it under the terms of the GNU Lesser General Public
10 License as published by the Free Software Foundation; either
11 version 2.1 of the License, or (at your option) any later version.
12
13 The GNU C Library is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
17
18 You should have received a copy of the GNU Lesser General Public
19 License along with the GNU C Library; if not, write to the Free
20 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
21 02111-1307 USA. */
22
23 /* This file has been modified for usage in libiberty. It includes "xregex.h"
24 instead of <regex.h>. The "xregex.h" header file renames all external
25 routines with an "x" prefix so they do not collide with the native regex
26 routines or with other components regex routines. */
27 /* AIX requires this to be the first thing in the file. */
28 #if defined _AIX && !defined REGEX_MALLOC
29 #pragma alloca
30 #endif
31
32 #undef _GNU_SOURCE
33 #define _GNU_SOURCE
34
35 #ifdef HAVE_CONFIG_H
36 # include <config.h>
37 #endif
38
39 #ifndef PARAMS
40 # if defined __GNUC__ || (defined __STDC__ && __STDC__)
41 # define PARAMS(args) args
42 # else
43 # define PARAMS(args) ()
44 # endif /* GCC. */
45 #endif /* Not PARAMS. */
46
47 #ifndef INSIDE_RECURSION
48
49 # if defined STDC_HEADERS && !defined emacs
50 # include <stddef.h>
51 # else
52 /* We need this for `regex.h', and perhaps for the Emacs include files. */
53 # include <sys/types.h>
54 # endif
55
56 # define WIDE_CHAR_SUPPORT (HAVE_WCTYPE_H && HAVE_WCHAR_H && HAVE_BTOWC)
57
58 /* For platforms which support the ISO C Amendment 1 functionality we
59 support user-defined character classes. */
60 # if defined _LIBC || WIDE_CHAR_SUPPORT
61 /* Solaris 2.5 has a bug: <wchar.h> must be included before <wctype.h>. */
62 # include <wchar.h>
63 # include <wctype.h>
64 # endif
65
66 # ifdef _LIBC
67 /* We have to keep the namespace clean. */
68 # define regfree(preg) __regfree (preg)
69 # define regexec(pr, st, nm, pm, ef) __regexec (pr, st, nm, pm, ef)
70 # define regcomp(preg, pattern, cflags) __regcomp (preg, pattern, cflags)
71 # define regerror(errcode, preg, errbuf, errbuf_size) \
72 __regerror(errcode, preg, errbuf, errbuf_size)
73 # define re_set_registers(bu, re, nu, st, en) \
74 __re_set_registers (bu, re, nu, st, en)
75 # define re_match_2(bufp, string1, size1, string2, size2, pos, regs, stop) \
76 __re_match_2 (bufp, string1, size1, string2, size2, pos, regs, stop)
77 # define re_match(bufp, string, size, pos, regs) \
78 __re_match (bufp, string, size, pos, regs)
79 # define re_search(bufp, string, size, startpos, range, regs) \
80 __re_search (bufp, string, size, startpos, range, regs)
81 # define re_compile_pattern(pattern, length, bufp) \
82 __re_compile_pattern (pattern, length, bufp)
83 # define re_set_syntax(syntax) __re_set_syntax (syntax)
84 # define re_search_2(bufp, st1, s1, st2, s2, startpos, range, regs, stop) \
85 __re_search_2 (bufp, st1, s1, st2, s2, startpos, range, regs, stop)
86 # define re_compile_fastmap(bufp) __re_compile_fastmap (bufp)
87
88 # define btowc __btowc
89
90 /* We are also using some library internals. */
91 # include <locale/localeinfo.h>
92 # include <locale/elem-hash.h>
93 # include <langinfo.h>
94 # include <locale/coll-lookup.h>
95 # endif
96
97 /* This is for other GNU distributions with internationalized messages. */
98 # if HAVE_LIBINTL_H || defined _LIBC
99 # include <libintl.h>
100 # ifdef _LIBC
101 # undef gettext
102 # define gettext(msgid) __dcgettext ("libc", msgid, LC_MESSAGES)
103 # endif
104 # else
105 # define gettext(msgid) (msgid)
106 # endif
107
108 # ifndef gettext_noop
109 /* This define is so xgettext can find the internationalizable
110 strings. */
111 # define gettext_noop(String) String
112 # endif
113
114 /* The `emacs' switch turns on certain matching commands
115 that make sense only in Emacs. */
116 # ifdef emacs
117
118 # include "lisp.h"
119 # include "buffer.h"
120 # include "syntax.h"
121
122 # else /* not emacs */
123
124 /* If we are not linking with Emacs proper,
125 we can't use the relocating allocator
126 even if config.h says that we can. */
127 # undef REL_ALLOC
128
129 # if defined STDC_HEADERS || defined _LIBC
130 # include <stdlib.h>
131 # else
132 char *malloc ();
133 char *realloc ();
134 # endif
135
136 /* When used in Emacs's lib-src, we need to get bzero and bcopy somehow.
137 If nothing else has been done, use the method below. */
138 # ifdef INHIBIT_STRING_HEADER
139 # if !(defined HAVE_BZERO && defined HAVE_BCOPY)
140 # if !defined bzero && !defined bcopy
141 # undef INHIBIT_STRING_HEADER
142 # endif
143 # endif
144 # endif
145
146 /* This is the normal way of making sure we have a bcopy and a bzero.
147 This is used in most programs--a few other programs avoid this
148 by defining INHIBIT_STRING_HEADER. */
149 # ifndef INHIBIT_STRING_HEADER
150 # if defined HAVE_STRING_H || defined STDC_HEADERS || defined _LIBC
151 # include <string.h>
152 # ifndef bzero
153 # ifndef _LIBC
154 # define bzero(s, n) (memset (s, '\0', n), (s))
155 # else
156 # define bzero(s, n) __bzero (s, n)
157 # endif
158 # endif
159 # else
160 # include <strings.h>
161 # ifndef memcmp
162 # define memcmp(s1, s2, n) bcmp (s1, s2, n)
163 # endif
164 # ifndef memcpy
165 # define memcpy(d, s, n) (bcopy (s, d, n), (d))
166 # endif
167 # endif
168 # endif
169
170 /* Define the syntax stuff for \<, \>, etc. */
171
172 /* This must be nonzero for the wordchar and notwordchar pattern
173 commands in re_match_2. */
174 # ifndef Sword
175 # define Sword 1
176 # endif
177
178 # ifdef SWITCH_ENUM_BUG
179 # define SWITCH_ENUM_CAST(x) ((int)(x))
180 # else
181 # define SWITCH_ENUM_CAST(x) (x)
182 # endif
183
184 # endif /* not emacs */
185
186 # if defined _LIBC || HAVE_LIMITS_H
187 # include <limits.h>
188 # endif
189
190 # ifndef MB_LEN_MAX
191 # define MB_LEN_MAX 1
192 # endif
193 \f
194 /* Get the interface, including the syntax bits. */
195 # include "xregex.h" /* change for libiberty */
196
197 /* isalpha etc. are used for the character classes. */
198 # include <ctype.h>
199
200 /* Jim Meyering writes:
201
202 "... Some ctype macros are valid only for character codes that
203 isascii says are ASCII (SGI's IRIX-4.0.5 is one such system --when
204 using /bin/cc or gcc but without giving an ansi option). So, all
205 ctype uses should be through macros like ISPRINT... If
206 STDC_HEADERS is defined, then autoconf has verified that the ctype
207 macros don't need to be guarded with references to isascii. ...
208 Defining isascii to 1 should let any compiler worth its salt
209 eliminate the && through constant folding."
210 Solaris defines some of these symbols so we must undefine them first. */
211
212 # undef ISASCII
213 # if defined STDC_HEADERS || (!defined isascii && !defined HAVE_ISASCII)
214 # define ISASCII(c) 1
215 # else
216 # define ISASCII(c) isascii(c)
217 # endif
218
219 # ifdef isblank
220 # define ISBLANK(c) (ISASCII (c) && isblank (c))
221 # else
222 # define ISBLANK(c) ((c) == ' ' || (c) == '\t')
223 # endif
224 # ifdef isgraph
225 # define ISGRAPH(c) (ISASCII (c) && isgraph (c))
226 # else
227 # define ISGRAPH(c) (ISASCII (c) && isprint (c) && !isspace (c))
228 # endif
229
230 # undef ISPRINT
231 # define ISPRINT(c) (ISASCII (c) && isprint (c))
232 # define ISDIGIT(c) (ISASCII (c) && isdigit (c))
233 # define ISALNUM(c) (ISASCII (c) && isalnum (c))
234 # define ISALPHA(c) (ISASCII (c) && isalpha (c))
235 # define ISCNTRL(c) (ISASCII (c) && iscntrl (c))
236 # define ISLOWER(c) (ISASCII (c) && islower (c))
237 # define ISPUNCT(c) (ISASCII (c) && ispunct (c))
238 # define ISSPACE(c) (ISASCII (c) && isspace (c))
239 # define ISUPPER(c) (ISASCII (c) && isupper (c))
240 # define ISXDIGIT(c) (ISASCII (c) && isxdigit (c))
241
242 # ifdef _tolower
243 # define TOLOWER(c) _tolower(c)
244 # else
245 # define TOLOWER(c) tolower(c)
246 # endif
247
/* Fallback null pointer constant for environments whose headers did not
   already provide one.  The expansion is fully parenthesized so that it
   stays a single operand in every expression context (for example as the
   operand of `sizeof').  */
# ifndef NULL
#  define NULL ((void *) 0)
# endif
251
252 /* We remove any previous definition of `SIGN_EXTEND_CHAR',
253 since ours (we hope) works properly with all combinations of
254 machines, compilers, `char' and `unsigned char' argument types.
255 (Per Bothner suggested the basic approach.) */
256 # undef SIGN_EXTEND_CHAR
257 # if __STDC__
258 # define SIGN_EXTEND_CHAR(c) ((signed char) (c))
259 # else /* not __STDC__ */
260 /* As in Harbison and Steele. */
261 # define SIGN_EXTEND_CHAR(c) ((((unsigned char) (c)) ^ 128) - 128)
262 # endif
263 \f
264 # ifndef emacs
265 /* How many characters in the character set. */
266 # define CHAR_SET_SIZE 256
267
268 # ifdef SYNTAX_TABLE
269
270 extern char *re_syntax_table;
271
272 # else /* not SYNTAX_TABLE */
273
274 static char re_syntax_table[CHAR_SET_SIZE];
275
276 static void init_syntax_once PARAMS ((void));
277
278 static void
279 init_syntax_once ()
280 {
281 register int c;
282 static int done = 0;
283
284 if (done)
285 return;
286 bzero (re_syntax_table, sizeof re_syntax_table);
287
288 for (c = 0; c < CHAR_SET_SIZE; ++c)
289 if (ISALNUM (c))
290 re_syntax_table[c] = Sword;
291
292 re_syntax_table['_'] = Sword;
293
294 done = 1;
295 }
296
297 # endif /* not SYNTAX_TABLE */
298
299 # define SYNTAX(c) re_syntax_table[(unsigned char) (c)]
300
301 # endif /* emacs */
302 \f
303 /* Integer type for pointers. */
304 # if !defined _LIBC
305 typedef unsigned long int uintptr_t;
306 # endif
307
308 /* Should we use malloc or alloca? If REGEX_MALLOC is not defined, we
309 use `alloca' instead of `malloc'. This is because using malloc in
310 re_search* or re_match* could cause memory leaks when C-g is used in
311 Emacs; also, malloc is slower and causes storage fragmentation. On
312 the other hand, malloc is more portable, and easier to debug.
313
314 Because we sometimes use alloca, some routines have to be macros,
315 not functions -- `alloca'-allocated space disappears at the end of the
316 function it is called in. */
317
318 # ifdef REGEX_MALLOC
319
320 # define REGEX_ALLOCATE malloc
321 # define REGEX_REALLOCATE(source, osize, nsize) realloc (source, nsize)
322 # define REGEX_FREE free
323
324 # else /* not REGEX_MALLOC */
325
326 /* Emacs already defines alloca, sometimes. */
327 # ifndef alloca
328
329 /* Make alloca work the best possible way. */
330 # ifdef __GNUC__
331 # define alloca __builtin_alloca
332 # else /* not __GNUC__ */
333 # if HAVE_ALLOCA_H
334 # include <alloca.h>
335 # endif /* HAVE_ALLOCA_H */
336 # endif /* not __GNUC__ */
337
338 # endif /* not alloca */
339
340 # define REGEX_ALLOCATE alloca
341
342 /* Assumes a `char *destination' variable. */
343 # define REGEX_REALLOCATE(source, osize, nsize) \
344 (destination = (char *) alloca (nsize), \
345 memcpy (destination, source, osize))
346
347 /* No need to do anything to free, after alloca. */
348 # define REGEX_FREE(arg) ((void)0) /* Do nothing! But inhibit gcc warning. */
349
350 # endif /* not REGEX_MALLOC */
351
352 /* Define how to allocate the failure stack. */
353
354 # if defined REL_ALLOC && defined REGEX_MALLOC
355
356 # define REGEX_ALLOCATE_STACK(size) \
357 r_alloc (&failure_stack_ptr, (size))
358 # define REGEX_REALLOCATE_STACK(source, osize, nsize) \
359 r_re_alloc (&failure_stack_ptr, (nsize))
360 # define REGEX_FREE_STACK(ptr) \
361 r_alloc_free (&failure_stack_ptr)
362
363 # else /* not using relocating allocator */
364
365 # ifdef REGEX_MALLOC
366
367 # define REGEX_ALLOCATE_STACK malloc
368 # define REGEX_REALLOCATE_STACK(source, osize, nsize) realloc (source, nsize)
369 # define REGEX_FREE_STACK free
370
371 # else /* not REGEX_MALLOC */
372
373 # define REGEX_ALLOCATE_STACK alloca
374
375 # define REGEX_REALLOCATE_STACK(source, osize, nsize) \
376 REGEX_REALLOCATE (source, osize, nsize)
/* No need to explicitly free anything after alloca.  Expand to a void
   expression rather than to nothing, matching REGEX_FREE, so that
   `REGEX_FREE_STACK (x);' is never an empty statement and the macro can
   be used wherever the malloc-based variant (a call to free) can.  */
#   define REGEX_FREE_STACK(arg) ((void) 0)
379
380 # endif /* not REGEX_MALLOC */
381 # endif /* not using relocating allocator */
382
383
384 /* True if `size1' is nonzero and PTR is pointing anywhere inside
385 `string1' or just past its end. This works if PTR is NULL, which is
386 a good thing. */
387 # define FIRST_STRING_P(ptr) \
388 (size1 && string1 <= (ptr) && (ptr) <= string1 + size1)
389
/* (Re)Allocate N items of type T using malloc/realloc.  TALLOC and
   RETALLOC evaluate to the resulting pointer, which is null when the
   allocation fails; the caller is responsible for checking it.  */
# define TALLOC(n, t) ((t *) malloc ((n) * sizeof (t)))
# define RETALLOC(addr, n, t) ((addr) = (t *) realloc (addr, (n) * sizeof (t)))
/* Resize ADDR when it is already allocated, otherwise allocate it afresh.
   Wrapped in do/while (0) so the macro is exactly one statement and its
   internal `else' can never be confused with one written by the caller.  */
# define RETALLOC_IF(addr, n, t) \
  do \
    { \
      if (addr) \
        RETALLOC ((addr), (n), t); \
      else \
        (addr) = TALLOC ((n), t); \
    } \
  while (0)
395 # define REGEX_TALLOC(n, t) ((t *) REGEX_ALLOCATE ((n) * sizeof (t)))
396
397 # define BYTEWIDTH 8 /* In bits. */
398
399 # define STREQ(s1, s2) ((strcmp (s1, s2) == 0))
400
401 # undef MAX
402 # undef MIN
403 # define MAX(a, b) ((a) > (b) ? (a) : (b))
404 # define MIN(a, b) ((a) < (b) ? (a) : (b))
405
406 typedef char boolean;
407 # define false 0
408 # define true 1
409
410 static reg_errcode_t byte_regex_compile _RE_ARGS ((const char *pattern, size_t size,
411 reg_syntax_t syntax,
412 struct re_pattern_buffer *bufp));
413 static reg_errcode_t wcs_regex_compile _RE_ARGS ((const char *pattern, size_t size,
414 reg_syntax_t syntax,
415 struct re_pattern_buffer *bufp));
416
417 static int byte_re_match_2_internal PARAMS ((struct re_pattern_buffer *bufp,
418 const char *string1, int size1,
419 const char *string2, int size2,
420 int pos,
421 struct re_registers *regs,
422 int stop));
423 static int wcs_re_match_2_internal PARAMS ((struct re_pattern_buffer *bufp,
424 const char *cstring1, int csize1,
425 const char *cstring2, int csize2,
426 int pos,
427 struct re_registers *regs,
428 int stop,
429 wchar_t *string1, int size1,
430 wchar_t *string2, int size2,
431 int *mbs_offset1, int *mbs_offset2));
432 static int byte_re_search_2 PARAMS ((struct re_pattern_buffer *bufp,
433 const char *string1, int size1,
434 const char *string2, int size2,
435 int startpos, int range,
436 struct re_registers *regs, int stop));
437 static int wcs_re_search_2 PARAMS ((struct re_pattern_buffer *bufp,
438 const char *string1, int size1,
439 const char *string2, int size2,
440 int startpos, int range,
441 struct re_registers *regs, int stop));
442 static int byte_re_compile_fastmap PARAMS ((struct re_pattern_buffer *bufp));
443 static int wcs_re_compile_fastmap PARAMS ((struct re_pattern_buffer *bufp));
444
445 \f
446 /* These are the command codes that appear in compiled regular
447 expressions. Some opcodes are followed by argument bytes. A
448 command code can specify any interpretation whatsoever for its
449 arguments. Zero bytes may appear in the compiled regular expression. */
450
typedef enum
{
  no_op = 0,

  /* Succeed immediately; no further backtracking is done.  */
  succeed,

  /* One byte N follows, then N literal bytes.  */
  exactn,

# ifdef MBS_SUPPORT
  /* Like exactn, but the payload is binary data.  */
  exactn_bin,
# endif

  /* Match (more or less) any single character.  */
  anychar,

  /* Match one character that belongs to a set.  The first byte after
     the opcode is the number of bitmap bytes; the bitmap itself
     follows.  Bits within a byte are ordered low-bit-first, and a
     character is a member when its bit is 1.  A character too large to
     have a bit in the map is never a member.

     With MBS_SUPPORT the opcode is instead followed by the lengths of
     the character classes, collating symbols, equivalence classes,
     character ranges and characters, and then by those elements in the
     same order.  See the regex_compile function.  */
  charset,

  /* Takes the same parameters as charset, but matches any character
     that is NOT in the set.  */
  charset_not,

  /* Begin remembering matched text so it can be stored in a register.
     One byte holding the register number follows (0 up to one less
     than the pattern buffer's re_nsub field), then one byte holding
     the number of groups nested inside this one.  (The nesting count
     lives here only because the on_failure_jump case of re_match_2
     needs it.)  */
  start_memory,

  /* Stop remembering matched text and store it in a memory register.
     Followed, exactly like start_memory, by one byte with the register
     number (0 up to one less than `re_nsub' in the pattern buffer) and
     one byte with the number of inner groups.  (The count is repeated
     here because there is no easy way to locate the matching
     start_memory from a stop_memory.)  */
  stop_memory,

  /* Match a copy of previously remembered text.  One byte holding the
     register number follows.  */
  duplicate,

  /* Fail unless positioned at the beginning of a line.  */
  begline,

  /* Fail unless positioned at the end of a line.  */
  endline,

  /* Succeed at the beginning of the buffer (under emacs) or at the
     beginning of the string being matched (otherwise).  */
  begbuf,

  /* The same, for the end of the buffer or string.  */
  endbuf,

  /* A two-byte relative address follows; jump to it.  */
  jump,

  /* Like jump, but also marks the end of an alternative.  */
  jump_past_alt,

  /* A two-byte relative address follows, naming the place to resume
     at if matching fails.  */
  /* ifdef MBS_SUPPORT, the size of address is 1.  */
  on_failure_jump,

  /* Like on_failure_jump, except that a placeholder is pushed in place
     of the current string position when it executes.  */
  on_failure_keep_string_jump,

  /* Discard the most recent failure point, then jump to the two-byte
     relative address that follows.  */
  /* ifdef MBS_SUPPORT, the size of address is 1.  */
  pop_failure_jump,

  /* Rewritten to pop_failure_jump when it is known that no
     backtracking will be needed in order to match, and to jump
     otherwise.  It is used to jump back to the start of a repeat: if
     whatever follows this jump clearly cannot match what the repeat
     matches, backtracking out of repetitions already matched is
     pointless, and the opcode becomes a pop_failure_jump.  A two-byte
     address follows.  */
  /* ifdef MBS_SUPPORT, the size of address is 1.  */
  maybe_pop_jump,

  /* Jump to the two-byte address that follows and push a dummy
     failure point, which is thrown away if an attempt is ever made to
     use it for a failure.  A `+' construct emits this before the first
     repeat.  Also serves as an intermediate kind of jump while an
     alternative is being compiled.  */
  /* ifdef MBS_SUPPORT, the size of address is 1.  */
  dummy_failure_jump,

  /* Push a dummy failure point and carry on.  Used at the end of
     alternatives.  */
  push_dummy_failure,

  /* A two-byte relative address and a two-byte number N follow.  Once
     N matches have been made, jump to the address upon failure.  */
  /* ifdef MBS_SUPPORT, the size of address is 1.  */
  succeed_n,

  /* A two-byte relative address and a two-byte number N follow.  Jump
     to the address N times, then fail.  */
  /* ifdef MBS_SUPPORT, the size of address is 1.  */
  jump_n,

  /* Store the two-byte number that comes second at the two-byte
     relative address that comes first.  The address *includes* the
     two bytes of the number.  */
  /* ifdef MBS_SUPPORT, the size of address is 1.  */
  set_number_at,

  /* Matches any word-constituent character.  */
  wordchar,

  /* Matches any character that is not a word constituent.  */
  notwordchar,

  /* Succeeds if at the beginning of a word.  */
  wordbeg,

  /* Succeeds if at the end of a word.  */
  wordend,

  /* Succeeds if at a word boundary.  */
  wordbound,

  /* Succeeds if not at a word boundary.  */
  notwordbound

# ifdef emacs
  /* Succeeds if before point.  */
  ,before_dot,

  /* Succeeds if at point.  */
  at_dot,

  /* Succeeds if after point.  */
  after_dot,

  /* Matches any character having the given syntax.  One byte holding
     a syntax code (e.g., Sword) follows.  */
  syntaxspec,

  /* Matches any character whose syntax is not the one given.  */
  notsyntaxspec
# endif /* emacs */
} re_opcode_t;
603 #endif /* not INSIDE_RECURSION */
604 \f
605
606 #ifdef BYTE
607 # define CHAR_T char
608 # define UCHAR_T unsigned char
609 # define COMPILED_BUFFER_VAR bufp->buffer
610 # define OFFSET_ADDRESS_SIZE 2
611 # define PREFIX(name) byte_##name
612 # define ARG_PREFIX(name) name
613 # define PUT_CHAR(c) putchar (c)
614 #elif defined WCHAR
615 # define CHAR_T wchar_t
616 # define UCHAR_T wchar_t
617 # define COMPILED_BUFFER_VAR wc_buffer
618 # define OFFSET_ADDRESS_SIZE 1 /* the size which STORE_NUMBER macro use */
619 # define CHAR_CLASS_SIZE ((__alignof__(wctype_t)+sizeof(wctype_t))/sizeof(CHAR_T)+1)
620 # define PREFIX(name) wcs_##name
621 # define ARG_PREFIX(name) c##name
/* Should we use wide stream??  */
/* The definition deliberately carries no trailing semicolon: as with the
   BYTE definition, the caller supplies it, so `PUT_CHAR (c);' is exactly
   one statement and remains valid as the body of an if/else.  */
# define PUT_CHAR(c) printf ("%C", (c))
624 # define TRUE 1
625 # define FALSE 0
626 #else
627 # ifdef MBS_SUPPORT
628 # define WCHAR
629 # define INSIDE_RECURSION
630 # include "regex.c"
631 # undef INSIDE_RECURSION
632 # endif
633 # define BYTE
634 # define INSIDE_RECURSION
635 # include "regex.c"
636 # undef INSIDE_RECURSION
637 #endif
638
639 #ifdef INSIDE_RECURSION
640 /* Common operations on the compiled pattern. */
641
642 /* Store NUMBER in two contiguous bytes starting at DESTINATION. */
643 /* ifdef MBS_SUPPORT, we store NUMBER in 1 element. */
644
645 # ifdef WCHAR
646 # define STORE_NUMBER(destination, number) \
647 do { \
648 *(destination) = (UCHAR_T)(number); \
649 } while (0)
650 # else /* BYTE */
651 # define STORE_NUMBER(destination, number) \
652 do { \
653 (destination)[0] = (number) & 0377; \
654 (destination)[1] = (number) >> 8; \
655 } while (0)
656 # endif /* WCHAR */
657
658 /* Same as STORE_NUMBER, except increment DESTINATION to
659 the byte after where the number is stored. Therefore, DESTINATION
660 must be an lvalue. */
661 /* ifdef MBS_SUPPORT, we store NUMBER in 1 element. */
662
663 # define STORE_NUMBER_AND_INCR(destination, number) \
664 do { \
665 STORE_NUMBER (destination, number); \
666 (destination) += OFFSET_ADDRESS_SIZE; \
667 } while (0)
668
669 /* Put into DESTINATION a number stored in two contiguous bytes starting
670 at SOURCE. */
671 /* ifdef MBS_SUPPORT, we store NUMBER in 1 element. */
672
# ifdef WCHAR
#  define EXTRACT_NUMBER(destination, source) \
  do { \
    (destination) = *(source); \
  } while (0)
# else /* BYTE */
/* The low byte comes first; the second byte is sign-extended and forms
   the high part.  It is scaled by multiplication rather than by `<< 8'
   because left-shifting a negative value is undefined behavior in ISO C,
   while the product is well defined for every possible byte value.  */
#  define EXTRACT_NUMBER(destination, source) \
  do { \
    (destination) = *(source) & 0377; \
    (destination) += SIGN_EXTEND_CHAR (*((source) + 1)) * (1 << 8); \
  } while (0)
# endif
685
686 # ifdef DEBUG
687 static void PREFIX(extract_number) _RE_ARGS ((int *dest, UCHAR_T *source));
688 static void
689 PREFIX(extract_number) (dest, source)
690 int *dest;
691 UCHAR_T *source;
692 {
693 # ifdef WCHAR
694 *dest = *source;
695 # else /* BYTE */
696 int temp = SIGN_EXTEND_CHAR (*(source + 1));
697 *dest = *source & 0377;
698 *dest += temp << 8;
699 # endif
700 }
701
702 # ifndef EXTRACT_MACROS /* To debug the macros. */
703 # undef EXTRACT_NUMBER
704 # define EXTRACT_NUMBER(dest, src) PREFIX(extract_number) (&dest, src)
705 # endif /* not EXTRACT_MACROS */
706
707 # endif /* DEBUG */
708
709 /* Same as EXTRACT_NUMBER, except increment SOURCE to after the number.
710 SOURCE must be an lvalue. */
711
712 # define EXTRACT_NUMBER_AND_INCR(destination, source) \
713 do { \
714 EXTRACT_NUMBER (destination, source); \
715 (source) += OFFSET_ADDRESS_SIZE; \
716 } while (0)
717
718 # ifdef DEBUG
719 static void PREFIX(extract_number_and_incr) _RE_ARGS ((int *destination,
720 UCHAR_T **source));
721 static void
722 PREFIX(extract_number_and_incr) (destination, source)
723 int *destination;
724 UCHAR_T **source;
725 {
726 PREFIX(extract_number) (destination, *source);
727 *source += OFFSET_ADDRESS_SIZE;
728 }
729
730 # ifndef EXTRACT_MACROS
731 # undef EXTRACT_NUMBER_AND_INCR
732 # define EXTRACT_NUMBER_AND_INCR(dest, src) \
733 PREFIX(extract_number_and_incr) (&dest, &src)
734 # endif /* not EXTRACT_MACROS */
735
736 # endif /* DEBUG */
737
738 \f
739
740 /* If DEBUG is defined, Regex prints many voluminous messages about what
741 it is doing (if the variable `debug' is nonzero). If linked with the
742 main program in `iregex.c', you can enter patterns and strings
743 interactively. And if linked with the main program in `main.c' and
744 the other test files, you can run the already-written tests. */
745
746 # ifdef DEBUG
747
748 # ifndef DEFINED_ONCE
749
750 /* We use standard I/O for debugging. */
751 # include <stdio.h>
752
753 /* It is useful to test things that ``must'' be true when debugging. */
754 # include <assert.h>
755
756 static int debug;
757
#  define DEBUG_STATEMENT(e) e
/* Print with printf, but only while the file-scope `debug' flag is
   nonzero.  Each macro is wrapped in do/while (0) so that it forms a
   single statement: a bare `if (debug) ...' would capture the `else' of
   an if/else written around the macro call.  */
#  define DEBUG_PRINT1(x) \
  do { if (debug) printf (x); } while (0)
#  define DEBUG_PRINT2(x1, x2) \
  do { if (debug) printf (x1, x2); } while (0)
#  define DEBUG_PRINT3(x1, x2, x3) \
  do { if (debug) printf (x1, x2, x3); } while (0)
#  define DEBUG_PRINT4(x1, x2, x3, x4) \
  do { if (debug) printf (x1, x2, x3, x4); } while (0)
763 # endif /* not DEFINED_ONCE */
764
/* Dump a compiled pattern range, or a position within the two-part
   subject string, but only while `debug' is nonzero.  Both macros are
   wrapped in do/while (0) so that each is a single statement and cannot
   capture the `else' of an if/else written around the call.  */
#  define DEBUG_PRINT_COMPILED_PATTERN(p, s, e) \
  do { \
    if (debug) PREFIX(print_partial_compiled_pattern) (s, e); \
  } while (0)
#  define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2) \
  do { \
    if (debug) PREFIX(print_double_string) (w, s1, sz1, s2, sz2); \
  } while (0)
769
770
771 /* Print the fastmap in human-readable form. */
772
773 # ifndef DEFINED_ONCE
774 void
775 print_fastmap (fastmap)
776 char *fastmap;
777 {
778 unsigned was_a_range = 0;
779 unsigned i = 0;
780
781 while (i < (1 << BYTEWIDTH))
782 {
783 if (fastmap[i++])
784 {
785 was_a_range = 0;
786 putchar (i - 1);
787 while (i < (1 << BYTEWIDTH) && fastmap[i])
788 {
789 was_a_range = 1;
790 i++;
791 }
792 if (was_a_range)
793 {
794 printf ("-");
795 putchar (i - 1);
796 }
797 }
798 }
799 putchar ('\n');
800 }
801 # endif /* not DEFINED_ONCE */
802
803
804 /* Print a compiled pattern string in human-readable form, starting at
805 the START pointer into it and ending just before the pointer END. */
806
807 void
808 PREFIX(print_partial_compiled_pattern) (start, end)
809 UCHAR_T *start;
810 UCHAR_T *end;
811 {
812 int mcnt, mcnt2;
813 UCHAR_T *p1;
814 UCHAR_T *p = start;
815 UCHAR_T *pend = end;
816
817 if (start == NULL)
818 {
819 printf ("(null)\n");
820 return;
821 }
822
823 /* Loop over pattern commands. */
824 while (p < pend)
825 {
826 # ifdef _LIBC
827 printf ("%td:\t", p - start);
828 # else
829 printf ("%ld:\t", (long int) (p - start));
830 # endif
831
832 switch ((re_opcode_t) *p++)
833 {
834 case no_op:
835 printf ("/no_op");
836 break;
837
838 case exactn:
839 mcnt = *p++;
840 printf ("/exactn/%d", mcnt);
841 do
842 {
843 putchar ('/');
844 PUT_CHAR (*p++);
845 }
846 while (--mcnt);
847 break;
848
849 # ifdef MBS_SUPPORT
850 case exactn_bin:
851 mcnt = *p++;
852 printf ("/exactn_bin/%d", mcnt);
853 do
854 {
855 printf("/%lx", (long int) *p++);
856 }
857 while (--mcnt);
858 break;
859 # endif /* MBS_SUPPORT */
860
861 case start_memory:
862 mcnt = *p++;
863 printf ("/start_memory/%d/%ld", mcnt, (long int) *p++);
864 break;
865
866 case stop_memory:
867 mcnt = *p++;
868 printf ("/stop_memory/%d/%ld", mcnt, (long int) *p++);
869 break;
870
871 case duplicate:
872 printf ("/duplicate/%ld", (long int) *p++);
873 break;
874
875 case anychar:
876 printf ("/anychar");
877 break;
878
879 case charset:
880 case charset_not:
881 {
882 # ifdef WCHAR
883 int i, length;
884 wchar_t *workp = p;
885 printf ("/charset [%s",
886 (re_opcode_t) *(workp - 1) == charset_not ? "^" : "");
887 p += 5;
888 length = *workp++; /* the length of char_classes */
889 for (i=0 ; i<length ; i++)
890 printf("[:%lx:]", (long int) *p++);
891 length = *workp++; /* the length of collating_symbol */
892 for (i=0 ; i<length ;)
893 {
894 printf("[.");
895 while(*p != 0)
896 PUT_CHAR((i++,*p++));
897 i++,p++;
898 printf(".]");
899 }
900 length = *workp++; /* the length of equivalence_class */
901 for (i=0 ; i<length ;)
902 {
903 printf("[=");
904 while(*p != 0)
905 PUT_CHAR((i++,*p++));
906 i++,p++;
907 printf("=]");
908 }
909 length = *workp++; /* the length of char_range */
910 for (i=0 ; i<length ; i++)
911 {
912 wchar_t range_start = *p++;
913 wchar_t range_end = *p++;
914 printf("%C-%C", range_start, range_end);
915 }
916 length = *workp++; /* the length of char */
917 for (i=0 ; i<length ; i++)
918 printf("%C", *p++);
919 putchar (']');
920 # else
921 register int c, last = -100;
922 register int in_range = 0;
923
924 printf ("/charset [%s",
925 (re_opcode_t) *(p - 1) == charset_not ? "^" : "");
926
927 assert (p + *p < pend);
928
929 for (c = 0; c < 256; c++)
930 if (c / 8 < *p
931 && (p[1 + (c/8)] & (1 << (c % 8))))
932 {
933 /* Are we starting a range? */
934 if (last + 1 == c && ! in_range)
935 {
936 putchar ('-');
937 in_range = 1;
938 }
939 /* Have we broken a range? */
940 else if (last + 1 != c && in_range)
941 {
942 putchar (last);
943 in_range = 0;
944 }
945
946 if (! in_range)
947 putchar (c);
948
949 last = c;
950 }
951
952 if (in_range)
953 putchar (last);
954
955 putchar (']');
956
957 p += 1 + *p;
958 # endif /* WCHAR */
959 }
960 break;
961
962 case begline:
963 printf ("/begline");
964 break;
965
966 case endline:
967 printf ("/endline");
968 break;
969
970 case on_failure_jump:
971 PREFIX(extract_number_and_incr) (&mcnt, &p);
972 # ifdef _LIBC
973 printf ("/on_failure_jump to %td", p + mcnt - start);
974 # else
975 printf ("/on_failure_jump to %ld", (long int) (p + mcnt - start));
976 # endif
977 break;
978
979 case on_failure_keep_string_jump:
980 PREFIX(extract_number_and_incr) (&mcnt, &p);
981 # ifdef _LIBC
982 printf ("/on_failure_keep_string_jump to %td", p + mcnt - start);
983 # else
984 printf ("/on_failure_keep_string_jump to %ld",
985 (long int) (p + mcnt - start));
986 # endif
987 break;
988
989 case dummy_failure_jump:
990 PREFIX(extract_number_and_incr) (&mcnt, &p);
991 # ifdef _LIBC
992 printf ("/dummy_failure_jump to %td", p + mcnt - start);
993 # else
994 printf ("/dummy_failure_jump to %ld", (long int) (p + mcnt - start));
995 # endif
996 break;
997
998 case push_dummy_failure:
999 printf ("/push_dummy_failure");
1000 break;
1001
1002 case maybe_pop_jump:
1003 PREFIX(extract_number_and_incr) (&mcnt, &p);
1004 # ifdef _LIBC
1005 printf ("/maybe_pop_jump to %td", p + mcnt - start);
1006 # else
1007 printf ("/maybe_pop_jump to %ld", (long int) (p + mcnt - start));
1008 # endif
1009 break;
1010
1011 case pop_failure_jump:
1012 PREFIX(extract_number_and_incr) (&mcnt, &p);
1013 # ifdef _LIBC
1014 printf ("/pop_failure_jump to %td", p + mcnt - start);
1015 # else
1016 printf ("/pop_failure_jump to %ld", (long int) (p + mcnt - start));
1017 # endif
1018 break;
1019
1020 case jump_past_alt:
1021 PREFIX(extract_number_and_incr) (&mcnt, &p);
1022 # ifdef _LIBC
1023 printf ("/jump_past_alt to %td", p + mcnt - start);
1024 # else
1025 printf ("/jump_past_alt to %ld", (long int) (p + mcnt - start));
1026 # endif
1027 break;
1028
1029 case jump:
1030 PREFIX(extract_number_and_incr) (&mcnt, &p);
1031 # ifdef _LIBC
1032 printf ("/jump to %td", p + mcnt - start);
1033 # else
1034 printf ("/jump to %ld", (long int) (p + mcnt - start));
1035 # endif
1036 break;
1037
1038 case succeed_n:
1039 PREFIX(extract_number_and_incr) (&mcnt, &p);
1040 p1 = p + mcnt;
1041 PREFIX(extract_number_and_incr) (&mcnt2, &p);
1042 # ifdef _LIBC
1043 printf ("/succeed_n to %td, %d times", p1 - start, mcnt2);
1044 # else
1045 printf ("/succeed_n to %ld, %d times",
1046 (long int) (p1 - start), mcnt2);
1047 # endif
1048 break;
1049
1050 case jump_n:
1051 PREFIX(extract_number_and_incr) (&mcnt, &p);
1052 p1 = p + mcnt;
1053 PREFIX(extract_number_and_incr) (&mcnt2, &p);
1054 printf ("/jump_n to %d, %d times", p1 - start, mcnt2);
1055 break;
1056
1057 case set_number_at:
1058 PREFIX(extract_number_and_incr) (&mcnt, &p);
1059 p1 = p + mcnt;
1060 PREFIX(extract_number_and_incr) (&mcnt2, &p);
1061 # ifdef _LIBC
1062 printf ("/set_number_at location %td to %d", p1 - start, mcnt2);
1063 # else
1064 printf ("/set_number_at location %ld to %d",
1065 (long int) (p1 - start), mcnt2);
1066 # endif
1067 break;
1068
1069 case wordbound:
1070 printf ("/wordbound");
1071 break;
1072
1073 case notwordbound:
1074 printf ("/notwordbound");
1075 break;
1076
1077 case wordbeg:
1078 printf ("/wordbeg");
1079 break;
1080
1081 case wordend:
1082 printf ("/wordend");
1083 break;
1084
1085 # ifdef emacs
1086 case before_dot:
1087 printf ("/before_dot");
1088 break;
1089
1090 case at_dot:
1091 printf ("/at_dot");
1092 break;
1093
1094 case after_dot:
1095 printf ("/after_dot");
1096 break;
1097
1098 case syntaxspec:
1099 printf ("/syntaxspec");
1100 mcnt = *p++;
1101 printf ("/%d", mcnt);
1102 break;
1103
1104 case notsyntaxspec:
1105 printf ("/notsyntaxspec");
1106 mcnt = *p++;
1107 printf ("/%d", mcnt);
1108 break;
1109 # endif /* emacs */
1110
1111 case wordchar:
1112 printf ("/wordchar");
1113 break;
1114
1115 case notwordchar:
1116 printf ("/notwordchar");
1117 break;
1118
1119 case begbuf:
1120 printf ("/begbuf");
1121 break;
1122
1123 case endbuf:
1124 printf ("/endbuf");
1125 break;
1126
1127 default:
1128 printf ("?%ld", (long int) *(p-1));
1129 }
1130
1131 putchar ('\n');
1132 }
1133
1134 # ifdef _LIBC
1135 printf ("%td:\tend of pattern.\n", p - start);
1136 # else
1137 printf ("%ld:\tend of pattern.\n", (long int) (p - start));
1138 # endif
1139 }
1140
1141
1142 void
1143 PREFIX(print_compiled_pattern) (bufp)
1144 struct re_pattern_buffer *bufp;
1145 {
1146 UCHAR_T *buffer = (UCHAR_T*) bufp->buffer;
1147
1148 PREFIX(print_partial_compiled_pattern) (buffer, buffer
1149 + bufp->used / sizeof(UCHAR_T));
1150 printf ("%ld bytes used/%ld bytes allocated.\n",
1151 bufp->used, bufp->allocated);
1152
1153 if (bufp->fastmap_accurate && bufp->fastmap)
1154 {
1155 printf ("fastmap: ");
1156 print_fastmap (bufp->fastmap);
1157 }
1158
1159 # ifdef _LIBC
1160 printf ("re_nsub: %Zd\t", bufp->re_nsub);
1161 # else
1162 printf ("re_nsub: %ld\t", (long int) bufp->re_nsub);
1163 # endif
1164 printf ("regs_alloc: %d\t", bufp->regs_allocated);
1165 printf ("can_be_null: %d\t", bufp->can_be_null);
1166 printf ("newline_anchor: %d\n", bufp->newline_anchor);
1167 printf ("no_sub: %d\t", bufp->no_sub);
1168 printf ("not_bol: %d\t", bufp->not_bol);
1169 printf ("not_eol: %d\t", bufp->not_eol);
1170 printf ("syntax: %lx\n", bufp->syntax);
1171 /* Perhaps we should print the translate table? */
1172 }
1173
1174
1175 void
1176 PREFIX(print_double_string) (where, string1, size1, string2, size2)
1177 const CHAR_T *where;
1178 const CHAR_T *string1;
1179 const CHAR_T *string2;
1180 int size1;
1181 int size2;
1182 {
1183 int this_char;
1184
1185 if (where == NULL)
1186 printf ("(null)");
1187 else
1188 {
1189 int cnt;
1190
1191 if (FIRST_STRING_P (where))
1192 {
1193 for (this_char = where - string1; this_char < size1; this_char++)
1194 PUT_CHAR (string1[this_char]);
1195
1196 where = string2;
1197 }
1198
1199 cnt = 0;
1200 for (this_char = where - string2; this_char < size2; this_char++)
1201 {
1202 PUT_CHAR (string2[this_char]);
1203 if (++cnt > 100)
1204 {
1205 fputs ("...", stdout);
1206 break;
1207 }
1208 }
1209 }
1210 }
1211
1212 # ifndef DEFINED_ONCE
/* Write the single character C to stderr (not stdout, unlike the
   pattern printers in this file).  */
void
printchar (c)
     int c;
{
  fputc (c, stderr);
}
1219 # endif
1220
1221 # else /* not DEBUG */
1222
/* Non-DEBUG build: every debugging hook expands to nothing, so calls to
   them vanish from the compiled code.  `assert' is neutralised the same
   way.  The hooks in the first group are defined only on the first
   inclusion (guarded by DEFINED_ONCE); the last two are defined on every
   inclusion.  */
#  ifndef DEFINED_ONCE
#   undef assert
#   define assert(expr)

#   define DEBUG_STATEMENT(stmt)
#   define DEBUG_PRINT1(fmt)
#   define DEBUG_PRINT2(fmt, a1)
#   define DEBUG_PRINT3(fmt, a1, a2)
#   define DEBUG_PRINT4(fmt, a1, a2, a3)
#  endif /* not DEFINED_ONCE */
#  define DEBUG_PRINT_COMPILED_PATTERN(bufp, start, end)
#  define DEBUG_PRINT_DOUBLE_STRING(where, str1, len1, str2, len2)
1235
1236 # endif /* not DEBUG */
1237
1238 \f
1239
1240 # ifdef WCHAR
1241 /* This convert a multibyte string to a wide character string.
1242 And write their correspondances to offset_buffer(see below)
1243 and write whether each wchar_t is binary data to is_binary.
1244 This assume invalid multibyte sequences as binary data.
1245 We assume offset_buffer and is_binary is already allocated
1246 enough space. */
1247
1248 static size_t convert_mbs_to_wcs (CHAR_T *dest, const unsigned char* src,
1249 size_t len, int *offset_buffer,
1250 char *is_binary);
1251 static size_t
1252 convert_mbs_to_wcs (dest, src, len, offset_buffer, is_binary)
1253 CHAR_T *dest;
1254 const unsigned char* src;
1255 size_t len; /* the length of multibyte string. */
1256
1257 /* It hold correspondances between src(char string) and
1258 dest(wchar_t string) for optimization.
1259 e.g. src = "xxxyzz"
1260 dest = {'X', 'Y', 'Z'}
1261 (each "xxx", "y" and "zz" represent one multibyte character
1262 corresponding to 'X', 'Y' and 'Z'.)
1263 offset_buffer = {0, 0+3("xxx"), 0+3+1("y"), 0+3+1+2("zz")}
1264 = {0, 3, 4, 6}
1265 */
1266 int *offset_buffer;
1267 char *is_binary;
1268 {
1269 wchar_t *pdest = dest;
1270 const unsigned char *psrc = src;
1271 size_t wc_count = 0;
1272
1273 mbstate_t mbs;
1274 int i, consumed;
1275 size_t mb_remain = len;
1276 size_t mb_count = 0;
1277
1278 /* Initialize the conversion state. */
1279 memset (&mbs, 0, sizeof (mbstate_t));
1280
1281 offset_buffer[0] = 0;
1282 for( ; mb_remain > 0 ; ++wc_count, ++pdest, mb_remain -= consumed,
1283 psrc += consumed)
1284 {
1285 consumed = mbrtowc (pdest, psrc, mb_remain, &mbs);
1286
1287 if (consumed <= 0)
1288 /* failed to convert. maybe src contains binary data.
1289 So we consume 1 byte manualy. */
1290 {
1291 *pdest = *psrc;
1292 consumed = 1;
1293 is_binary[wc_count] = TRUE;
1294 }
1295 else
1296 is_binary[wc_count] = FALSE;
1297 /* In sjis encoding, we use yen sign as escape character in
1298 place of reverse solidus. So we convert 0x5c(yen sign in
1299 sjis) to not 0xa5(yen sign in UCS2) but 0x5c(reverse
1300 solidus in UCS2). */
1301 if (consumed == 1 && (int) *psrc == 0x5c && (int) *pdest == 0xa5)
1302 *pdest = (wchar_t) *psrc;
1303
1304 offset_buffer[wc_count + 1] = mb_count += consumed;
1305 }
1306
1307 /* Fill remain of the buffer with sentinel. */
1308 for (i = wc_count + 1 ; i <= len ; i++)
1309 offset_buffer[i] = mb_count + 1;
1310
1311 return wc_count;
1312 }
1313
1314 # endif /* WCHAR */
1315
1316 #else /* not INSIDE_RECURSION */
1317
1318 /* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
1319 also be assigned to arbitrarily: each pattern buffer stores its own
1320 syntax, so it can be changed between regex compilations. */
1321 /* This has no initializer because initialized variables in Emacs
1322 become read-only after dumping. */
1323 reg_syntax_t re_syntax_options;
1324
1325
1326 /* Specify the precise syntax of regexps for compilation. This provides
1327 for compatibility for various utilities which historically have
1328 different, incompatible syntaxes.
1329
1330 The argument SYNTAX is a bit mask comprised of the various bits
1331 defined in regex.h. We return the old syntax. */
1332
1333 reg_syntax_t
1334 re_set_syntax (syntax)
1335 reg_syntax_t syntax;
1336 {
1337 reg_syntax_t ret = re_syntax_options;
1338
1339 re_syntax_options = syntax;
1340 # ifdef DEBUG
1341 if (syntax & RE_DEBUG)
1342 debug = 1;
1343 else if (debug) /* was on but now is not */
1344 debug = 0;
1345 # endif /* DEBUG */
1346 return ret;
1347 }
1348 # ifdef _LIBC
1349 weak_alias (__re_set_syntax, re_set_syntax)
1350 # endif
1351 \f
1352 /* This table gives an error message for each of the error codes listed
1353 in regex.h. Obviously the order here has to be same as there.
1354 POSIX doesn't require that we do anything for REG_NOERROR,
1355 but why not be nice? */
1356
1357 static const char re_error_msgid[] =
1358 {
1359 # define REG_NOERROR_IDX 0
1360 gettext_noop ("Success") /* REG_NOERROR */
1361 "\0"
1362 # define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
1363 gettext_noop ("No match") /* REG_NOMATCH */
1364 "\0"
1365 # define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
1366 gettext_noop ("Invalid regular expression") /* REG_BADPAT */
1367 "\0"
1368 # define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
1369 gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
1370 "\0"
1371 # define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
1372 gettext_noop ("Invalid character class name") /* REG_ECTYPE */
1373 "\0"
1374 # define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
1375 gettext_noop ("Trailing backslash") /* REG_EESCAPE */
1376 "\0"
1377 # define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
1378 gettext_noop ("Invalid back reference") /* REG_ESUBREG */
1379 "\0"
1380 # define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
1381 gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */
1382 "\0"
1383 # define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
1384 gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
1385 "\0"
1386 # define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
1387 gettext_noop ("Unmatched \\{") /* REG_EBRACE */
1388 "\0"
1389 # define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
1390 gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
1391 "\0"
1392 # define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
1393 gettext_noop ("Invalid range end") /* REG_ERANGE */
1394 "\0"
1395 # define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
1396 gettext_noop ("Memory exhausted") /* REG_ESPACE */
1397 "\0"
1398 # define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
1399 gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
1400 "\0"
1401 # define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
1402 gettext_noop ("Premature end of regular expression") /* REG_EEND */
1403 "\0"
1404 # define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
1405 gettext_noop ("Regular expression too big") /* REG_ESIZE */
1406 "\0"
1407 # define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
1408 gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
1409 };
1410
1411 static const size_t re_error_msgid_idx[] =
1412 {
1413 REG_NOERROR_IDX,
1414 REG_NOMATCH_IDX,
1415 REG_BADPAT_IDX,
1416 REG_ECOLLATE_IDX,
1417 REG_ECTYPE_IDX,
1418 REG_EESCAPE_IDX,
1419 REG_ESUBREG_IDX,
1420 REG_EBRACK_IDX,
1421 REG_EPAREN_IDX,
1422 REG_EBRACE_IDX,
1423 REG_BADBR_IDX,
1424 REG_ERANGE_IDX,
1425 REG_ESPACE_IDX,
1426 REG_BADRPT_IDX,
1427 REG_EEND_IDX,
1428 REG_ESIZE_IDX,
1429 REG_ERPAREN_IDX
1430 };
1431 \f
1432 #endif /* INSIDE_RECURSION */
1433
1434 #ifndef DEFINED_ONCE
/* Avoiding alloca during matching, to placate r_alloc.  */

/* MATCH_MAY_ALLOCATE is defined unless the searching and matching
   functions must be kept from calling alloca.  On some systems alloca
   is implemented in terms of malloc, and if the relocating allocator
   routines are in use, malloc could cause a relocation, which might (if
   the strings being searched are in the ralloc heap) shift the data out
   from underneath the regexp routines.

   A second reason to avoid allocation: Emacs processes input from X in
   a signal handler; processing X input may call malloc; if input
   arrives while a matching routine is calling malloc, the result is a
   mess.  Emacs cannot simply block input while calling matching
   routines, because interrupts would then go unnoticed.  So Emacs blocks
   input around all regexp calls except the matching calls, which it
   leaves unprotected in the faith that they will not malloc.  */

/* The normal case: allocation is fine.  */
# define MATCH_MAY_ALLOCATE

/* With GNU C the C alloca emulation is never really used, whatever
   config.h may say, so no precautions are needed for it.  */
# ifdef __GNUC__
#  undef C_ALLOCA
# endif

/* The match routines may not allocate if (1) they would do it with
   malloc and (2) it is not safe for them to use malloc.
   Note that if REL_ALLOC is defined, matching would not use malloc for
   the failure stack, but it would still be used for the register
   vectors; so REL_ALLOC should not affect this.  */
# if defined emacs && (defined C_ALLOCA || defined REGEX_MALLOC)
#  undef MATCH_MAY_ALLOCATE
# endif
1471 #endif /* not DEFINED_ONCE */
1472 \f
1473 #ifdef INSIDE_RECURSION
/* Failure stack declarations and macros.  Both re_compile_fastmap and
   re_match_2 use a failure stack.  These have to be macros because of
   REGEX_ALLOCATE_STACK.  */


/* How many failure-stack slots to allocate initially when matching.
   This is only a starting size, not a hard limit: the stack is grown
   when it fills up.  May be overridden before this point.  */
# ifndef INIT_FAILURE_ALLOC
#  define INIT_FAILURE_ALLOC 5
# endif
1485
1486 /* Roughly the maximum number of failure points on the stack. Would be
1487 exactly that if always used MAX_FAILURE_ITEMS items each time we failed.
1488 This is a variable only so users of regex can assign to it; we never
1489 change it ourselves. */
1490
1491 # ifdef INT_IS_16BIT
1492
1493 # ifndef DEFINED_ONCE
1494 # if defined MATCH_MAY_ALLOCATE
1495 /* 4400 was enough to cause a crash on Alpha OSF/1,
1496 whose default stack limit is 2mb. */
1497 long int re_max_failures = 4000;
1498 # else
1499 long int re_max_failures = 2000;
1500 # endif
1501 # endif
1502
1503 union PREFIX(fail_stack_elt)
1504 {
1505 UCHAR_T *pointer;
1506 long int integer;
1507 };
1508
1509 typedef union PREFIX(fail_stack_elt) PREFIX(fail_stack_elt_t);
1510
1511 typedef struct
1512 {
1513 PREFIX(fail_stack_elt_t) *stack;
1514 unsigned long int size;
1515 unsigned long int avail; /* Offset of next open position. */
1516 } PREFIX(fail_stack_type);
1517
1518 # else /* not INT_IS_16BIT */
1519
1520 # ifndef DEFINED_ONCE
1521 # if defined MATCH_MAY_ALLOCATE
1522 /* 4400 was enough to cause a crash on Alpha OSF/1,
1523 whose default stack limit is 2mb. */
1524 int re_max_failures = 4000;
1525 # else
1526 int re_max_failures = 2000;
1527 # endif
1528 # endif
1529
1530 union PREFIX(fail_stack_elt)
1531 {
1532 UCHAR_T *pointer;
1533 int integer;
1534 };
1535
1536 typedef union PREFIX(fail_stack_elt) PREFIX(fail_stack_elt_t);
1537
1538 typedef struct
1539 {
1540 PREFIX(fail_stack_elt_t) *stack;
1541 unsigned size;
1542 unsigned avail; /* Offset of next open position. */
1543 } PREFIX(fail_stack_type);
1544
1545 # endif /* INT_IS_16BIT */
1546
/* State predicates on the failure stack.  The first and third assume a
   variable named `fail_stack' is in scope, the second a pointer named
   `fail_stack_ptr'.  */
# ifndef DEFINED_ONCE
#  define FAIL_STACK_EMPTY()     (fail_stack.avail == 0)
#  define FAIL_STACK_PTR_EMPTY() (fail_stack_ptr->avail == 0)
#  define FAIL_STACK_FULL()      (fail_stack.avail == fail_stack.size)
# endif
1552
1553
/* Macros that set up and tear down the failure stack held in the
   variable `fail_stack'.

   When matching may allocate, INIT_FAIL_STACK obtains room for
   INIT_FAILURE_ALLOC slots through REGEX_ALLOCATE_STACK and makes the
   enclosing function `return -2' if that fails; RESET_FAIL_STACK hands
   the storage to REGEX_FREE_STACK.

   Otherwise INIT_FAIL_STACK only marks the stack as empty and
   RESET_FAIL_STACK does nothing.  */

# ifdef MATCH_MAY_ALLOCATE
#  define INIT_FAIL_STACK() \
  do \
    { \
      fail_stack.stack = (PREFIX(fail_stack_elt_t) *) \
	REGEX_ALLOCATE_STACK (INIT_FAILURE_ALLOC * sizeof (PREFIX(fail_stack_elt_t))); \
 \
      if (fail_stack.stack == NULL) \
	return -2; \
 \
      fail_stack.size = INIT_FAILURE_ALLOC; \
      fail_stack.avail = 0; \
    } \
  while (0)

#  define RESET_FAIL_STACK() REGEX_FREE_STACK (fail_stack.stack)
# else
#  define INIT_FAIL_STACK() \
  do \
    { \
      fail_stack.avail = 0; \
    } \
  while (0)

#  define RESET_FAIL_STACK()
# endif
1579
1580
/* Double the capacity of the failure stack STK, up to approximately
   `re_max_failures' items.

   The whole macro is a single expression.  It evaluates to 1 on
   success and to 0 when the stack was already larger than
   re_max_failures * MAX_FAILURE_ITEMS slots, or when the reallocation
   yielded NULL.  The `size' member is doubled only after the new
   storage is known to be non-null.

   REGEX_REALLOCATE_STACK requires `destination' be declared.  */

# define DOUBLE_FAIL_STACK(stk) \
  ((stk).size > (unsigned) (re_max_failures * MAX_FAILURE_ITEMS) \
   ? 0 \
   : ((stk).stack = (PREFIX(fail_stack_elt_t) *) \
	REGEX_REALLOCATE_STACK ((stk).stack, \
		 (stk).size * sizeof (PREFIX(fail_stack_elt_t)), \
		 ((stk).size << 1) * sizeof (PREFIX(fail_stack_elt_t))), \
 \
      (stk).stack == NULL \
      ? 0 \
      : ((stk).size <<= 1, \
	 1)))
1600
1601
/* Push the pointer PTR onto the failure stack STK, growing the stack
   first when it is full.  Evaluates to 1 if the push was done and to 0
   if the stack was full and could not be doubled.  Note that the
   fullness test goes through FAIL_STACK_FULL, which looks at the
   variable `fail_stack'.  */
# define PUSH_PATTERN_OP(PTR, STK) \
  ((FAIL_STACK_FULL () && !DOUBLE_FAIL_STACK (STK)) \
   ? 0 \
   : ((STK).stack[(STK).avail++].pointer = PTR, \
      1))
1611
/* Primitive push operations on the variable `fail_stack'.  None of them
   checks for room, so they should probably only be used from within
   `PUSH_FAILURE_POINT', which reserves the space beforehand.  */

/* Store VAL, converted to `UCHAR_T *', in the next free slot.  */
# define PUSH_FAILURE_POINTER(val) \
  fail_stack.stack[fail_stack.avail++].pointer = (UCHAR_T *) (val)

/* Store the integer-valued VAL in the next free slot.  */
# define PUSH_FAILURE_INT(val) \
  fail_stack.stack[fail_stack.avail++].integer = (val)

/* Store a complete fail_stack_elt_t value in the next free slot.  */
# define PUSH_FAILURE_ELT(val) \
  fail_stack.stack[fail_stack.avail++] = (val)

/* The three matching pop operations.  Each one steps `avail' back by
   one slot and yields what is stored there; all of them assume that
   `fail_stack' is nonempty.  */
# define POP_FAILURE_POINTER() fail_stack.stack[--fail_stack.avail].pointer
# define POP_FAILURE_INT()     fail_stack.stack[--fail_stack.avail].integer
# define POP_FAILURE_ELT()     fail_stack.stack[--fail_stack.avail]
1635
/* Failure point ids are pushed and popped only in DEBUG builds; in
   other builds both macros expand to nothing.  */
# ifdef DEBUG
#  define DEBUG_PUSH PUSH_FAILURE_INT
#  define DEBUG_POP(id_addr) *(id_addr) = POP_FAILURE_INT ()
# else
#  define DEBUG_PUSH(id)
#  define DEBUG_POP(id_addr)
# endif
1644
1645
/* Save on the failure stack the state needed to resume matching from
   PATTERN_PLACE / STRING_PLACE if the current path fails.

   Slots are pushed in this order: for each register from
   lowest_active_reg through highest_active_reg its start, its end and
   its info word; then lowest_active_reg, highest_active_reg,
   PATTERN_PLACE and STRING_PLACE; and, in DEBUG builds only, the
   failure id.  POP_FAILURE_POINT undoes this in reverse.

   The stack is doubled as often as needed beforehand so that all
   NUM_FAILURE_ITEMS slots fit.

   Requires variables fail_stack, regstart, regend, reg_info, and
   num_regs_pushed be declared.  DOUBLE_FAIL_STACK requires
   `destination' be declared, which is why it is declared locally.

   Does `return FAILURE_CODE' if the stack cannot be grown.  */

# define PUSH_FAILURE_POINT(pattern_place, string_place, failure_code) \
  do \
    { \
      char *destination; \
      /* Must be int, so when we don't save any registers, the arithmetic \
	 of 0 + -1 isn't done as unsigned.  */ \
      /* Can't be int, since there is not a shred of a guarantee that int \
	 is wide enough to hold a value of something to which pointer can \
	 be assigned */ \
      active_reg_t this_reg; \
 \
      DEBUG_STATEMENT (failure_id++); \
      DEBUG_STATEMENT (nfailure_points_pushed++); \
      DEBUG_PRINT2 ("\nPUSH_FAILURE_POINT #%u:\n", failure_id); \
      DEBUG_PRINT2 (" Before push, next avail: %d\n", (fail_stack).avail); \
      DEBUG_PRINT2 (" size: %d\n", (fail_stack).size); \
 \
      DEBUG_PRINT2 (" slots needed: %ld\n", NUM_FAILURE_ITEMS); \
      DEBUG_PRINT2 (" available: %d\n", REMAINING_AVAIL_SLOTS); \
 \
      /* Reserve room for everything pushed below.  */ \
      while (REMAINING_AVAIL_SLOTS < NUM_FAILURE_ITEMS) \
	{ \
	  if (!DOUBLE_FAIL_STACK (fail_stack)) \
	    return failure_code; \
 \
	  DEBUG_PRINT2 ("\n Doubled stack; size now: %d\n", \
			(fail_stack).size); \
	  DEBUG_PRINT2 (" slots available: %d\n", REMAINING_AVAIL_SLOTS); \
	} \
 \
      /* Registers go first.  */ \
      DEBUG_PRINT1 ("\n"); \
 \
      if (1) \
	for (this_reg = lowest_active_reg; this_reg <= highest_active_reg; \
	     this_reg++) \
	  { \
	    DEBUG_PRINT2 (" Pushing reg: %lu\n", this_reg); \
	    DEBUG_STATEMENT (num_regs_pushed++); \
 \
	    DEBUG_PRINT2 (" start: %p\n", regstart[this_reg]); \
	    PUSH_FAILURE_POINTER (regstart[this_reg]); \
 \
	    DEBUG_PRINT2 (" end: %p\n", regend[this_reg]); \
	    PUSH_FAILURE_POINTER (regend[this_reg]); \
 \
	    DEBUG_PRINT2 (" info: %p\n ", \
			  reg_info[this_reg].word.pointer); \
	    DEBUG_PRINT2 (" match_null=%d", \
			  REG_MATCH_NULL_STRING_P (reg_info[this_reg])); \
	    DEBUG_PRINT2 (" active=%d", IS_ACTIVE (reg_info[this_reg])); \
	    DEBUG_PRINT2 (" matched_something=%d", \
			  MATCHED_SOMETHING (reg_info[this_reg])); \
	    DEBUG_PRINT2 (" ever_matched=%d", \
			  EVER_MATCHED_SOMETHING (reg_info[this_reg])); \
	    DEBUG_PRINT1 ("\n"); \
	    PUSH_FAILURE_ELT (reg_info[this_reg].word); \
	  } \
 \
      /* Then the active register range ...  */ \
      DEBUG_PRINT2 (" Pushing low active reg: %ld\n", lowest_active_reg); \
      PUSH_FAILURE_INT (lowest_active_reg); \
 \
      DEBUG_PRINT2 (" Pushing high active reg: %ld\n", highest_active_reg); \
      PUSH_FAILURE_INT (highest_active_reg); \
 \
      /* ... and finally the pattern and string positions.  */ \
      DEBUG_PRINT2 (" Pushing pattern %p:\n", pattern_place); \
      DEBUG_PRINT_COMPILED_PATTERN (bufp, pattern_place, pend); \
      PUSH_FAILURE_POINTER (pattern_place); \
 \
      DEBUG_PRINT2 (" Pushing string %p: `", string_place); \
      DEBUG_PRINT_DOUBLE_STRING (string_place, string1, size1, string2, \
				 size2); \
      DEBUG_PRINT1 ("'\n"); \
      PUSH_FAILURE_POINTER (string_place); \
 \
      DEBUG_PRINT2 (" Pushing failure id: %u\n", failure_id); \
      DEBUG_PUSH (failure_id); \
    } \
  while (0)
1733
# ifndef DEFINED_ONCE
/* Slots pushed and popped on the failure stack for each register
   (start, end and info word).  */
#  define NUM_REG_ITEMS 3

/* Slots pushed per failure point that do not belong to a register.  */
#  ifdef DEBUG
#   define NUM_NONREG_ITEMS 5 /* Includes failure point id.  */
#  else
#   define NUM_NONREG_ITEMS 4
#  endif

/* Upper bound used when sizing the stack.  */
/* This used to be based on (num_regs - 1), the number of registers the
   regexp will save; it was changed to a fixed 5 registers to avoid
   stack overflow for a regexp with lots of parens.  */
#  define MAX_FAILURE_ITEMS (5 * NUM_REG_ITEMS + NUM_NONREG_ITEMS)

/* Slots actually pushed for one failure point, given the current
   active register range.  */
#  define NUM_FAILURE_ITEMS \
  (((highest_active_reg - lowest_active_reg + 1) * NUM_REG_ITEMS) \
   + NUM_NONREG_ITEMS)

/* How many more slots fit on the stack before it has to grow.  */
#  define REMAINING_AVAIL_SLOTS ((fail_stack).size - (fail_stack).avail)
# endif /* not DEFINED_ONCE */
1762
1763
/* Pop what PUSH_FAILURE_POINT pushed, in the reverse order.

   The saved state is restored into the parameters, all of which should
   be lvalues:
     STR -- the saved data position.
     PAT -- the saved pattern position.
     LOW_REG, HIGH_REG -- the lowest and highest active registers.
     REGSTART, REGEND -- arrays of string positions.
     REG_INFO -- array of information about each subexpression.

   A saved string position of NULL leaves STR as it is.

   Also assumes the variables `fail_stack', `set_regs_matched_done' and
   (if debugging) `bufp', `pend', `string1', `size1', `string2', and
   `size2'.  */
# define POP_FAILURE_POINT(str, pat, low_reg, high_reg, regstart, regend, reg_info) \
{ \
  DEBUG_STATEMENT (unsigned failure_id;) \
  active_reg_t this_reg; \
  const UCHAR_T *string_temp; \
 \
  assert (!FAIL_STACK_EMPTY ()); \
 \
  /* Remove failure points and point to how many regs pushed.  */ \
  DEBUG_PRINT1 ("POP_FAILURE_POINT:\n"); \
  DEBUG_PRINT2 (" Before pop, next avail: %d\n", fail_stack.avail); \
  DEBUG_PRINT2 (" size: %d\n", fail_stack.size); \
 \
  assert (fail_stack.avail >= NUM_NONREG_ITEMS); \
 \
  DEBUG_POP (&failure_id); \
  DEBUG_PRINT2 (" Popping failure id: %u\n", failure_id); \
 \
  /* If the saved string location is NULL, it came from an \
     on_failure_keep_string_jump opcode, and we want to throw away the \
     saved NULL, thus retaining our current position in the string.  */ \
  string_temp = POP_FAILURE_POINTER (); \
  if (string_temp != NULL) \
    str = (const CHAR_T *) string_temp; \
 \
  DEBUG_PRINT2 (" Popping string %p: `", str); \
  DEBUG_PRINT_DOUBLE_STRING (str, string1, size1, string2, size2); \
  DEBUG_PRINT1 ("'\n"); \
 \
  pat = (UCHAR_T *) POP_FAILURE_POINTER (); \
  DEBUG_PRINT2 (" Popping pattern %p:\n", pat); \
  DEBUG_PRINT_COMPILED_PATTERN (bufp, pat, pend); \
 \
  /* Restore register info: the range first, then each register from \
     the highest down to the lowest.  */ \
  high_reg = (active_reg_t) POP_FAILURE_INT (); \
  DEBUG_PRINT2 (" Popping high active reg: %ld\n", high_reg); \
 \
  low_reg = (active_reg_t) POP_FAILURE_INT (); \
  DEBUG_PRINT2 (" Popping low active reg: %ld\n", low_reg); \
 \
  if (1) \
    for (this_reg = high_reg; this_reg >= low_reg; this_reg--) \
      { \
	DEBUG_PRINT2 (" Popping reg: %ld\n", this_reg); \
 \
	reg_info[this_reg].word = POP_FAILURE_ELT (); \
	DEBUG_PRINT2 (" info: %p\n", \
		      reg_info[this_reg].word.pointer); \
 \
	regend[this_reg] = (const CHAR_T *) POP_FAILURE_POINTER (); \
	DEBUG_PRINT2 (" end: %p\n", regend[this_reg]); \
 \
	regstart[this_reg] = (const CHAR_T *) POP_FAILURE_POINTER (); \
	DEBUG_PRINT2 (" start: %p\n", regstart[this_reg]); \
      } \
  else \
    { \
      /* Never taken while the condition above is the constant 1.  */ \
      for (this_reg = highest_active_reg; this_reg > high_reg; this_reg--) \
	{ \
	  reg_info[this_reg].word.integer = 0; \
	  regend[this_reg] = 0; \
	  regstart[this_reg] = 0; \
	} \
      highest_active_reg = high_reg; \
    } \
 \
  set_regs_matched_done = 0; \
  DEBUG_STATEMENT (nfailure_points_popped++); \
} /* POP_FAILURE_POINT */
1844 \f
1845 /* Structure for per-register (a.k.a. per-group) information.
1846 Other register information, such as the
1847 starting and ending positions (which are addresses), and the list of
1848 inner groups (which is a bits list) are maintained in separate
1849 variables.
1850
1851 We are making a (strictly speaking) nonportable assumption here: that
1852 the compiler will pack our bit fields into something that fits into
1853 the type of `word', i.e., is something that fits into one item on the
1854 failure stack. */
1855
1856
1857 /* Declarations and macros for re_match_2. */
1858
1859 typedef union
1860 {
1861 PREFIX(fail_stack_elt_t) word;
1862 struct
1863 {
1864 /* This field is one if this group can match the empty string,
1865 zero if not. If not yet determined, `MATCH_NULL_UNSET_VALUE'. */
1866 # define MATCH_NULL_UNSET_VALUE 3
1867 unsigned match_null_string_p : 2;
1868 unsigned is_active : 1;
1869 unsigned matched_something : 1;
1870 unsigned ever_matched_something : 1;
1871 } bits;
1872 } PREFIX(register_info_type);
1873
# ifndef DEFINED_ONCE
/* Accessors for the bit fields of a register_info_type R.  */
#  define REG_MATCH_NULL_STRING_P(R) ((R).bits.match_null_string_p)
#  define IS_ACTIVE(R) ((R).bits.is_active)
#  define MATCHED_SOMETHING(R) ((R).bits.matched_something)
#  define EVER_MATCHED_SOMETHING(R) ((R).bits.ever_matched_something)


/* Call this once a real character has been matched.  It sets both
   `matched' flags of every register from lowest_active_reg through
   highest_active_reg, i.e. of the subexpressions we are currently
   inside.  The work is skipped when `set_regs_matched_done' is already
   set.  */
#  define SET_REGS_MATCHED() \
  do \
    { \
      if (!set_regs_matched_done) \
	{ \
	  active_reg_t r; \
	  set_regs_matched_done = 1; \
	  for (r = lowest_active_reg; r <= highest_active_reg; r++) \
	    { \
	      EVER_MATCHED_SOMETHING (reg_info[r]) = 1; \
	      MATCHED_SOMETHING (reg_info[r]) = 1; \
	    } \
	} \
    } \
  while (0)
# endif /* not DEFINED_ONCE */
1901
1902 /* Registers are set to a sentinel when they haven't yet matched. */
1903 static CHAR_T PREFIX(reg_unset_dummy);
1904 # define REG_UNSET_VALUE (&PREFIX(reg_unset_dummy))
1905 # define REG_UNSET(e) ((e) == REG_UNSET_VALUE)
1906
1907 /* Subroutine declarations and macros for regex_compile. */
1908 static void PREFIX(store_op1) _RE_ARGS ((re_opcode_t op, UCHAR_T *loc, int arg));
1909 static void PREFIX(store_op2) _RE_ARGS ((re_opcode_t op, UCHAR_T *loc,
1910 int arg1, int arg2));
1911 static void PREFIX(insert_op1) _RE_ARGS ((re_opcode_t op, UCHAR_T *loc,
1912 int arg, UCHAR_T *end));
1913 static void PREFIX(insert_op2) _RE_ARGS ((re_opcode_t op, UCHAR_T *loc,
1914 int arg1, int arg2, UCHAR_T *end));
1915 static boolean PREFIX(at_begline_loc_p) _RE_ARGS ((const CHAR_T *pattern,
1916 const CHAR_T *p,
1917 reg_syntax_t syntax));
1918 static boolean PREFIX(at_endline_loc_p) _RE_ARGS ((const CHAR_T *p,
1919 const CHAR_T *pend,
1920 reg_syntax_t syntax));
1921 # ifdef WCHAR
1922 static reg_errcode_t wcs_compile_range _RE_ARGS ((CHAR_T range_start,
1923 const CHAR_T **p_ptr,
1924 const CHAR_T *pend,
1925 char *translate,
1926 reg_syntax_t syntax,
1927 UCHAR_T *b,
1928 CHAR_T *char_set));
1929 static void insert_space _RE_ARGS ((int num, CHAR_T *loc, CHAR_T *end));
1930 # else /* BYTE */
1931 static reg_errcode_t byte_compile_range _RE_ARGS ((unsigned int range_start,
1932 const char **p_ptr,
1933 const char *pend,
1934 char *translate,
1935 reg_syntax_t syntax,
1936 unsigned char *b));
1937 # endif /* WCHAR */
1938
/* Fetch the next character of the uncompiled pattern into CH, mapping
   it through `translate' when a table is present.  The enclosing
   function returns REG_EEND if the pattern is exhausted (p == pend).
   The character is also cast from the possibly signed character of the
   user's string to an unsigned type usable as an array index (in, e.g.,
   `translate').  */
/* In the wide-character build only characters <= 0xff are translated,
   because it is impossible to allocate a 4GB array for encodings with
   4-byte characters such as UCS4.  */
# ifndef PATFETCH
#  ifdef WCHAR
#   define PATFETCH(ch) \
  do \
    { \
      if (p == pend) return REG_EEND; \
      ch = (UCHAR_T) *p++; \
      if (translate && (ch <= 0xff)) ch = (UCHAR_T) translate[ch]; \
    } \
  while (0)
#  else /* BYTE */
#   define PATFETCH(ch) \
  do \
    { \
      if (p == pend) return REG_EEND; \
      ch = (unsigned char) *p++; \
      if (translate) ch = (unsigned char) translate[ch]; \
    } \
  while (0)
#  endif /* WCHAR */
# endif
1961
/* Like PATFETCH but without any translation: fetch the next pattern
   character into CH, or make the enclosing function return REG_EEND
   when the pattern is exhausted.  */
# define PATFETCH_RAW(ch) \
  do \
    { \
      if (p == pend) return REG_EEND; \
      ch = (UCHAR_T) *p++; \
    } \
  while (0)

/* Step back one character in the pattern.  */
# define PATUNFETCH p--
1971
1972
/* Yield translate[D] when `translate' is non-null, and plain D
   otherwise.  The subscript is cast because some data is declared as
   `char *' (to avoid warnings when a string constant is passed), yet a
   character used as a subscript must be unsigned.  */
/* In the wide-character build only characters <= 0xff are translated,
   because it is impossible to allocate a 4GB array for encodings with
   4-byte characters such as UCS4.  */

# ifndef TRANSLATE
#  ifdef WCHAR
#   define TRANSLATE(d) \
  ((translate && ((UCHAR_T) (d)) <= 0xff) \
   ? (char) translate[(unsigned char) (d)] \
   : (d))
#  else /* BYTE */
#   define TRANSLATE(d) \
  (translate \
   ? (char) translate[(unsigned char) (d)] \
   : (d))
#  endif /* WCHAR */
# endif
1991
1992
/* Macros for outputting the compiled pattern into `buffer'.  */

/* Size in bytes given to a pattern buffer that arrives unallocated:
   room for 32 elements of type UCHAR_T.  */
# define INIT_BUF_SIZE (sizeof (UCHAR_T) * 32)
1997
/* Keep calling EXTEND_BUFFER until the compiled pattern has room for N
   more elements past `b'.  In the BYTE build an element is a byte; in the
   WCHAR build N counts CHAR_T elements and the comparison is done in
   bytes against bufp->allocated.  */
# ifndef WCHAR
/* BYTE */
#  define GET_BUFFER_SPACE(n) \
    while (bufp->allocated < (unsigned long) (b - bufp->buffer + (n))) \
      EXTEND_BUFFER ()
# else /* WCHAR */
#  define GET_BUFFER_SPACE(n) \
    while (bufp->allocated \
           < ((unsigned long)b - (unsigned long)COMPILED_BUFFER_VAR \
              + (n)*sizeof(CHAR_T))) \
      EXTEND_BUFFER ()
# endif /* WCHAR */
2009
/* Reserve room for one more element, store C (converted to UCHAR_T) at
   `b', and advance `b' past it.  */
# define BUF_PUSH(c) \
  do \
    { \
      GET_BUFFER_SPACE (1); \
      *b = (UCHAR_T) (c); \
      b++; \
    } \
  while (0)
2016
2017
/* Reserve room for two more elements, then append C1 followed by C2,
   advancing `b' after each store.  */
# define BUF_PUSH_2(c1, c2) \
  do \
    { \
      GET_BUFFER_SPACE (2); \
      *b = (UCHAR_T) (c1); \
      b++; \
      *b = (UCHAR_T) (c2); \
      b++; \
    } \
  while (0)
2025
2026
/* Reserve room for three more elements, then append C1, C2 and C3 in
   that order, advancing `b' after each store.  */
# define BUF_PUSH_3(c1, c2, c3) \
  do \
    { \
      GET_BUFFER_SPACE (3); \
      *b = (UCHAR_T) (c1); \
      b++; \
      *b = (UCHAR_T) (c2); \
      b++; \
      *b = (UCHAR_T) (c3); \
      b++; \
    } \
  while (0)
2035
/* Store a jump with opcode OP at LOC to location TO.  The offset handed
   to store_op1 is relative: the distance from LOC to TO minus the
   1 + OFFSET_ADDRESS_SIZE elements subtracted for the jump itself.  */
# define STORE_JUMP(op, loc, to) \
  PREFIX(store_op1) (op, loc, \
                     (int) (((to) - (loc)) - (OFFSET_ADDRESS_SIZE + 1)))

/* Likewise, for a jump that carries the extra argument ARG.  */
# define STORE_JUMP2(op, loc, to, arg) \
  PREFIX(store_op2) (op, loc, \
                     (int) (((to) - (loc)) - (OFFSET_ADDRESS_SIZE + 1)), \
                     arg)

/* Like `STORE_JUMP', but for inserting.  Assume `b' is the buffer end.  */
# define INSERT_JUMP(op, loc, to) \
  PREFIX(insert_op1) (op, loc, \
                      (int) (((to) - (loc)) - (OFFSET_ADDRESS_SIZE + 1)), \
                      b)

/* Like `STORE_JUMP2', but for inserting.  Assume `b' is the buffer end.  */
# define INSERT_JUMP2(op, loc, to, arg) \
  PREFIX(insert_op2) (op, loc, \
                      (int) (((to) - (loc)) - (OFFSET_ADDRESS_SIZE + 1)), \
                      arg, b)
2053
/* MAX_BUF_SIZE is not an arbitrary limit: the arguments that represent
   offsets into the pattern are two bytes long, so if 2^16 bytes turns out
   to be too small, many things would have to change.  */
/* Any other compiler that, like MSC, has an allocation limit below 2^16
   bytes must use an approach similar to the one taken for MSC below and
   lower MAX_BUF_SIZE a little.  Otherwise you may end up reallocating to
   0 bytes, which is not going to work well.  You have been warned!  */
# ifndef DEFINED_ONCE
#  if !defined _MSC_VER || defined WIN32
#   define MAX_BUF_SIZE (1L << 16)
#   define REALLOC(p,s) realloc ((p), (s))
#  else
/* Microsoft C 16-bit versions limit malloc to approx 65512 bytes.
   The size_t cast in REALLOC eliminates a flurry of conversion warnings,
   but is not required.  */
#   define MAX_BUF_SIZE 65500L
#   define REALLOC(p,s) realloc ((p), (size_t) (s))
#  endif

/* Helpers for EXTEND_BUFFER, which doubles the buffer via realloc and
   then resets the pointers that pointed into the old block so they point
   to the corresponding places in the new one.  Growth beyond MAX_BUF_SIZE
   is reported as an error by EXTEND_BUFFER.

   MOVE_BUFFER_POINTER (P) shifts P by `incr', a variable the expansion
   site must provide.  With bounded pointers the low and high bounds of P
   are updated too, and ELSE_EXTEND_BUFFER_HIGH_BOUND supplies the `else'
   branch that refreshes the high bounds when the block did not move.  */
#  if __BOUNDED_POINTERS__
#   define SET_HIGH_BOUND(P) (__ptrhigh (P) = __ptrlow (P) + bufp->allocated)
#   define MOVE_BUFFER_POINTER(P) \
  (__ptrlow (P) += incr, SET_HIGH_BOUND (P), __ptrvalue (P) += incr)
#   define ELSE_EXTEND_BUFFER_HIGH_BOUND \
  else \
    { \
      SET_HIGH_BOUND (b); \
      SET_HIGH_BOUND (begalt); \
      if (fixup_alt_jump) \
        SET_HIGH_BOUND (fixup_alt_jump); \
      if (laststart) \
        SET_HIGH_BOUND (laststart); \
      if (pending_exact) \
        SET_HIGH_BOUND (pending_exact); \
    }
#  else
#   define MOVE_BUFFER_POINTER(P) ((P) += incr)
#   define ELSE_EXTEND_BUFFER_HIGH_BOUND
#  endif
# endif /* not DEFINED_ONCE */
2099
/* EXTEND_BUFFER () doubles bufp->allocated (clamped to MAX_BUF_SIZE),
   reallocates the compiled-pattern buffer to that size and, if the block
   moved, relocates every pointer into it: `b', `begalt' and, when they
   are non-null, `fixup_alt_jump', `laststart' and `pending_exact'.

   It makes the enclosing function return REG_ESIZE when the buffer
   cannot grow any further and REG_ESPACE when the reallocation fails.

   The relocation distance `incr' is a ptrdiff_t.  It used to be an int,
   which truncates the difference between the old and new block whenever
   they lie more than INT_MAX elements apart (possible on 64-bit hosts)
   and then corrupts all the relocated pointers.  */
# ifdef WCHAR
#  define EXTEND_BUFFER() \
  do { \
    UCHAR_T *old_buffer = COMPILED_BUFFER_VAR; \
    int wchar_count; \
    if (bufp->allocated + sizeof(UCHAR_T) > MAX_BUF_SIZE) \
      return REG_ESIZE; \
    bufp->allocated <<= 1; \
    if (bufp->allocated > MAX_BUF_SIZE) \
      bufp->allocated = MAX_BUF_SIZE; \
    /* How many characters can the new buffer hold?  */ \
    wchar_count = bufp->allocated / sizeof(UCHAR_T); \
    if (wchar_count == 0) wchar_count = 1; \
    /* Truncate the byte size to a whole number of characters.  */ \
    bufp->allocated = wchar_count * sizeof(UCHAR_T); \
    RETALLOC (COMPILED_BUFFER_VAR, wchar_count, UCHAR_T); \
    bufp->buffer = (char*)COMPILED_BUFFER_VAR; \
    if (COMPILED_BUFFER_VAR == NULL) \
      return REG_ESPACE; \
    /* If the buffer moved, move all the pointers into it.  */ \
    if (old_buffer != COMPILED_BUFFER_VAR) \
      { \
        ptrdiff_t incr = COMPILED_BUFFER_VAR - old_buffer; \
        MOVE_BUFFER_POINTER (b); \
        MOVE_BUFFER_POINTER (begalt); \
        if (fixup_alt_jump) \
          MOVE_BUFFER_POINTER (fixup_alt_jump); \
        if (laststart) \
          MOVE_BUFFER_POINTER (laststart); \
        if (pending_exact) \
          MOVE_BUFFER_POINTER (pending_exact); \
      } \
    ELSE_EXTEND_BUFFER_HIGH_BOUND \
  } while (0)
# else /* BYTE */
#  define EXTEND_BUFFER() \
  do { \
    UCHAR_T *old_buffer = COMPILED_BUFFER_VAR; \
    if (bufp->allocated == MAX_BUF_SIZE) \
      return REG_ESIZE; \
    bufp->allocated <<= 1; \
    if (bufp->allocated > MAX_BUF_SIZE) \
      bufp->allocated = MAX_BUF_SIZE; \
    bufp->buffer = (UCHAR_T *) REALLOC (COMPILED_BUFFER_VAR, \
                                        bufp->allocated); \
    if (COMPILED_BUFFER_VAR == NULL) \
      return REG_ESPACE; \
    /* If the buffer moved, move all the pointers into it.  */ \
    if (old_buffer != COMPILED_BUFFER_VAR) \
      { \
        ptrdiff_t incr = COMPILED_BUFFER_VAR - old_buffer; \
        MOVE_BUFFER_POINTER (b); \
        MOVE_BUFFER_POINTER (begalt); \
        if (fixup_alt_jump) \
          MOVE_BUFFER_POINTER (fixup_alt_jump); \
        if (laststart) \
          MOVE_BUFFER_POINTER (laststart); \
        if (pending_exact) \
          MOVE_BUFFER_POINTER (pending_exact); \
      } \
    ELSE_EXTEND_BUFFER_HIGH_BOUND \
  } while (0)
# endif /* WCHAR */
2163
# ifndef DEFINED_ONCE
/* One byte is reserved for the register number argument to
   {start,stop}_memory, so the number of groups we can report things
   about is limited to what fits in that byte.  */
#  define MAX_REGNUM 255

/* Patterns may nevertheless contain more than `MAX_REGNUM' registers;
   the excess is simply ignored.  */
typedef unsigned int regnum_t;


/* Macros for the compile stack.  */

/* Offsets can point forwards or backwards, so this type has to hold
   values from -(MAX_BUF_SIZE - 1) to MAX_BUF_SIZE - 1.  An int may be
   too small for that when sizeof (int) == 2.  */
typedef long pattern_offset_t;

/* One compile-stack entry: four offsets into the compiled pattern plus
   a register number.  */
typedef struct
{
  pattern_offset_t begalt_offset, fixup_alt_jump;
  pattern_offset_t inner_group_offset, laststart_offset;
  regnum_t regnum;
} compile_stack_elt_t;


/* The compile stack: an array of SIZE entries, the first AVAIL of which
   are in use.  */
typedef struct
{
  compile_stack_elt_t *stack;
  unsigned size;
  unsigned avail;			/* Offset of next open position.  */
} compile_stack_type;


#  define INIT_COMPILE_STACK_SIZE 32

#  define COMPILE_STACK_EMPTY (0 == compile_stack.avail)
#  define COMPILE_STACK_FULL  (compile_stack.size == compile_stack.avail)

/* The next available element.  */
#  define COMPILE_STACK_TOP (*(compile_stack.stack + compile_stack.avail))

# endif /* not DEFINED_ONCE */
2209
/* Set the bit for character C in the bit list that starts at `b'.
   C is reduced to an unsigned char; byte C / BYTEWIDTH of the list gets
   bit C % BYTEWIDTH set.

   Both uses of C are parenthesized before the cast.  Without that, an
   expression argument such as `x - 1' had only its first operand cast,
   which could yield a negative shift count.  */
# ifndef DEFINED_ONCE
#  define SET_LIST_BIT(c) \
  (b[((unsigned char) (c)) / BYTEWIDTH] \
   |= 1 << (((unsigned char) (c)) % BYTEWIDTH))
# endif /* DEFINED_ONCE */
2216
/* Read decimal digits from the uncompiled pattern into NUM.

   Characters are fetched with PATFETCH into `c' until the pattern ends
   or a non-digit has been fetched; that non-digit is left in `c' and is
   already consumed.  A negative NUM is reset to 0 before the first digit
   is added.  Once NUM exceeds RE_DUP_MAX further digits are still
   consumed but no longer accumulated.  */
# define GET_UNSIGNED_NUMBER(num) \
  { \
    for (;;) \
      { \
        if (p == pend) \
          break; \
        PATFETCH (c); \
        if (!(c >= '0' && c <= '9')) \
          break; \
        if (num > RE_DUP_MAX) \
          continue; \
        if (num < 0) \
          num = 0; \
        num = num * 10 + c - '0'; \
      } \
  }
2233
# ifndef DEFINED_ONCE
#  if !(defined _LIBC || WIDE_CHAR_SUPPORT)
/* No wide character support: only the twelve class names listed below
   are recognized.  */
#   define CHAR_CLASS_MAX_LENGTH  6 /* Namely, `xdigit'.  */

#   define IS_CHAR_CLASS(string) \
   (STREQ (string, "alpha") || STREQ (string, "upper") \
    || STREQ (string, "lower") || STREQ (string, "digit") \
    || STREQ (string, "alnum") || STREQ (string, "xdigit") \
    || STREQ (string, "space") || STREQ (string, "print") \
    || STREQ (string, "punct") || STREQ (string, "graph") \
    || STREQ (string, "cntrl") || STREQ (string, "blank"))
#  else
/* The GNU C library provides support for user-defined character classes
   and the functions from ISO C amendment 1.  */
#   ifndef CHARCLASS_NAME_MAX
/* This shouldn't happen but some implementation might still have this
   problem.  Use a reasonable default value.  */
#    define CHAR_CLASS_MAX_LENGTH 256
#   else
#    define CHAR_CLASS_MAX_LENGTH CHARCLASS_NAME_MAX
#   endif

#   ifndef _LIBC
#    define IS_CHAR_CLASS(string) wctype (string)
#   else
#    define IS_CHAR_CLASS(string) __wctype (string)
#   endif
#  endif
# endif /* DEFINED_ONCE */
2263 \f
2264 # ifndef MATCH_MAY_ALLOCATE
2265
2266 /* If we cannot allocate large objects within re_match_2_internal,
2267 we make the fail stack and register vectors global.
2268 The fail stack, we grow to the maximum size when a regexp
2269 is compiled.
2270 The register vectors, we adjust in size each time we
2271 compile a regexp, according to the number of registers it needs. */
2272
2273 static PREFIX(fail_stack_type) fail_stack;
2274
2275 /* Size with which the following vectors are currently allocated.
2276 That is so we can make them bigger as needed,
2277 but never make them smaller. */
2278 # ifdef DEFINED_ONCE
2279 static int regs_allocated_size;
2280
2281 static const char ** regstart, ** regend;
2282 static const char ** old_regstart, ** old_regend;
2283 static const char **best_regstart, **best_regend;
2284 static const char **reg_dummy;
2285 # endif /* DEFINED_ONCE */
2286
2287 static PREFIX(register_info_type) *PREFIX(reg_info);
2288 static PREFIX(register_info_type) *PREFIX(reg_info_dummy);
2289
2290 /* Make the register vectors big enough for NUM_REGS registers,
2291 but don't make them smaller. */
2292
2293 static void
2294 PREFIX(regex_grow_registers) (num_regs)
2295 int num_regs;
2296 {
2297 if (num_regs > regs_allocated_size)
2298 {
2299 RETALLOC_IF (regstart, num_regs, const char *);
2300 RETALLOC_IF (regend, num_regs, const char *);
2301 RETALLOC_IF (old_regstart, num_regs, const char *);
2302 RETALLOC_IF (old_regend, num_regs, const char *);
2303 RETALLOC_IF (best_regstart, num_regs, const char *);
2304 RETALLOC_IF (best_regend, num_regs, const char *);
2305 RETALLOC_IF (PREFIX(reg_info), num_regs, PREFIX(register_info_type));
2306 RETALLOC_IF (reg_dummy, num_regs, const char *);
2307 RETALLOC_IF (PREFIX(reg_info_dummy), num_regs, PREFIX(register_info_type));
2308
2309 regs_allocated_size = num_regs;
2310 }
2311 }
2312
2313 # endif /* not MATCH_MAY_ALLOCATE */
2314 \f
2315 # ifndef DEFINED_ONCE
2316 static boolean group_in_compile_stack _RE_ARGS ((compile_stack_type
2317 compile_stack,
2318 regnum_t regnum));
2319 # endif /* not DEFINED_ONCE */
2320
2321 /* `regex_compile' compiles PATTERN (of length SIZE) according to SYNTAX.
2322 Returns one of error codes defined in `regex.h', or zero for success.
2323
2324 Assumes the `allocated' (and perhaps `buffer') and `translate'
2325 fields are set in BUFP on entry.
2326
2327 If it succeeds, results are put in BUFP (if it returns an error, the
2328 contents of BUFP are undefined):
2329 `buffer' is the compiled pattern;
2330 `syntax' is set to SYNTAX;
2331 `used' is set to the length of the compiled pattern;
2332 `fastmap_accurate' is zero;
2333 `re_nsub' is the number of subexpressions in PATTERN;
2334 `not_bol' and `not_eol' are zero;
2335
2336 The `fastmap' and `newline_anchor' fields are neither
2337 examined nor set. */
2338
/* Leave the enclosing function with VALUE after freeing the storage it
   allocated: always compile_stack.stack, and in the WCHAR build also the
   wide-character copy of the pattern together with the `mbs_offset' and
   `is_binary' arrays.  VALUE is evaluated after the calls to free.  */
# ifndef WCHAR
#  define FREE_STACK_RETURN(value) \
  do \
    { \
      free (compile_stack.stack); \
      return (value); \
    } \
  while (0)
# else /* WCHAR */
#  define FREE_STACK_RETURN(value) \
  do \
    { \
      free (pattern); \
      free (mbs_offset); \
      free (is_binary); \
      free (compile_stack.stack); \
      return (value); \
    } \
  while (0)
# endif /* WCHAR */
2347
2348 static reg_errcode_t
2349 PREFIX(regex_compile) (ARG_PREFIX(pattern), ARG_PREFIX(size), syntax, bufp)
2350 const char *ARG_PREFIX(pattern);
2351 size_t ARG_PREFIX(size);
2352 reg_syntax_t syntax;
2353 struct re_pattern_buffer *bufp;
2354 {
2355 /* We fetch characters from PATTERN here. Even though PATTERN is
2356 `char *' (i.e., signed), we declare these variables as unsigned, so
2357 they can be reliably used as array indices. */
2358 register UCHAR_T c, c1;
2359
2360 #ifdef WCHAR
2361 /* A temporary space to keep wchar_t pattern and compiled pattern. */
2362 CHAR_T *pattern, *COMPILED_BUFFER_VAR;
2363 size_t size;
2364 /* offset buffer for optimization. See convert_mbs_to_wc. */
2365 int *mbs_offset = NULL;
2366 /* It hold whether each wchar_t is binary data or not. */
2367 char *is_binary = NULL;
2368 /* A flag whether exactn is handling binary data or not. */
2369 char is_exactn_bin = FALSE;
2370 #endif /* WCHAR */
2371
2372 /* A random temporary spot in PATTERN. */
2373 const CHAR_T *p1;
2374
2375 /* Points to the end of the buffer, where we should append. */
2376 register UCHAR_T *b;
2377
2378 /* Keeps track of unclosed groups. */
2379 compile_stack_type compile_stack;
2380
2381 /* Points to the current (ending) position in the pattern. */
2382 #ifdef WCHAR
2383 const CHAR_T *p;
2384 const CHAR_T *pend;
2385 #else /* BYTE */
2386 const CHAR_T *p = pattern;
2387 const CHAR_T *pend = pattern + size;
2388 #endif /* WCHAR */
2389
2390 /* How to translate the characters in the pattern. */
2391 RE_TRANSLATE_TYPE translate = bufp->translate;
2392
2393 /* Address of the count-byte of the most recently inserted `exactn'
2394 command. This makes it possible to tell if a new exact-match
2395 character can be added to that command or if the character requires
2396 a new `exactn' command. */
2397 UCHAR_T *pending_exact = 0;
2398
2399 /* Address of start of the most recently finished expression.
2400 This tells, e.g., postfix * where to find the start of its
2401 operand. Reset at the beginning of groups and alternatives. */
2402 UCHAR_T *laststart = 0;
2403
2404 /* Address of beginning of regexp, or inside of last group. */
2405 UCHAR_T *begalt;
2406
2407 /* Address of the place where a forward jump should go to the end of
2408 the containing expression. Each alternative of an `or' -- except the
2409 last -- ends with a forward jump of this sort. */
2410 UCHAR_T *fixup_alt_jump = 0;
2411
2412 /* Counts open-groups as they are encountered. Remembered for the
2413 matching close-group on the compile stack, so the same register
2414 number is put in the stop_memory as the start_memory. */
2415 regnum_t regnum = 0;
2416
2417 #ifdef WCHAR
2418 /* Initialize the wchar_t PATTERN and offset_buffer. */
2419 p = pend = pattern = TALLOC(csize + 1, CHAR_T);
2420 mbs_offset = TALLOC(csize + 1, int);
2421 is_binary = TALLOC(csize + 1, char);
2422 if (pattern == NULL || mbs_offset == NULL || is_binary == NULL)
2423 {
2424 free(pattern);
2425 free(mbs_offset);
2426 free(is_binary);
2427 return REG_ESPACE;
2428 }
2429 pattern[csize] = L'\0'; /* sentinel */
2430 size = convert_mbs_to_wcs(pattern, cpattern, csize, mbs_offset, is_binary);
2431 pend = p + size;
2432 if (size < 0)
2433 {
2434 free(pattern);
2435 free(mbs_offset);
2436 free(is_binary);
2437 return REG_BADPAT;
2438 }
2439 #endif
2440
2441 #ifdef DEBUG
2442 DEBUG_PRINT1 ("\nCompiling pattern: ");
2443 if (debug)
2444 {
2445 unsigned debug_count;
2446
2447 for (debug_count = 0; debug_count < size; debug_count++)
2448 PUT_CHAR (pattern[debug_count]);
2449 putchar ('\n');
2450 }
2451 #endif /* DEBUG */
2452
2453 /* Initialize the compile stack. */
2454 compile_stack.stack = TALLOC (INIT_COMPILE_STACK_SIZE, compile_stack_elt_t);
2455 if (compile_stack.stack == NULL)
2456 {
2457 #ifdef WCHAR
2458 free(pattern);
2459 free(mbs_offset);
2460 free(is_binary);
2461 #endif
2462 return REG_ESPACE;
2463 }
2464
2465 compile_stack.size = INIT_COMPILE_STACK_SIZE;
2466 compile_stack.avail = 0;
2467
2468 /* Initialize the pattern buffer. */
2469 bufp->syntax = syntax;
2470 bufp->fastmap_accurate = 0;
2471 bufp->not_bol = bufp->not_eol = 0;
2472
2473 /* Set `used' to zero, so that if we return an error, the pattern
2474 printer (for debugging) will think there's no pattern. We reset it
2475 at the end. */
2476 bufp->used = 0;
2477
2478 /* Always count groups, whether or not bufp->no_sub is set. */
2479 bufp->re_nsub = 0;
2480
2481 #if !defined emacs && !defined SYNTAX_TABLE
2482 /* Initialize the syntax table. */
2483 init_syntax_once ();
2484 #endif
2485
2486 if (bufp->allocated == 0)
2487 {
2488 if (bufp->buffer)
2489 { /* If zero allocated, but buffer is non-null, try to realloc
2490 enough space. This loses if buffer's address is bogus, but
2491 that is the user's responsibility. */
2492 #ifdef WCHAR
2493 /* Free bufp->buffer and allocate an array for wchar_t pattern
2494 buffer. */
2495 free(bufp->buffer);
2496 COMPILED_BUFFER_VAR = TALLOC (INIT_BUF_SIZE/sizeof(UCHAR_T),
2497 UCHAR_T);
2498 #else
2499 RETALLOC (COMPILED_BUFFER_VAR, INIT_BUF_SIZE, UCHAR_T);
2500 #endif /* WCHAR */
2501 }
2502 else
2503 { /* Caller did not allocate a buffer. Do it for them. */
2504 COMPILED_BUFFER_VAR = TALLOC (INIT_BUF_SIZE / sizeof(UCHAR_T),
2505 UCHAR_T);
2506 }
2507
2508 if (!COMPILED_BUFFER_VAR) FREE_STACK_RETURN (REG_ESPACE);
2509 #ifdef WCHAR
2510 bufp->buffer = (char*)COMPILED_BUFFER_VAR;
2511 #endif /* WCHAR */
2512 bufp->allocated = INIT_BUF_SIZE;
2513 }
2514 #ifdef WCHAR
2515 else
2516 COMPILED_BUFFER_VAR = (UCHAR_T*) bufp->buffer;
2517 #endif
2518
2519 begalt = b = COMPILED_BUFFER_VAR;
2520
2521 /* Loop through the uncompiled pattern until we're at the end. */
2522 while (p != pend)
2523 {
2524 PATFETCH (c);
2525
2526 switch (c)
2527 {
2528 case '^':
2529 {
2530 if ( /* If at start of pattern, it's an operator. */
2531 p == pattern + 1
2532 /* If context independent, it's an operator. */
2533 || syntax & RE_CONTEXT_INDEP_ANCHORS
2534 /* Otherwise, depends on what's come before. */
2535 || PREFIX(at_begline_loc_p) (pattern, p, syntax))
2536 BUF_PUSH (begline);
2537 else
2538 goto normal_char;
2539 }
2540 break;
2541
2542
2543 case '$':
2544 {
2545 if ( /* If at end of pattern, it's an operator. */
2546 p == pend
2547 /* If context independent, it's an operator. */
2548 || syntax & RE_CONTEXT_INDEP_ANCHORS
2549 /* Otherwise, depends on what's next. */
2550 || PREFIX(at_endline_loc_p) (p, pend, syntax))
2551 BUF_PUSH (endline);
2552 else
2553 goto normal_char;
2554 }
2555 break;
2556
2557
2558 case '+':
2559 case '?':
2560 if ((syntax & RE_BK_PLUS_QM)
2561 || (syntax & RE_LIMITED_OPS))
2562 goto normal_char;
2563 handle_plus:
2564 case '*':
2565 /* If there is no previous pattern... */
2566 if (!laststart)
2567 {
2568 if (syntax & RE_CONTEXT_INVALID_OPS)
2569 FREE_STACK_RETURN (REG_BADRPT);
2570 else if (!(syntax & RE_CONTEXT_INDEP_OPS))
2571 goto normal_char;
2572 }
2573
2574 {
2575 /* Are we optimizing this jump? */
2576 boolean keep_string_p = false;
2577
2578 /* 1 means zero (many) matches is allowed. */
2579 char zero_times_ok = 0, many_times_ok = 0;
2580
2581 /* If there is a sequence of repetition chars, collapse it
2582 down to just one (the right one). We can't combine
2583 interval operators with these because of, e.g., `a{2}*',
2584 which should only match an even number of `a's. */
2585
2586 for (;;)
2587 {
2588 zero_times_ok |= c != '+';
2589 many_times_ok |= c != '?';
2590
2591 if (p == pend)
2592 break;
2593
2594 PATFETCH (c);
2595
2596 if (c == '*'
2597 || (!(syntax & RE_BK_PLUS_QM) && (c == '+' || c == '?')))
2598 ;
2599
2600 else if (syntax & RE_BK_PLUS_QM && c == '\\')
2601 {
2602 if (p == pend) FREE_STACK_RETURN (REG_EESCAPE);
2603
2604 PATFETCH (c1);
2605 if (!(c1 == '+' || c1 == '?'))
2606 {
2607 PATUNFETCH;
2608 PATUNFETCH;
2609 break;
2610 }
2611
2612 c = c1;
2613 }
2614 else
2615 {
2616 PATUNFETCH;
2617 break;
2618 }
2619
2620 /* If we get here, we found another repeat character. */
2621 }
2622
2623 /* Star, etc. applied to an empty pattern is equivalent
2624 to an empty pattern. */
2625 if (!laststart)
2626 break;
2627
2628 /* Now we know whether or not zero matches is allowed
2629 and also whether or not two or more matches is allowed. */
2630 if (many_times_ok)
2631 { /* More than one repetition is allowed, so put in at the
2632 end a backward relative jump from `b' to before the next
2633 jump we're going to put in below (which jumps from
2634 laststart to after this jump).
2635
2636 But if we are at the `*' in the exact sequence `.*\n',
2637 insert an unconditional jump backwards to the .,
2638 instead of the beginning of the loop. This way we only
2639 push a failure point once, instead of every time
2640 through the loop. */
2641 assert (p - 1 > pattern);
2642
2643 /* Allocate the space for the jump. */
2644 GET_BUFFER_SPACE (1 + OFFSET_ADDRESS_SIZE);
2645
2646 /* We know we are not at the first character of the pattern,
2647 because laststart was nonzero. And we've already
2648 incremented `p', by the way, to be the character after
2649 the `*'. Do we have to do something analogous here
2650 for null bytes, because of RE_DOT_NOT_NULL? */
2651 if (TRANSLATE (*(p - 2)) == TRANSLATE ('.')
2652 && zero_times_ok
2653 && p < pend && TRANSLATE (*p) == TRANSLATE ('\n')
2654 && !(syntax & RE_DOT_NEWLINE))
2655 { /* We have .*\n. */
2656 STORE_JUMP (jump, b, laststart);
2657 keep_string_p = true;
2658 }
2659 else
2660 /* Anything else. */
2661 STORE_JUMP (maybe_pop_jump, b, laststart -
2662 (1 + OFFSET_ADDRESS_SIZE));
2663
2664 /* We've added more stuff to the buffer. */
2665 b += 1 + OFFSET_ADDRESS_SIZE;
2666 }
2667
2668 /* On failure, jump from laststart to b + 3, which will be the
2669 end of the buffer after this jump is inserted. */
2670 /* ifdef WCHAR, 'b + 1 + OFFSET_ADDRESS_SIZE' instead of
2671 'b + 3'. */
2672 GET_BUFFER_SPACE (1 + OFFSET_ADDRESS_SIZE);
2673 INSERT_JUMP (keep_string_p ? on_failure_keep_string_jump
2674 : on_failure_jump,
2675 laststart, b + 1 + OFFSET_ADDRESS_SIZE);
2676 pending_exact = 0;
2677 b += 1 + OFFSET_ADDRESS_SIZE;
2678
2679 if (!zero_times_ok)
2680 {
2681 /* At least one repetition is required, so insert a
2682 `dummy_failure_jump' before the initial
2683 `on_failure_jump' instruction of the loop. This
2684 effects a skip over that instruction the first time
2685 we hit that loop. */
2686 GET_BUFFER_SPACE (1 + OFFSET_ADDRESS_SIZE);
2687 INSERT_JUMP (dummy_failure_jump, laststart, laststart +
2688 2 + 2 * OFFSET_ADDRESS_SIZE);
2689 b += 1 + OFFSET_ADDRESS_SIZE;
2690 }
2691 }
2692 break;
2693
2694
2695 case '.':
2696 laststart = b;
2697 BUF_PUSH (anychar);
2698 break;
2699
2700
2701 case '[':
2702 {
2703 boolean had_char_class = false;
2704 #ifdef WCHAR
2705 CHAR_T range_start = 0xffffffff;
2706 #else
2707 unsigned int range_start = 0xffffffff;
2708 #endif
2709 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
2710
2711 #ifdef WCHAR
2712 /* We assume a charset(_not) structure as a wchar_t array.
2713 charset[0] = (re_opcode_t) charset(_not)
2714 charset[1] = l (= length of char_classes)
2715 charset[2] = m (= length of collating_symbols)
2716 charset[3] = n (= length of equivalence_classes)
2717 charset[4] = o (= length of char_ranges)
2718 charset[5] = p (= length of chars)
2719
2720 charset[6] = char_class (wctype_t)
2721 charset[6+CHAR_CLASS_SIZE] = char_class (wctype_t)
2722 ...
2723 charset[l+5] = char_class (wctype_t)
2724
2725 charset[l+6] = collating_symbol (wchar_t)
2726 ...
2727 charset[l+m+5] = collating_symbol (wchar_t)
2728 ifdef _LIBC we use the index if
2729 _NL_COLLATE_SYMB_EXTRAMB instead of
2730 wchar_t string.
2731
2732 charset[l+m+6] = equivalence_classes (wchar_t)
2733 ...
2734 charset[l+m+n+5] = equivalence_classes (wchar_t)
2735 ifdef _LIBC we use the index in
2736 _NL_COLLATE_WEIGHT instead of
2737 wchar_t string.
2738
2739 charset[l+m+n+6] = range_start
2740 charset[l+m+n+7] = range_end
2741 ...
2742 charset[l+m+n+2o+4] = range_start
2743 charset[l+m+n+2o+5] = range_end
2744 ifdef _LIBC we use the value looked up
2745 in _NL_COLLATE_COLLSEQ instead of
2746 wchar_t character.
2747
2748 charset[l+m+n+2o+6] = char
2749 ...
2750 charset[l+m+n+2o+p+5] = char
2751
2752 */
2753
2754 /* We need at least 6 spaces: the opcode, the length of
2755 char_classes, the length of collating_symbols, the length of
2756 equivalence_classes, the length of char_ranges, the length of
2757 chars. */
2758 GET_BUFFER_SPACE (6);
2759
2760 /* Save b as laststart. And We use laststart as the pointer
2761 to the first element of the charset here.
2762 In other words, laststart[i] indicates charset[i]. */
2763 laststart = b;
2764
2765 /* We test `*p == '^' twice, instead of using an if
2766 statement, so we only need one BUF_PUSH. */
2767 BUF_PUSH (*p == '^' ? charset_not : charset);
2768 if (*p == '^')
2769 p++;
2770
2771 /* Push the length of char_classes, the length of
2772 collating_symbols, the length of equivalence_classes, the
2773 length of char_ranges and the length of chars. */
2774 BUF_PUSH_3 (0, 0, 0);
2775 BUF_PUSH_2 (0, 0);
2776
2777 /* Remember the first position in the bracket expression. */
2778 p1 = p;
2779
2780 /* charset_not matches newline according to a syntax bit. */
2781 if ((re_opcode_t) b[-6] == charset_not
2782 && (syntax & RE_HAT_LISTS_NOT_NEWLINE))
2783 {
2784 BUF_PUSH('\n');
2785 laststart[5]++; /* Update the length of characters */
2786 }
2787
2788 /* Read in characters and ranges, setting map bits. */
2789 for (;;)
2790 {
2791 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
2792
2793 PATFETCH (c);
2794
2795 /* \ might escape characters inside [...] and [^...]. */
2796 if ((syntax & RE_BACKSLASH_ESCAPE_IN_LISTS) && c == '\\')
2797 {
2798 if (p == pend) FREE_STACK_RETURN (REG_EESCAPE);
2799
2800 PATFETCH (c1);
2801 BUF_PUSH(c1);
2802 laststart[5]++; /* Update the length of chars */
2803 range_start = c1;
2804 continue;
2805 }
2806
2807 /* Could be the end of the bracket expression. If it's
2808 not (i.e., when the bracket expression is `[]' so
2809 far), the ']' character bit gets set way below. */
2810 if (c == ']' && p != p1 + 1)
2811 break;
2812
2813 /* Look ahead to see if it's a range when the last thing
2814 was a character class. */
2815 if (had_char_class && c == '-' && *p != ']')
2816 FREE_STACK_RETURN (REG_ERANGE);
2817
2818 /* Look ahead to see if it's a range when the last thing
2819 was a character: if this is a hyphen not at the
2820 beginning or the end of a list, then it's the range
2821 operator. */
2822 if (c == '-'
2823 && !(p - 2 >= pattern && p[-2] == '[')
2824 && !(p - 3 >= pattern && p[-3] == '[' && p[-2] == '^')
2825 && *p != ']')
2826 {
2827 reg_errcode_t ret;
2828 /* Allocate the space for range_start and range_end. */
2829 GET_BUFFER_SPACE (2);
2830 /* Update the pointer to indicate end of buffer. */
2831 b += 2;
2832 ret = wcs_compile_range (range_start, &p, pend, translate,
2833 syntax, b, laststart);
2834 if (ret != REG_NOERROR) FREE_STACK_RETURN (ret);
2835 range_start = 0xffffffff;
2836 }
2837 else if (p[0] == '-' && p[1] != ']')
2838 { /* This handles ranges made up of characters only. */
2839 reg_errcode_t ret;
2840
2841 /* Move past the `-'. */
2842 PATFETCH (c1);
2843 /* Allocate the space for range_start and range_end. */
2844 GET_BUFFER_SPACE (2);
2845 /* Update the pointer to indicate end of buffer. */
2846 b += 2;
2847 ret = wcs_compile_range (c, &p, pend, translate, syntax, b,
2848 laststart);
2849 if (ret != REG_NOERROR) FREE_STACK_RETURN (ret);
2850 range_start = 0xffffffff;
2851 }
2852
2853 /* See if we're at the beginning of a possible character
2854 class. */
2855 else if (syntax & RE_CHAR_CLASSES && c == '[' && *p == ':')
2856 { /* Leave room for the null. */
2857 char str[CHAR_CLASS_MAX_LENGTH + 1];
2858
2859 PATFETCH (c);
2860 c1 = 0;
2861
2862 /* If pattern is `[[:'. */
2863 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
2864
2865 for (;;)
2866 {
2867 PATFETCH (c);
2868 if ((c == ':' && *p == ']') || p == pend)
2869 break;
2870 if (c1 < CHAR_CLASS_MAX_LENGTH)
2871 str[c1++] = c;
2872 else
2873 /* This is in any case an invalid class name. */
2874 str[0] = '\0';
2875 }
2876 str[c1] = '\0';
2877
2878 /* If isn't a word bracketed by `[:' and `:]':
2879 undo the ending character, the letters, and leave
2880 the leading `:' and `[' (but store them as character). */
2881 if (c == ':' && *p == ']')
2882 {
2883 wctype_t wt;
2884 uintptr_t alignedp;
2885
2886 /* Query the character class as wctype_t. */
2887 wt = IS_CHAR_CLASS (str);
2888 if (wt == 0)
2889 FREE_STACK_RETURN (REG_ECTYPE);
2890
2891 /* Throw away the ] at the end of the character
2892 class. */
2893 PATFETCH (c);
2894
2895 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
2896
2897 /* Allocate the space for character class. */
2898 GET_BUFFER_SPACE(CHAR_CLASS_SIZE);
2899 /* Update the pointer to indicate end of buffer. */
2900 b += CHAR_CLASS_SIZE;
2901 /* Move data which follow character classes
2902 not to violate the data. */
2903 insert_space(CHAR_CLASS_SIZE,
2904 laststart + 6 + laststart[1],
2905 b - 1);
2906 alignedp = ((uintptr_t)(laststart + 6 + laststart[1])
2907 + __alignof__(wctype_t) - 1)
2908 & ~(uintptr_t)(__alignof__(wctype_t) - 1);
2909 /* Store the character class. */
2910 *((wctype_t*)alignedp) = wt;
2911 /* Update length of char_classes */
2912 laststart[1] += CHAR_CLASS_SIZE;
2913
2914 had_char_class = true;
2915 }
2916 else
2917 {
2918 c1++;
2919 while (c1--)
2920 PATUNFETCH;
2921 BUF_PUSH ('[');
2922 BUF_PUSH (':');
2923 laststart[5] += 2; /* Update the length of characters */
2924 range_start = ':';
2925 had_char_class = false;
2926 }
2927 }
2928 else if (syntax & RE_CHAR_CLASSES && c == '[' && (*p == '='
2929 || *p == '.'))
2930 {
2931 CHAR_T str[128]; /* Should be large enough. */
2932 CHAR_T delim = *p; /* '=' or '.' */
2933 # ifdef _LIBC
2934 uint32_t nrules =
2935 _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
2936 # endif
2937 PATFETCH (c);
2938 c1 = 0;
2939
2940 /* If pattern is `[[=' or '[[.'. */
2941 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
2942
2943 for (;;)
2944 {
2945 PATFETCH (c);
2946 if ((c == delim && *p == ']') || p == pend)
2947 break;
2948 if (c1 < sizeof (str) - 1)
2949 str[c1++] = c;
2950 else
2951 /* This is in any case an invalid class name. */
2952 str[0] = '\0';
2953 }
2954 str[c1] = '\0';
2955
2956 if (c == delim && *p == ']' && str[0] != '\0')
2957 {
2958 unsigned int i, offset;
2959 /* If we have no collation data we use the default
2960 collation in which each character is in a class
2961 by itself. It also means that ASCII is the
2962 character set and therefore we cannot have character
2963 with more than one byte in the multibyte
2964 representation. */
2965
2966 /* If not defined _LIBC, we push the name and
2967 `\0' for the sake of matching performance. */
2968 int datasize = c1 + 1;
2969
2970 # ifdef _LIBC
2971 int32_t idx = 0;
2972 if (nrules == 0)
2973 # endif
2974 {
2975 if (c1 != 1)
2976 FREE_STACK_RETURN (REG_ECOLLATE);
2977 }
2978 # ifdef _LIBC
2979 else
2980 {
2981 const int32_t *table;
2982 const int32_t *weights;
2983 const int32_t *extra;
2984 const int32_t *indirect;
2985 wint_t *cp;
2986
2987 /* This #include defines a local function! */
2988 # include <locale/weightwc.h>
2989
2990 if(delim == '=')
2991 {
2992 /* We push the index for equivalence class. */
2993 cp = (wint_t*)str;
2994
2995 table = (const int32_t *)
2996 _NL_CURRENT (LC_COLLATE,
2997 _NL_COLLATE_TABLEWC);
2998 weights = (const int32_t *)
2999 _NL_CURRENT (LC_COLLATE,
3000 _NL_COLLATE_WEIGHTWC);
3001 extra = (const int32_t *)
3002 _NL_CURRENT (LC_COLLATE,
3003 _NL_COLLATE_EXTRAWC);
3004 indirect = (const int32_t *)
3005 _NL_CURRENT (LC_COLLATE,
3006 _NL_COLLATE_INDIRECTWC);
3007
3008 idx = findidx ((const wint_t**)&cp);
3009 if (idx == 0 || cp < (wint_t*) str + c1)
3010 /* This is no valid character. */
3011 FREE_STACK_RETURN (REG_ECOLLATE);
3012
3013 str[0] = (wchar_t)idx;
3014 }
3015 else /* delim == '.' */
3016 {
3017 /* We push collation sequence value
3018 for collating symbol. */
3019 int32_t table_size;
3020 const int32_t *symb_table;
3021 const unsigned char *extra;
3022 int32_t idx;
3023 int32_t elem;
3024 int32_t second;
3025 int32_t hash;
3026 char char_str[c1];
3027
3028 /* We have to convert the name to a single-byte
3029 string. This is possible since the names
3030 consist of ASCII characters and the internal
3031 representation is UCS4. */
3032 for (i = 0; i < c1; ++i)
3033 char_str[i] = str[i];
3034
3035 table_size =
3036 _NL_CURRENT_WORD (LC_COLLATE,
3037 _NL_COLLATE_SYMB_HASH_SIZEMB);
3038 symb_table = (const int32_t *)
3039 _NL_CURRENT (LC_COLLATE,
3040 _NL_COLLATE_SYMB_TABLEMB);
3041 extra = (const unsigned char *)
3042 _NL_CURRENT (LC_COLLATE,
3043 _NL_COLLATE_SYMB_EXTRAMB);
3044
3045 /* Locate the character in the hashing table. */
3046 hash = elem_hash (char_str, c1);
3047
3048 idx = 0;
3049 elem = hash % table_size;
3050 second = hash % (table_size - 2);
3051 while (symb_table[2 * elem] != 0)
3052 {
3053 /* First compare the hashing value. */
3054 if (symb_table[2 * elem] == hash
3055 && c1 == extra[symb_table[2 * elem + 1]]
3056 && memcmp (str,
3057 &extra[symb_table[2 * elem + 1]
3058 + 1], c1) == 0)
3059 {
3060 /* Yep, this is the entry. */
3061 idx = symb_table[2 * elem + 1];
3062 idx += 1 + extra[idx];
3063 break;
3064 }
3065
3066 /* Next entry. */
3067 elem += second;
3068 }
3069
3070 if (symb_table[2 * elem] != 0)
3071 {
3072 /* Compute the index of the byte sequence
3073 in the table. */
3074 idx += 1 + extra[idx];
3075 /* Adjust for the alignment. */
3076 idx = (idx + 3) & ~4;
3077
3078 str[0] = (wchar_t) idx + 4;
3079 }
3080 else if (symb_table[2 * elem] == 0 && c1 == 1)
3081 {
3082 /* No valid character. Match it as a
3083 single byte character. */
3084 had_char_class = false;
3085 BUF_PUSH(str[0]);
3086 /* Update the length of characters */
3087 laststart[5]++;
3088 range_start = str[0];
3089
3090 /* Throw away the ] at the end of the
3091 collating symbol. */
3092 PATFETCH (c);
3093 /* exit from the switch block. */
3094 continue;
3095 }
3096 else
3097 FREE_STACK_RETURN (REG_ECOLLATE);
3098 }
3099 datasize = 1;
3100 }
3101 # endif
3102 /* Throw away the ] at the end of the equivalence
3103 class (or collating symbol). */
3104 PATFETCH (c);
3105
3106 /* Allocate the space for the equivalence class
3107 (or collating symbol) (and '\0' if needed). */
3108 GET_BUFFER_SPACE(datasize);
3109 /* Update the pointer to indicate end of buffer. */
3110 b += datasize;
3111
3112 if (delim == '=')
3113 { /* equivalence class */
3114 /* Calculate the offset of char_ranges,
3115 which is next to equivalence_classes. */
3116 offset = laststart[1] + laststart[2]
3117 + laststart[3] +6;
3118 /* Insert space. */
3119 insert_space(datasize, laststart + offset, b - 1);
3120
3121 /* Write the equivalence_class and \0. */
3122 for (i = 0 ; i < datasize ; i++)
3123 laststart[offset + i] = str[i];
3124
3125 /* Update the length of equivalence_classes. */
3126 laststart[3] += datasize;
3127 had_char_class = true;
3128 }
3129 else /* delim == '.' */
3130 { /* collating symbol */
3131 /* Calculate the offset of the equivalence_classes,
3132 which is next to collating_symbols. */
3133 offset = laststart[1] + laststart[2] + 6;
3134 /* Insert space and write the collating_symbol
3135 and \0. */
3136 insert_space(datasize, laststart + offset, b-1);
3137 for (i = 0 ; i < datasize ; i++)
3138 laststart[offset + i] = str[i];
3139
3140 /* In re_match_2_internal if range_start < -1, we
3141 assume -range_start is the offset of the
3142 collating symbol which is specified as
3143 the character of the range start. So we assign
3144 -(laststart[1] + laststart[2] + 6) to
3145 range_start. */
3146 range_start = -(laststart[1] + laststart[2] + 6);
3147 /* Update the length of collating_symbol. */
3148 laststart[2] += datasize;
3149 had_char_class = false;
3150 }
3151 }
3152 else
3153 {
3154 c1++;
3155 while (c1--)
3156 PATUNFETCH;
3157 BUF_PUSH ('[');
3158 BUF_PUSH (delim);
3159 laststart[5] += 2; /* Update the length of characters */
3160 range_start = delim;
3161 had_char_class = false;
3162 }
3163 }
3164 else
3165 {
3166 had_char_class = false;
3167 BUF_PUSH(c);
3168 laststart[5]++; /* Update the length of characters */
3169 range_start = c;
3170 }
3171 }
3172
3173 #else /* BYTE */
3174 /* Ensure that we have enough space to push a charset: the
3175 opcode, the length count, and the bitset; 34 bytes in all. */
3176 GET_BUFFER_SPACE (34);
3177
3178 laststart = b;
3179
3180 /* We test `*p == '^' twice, instead of using an if
3181 statement, so we only need one BUF_PUSH. */
3182 BUF_PUSH (*p == '^' ? charset_not : charset);
3183 if (*p == '^')
3184 p++;
3185
3186 /* Remember the first position in the bracket expression. */
3187 p1 = p;
3188
3189 /* Push the number of bytes in the bitmap. */
3190 BUF_PUSH ((1 << BYTEWIDTH) / BYTEWIDTH);
3191
3192 /* Clear the whole map. */
3193 bzero (b, (1 << BYTEWIDTH) / BYTEWIDTH);
3194
3195 /* charset_not matches newline according to a syntax bit. */
3196 if ((re_opcode_t) b[-2] == charset_not
3197 && (syntax & RE_HAT_LISTS_NOT_NEWLINE))
3198 SET_LIST_BIT ('\n');
3199
3200 /* Read in characters and ranges, setting map bits. */
3201 for (;;)
3202 {
3203 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
3204
3205 PATFETCH (c);
3206
3207 /* \ might escape characters inside [...] and [^...]. */
3208 if ((syntax & RE_BACKSLASH_ESCAPE_IN_LISTS) && c == '\\')
3209 {
3210 if (p == pend) FREE_STACK_RETURN (REG_EESCAPE);
3211
3212 PATFETCH (c1);
3213 SET_LIST_BIT (c1);
3214 range_start = c1;
3215 continue;
3216 }
3217
3218 /* Could be the end of the bracket expression. If it's
3219 not (i.e., when the bracket expression is `[]' so
3220 far), the ']' character bit gets set way below. */
3221 if (c == ']' && p != p1 + 1)
3222 break;
3223
3224 /* Look ahead to see if it's a range when the last thing
3225 was a character class. */
3226 if (had_char_class && c == '-' && *p != ']')
3227 FREE_STACK_RETURN (REG_ERANGE);
3228
3229 /* Look ahead to see if it's a range when the last thing
3230 was a character: if this is a hyphen not at the
3231 beginning or the end of a list, then it's the range
3232 operator. */
3233 if (c == '-'
3234 && !(p - 2 >= pattern && p[-2] == '[')
3235 && !(p - 3 >= pattern && p[-3] == '[' && p[-2] == '^')
3236 && *p != ']')
3237 {
3238 reg_errcode_t ret
3239 = byte_compile_range (range_start, &p, pend, translate,
3240 syntax, b);
3241 if (ret != REG_NOERROR) FREE_STACK_RETURN (ret);
3242 range_start = 0xffffffff;
3243 }
3244
3245 else if (p[0] == '-' && p[1] != ']')
3246 { /* This handles ranges made up of characters only. */
3247 reg_errcode_t ret;
3248
3249 /* Move past the `-'. */
3250 PATFETCH (c1);
3251
3252 ret = byte_compile_range (c, &p, pend, translate, syntax, b);
3253 if (ret != REG_NOERROR) FREE_STACK_RETURN (ret);
3254 range_start = 0xffffffff;
3255 }
3256
3257 /* See if we're at the beginning of a possible character
3258 class. */
3259
3260 else if (syntax & RE_CHAR_CLASSES && c == '[' && *p == ':')
3261 { /* Leave room for the null. */
3262 char str[CHAR_CLASS_MAX_LENGTH + 1];
3263
3264 PATFETCH (c);
3265 c1 = 0;
3266
3267 /* If pattern is `[[:'. */
3268 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
3269
3270 for (;;)
3271 {
3272 PATFETCH (c);
3273 if ((c == ':' && *p == ']') || p == pend)
3274 break;
3275 if (c1 < CHAR_CLASS_MAX_LENGTH)
3276 str[c1++] = c;
3277 else
3278 /* This is in any case an invalid class name. */
3279 str[0] = '\0';
3280 }
3281 str[c1] = '\0';
3282
3283 /* If isn't a word bracketed by `[:' and `:]':
3284 undo the ending character, the letters, and leave
3285 the leading `:' and `[' (but set bits for them). */
3286 if (c == ':' && *p == ']')
3287 {
3288 # if defined _LIBC || WIDE_CHAR_SUPPORT
3289 boolean is_lower = STREQ (str, "lower");
3290 boolean is_upper = STREQ (str, "upper");
3291 wctype_t wt;
3292 int ch;
3293
3294 wt = IS_CHAR_CLASS (str);
3295 if (wt == 0)
3296 FREE_STACK_RETURN (REG_ECTYPE);
3297
3298 /* Throw away the ] at the end of the character
3299 class. */
3300 PATFETCH (c);
3301
3302 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
3303
3304 for (ch = 0; ch < 1 << BYTEWIDTH; ++ch)
3305 {
3306 # ifdef _LIBC
3307 if (__iswctype (__btowc (ch), wt))
3308 SET_LIST_BIT (ch);
3309 # else
3310 if (iswctype (btowc (ch), wt))
3311 SET_LIST_BIT (ch);
3312 # endif
3313
3314 if (translate && (is_upper || is_lower)
3315 && (ISUPPER (ch) || ISLOWER (ch)))
3316 SET_LIST_BIT (ch);
3317 }
3318
3319 had_char_class = true;
3320 # else
3321 int ch;
3322 boolean is_alnum = STREQ (str, "alnum");
3323 boolean is_alpha = STREQ (str, "alpha");
3324 boolean is_blank = STREQ (str, "blank");
3325 boolean is_cntrl = STREQ (str, "cntrl");
3326 boolean is_digit = STREQ (str, "digit");
3327 boolean is_graph = STREQ (str, "graph");
3328 boolean is_lower = STREQ (str, "lower");
3329 boolean is_print = STREQ (str, "print");
3330 boolean is_punct = STREQ (str, "punct");
3331 boolean is_space = STREQ (str, "space");
3332 boolean is_upper = STREQ (str, "upper");
3333 boolean is_xdigit = STREQ (str, "xdigit");
3334
3335 if (!IS_CHAR_CLASS (str))
3336 FREE_STACK_RETURN (REG_ECTYPE);
3337
3338 /* Throw away the ] at the end of the character
3339 class. */
3340 PATFETCH (c);
3341
3342 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
3343
3344 for (ch = 0; ch < 1 << BYTEWIDTH; ch++)
3345 {
3346 /* This was split into 3 if's to
3347 avoid an arbitrary limit in some compiler. */
3348 if ( (is_alnum && ISALNUM (ch))
3349 || (is_alpha && ISALPHA (ch))
3350 || (is_blank && ISBLANK (ch))
3351 || (is_cntrl && ISCNTRL (ch)))
3352 SET_LIST_BIT (ch);
3353 if ( (is_digit && ISDIGIT (ch))
3354 || (is_graph && ISGRAPH (ch))
3355 || (is_lower && ISLOWER (ch))
3356 || (is_print && ISPRINT (ch)))
3357 SET_LIST_BIT (ch);
3358 if ( (is_punct && ISPUNCT (ch))
3359 || (is_space && ISSPACE (ch))
3360 || (is_upper && ISUPPER (ch))
3361 || (is_xdigit && ISXDIGIT (ch)))
3362 SET_LIST_BIT (ch);
3363 if ( translate && (is_upper || is_lower)
3364 && (ISUPPER (ch) || ISLOWER (ch)))
3365 SET_LIST_BIT (ch);
3366 }
3367 had_char_class = true;
3368 # endif /* libc || wctype.h */
3369 }
3370 else
3371 {
3372 c1++;
3373 while (c1--)
3374 PATUNFETCH;
3375 SET_LIST_BIT ('[');
3376 SET_LIST_BIT (':');
3377 range_start = ':';
3378 had_char_class = false;
3379 }
3380 }
3381 else if (syntax & RE_CHAR_CLASSES && c == '[' && *p == '=')
3382 {
3383 unsigned char str[MB_LEN_MAX + 1];
3384 # ifdef _LIBC
3385 uint32_t nrules =
3386 _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3387 # endif
3388
3389 PATFETCH (c);
3390 c1 = 0;
3391
3392 /* If pattern is `[[='. */
3393 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
3394
3395 for (;;)
3396 {
3397 PATFETCH (c);
3398 if ((c == '=' && *p == ']') || p == pend)
3399 break;
3400 if (c1 < MB_LEN_MAX)
3401 str[c1++] = c;
3402 else
3403 /* This is in any case an invalid class name. */
3404 str[0] = '\0';
3405 }
3406 str[c1] = '\0';
3407
3408 if (c == '=' && *p == ']' && str[0] != '\0')
3409 {
3410 /* If we have no collation data we use the default
3411 collation in which each character is in a class
3412 by itself. It also means that ASCII is the
3413 character set and therefore we cannot have character
3414 with more than one byte in the multibyte
3415 representation. */
3416 # ifdef _LIBC
3417 if (nrules == 0)
3418 # endif
3419 {
3420 if (c1 != 1)
3421 FREE_STACK_RETURN (REG_ECOLLATE);
3422
3423 /* Throw away the ] at the end of the equivalence
3424 class. */
3425 PATFETCH (c);
3426
3427 /* Set the bit for the character. */
3428 SET_LIST_BIT (str[0]);
3429 }
3430 # ifdef _LIBC
3431 else
3432 {
3433 /* Try to match the byte sequence in `str' against
3434 those known to the collate implementation.
3435 First find out whether the bytes in `str' are
3436 actually from exactly one character. */
3437 const int32_t *table;
3438 const unsigned char *weights;
3439 const unsigned char *extra;
3440 const int32_t *indirect;
3441 int32_t idx;
3442 const unsigned char *cp = str;
3443 int ch;
3444
3445 /* This #include defines a local function! */
3446 # include <locale/weight.h>
3447
3448 table = (const int32_t *)
3449 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3450 weights = (const unsigned char *)
3451 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_WEIGHTMB);
3452 extra = (const unsigned char *)
3453 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAMB);
3454 indirect = (const int32_t *)
3455 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTMB);
3456
3457 idx = findidx (&cp);
3458 if (idx == 0 || cp < str + c1)
3459 /* This is no valid character. */
3460 FREE_STACK_RETURN (REG_ECOLLATE);
3461
3462 /* Throw away the ] at the end of the equivalence
3463 class. */
3464 PATFETCH (c);
3465
3466 /* Now we have to go through the whole table
3467 and find all characters which have the same
3468 first level weight.
3469
3470 XXX Note that this is not entirely correct.
3471 we would have to match multibyte sequences
3472 but this is not possible with the current
3473 implementation. */
3474 for (ch = 1; ch < 256; ++ch)
3475 /* XXX This test would have to be changed if we
3476 would allow matching multibyte sequences. */
3477 if (table[ch] > 0)
3478 {
3479 int32_t idx2 = table[ch];
3480 size_t len = weights[idx2];
3481
3482 /* Test whether the lengths match. */
3483 if (weights[idx] == len)
3484 {
3485 /* They do. Now compare the bytes of
3486 the weight. */
3487 size_t cnt = 0;
3488
3489 while (cnt < len
3490 && (weights[idx + 1 + cnt]
3491 == weights[idx2 + 1 + cnt]))
3492 ++cnt;
3493
3494 if (cnt == len)
3495 /* They match. Mark the character as
3496 acceptable. */
3497 SET_LIST_BIT (ch);
3498 }
3499 }
3500 }
3501 # endif
3502 had_char_class = true;
3503 }
3504 else
3505 {
3506 c1++;
3507 while (c1--)
3508 PATUNFETCH;
3509 SET_LIST_BIT ('[');
3510 SET_LIST_BIT ('=');
3511 range_start = '=';
3512 had_char_class = false;
3513 }
3514 }
3515 else if (syntax & RE_CHAR_CLASSES && c == '[' && *p == '.')
3516 {
3517 unsigned char str[128]; /* Should be large enough. */
3518 # ifdef _LIBC
3519 uint32_t nrules =
3520 _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3521 # endif
3522
3523 PATFETCH (c);
3524 c1 = 0;
3525
3526 /* If pattern is `[[.'. */
3527 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
3528
3529 for (;;)
3530 {
3531 PATFETCH (c);
3532 if ((c == '.' && *p == ']') || p == pend)
3533 break;
3534 if (c1 < sizeof (str))
3535 str[c1++] = c;
3536 else
3537 /* This is in any case an invalid class name. */
3538 str[0] = '\0';
3539 }
3540 str[c1] = '\0';
3541
3542 if (c == '.' && *p == ']' && str[0] != '\0')
3543 {
3544 /* If we have no collation data we use the default
3545 collation in which each character is the name
3546 for its own class which contains only the one
3547 character. It also means that ASCII is the
3548 character set and therefore we cannot have character
3549 with more than one byte in the multibyte
3550 representation. */
3551 # ifdef _LIBC
3552 if (nrules == 0)
3553 # endif
3554 {
3555 if (c1 != 1)
3556 FREE_STACK_RETURN (REG_ECOLLATE);
3557
3558 /* Throw away the ] at the end of the collating
3559 symbol. */
3560 PATFETCH (c);
3561
3562 /* Set the bit for the character. */
3563 SET_LIST_BIT (str[0]);
3564 range_start = ((const unsigned char *) str)[0];
3565 }
3566 # ifdef _LIBC
3567 else
3568 {
3569 /* Try to match the byte sequence in `str' against
3570 those known to the collate implementation.
3571 First find out whether the bytes in `str' are
3572 actually from exactly one character. */
3573 int32_t table_size;
3574 const int32_t *symb_table;
3575 const unsigned char *extra;
3576 int32_t idx;
3577 int32_t elem;
3578 int32_t second;
3579 int32_t hash;
3580
3581 table_size =
3582 _NL_CURRENT_WORD (LC_COLLATE,
3583 _NL_COLLATE_SYMB_HASH_SIZEMB);
3584 symb_table = (const int32_t *)
3585 _NL_CURRENT (LC_COLLATE,
3586 _NL_COLLATE_SYMB_TABLEMB);
3587 extra = (const unsigned char *)
3588 _NL_CURRENT (LC_COLLATE,
3589 _NL_COLLATE_SYMB_EXTRAMB);
3590
3591 /* Locate the character in the hashing table. */
3592 hash = elem_hash (str, c1);
3593
3594 idx = 0;
3595 elem = hash % table_size;
3596 second = hash % (table_size - 2);
3597 while (symb_table[2 * elem] != 0)
3598 {
3599 /* First compare the hashing value. */
3600 if (symb_table[2 * elem] == hash
3601 && c1 == extra[symb_table[2 * elem + 1]]
3602 && memcmp (str,
3603 &extra[symb_table[2 * elem + 1]
3604 + 1],
3605 c1) == 0)
3606 {
3607 /* Yep, this is the entry. */
3608 idx = symb_table[2 * elem + 1];
3609 idx += 1 + extra[idx];
3610 break;
3611 }
3612
3613 /* Next entry. */
3614 elem += second;
3615 }
3616
3617 if (symb_table[2 * elem] == 0)
3618 /* This is no valid character. */
3619 FREE_STACK_RETURN (REG_ECOLLATE);
3620
3621 /* Throw away the ] at the end of the collating
3622 symbol. */
3623 PATFETCH (c);
3624
3625 /* Now add the multibyte character(s) we found
3626 to the accept list.
3627
3628 XXX Note that this is not entirely correct.
3629 we would have to match multibyte sequences
3630 but this is not possible with the current
3631 implementation. Also, we have to match
3632 collating symbols, which expand to more than
3633 one byte, as a whole and not allow the
3634 individual bytes. */
3635 c1 = extra[idx++];
3636 if (c1 == 1)
3637 range_start = extra[idx];
3638 while (c1-- > 0)
3639 {
3640 SET_LIST_BIT (extra[idx]);
3641 ++idx;
3642 }
3643 }
3644 # endif
3645 had_char_class = false;
3646 }
3647 else
3648 {
3649 c1++;
3650 while (c1--)
3651 PATUNFETCH;
3652 SET_LIST_BIT ('[');
3653 SET_LIST_BIT ('.');
3654 range_start = '.';
3655 had_char_class = false;
3656 }
3657 }
3658 else
3659 {
3660 had_char_class = false;
3661 SET_LIST_BIT (c);
3662 range_start = c;
3663 }
3664 }
3665
3666 /* Discard any (non)matching list bytes that are all 0 at the
3667 end of the map. Decrease the map-length byte too. */
3668 while ((int) b[-1] > 0 && b[b[-1] - 1] == 0)
3669 b[-1]--;
3670 b += b[-1];
3671 #endif /* WCHAR */
3672 }
3673 break;
3674
3675
3676 case '(':
3677 if (syntax & RE_NO_BK_PARENS)
3678 goto handle_open;
3679 else
3680 goto normal_char;
3681
3682
3683 case ')':
3684 if (syntax & RE_NO_BK_PARENS)
3685 goto handle_close;
3686 else
3687 goto normal_char;
3688
3689
3690 case '\n':
3691 if (syntax & RE_NEWLINE_ALT)
3692 goto handle_alt;
3693 else
3694 goto normal_char;
3695
3696
3697 case '|':
3698 if (syntax & RE_NO_BK_VBAR)
3699 goto handle_alt;
3700 else
3701 goto normal_char;
3702
3703
3704 case '{':
3705 if (syntax & RE_INTERVALS && syntax & RE_NO_BK_BRACES)
3706 goto handle_interval;
3707 else
3708 goto normal_char;
3709
3710
3711 case '\\':
3712 if (p == pend) FREE_STACK_RETURN (REG_EESCAPE);
3713
3714 /* Do not translate the character after the \, so that we can
3715 distinguish, e.g., \B from \b, even if we normally would
3716 translate, e.g., B to b. */
3717 PATFETCH_RAW (c);
3718
3719 switch (c)
3720 {
3721 case '(':
3722 if (syntax & RE_NO_BK_PARENS)
3723 goto normal_backslash;
3724
3725 handle_open:
3726 bufp->re_nsub++;
3727 regnum++;
3728
3729 if (COMPILE_STACK_FULL)
3730 {
3731 RETALLOC (compile_stack.stack, compile_stack.size << 1,
3732 compile_stack_elt_t);
3733 if (compile_stack.stack == NULL) return REG_ESPACE;
3734
3735 compile_stack.size <<= 1;
3736 }
3737
3738 /* These are the values to restore when we hit end of this
3739 group. They are all relative offsets, so that if the
3740 whole pattern moves because of realloc, they will still
3741 be valid. */
3742 COMPILE_STACK_TOP.begalt_offset = begalt - COMPILED_BUFFER_VAR;
3743 COMPILE_STACK_TOP.fixup_alt_jump
3744 = fixup_alt_jump ? fixup_alt_jump - COMPILED_BUFFER_VAR + 1 : 0;
3745 COMPILE_STACK_TOP.laststart_offset = b - COMPILED_BUFFER_VAR;
3746 COMPILE_STACK_TOP.regnum = regnum;
3747
3748 /* We will eventually replace the 0 with the number of
3749 groups inner to this one. But do not push a
3750 start_memory for groups beyond the last one we can
3751 represent in the compiled pattern. */
3752 if (regnum <= MAX_REGNUM)
3753 {
3754 COMPILE_STACK_TOP.inner_group_offset = b
3755 - COMPILED_BUFFER_VAR + 2;
3756 BUF_PUSH_3 (start_memory, regnum, 0);
3757 }
3758
3759 compile_stack.avail++;
3760
3761 fixup_alt_jump = 0;
3762 laststart = 0;
3763 begalt = b;
3764 /* If we've reached MAX_REGNUM groups, then this open
3765 won't actually generate any code, so we'll have to
3766 clear pending_exact explicitly. */
3767 pending_exact = 0;
3768 break;
3769
3770
3771 case ')':
3772 if (syntax & RE_NO_BK_PARENS) goto normal_backslash;
3773
3774 if (COMPILE_STACK_EMPTY)
3775 {
3776 if (syntax & RE_UNMATCHED_RIGHT_PAREN_ORD)
3777 goto normal_backslash;
3778 else
3779 FREE_STACK_RETURN (REG_ERPAREN);
3780 }
3781
3782 handle_close:
3783 if (fixup_alt_jump)
3784 { /* Push a dummy failure point at the end of the
3785 alternative for a possible future
3786 `pop_failure_jump' to pop. See comments at
3787 `push_dummy_failure' in `re_match_2'. */
3788 BUF_PUSH (push_dummy_failure);
3789
3790 /* We allocated space for this jump when we assigned
3791 to `fixup_alt_jump', in the `handle_alt' case below. */
3792 STORE_JUMP (jump_past_alt, fixup_alt_jump, b - 1);
3793 }
3794
3795 /* See similar code for backslashed left paren above. */
3796 if (COMPILE_STACK_EMPTY)
3797 {
3798 if (syntax & RE_UNMATCHED_RIGHT_PAREN_ORD)
3799 goto normal_char;
3800 else
3801 FREE_STACK_RETURN (REG_ERPAREN);
3802 }
3803
3804 /* Since we just checked for an empty stack above, this
3805 ``can't happen''. */
3806 assert (compile_stack.avail != 0);
3807 {
3808 /* We don't just want to restore into `regnum', because
3809 later groups should continue to be numbered higher,
3810 as in `(ab)c(de)' -- the second group is #2. */
3811 regnum_t this_group_regnum;
3812
3813 compile_stack.avail--;
3814 begalt = COMPILED_BUFFER_VAR + COMPILE_STACK_TOP.begalt_offset;
3815 fixup_alt_jump
3816 = COMPILE_STACK_TOP.fixup_alt_jump
3817 ? COMPILED_BUFFER_VAR + COMPILE_STACK_TOP.fixup_alt_jump - 1
3818 : 0;
3819 laststart = COMPILED_BUFFER_VAR + COMPILE_STACK_TOP.laststart_offset;
3820 this_group_regnum = COMPILE_STACK_TOP.regnum;
3821 /* If we've reached MAX_REGNUM groups, then this open
3822 won't actually generate any code, so we'll have to
3823 clear pending_exact explicitly. */
3824 pending_exact = 0;
3825
3826 /* We're at the end of the group, so now we know how many
3827 groups were inside this one. */
3828 if (this_group_regnum <= MAX_REGNUM)
3829 {
3830 UCHAR_T *inner_group_loc
3831 = COMPILED_BUFFER_VAR + COMPILE_STACK_TOP.inner_group_offset;
3832
3833 *inner_group_loc = regnum - this_group_regnum;
3834 BUF_PUSH_3 (stop_memory, this_group_regnum,
3835 regnum - this_group_regnum);
3836 }
3837 }
3838 break;
3839
3840
3841 case '|': /* `\|'. */
3842 if (syntax & RE_LIMITED_OPS || syntax & RE_NO_BK_VBAR)
3843 goto normal_backslash;
3844 handle_alt:
3845 if (syntax & RE_LIMITED_OPS)
3846 goto normal_char;
3847
3848 /* Insert before the previous alternative a jump which
3849 jumps to this alternative if the former fails. */
3850 GET_BUFFER_SPACE (1 + OFFSET_ADDRESS_SIZE);
3851 INSERT_JUMP (on_failure_jump, begalt,
3852 b + 2 + 2 * OFFSET_ADDRESS_SIZE);
3853 pending_exact = 0;
3854 b += 1 + OFFSET_ADDRESS_SIZE;
3855
3856 /* The alternative before this one has a jump after it
3857 which gets executed if it gets matched. Adjust that
3858 jump so it will jump to this alternative's analogous
3859 jump (put in below, which in turn will jump to the next
3860 (if any) alternative's such jump, etc.). The last such
3861 jump jumps to the correct final destination. A picture:
3862 _____ _____
3863 | | | |
3864 | v | v
3865 a | b | c
3866
3867 If we are at `b', then fixup_alt_jump right now points to a
3868 three-byte space after `a'. We'll put in the jump, set
3869 fixup_alt_jump to right after `b', and leave behind three
3870 bytes which we'll fill in when we get to after `c'. */
3871
3872 if (fixup_alt_jump)
3873 STORE_JUMP (jump_past_alt, fixup_alt_jump, b);
3874
3875 /* Mark and leave space for a jump after this alternative,
3876 to be filled in later either by next alternative or
3877 when we know we're at the end of a series of alternatives. */
3878 fixup_alt_jump = b;
3879 GET_BUFFER_SPACE (1 + OFFSET_ADDRESS_SIZE);
3880 b += 1 + OFFSET_ADDRESS_SIZE;
3881
3882 laststart = 0;
3883 begalt = b;
3884 break;
3885
3886
3887 case '{':
3888 /* If \{ is a literal. */
3889 if (!(syntax & RE_INTERVALS)
3890 /* If we're at `\{' and it's not the open-interval
3891 operator. */
3892 || (syntax & RE_NO_BK_BRACES))
3893 goto normal_backslash;
3894
3895 handle_interval:
3896 {
3897 /* If got here, then the syntax allows intervals. */
3898
3899 /* At least (most) this many matches must be made. */
3900 int lower_bound = -1, upper_bound = -1;
3901
3902 /* Place in the uncompiled pattern (i.e., just after
3903 the '{') to go back to if the interval is invalid. */
3904 const CHAR_T *beg_interval = p;
3905
3906 if (p == pend)
3907 goto invalid_interval;
3908
3909 GET_UNSIGNED_NUMBER (lower_bound);
3910
3911 if (c == ',')
3912 {
3913 GET_UNSIGNED_NUMBER (upper_bound);
3914 if (upper_bound < 0)
3915 upper_bound = RE_DUP_MAX;
3916 }
3917 else
3918 /* Interval such as `{1}' => match exactly once. */
3919 upper_bound = lower_bound;
3920
3921 if (! (0 <= lower_bound && lower_bound <= upper_bound))
3922 goto invalid_interval;
3923
3924 if (!(syntax & RE_NO_BK_BRACES))
3925 {
3926 if (c != '\\' || p == pend)
3927 goto invalid_interval;
3928 PATFETCH (c);
3929 }
3930
3931 if (c != '}')
3932 goto invalid_interval;
3933
3934 /* If it's invalid to have no preceding re. */
3935 if (!laststart)
3936 {
3937 if (syntax & RE_CONTEXT_INVALID_OPS
3938 && !(syntax & RE_INVALID_INTERVAL_ORD))
3939 FREE_STACK_RETURN (REG_BADRPT);
3940 else if (syntax & RE_CONTEXT_INDEP_OPS)
3941 laststart = b;
3942 else
3943 goto unfetch_interval;
3944 }
3945
3946 /* We just parsed a valid interval. */
3947
3948 if (RE_DUP_MAX < upper_bound)
3949 FREE_STACK_RETURN (REG_BADBR);
3950
3951 /* If the upper bound is zero, don't want to succeed at
3952 all; jump from `laststart' to `b + 3', which will be
3953 the end of the buffer after we insert the jump. */
3954 /* ifdef WCHAR, 'b + 1 + OFFSET_ADDRESS_SIZE'
3955 instead of 'b + 3'. */
3956 if (upper_bound == 0)
3957 {
3958 GET_BUFFER_SPACE (1 + OFFSET_ADDRESS_SIZE);
3959 INSERT_JUMP (jump, laststart, b + 1
3960 + OFFSET_ADDRESS_SIZE);
3961 b += 1 + OFFSET_ADDRESS_SIZE;
3962 }
3963
3964 /* Otherwise, we have a nontrivial interval. When
3965 we're all done, the pattern will look like:
3966 set_number_at <jump count> <upper bound>
3967 set_number_at <succeed_n count> <lower bound>
3968 succeed_n <after jump addr> <succeed_n count>
3969 <body of loop>
3970 jump_n <succeed_n addr> <jump count>
3971 (The upper bound and `jump_n' are omitted if
3972 `upper_bound' is 1, though.) */
3973 else
3974 { /* If the upper bound is > 1, we need to insert
3975 more at the end of the loop. */
3976 unsigned nbytes = 2 + 4 * OFFSET_ADDRESS_SIZE +
3977 (upper_bound > 1) * (2 + 4 * OFFSET_ADDRESS_SIZE);
3978
3979 GET_BUFFER_SPACE (nbytes);
3980
3981 /* Initialize lower bound of the `succeed_n', even
3982 though it will be set during matching by its
3983 attendant `set_number_at' (inserted next),
3984 because `re_compile_fastmap' needs to know.
3985 Jump to the `jump_n' we might insert below. */
3986 INSERT_JUMP2 (succeed_n, laststart,
3987 b + 1 + 2 * OFFSET_ADDRESS_SIZE
3988 + (upper_bound > 1) * (1 + 2 * OFFSET_ADDRESS_SIZE)
3989 , lower_bound);
3990 b += 1 + 2 * OFFSET_ADDRESS_SIZE;
3991
3992 /* Code to initialize the lower bound. Insert
3993 before the `succeed_n'. The `5' is the last two
3994 bytes of this `set_number_at', plus 3 bytes of
3995 the following `succeed_n'. */
3996 /* ifdef WCHAR, The '1+2*OFFSET_ADDRESS_SIZE'
3997 is the 'set_number_at', plus '1+OFFSET_ADDRESS_SIZE'
3998 of the following `succeed_n'. */
3999 PREFIX(insert_op2) (set_number_at, laststart, 1
4000 + 2 * OFFSET_ADDRESS_SIZE, lower_bound, b);
4001 b += 1 + 2 * OFFSET_ADDRESS_SIZE;
4002
4003 if (upper_bound > 1)
4004 { /* More than one repetition is allowed, so
4005 append a backward jump to the `succeed_n'
4006 that starts this interval.
4007
4008 When we've reached this during matching,
4009 we'll have matched the interval once, so
4010 jump back only `upper_bound - 1' times. */
4011 STORE_JUMP2 (jump_n, b, laststart
4012 + 2 * OFFSET_ADDRESS_SIZE + 1,
4013 upper_bound - 1);
4014 b += 1 + 2 * OFFSET_ADDRESS_SIZE;
4015
4016 /* The location we want to set is the second
4017 parameter of the `jump_n'; that is `b-2' as
4018 an absolute address. `laststart' will be
4019 the `set_number_at' we're about to insert;
4020 `laststart+3' the number to set, the source
4021 for the relative address. But we are
4022 inserting into the middle of the pattern --
4023 so everything is getting moved up by 5.
4024 Conclusion: (b - 2) - (laststart + 3) + 5,
4025 i.e., b - laststart.
4026
4027 We insert this at the beginning of the loop
4028 so that if we fail during matching, we'll
4029 reinitialize the bounds. */
4030 PREFIX(insert_op2) (set_number_at, laststart,
4031 b - laststart,
4032 upper_bound - 1, b);
4033 b += 1 + 2 * OFFSET_ADDRESS_SIZE;
4034 }
4035 }
4036 pending_exact = 0;
4037 break;
4038
4039 invalid_interval:
4040 if (!(syntax & RE_INVALID_INTERVAL_ORD))
4041 FREE_STACK_RETURN (p == pend ? REG_EBRACE : REG_BADBR);
4042 unfetch_interval:
4043 /* Match the characters as literals. */
4044 p = beg_interval;
4045 c = '{';
4046 if (syntax & RE_NO_BK_BRACES)
4047 goto normal_char;
4048 else
4049 goto normal_backslash;
4050 }
4051
4052 #ifdef emacs
4053 /* There is no way to specify the before_dot and after_dot
4054 operators. rms says this is ok. --karl */
4055 case '=':
4056 BUF_PUSH (at_dot);
4057 break;
4058
4059 case 's':
4060 laststart = b;
4061 PATFETCH (c);
4062 BUF_PUSH_2 (syntaxspec, syntax_spec_code[c]);
4063 break;
4064
4065 case 'S':
4066 laststart = b;
4067 PATFETCH (c);
4068 BUF_PUSH_2 (notsyntaxspec, syntax_spec_code[c]);
4069 break;
4070 #endif /* emacs */
4071
4072
4073 case 'w':
4074 if (syntax & RE_NO_GNU_OPS)
4075 goto normal_char;
4076 laststart = b;
4077 BUF_PUSH (wordchar);
4078 break;
4079
4080
4081 case 'W':
4082 if (syntax & RE_NO_GNU_OPS)
4083 goto normal_char;
4084 laststart = b;
4085 BUF_PUSH (notwordchar);
4086 break;
4087
4088
4089 case '<':
4090 if (syntax & RE_NO_GNU_OPS)
4091 goto normal_char;
4092 BUF_PUSH (wordbeg);
4093 break;
4094
4095 case '>':
4096 if (syntax & RE_NO_GNU_OPS)
4097 goto normal_char;
4098 BUF_PUSH (wordend);
4099 break;
4100
4101 case 'b':
4102 if (syntax & RE_NO_GNU_OPS)
4103 goto normal_char;
4104 BUF_PUSH (wordbound);
4105 break;
4106
4107 case 'B':
4108 if (syntax & RE_NO_GNU_OPS)
4109 goto normal_char;
4110 BUF_PUSH (notwordbound);
4111 break;
4112
4113 case '`':
4114 if (syntax & RE_NO_GNU_OPS)
4115 goto normal_char;
4116 BUF_PUSH (begbuf);
4117 break;
4118
4119 case '\'':
4120 if (syntax & RE_NO_GNU_OPS)
4121 goto normal_char;
4122 BUF_PUSH (endbuf);
4123 break;
4124
4125 case '1': case '2': case '3': case '4': case '5':
4126 case '6': case '7': case '8': case '9':
4127 if (syntax & RE_NO_BK_REFS)
4128 goto normal_char;
4129
4130 c1 = c - '0';
4131
4132 if (c1 > regnum)
4133 FREE_STACK_RETURN (REG_ESUBREG);
4134
4135 /* Can't back reference to a subexpression if inside of it. */
4136 if (group_in_compile_stack (compile_stack, (regnum_t) c1))
4137 goto normal_char;
4138
4139 laststart = b;
4140 BUF_PUSH_2 (duplicate, c1);
4141 break;
4142
4143
4144 case '+':
4145 case '?':
4146 if (syntax & RE_BK_PLUS_QM)
4147 goto handle_plus;
4148 else
4149 goto normal_backslash;
4150
4151 default:
4152 normal_backslash:
4153 /* You might think it would be useful for \ to mean
4154 not to translate; but if we don't translate it
4155 it will never match anything. */
4156 c = TRANSLATE (c);
4157 goto normal_char;
4158 }
4159 break;
4160
4161
4162 default:
4163 /* Expects the character in `c'. */
4164 normal_char:
4165 /* If no exactn currently being built. */
4166 if (!pending_exact
4167 #ifdef WCHAR
4168 /* If last exactn handle binary(or character) and
4169 new exactn handle character(or binary). */
4170 || is_exactn_bin != is_binary[p - 1 - pattern]
4171 #endif /* WCHAR */
4172
4173 /* If last exactn not at current position. */
4174 || pending_exact + *pending_exact + 1 != b
4175
4176 /* We have only one byte following the exactn for the count. */
4177 || *pending_exact == (1 << BYTEWIDTH) - 1
4178
4179 /* If followed by a repetition operator. */
4180 || *p == '*' || *p == '^'
4181 || ((syntax & RE_BK_PLUS_QM)
4182 ? *p == '\\' && (p[1] == '+' || p[1] == '?')
4183 : (*p == '+' || *p == '?'))
4184 || ((syntax & RE_INTERVALS)
4185 && ((syntax & RE_NO_BK_BRACES)
4186 ? *p == '{'
4187 : (p[0] == '\\' && p[1] == '{'))))
4188 {
4189 /* Start building a new exactn. */
4190
4191 laststart = b;
4192
4193 #ifdef WCHAR
4194 /* Is this exactn binary data or character? */
4195 is_exactn_bin = is_binary[p - 1 - pattern];
4196 if (is_exactn_bin)
4197 BUF_PUSH_2 (exactn_bin, 0);
4198 else
4199 BUF_PUSH_2 (exactn, 0);
4200 #else
4201 BUF_PUSH_2 (exactn, 0);
4202 #endif /* WCHAR */
4203 pending_exact = b - 1;
4204 }
4205
4206 BUF_PUSH (c);
4207 (*pending_exact)++;
4208 break;
4209 } /* switch (c) */
4210 } /* while p != pend */
4211
4212
4213 /* Through the pattern now. */
4214
4215 if (fixup_alt_jump)
4216 STORE_JUMP (jump_past_alt, fixup_alt_jump, b);
4217
4218 if (!COMPILE_STACK_EMPTY)
4219 FREE_STACK_RETURN (REG_EPAREN);
4220
4221 /* If we don't want backtracking, force success
4222 the first time we reach the end of the compiled pattern. */
4223 if (syntax & RE_NO_POSIX_BACKTRACKING)
4224 BUF_PUSH (succeed);
4225
4226 #ifdef WCHAR
4227 free (pattern);
4228 free (mbs_offset);
4229 free (is_binary);
4230 #endif
4231 free (compile_stack.stack);
4232
4233 /* We have succeeded; set the length of the buffer. */
4234 #ifdef WCHAR
4235 bufp->used = (uintptr_t) b - (uintptr_t) COMPILED_BUFFER_VAR;
4236 #else
4237 bufp->used = b - bufp->buffer;
4238 #endif
4239
4240 #ifdef DEBUG
4241 if (debug)
4242 {
4243 DEBUG_PRINT1 ("\nCompiled pattern: \n");
4244 PREFIX(print_compiled_pattern) (bufp);
4245 }
4246 #endif /* DEBUG */
4247
4248 #ifndef MATCH_MAY_ALLOCATE
4249 /* Initialize the failure stack to the largest possible stack. This
4250 isn't necessary unless we're trying to avoid calling alloca in
4251 the search and match routines. */
4252 {
4253 int num_regs = bufp->re_nsub + 1;
4254
4255 /* Since DOUBLE_FAIL_STACK refuses to double only if the current size
4256 is strictly greater than re_max_failures, the largest possible stack
4257 is 2 * re_max_failures failure points. */
4258 if (fail_stack.size < (2 * re_max_failures * MAX_FAILURE_ITEMS))
4259 {
4260 fail_stack.size = (2 * re_max_failures * MAX_FAILURE_ITEMS);
4261
4262 # ifdef emacs
4263 if (! fail_stack.stack)
4264 fail_stack.stack
4265 = (PREFIX(fail_stack_elt_t) *) xmalloc (fail_stack.size
4266 * sizeof (PREFIX(fail_stack_elt_t)));
4267 else
4268 fail_stack.stack
4269 = (PREFIX(fail_stack_elt_t) *) xrealloc (fail_stack.stack,
4270 (fail_stack.size
4271 * sizeof (PREFIX(fail_stack_elt_t))));
4272 # else /* not emacs */
4273 if (! fail_stack.stack)
4274 fail_stack.stack
4275 = (PREFIX(fail_stack_elt_t) *) malloc (fail_stack.size
4276 * sizeof (PREFIX(fail_stack_elt_t)));
4277 else
4278 fail_stack.stack
4279 = (PREFIX(fail_stack_elt_t) *) realloc (fail_stack.stack,
4280 (fail_stack.size
4281 * sizeof (PREFIX(fail_stack_elt_t))));
4282 # endif /* not emacs */
4283 }
4284
4285 PREFIX(regex_grow_registers) (num_regs);
4286 }
4287 #endif /* not MATCH_MAY_ALLOCATE */
4288
4289 return REG_NOERROR;
4290 } /* regex_compile */
4291
4292 /* Subroutines for `regex_compile'. */
4293
4294 /* Store OP at LOC followed by two-byte integer parameter ARG. */
4295 /* ifdef WCHAR, integer parameter is 1 wchar_t. */
4296
4297 static void
4298 PREFIX(store_op1) (op, loc, arg)
4299 re_opcode_t op;
4300 UCHAR_T *loc;
4301 int arg;
4302 {
4303 *loc = (UCHAR_T) op;
4304 STORE_NUMBER (loc + 1, arg);
4305 }
4306
4307
4308 /* Like `store_op1', but for two two-byte parameters ARG1 and ARG2. */
4309 /* ifdef WCHAR, integer parameter is 1 wchar_t. */
4310
4311 static void
4312 PREFIX(store_op2) (op, loc, arg1, arg2)
4313 re_opcode_t op;
4314 UCHAR_T *loc;
4315 int arg1, arg2;
4316 {
4317 *loc = (UCHAR_T) op;
4318 STORE_NUMBER (loc + 1, arg1);
4319 STORE_NUMBER (loc + 1 + OFFSET_ADDRESS_SIZE, arg2);
4320 }
4321
4322
4323 /* Copy the bytes from LOC to END to open up three bytes of space at LOC
4324 for OP followed by two-byte integer parameter ARG. */
4325 /* ifdef WCHAR, integer parameter is 1 wchar_t. */
4326
4327 static void
4328 PREFIX(insert_op1) (op, loc, arg, end)
4329 re_opcode_t op;
4330 UCHAR_T *loc;
4331 int arg;
4332 UCHAR_T *end;
4333 {
4334 register UCHAR_T *pfrom = end;
4335 register UCHAR_T *pto = end + 1 + OFFSET_ADDRESS_SIZE;
4336
4337 while (pfrom != loc)
4338 *--pto = *--pfrom;
4339
4340 PREFIX(store_op1) (op, loc, arg);
4341 }
4342
4343
4344 /* Like `insert_op1', but for two two-byte parameters ARG1 and ARG2. */
4345 /* ifdef WCHAR, integer parameter is 1 wchar_t. */
4346
4347 static void
4348 PREFIX(insert_op2) (op, loc, arg1, arg2, end)
4349 re_opcode_t op;
4350 UCHAR_T *loc;
4351 int arg1, arg2;
4352 UCHAR_T *end;
4353 {
4354 register UCHAR_T *pfrom = end;
4355 register UCHAR_T *pto = end + 1 + 2 * OFFSET_ADDRESS_SIZE;
4356
4357 while (pfrom != loc)
4358 *--pto = *--pfrom;
4359
4360 PREFIX(store_op2) (op, loc, arg1, arg2);
4361 }
4362
4363
4364 /* P points to just after a ^ in PATTERN. Return true if that ^ comes
4365 after an alternative or a begin-subexpression. We assume there is at
4366 least one character before the ^. */
4367
4368 static boolean
4369 PREFIX(at_begline_loc_p) (pattern, p, syntax)
4370 const CHAR_T *pattern, *p;
4371 reg_syntax_t syntax;
4372 {
4373 const CHAR_T *prev = p - 2;
4374 boolean prev_prev_backslash = prev > pattern && prev[-1] == '\\';
4375
4376 return
4377 /* After a subexpression? */
4378 (*prev == '(' && (syntax & RE_NO_BK_PARENS || prev_prev_backslash))
4379 /* After an alternative? */
4380 || (*prev == '|' && (syntax & RE_NO_BK_VBAR || prev_prev_backslash));
4381 }
4382
4383
4384 /* The dual of at_begline_loc_p. This one is for $. We assume there is
4385 at least one character after the $, i.e., `P < PEND'. */
4386
4387 static boolean
4388 PREFIX(at_endline_loc_p) (p, pend, syntax)
4389 const CHAR_T *p, *pend;
4390 reg_syntax_t syntax;
4391 {
4392 const CHAR_T *next = p;
4393 boolean next_backslash = *next == '\\';
4394 const CHAR_T *next_next = p + 1 < pend ? p + 1 : 0;
4395
4396 return
4397 /* Before a subexpression? */
4398 (syntax & RE_NO_BK_PARENS ? *next == ')'
4399 : next_backslash && next_next && *next_next == ')')
4400 /* Before an alternative? */
4401 || (syntax & RE_NO_BK_VBAR ? *next == '|'
4402 : next_backslash && next_next && *next_next == '|');
4403 }
4404
4405 #else /* not INSIDE_RECURSION */
4406
4407 /* Returns true if REGNUM is in one of COMPILE_STACK's elements and
4408 false if it's not. */
4409
4410 static boolean
4411 group_in_compile_stack (compile_stack, regnum)
4412 compile_stack_type compile_stack;
4413 regnum_t regnum;
4414 {
4415 int this_element;
4416
4417 for (this_element = compile_stack.avail - 1;
4418 this_element >= 0;
4419 this_element--)
4420 if (compile_stack.stack[this_element].regnum == regnum)
4421 return true;
4422
4423 return false;
4424 }
4425 #endif /* not INSIDE_RECURSION */
4426
4427 #ifdef INSIDE_RECURSION
4428
4429 #ifdef WCHAR
4430 /* This insert space, which size is "num", into the pattern at "loc".
4431 "end" must point the end of the allocated buffer. */
4432 static void
4433 insert_space (num, loc, end)
4434 int num;
4435 CHAR_T *loc;
4436 CHAR_T *end;
4437 {
4438 register CHAR_T *pto = end;
4439 register CHAR_T *pfrom = end - num;
4440
4441 while (pfrom >= loc)
4442 *pto-- = *pfrom--;
4443 }
4444 #endif /* WCHAR */
4445
4446 #ifdef WCHAR
4447 static reg_errcode_t
4448 wcs_compile_range (range_start_char, p_ptr, pend, translate, syntax, b,
4449 char_set)
4450 CHAR_T range_start_char;
4451 const CHAR_T **p_ptr, *pend;
4452 CHAR_T *char_set, *b;
4453 RE_TRANSLATE_TYPE translate;
4454 reg_syntax_t syntax;
4455 {
4456 const CHAR_T *p = *p_ptr;
4457 CHAR_T range_start, range_end;
4458 reg_errcode_t ret;
4459 # ifdef _LIBC
4460 uint32_t nrules;
4461 uint32_t start_val, end_val;
4462 # endif
4463 if (p == pend)
4464 return REG_ERANGE;
4465
4466 # ifdef _LIBC
4467 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
4468 if (nrules != 0)
4469 {
4470 const char *collseq = (const char *) _NL_CURRENT(LC_COLLATE,
4471 _NL_COLLATE_COLLSEQWC);
4472 const unsigned char *extra = (const unsigned char *)
4473 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
4474
4475 if (range_start_char < -1)
4476 {
4477 /* range_start is a collating symbol. */
4478 int32_t *wextra;
4479 /* Retreive the index and get collation sequence value. */
4480 wextra = (int32_t*)(extra + char_set[-range_start_char]);
4481 start_val = wextra[1 + *wextra];
4482 }
4483 else
4484 start_val = collseq_table_lookup(collseq, TRANSLATE(range_start_char));
4485
4486 end_val = collseq_table_lookup (collseq, TRANSLATE (p[0]));
4487
4488 /* Report an error if the range is empty and the syntax prohibits
4489 this. */
4490 ret = ((syntax & RE_NO_EMPTY_RANGES)
4491 && (start_val > end_val))? REG_ERANGE : REG_NOERROR;
4492
4493 /* Insert space to the end of the char_ranges. */
4494 insert_space(2, b - char_set[5] - 2, b - 1);
4495 *(b - char_set[5] - 2) = (wchar_t)start_val;
4496 *(b - char_set[5] - 1) = (wchar_t)end_val;
4497 char_set[4]++; /* ranges_index */
4498 }
4499 else
4500 # endif
4501 {
4502 range_start = (range_start_char >= 0)? TRANSLATE (range_start_char):
4503 range_start_char;
4504 range_end = TRANSLATE (p[0]);
4505 /* Report an error if the range is empty and the syntax prohibits
4506 this. */
4507 ret = ((syntax & RE_NO_EMPTY_RANGES)
4508 && (range_start > range_end))? REG_ERANGE : REG_NOERROR;
4509
4510 /* Insert space to the end of the char_ranges. */
4511 insert_space(2, b - char_set[5] - 2, b - 1);
4512 *(b - char_set[5] - 2) = range_start;
4513 *(b - char_set[5] - 1) = range_end;
4514 char_set[4]++; /* ranges_index */
4515 }
4516 /* Have to increment the pointer into the pattern string, so the
4517 caller isn't still at the ending character. */
4518 (*p_ptr)++;
4519
4520 return ret;
4521 }
4522 #else /* BYTE */
4523 /* Read the ending character of a range (in a bracket expression) from the
4524 uncompiled pattern *P_PTR (which ends at PEND). We assume the
4525 starting character is in `P[-2]'. (`P[-1]' is the character `-'.)
4526 Then we set the translation of all bits between the starting and
4527 ending characters (inclusive) in the compiled pattern B.
4528
4529 Return an error code.
4530
4531 We use these short variable names so we can use the same macros as
4532 `regex_compile' itself. */
4533
4534 static reg_errcode_t
4535 byte_compile_range (range_start_char, p_ptr, pend, translate, syntax, b)
4536 unsigned int range_start_char;
4537 const char **p_ptr, *pend;
4538 RE_TRANSLATE_TYPE translate;
4539 reg_syntax_t syntax;
4540 unsigned char *b;
4541 {
4542 unsigned this_char;
4543 const char *p = *p_ptr;
4544 reg_errcode_t ret;
4545 # if _LIBC
4546 const unsigned char *collseq;
4547 unsigned int start_colseq;
4548 unsigned int end_colseq;
4549 # else
4550 unsigned end_char;
4551 # endif
4552
4553 if (p == pend)
4554 return REG_ERANGE;
4555
4556 /* Have to increment the pointer into the pattern string, so the
4557 caller isn't still at the ending character. */
4558 (*p_ptr)++;
4559
4560 /* Report an error if the range is empty and the syntax prohibits this. */
4561 ret = syntax & RE_NO_EMPTY_RANGES ? REG_ERANGE : REG_NOERROR;
4562
4563 # if _LIBC
4564 collseq = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
4565 _NL_COLLATE_COLLSEQMB);
4566
4567 start_colseq = collseq[(unsigned char) TRANSLATE (range_start_char)];
4568 end_colseq = collseq[(unsigned char) TRANSLATE (p[0])];
4569 for (this_char = 0; this_char <= (unsigned char) -1; ++this_char)
4570 {
4571 unsigned int this_colseq = collseq[(unsigned char) TRANSLATE (this_char)];
4572
4573 if (start_colseq <= this_colseq && this_colseq <= end_colseq)
4574 {
4575 SET_LIST_BIT (TRANSLATE (this_char));
4576 ret = REG_NOERROR;
4577 }
4578 }
4579 # else
4580 /* Here we see why `this_char' has to be larger than an `unsigned
4581 char' -- we would otherwise go into an infinite loop, since all
4582 characters <= 0xff. */
4583 range_start_char = TRANSLATE (range_start_char);
4584 /* TRANSLATE(p[0]) is casted to char (not unsigned char) in TRANSLATE,
4585 and some compilers cast it to int implicitly, so following for_loop
4586 may fall to (almost) infinite loop.
4587 e.g. If translate[p[0]] = 0xff, end_char may equals to 0xffffffff.
4588 To avoid this, we cast p[0] to unsigned int and truncate it. */
4589 end_char = ((unsigned)TRANSLATE(p[0]) & ((1 << BYTEWIDTH) - 1));
4590
4591 for (this_char = range_start_char; this_char <= end_char; ++this_char)
4592 {
4593 SET_LIST_BIT (TRANSLATE (this_char));
4594 ret = REG_NOERROR;
4595 }
4596 # endif
4597
4598 return ret;
4599 }
4600 #endif /* WCHAR */
4601 \f
4602 /* re_compile_fastmap computes a ``fastmap'' for the compiled pattern in
4603 BUFP. A fastmap records which of the (1 << BYTEWIDTH) possible
4604 characters can start a string that matches the pattern. This fastmap
4605 is used by re_search to skip quickly over impossible starting points.
4606
4607 The caller must supply the address of a (1 << BYTEWIDTH)-byte data
4608 area as BUFP->fastmap.
4609
4610 We set the `fastmap', `fastmap_accurate', and `can_be_null' fields in
4611 the pattern buffer.
4612
4613 Returns 0 if we succeed, -2 if an internal error. */
4614
4615 #ifdef WCHAR
4616 /* local function for re_compile_fastmap.
4617 truncate wchar_t character to char. */
4618 static unsigned char truncate_wchar (CHAR_T c);
4619
4620 static unsigned char
4621 truncate_wchar (c)
4622 CHAR_T c;
4623 {
4624 unsigned char buf[MB_LEN_MAX];
4625 int retval = wctomb(buf, c);
4626 return retval > 0 ? buf[0] : (unsigned char)c;
4627 }
4628 #endif /* WCHAR */
4629
/* Worker for re_compile_fastmap, built once per character type through
   PREFIX.

   Clears BUFP->fastmap and then walks the compiled pattern in
   BUFP->buffer.  Each opcode that consumes a character marks the bytes
   that can start a match in the fastmap and ends the current path;
   alternatives announced by on_failure_jump are remembered on a local
   failure stack and examined in turn.  BUFP->can_be_null is set when a
   path reaches the end of the pattern without consuming a character,
   and also when no precise map is computed (e.g. for `duplicate').

   Returns 0 on success and -2 if an alternative cannot be pushed onto
   the failure stack.

   Several macros used below (INIT_FAIL_STACK, PUSH_PATTERN_OP,
   RESET_FAIL_STACK) operate on locals by name, so the declarations at
   the top must keep their names.  */
4630 static int
4631 PREFIX(re_compile_fastmap) (bufp)
4632 struct re_pattern_buffer *bufp;
4633 {
4634 int j, k;
4635 #ifdef MATCH_MAY_ALLOCATE
4636 PREFIX(fail_stack_type) fail_stack;
4637 #endif
4638 #ifndef REGEX_MALLOC
4639 char *destination;
4640 #endif
4641
4642 register char *fastmap = bufp->fastmap;
4643
4644 #ifdef WCHAR
4645 /* We need to cast pattern to (wchar_t*), because we casted this compiled
4646 pattern to (char*) in regex_compile. */
4647 UCHAR_T *pattern = (UCHAR_T*)bufp->buffer;
4648 register UCHAR_T *pend = (UCHAR_T*) (bufp->buffer + bufp->used);
4649 #else /* BYTE */
4650 UCHAR_T *pattern = bufp->buffer;
4651 register UCHAR_T *pend = pattern + bufp->used;
4652 #endif /* WCHAR */
4653 UCHAR_T *p = pattern;
4654
4655 #ifdef REL_ALLOC
4656 /* This holds the pointer to the failure stack, when
4657 it is allocated relocatably. */
4658 fail_stack_elt_t *failure_stack_ptr;
4659 #endif
4660
4661 /* Assume that each path through the pattern can be null until
4662 proven otherwise. We set this false at the bottom of switch
4663 statement, to which we get only if a particular path doesn't
4664 match the empty string. */
4665 boolean path_can_be_null = true;
4666
4667 /* We aren't doing a `succeed_n' to begin with. */
4668 boolean succeed_n_p = false;
4669
4670 assert (fastmap != NULL && p != NULL);
4671
4672 INIT_FAIL_STACK ();
4673 bzero (fastmap, 1 << BYTEWIDTH); /* Assume nothing's valid. */
4674 bufp->fastmap_accurate = 1; /* It will be when we're done. */
4675 bufp->can_be_null = 0;
4676
/* One iteration per opcode.  Cases that consume a character `break' out
   of the switch (ending this path); cases that match the empty string
   `continue' with the next opcode.  */
4677 while (1)
4678 {
4679 if (p == pend || *p == succeed)
4680 {
4681 /* We have reached the (effective) end of pattern. */
4682 if (!FAIL_STACK_EMPTY ())
4683 {
4684 bufp->can_be_null |= path_can_be_null;
4685
4686 /* Reset for next path. */
4687 path_can_be_null = true;
4688
/* Resume at the most recently remembered alternative.  */
4689 p = fail_stack.stack[--fail_stack.avail].pointer;
4690
4691 continue;
4692 }
4693 else
4694 break;
4695 }
4696
4697 /* We should never be about to go beyond the end of the pattern. */
4698 assert (p < pend);
4699
4700 switch (SWITCH_ENUM_CAST ((re_opcode_t) *p++))
4701 {
4702
4703 /* I guess the idea here is to simply not bother with a fastmap
4704 if a backreference is used, since it's too hard to figure out
4705 the fastmap for the corresponding group. Setting
4706 `can_be_null' stops `re_search_2' from using the fastmap, so
4707 that is all we do. */
4708 case duplicate:
4709 bufp->can_be_null = 1;
4710 goto done;
4711
4712
4713 /* Following are the cases which match a character. These end
4714 with `break'. */
4715
/* For exactn, P now points at the count cell, so p[1] is the first
   literal character of the run.  */
4716 #ifdef WCHAR
4717 case exactn:
4718 fastmap[truncate_wchar(p[1])] = 1;
4719 break;
4720 #else /* BYTE */
4721 case exactn:
4722 fastmap[p[1]] = 1;
4723 break;
4724 #endif /* WCHAR */
4725 #ifdef MBS_SUPPORT
4726 case exactn_bin:
4727 fastmap[p[1]] = 1;
4728 break;
4729 #endif
4730
4731 #ifdef WCHAR
4732 /* It is hard to distinguish fastmap from (multi byte) characters
4733 which depends on current locale. */
4734 case charset:
4735 case charset_not:
4736 case wordchar:
4737 case notwordchar:
4738 bufp->can_be_null = 1;
4739 goto done;
4740 #else /* BYTE */
/* *p is the bitmap length in bytes; the bitmap itself follows.  Copy
   every set bit into the fastmap.  */
4741 case charset:
4742 for (j = *p++ * BYTEWIDTH - 1; j >= 0; j--)
4743 if (p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH)))
4744 fastmap[j] = 1;
4745 break;
4746
4747
4748 case charset_not:
4749 /* Chars beyond end of map must be allowed. */
4750 for (j = *p * BYTEWIDTH; j < (1 << BYTEWIDTH); j++)
4751 fastmap[j] = 1;
4752
/* Within the bitmap, every clear bit is an allowed character.  */
4753 for (j = *p++ * BYTEWIDTH - 1; j >= 0; j--)
4754 if (!(p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH))))
4755 fastmap[j] = 1;
4756 break;
4757
4758
4759 case wordchar:
4760 for (j = 0; j < (1 << BYTEWIDTH); j++)
4761 if (SYNTAX (j) == Sword)
4762 fastmap[j] = 1;
4763 break;
4764
4765
4766 case notwordchar:
4767 for (j = 0; j < (1 << BYTEWIDTH); j++)
4768 if (SYNTAX (j) != Sword)
4769 fastmap[j] = 1;
4770 break;
4771 #endif /* WCHAR */
4772
4773 case anychar:
4774 {
/* Remember the newline entry so it can be restored below.  */
4775 int fastmap_newline = fastmap['\n'];
4776
4777 /* `.' matches anything ... */
4778 for (j = 0; j < (1 << BYTEWIDTH); j++)
4779 fastmap[j] = 1;
4780
4781 /* ... except perhaps newline. */
4782 if (!(bufp->syntax & RE_DOT_NEWLINE))
4783 fastmap['\n'] = fastmap_newline;
4784
4785 /* Return if we have already set `can_be_null'; if we have,
4786 then the fastmap is irrelevant. Something's wrong here. */
4787 else if (bufp->can_be_null)
4788 goto done;
4789
4790 /* Otherwise, have to check alternative paths. */
4791 break;
4792 }
4793
4794 #ifdef emacs
4795 case syntaxspec:
4796 k = *p++;
4797 for (j = 0; j < (1 << BYTEWIDTH); j++)
4798 if (SYNTAX (j) == (enum syntaxcode) k)
4799 fastmap[j] = 1;
4800 break;
4801
4802
4803 case notsyntaxspec:
4804 k = *p++;
4805 for (j = 0; j < (1 << BYTEWIDTH); j++)
4806 if (SYNTAX (j) != (enum syntaxcode) k)
4807 fastmap[j] = 1;
4808 break;
4809
4810
4811 /* All cases after this match the empty string. These end with
4812 `continue'. */
4813
4814
4815 case before_dot:
4816 case at_dot:
4817 case after_dot:
4818 continue;
4819 #endif /* emacs */
4820
4821
/* Zero-width opcodes without operands: nothing to record.  */
4822 case no_op:
4823 case begline:
4824 case endline:
4825 case begbuf:
4826 case endbuf:
4827 case wordbound:
4828 case notwordbound:
4829 case wordbeg:
4830 case wordend:
4831 case push_dummy_failure:
4832 continue;
4833
4834
4835 case jump_n:
4836 case pop_failure_jump:
4837 case maybe_pop_jump:
4838 case jump:
4839 case jump_past_alt:
4840 case dummy_failure_jump:
/* Follow the jump; forward jumps need no further treatment.  */
4841 EXTRACT_NUMBER_AND_INCR (j, p);
4842 p += j;
4843 if (j > 0)
4844 continue;
4845
4846 /* Jump backward implies we just went through the body of a
4847 loop and matched nothing. Opcode jumped to should be
4848 `on_failure_jump' or `succeed_n'. Just treat it like an
4849 ordinary jump. For a * loop, it has pushed its failure
4850 point already; if so, discard that as redundant. */
4851 if ((re_opcode_t) *p != on_failure_jump
4852 && (re_opcode_t) *p != succeed_n)
4853 continue;
4854
4855 p++;
4856 EXTRACT_NUMBER_AND_INCR (j, p);
4857 p += j;
4858
4859 /* If what's on the stack is where we are now, pop it. */
4860 if (!FAIL_STACK_EMPTY ()
4861 && fail_stack.stack[fail_stack.avail - 1].pointer == p)
4862 fail_stack.avail--;
4863
4864 continue;
4865
4866
4867 case on_failure_jump:
4868 case on_failure_keep_string_jump:
/* Also entered by `goto' from the succeed_n case below.  */
4869 handle_on_failure_jump:
4870 EXTRACT_NUMBER_AND_INCR (j, p);
4871
4872 /* For some patterns, e.g., `(a?)?', `p+j' here points to the
4873 end of the pattern. We don't want to push such a point,
4874 since when we restore it above, entering the switch will
4875 increment `p' past the end of the pattern. We don't need
4876 to push such a point since we obviously won't find any more
4877 fastmap entries beyond `pend'. Such a pattern can match
4878 the null string, though. */
4879 if (p + j < pend)
4880 {
4881 if (!PUSH_PATTERN_OP (p + j, fail_stack))
4882 {
4883 RESET_FAIL_STACK ();
4884 return -2;
4885 }
4886 }
4887 else
4888 bufp->can_be_null = 1;
4889
4890 if (succeed_n_p)
4891 {
4892 EXTRACT_NUMBER_AND_INCR (k, p); /* Skip the n. */
4893 succeed_n_p = false;
4894 }
4895
4896 continue;
4897
4898
4899 case succeed_n:
4900 /* Get to the number of times to succeed. */
4901 p += OFFSET_ADDRESS_SIZE;
4902
4903 /* Increment p past the n for when k != 0. */
4904 EXTRACT_NUMBER_AND_INCR (k, p);
4905 if (k == 0)
4906 {
/* A count of zero is handled like on_failure_jump: rewind P to the jump
   operand and let the shared code above process it; succeed_n_p tells
   that code to step over the count afterwards.  */
4907 p -= 2 * OFFSET_ADDRESS_SIZE;
4908 succeed_n_p = true; /* Spaghetti code alert. */
4909 goto handle_on_failure_jump;
4910 }
4911 continue;
4912
4913
/* Skip the two operands of set_number_at.  */
4914 case set_number_at:
4915 p += 2 * OFFSET_ADDRESS_SIZE;
4916 continue;
4917
4918
/* Skip the two operand cells that follow these opcodes.  */
4919 case start_memory:
4920 case stop_memory:
4921 p += 2;
4922 continue;
4923
4924
4925 default:
4926 abort (); /* We have listed all the cases. */
4927 } /* switch *p++ */
4928
4929 /* Getting here means we have found the possible starting
4930 characters for one path of the pattern -- and that the empty
4931 string does not match. We need not follow this path further.
4932 Instead, look at the next alternative (remembered on the
4933 stack), or quit if no more. The test at the top of the loop
4934 does these things. */
4935 path_can_be_null = false;
4936 p = pend;
4937 } /* while p */
4938
4939 /* Set `can_be_null' for the last path (also the first path, if the
4940 pattern is empty). */
4941 bufp->can_be_null |= path_can_be_null;
4942
4943 done:
4944 RESET_FAIL_STACK ();
4945 return 0;
4946 }
4947
4948 #else /* not INSIDE_RECURSION */
4949
/* Public entry point: compute the fastmap for BUFP by handing it to the
   wide-character worker when multibyte support is compiled in and the
   current locale is multibyte (MB_CUR_MAX != 1), and to the byte worker
   otherwise.  The worker's return value is passed through unchanged.  */
int
re_compile_fastmap (bufp)
     struct re_pattern_buffer *bufp;
{
# ifdef MBS_SUPPORT
  if (MB_CUR_MAX != 1)
    return wcs_re_compile_fastmap (bufp);
# endif

  return byte_re_compile_fastmap (bufp);
} /* re_compile_fastmap */
4961 #ifdef _LIBC
4962 weak_alias (__re_compile_fastmap, re_compile_fastmap)
4963 #endif
4964 \f
4965
4966 /* Set REGS to hold NUM_REGS registers, storing them in STARTS and
4967 ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use
4968 this memory for recording register information. STARTS and ENDS
4969 must be allocated using the malloc library routine, and must each
4970 be at least NUM_REGS * sizeof (regoff_t) bytes long.
4971
4972 If NUM_REGS == 0, then subsequent matches should allocate their own
4973 register data.
4974
4975 Unless this function is called, the first search or match using
4976 PATTERN_BUFFER will allocate its own register data, without
4977 freeing the old data. */
4978
4979 void
4980 re_set_registers (bufp, regs, num_regs, starts, ends)
4981 struct re_pattern_buffer *bufp;
4982 struct re_registers *regs;
4983 unsigned num_regs;
4984 regoff_t *starts, *ends;
4985 {
4986 if (num_regs)
4987 {
4988 bufp->regs_allocated = REGS_REALLOCATE;
4989 regs->num_regs = num_regs;
4990 regs->start = starts;
4991 regs->end = ends;
4992 }
4993 else
4994 {
4995 bufp->regs_allocated = REGS_UNALLOCATED;
4996 regs->num_regs = 0;
4997 regs->start = regs->end = (regoff_t *) 0;
4998 }
4999 }
5000 #ifdef _LIBC
5001 weak_alias (__re_set_registers, re_set_registers)
5002 #endif
5003 \f
5004 /* Searching routines. */
5005
/* Single-string convenience form of re_search_2: search STRING (SIZE
   bytes) for BUFP, starting at STARTPOS and scanning RANGE positions.
   No first string is passed, and the stop index is SIZE itself, so the
   caller cannot limit where matching ends.  Returns whatever
   re_search_2 returns.  */

int
re_search (bufp, string, size, startpos, range, regs)
     struct re_pattern_buffer *bufp;
     const char *string;
     int size, startpos, range;
     struct re_registers *regs;
{
  /* Matching may run to the very end of STRING.  */
  const int stop = size;

  return re_search_2 (bufp, NULL, 0, string, size, startpos, range,
		      regs, stop);
}
5019 #ifdef _LIBC
5020 weak_alias (__re_search, re_search)
5021 #endif
5022
5023
/* Search the virtual concatenation of STRING1 (SIZE1 bytes) and STRING2
   (SIZE2 bytes) for the compiled pattern in BUFP->buffer.

   A match is attempted first at index STARTPOS, then at STARTPOS + 1,
   and so on.  RANGE says how far to scan: RANGE = 0 tries only
   STARTPOS, and in general the last start position tried is
   STARTPOS + RANGE.

   REGS receives the indices, relative to the virtual concatenation, of
   the text matched by the whole pattern and by its subexpressions.

   Matching never considers text beyond index STOP of the virtual
   concatenation.

   The return value is the position at which the match was found, -1 if
   there is no match, or -2 on error (such as failure stack overflow).

   This public entry point only selects the worker: the wide-character
   one when multibyte support is compiled in and the current locale is
   multibyte (MB_CUR_MAX != 1), the byte one otherwise.  */

int
re_search_2 (bufp, string1, size1, string2, size2, startpos, range, regs, stop)
     struct re_pattern_buffer *bufp;
     const char *string1, *string2;
     int size1, size2;
     int startpos;
     int range;
     struct re_registers *regs;
     int stop;
{
# ifdef MBS_SUPPORT
  if (MB_CUR_MAX != 1)
    return wcs_re_search_2 (bufp, string1, size1, string2, size2, startpos,
			    range, regs, stop);
# endif

  return byte_re_search_2 (bufp, string1, size1, string2, size2, startpos,
			   range, regs, stop);
} /* re_search_2 */
5064 #ifdef _LIBC
5065 weak_alias (__re_search_2, re_search_2)
5066 #endif
5067
5068 #endif /* not INSIDE_RECURSION */
5069
5070 #ifdef INSIDE_RECURSION
5071
/* FREE_VAR (VAR): release the storage VAR points to, if any, and reset
   VAR to NULL.  When the matcher may allocate, the release goes through
   REGEX_FREE; otherwise plain `free' is used.

   The body is wrapped in do { ... } while (0) so that the macro expands
   to exactly one statement: without the wrapper, `if (c) FREE_VAR (x);'
   would reset X to NULL even when C is false.  VAR is evaluated more
   than once, so it must be a side-effect-free lvalue.  */
#ifdef MATCH_MAY_ALLOCATE
# define FREE_VAR(var)							\
  do {									\
    if (var)								\
      REGEX_FREE (var);							\
    var = NULL;								\
  } while (0)
#else
# define FREE_VAR(var)							\
  do {									\
    if (var)								\
      free (var);							\
    var = NULL;								\
  } while (0)
#endif
5077
#ifdef WCHAR
/* Release the temporary wide-character copies and offset tables that
   the search routine below allocates: it expects locals named
   wcs_string1, wcs_string2, mbs_offset1 and mbs_offset2 to be in scope.

   This must name the wcs_* buffers, not `string1'/`string2': in the
   search routine those are the caller's input strings, which must not
   be released here, while the wide copies would never be freed.  */
# define FREE_WCS_BUFFERS() \
  do { \
    FREE_VAR (wcs_string1); \
    FREE_VAR (wcs_string2); \
    FREE_VAR (mbs_offset1); \
    FREE_VAR (mbs_offset2); \
  } while (0)

#endif
5088
/* Worker for re_search_2, built once per character type through PREFIX.

   Tries to match BUFP against the virtual concatenation of STRING1
   (SIZE1 bytes) and STRING2 (SIZE2 bytes) at successive start
   positions, beginning at STARTPOS and moving forwards (RANGE > 0) or
   backwards (RANGE < 0) until RANGE is used up.  REGS and STOP are
   handed to the match routine unchanged.  When BUFP has a fastmap and
   the pattern cannot match the empty string, positions whose first
   character cannot start a match are skipped without calling the
   matcher.

   Returns the start position of the first match found; -1 if there is
   none or STARTPOS lies outside the strings; -2 on an internal error
   (fastmap computation failed, a buffer could not be allocated, or the
   matcher itself returned -2).

   `translate' must keep its name because the TRANSLATE macro refers to
   it; FREE_WCS_BUFFERS likewise refers to locals by name.  */
5089 static int
5090 PREFIX(re_search_2) (bufp, string1, size1, string2, size2, startpos, range,
5091 regs, stop)
5092 struct re_pattern_buffer *bufp;
5093 const char *string1, *string2;
5094 int size1, size2;
5095 int startpos;
5096 int range;
5097 struct re_registers *regs;
5098 int stop;
5099 {
5100 int val;
5101 register char *fastmap = bufp->fastmap;
5102 register RE_TRANSLATE_TYPE translate = bufp->translate;
5103 int total_size = size1 + size2;
5104 int endpos = startpos + range;
5105 #ifdef WCHAR
5106 /* We need wchar_t* buffers correspond to cstring1, cstring2. */
5107 wchar_t *wcs_string1 = NULL, *wcs_string2 = NULL;
5108 /* We need the size of wchar_t buffers correspond to csize1, csize2. */
5109 int wcs_size1 = 0, wcs_size2 = 0;
5110 /* offset buffer for optimization. See convert_mbs_to_wcs. */
5111 int *mbs_offset1 = NULL, *mbs_offset2 = NULL;
5112 /* They hold whether each wchar_t is binary data or not. */
5113 char *is_binary = NULL;
5114 #endif /* WCHAR */
5115
5116 /* Check for out-of-range STARTPOS. */
5117 if (startpos < 0 || startpos > total_size)
5118 return -1;
5119
5120 /* Fix up RANGE if it might eventually take us outside
5121 the virtual concatenation of STRING1 and STRING2.
5122 Make sure we won't move STARTPOS below 0 or above TOTAL_SIZE. */
5123 if (endpos < 0)
5124 range = 0 - startpos;
5125 else if (endpos > total_size)
5126 range = total_size - startpos;
5127
5128 /* If the search isn't to be a backwards one, don't waste time in a
5129 search for a pattern that must be anchored. */
5130 if (bufp->used > 0 && range > 0
5131 && ((re_opcode_t) bufp->buffer[0] == begbuf
5132 /* `begline' is like `begbuf' if it cannot match at newlines. */
5133 || ((re_opcode_t) bufp->buffer[0] == begline
5134 && !bufp->newline_anchor)))
5135 {
5136 if (startpos > 0)
5137 return -1;
5138 else
5139 range = 1;
5140 }
5141
5142 #ifdef emacs
5143 /* In a forward search for something that starts with \=.
5144 don't keep searching past point. */
5145 if (bufp->used > 0 && (re_opcode_t) bufp->buffer[0] == at_dot && range > 0)
5146 {
5147 range = PT - startpos;
5148 if (range <= 0)
5149 return -1;
5150 }
5151 #endif /* emacs */
5152
5153 /* Update the fastmap now if not correct already. */
5154 if (fastmap && !bufp->fastmap_accurate)
5155 if (re_compile_fastmap (bufp) == -2)
5156 return -2;
5157
5158 #ifdef WCHAR
5159 /* Allocate wchar_t array for wcs_string1 and wcs_string2 and
5160 fill them with converted string. */
5161 if (size1 != 0)
5162 {
5163 wcs_string1 = REGEX_TALLOC (size1 + 1, CHAR_T);
5164 mbs_offset1 = REGEX_TALLOC (size1 + 1, int);
5165 is_binary = REGEX_TALLOC (size1 + 1, char);
5166 if (!wcs_string1 || !mbs_offset1 || !is_binary)
5167 {
5168 FREE_VAR (wcs_string1);
5169 FREE_VAR (mbs_offset1);
5170 FREE_VAR (is_binary);
5171 return -2;
5172 }
5173 wcs_size1 = convert_mbs_to_wcs(wcs_string1, string1, size1,
5174 mbs_offset1, is_binary);
5175 wcs_string1[wcs_size1] = L'\0'; /* for a sentinel */
/* The binary flags are only needed during the conversion.  */
5176 FREE_VAR (is_binary);
5177 }
5178 if (size2 != 0)
5179 {
5180 wcs_string2 = REGEX_TALLOC (size2 + 1, CHAR_T);
5181 mbs_offset2 = REGEX_TALLOC (size2 + 1, int);
5182 is_binary = REGEX_TALLOC (size2 + 1, char);
5183 if (!wcs_string2 || !mbs_offset2 || !is_binary)
5184 {
5185 FREE_WCS_BUFFERS ();
5186 FREE_VAR (is_binary);
5187 return -2;
5188 }
5189 wcs_size2 = convert_mbs_to_wcs(wcs_string2, string2, size2,
5190 mbs_offset2, is_binary);
5191 wcs_string2[wcs_size2] = L'\0'; /* for a sentinel */
5192 FREE_VAR (is_binary);
5193 }
5194 #endif /* WCHAR */
5195
5196
5197 /* Loop through the string, looking for a place to start matching. */
5198 for (;;)
5199 {
5200 /* If a fastmap is supplied, skip quickly over characters that
5201 cannot be the start of a match. If the pattern can match the
5202 null string, however, we don't need to skip characters; we want
5203 the first null string. */
5204 if (fastmap && startpos < total_size && !bufp->can_be_null)
5205 {
5206 if (range > 0) /* Searching forwards. */
5207 {
5208 register const char *d;
5209 register int lim = 0;
5210 int irange = range;
5211
/* When the scan starts in STRING1 and could run past its end, LIM is
   the part of RANGE left over at that boundary; the skip loops below
   stop there, so D never leaves the string it started in.  */
5212 if (startpos < size1 && startpos + range >= size1)
5213 lim = range - (size1 - startpos);
5214
5215 d = (startpos >= size1 ? string2 - size1 : string1) + startpos;
5216
5217 /* Written out as an if-else to avoid testing `translate'
5218 inside the loop. */
5219 if (translate)
5220 while (range > lim
5221 && !fastmap[(unsigned char)
5222 translate[(unsigned char) *d++]])
5223 range--;
5224 else
5225 while (range > lim && !fastmap[(unsigned char) *d++])
5226 range--;
5227
/* Advance STARTPOS by the number of positions skipped.  */
5228 startpos += irange - range;
5229 }
5230 else /* Searching backwards. */
5231 {
5232 register CHAR_T c = (size1 == 0 || startpos >= size1
5233 ? string2[startpos - size1]
5234 : string1[startpos]);
5235
5236 if (!fastmap[(unsigned char) TRANSLATE (c)])
5237 goto advance;
5238 }
5239 }
5240
5241 /* If can't match the null string, and that's all we have left, fail. */
5242 if (range >= 0 && startpos == total_size && fastmap
5243 && !bufp->can_be_null)
5244 {
5245 #ifdef WCHAR
5246 FREE_WCS_BUFFERS ();
5247 #endif
5248 return -1;
5249 }
5250
/* Attempt a match anchored at STARTPOS.  */
5251 #ifdef WCHAR
5252 val = wcs_re_match_2_internal (bufp, string1, size1, string2,
5253 size2, startpos, regs, stop,
5254 wcs_string1, wcs_size1,
5255 wcs_string2, wcs_size2,
5256 mbs_offset1, mbs_offset2);
5257 #else /* BYTE */
5258 val = byte_re_match_2_internal (bufp, string1, size1, string2,
5259 size2, startpos, regs, stop);
5260 #endif /* BYTE */
5261
5262 #ifndef REGEX_MALLOC
5263 # ifdef C_ALLOCA
5264 alloca (0);
5265 # endif
5266 #endif
5267
/* A non-negative VAL means the pattern matched here; the search reports
   the start position, not VAL itself.  */
5268 if (val >= 0)
5269 {
5270 #ifdef WCHAR
5271 FREE_WCS_BUFFERS ();
5272 #endif
5273 return startpos;
5274 }
5275
/* -2 is the matcher's error code; pass it on.  */
5276 if (val == -2)
5277 {
5278 #ifdef WCHAR
5279 FREE_WCS_BUFFERS ();
5280 #endif
5281 return -2;
5282 }
5283
/* No match at this position: step one position in the search direction,
   or give up once RANGE is exhausted.  */
5284 advance:
5285 if (!range)
5286 break;
5287 else if (range > 0)
5288 {
5289 range--;
5290 startpos++;
5291 }
5292 else
5293 {
5294 range++;
5295 startpos--;
5296 }
5297 }
5298 #ifdef WCHAR
5299 FREE_WCS_BUFFERS ();
5300 #endif
5301 return -1;
5302 }
5303
5304 #ifdef WCHAR
5305 /* Convert PTR, a pointer into one of the wide-character search strings
   `string1' or `string2', into a multibyte (byte) offset measured from
   the beginning of the virtual concatenation of the two strings.

   The offset is looked up in `mbs_offset1' / `mbs_offset2', indexed by
   the wide-character position of PTR within its string (see
   convert_mbs_to_wcs).  When the relevant table is NULL the looked-up
   part is taken to be 0.  For a pointer into the second string the byte
   length of the first string, `csize1', is added.

   The expansion refers to `string1', `string2', `mbs_offset1',
   `mbs_offset2' and `csize1', and to FIRST_STRING_P, so it can only be
   used where those names are in scope.  PTR is evaluated more than
   once. */
5309 # define POINTER_TO_OFFSET(ptr) \
5310 (FIRST_STRING_P (ptr) \
5311 ? ((regoff_t)(mbs_offset1 != NULL? mbs_offset1[(ptr)-string1] : 0)) \
5312 : ((regoff_t)((mbs_offset2 != NULL? mbs_offset2[(ptr)-string2] : 0) \
5313 + csize1)))
5314 #else /* BYTE */
5315 /* Convert PTR, a pointer into one of the search strings `string1' or
   `string2', into an offset from the beginning of the virtual
   concatenation of the two: a pointer into `string2' has `size1' added
   to its distance from `string2'.  The expansion refers to `string1',
   `string2' and `size1' from the enclosing scope, and evaluates PTR
   more than once. */
5317 # define POINTER_TO_OFFSET(ptr) \
5318 (FIRST_STRING_P (ptr) \
5319 ? ((regoff_t) ((ptr) - string1)) \
5320 : ((regoff_t) ((ptr) - string2 + size1)))
5321 #endif /* WCHAR */
5322
5323 /* Macros for dealing with the split strings in re_match_2.  They expand
   to code that refers directly to variables of the matching function
   (`d', `dend', `string1', `string2', `size1', `size2', `end2',
   `end_match_1', `end_match_2'), so they are only usable where those
   names are in scope. */
5324
/* Nonzero while the current data segment ends at `end_match_1', i.e.
   while `dend' has not yet been switched over to `end_match_2'. */
5325 #define MATCHING_IN_FIRST_STRING (dend == end_match_1)
5326
5327 /* Call before fetching a character with *d.  If `d' has reached `dend',
   this either jumps to the label `fail' (when `dend' is already the end
   of the second string) or moves `d' to the start of `string2' and sets
   `dend' to `end_match_2'.  It is a loop rather than a single test, so
   the check is repeated after the switch to `string2'.  The expansion is
   a bare `while' statement, not a `do { } while (0)' wrapper, and it
   requires a `fail' label in the enclosing function. */
5329 #define PREFETCH() \
5330 while (d == dend) \
5331 { \
5332 /* End of string2 => fail. */ \
5333 if (dend == end_match_2) \
5334 goto fail; \
5335 /* End of string1 => advance to string2. */ \
5336 d = string2; \
5337 dend = end_match_2; \
5338 }
5339
5340 /* Test if at very beginning or at very end of the virtual concatenation
   of `string1' and `string2'.  If only one string, it's `string2'.
   AT_STRINGS_BEG compares D with `string1' when `size1' is nonzero and
   with `string2' otherwise; it is also true whenever `size2' is zero,
   regardless of D.  AT_STRINGS_END compares D with `end2' only. */
5342 #define AT_STRINGS_BEG(d) ((d) == (size1 ? string1 : string2) || !size2)
5343 #define AT_STRINGS_END(d) ((d) == end2)
5344
5345
5346 /* Test if D points to a character which is word-constituent.  We have
   two special cases to check for: if D is just past the end of string1
   (D == end1), look at the first character in string2; and if D is just
   before the beginning of string2 (D == string2 - 1), look at the last
   character in string1.  Otherwise the character at D itself is tested.

   The expansion uses `end1' and `string2' from the enclosing scope and
   evaluates D several times, so D must not have side effects. */
5350 #ifdef WCHAR
5351 /* Use internationalized API instead of SYNTAX: a wide character is
   word-constituent if iswalnum accepts it or if it is L'_'. */
5352 # define WORDCHAR_P(d) \
5353 (iswalnum ((wint_t)((d) == end1 ? *string2 \
5354 : (d) == string2 - 1 ? *(end1 - 1) : *(d))) != 0 \
5355 || ((d) == end1 ? *string2 \
5356 : (d) == string2 - 1 ? *(end1 - 1) : *(d)) == L'_')
5357 #else /* BYTE */
/* Byte version: the character is word-constituent if SYNTAX classifies
   it as Sword. */
5358 # define WORDCHAR_P(d) \
5359 (SYNTAX ((d) == end1 ? *string2 \
5360 : (d) == string2 - 1 ? *(end1 - 1) : *(d)) \
5361 == Sword)
5362 #endif /* WCHAR */
5363
5364 /* Disabled due to a compiler bug -- see comment at case wordbound.
   The definition is kept under `#if 0' for reference only and is never
   compiled. */
5365 #if 0
5366 /* Test if the character before D and the one at D differ with respect
   to being word-constituent.  Either end of the virtual concatenation
   also counts as a boundary. */
5368 #define AT_WORD_BOUNDARY(d) \
5369 (AT_STRINGS_BEG (d) || AT_STRINGS_END (d) \
5370 || WORDCHAR_P (d - 1) != WORDCHAR_P (d))
5371 #endif
5372
5373 /* Free everything we malloc.  FREE_VARIABLES is expanded on the exit
   paths of the matching function and has four variants, selected by
   MATCH_MAY_ALLOCATE and WCHAR:

   - MATCH_MAY_ALLOCATE: release the failure stack and the register
     vectors (regstart, regend, old_regstart, old_regend, best_regstart,
     best_regend, reg_info, reg_dummy, reg_info_dummy).
   - WCHAR: additionally release the wide-character copies of the input
     (string1, string2) and their offset tables (mbs_offset1,
     mbs_offset2), but only when `cant_free_wcs_buf' is zero.  The
     matching function clears that flag when it allocates those buffers
     itself instead of receiving them from its caller.
   - Neither: expand to a no-op expression.

   All the names used are variables of the enclosing function. */
5374 #ifdef MATCH_MAY_ALLOCATE
5375 # ifdef WCHAR
/* Registers and failure stack, plus the wide-character buffers if this
   function owns them. */
5376 # define FREE_VARIABLES() \
5377 do { \
5378 REGEX_FREE_STACK (fail_stack.stack); \
5379 FREE_VAR (regstart); \
5380 FREE_VAR (regend); \
5381 FREE_VAR (old_regstart); \
5382 FREE_VAR (old_regend); \
5383 FREE_VAR (best_regstart); \
5384 FREE_VAR (best_regend); \
5385 FREE_VAR (reg_info); \
5386 FREE_VAR (reg_dummy); \
5387 FREE_VAR (reg_info_dummy); \
5388 if (!cant_free_wcs_buf) \
5389 { \
5390 FREE_VAR (string1); \
5391 FREE_VAR (string2); \
5392 FREE_VAR (mbs_offset1); \
5393 FREE_VAR (mbs_offset2); \
5394 } \
5395 } while (0)
5396 # else /* BYTE */
/* Registers and failure stack only. */
5397 # define FREE_VARIABLES() \
5398 do { \
5399 REGEX_FREE_STACK (fail_stack.stack); \
5400 FREE_VAR (regstart); \
5401 FREE_VAR (regend); \
5402 FREE_VAR (old_regstart); \
5403 FREE_VAR (old_regend); \
5404 FREE_VAR (best_regstart); \
5405 FREE_VAR (best_regend); \
5406 FREE_VAR (reg_info); \
5407 FREE_VAR (reg_dummy); \
5408 FREE_VAR (reg_info_dummy); \
5409 } while (0)
5410 # endif /* WCHAR */
5411 #else
5412 # ifdef WCHAR
/* Only the wide-character buffers, and only if this function owns
   them. */
5413 # define FREE_VARIABLES() \
5414 do { \
5415 if (!cant_free_wcs_buf) \
5416 { \
5417 FREE_VAR (string1); \
5418 FREE_VAR (string2); \
5419 FREE_VAR (mbs_offset1); \
5420 FREE_VAR (mbs_offset2); \
5421 } \
5422 } while (0)
5423 # else /* BYTE */
5424 # define FREE_VARIABLES() ((void)0) /* Do nothing! But inhibit gcc warning. */
5425 # endif /* WCHAR */
5426 #endif /* not MATCH_MAY_ALLOCATE */
5427
5428 /* Sentinel values meaning "no register is currently active".  These
   values must meet several constraints.  They must not be valid
   register values; since we have a limit of 255 registers (because
   we use only one byte in the pattern for the register number), we can
   use numbers larger than 255.  They must differ by 1, because of
   NUM_FAILURE_ITEMS above.  And the value for the lowest register must
   be larger than the value for the highest register, so we do not try
   to actually save any registers when none are active.

   NO_HIGHEST_ACTIVE_REG is 1 << BYTEWIDTH (256 if BYTEWIDTH is 8 --
   TODO confirm against its definition elsewhere in this file) and
   NO_LOWEST_ACTIVE_REG is one more than that. */
5435 #define NO_HIGHEST_ACTIVE_REG (1 << BYTEWIDTH)
5436 #define NO_LOWEST_ACTIVE_REG (NO_HIGHEST_ACTIVE_REG + 1)
5437 \f
5438 #else /* not INSIDE_RECURSION */
5439 /* Matching routines. */
5440
5441 #ifndef emacs /* Emacs never uses this. */
/* Match the compiled pattern in BUFP against the single string STRING
   of length SIZE, starting at offset POS.  This is re_match_2 with an
   empty first string: STRING is handed to the internal matcher as the
   second string and SIZE is used as the stop position.  REGS is passed
   through unchanged, and the internal matcher's result is returned
   as is.  */

int
re_match (bufp, string, size, pos, regs)
     struct re_pattern_buffer *bufp;
     const char *string;
     int size, pos;
     struct re_registers *regs;
{
  int match_result;

# ifdef MBS_SUPPORT
  /* A locale whose MB_CUR_MAX is 1 uses the byte matcher.  Otherwise
     the wide-character matcher is called with NULL wide buffers, which
     makes it build them from STRING itself.  */
  if (MB_CUR_MAX == 1)
    match_result = byte_re_match_2_internal (bufp, NULL, 0, string, size,
					     pos, regs, size);
  else
    match_result = wcs_re_match_2_internal (bufp, NULL, 0, string, size,
					    pos, regs, size,
					    NULL, 0, NULL, 0, NULL, NULL);
# else
  match_result = byte_re_match_2_internal (bufp, NULL, 0, string, size,
					   pos, regs, size);
# endif

# if !defined REGEX_MALLOC && defined C_ALLOCA
  /* Force an alloca cleanup now that the internal matcher has
     returned.  */
  alloca (0);
# endif

  return match_result;
}
5468 # ifdef _LIBC
5469 weak_alias (__re_match, re_match)
5470 # endif
5471 #endif /* not emacs */
5472
5473 #endif /* not INSIDE_RECURSION */
5474
5475 #ifdef INSIDE_RECURSION
/* Forward declarations of static helpers whose definitions are not in
   this part of the file.  The names are wrapped in PREFIX so that the
   INSIDE_RECURSION pass can declare a separate set per character type
   (presumably byte_ and wcs_ variants, matching
   byte_re_match_2_internal / wcs_re_match_2_internal -- confirm against
   the definition of PREFIX).  Three of them take the pattern position P,
   the pattern END and the register info array; bcmp_translate takes two
   character strings, a length and a translate table.  */
5476 static boolean PREFIX(group_match_null_string_p) _RE_ARGS ((UCHAR_T **p,
5477 UCHAR_T *end,
5478 PREFIX(register_info_type) *reg_info));
5479 static boolean PREFIX(alt_match_null_string_p) _RE_ARGS ((UCHAR_T *p,
5480 UCHAR_T *end,
5481 PREFIX(register_info_type) *reg_info));
5482 static boolean PREFIX(common_op_match_null_string_p) _RE_ARGS ((UCHAR_T **p,
5483 UCHAR_T *end,
5484 PREFIX(register_info_type) *reg_info));
5485 static int PREFIX(bcmp_translate) _RE_ARGS ((const CHAR_T *s1, const CHAR_T *s2,
5486 int len, char *translate));
5487 #else /* not INSIDE_RECURSION */
5488
5489 /* re_match_2 matches the compiled pattern in BUFP against the
5490 (virtual) concatenation of STRING1 and STRING2 (of length SIZE1
5491 and SIZE2, respectively). We start matching at POS, and stop
5492 matching at STOP.
5493
5494 If REGS is non-null and the `no_sub' field of BUFP is zero, we
5495 store offsets for the substring each group matched in REGS. See the
5496 documentation for exactly how many groups we fill.
5497
5498 We return -1 if no match, -2 if an internal error (such as the
5499 failure stack overflowing). Otherwise, we return the length of the
5500 matched substring. */
5501
int
re_match_2 (bufp, string1, size1, string2, size2, pos, regs, stop)
     struct re_pattern_buffer *bufp;
     const char *string1, *string2;
     int size1, size2;
     int pos;
     struct re_registers *regs;
     int stop;
{
  int match_result;

# ifdef MBS_SUPPORT
  /* A locale whose MB_CUR_MAX is 1 uses the byte matcher.  Otherwise
     the wide-character matcher is called with NULL wide buffers, which
     makes it convert STRING1 and STRING2 itself.  */
  if (MB_CUR_MAX == 1)
    match_result = byte_re_match_2_internal (bufp, string1, size1,
					     string2, size2,
					     pos, regs, stop);
  else
    match_result = wcs_re_match_2_internal (bufp, string1, size1,
					    string2, size2,
					    pos, regs, stop,
					    NULL, 0, NULL, 0, NULL, NULL);
# else
  match_result = byte_re_match_2_internal (bufp, string1, size1,
					   string2, size2,
					   pos, regs, stop);
# endif

#if !defined REGEX_MALLOC && defined C_ALLOCA
  /* Force an alloca cleanup now that the internal matcher has
     returned.  */
  alloca (0);
#endif

  return match_result;
}
5529 #ifdef _LIBC
5530 weak_alias (__re_match_2, re_match_2)
5531 #endif
5532
5533 #endif /* not INSIDE_RECURSION */
5534
5535 #ifdef INSIDE_RECURSION
5536
5537 #ifdef WCHAR
5538 static int count_mbs_length PARAMS ((int *, int));
5539
/* Translate a byte offset within a multibyte string into a count of
   wide characters.

   OFFSET_BUFFER is the offset table that belongs to the multibyte
   string (see convert_mbs_to_wcs) and LENGTH is a byte count measured
   from the start of that string.  The search assumes the table is
   increasing and that OFFSET_BUFFER[i] >= i for every i.

   Return the index i for which OFFSET_BUFFER[i] == LENGTH, i.e. the
   number of wide characters occupied by the first LENGTH bytes.
   Return 0 if OFFSET_BUFFER is NULL.  Return -1 if LENGTH is negative
   or if no entry of the table equals LENGTH.

   NOTE(review): OFFSET_BUFFER[LENGTH] is read unconditionally, so the
   table needs at least LENGTH + 1 readable entries; confirm that the
   entries beyond the converted length hold defined values.  */

static int
count_mbs_length(offset_buffer, length)
     int *offset_buffer;
     int length;
{
  int lower, upper;

  /* Check whether the size is valid.  */
  if (length < 0)
    return -1;

  if (offset_buffer == NULL)
    return 0;

  /* If there are no multibyte characters, offset_buffer[i] == i.
     Optimize for this case.  */
  if (offset_buffer[length] == length)
    return length;

  /* Binary search in the open interval (lower, upper).  The upper bound
     can start at LENGTH because offset_buffer[i] >= i for all i.  */
  lower = 0;
  upper = length;

  while (upper - lower > 1)
    {
      /* Written as lower + half the gap so the midpoint cannot
	 overflow when LENGTH is above INT_MAX / 2.  */
      int middle = lower + (upper - lower) / 2;

      if (offset_buffer[middle] == length)
	return middle;

      if (offset_buffer[middle] > length)
	upper = middle;
      else
	lower = middle;
    }

  /* No entry equals LENGTH.  */
  return -1;
}
5583 #endif /* WCHAR */
5584
5585 /* This is a separate function so that we can force an alloca cleanup
5586 afterwards. */
5587 #ifdef WCHAR
5588 static int
5589 wcs_re_match_2_internal (bufp, cstring1, csize1, cstring2, csize2, pos,
5590 regs, stop, string1, size1, string2, size2,
5591 mbs_offset1, mbs_offset2)
5592 struct re_pattern_buffer *bufp;
5593 const char *cstring1, *cstring2;
5594 int csize1, csize2;
5595 int pos;
5596 struct re_registers *regs;
5597 int stop;
5598 /* string1 == string2 == NULL means string1/2, size1/2 and
5599 mbs_offset1/2 need setting up in this function. */
5600 /* We need wchar_t* buffers corresponding to cstring1, cstring2. */
5601 wchar_t *string1, *string2;
5602 /* We need the sizes of the wchar_t buffers corresponding to csize1, csize2. */
5603 int size1, size2;
5604 /* Offset buffers for optimization. See convert_mbs_to_wcs. */
5605 int *mbs_offset1, *mbs_offset2;
5606 #else /* BYTE */
5607 static int
5608 byte_re_match_2_internal (bufp, string1, size1,string2, size2, pos,
5609 regs, stop)
5610 struct re_pattern_buffer *bufp;
5611 const char *string1, *string2;
5612 int size1, size2;
5613 int pos;
5614 struct re_registers *regs;
5615 int stop;
5616 #endif /* BYTE */
5617 {
5618 /* General temporaries. */
5619 int mcnt;
5620 UCHAR_T *p1;
5621 #ifdef WCHAR
5622 /* They hold whether each wchar_t is binary data or not. */
5623 char *is_binary = NULL;
5624 /* If true, we can't free string1/2, mbs_offset1/2. */
5625 int cant_free_wcs_buf = 1;
5626 #endif /* WCHAR */
5627
5628 /* Just past the end of the corresponding string. */
5629 const CHAR_T *end1, *end2;
5630
5631 /* Pointers into string1 and string2, just past the last characters in
5632 each to consider matching. */
5633 const CHAR_T *end_match_1, *end_match_2;
5634
5635 /* Where we are in the data, and the end of the current string. */
5636 const CHAR_T *d, *dend;
5637
5638 /* Where we are in the pattern, and the end of the pattern. */
5639 #ifdef WCHAR
5640 UCHAR_T *pattern, *p;
5641 register UCHAR_T *pend;
5642 #else /* BYTE */
5643 UCHAR_T *p = bufp->buffer;
5644 register UCHAR_T *pend = p + bufp->used;
5645 #endif /* WCHAR */
5646
5647 /* Mark the opcode just after a start_memory, so we can test for an
5648 empty subpattern when we get to the stop_memory. */
5649 UCHAR_T *just_past_start_mem = 0;
5650
5651 /* We use this to map every character in the string. */
5652 RE_TRANSLATE_TYPE translate = bufp->translate;
5653
5654 /* Failure point stack. Each place that can handle a failure further
5655 down the line pushes a failure point on this stack. It consists of
5656 restart, regend, and reg_info for all registers corresponding to
5657 the subexpressions we're currently inside, plus the number of such
5658 registers, and, finally, two char *'s. The first char * is where
5659 to resume scanning the pattern; the second one is where to resume
5660 scanning the strings. If the latter is zero, the failure point is
5661 a ``dummy''; if a failure happens and the failure point is a dummy,
5662 it gets discarded and the next one is tried. */
5663 #ifdef MATCH_MAY_ALLOCATE /* otherwise, this is global. */
5664 PREFIX(fail_stack_type) fail_stack;
5665 #endif
5666 #ifdef DEBUG
5667 static unsigned failure_id;
5668 unsigned nfailure_points_pushed = 0, nfailure_points_popped = 0;
5669 #endif
5670
5671 #ifdef REL_ALLOC
5672 /* This holds the pointer to the failure stack, when
5673 it is allocated relocatably. */
5674 fail_stack_elt_t *failure_stack_ptr;
5675 #endif
5676
5677 /* We fill all the registers internally, independent of what we
5678 return, for use in backreferences. The number here includes
5679 an element for register zero. */
5680 size_t num_regs = bufp->re_nsub + 1;
5681
5682 /* The currently active registers. */
5683 active_reg_t lowest_active_reg = NO_LOWEST_ACTIVE_REG;
5684 active_reg_t highest_active_reg = NO_HIGHEST_ACTIVE_REG;
5685
5686 /* Information on the contents of registers. These are pointers into
5687 the input strings; they record just what was matched (on this
5688 attempt) by a subexpression part of the pattern, that is, the
5689 regnum-th regstart pointer points to where in the pattern we began
5690 matching and the regnum-th regend points to right after where we
5691 stopped matching the regnum-th subexpression. (The zeroth register
5692 keeps track of what the whole pattern matches.) */
5693 #ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
5694 const CHAR_T **regstart, **regend;
5695 #endif
5696
5697 /* If a group that's operated upon by a repetition operator fails to
5698 match anything, then the register for its start will need to be
5699 restored because it will have been set to wherever in the string we
5700 are when we last see its open-group operator. Similarly for a
5701 register's end. */
5702 #ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
5703 const CHAR_T **old_regstart, **old_regend;
5704 #endif
5705
5706 /* The is_active field of reg_info helps us keep track of which (possibly
5707 nested) subexpressions we are currently in. The matched_something
5708 field of reg_info[reg_num] helps us tell whether or not we have
5709 matched any of the pattern so far this time through the reg_num-th
5710 subexpression. These two fields get reset each time through any
5711 loop their register is in. */
5712 #ifdef MATCH_MAY_ALLOCATE /* otherwise, this is global. */
5713 PREFIX(register_info_type) *reg_info;
5714 #endif
5715
5716 /* The following record the register info as found in the above
5717 variables when we find a match better than any we've seen before.
5718 This happens as we backtrack through the failure points, which in
5719 turn happens only if we have not yet matched the entire string. */
5720 unsigned best_regs_set = false;
5721 #ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
5722 const CHAR_T **best_regstart, **best_regend;
5723 #endif
5724
5725 /* Logically, this is `best_regend[0]'. But we don't want to have to
5726 allocate space for that if we're not allocating space for anything
5727 else (see below). Also, we never need info about register 0 for
5728 any of the other register vectors, and it seems rather a kludge to
5729 treat `best_regend' differently than the rest. So we keep track of
5730 the end of the best match so far in a separate variable. We
5731 initialize this to NULL so that when we backtrack the first time
5732 and need to test it, it's not garbage. */
5733 const CHAR_T *match_end = NULL;
5734
5735 /* This helps SET_REGS_MATCHED avoid doing redundant work. */
5736 int set_regs_matched_done = 0;
5737
5738 /* Used when we pop values we don't care about. */
5739 #ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
5740 const CHAR_T **reg_dummy;
5741 PREFIX(register_info_type) *reg_info_dummy;
5742 #endif
5743
5744 #ifdef DEBUG
5745 /* Counts the total number of registers pushed. */
5746 unsigned num_regs_pushed = 0;
5747 #endif
5748
5749 DEBUG_PRINT1 ("\n\nEntering re_match_2.\n");
5750
5751 INIT_FAIL_STACK ();
5752
5753 #ifdef MATCH_MAY_ALLOCATE
5754 /* Do not bother to initialize all the register variables if there are
5755 no groups in the pattern, as it takes a fair amount of time. If
5756 there are groups, we include space for register 0 (the whole
5757 pattern), even though we never use it, since it simplifies the
5758 array indexing. We should fix this. */
5759 if (bufp->re_nsub)
5760 {
5761 regstart = REGEX_TALLOC (num_regs, const CHAR_T *);
5762 regend = REGEX_TALLOC (num_regs, const CHAR_T *);
5763 old_regstart = REGEX_TALLOC (num_regs, const CHAR_T *);
5764 old_regend = REGEX_TALLOC (num_regs, const CHAR_T *);
5765 best_regstart = REGEX_TALLOC (num_regs, const CHAR_T *);
5766 best_regend = REGEX_TALLOC (num_regs, const CHAR_T *);
5767 reg_info = REGEX_TALLOC (num_regs, PREFIX(register_info_type));
5768 reg_dummy = REGEX_TALLOC (num_regs, const CHAR_T *);
5769 reg_info_dummy = REGEX_TALLOC (num_regs, PREFIX(register_info_type));
5770
5771 if (!(regstart && regend && old_regstart && old_regend && reg_info
5772 && best_regstart && best_regend && reg_dummy && reg_info_dummy))
5773 {
5774 FREE_VARIABLES ();
5775 return -2;
5776 }
5777 }
5778 else
5779 {
5780 /* We must initialize all our variables to NULL, so that
5781 `FREE_VARIABLES' doesn't try to free them. */
5782 regstart = regend = old_regstart = old_regend = best_regstart
5783 = best_regend = reg_dummy = NULL;
5784 reg_info = reg_info_dummy = (PREFIX(register_info_type) *) NULL;
5785 }
5786 #endif /* MATCH_MAY_ALLOCATE */
5787
5788 /* The starting position is bogus. */
5789 #ifdef WCHAR
5790 if (pos < 0 || pos > csize1 + csize2)
5791 #else /* BYTE */
5792 if (pos < 0 || pos > size1 + size2)
5793 #endif
5794 {
5795 FREE_VARIABLES ();
5796 return -1;
5797 }
5798
5799 #ifdef WCHAR
5800 /* Allocate wchar_t array for string1 and string2 and
5801 fill them with converted string. */
5802 if (string1 == NULL && string2 == NULL)
5803 {
5804 /* We need to set up the buffers here. */
5805
5806 /* We must free wcs buffers in this function. */
5807 cant_free_wcs_buf = 0;
5808
5809 if (csize1 != 0)
5810 {
5811 string1 = REGEX_TALLOC (csize1 + 1, CHAR_T);
5812 mbs_offset1 = REGEX_TALLOC (csize1 + 1, int);
5813 is_binary = REGEX_TALLOC (csize1 + 1, char);
5814 if (!string1 || !mbs_offset1 || !is_binary)
5815 {
5816 FREE_VAR (string1);
5817 FREE_VAR (mbs_offset1);
5818 FREE_VAR (is_binary);
5819 return -2;
5820 }
5821 }
5822 if (csize2 != 0)
5823 {
5824 string2 = REGEX_TALLOC (csize2 + 1, CHAR_T);
5825 mbs_offset2 = REGEX_TALLOC (csize2 + 1, int);
5826 is_binary = REGEX_TALLOC (csize2 + 1, char);
5827 if (!string2 || !mbs_offset2 || !is_binary)
5828 {
5829 FREE_VAR (string1);
5830 FREE_VAR (mbs_offset1);
5831 FREE_VAR (string2);
5832 FREE_VAR (mbs_offset2);
5833 FREE_VAR (is_binary);
5834 return -2;
5835 }
5836 size2 = convert_mbs_to_wcs(string2, cstring2, csize2,
5837 mbs_offset2, is_binary);
5838 string2[size2] = L'\0'; /* for a sentinel */
5839 FREE_VAR (is_binary);
5840 }
5841 }
5842
5843 /* We need to cast pattern to (wchar_t*), because we casted this compiled
5844 pattern to (char*) in regex_compile. */
5845 p = pattern = (CHAR_T*)bufp->buffer;
5846 pend = (CHAR_T*)(bufp->buffer + bufp->used);
5847
5848 #endif /* WCHAR */
5849
5850 /* Initialize subexpression text positions to -1 to mark ones that no
5851 start_memory/stop_memory has been seen for. Also initialize the
5852 register information struct. */
5853 for (mcnt = 1; (unsigned) mcnt < num_regs; mcnt++)
5854 {
5855 regstart[mcnt] = regend[mcnt]
5856 = old_regstart[mcnt] = old_regend[mcnt] = REG_UNSET_VALUE;
5857
5858 REG_MATCH_NULL_STRING_P (reg_info[mcnt]) = MATCH_NULL_UNSET_VALUE;
5859 IS_ACTIVE (reg_info[mcnt]) = 0;
5860 MATCHED_SOMETHING (reg_info[mcnt]) = 0;
5861 EVER_MATCHED_SOMETHING (reg_info[mcnt]) = 0;
5862 }
5863
5864 /* We move `string1' into `string2' if the latter's empty -- but not if
5865 `string1' is null. */
5866 if (size2 == 0 && string1 != NULL)
5867 {
5868 string2 = string1;
5869 size2 = size1;
5870 string1 = 0;
5871 size1 = 0;
5872 #ifdef WCHAR
5873 mbs_offset2 = mbs_offset1;
5874 csize2 = csize1;
5875 mbs_offset1 = NULL;
5876 csize1 = 0;
5877 #endif
5878 }
5879 end1 = string1 + size1;
5880 end2 = string2 + size2;
5881
5882 /* Compute where to stop matching, within the two strings. */
5883 #ifdef WCHAR
5884 if (stop <= csize1)
5885 {
5886 mcnt = count_mbs_length(mbs_offset1, stop);
5887 end_match_1 = string1 + mcnt;
5888 end_match_2 = string2;
5889 }
5890 else
5891 {
5892 if (stop > csize1 + csize2)
5893 stop = csize1 + csize2;
5894 end_match_1 = end1;
5895 mcnt = count_mbs_length(mbs_offset2, stop-csize1);
5896 end_match_2 = string2 + mcnt;
5897 }
5898 if (mcnt < 0)
5899 { /* count_mbs_length return error. */
5900 FREE_VARIABLES ();
5901 return -1;
5902 }
5903 #else
5904 if (stop <= size1)
5905 {
5906 end_match_1 = string1 + stop;
5907 end_match_2 = string2;
5908 }
5909 else
5910 {
5911 end_match_1 = end1;
5912 end_match_2 = string2 + stop - size1;
5913 }
5914 #endif /* WCHAR */
5915
5916 /* `p' scans through the pattern as `d' scans through the data.
5917 `dend' is the end of the input string that `d' points within. `d'
5918 is advanced into the following input string whenever necessary, but
5919 this happens before fetching; therefore, at the beginning of the
5920 loop, `d' can be pointing at the end of a string, but it cannot
5921 equal `string2'. */
5922 #ifdef WCHAR
5923 if (size1 > 0 && pos <= csize1)
5924 {
5925 mcnt = count_mbs_length(mbs_offset1, pos);
5926 d = string1 + mcnt;
5927 dend = end_match_1;
5928 }
5929 else
5930 {
5931 mcnt = count_mbs_length(mbs_offset2, pos-csize1);
5932 d = string2 + mcnt;
5933 dend = end_match_2;
5934 }
5935
5936 if (mcnt < 0)
5937 { /* count_mbs_length return error. */
5938 FREE_VARIABLES ();
5939 return -1;
5940 }
5941 #else
5942 if (size1 > 0 && pos <= size1)
5943 {
5944 d = string1 + pos;
5945 dend = end_match_1;
5946 }
5947 else
5948 {
5949 d = string2 + pos - size1;
5950 dend = end_match_2;
5951 }
5952 #endif /* WCHAR */
5953
5954 DEBUG_PRINT1 ("The compiled pattern is:\n");
5955 DEBUG_PRINT_COMPILED_PATTERN (bufp, p, pend);
5956 DEBUG_PRINT1 ("The string to match is: `");
5957 DEBUG_PRINT_DOUBLE_STRING (d, string1, size1, string2, size2);
5958 DEBUG_PRINT1 ("'\n");
5959
5960 /* This loops over pattern commands. It exits by returning from the
5961 function if the match is complete, or it drops through if the match
5962 fails at this starting point in the input data. */
5963 for (;;)
5964 {
5965 #ifdef _LIBC
5966 DEBUG_PRINT2 ("\n%p: ", p);
5967 #else
5968 DEBUG_PRINT2 ("\n0x%x: ", p);
5969 #endif
5970
5971 if (p == pend)
5972 { /* End of pattern means we might have succeeded. */
5973 DEBUG_PRINT1 ("end of pattern ... ");
5974
5975 /* If we haven't matched the entire string, and we want the
5976 longest match, try backtracking. */
5977 if (d != end_match_2)
5978 {
5979 /* 1 if this match ends in the same string (string1 or string2)
5980 as the best previous match. */
5981 boolean same_str_p = (FIRST_STRING_P (match_end)
5982 == MATCHING_IN_FIRST_STRING);
5983 /* 1 if this match is the best seen so far. */
5984 boolean best_match_p;
5985
5986 /* AIX compiler got confused when this was combined
5987 with the previous declaration. */
5988 if (same_str_p)
5989 best_match_p = d > match_end;
5990 else
5991 best_match_p = !MATCHING_IN_FIRST_STRING;
5992
5993 DEBUG_PRINT1 ("backtracking.\n");
5994
5995 if (!FAIL_STACK_EMPTY ())
5996 { /* More failure points to try. */
5997
5998 /* If exceeds best match so far, save it. */
5999 if (!best_regs_set || best_match_p)
6000 {
6001 best_regs_set = true;
6002 match_end = d;
6003
6004 DEBUG_PRINT1 ("\nSAVING match as best so far.\n");
6005
6006 for (mcnt = 1; (unsigned) mcnt < num_regs; mcnt++)
6007 {
6008 best_regstart[mcnt] = regstart[mcnt];
6009 best_regend[mcnt] = regend[mcnt];
6010 }
6011 }
6012 goto fail;
6013 }
6014
6015 /* If no failure points, don't restore garbage. And if
6016 last match is real best match, don't restore second
6017 best one. */
6018 else if (best_regs_set && !best_match_p)
6019 {
6020 restore_best_regs:
6021 /* Restore best match. It may happen that `dend ==
6022 end_match_1' while the restored d is in string2.
6023 For example, the pattern `x.*y.*z' against the
6024 strings `x-' and `y-z-', if the two strings are
6025 not consecutive in memory. */
6026 DEBUG_PRINT1 ("Restoring best registers.\n");
6027
6028 d = match_end;
6029 dend = ((d >= string1 && d <= end1)
6030 ? end_match_1 : end_match_2);
6031
6032 for (mcnt = 1; (unsigned) mcnt < num_regs; mcnt++)
6033 {
6034 regstart[mcnt] = best_regstart[mcnt];
6035 regend[mcnt] = best_regend[mcnt];
6036 }
6037 }
6038 } /* d != end_match_2 */
6039
6040 succeed_label:
6041 DEBUG_PRINT1 ("Accepting match.\n");
6042 /* If caller wants register contents data back, do it. */
6043 if (regs && !bufp->no_sub)
6044 {
6045 /* Have the register data arrays been allocated? */
6046 if (bufp->regs_allocated == REGS_UNALLOCATED)
6047 { /* No. So allocate them with malloc. We need one
6048 extra element beyond `num_regs' for the `-1' marker
6049 GNU code uses. */
6050 regs->num_regs = MAX (RE_NREGS, num_regs + 1);
6051 regs->start = TALLOC (regs->num_regs, regoff_t);
6052 regs->end = TALLOC (regs->num_regs, regoff_t);
6053 if (regs->start == NULL || regs->end == NULL)
6054 {
6055 FREE_VARIABLES ();
6056 return -2;
6057 }
6058 bufp->regs_allocated = REGS_REALLOCATE;
6059 }
6060 else if (bufp->regs_allocated == REGS_REALLOCATE)
6061 { /* Yes. If we need more elements than were already
6062 allocated, reallocate them. If we need fewer, just
6063 leave it alone. */
6064 if (regs->num_regs < num_regs + 1)
6065 {
6066 regs->num_regs = num_regs + 1;
6067 RETALLOC (regs->start, regs->num_regs, regoff_t);
6068 RETALLOC (regs->end, regs->num_regs, regoff_t);
6069 if (regs->start == NULL || regs->end == NULL)
6070 {
6071 FREE_VARIABLES ();
6072 return -2;
6073 }
6074 }
6075 }
6076 else
6077 {
6078 /* These braces fend off a "empty body in an else-statement"
6079 warning under GCC when assert expands to nothing. */
6080 assert (bufp->regs_allocated == REGS_FIXED);
6081 }
6082
6083 /* Convert the pointer data in `regstart' and `regend' to
6084 indices. Register zero has to be set differently,
6085 since we haven't kept track of any info for it. */
6086 if (regs->num_regs > 0)
6087 {
6088 regs->start[0] = pos;
6089 #ifdef WCHAR
6090 if (MATCHING_IN_FIRST_STRING)
6091 regs->end[0] = mbs_offset1 != NULL ?
6092 mbs_offset1[d-string1] : 0;
6093 else
6094 regs->end[0] = csize1 + (mbs_offset2 != NULL ?
6095 mbs_offset2[d-string2] : 0);
6096 #else
6097 regs->end[0] = (MATCHING_IN_FIRST_STRING
6098 ? ((regoff_t) (d - string1))
6099 : ((regoff_t) (d - string2 + size1)));
6100 #endif /* WCHAR */
6101 }
6102
6103 /* Go through the first `min (num_regs, regs->num_regs)'
6104 registers, since that is all we initialized. */
6105 for (mcnt = 1; (unsigned) mcnt < MIN (num_regs, regs->num_regs);
6106 mcnt++)
6107 {
6108 if (REG_UNSET (regstart[mcnt]) || REG_UNSET (regend[mcnt]))
6109 regs->start[mcnt] = regs->end[mcnt] = -1;
6110 else
6111 {
6112 regs->start[mcnt]
6113 = (regoff_t) POINTER_TO_OFFSET (regstart[mcnt]);
6114 regs->end[mcnt]
6115 = (regoff_t) POINTER_TO_OFFSET (regend[mcnt]);
6116 }
6117 }
6118
6119 /* If the regs structure we return has more elements than
6120 were in the pattern, set the extra elements to -1. If
6121 we (re)allocated the registers, this is the case,
6122 because we always allocate enough to have at least one
6123 -1 at the end. */
6124 for (mcnt = num_regs; (unsigned) mcnt < regs->num_regs; mcnt++)
6125 regs->start[mcnt] = regs->end[mcnt] = -1;
6126 } /* regs && !bufp->no_sub */
6127
6128 DEBUG_PRINT4 ("%u failure points pushed, %u popped (%u remain).\n",
6129 nfailure_points_pushed, nfailure_points_popped,
6130 nfailure_points_pushed - nfailure_points_popped);
6131 DEBUG_PRINT2 ("%u registers pushed.\n", num_regs_pushed);
6132
6133 #ifdef WCHAR
6134 if (MATCHING_IN_FIRST_STRING)
6135 mcnt = mbs_offset1 != NULL ? mbs_offset1[d-string1] : 0;
6136 else
6137 mcnt = (mbs_offset2 != NULL ? mbs_offset2[d-string2] : 0) +
6138 csize1;
6139 mcnt -= pos;
6140 #else
6141 mcnt = d - pos - (MATCHING_IN_FIRST_STRING
6142 ? string1
6143 : string2 - size1);
6144 #endif /* WCHAR */
6145
6146 DEBUG_PRINT2 ("Returning %d from re_match_2.\n", mcnt);
6147
6148 FREE_VARIABLES ();
6149 return mcnt;
6150 }
6151
6152 /* Otherwise match next pattern command. */
6153 switch (SWITCH_ENUM_CAST ((re_opcode_t) *p++))
6154 {
6155 /* Ignore these. Used to ignore the n of succeed_n's which
6156 currently have n == 0. */
6157 case no_op:
6158 DEBUG_PRINT1 ("EXECUTING no_op.\n");
6159 break;
6160
6161 case succeed:
6162 DEBUG_PRINT1 ("EXECUTING succeed.\n");
6163 goto succeed_label;
6164
6165 /* Match the next n pattern characters exactly. The following
6166 byte in the pattern defines n, and the n bytes after that
6167 are the characters to match. */
6168 case exactn:
6169 #ifdef MBS_SUPPORT
6170 case exactn_bin:
6171 #endif
6172 mcnt = *p++;
6173 DEBUG_PRINT2 ("EXECUTING exactn %d.\n", mcnt);
6174
6175 /* This is written out as an if-else so we don't waste time
6176 testing `translate' inside the loop. */
6177 if (translate)
6178 {
6179 do
6180 {
6181 PREFETCH ();
6182 #ifdef WCHAR
6183 if (*d <= 0xff)
6184 {
6185 if ((UCHAR_T) translate[(unsigned char) *d++]
6186 != (UCHAR_T) *p++)
6187 goto fail;
6188 }
6189 else
6190 {
6191 if (*d++ != (CHAR_T) *p++)
6192 goto fail;
6193 }
6194 #else
6195 if ((UCHAR_T) translate[(unsigned char) *d++]
6196 != (UCHAR_T) *p++)
6197 goto fail;
6198 #endif /* WCHAR */
6199 }
6200 while (--mcnt);
6201 }
6202 else
6203 {
6204 do
6205 {
6206 PREFETCH ();
6207 if (*d++ != (CHAR_T) *p++) goto fail;
6208 }
6209 while (--mcnt);
6210 }
6211 SET_REGS_MATCHED ();
6212 break;
6213
6214
6215 /* Match any character except possibly a newline or a null. */
6216 case anychar:
6217 DEBUG_PRINT1 ("EXECUTING anychar.\n");
6218
6219 PREFETCH ();
6220
6221 if ((!(bufp->syntax & RE_DOT_NEWLINE) && TRANSLATE (*d) == '\n')
6222 || (bufp->syntax & RE_DOT_NOT_NULL && TRANSLATE (*d) == '\000'))
6223 goto fail;
6224
6225 SET_REGS_MATCHED ();
6226 DEBUG_PRINT2 (" Matched `%ld'.\n", (long int) *d);
6227 d++;
6228 break;
6229
6230
6231 case charset:
6232 case charset_not:
6233 {
6234 register UCHAR_T c;
6235 #ifdef WCHAR
6236 unsigned int i, char_class_length, coll_symbol_length,
6237 equiv_class_length, ranges_length, chars_length, length;
6238 CHAR_T *workp, *workp2, *charset_top;
6239 #define WORK_BUFFER_SIZE 128
6240 CHAR_T str_buf[WORK_BUFFER_SIZE];
6241 # ifdef _LIBC
6242 uint32_t nrules;
6243 # endif /* _LIBC */
6244 #endif /* WCHAR */
6245 boolean not = (re_opcode_t) *(p - 1) == charset_not;
6246
6247 DEBUG_PRINT2 ("EXECUTING charset%s.\n", not ? "_not" : "");
6248 PREFETCH ();
6249 c = TRANSLATE (*d); /* The character to match. */
6250 #ifdef WCHAR
6251 # ifdef _LIBC
6252 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
6253 # endif /* _LIBC */
6254 charset_top = p - 1;
6255 char_class_length = *p++;
6256 coll_symbol_length = *p++;
6257 equiv_class_length = *p++;
6258 ranges_length = *p++;
6259 chars_length = *p++;
6260 /* p now points to charset[6], so the address of the next instruction
6261 (charset[6+l+m+n+2*o+p']) equals p[l+m+n+2*o+p'],
6262 where l=length of char_classes, m=length of collating_symbol,
6263 n=length of equivalence_class, o=length of char_range,
6264 p'=length of chars. */
6265 workp = p;
6266 /* Update p to indicate the next instruction. */
6267 p += char_class_length + coll_symbol_length+ equiv_class_length +
6268 2*ranges_length + chars_length;
6269
6270 /* match with char_class? */
6271 for (i = 0; i < char_class_length ; i += CHAR_CLASS_SIZE)
6272 {
6273 wctype_t wctype;
6274 uintptr_t alignedp = ((uintptr_t)workp
6275 + __alignof__(wctype_t) - 1)
6276 & ~(uintptr_t)(__alignof__(wctype_t) - 1);
6277 wctype = *((wctype_t*)alignedp);
6278 workp += CHAR_CLASS_SIZE;
6279 if (iswctype((wint_t)c, wctype))
6280 goto char_set_matched;
6281 }
6282
6283 /* match with collating_symbol? */
6284 # ifdef _LIBC
6285 if (nrules != 0)
6286 {
6287 const unsigned char *extra = (const unsigned char *)
6288 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
6289
6290 for (workp2 = workp + coll_symbol_length ; workp < workp2 ;
6291 workp++)
6292 {
6293 int32_t *wextra;
6294 wextra = (int32_t*)(extra + *workp++);
6295 for (i = 0; i < *wextra; ++i)
6296 if (TRANSLATE(d[i]) != wextra[1 + i])
6297 break;
6298
6299 if (i == *wextra)
6300 {
6301 /* Update d, however d will be incremented at
6302 char_set_matched:, we decrement d here. */
6303 d += i - 1;
6304 goto char_set_matched;
6305 }
6306 }
6307 }
6308 else /* (nrules == 0) */
6309 # endif
6310 /* If we can't look up collation data, we use wcscoll
6311 instead. */
6312 {
6313 for (workp2 = workp + coll_symbol_length ; workp < workp2 ;)
6314 {
6315 const CHAR_T *backup_d = d, *backup_dend = dend;
6316 length = wcslen(workp);
6317
6318 /* If wcscoll(the collating symbol, whole string) > 0,
6319 no substring of the string can ever match the
6320 collating symbol. */
6321 if (wcscoll(workp, d) > 0)
6322 {
6323 workp += length + 1;
6324 continue;
6325 }
6326
6327 /* First, we compare the collating symbol with
6328 the first character of the string.
6329 If it doesn't match, we add the next character to
6330 the compare buffer in turn. */
6331 for (i = 0 ; i < WORK_BUFFER_SIZE-1 ; i++, d++)
6332 {
6333 int match;
6334 if (d == dend)
6335 {
6336 if (dend == end_match_2)
6337 break;
6338 d = string2;
6339 dend = end_match_2;
6340 }
6341
6342 /* add next character to the compare buffer. */
6343 str_buf[i] = TRANSLATE(*d);
6344 str_buf[i+1] = '\0';
6345
6346 match = wcscoll(workp, str_buf);
6347 if (match == 0)
6348 goto char_set_matched;
6349
6350 if (match < 0)
6351 /* (str_buf > workp) implies (str_buf + X > workp),
6352 because for all X (str_buf + X > str_buf).
6353 So we don't need to continue this loop. */
6354 break;
6355
6356 /* Otherwise (str_buf < workp),
6357 (str_buf+next_character) may equal (workp).
6358 So we continue this loop. */
6359 }
6360 /* not matched */
6361 d = backup_d;
6362 dend = backup_dend;
6363 workp += length + 1;
6364 }
6365 }
6366 /* match with equivalence_class? */
6367 # ifdef _LIBC
6368 if (nrules != 0)
6369 {
6370 const CHAR_T *backup_d = d, *backup_dend = dend;
6371 /* Try to match the equivalence class against
6372 those known to the collate implementation. */
6373 const int32_t *table;
6374 const int32_t *weights;
6375 const int32_t *extra;
6376 const int32_t *indirect;
6377 int32_t idx, idx2;
6378 wint_t *cp;
6379 size_t len;
6380
6381 /* This #include defines a local function! */
6382 # include <locale/weightwc.h>
6383
6384 table = (const int32_t *)
6385 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEWC);
6386 weights = (const wint_t *)
6387 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_WEIGHTWC);
6388 extra = (const wint_t *)
6389 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAWC);
6390 indirect = (const int32_t *)
6391 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTWC);
6392
6393 /* Write 1 collating element to str_buf, and
6394 get its index. */
6395 idx2 = 0;
6396
6397 for (i = 0 ; idx2 == 0 && i < WORK_BUFFER_SIZE - 1; i++)
6398 {
6399 cp = (wint_t*)str_buf;
6400 if (d == dend)
6401 {
6402 if (dend == end_match_2)
6403 break;
6404 d = string2;
6405 dend = end_match_2;
6406 }
6407 str_buf[i] = TRANSLATE(*(d+i));
6408 str_buf[i+1] = '\0'; /* sentinel */
6409 idx2 = findidx ((const wint_t**)&cp);
6410 }
6411
6412 /* Advance d past the collating element just read. Because d is
6413 incremented once more after char_set_matched:, stop one short here. */
6414 d = backup_d + ((wchar_t*)cp - (wchar_t*)str_buf - 1);
6415 if (d >= dend)
6416 {
6417 if (dend == end_match_2)
6418 d = dend;
6419 else
6420 {
6421 d = string2;
6422 dend = end_match_2;
6423 }
6424 }
6425
6426 len = weights[idx2];
6427
6428 for (workp2 = workp + equiv_class_length ; workp < workp2 ;
6429 workp++)
6430 {
6431 idx = (int32_t)*workp;
6432 /* We already checked idx != 0 in regex_compile. */
6433
6434 if (idx2 != 0 && len == weights[idx])
6435 {
6436 int cnt = 0;
6437 while (cnt < len && (weights[idx + 1 + cnt]
6438 == weights[idx2 + 1 + cnt]))
6439 ++cnt;
6440
6441 if (cnt == len)
6442 goto char_set_matched;
6443 }
6444 }
6445 /* not matched */
6446 d = backup_d;
6447 dend = backup_dend;
6448 }
6449 else /* (nrules == 0) */
6450 # endif
6451 /* If we can't look up collation data, we use wcscoll
6452 instead. */
6453 {
6454 for (workp2 = workp + equiv_class_length ; workp < workp2 ;)
6455 {
6456 const CHAR_T *backup_d = d, *backup_dend = dend;
6457 length = wcslen(workp);
6458
6459 /* If wcscoll(the equivalence class, whole string) > 0,
6460 no substring of the string can ever match the
6461 equivalence class. */
6462 if (wcscoll(workp, d) > 0)
6463 {
6464 workp += length + 1;
6465 break;
6466 }
6467
6468 /* First, we compare the equivalence class with
6469 the first character of the string.
6470 If it doesn't match, we add the next character to
6471 the compare buffer in turn. */
6472 for (i = 0 ; i < WORK_BUFFER_SIZE - 1 ; i++, d++)
6473 {
6474 int match;
6475 if (d == dend)
6476 {
6477 if (dend == end_match_2)
6478 break;
6479 d = string2;
6480 dend = end_match_2;
6481 }
6482
6483 /* add next character to the compare buffer. */
6484 str_buf[i] = TRANSLATE(*d);
6485 str_buf[i+1] = '\0';
6486
6487 match = wcscoll(workp, str_buf);
6488
6489 if (match == 0)
6490 goto char_set_matched;
6491
6492 if (match < 0)
6493 /* (str_buf > workp) implies (str_buf + X > workp),
6494 because for all X (str_buf + X > str_buf).
6495 So we don't need to continue this loop. */
6496 break;
6497
6498 /* Otherwise (str_buf < workp),
6499 (str_buf+next_character) may equal (workp).
6500 So we continue this loop. */
6501 }
6502 /* not matched */
6503 d = backup_d;
6504 dend = backup_dend;
6505 workp += length + 1;
6506 }
6507 }
6508
6509 /* match with char_range? */
6510 #ifdef _LIBC
6511 if (nrules != 0)
6512 {
6513 uint32_t collseqval;
6514 const char *collseq = (const char *)
6515 _NL_CURRENT(LC_COLLATE, _NL_COLLATE_COLLSEQWC);
6516
6517 collseqval = collseq_table_lookup (collseq, c);
6518
6519 for (; workp < p - chars_length ;)
6520 {
6521 uint32_t start_val, end_val;
6522
6523 /* We already compute the collation sequence value
6524 of the characters (or collating symbols). */
6525 start_val = (uint32_t) *workp++; /* range_start */
6526 end_val = (uint32_t) *workp++; /* range_end */
6527
6528 if (start_val <= collseqval && collseqval <= end_val)
6529 goto char_set_matched;
6530 }
6531 }
6532 else
6533 #endif
6534 {
6535 /* We set range_start_char at str_buf[0], range_end_char
6536 at str_buf[4], and compared char at str_buf[2]. */
6537 str_buf[1] = 0;
6538 str_buf[2] = c;
6539 str_buf[3] = 0;
6540 str_buf[5] = 0;
6541 for (; workp < p - chars_length ;)
6542 {
6543 wchar_t *range_start_char, *range_end_char;
6544
6545 /* match if (range_start_char <= c <= range_end_char). */
6546
6547 /* If range_start(or end) < 0, we assume -range_start(end)
6548 is the offset of the collating symbol which is specified
6549 as the character of the range start(end). */
6550
6551 /* range_start */
6552 if (*workp < 0)
6553 range_start_char = charset_top - (*workp++);
6554 else
6555 {
6556 str_buf[0] = *workp++;
6557 range_start_char = str_buf;
6558 }
6559
6560 /* range_end */
6561 if (*workp < 0)
6562 range_end_char = charset_top - (*workp++);
6563 else
6564 {
6565 str_buf[4] = *workp++;
6566 range_end_char = str_buf + 4;
6567 }
6568
6569 if (wcscoll(range_start_char, str_buf+2) <= 0 &&
6570 wcscoll(str_buf+2, range_end_char) <= 0)
6571
6572 goto char_set_matched;
6573 }
6574 }
6575
6576 /* match with char? */
6577 for (; workp < p ; workp++)
6578 if (c == *workp)
6579 goto char_set_matched;
6580
6581 not = !not;
6582
6583 char_set_matched:
6584 if (not) goto fail;
6585 #else
6586 /* Cast to `unsigned' instead of `unsigned char' in case the
6587 bit list is a full 32 bytes long. */
6588 if (c < (unsigned) (*p * BYTEWIDTH)
6589 && p[1 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH)))
6590 not = !not;
6591
6592 p += 1 + *p;
6593
6594 if (!not) goto fail;
6595 #undef WORK_BUFFER_SIZE
6596 #endif /* WCHAR */
6597 SET_REGS_MATCHED ();
6598 d++;
6599 break;
6600 }
6601
6602
6603 /* The beginning of a group is represented by start_memory.
6604 The arguments are the register number in the next byte, and the
6605 number of groups inner to this one in the next. The text
6606 matched within the group is recorded (in the internal
6607 registers data structure) under the register number. */
6608 case start_memory:
6609 DEBUG_PRINT3 ("EXECUTING start_memory %ld (%ld):\n",
6610 (long int) *p, (long int) p[1]);
6611
6612 /* Find out if this group can match the empty string. */
6613 p1 = p; /* To send to group_match_null_string_p. */
6614
6615 if (REG_MATCH_NULL_STRING_P (reg_info[*p]) == MATCH_NULL_UNSET_VALUE)
6616 REG_MATCH_NULL_STRING_P (reg_info[*p])
6617 = PREFIX(group_match_null_string_p) (&p1, pend, reg_info);
6618
6619 /* Save the position in the string where we were the last time
6620 we were at this open-group operator in case the group is
6621 operated upon by a repetition operator, e.g., with `(a*)*b'
6622 against `ab'; then we want to ignore where we are now in
6623 the string in case this attempt to match fails. */
6624 old_regstart[*p] = REG_MATCH_NULL_STRING_P (reg_info[*p])
6625 ? REG_UNSET (regstart[*p]) ? d : regstart[*p]
6626 : regstart[*p];
6627 DEBUG_PRINT2 (" old_regstart: %d\n",
6628 POINTER_TO_OFFSET (old_regstart[*p]));
6629
6630 regstart[*p] = d;
6631 DEBUG_PRINT2 (" regstart: %d\n", POINTER_TO_OFFSET (regstart[*p]));
6632
6633 IS_ACTIVE (reg_info[*p]) = 1;
6634 MATCHED_SOMETHING (reg_info[*p]) = 0;
6635
6636 /* Clear this whenever we change the register activity status. */
6637 set_regs_matched_done = 0;
6638
6639 /* This is the new highest active register. */
6640 highest_active_reg = *p;
6641
6642 /* If nothing was active before, this is the new lowest active
6643 register. */
6644 if (lowest_active_reg == NO_LOWEST_ACTIVE_REG)
6645 lowest_active_reg = *p;
6646
6647 /* Move past the register number and inner group count. */
6648 p += 2;
6649 just_past_start_mem = p;
6650
6651 break;
6652
6653
6654 /* The stop_memory opcode represents the end of a group. Its
6655 arguments are the same as start_memory's: the register
6656 number, and the number of inner groups. */
6657 case stop_memory:
6658 DEBUG_PRINT3 ("EXECUTING stop_memory %ld (%ld):\n",
6659 (long int) *p, (long int) p[1]);
6660
6661 /* We need to save the string position the last time we were at
6662 this close-group operator in case the group is operated
6663 upon by a repetition operator, e.g., with `((a*)*(b*)*)*'
6664 against `aba'; then we want to ignore where we are now in
6665 the string in case this attempt to match fails. */
6666 old_regend[*p] = REG_MATCH_NULL_STRING_P (reg_info[*p])
6667 ? REG_UNSET (regend[*p]) ? d : regend[*p]
6668 : regend[*p];
6669 DEBUG_PRINT2 (" old_regend: %d\n",
6670 POINTER_TO_OFFSET (old_regend[*p]));
6671
6672 regend[*p] = d;
6673 DEBUG_PRINT2 (" regend: %d\n", POINTER_TO_OFFSET (regend[*p]));
6674
6675 /* This register isn't active anymore. */
6676 IS_ACTIVE (reg_info[*p]) = 0;
6677
6678 /* Clear this whenever we change the register activity status. */
6679 set_regs_matched_done = 0;
6680
6681 /* If this was the only register active, nothing is active
6682 anymore. */
6683 if (lowest_active_reg == highest_active_reg)
6684 {
6685 lowest_active_reg = NO_LOWEST_ACTIVE_REG;
6686 highest_active_reg = NO_HIGHEST_ACTIVE_REG;
6687 }
6688 else
6689 { /* We must scan for the new highest active register, since
6690 it isn't necessarily one less than now: consider
6691 (a(b)c(d(e)f)g). When group 3 ends, after the f), the
6692 new highest active register is 1. */
6693 UCHAR_T r = *p - 1;
6694 while (r > 0 && !IS_ACTIVE (reg_info[r]))
6695 r--;
6696
6697 /* If we end up at register zero, that means that we saved
6698 the registers as the result of an `on_failure_jump', not
6699 a `start_memory', and we jumped to past the innermost
6700 `stop_memory'. For example, in ((.)*) we save
6701 registers 1 and 2 as a result of the *, but when we pop
6702 back to the second ), we are at the stop_memory 1.
6703 Thus, nothing is active. */
6704 if (r == 0)
6705 {
6706 lowest_active_reg = NO_LOWEST_ACTIVE_REG;
6707 highest_active_reg = NO_HIGHEST_ACTIVE_REG;
6708 }
6709 else
6710 highest_active_reg = r;
6711 }
6712
6713 /* If just failed to match something this time around with a
6714 group that's operated on by a repetition operator, try to
6715 force exit from the ``loop'', and restore the register
6716 information for this group that we had before trying this
6717 last match. */
6718 if ((!MATCHED_SOMETHING (reg_info[*p])
6719 || just_past_start_mem == p - 1)
6720 && (p + 2) < pend)
6721 {
6722 boolean is_a_jump_n = false;
6723
6724 p1 = p + 2;
6725 mcnt = 0;
6726 switch ((re_opcode_t) *p1++)
6727 {
6728 case jump_n:
6729 is_a_jump_n = true;
6730 case pop_failure_jump:
6731 case maybe_pop_jump:
6732 case jump:
6733 case dummy_failure_jump:
6734 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
6735 if (is_a_jump_n)
6736 p1 += OFFSET_ADDRESS_SIZE;
6737 break;
6738
6739 default:
6740 /* do nothing */ ;
6741 }
6742 p1 += mcnt;
6743
6744 /* If the next operation is a jump backwards in the pattern
6745 to an on_failure_jump right before the start_memory
6746 corresponding to this stop_memory, exit from the loop
6747 by forcing a failure after pushing on the stack the
6748 on_failure_jump's jump in the pattern, and d. */
6749 if (mcnt < 0 && (re_opcode_t) *p1 == on_failure_jump
6750 && (re_opcode_t) p1[1+OFFSET_ADDRESS_SIZE] == start_memory
6751 && p1[2+OFFSET_ADDRESS_SIZE] == *p)
6752 {
6753 /* If this group ever matched anything, then restore
6754 what its registers were before trying this last
6755 failed match, e.g., with `(a*)*b' against `ab' for
6756 regstart[1], and, e.g., with `((a*)*(b*)*)*'
6757 against `aba' for regend[3].
6758
6759 Also restore the registers for inner groups for,
6760 e.g., `((a*)(b*))*' against `aba' (register 3 would
6761 otherwise get trashed). */
6762
6763 if (EVER_MATCHED_SOMETHING (reg_info[*p]))
6764 {
6765 unsigned r;
6766
6767 EVER_MATCHED_SOMETHING (reg_info[*p]) = 0;
6768
6769 /* Restore this and inner groups' (if any) registers. */
6770 for (r = *p; r < (unsigned) *p + (unsigned) *(p + 1);
6771 r++)
6772 {
6773 regstart[r] = old_regstart[r];
6774
6775 /* xx why this test? */
6776 if (old_regend[r] >= regstart[r])
6777 regend[r] = old_regend[r];
6778 }
6779 }
6780 p1++;
6781 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
6782 PUSH_FAILURE_POINT (p1 + mcnt, d, -2);
6783
6784 goto fail;
6785 }
6786 }
6787
6788 /* Move past the register number and the inner group count. */
6789 p += 2;
6790 break;
6791
6792
6793 /* \<digit> has been turned into a `duplicate' command which is
6794 followed by the numeric value of <digit> as the register number. */
6795 case duplicate:
6796 {
6797 register const CHAR_T *d2, *dend2;
6798 int regno = *p++; /* Get which register to match against. */
6799 DEBUG_PRINT2 ("EXECUTING duplicate %d.\n", regno);
6800
6801 /* Can't back reference a group which we've never matched. */
6802 if (REG_UNSET (regstart[regno]) || REG_UNSET (regend[regno]))
6803 goto fail;
6804
6805 /* Where in input to try to start matching. */
6806 d2 = regstart[regno];
6807
6808 /* Where to stop matching; if both the place to start and
6809 the place to stop matching are in the same string, then
6810 set to the place to stop, otherwise, for now have to use
6811 the end of the first string. */
6812
6813 dend2 = ((FIRST_STRING_P (regstart[regno])
6814 == FIRST_STRING_P (regend[regno]))
6815 ? regend[regno] : end_match_1);
6816 for (;;)
6817 {
6818 /* If necessary, advance to next segment in register
6819 contents. */
6820 while (d2 == dend2)
6821 {
6822 if (dend2 == end_match_2) break;
6823 if (dend2 == regend[regno]) break;
6824
6825 /* End of string1 => advance to string2. */
6826 d2 = string2;
6827 dend2 = regend[regno];
6828 }
6829 /* At end of register contents => success */
6830 if (d2 == dend2) break;
6831
6832 /* If necessary, advance to next segment in data. */
6833 PREFETCH ();
6834
6835 /* How many characters left in this segment to match. */
6836 mcnt = dend - d;
6837
6838 /* Want how many consecutive characters we can match in
6839 one shot, so, if necessary, adjust the count. */
6840 if (mcnt > dend2 - d2)
6841 mcnt = dend2 - d2;
6842
6843 /* Compare that many; failure if mismatch, else move
6844 past them. */
6845 if (translate
6846 ? PREFIX(bcmp_translate) (d, d2, mcnt, translate)
6847 : memcmp (d, d2, mcnt*sizeof(UCHAR_T)))
6848 goto fail;
6849 d += mcnt, d2 += mcnt;
6850
6851 /* Do this because we've matched some characters. */
6852 SET_REGS_MATCHED ();
6853 }
6854 }
6855 break;
6856
6857
6858 /* begline matches the empty string at the beginning of the string
6859 (unless `not_bol' is set in `bufp'), and, if
6860 `newline_anchor' is set, after newlines. */
6861 case begline:
6862 DEBUG_PRINT1 ("EXECUTING begline.\n");
6863
6864 if (AT_STRINGS_BEG (d))
6865 {
6866 if (!bufp->not_bol) break;
6867 }
6868 else if (d[-1] == '\n' && bufp->newline_anchor)
6869 {
6870 break;
6871 }
6872 /* In all other cases, we fail. */
6873 goto fail;
6874
6875
6876 /* endline is the dual of begline. */
6877 case endline:
6878 DEBUG_PRINT1 ("EXECUTING endline.\n");
6879
6880 if (AT_STRINGS_END (d))
6881 {
6882 if (!bufp->not_eol) break;
6883 }
6884
6885 /* We have to ``prefetch'' the next character. */
6886 else if ((d == end1 ? *string2 : *d) == '\n'
6887 && bufp->newline_anchor)
6888 {
6889 break;
6890 }
6891 goto fail;
6892
6893
6894 /* Match at the very beginning of the data. */
6895 case begbuf:
6896 DEBUG_PRINT1 ("EXECUTING begbuf.\n");
6897 if (AT_STRINGS_BEG (d))
6898 break;
6899 goto fail;
6900
6901
6902 /* Match at the very end of the data. */
6903 case endbuf:
6904 DEBUG_PRINT1 ("EXECUTING endbuf.\n");
6905 if (AT_STRINGS_END (d))
6906 break;
6907 goto fail;
6908
6909
6910 /* on_failure_keep_string_jump is used to optimize `.*\n'. It
6911 pushes NULL as the value for the string on the stack. Then
6912 `pop_failure_point' will keep the current value for the
6913 string, instead of restoring it. To see why, consider
6914 matching `foo\nbar' against `.*\n'. The .* matches the foo;
6915 then the . fails against the \n. But the next thing we want
6916 to do is match the \n against the \n; if we restored the
6917 string value, we would be back at the foo.
6918
6919 Because this is used only in specific cases, we don't need to
6920 check all the things that `on_failure_jump' does, to make
6921 sure the right things get saved on the stack. Hence we don't
6922 share its code. The only reason to push anything on the
6923 stack at all is that otherwise we would have to change
6924 `anychar's code to do something besides goto fail in this
6925 case; that seems worse than this. */
6926 case on_failure_keep_string_jump:
6927 DEBUG_PRINT1 ("EXECUTING on_failure_keep_string_jump");
6928
6929 EXTRACT_NUMBER_AND_INCR (mcnt, p);
6930 #ifdef _LIBC
6931 DEBUG_PRINT3 (" %d (to %p):\n", mcnt, p + mcnt);
6932 #else
6933 DEBUG_PRINT3 (" %d (to 0x%x):\n", mcnt, p + mcnt);
6934 #endif
6935
6936 PUSH_FAILURE_POINT (p + mcnt, NULL, -2);
6937 break;
6938
6939
6940 /* Uses of on_failure_jump:
6941
6942 Each alternative starts with an on_failure_jump that points
6943 to the beginning of the next alternative. Each alternative
6944 except the last ends with a jump that in effect jumps past
6945 the rest of the alternatives. (They really jump to the
6946 ending jump of the following alternative, because tensioning
6947 these jumps is a hassle.)
6948
6949 Repeats start with an on_failure_jump that points past both
6950 the repetition text and either the following jump or
6951 pop_failure_jump back to this on_failure_jump. */
6952 case on_failure_jump:
6953 on_failure:
6954 DEBUG_PRINT1 ("EXECUTING on_failure_jump");
6955
6956 EXTRACT_NUMBER_AND_INCR (mcnt, p);
6957 #ifdef _LIBC
6958 DEBUG_PRINT3 (" %d (to %p)", mcnt, p + mcnt);
6959 #else
6960 DEBUG_PRINT3 (" %d (to 0x%x)", mcnt, p + mcnt);
6961 #endif
6962
6963 /* If this on_failure_jump comes right before a group (i.e.,
6964 the original * applied to a group), save the information
6965 for that group and all inner ones, so that if we fail back
6966 to this point, the group's information will be correct.
6967 For example, in \(a*\)*\1, we need the preceding group,
6968 and in \(zz\(a*\)b*\)\2, we need the inner group. */
6969
6970 /* We can't use `p' to check ahead because we push
6971 a failure point to `p + mcnt' after we do this. */
6972 p1 = p;
6973
6974 /* We need to skip no_op's before we look for the
6975 start_memory in case this on_failure_jump is happening as
6976 the result of a completed succeed_n, as in \(a\)\{1,3\}b\1
6977 against aba. */
6978 while (p1 < pend && (re_opcode_t) *p1 == no_op)
6979 p1++;
6980
6981 if (p1 < pend && (re_opcode_t) *p1 == start_memory)
6982 {
6983 /* We have a new highest active register now. This will
6984 get reset at the start_memory we are about to get to,
6985 but we will have saved all the registers relevant to
6986 this repetition op, as described above. */
6987 highest_active_reg = *(p1 + 1) + *(p1 + 2);
6988 if (lowest_active_reg == NO_LOWEST_ACTIVE_REG)
6989 lowest_active_reg = *(p1 + 1);
6990 }
6991
6992 DEBUG_PRINT1 (":\n");
6993 PUSH_FAILURE_POINT (p + mcnt, d, -2);
6994 break;
6995
6996
6997 /* A smart repeat ends with `maybe_pop_jump'.
6998 We change it to either `pop_failure_jump' or `jump'. */
6999 case maybe_pop_jump:
7000 EXTRACT_NUMBER_AND_INCR (mcnt, p);
7001 DEBUG_PRINT2 ("EXECUTING maybe_pop_jump %d.\n", mcnt);
7002 {
7003 register UCHAR_T *p2 = p;
7004
7005 /* Compare the beginning of the repeat with what in the
7006 pattern follows its end. If we can establish that there
7007 is nothing that they would both match, i.e., that we
7008 would have to backtrack because of (as in, e.g., `a*a')
7009 then we can change to pop_failure_jump, because we'll
7010 never have to backtrack.
7011
7012 This is not true in the case of alternatives: in
7013 `(a|ab)*' we do need to backtrack to the `ab' alternative
7014 (e.g., if the string was `ab'). But instead of trying to
7015 detect that here, the alternative has put on a dummy
7016 failure point which is what we will end up popping. */
7017
7018 /* Skip over open/close-group commands.
7019 If what follows this loop is a ...+ construct,
7020 look at what begins its body, since we will have to
7021 match at least one of that. */
7022 while (1)
7023 {
7024 if (p2 + 2 < pend
7025 && ((re_opcode_t) *p2 == stop_memory
7026 || (re_opcode_t) *p2 == start_memory))
7027 p2 += 3;
7028 else if (p2 + 2 + 2 * OFFSET_ADDRESS_SIZE < pend
7029 && (re_opcode_t) *p2 == dummy_failure_jump)
7030 p2 += 2 + 2 * OFFSET_ADDRESS_SIZE;
7031 else
7032 break;
7033 }
7034
7035 p1 = p + mcnt;
7036 /* p1[0] ... p1[2] are the `on_failure_jump' corresponding
7037 to the `maybe_pop_jump' of this case. Examine what
7038 follows. */
7039
7040 /* If we're at the end of the pattern, we can change. */
7041 if (p2 == pend)
7042 {
7043 /* Consider what happens when matching ":\(.*\)"
7044 against ":/". I don't really understand this code
7045 yet. */
7046 p[-(1+OFFSET_ADDRESS_SIZE)] = (UCHAR_T)
7047 pop_failure_jump;
7048 DEBUG_PRINT1
7049 (" End of pattern: change to `pop_failure_jump'.\n");
7050 }
7051
7052 else if ((re_opcode_t) *p2 == exactn
7053 #ifdef MBS_SUPPORT
7054 || (re_opcode_t) *p2 == exactn_bin
7055 #endif
7056 || (bufp->newline_anchor && (re_opcode_t) *p2 == endline))
7057 {
7058 register UCHAR_T c
7059 = *p2 == (UCHAR_T) endline ? '\n' : p2[2];
7060
7061 if (((re_opcode_t) p1[1+OFFSET_ADDRESS_SIZE] == exactn
7062 #ifdef MBS_SUPPORT
7063 || (re_opcode_t) p1[1+OFFSET_ADDRESS_SIZE] == exactn_bin
7064 #endif
7065 ) && p1[3+OFFSET_ADDRESS_SIZE] != c)
7066 {
7067 p[-(1+OFFSET_ADDRESS_SIZE)] = (UCHAR_T)
7068 pop_failure_jump;
7069 #ifdef WCHAR
7070 DEBUG_PRINT3 (" %C != %C => pop_failure_jump.\n",
7071 (wint_t) c,
7072 (wint_t) p1[3+OFFSET_ADDRESS_SIZE]);
7073 #else
7074 DEBUG_PRINT3 (" %c != %c => pop_failure_jump.\n",
7075 (char) c,
7076 (char) p1[3+OFFSET_ADDRESS_SIZE]);
7077 #endif
7078 }
7079
7080 #ifndef WCHAR
7081 else if ((re_opcode_t) p1[3] == charset
7082 || (re_opcode_t) p1[3] == charset_not)
7083 {
7084 int not = (re_opcode_t) p1[3] == charset_not;
7085
7086 if (c < (unsigned) (p1[4] * BYTEWIDTH)
7087 && p1[5 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH)))
7088 not = !not;
7089
7090 /* `not' is equal to 1 if c would match, which means
7091 that we can't change to pop_failure_jump. */
7092 if (!not)
7093 {
7094 p[-3] = (unsigned char) pop_failure_jump;
7095 DEBUG_PRINT1 (" No match => pop_failure_jump.\n");
7096 }
7097 }
7098 #endif /* not WCHAR */
7099 }
7100 #ifndef WCHAR
7101 else if ((re_opcode_t) *p2 == charset)
7102 {
7103 /* We win if the first character of the loop is not part
7104 of the charset. */
7105 if ((re_opcode_t) p1[3] == exactn
7106 && ! ((int) p2[1] * BYTEWIDTH > (int) p1[5]
7107 && (p2[2 + p1[5] / BYTEWIDTH]
7108 & (1 << (p1[5] % BYTEWIDTH)))))
7109 {
7110 p[-3] = (unsigned char) pop_failure_jump;
7111 DEBUG_PRINT1 (" No match => pop_failure_jump.\n");
7112 }
7113
7114 else if ((re_opcode_t) p1[3] == charset_not)
7115 {
7116 int idx;
7117 /* We win if the charset_not inside the loop
7118 lists every character listed in the charset after. */
7119 for (idx = 0; idx < (int) p2[1]; idx++)
7120 if (! (p2[2 + idx] == 0
7121 || (idx < (int) p1[4]
7122 && ((p2[2 + idx] & ~ p1[5 + idx]) == 0))))
7123 break;
7124
7125 if (idx == p2[1])
7126 {
7127 p[-3] = (unsigned char) pop_failure_jump;
7128 DEBUG_PRINT1 (" No match => pop_failure_jump.\n");
7129 }
7130 }
7131 else if ((re_opcode_t) p1[3] == charset)
7132 {
7133 int idx;
7134 /* We win if the charset inside the loop
7135 has no overlap with the one after the loop. */
7136 for (idx = 0;
7137 idx < (int) p2[1] && idx < (int) p1[4];
7138 idx++)
7139 if ((p2[2 + idx] & p1[5 + idx]) != 0)
7140 break;
7141
7142 if (idx == p2[1] || idx == p1[4])
7143 {
7144 p[-3] = (unsigned char) pop_failure_jump;
7145 DEBUG_PRINT1 (" No match => pop_failure_jump.\n");
7146 }
7147 }
7148 }
7149 #endif /* not WCHAR */
7150 }
7151 p -= OFFSET_ADDRESS_SIZE; /* Point at relative address again. */
7152 if ((re_opcode_t) p[-1] != pop_failure_jump)
7153 {
7154 p[-1] = (UCHAR_T) jump;
7155 DEBUG_PRINT1 (" Match => jump.\n");
7156 goto unconditional_jump;
7157 }
7158 /* Note fall through. */
7159
7160
7161 /* The end of a simple repeat has a pop_failure_jump back to
7162 its matching on_failure_jump, where the latter will push a
7163 failure point. The pop_failure_jump takes off failure
7164 points put on by this pop_failure_jump's matching
7165 on_failure_jump; we got through the pattern to here from the
7166 matching on_failure_jump, so didn't fail. */
7167 case pop_failure_jump:
7168 {
7169 /* We need to pass separate storage for the lowest and
7170 highest registers, even though we don't care about the
7171 actual values. Otherwise, we will restore only one
7172 register from the stack, since lowest will == highest in
7173 `pop_failure_point'. */
7174 active_reg_t dummy_low_reg, dummy_high_reg;
7175 UCHAR_T *pdummy = NULL;
7176 const CHAR_T *sdummy = NULL;
7177
7178 DEBUG_PRINT1 ("EXECUTING pop_failure_jump.\n");
7179 POP_FAILURE_POINT (sdummy, pdummy,
7180 dummy_low_reg, dummy_high_reg,
7181 reg_dummy, reg_dummy, reg_info_dummy);
7182 }
7183 /* Note fall through. */
7184
7185 unconditional_jump:
7186 #ifdef _LIBC
7187 DEBUG_PRINT2 ("\n%p: ", p);
7188 #else
7189 DEBUG_PRINT2 ("\n0x%x: ", p);
7190 #endif
7191 /* Note fall through. */
7192
7193 /* Unconditionally jump (without popping any failure points). */
7194 case jump:
7195 EXTRACT_NUMBER_AND_INCR (mcnt, p); /* Get the amount to jump. */
7196 DEBUG_PRINT2 ("EXECUTING jump %d ", mcnt);
7197 p += mcnt; /* Do the jump. */
7198 #ifdef _LIBC
7199 DEBUG_PRINT2 ("(to %p).\n", p);
7200 #else
7201 DEBUG_PRINT2 ("(to 0x%x).\n", p);
7202 #endif
7203 break;
7204
7205
7206 /* We need this opcode so we can detect where alternatives end
7207 in `group_match_null_string_p' et al. */
7208 case jump_past_alt:
7209 DEBUG_PRINT1 ("EXECUTING jump_past_alt.\n");
7210 goto unconditional_jump;
7211
7212
7213 /* Normally, the on_failure_jump pushes a failure point, which
7214 then gets popped at pop_failure_jump. We will end up at
7215 pop_failure_jump, also, and with a pattern of, say, `a+', we
7216 are skipping over the on_failure_jump, so we have to push
7217 something meaningless for pop_failure_jump to pop. */
7218 case dummy_failure_jump:
7219 DEBUG_PRINT1 ("EXECUTING dummy_failure_jump.\n");
7220 /* It doesn't matter what we push for the string here. What
7221 the code at `fail' tests is the value for the pattern. */
7222 PUSH_FAILURE_POINT (NULL, NULL, -2);
7223 goto unconditional_jump;
7224
7225
7226 /* At the end of an alternative, we need to push a dummy failure
7227 point in case we are followed by a `pop_failure_jump', because
7228 we don't want the failure point for the alternative to be
7229 popped. For example, matching `(a|ab)*' against `aab'
7230 requires that we match the `ab' alternative. */
7231 case push_dummy_failure:
7232 DEBUG_PRINT1 ("EXECUTING push_dummy_failure.\n");
7233 /* See comments just above at `dummy_failure_jump' about the
7234 two NULL arguments. */
7235 PUSH_FAILURE_POINT (NULL, NULL, -2);
7236 break;
7237
7238 /* Have to succeed matching what follows at least n times.
7239 After that, handle like `on_failure_jump'. */
7240 case succeed_n:
7241 EXTRACT_NUMBER (mcnt, p + OFFSET_ADDRESS_SIZE);
7242 DEBUG_PRINT2 ("EXECUTING succeed_n %d.\n", mcnt);
7243
7244 assert (mcnt >= 0);
7245 /* Originally, this is how many times we HAVE to succeed. */
7246 if (mcnt > 0)
7247 {
7248 mcnt--;
7249 p += OFFSET_ADDRESS_SIZE;
7250 STORE_NUMBER_AND_INCR (p, mcnt);
7251 #ifdef _LIBC
7252 DEBUG_PRINT3 (" Setting %p to %d.\n", p - OFFSET_ADDRESS_SIZE
7253 , mcnt);
7254 #else
7255 DEBUG_PRINT3 (" Setting 0x%x to %d.\n", p - OFFSET_ADDRESS_SIZE
7256 , mcnt);
7257 #endif
7258 }
7259 else if (mcnt == 0)
7260 {
7261 #ifdef _LIBC
7262 DEBUG_PRINT2 (" Setting two bytes from %p to no_op.\n",
7263 p + OFFSET_ADDRESS_SIZE);
7264 #else
7265 DEBUG_PRINT2 (" Setting two bytes from 0x%x to no_op.\n",
7266 p + OFFSET_ADDRESS_SIZE);
7267 #endif /* _LIBC */
7268
7269 #ifdef WCHAR
7270 p[1] = (UCHAR_T) no_op;
7271 #else
7272 p[2] = (UCHAR_T) no_op;
7273 p[3] = (UCHAR_T) no_op;
7274 #endif /* WCHAR */
7275 goto on_failure;
7276 }
7277 break;
7278
7279 case jump_n:
7280 EXTRACT_NUMBER (mcnt, p + OFFSET_ADDRESS_SIZE);
7281 DEBUG_PRINT2 ("EXECUTING jump_n %d.\n", mcnt);
7282
7283 /* Originally, this is how many times we CAN jump. */
7284 if (mcnt)
7285 {
7286 mcnt--;
7287 STORE_NUMBER (p + OFFSET_ADDRESS_SIZE, mcnt);
7288
7289 #ifdef _LIBC
7290 DEBUG_PRINT3 (" Setting %p to %d.\n", p + OFFSET_ADDRESS_SIZE,
7291 mcnt);
7292 #else
7293 DEBUG_PRINT3 (" Setting 0x%x to %d.\n", p + OFFSET_ADDRESS_SIZE,
7294 mcnt);
7295 #endif /* _LIBC */
7296 goto unconditional_jump;
7297 }
7298 /* If don't have to jump any more, skip over the rest of command. */
7299 else
7300 p += 2 * OFFSET_ADDRESS_SIZE;
7301 break;
7302
7303 case set_number_at:
7304 {
7305 DEBUG_PRINT1 ("EXECUTING set_number_at.\n");
7306
7307 EXTRACT_NUMBER_AND_INCR (mcnt, p);
7308 p1 = p + mcnt;
7309 EXTRACT_NUMBER_AND_INCR (mcnt, p);
7310 #ifdef _LIBC
7311 DEBUG_PRINT3 (" Setting %p to %d.\n", p1, mcnt);
7312 #else
7313 DEBUG_PRINT3 (" Setting 0x%x to %d.\n", p1, mcnt);
7314 #endif
7315 STORE_NUMBER (p1, mcnt);
7316 break;
7317 }
7318
7319 #if 0
7320 /* The DEC Alpha C compiler 3.x generates incorrect code for the
7321 test WORDCHAR_P (d - 1) != WORDCHAR_P (d) in the expansion of
7322 AT_WORD_BOUNDARY, so this code is disabled. Expanding the
7323 macro and introducing temporary variables works around the bug. */
7324
7325 case wordbound:
7326 DEBUG_PRINT1 ("EXECUTING wordbound.\n");
7327 if (AT_WORD_BOUNDARY (d))
7328 break;
7329 goto fail;
7330
7331 case notwordbound:
7332 DEBUG_PRINT1 ("EXECUTING notwordbound.\n");
7333 if (AT_WORD_BOUNDARY (d))
7334 goto fail;
7335 break;
7336 #else
7337 case wordbound:
7338 {
7339 boolean prevchar, thischar;
7340
7341 DEBUG_PRINT1 ("EXECUTING wordbound.\n");
7342 if (AT_STRINGS_BEG (d) || AT_STRINGS_END (d))
7343 break;
7344
7345 prevchar = WORDCHAR_P (d - 1);
7346 thischar = WORDCHAR_P (d);
7347 if (prevchar != thischar)
7348 break;
7349 goto fail;
7350 }
7351
7352 case notwordbound:
7353 {
7354 boolean prevchar, thischar;
7355
7356 DEBUG_PRINT1 ("EXECUTING notwordbound.\n");
7357 if (AT_STRINGS_BEG (d) || AT_STRINGS_END (d))
7358 goto fail;
7359
7360 prevchar = WORDCHAR_P (d - 1);
7361 thischar = WORDCHAR_P (d);
7362 if (prevchar != thischar)
7363 goto fail;
7364 break;
7365 }
7366 #endif
7367
7368 case wordbeg:
7369 DEBUG_PRINT1 ("EXECUTING wordbeg.\n");
7370 if (!AT_STRINGS_END (d) && WORDCHAR_P (d)
7371 && (AT_STRINGS_BEG (d) || !WORDCHAR_P (d - 1)))
7372 break;
7373 goto fail;
7374
7375 case wordend:
7376 DEBUG_PRINT1 ("EXECUTING wordend.\n");
7377 if (!AT_STRINGS_BEG (d) && WORDCHAR_P (d - 1)
7378 && (AT_STRINGS_END (d) || !WORDCHAR_P (d)))
7379 break;
7380 goto fail;
7381
7382 #ifdef emacs
7383 case before_dot:
7384 DEBUG_PRINT1 ("EXECUTING before_dot.\n");
7385 if (PTR_CHAR_POS ((unsigned char *) d) >= point)
7386 goto fail;
7387 break;
7388
7389 case at_dot:
7390 DEBUG_PRINT1 ("EXECUTING at_dot.\n");
7391 if (PTR_CHAR_POS ((unsigned char *) d) != point)
7392 goto fail;
7393 break;
7394
7395 case after_dot:
7396 DEBUG_PRINT1 ("EXECUTING after_dot.\n");
7397 if (PTR_CHAR_POS ((unsigned char *) d) <= point)
7398 goto fail;
7399 break;
7400
7401 case syntaxspec:
7402 DEBUG_PRINT2 ("EXECUTING syntaxspec %d.\n", mcnt);
7403 mcnt = *p++;
7404 goto matchsyntax;
7405
7406 case wordchar:
7407 DEBUG_PRINT1 ("EXECUTING Emacs wordchar.\n");
7408 mcnt = (int) Sword;
7409 matchsyntax:
7410 PREFETCH ();
7411 /* Can't use *d++ here; SYNTAX may be an unsafe macro. */
7412 d++;
7413 if (SYNTAX (d[-1]) != (enum syntaxcode) mcnt)
7414 goto fail;
7415 SET_REGS_MATCHED ();
7416 break;
7417
7418 case notsyntaxspec:
7419 DEBUG_PRINT2 ("EXECUTING notsyntaxspec %d.\n", mcnt);
7420 mcnt = *p++;
7421 goto matchnotsyntax;
7422
7423 case notwordchar:
7424 DEBUG_PRINT1 ("EXECUTING Emacs notwordchar.\n");
7425 mcnt = (int) Sword;
7426 matchnotsyntax:
7427 PREFETCH ();
7428 /* Can't use *d++ here; SYNTAX may be an unsafe macro. */
7429 d++;
7430 if (SYNTAX (d[-1]) == (enum syntaxcode) mcnt)
7431 goto fail;
7432 SET_REGS_MATCHED ();
7433 break;
7434
7435 #else /* not emacs */
7436 case wordchar:
7437 DEBUG_PRINT1 ("EXECUTING non-Emacs wordchar.\n");
7438 PREFETCH ();
7439 if (!WORDCHAR_P (d))
7440 goto fail;
7441 SET_REGS_MATCHED ();
7442 d++;
7443 break;
7444
7445 case notwordchar:
7446 DEBUG_PRINT1 ("EXECUTING non-Emacs notwordchar.\n");
7447 PREFETCH ();
7448 if (WORDCHAR_P (d))
7449 goto fail;
7450 SET_REGS_MATCHED ();
7451 d++;
7452 break;
7453 #endif /* not emacs */
7454
7455 default:
7456 abort ();
7457 }
7458 continue; /* Successfully executed one pattern command; keep going. */
7459
7460
7461 /* We goto here if a matching operation fails. */
7462 fail:
7463 if (!FAIL_STACK_EMPTY ())
7464 { /* A restart point is known. Restore to that state. */
7465 DEBUG_PRINT1 ("\nFAIL:\n");
7466 POP_FAILURE_POINT (d, p,
7467 lowest_active_reg, highest_active_reg,
7468 regstart, regend, reg_info);
7469
7470 /* If this failure point is a dummy, try the next one. */
7471 if (!p)
7472 goto fail;
7473
7474 /* If we failed to the end of the pattern, don't examine *p. */
7475 assert (p <= pend);
7476 if (p < pend)
7477 {
7478 boolean is_a_jump_n = false;
7479
7480 /* If failed to a backwards jump that's part of a repetition
7481 loop, need to pop this failure point and use the next one. */
7482 switch ((re_opcode_t) *p)
7483 {
7484 case jump_n:
7485 is_a_jump_n = true;
7486 case maybe_pop_jump:
7487 case pop_failure_jump:
7488 case jump:
7489 p1 = p + 1;
7490 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
7491 p1 += mcnt;
7492
7493 if ((is_a_jump_n && (re_opcode_t) *p1 == succeed_n)
7494 || (!is_a_jump_n
7495 && (re_opcode_t) *p1 == on_failure_jump))
7496 goto fail;
7497 break;
7498 default:
7499 /* do nothing */ ;
7500 }
7501 }
7502
7503 if (d >= string1 && d <= end1)
7504 dend = end_match_1;
7505 }
7506 else
7507 break; /* Matching at this starting point really fails. */
7508 } /* for (;;) */
7509
7510 if (best_regs_set)
7511 goto restore_best_regs;
7512
7513 FREE_VARIABLES ();
7514
7515 return -1; /* Failure to match. */
7516 } /* re_match_2 */
7517 \f
7518 /* Subroutine definitions for re_match_2. */
7519
7520
7521 /* We are passed P pointing to a register number after a start_memory.
7522
7523 Return true if the pattern up to the corresponding stop_memory can
7524 match the empty string, and false otherwise.
7525
7526 If we find the matching stop_memory, sets P to point to one past its number.
7527 Otherwise, sets P to an undefined byte less than or equal to END.
7528
7529 We don't handle duplicates properly (yet). */
7530
7531 static boolean
7532 PREFIX(group_match_null_string_p) (p, end, reg_info)
7533 UCHAR_T **p, *end;
7534 PREFIX(register_info_type) *reg_info;
7535 {
7536 int mcnt;
7537 /* Point to after the args to the start_memory. */
7538 UCHAR_T *p1 = *p + 2;
7539
7540 while (p1 < end)
7541 {
7542 /* Skip over opcodes that can match nothing, and return true or
7543 false, as appropriate, when we get to one that can't, or to the
7544 matching stop_memory. */
7545
7546 switch ((re_opcode_t) *p1)
7547 {
7548 /* Could be either a loop or a series of alternatives. */
7549 case on_failure_jump:
7550 p1++;
7551 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
7552
7553 /* If the next operation is not a jump backwards in the
7554 pattern. */
7555
7556 if (mcnt >= 0)
7557 {
7558 /* Go through the on_failure_jumps of the alternatives,
7559 seeing if any of the alternatives cannot match nothing.
7560 The last alternative starts with only a jump,
7561 whereas the rest start with on_failure_jump and end
7562 with a jump, e.g., here is the pattern for `a|b|c':
7563
7564 /on_failure_jump/0/6/exactn/1/a/jump_past_alt/0/6
7565 /on_failure_jump/0/6/exactn/1/b/jump_past_alt/0/3
7566 /exactn/1/c
7567
7568 So, we have to first go through the first (n-1)
7569 alternatives and then deal with the last one separately. */
7570
7571
7572 /* Deal with the first (n-1) alternatives, which start
7573 with an on_failure_jump (see above) that jumps to right
7574 past a jump_past_alt. */
7575
7576 while ((re_opcode_t) p1[mcnt-(1+OFFSET_ADDRESS_SIZE)] ==
7577 jump_past_alt)
7578 {
7579 /* `mcnt' holds how many bytes long the alternative
7580 is, including the ending `jump_past_alt' and
7581 its number. */
7582
7583 if (!PREFIX(alt_match_null_string_p) (p1, p1 + mcnt -
7584 (1 + OFFSET_ADDRESS_SIZE),
7585 reg_info))
7586 return false;
7587
7588 /* Move to right after this alternative, including the
7589 jump_past_alt. */
7590 p1 += mcnt;
7591
7592 /* Break if it's the beginning of an n-th alternative
7593 that doesn't begin with an on_failure_jump. */
7594 if ((re_opcode_t) *p1 != on_failure_jump)
7595 break;
7596
7597 /* Still have to check that it's not an n-th
7598 alternative that starts with an on_failure_jump. */
7599 p1++;
7600 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
7601 if ((re_opcode_t) p1[mcnt-(1+OFFSET_ADDRESS_SIZE)] !=
7602 jump_past_alt)
7603 {
7604 /* Get to the beginning of the n-th alternative. */
7605 p1 -= 1 + OFFSET_ADDRESS_SIZE;
7606 break;
7607 }
7608 }
7609
7610 /* Deal with the last alternative: go back and get number
7611 of the `jump_past_alt' just before it. `mcnt' contains
7612 the length of the alternative. */
7613 EXTRACT_NUMBER (mcnt, p1 - OFFSET_ADDRESS_SIZE);
7614
7615 if (!PREFIX(alt_match_null_string_p) (p1, p1 + mcnt, reg_info))
7616 return false;
7617
7618 p1 += mcnt; /* Get past the n-th alternative. */
7619 } /* if mcnt > 0 */
7620 break;
7621
7622
7623 case stop_memory:
7624 assert (p1[1] == **p);
7625 *p = p1 + 2;
7626 return true;
7627
7628
7629 default:
7630 if (!PREFIX(common_op_match_null_string_p) (&p1, end, reg_info))
7631 return false;
7632 }
7633 } /* while p1 < end */
7634
7635 return false;
7636 } /* group_match_null_string_p */
7637
7638
7639 /* Similar to group_match_null_string_p, but doesn't deal with alternatives:
7640 It expects P to be the first byte of a single alternative and END one
7641 byte past the last. The alternative can contain groups. */
7642
7643 static boolean
7644 PREFIX(alt_match_null_string_p) (p, end, reg_info)
7645 UCHAR_T *p, *end;
7646 PREFIX(register_info_type) *reg_info;
7647 {
7648 int mcnt;
7649 UCHAR_T *p1 = p;
7650
7651 while (p1 < end)
7652 {
7653 /* Skip over opcodes that can match nothing, and break when we get
7654 to one that can't. */
7655
7656 switch ((re_opcode_t) *p1)
7657 {
7658 /* It's a loop. */
7659 case on_failure_jump:
7660 p1++;
7661 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
7662 p1 += mcnt;
7663 break;
7664
7665 default:
7666 if (!PREFIX(common_op_match_null_string_p) (&p1, end, reg_info))
7667 return false;
7668 }
7669 } /* while p1 < end */
7670
7671 return true;
7672 } /* alt_match_null_string_p */
7673
7674
7675 /* Deals with the ops common to group_match_null_string_p and
7676 alt_match_null_string_p.
7677
7678 Sets P to one after the op and its arguments, if any. */
7679
7680 static boolean
7681 PREFIX(common_op_match_null_string_p) (p, end, reg_info)
7682 UCHAR_T **p, *end;
7683 PREFIX(register_info_type) *reg_info;
7684 {
7685 int mcnt;
7686 boolean ret;
7687 int reg_no;
7688 UCHAR_T *p1 = *p;
7689
7690 switch ((re_opcode_t) *p1++)
7691 {
7692 case no_op:
7693 case begline:
7694 case endline:
7695 case begbuf:
7696 case endbuf:
7697 case wordbeg:
7698 case wordend:
7699 case wordbound:
7700 case notwordbound:
7701 #ifdef emacs
7702 case before_dot:
7703 case at_dot:
7704 case after_dot:
7705 #endif
7706 break;
7707
7708 case start_memory:
7709 reg_no = *p1;
7710 assert (reg_no > 0 && reg_no <= MAX_REGNUM);
7711 ret = PREFIX(group_match_null_string_p) (&p1, end, reg_info);
7712
7713 /* Have to set this here in case we're checking a group which
7714 contains a group and a back reference to it. */
7715
7716 if (REG_MATCH_NULL_STRING_P (reg_info[reg_no]) == MATCH_NULL_UNSET_VALUE)
7717 REG_MATCH_NULL_STRING_P (reg_info[reg_no]) = ret;
7718
7719 if (!ret)
7720 return false;
7721 break;
7722
7723 /* If this is an optimized succeed_n for zero times, make the jump. */
7724 case jump:
7725 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
7726 if (mcnt >= 0)
7727 p1 += mcnt;
7728 else
7729 return false;
7730 break;
7731
7732 case succeed_n:
7733 /* Get to the number of times to succeed. */
7734 p1 += OFFSET_ADDRESS_SIZE;
7735 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
7736
7737 if (mcnt == 0)
7738 {
7739 p1 -= 2 * OFFSET_ADDRESS_SIZE;
7740 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
7741 p1 += mcnt;
7742 }
7743 else
7744 return false;
7745 break;
7746
7747 case duplicate:
7748 if (!REG_MATCH_NULL_STRING_P (reg_info[*p1]))
7749 return false;
7750 break;
7751
7752 case set_number_at:
7753 p1 += 2 * OFFSET_ADDRESS_SIZE;
7754
7755 default:
7756 /* All other opcodes mean we cannot match the empty string. */
7757 return false;
7758 }
7759
7760 *p = p1;
7761 return true;
7762 } /* common_op_match_null_string_p */
7763
7764
7765 /* Return zero if TRANSLATE[S1] and TRANSLATE[S2] are identical for LEN
7766 bytes; nonzero otherwise. */
7767
7768 static int
7769 PREFIX(bcmp_translate) (s1, s2, len, translate)
7770 const CHAR_T *s1, *s2;
7771 register int len;
7772 RE_TRANSLATE_TYPE translate;
7773 {
7774 register const UCHAR_T *p1 = (const UCHAR_T *) s1;
7775 register const UCHAR_T *p2 = (const UCHAR_T *) s2;
7776 while (len)
7777 {
7778 #ifdef WCHAR
7779 if (((*p1<=0xff)?translate[*p1++]:*p1++)
7780 != ((*p2<=0xff)?translate[*p2++]:*p2++))
7781 return 1;
7782 #else /* BYTE */
7783 if (translate[*p1++] != translate[*p2++]) return 1;
7784 #endif /* WCHAR */
7785 len--;
7786 }
7787 return 0;
7788 }
7789 \f
7790
7791 #else /* not INSIDE_RECURSION */
7792
7793 /* Entry points for GNU code. */
7794
7795 /* re_compile_pattern is the GNU regular expression compiler: it
7796 compiles PATTERN (of length SIZE) and puts the result in BUFP.
7797 Returns 0 if the pattern was valid, otherwise an error string.
7798
7799 Assumes the `allocated' (and perhaps `buffer') and `translate' fields
7800 are set in BUFP on entry.
7801
7802 We call regex_compile to do the actual compilation. */
7803
7804 const char *
7805 re_compile_pattern (pattern, length, bufp)
7806 const char *pattern;
7807 size_t length;
7808 struct re_pattern_buffer *bufp;
7809 {
7810 reg_errcode_t ret;
7811
7812 /* GNU code is written to assume at least RE_NREGS registers will be set
7813 (and at least one extra will be -1). */
7814 bufp->regs_allocated = REGS_UNALLOCATED;
7815
7816 /* And GNU code determines whether or not to get register information
7817 by passing null for the REGS argument to re_match, etc., not by
7818 setting no_sub. */
7819 bufp->no_sub = 0;
7820
7821 /* Match anchors at newline. */
7822 bufp->newline_anchor = 1;
7823
7824 # ifdef MBS_SUPPORT
7825 if (MB_CUR_MAX != 1)
7826 ret = wcs_regex_compile (pattern, length, re_syntax_options, bufp);
7827 else
7828 # endif
7829 ret = byte_regex_compile (pattern, length, re_syntax_options, bufp);
7830
7831 if (!ret)
7832 return NULL;
7833 return gettext (re_error_msgid + re_error_msgid_idx[(int) ret]);
7834 }
7835 #ifdef _LIBC
7836 weak_alias (__re_compile_pattern, re_compile_pattern)
7837 #endif
7838 \f
7839 /* Entry points compatible with 4.2 BSD regex library. We don't define
7840 them unless specifically requested. */
7841
7842 #if defined _REGEX_RE_COMP || defined _LIBC
7843
7844 /* BSD has one and only one pattern buffer. */
7845 static struct re_pattern_buffer re_comp_buf;
7846
7847 char *
7848 #ifdef _LIBC
7849 /* Make these definitions weak in libc, so POSIX programs can redefine
7850 these names if they don't use our functions, and still use
7851 regcomp/regexec below without link errors. */
7852 weak_function
7853 #endif
7854 re_comp (s)
7855 const char *s;
7856 {
7857 reg_errcode_t ret;
7858
7859 if (!s)
7860 {
7861 if (!re_comp_buf.buffer)
7862 return gettext ("No previous regular expression");
7863 return 0;
7864 }
7865
7866 if (!re_comp_buf.buffer)
7867 {
7868 re_comp_buf.buffer = (unsigned char *) malloc (200);
7869 if (re_comp_buf.buffer == NULL)
7870 return (char *) gettext (re_error_msgid
7871 + re_error_msgid_idx[(int) REG_ESPACE]);
7872 re_comp_buf.allocated = 200;
7873
7874 re_comp_buf.fastmap = (char *) malloc (1 << BYTEWIDTH);
7875 if (re_comp_buf.fastmap == NULL)
7876 return (char *) gettext (re_error_msgid
7877 + re_error_msgid_idx[(int) REG_ESPACE]);
7878 }
7879
7880 /* Since `re_exec' always passes NULL for the `regs' argument, we
7881 don't need to initialize the pattern buffer fields which affect it. */
7882
7883 /* Match anchors at newlines. */
7884 re_comp_buf.newline_anchor = 1;
7885
7886 # ifdef MBS_SUPPORT
7887 if (MB_CUR_MAX != 1)
7888 ret = wcs_regex_compile (s, strlen (s), re_syntax_options, &re_comp_buf);
7889 else
7890 # endif
7891 ret = byte_regex_compile (s, strlen (s), re_syntax_options, &re_comp_buf);
7892
7893 if (!ret)
7894 return NULL;
7895
7896 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
7897 return (char *) gettext (re_error_msgid + re_error_msgid_idx[(int) ret]);
7898 }
7899
7900
7901 int
7902 #ifdef _LIBC
7903 weak_function
7904 #endif
7905 re_exec (s)
7906 const char *s;
7907 {
7908 const int len = strlen (s);
7909 return
7910 0 <= re_search (&re_comp_buf, s, len, 0, len, (struct re_registers *) 0);
7911 }
7912
7913 #endif /* _REGEX_RE_COMP */
7914 \f
7915 /* POSIX.2 functions. Don't define these for Emacs. */
7916
7917 #ifndef emacs
7918
7919 /* regcomp takes a regular expression as a string and compiles it.
7920
7921 PREG is a regex_t *. We do not expect any fields to be initialized,
7922 since POSIX says we shouldn't. Thus, we set
7923
7924 `buffer' to the compiled pattern;
7925 `used' to the length of the compiled pattern;
7926 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
7927 REG_EXTENDED bit in CFLAGS is set; otherwise, to
7928 RE_SYNTAX_POSIX_BASIC;
7929 `newline_anchor' to REG_NEWLINE being set in CFLAGS;
7930 `fastmap' to an allocated space for the fastmap;
7931 `fastmap_accurate' to zero;
7932 `re_nsub' to the number of subexpressions in PATTERN.
7933
7934 PATTERN is the address of the pattern string.
7935
7936 CFLAGS is a series of bits which affect compilation.
7937
7938 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
7939 use POSIX basic syntax.
7940
7941 If REG_NEWLINE is set, then . and [^...] don't match newline.
7942 Also, regexec will try a match beginning after every newline.
7943
7944 If REG_ICASE is set, then we considers upper- and lowercase
7945 versions of letters to be equivalent when matching.
7946
7947 If REG_NOSUB is set, then when PREG is passed to regexec, that
7948 routine will report only success or failure, and nothing about the
7949 registers.
7950
7951 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
7952 the return codes and their meanings.) */
7953
7954 int
7955 regcomp (preg, pattern, cflags)
7956 regex_t *preg;
7957 const char *pattern;
7958 int cflags;
7959 {
7960 reg_errcode_t ret;
7961 reg_syntax_t syntax
7962 = (cflags & REG_EXTENDED) ?
7963 RE_SYNTAX_POSIX_EXTENDED : RE_SYNTAX_POSIX_BASIC;
7964
7965 /* regex_compile will allocate the space for the compiled pattern. */
7966 preg->buffer = 0;
7967 preg->allocated = 0;
7968 preg->used = 0;
7969
7970 /* Try to allocate space for the fastmap. */
7971 preg->fastmap = (char *) malloc (1 << BYTEWIDTH);
7972
7973 if (cflags & REG_ICASE)
7974 {
7975 unsigned i;
7976
7977 preg->translate
7978 = (RE_TRANSLATE_TYPE) malloc (CHAR_SET_SIZE
7979 * sizeof (*(RE_TRANSLATE_TYPE)0));
7980 if (preg->translate == NULL)
7981 return (int) REG_ESPACE;
7982
7983 /* Map uppercase characters to corresponding lowercase ones. */
7984 for (i = 0; i < CHAR_SET_SIZE; i++)
7985 preg->translate[i] = ISUPPER (i) ? TOLOWER (i) : i;
7986 }
7987 else
7988 preg->translate = NULL;
7989
7990 /* If REG_NEWLINE is set, newlines are treated differently. */
7991 if (cflags & REG_NEWLINE)
7992 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
7993 syntax &= ~RE_DOT_NEWLINE;
7994 syntax |= RE_HAT_LISTS_NOT_NEWLINE;
7995 /* It also changes the matching behavior. */
7996 preg->newline_anchor = 1;
7997 }
7998 else
7999 preg->newline_anchor = 0;
8000
8001 preg->no_sub = !!(cflags & REG_NOSUB);
8002
8003 /* POSIX says a null character in the pattern terminates it, so we
8004 can use strlen here in compiling the pattern. */
8005 # ifdef MBS_SUPPORT
8006 if (MB_CUR_MAX != 1)
8007 ret = wcs_regex_compile (pattern, strlen (pattern), syntax, preg);
8008 else
8009 # endif
8010 ret = byte_regex_compile (pattern, strlen (pattern), syntax, preg);
8011
8012 /* POSIX doesn't distinguish between an unmatched open-group and an
8013 unmatched close-group: both are REG_EPAREN. */
8014 if (ret == REG_ERPAREN) ret = REG_EPAREN;
8015
8016 if (ret == REG_NOERROR && preg->fastmap)
8017 {
8018 /* Compute the fastmap now, since regexec cannot modify the pattern
8019 buffer. */
8020 if (re_compile_fastmap (preg) == -2)
8021 {
8022 /* Some error occurred while computing the fastmap, just forget
8023 about it. */
8024 free (preg->fastmap);
8025 preg->fastmap = NULL;
8026 }
8027 }
8028
8029 return (int) ret;
8030 }
8031 #ifdef _LIBC
8032 weak_alias (__regcomp, regcomp)
8033 #endif
8034
8035
8036 /* regexec searches for a given pattern, specified by PREG, in the
8037 string STRING.
8038
8039 If NMATCH is zero or REG_NOSUB was set in the cflags argument to
8040 `regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at
8041 least NMATCH elements, and we set them to the offsets of the
8042 corresponding matched substrings.
8043
8044 EFLAGS specifies `execution flags' which affect matching: if
8045 REG_NOTBOL is set, then ^ does not match at the beginning of the
8046 string; if REG_NOTEOL is set, then $ does not match at the end.
8047
8048 We return 0 if we find a match and REG_NOMATCH if not. */
8049
8050 int
8051 regexec (preg, string, nmatch, pmatch, eflags)
8052 const regex_t *preg;
8053 const char *string;
8054 size_t nmatch;
8055 regmatch_t pmatch[];
8056 int eflags;
8057 {
8058 int ret;
8059 struct re_registers regs;
8060 regex_t private_preg;
8061 int len = strlen (string);
8062 boolean want_reg_info = !preg->no_sub && nmatch > 0;
8063
8064 private_preg = *preg;
8065
8066 private_preg.not_bol = !!(eflags & REG_NOTBOL);
8067 private_preg.not_eol = !!(eflags & REG_NOTEOL);
8068
8069 /* The user has told us exactly how many registers to return
8070 information about, via `nmatch'. We have to pass that on to the
8071 matching routines. */
8072 private_preg.regs_allocated = REGS_FIXED;
8073
8074 if (want_reg_info)
8075 {
8076 regs.num_regs = nmatch;
8077 regs.start = TALLOC (nmatch * 2, regoff_t);
8078 if (regs.start == NULL)
8079 return (int) REG_NOMATCH;
8080 regs.end = regs.start + nmatch;
8081 }
8082
8083 /* Perform the searching operation. */
8084 ret = re_search (&private_preg, string, len,
8085 /* start: */ 0, /* range: */ len,
8086 want_reg_info ? &regs : (struct re_registers *) 0);
8087
8088 /* Copy the register information to the POSIX structure. */
8089 if (want_reg_info)
8090 {
8091 if (ret >= 0)
8092 {
8093 unsigned r;
8094
8095 for (r = 0; r < nmatch; r++)
8096 {
8097 pmatch[r].rm_so = regs.start[r];
8098 pmatch[r].rm_eo = regs.end[r];
8099 }
8100 }
8101
8102 /* If we needed the temporary register info, free the space now. */
8103 free (regs.start);
8104 }
8105
8106 /* We want zero return to mean success, unlike `re_search'. */
8107 return ret >= 0 ? (int) REG_NOERROR : (int) REG_NOMATCH;
8108 }
8109 #ifdef _LIBC
8110 weak_alias (__regexec, regexec)
8111 #endif
8112
8113
8114 /* Returns a message corresponding to an error code, ERRCODE, returned
8115 from either regcomp or regexec. We don't use PREG here. */
8116
8117 size_t
8118 regerror (errcode, preg, errbuf, errbuf_size)
8119 int errcode;
8120 const regex_t *preg;
8121 char *errbuf;
8122 size_t errbuf_size;
8123 {
8124 const char *msg;
8125 size_t msg_size;
8126
8127 if (errcode < 0
8128 || errcode >= (int) (sizeof (re_error_msgid_idx)
8129 / sizeof (re_error_msgid_idx[0])))
8130 /* Only error codes returned by the rest of the code should be passed
8131 to this routine. If we are given anything else, or if other regex
8132 code generates an invalid error code, then the program has a bug.
8133 Dump core so we can fix it. */
8134 abort ();
8135
8136 msg = gettext (re_error_msgid + re_error_msgid_idx[errcode]);
8137
8138 msg_size = strlen (msg) + 1; /* Includes the null. */
8139
8140 if (errbuf_size != 0)
8141 {
8142 if (msg_size > errbuf_size)
8143 {
8144 #if defined HAVE_MEMPCPY || defined _LIBC
8145 *((char *) __mempcpy (errbuf, msg, errbuf_size - 1)) = '\0';
8146 #else
8147 memcpy (errbuf, msg, errbuf_size - 1);
8148 errbuf[errbuf_size - 1] = 0;
8149 #endif
8150 }
8151 else
8152 memcpy (errbuf, msg, msg_size);
8153 }
8154
8155 return msg_size;
8156 }
8157 #ifdef _LIBC
8158 weak_alias (__regerror, regerror)
8159 #endif
8160
8161
8162 /* Free dynamically allocated space used by PREG. */
8163
8164 void
8165 regfree (preg)
8166 regex_t *preg;
8167 {
8168 if (preg->buffer != NULL)
8169 free (preg->buffer);
8170 preg->buffer = NULL;
8171
8172 preg->allocated = 0;
8173 preg->used = 0;
8174
8175 if (preg->fastmap != NULL)
8176 free (preg->fastmap);
8177 preg->fastmap = NULL;
8178 preg->fastmap_accurate = 0;
8179
8180 if (preg->translate != NULL)
8181 free (preg->translate);
8182 preg->translate = NULL;
8183 }
8184 #ifdef _LIBC
8185 weak_alias (__regfree, regfree)
8186 #endif
8187
8188 #endif /* not emacs */
8189
8190 #endif /* not INSIDE_RECURSION */
8191
8192 \f
8193 #undef STORE_NUMBER
8194 #undef STORE_NUMBER_AND_INCR
8195 #undef EXTRACT_NUMBER
8196 #undef EXTRACT_NUMBER_AND_INCR
8197
8198 #undef DEBUG_PRINT_COMPILED_PATTERN
8199 #undef DEBUG_PRINT_DOUBLE_STRING
8200
8201 #undef INIT_FAIL_STACK
8202 #undef RESET_FAIL_STACK
8203 #undef DOUBLE_FAIL_STACK
8204 #undef PUSH_PATTERN_OP
8205 #undef PUSH_FAILURE_POINTER
8206 #undef PUSH_FAILURE_INT
8207 #undef PUSH_FAILURE_ELT
8208 #undef POP_FAILURE_POINTER
8209 #undef POP_FAILURE_INT
8210 #undef POP_FAILURE_ELT
8211 #undef DEBUG_PUSH
8212 #undef DEBUG_POP
8213 #undef PUSH_FAILURE_POINT
8214 #undef POP_FAILURE_POINT
8215
8216 #undef REG_UNSET_VALUE
8217 #undef REG_UNSET
8218
8219 #undef PATFETCH
8220 #undef PATFETCH_RAW
8221 #undef PATUNFETCH
8222 #undef TRANSLATE
8223
8224 #undef INIT_BUF_SIZE
8225 #undef GET_BUFFER_SPACE
8226 #undef BUF_PUSH
8227 #undef BUF_PUSH_2
8228 #undef BUF_PUSH_3
8229 #undef STORE_JUMP
8230 #undef STORE_JUMP2
8231 #undef INSERT_JUMP
8232 #undef INSERT_JUMP2
8233 #undef EXTEND_BUFFER
8234 #undef GET_UNSIGNED_NUMBER
8235 #undef FREE_STACK_RETURN
8236
8237 # undef POINTER_TO_OFFSET
8238 # undef MATCHING_IN_FRST_STRING
8239 # undef PREFETCH
8240 # undef AT_STRINGS_BEG
8241 # undef AT_STRINGS_END
8242 # undef WORDCHAR_P
8243 # undef FREE_VAR
8244 # undef FREE_VARIABLES
8245 # undef NO_HIGHEST_ACTIVE_REG
8246 # undef NO_LOWEST_ACTIVE_REG
8247
8248 # undef CHAR_T
8249 # undef UCHAR_T
8250 # undef COMPILED_BUFFER_VAR
8251 # undef OFFSET_ADDRESS_SIZE
8252 # undef CHAR_CLASS_SIZE
8253 # undef PREFIX
8254 # undef ARG_PREFIX
8255 # undef PUT_CHAR
8256 # undef BYTE
8257 # undef WCHAR
8258
8259 # define DEFINED_ONCE
This page took 0.238517 seconds and 4 git commands to generate.