1 /* Extended regular expression matching and search library.
2 Copyright (C) 1985, 1989 Free Software Foundation, Inc.
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 2 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
18 /* To test, compile with -Dtest.
19 This Dtestable feature turns this into a self-contained program
20 which reads a pattern, describes how it compiles,
21 then reads a string and searches for it. */
25 /* The `emacs' switch turns on certain special matching commands
26 that make sense only in emacs. */
35 #define bcmp(s1,s2,n) memcmp((s1),(s2),(n))
36 #define bzero(s,n) memset((s),0,(n))
38 /* Make alloca work the best possible way. */
40 #define alloca __builtin_alloca
48 * Define the syntax stuff, so we can do the \<...\> things.
51 #ifndef Sword /* must be non-zero in some of the tests below... */
55 #define SYNTAX(c) re_syntax_table[c]
59 char *re_syntax_table
;
63 static char re_syntax_table
[256];
74 bzero (re_syntax_table
, sizeof re_syntax_table
);
76 for (c
= 'a'; c
<= 'z'; c
++)
77 re_syntax_table
[c
] = Sword
;
79 for (c
= 'A'; c
<= 'Z'; c
++)
80 re_syntax_table
[c
] = Sword
;
82 for (c
= '0'; c
<= '9'; c
++)
83 re_syntax_table
[c
] = Sword
;
88 #endif /* SYNTAX_TABLE */
89 #endif /* not emacs */
93 /* Number of failure points to allocate space for initially,
94 when matching. If this number is exceeded, more space is allocated,
95 so it is not a hard limit. */
99 #endif /* NFAILURES */
101 /* width of a byte in bits */
105 #ifndef SIGN_EXTEND_CHAR
106 #define SIGN_EXTEND_CHAR(x) (x)
109 static int obscure_syntax
= 0;
111 /* Specify the precise syntax of regexp for compilation.
112 This provides for compatibility for various utilities
113 which historically have different, incompatible syntaxes.
115 The argument SYNTAX is a bit-mask containing the two bits
116 RE_NO_BK_PARENS and RE_NO_BK_VBAR. */
119 re_set_syntax (syntax
)
124 ret
= obscure_syntax
;
125 obscure_syntax
= syntax
;
129 /* re_compile_pattern takes a regular-expression string
130 and converts it into a buffer full of byte commands for matching.
132 PATTERN is the address of the pattern string
133 SIZE is the length of it.
134 BUFP is a struct re_pattern_buffer * which points to the info
135 on where to store the byte commands.
136 This structure contains a char * which points to the
137 actual space, which should have been obtained with malloc.
138 re_compile_pattern may use realloc to grow the buffer space.
140 The number of bytes of commands can be found out by looking in
141 the struct re_pattern_buffer that bufp pointed to,
142 after re_compile_pattern returns.
145 #define PATPUSH(ch) (*b++ = (char) (ch))
147 #define PATFETCH(c) \
148 {if (p == pend) goto end_of_pattern; \
149 c = * (unsigned char *) p++; \
150 if (translate) c = translate[c]; }
152 #define PATFETCH_RAW(c) \
153 {if (p == pend) goto end_of_pattern; \
154 c = * (unsigned char *) p++; }
156 #define PATUNFETCH p--
158 #define EXTEND_BUFFER \
159 { char *old_buffer = bufp->buffer; \
160 if (bufp->allocated == (1<<16)) goto too_big; \
161 bufp->allocated *= 2; \
162 if (bufp->allocated > (1<<16)) bufp->allocated = (1<<16); \
163 if (!(bufp->buffer = (char *) realloc (bufp->buffer, bufp->allocated))) \
164 goto memory_exhausted; \
165 c = bufp->buffer - old_buffer; \
173 pending_exact += c; \
176 static void store_jump (), insert_jump ();
179 re_compile_pattern (pattern
, size
, bufp
)
182 struct re_pattern_buffer
*bufp
;
184 register char *b
= bufp
->buffer
;
185 register char *p
= pattern
;
186 char *pend
= pattern
+ size
;
187 register unsigned c
, c1
;
189 unsigned char *translate
= (unsigned char *) bufp
->translate
;
191 /* address of the count-byte of the most recently inserted "exactn" command.
192 This makes it possible to tell whether a new exact-match character
193 can be added to that command or requires a new "exactn" command. */
195 char *pending_exact
= 0;
197 /* address of the place where a forward-jump should go
198 to the end of the containing expression.
199 Each alternative of an "or", except the last, ends with a forward-jump
202 char *fixup_jump
= 0;
204 /* address of start of the most recently finished expression.
205 This tells postfix * where to find the start of its operand. */
209 /* In processing a repeat, 1 means zero matches is allowed */
213 /* In processing a repeat, 1 means many matches is allowed */
217 /* address of beginning of regexp, or inside of last \( */
221 /* Stack of information saved by \( and restored by \).
222 Four stack elements are pushed by each \(:
223 First, the value of b.
224 Second, the value of fixup_jump.
225 Third, the value of regnum.
226 Fourth, the value of begalt. */
229 int *stackp
= stackb
;
230 int *stacke
= stackb
+ 40;
233 /* Counts \('s as they are encountered. Remembered for the matching \),
234 where it becomes the "register number" to put in the stop_memory command */
238 bufp
->fastmap_accurate
= 0;
243 * Initialize the syntax table.
249 if (bufp
->allocated
== 0)
251 bufp
->allocated
= 28;
253 /* EXTEND_BUFFER loses when bufp->allocated is 0 */
254 bufp
->buffer
= (char *) realloc (bufp
->buffer
, 28);
256 /* Caller did not allocate a buffer. Do it for him */
257 bufp
->buffer
= (char *) malloc (28);
258 if (!bufp
->buffer
) goto memory_exhausted
;
259 begalt
= b
= bufp
->buffer
;
264 if (b
- bufp
->buffer
> bufp
->allocated
- 10)
265 /* Note that EXTEND_BUFFER clobbers c */
273 if (obscure_syntax
& RE_TIGHT_VBAR
)
275 if (! (obscure_syntax
& RE_CONTEXT_INDEP_OPS
) && p
!= pend
)
277 /* Make operand of last vbar end before this `$'. */
279 store_jump (fixup_jump
, jump
, b
);
285 /* $ means succeed if at end of line, but only in special contexts.
286 If randomly in the middle of a pattern, it is a normal character. */
287 if (p
== pend
|| *p
== '\n'
288 || (obscure_syntax
& RE_CONTEXT_INDEP_OPS
)
289 || (obscure_syntax
& RE_NO_BK_PARENS
291 : *p
== '\\' && p
[1] == ')')
292 || (obscure_syntax
& RE_NO_BK_VBAR
294 : *p
== '\\' && p
[1] == '|'))
302 /* ^ means succeed if at beg of line, but only if no preceding pattern. */
304 if (laststart
&& p
[-2] != '\n'
305 && ! (obscure_syntax
& RE_CONTEXT_INDEP_OPS
))
307 if (obscure_syntax
& RE_TIGHT_VBAR
)
310 && ! (obscure_syntax
& RE_CONTEXT_INDEP_OPS
))
321 if (obscure_syntax
& RE_BK_PLUS_QM
)
325 /* If there is no previous pattern, char not special. */
326 if (!laststart
&& ! (obscure_syntax
& RE_CONTEXT_INDEP_OPS
))
328 /* If there is a sequence of repetition chars,
329 collapse it down to equivalent to just one. */
334 zero_times_ok
|= c
!= '+';
335 many_times_ok
|= c
!= '?';
341 else if (!(obscure_syntax
& RE_BK_PLUS_QM
)
342 && (c
== '+' || c
== '?'))
344 else if ((obscure_syntax
& RE_BK_PLUS_QM
)
349 if (!(c1
== '+' || c1
== '?'))
364 /* Star, etc. applied to an empty pattern is equivalent
365 to an empty pattern. */
369 /* Now we know whether 0 matches is allowed,
370 and whether 2 or more matches is allowed. */
373 /* If more than one repetition is allowed,
374 put in a backward jump at the end. */
375 store_jump (b
, maybe_finalize_jump
, laststart
- 3);
378 insert_jump (on_failure_jump
, laststart
, b
+ 3, b
);
383 /* At least one repetition required: insert before the loop
384 a skip over the initial on-failure-jump instruction */
385 insert_jump (dummy_failure_jump
, laststart
, laststart
+ 6, b
);
396 while (b
- bufp
->buffer
397 > bufp
->allocated
- 3 - (1 << BYTEWIDTH
) / BYTEWIDTH
)
398 /* Note that EXTEND_BUFFER clobbers c */
403 PATPUSH (charset_not
), p
++;
408 PATPUSH ((1 << BYTEWIDTH
) / BYTEWIDTH
);
409 /* Clear the whole map */
410 bzero (b
, (1 << BYTEWIDTH
) / BYTEWIDTH
);
411 /* Read in characters and ranges, setting map bits */
415 if (c
== ']' && p
!= p1
+ 1) break;
416 if (*p
== '-' && p
[1] != ']')
421 b
[c
/ BYTEWIDTH
] |= 1 << (c
% BYTEWIDTH
), c
++;
425 b
[c
/ BYTEWIDTH
] |= 1 << (c
% BYTEWIDTH
);
428 /* Discard any bitmap bytes that are all 0 at the end of the map.
429 Decrement the map-length byte too. */
430 while ((int) b
[-1] > 0 && b
[b
[-1] - 1] == 0)
436 if (! (obscure_syntax
& RE_NO_BK_PARENS
))
442 if (! (obscure_syntax
& RE_NO_BK_PARENS
))
448 if (! (obscure_syntax
& RE_NEWLINE_OR
))
454 if (! (obscure_syntax
& RE_NO_BK_VBAR
))
460 if (p
== pend
) goto invalid_pattern
;
465 if (obscure_syntax
& RE_NO_BK_PARENS
)
468 if (stackp
== stacke
) goto nesting_too_deep
;
469 if (regnum
< RE_NREGS
)
471 PATPUSH (start_memory
);
474 *stackp
++ = b
- bufp
->buffer
;
475 *stackp
++ = fixup_jump
? fixup_jump
- bufp
->buffer
+ 1 : 0;
476 *stackp
++ = regnum
++;
477 *stackp
++ = begalt
- bufp
->buffer
;
484 if (obscure_syntax
& RE_NO_BK_PARENS
)
487 if (stackp
== stackb
) goto unmatched_close
;
488 begalt
= *--stackp
+ bufp
->buffer
;
490 store_jump (fixup_jump
, jump
, b
);
491 if (stackp
[-1] < RE_NREGS
)
493 PATPUSH (stop_memory
);
494 PATPUSH (stackp
[-1]);
499 fixup_jump
= *stackp
+ bufp
->buffer
- 1;
500 laststart
= *--stackp
+ bufp
->buffer
;
504 if (obscure_syntax
& RE_NO_BK_VBAR
)
507 insert_jump (on_failure_jump
, begalt
, b
+ 6, b
);
511 store_jump (fixup_jump
, jump
, b
);
525 PATPUSH (syntaxspec
);
527 PATPUSH (syntax_spec_code
[c
]);
532 PATPUSH (notsyntaxspec
);
534 PATPUSH (syntax_spec_code
[c
]);
545 PATPUSH (notwordchar
);
561 PATPUSH (notwordbound
);
584 for (stackt
= stackp
- 2; stackt
> stackb
; stackt
-= 4)
594 if (obscure_syntax
& RE_BK_PLUS_QM
)
599 /* You might think it would be useful for \ to mean
600 not to translate; but if we don't translate it
601 it will never match anything. */
602 if (translate
) c
= translate
[c
];
609 if (!pending_exact
|| pending_exact
+ *pending_exact
+ 1 != b
610 || *pending_exact
== 0177 || *p
== '*' || *p
== '^'
611 || ((obscure_syntax
& RE_BK_PLUS_QM
)
612 ? *p
== '\\' && (p
[1] == '+' || p
[1] == '?')
613 : (*p
== '+' || *p
== '?')))
626 store_jump (fixup_jump
, jump
, b
);
628 if (stackp
!= stackb
) goto unmatched_open
;
630 bufp
->used
= b
- bufp
->buffer
;
634 return "Invalid regular expression";
637 return "Unmatched \\(";
640 return "Unmatched \\)";
643 return "Premature end of regular expression";
646 return "Nesting too deep";
649 return "Regular expression too big";
652 return "Memory exhausted";
655 /* Store where `from' points a jump operation to jump to where `to' points.
656 `opcode' is the opcode to store. */
659 store_jump (from
, opcode
, to
)
664 from
[1] = (to
- (from
+ 3)) & 0377;
665 from
[2] = (to
- (from
+ 3)) >> 8;
668 /* Open up space at char FROM, and insert there a jump to TO.
669 CURRENT_END gives te end of the storage no in use,
670 so we know how much data to copy up.
671 OP is the opcode of the jump to insert.
673 If you call this function, you must zero out pending_exact. */
676 insert_jump (op
, from
, to
, current_end
)
678 char *from
, *to
, *current_end
;
680 register char *pto
= current_end
+ 3;
681 register char *pfrom
= current_end
;
682 while (pfrom
!= from
)
684 store_jump (from
, op
, to
);
687 /* Given a pattern, compute a fastmap from it.
688 The fastmap records which of the (1 << BYTEWIDTH) possible characters
689 can start a string that matches the pattern.
690 This fastmap is used by re_search to skip quickly over totally implausible text.
692 The caller must supply the address of a (1 << BYTEWIDTH)-byte data area
694 The other components of bufp describe the pattern to be used. */
697 re_compile_fastmap (bufp
)
698 struct re_pattern_buffer
*bufp
;
700 unsigned char *pattern
= (unsigned char *) bufp
->buffer
;
701 int size
= bufp
->used
;
702 register char *fastmap
= bufp
->fastmap
;
703 register unsigned char *p
= pattern
;
704 register unsigned char *pend
= pattern
+ size
;
706 unsigned char *translate
= (unsigned char *) bufp
->translate
;
708 unsigned char *stackb
[NFAILURES
];
709 unsigned char **stackp
= stackb
;
711 bzero (fastmap
, (1 << BYTEWIDTH
));
712 bufp
->fastmap_accurate
= 1;
713 bufp
->can_be_null
= 0;
719 bufp
->can_be_null
= 1;
722 #ifdef SWITCH_ENUM_BUG
723 switch ((int) ((enum regexpcode
) *p
++))
725 switch ((enum regexpcode
) *p
++)
730 fastmap
[translate
[p
[1]]] = 1;
749 fastmap
[translate
['\n']] = 1;
752 if (bufp
->can_be_null
!= 1)
753 bufp
->can_be_null
= 2;
757 case maybe_finalize_jump
:
759 case dummy_failure_jump
:
760 bufp
->can_be_null
= 1;
762 j
+= SIGN_EXTEND_CHAR (*(char *)p
) << 8;
763 p
+= j
+ 1; /* The 1 compensates for missing ++ above */
766 /* Jump backward reached implies we just went through
767 the body of a loop and matched nothing.
768 Opcode jumped to should be an on_failure_jump.
769 Just treat it like an ordinary jump.
770 For a * loop, it has pushed its failure point already;
771 if so, discard that as redundant. */
772 if ((enum regexpcode
) *p
!= on_failure_jump
)
776 j
+= SIGN_EXTEND_CHAR (*(char *)p
) << 8;
777 p
+= j
+ 1; /* The 1 compensates for missing ++ above */
778 if (stackp
!= stackb
&& *stackp
== p
)
782 case on_failure_jump
:
784 j
+= SIGN_EXTEND_CHAR (*(char *)p
) << 8;
795 bufp
->can_be_null
= 1;
798 for (j
= 0; j
< (1 << BYTEWIDTH
); j
++)
801 if (bufp
->can_be_null
)
803 /* Don't return; check the alternative paths
804 so we can set can_be_null if appropriate. */
808 for (j
= 0; j
< (1 << BYTEWIDTH
); j
++)
809 if (SYNTAX (j
) == Sword
)
814 for (j
= 0; j
< (1 << BYTEWIDTH
); j
++)
815 if (SYNTAX (j
) != Sword
)
822 for (j
= 0; j
< (1 << BYTEWIDTH
); j
++)
823 if (SYNTAX (j
) == (enum syntaxcode
) k
)
829 for (j
= 0; j
< (1 << BYTEWIDTH
); j
++)
830 if (SYNTAX (j
) != (enum syntaxcode
) k
)
836 for (j
= *p
++ * BYTEWIDTH
- 1; j
>= 0; j
--)
837 if (p
[j
/ BYTEWIDTH
] & (1 << (j
% BYTEWIDTH
)))
840 fastmap
[translate
[j
]] = 1;
847 /* Chars beyond end of map must be allowed */
848 for (j
= *p
* BYTEWIDTH
; j
< (1 << BYTEWIDTH
); j
++)
850 fastmap
[translate
[j
]] = 1;
854 for (j
= *p
++ * BYTEWIDTH
- 1; j
>= 0; j
--)
855 if (!(p
[j
/ BYTEWIDTH
] & (1 << (j
% BYTEWIDTH
))))
858 fastmap
[translate
[j
]] = 1;
865 /* Get here means we have successfully found the possible starting characters
866 of one path of the pattern. We need not follow this path any farther.
867 Instead, look at the next alternative remembered in the stack. */
868 if (stackp
!= stackb
)
875 /* Like re_search_2, below, but only one string is specified. */
878 re_search (pbufp
, string
, size
, startpos
, range
, regs
)
879 struct re_pattern_buffer
*pbufp
;
881 int size
, startpos
, range
;
882 struct re_registers
*regs
;
884 return re_search_2 (pbufp
, 0, 0, string
, size
, startpos
, range
, regs
, size
);
887 /* Like re_match_2 but tries first a match starting at index STARTPOS,
888 then at STARTPOS + 1, and so on.
889 RANGE is the number of places to try before giving up.
890 If RANGE is negative, the starting positions tried are
891 STARTPOS, STARTPOS - 1, etc.
892 It is up to the caller to make sure that range is not so large
893 as to take the starting position outside of the input strings.
895 The value returned is the position at which the match was found,
896 or -1 if no match was found,
897 or -2 if error (such as failure stack overflow). */
900 re_search_2 (pbufp
, string1
, size1
, string2
, size2
, startpos
, range
, regs
, mstop
)
901 struct re_pattern_buffer
*pbufp
;
902 char *string1
, *string2
;
906 struct re_registers
*regs
;
909 register char *fastmap
= pbufp
->fastmap
;
910 register unsigned char *translate
= (unsigned char *) pbufp
->translate
;
911 int total
= size1
+ size2
;
914 /* Update the fastmap now if not correct already */
915 if (fastmap
&& !pbufp
->fastmap_accurate
)
916 re_compile_fastmap (pbufp
);
918 /* Don't waste time in a long search for a pattern
919 that says it is anchored. */
920 if (pbufp
->used
> 0 && (enum regexpcode
) pbufp
->buffer
[0] == begbuf
931 /* If a fastmap is supplied, skip quickly over characters
932 that cannot possibly be the start of a match.
933 Note, however, that if the pattern can possibly match
934 the null string, we must test it at each starting point
935 so that we take the first null string we get. */
937 if (fastmap
&& startpos
< total
&& pbufp
->can_be_null
!= 1)
941 register int lim
= 0;
942 register unsigned char *p
;
944 if (startpos
< size1
&& startpos
+ range
>= size1
)
945 lim
= range
- (size1
- startpos
);
947 p
= ((unsigned char *)
948 &(startpos
>= size1
? string2
- size1
: string1
)[startpos
]);
952 while (range
> lim
&& !fastmap
[translate
[*p
++]])
957 while (range
> lim
&& !fastmap
[*p
++])
960 startpos
+= irange
- range
;
964 register unsigned char c
;
965 if (startpos
>= size1
)
966 c
= string2
[startpos
- size1
];
968 c
= string1
[startpos
];
970 if (translate
? !fastmap
[translate
[c
]] : !fastmap
[c
])
975 if (range
>= 0 && startpos
== total
976 && fastmap
&& pbufp
->can_be_null
== 0)
979 val
= re_match_2 (pbufp
, string1
, size1
, string2
, size2
, startpos
, regs
, mstop
);
989 #endif /* C_ALLOCA */
993 if (range
> 0) range
--, startpos
++; else range
++, startpos
--;
998 #ifndef emacs /* emacs never uses this */
1000 re_match (pbufp
, string
, size
, pos
, regs
)
1001 struct re_pattern_buffer
*pbufp
;
1004 struct re_registers
*regs
;
1006 return re_match_2 (pbufp
, 0, 0, string
, size
, pos
, regs
, size
);
1010 /* Maximum size of failure stack. Beyond this, overflow is an error. */
1012 int re_max_failures
= 2000;
1014 static int bcmp_translate();
1015 /* Match the pattern described by PBUFP
1016 against data which is the virtual concatenation of STRING1 and STRING2.
1017 SIZE1 and SIZE2 are the sizes of the two data strings.
1018 Start the match at position POS.
1019 Do not consider matching past the position MSTOP.
1021 If pbufp->fastmap is nonzero, then it had better be up to date.
1023 The reason that the data to match are specified as two components
1024 which are to be regarded as concatenated
1025 is so this function can be used directly on the contents of an Emacs buffer.
1027 -1 is returned if there is no match. -2 is returned if there is
1028 an error (such as match stack overflow). Otherwise the value is the length
1029 of the substring which was matched. */
1032 re_match_2 (pbufp
, string1
, size1
, string2
, size2
, pos
, regs
, mstop
)
1033 struct re_pattern_buffer
*pbufp
;
1034 unsigned char *string1
, *string2
;
1037 struct re_registers
*regs
;
1040 register unsigned char *p
= (unsigned char *) pbufp
->buffer
;
1041 register unsigned char *pend
= p
+ pbufp
->used
;
1042 /* End of first string */
1043 unsigned char *end1
;
1044 /* End of second string */
1045 unsigned char *end2
;
1046 /* Pointer just past last char to consider matching */
1047 unsigned char *end_match_1
, *end_match_2
;
1048 register unsigned char *d
, *dend
;
1050 unsigned char *translate
= (unsigned char *) pbufp
->translate
;
1052 /* Failure point stack. Each place that can handle a failure further down the line
1053 pushes a failure point on this stack. It consists of two char *'s.
1054 The first one pushed is where to resume scanning the pattern;
1055 the second pushed is where to resume scanning the strings.
1056 If the latter is zero, the failure point is a "dummy".
1057 If a failure happens and the innermost failure point is dormant,
1058 it discards that failure point and tries the next one. */
1060 unsigned char *initial_stack
[2 * NFAILURES
];
1061 unsigned char **stackb
= initial_stack
;
1062 unsigned char **stackp
= stackb
, **stacke
= &stackb
[2 * NFAILURES
];
1064 /* Information on the "contents" of registers.
1065 These are pointers into the input strings; they record
1066 just what was matched (on this attempt) by some part of the pattern.
1067 The start_memory command stores the start of a register's contents
1068 and the stop_memory command stores the end.
1070 At that point, regstart[regnum] points to the first character in the register,
1071 regend[regnum] points to the first character beyond the end of the register,
1072 regstart_seg1[regnum] is true iff regstart[regnum] points into string1,
1073 and regend_seg1[regnum] is true iff regend[regnum] points into string1. */
1075 unsigned char *regstart
[RE_NREGS
];
1076 unsigned char *regend
[RE_NREGS
];
1077 unsigned char regstart_seg1
[RE_NREGS
], regend_seg1
[RE_NREGS
];
1079 /* Set up pointers to ends of strings.
1080 Don't allow the second string to be empty unless both are empty. */
1088 end1
= string1
+ size1
;
1089 end2
= string2
+ size2
;
1091 /* Compute where to stop matching, within the two strings */
1094 end_match_1
= string1
+ mstop
;
1095 end_match_2
= string2
;
1100 end_match_2
= string2
+ mstop
- size1
;
1103 /* Initialize \) text positions to -1
1104 to mark ones that no \( or \) has been seen for. */
1106 for (mcnt
= 0; mcnt
< sizeof (regend
) / sizeof (*regend
); mcnt
++)
1107 regend
[mcnt
] = (unsigned char *) -1;
1109 /* `p' scans through the pattern as `d' scans through the data.
1110 `dend' is the end of the input string that `d' points within.
1111 `d' is advanced into the following input string whenever necessary,
1112 but this happens before fetching;
1113 therefore, at the beginning of the loop,
1114 `d' can be pointing at the end of a string,
1115 but it cannot equal string2. */
1118 d
= string1
+ pos
, dend
= end_match_1
;
1120 d
= string2
+ pos
- size1
, dend
= end_match_2
;
1122 /* Write PREFETCH; just before fetching a character with *d. */
1125 { if (dend == end_match_2) goto fail; /* end of string2 => failure */ \
1126 d = string2; /* end of string1 => advance to string2. */ \
1127 dend = end_match_2; }
1129 /* This loop loops over pattern commands.
1130 It exits by returning from the function if match is complete,
1131 or it drops through if match fails at this starting point in the input data. */
1136 /* End of pattern means we have succeeded! */
1138 /* If caller wants register contents data back, convert it to indices */
1141 regs
->start
[0] = pos
;
1142 if (dend
== end_match_1
)
1143 regs
->end
[0] = d
- string1
;
1145 regs
->end
[0] = d
- string2
+ size1
;
1146 for (mcnt
= 1; mcnt
< RE_NREGS
; mcnt
++)
1148 if (regend
[mcnt
] == (unsigned char *) -1)
1150 regs
->start
[mcnt
] = -1;
1151 regs
->end
[mcnt
] = -1;
1154 if (regstart_seg1
[mcnt
])
1155 regs
->start
[mcnt
] = regstart
[mcnt
] - string1
;
1157 regs
->start
[mcnt
] = regstart
[mcnt
] - string2
+ size1
;
1158 if (regend_seg1
[mcnt
])
1159 regs
->end
[mcnt
] = regend
[mcnt
] - string1
;
1161 regs
->end
[mcnt
] = regend
[mcnt
] - string2
+ size1
;
1164 if (dend
== end_match_1
)
1165 return (d
- string1
- pos
);
1167 return d
- string2
+ size1
- pos
;
1170 /* Otherwise match next pattern command */
1171 #ifdef SWITCH_ENUM_BUG
1172 switch ((int) ((enum regexpcode
) *p
++))
1174 switch ((enum regexpcode
) *p
++)
1178 /* \( is represented by a start_memory, \) by a stop_memory.
1179 Both of those commands contain a "register number" argument.
1180 The text matched within the \( and \) is recorded under that number.
1181 Then, \<digit> turns into a `duplicate' command which
1182 is followed by the numeric value of <digit> as the register number. */
1186 regstart_seg1
[*p
++] = (dend
== end_match_1
);
1191 regend_seg1
[*p
++] = (dend
== end_match_1
);
1196 int regno
= *p
++; /* Get which register to match against */
1197 register unsigned char *d2
, *dend2
;
1199 d2
= regstart
[regno
];
1200 dend2
= ((regstart_seg1
[regno
] == regend_seg1
[regno
])
1201 ? regend
[regno
] : end_match_1
);
1204 /* Advance to next segment in register contents, if necessary */
1207 if (dend2
== end_match_2
) break;
1208 if (dend2
== regend
[regno
]) break;
1209 d2
= string2
, dend2
= regend
[regno
]; /* end of string1 => advance to string2. */
1211 /* At end of register contents => success */
1212 if (d2
== dend2
) break;
1214 /* Advance to next segment in data being matched, if necessary */
1217 /* mcnt gets # consecutive chars to compare */
1219 if (mcnt
> dend2
- d2
)
1221 /* Compare that many; failure if mismatch, else skip them. */
1222 if (translate
? bcmp_translate (d
, d2
, mcnt
, translate
) : bcmp (d
, d2
, mcnt
))
1224 d
+= mcnt
, d2
+= mcnt
;
1230 /* fetch a data character */
1232 /* Match anything but a newline. */
1233 if ((translate
? translate
[*d
++] : *d
++) == '\n')
1240 /* Nonzero for charset_not */
1243 if (*(p
- 1) == (unsigned char) charset_not
)
1246 /* fetch a data character */
1254 if (c
< *p
* BYTEWIDTH
1255 && p
[1 + c
/ BYTEWIDTH
] & (1 << (c
% BYTEWIDTH
)))
1260 if (!not) goto fail
;
1266 if (d
== string1
|| d
[-1] == '\n')
1272 || (d
== end1
? (size2
== 0 || *string2
== '\n') : *d
== '\n'))
1276 /* "or" constructs ("|") are handled by starting each alternative
1277 with an on_failure_jump that points to the start of the next alternative.
1278 Each alternative except the last ends with a jump to the joining point.
1279 (Actually, each jump except for the last one really jumps
1280 to the following jump, because tensioning the jumps is a hassle.) */
1282 /* The start of a stupid repeat has an on_failure_jump that points
1283 past the end of the repeat text.
1284 This makes a failure point so that, on failure to match a repetition,
1285 matching restarts past as many repetitions have been found
1286 with no way to fail and look for another one. */
1288 /* A smart repeat is similar but loops back to the on_failure_jump
1289 so that each repetition makes another failure point. */
1291 case on_failure_jump
:
1292 if (stackp
== stacke
)
1294 unsigned char **stackx
;
1295 if (stacke
- stackb
> re_max_failures
* 2)
1297 stackx
= (unsigned char **) alloca (2 * (stacke
- stackb
)
1299 memcpy (stackx
, stackb
, (stacke
- stackb
) * sizeof (char *));
1300 stackp
= stackx
+ (stackp
- stackb
);
1301 stacke
= stackx
+ 2 * (stacke
- stackb
);
1305 mcnt
+= SIGN_EXTEND_CHAR (*(char *)p
) << 8;
1307 *stackp
++ = mcnt
+ p
;
1311 /* The end of a smart repeat has an maybe_finalize_jump back.
1312 Change it either to a finalize_jump or an ordinary jump. */
1314 case maybe_finalize_jump
:
1316 mcnt
+= SIGN_EXTEND_CHAR (*(char *)p
) << 8;
1319 register unsigned char *p2
= p
;
1320 /* Compare what follows with the begining of the repeat.
1321 If we can establish that there is nothing that they would
1322 both match, we can change to finalize_jump */
1324 && (*p2
== (unsigned char) stop_memory
1325 || *p2
== (unsigned char) start_memory
))
1328 p
[-3] = (unsigned char) finalize_jump
;
1329 else if (*p2
== (unsigned char) exactn
1330 || *p2
== (unsigned char) endline
)
1332 register int c
= *p2
== (unsigned char) endline
? '\n' : p2
[2];
1333 register unsigned char *p1
= p
+ mcnt
;
1334 /* p1[0] ... p1[2] are an on_failure_jump.
1335 Examine what follows that */
1336 if (p1
[3] == (unsigned char) exactn
&& p1
[5] != c
)
1337 p
[-3] = (unsigned char) finalize_jump
;
1338 else if (p1
[3] == (unsigned char) charset
1339 || p1
[3] == (unsigned char) charset_not
)
1341 int not = p1
[3] == (unsigned char) charset_not
;
1342 if (c
< p1
[4] * BYTEWIDTH
1343 && p1
[5 + c
/ BYTEWIDTH
] & (1 << (c
% BYTEWIDTH
)))
1345 /* not is 1 if c would match */
1346 /* That means it is not safe to finalize */
1348 p
[-3] = (unsigned char) finalize_jump
;
1353 if (p
[-1] != (unsigned char) finalize_jump
)
1355 p
[-1] = (unsigned char) jump
;
1359 /* The end of a stupid repeat has a finalize-jump
1360 back to the start, where another failure point will be made
1361 which will point after all the repetitions found so far. */
1369 mcnt
+= SIGN_EXTEND_CHAR (*(char *)p
) << 8;
1370 p
+= mcnt
+ 1; /* The 1 compensates for missing ++ above */
1373 case dummy_failure_jump
:
1374 if (stackp
== stacke
)
1376 unsigned char **stackx
1377 = (unsigned char **) alloca (2 * (stacke
- stackb
)
1379 memcpy (stackx
, stackb
, (stacke
- stackb
) * sizeof (char *));
1380 stackp
= stackx
+ (stackp
- stackb
);
1381 stacke
= stackx
+ 2 * (stacke
- stackb
);
1389 if (d
== string1
/* Points to first char */
1390 || d
== end2
/* Points to end */
1391 || (d
== end1
&& size2
== 0)) /* Points to end */
1393 if ((SYNTAX (d
[-1]) == Sword
)
1394 != (SYNTAX (d
== end1
? *string2
: *d
) == Sword
))
1399 if (d
== string1
/* Points to first char */
1400 || d
== end2
/* Points to end */
1401 || (d
== end1
&& size2
== 0)) /* Points to end */
1403 if ((SYNTAX (d
[-1]) == Sword
)
1404 != (SYNTAX (d
== end1
? *string2
: *d
) == Sword
))
1409 if (d
== end2
/* Points to end */
1410 || (d
== end1
&& size2
== 0) /* Points to end */
1411 || SYNTAX (* (d
== end1
? string2
: d
)) != Sword
) /* Next char not a letter */
1413 if (d
== string1
/* Points to first char */
1414 || SYNTAX (d
[-1]) != Sword
) /* prev char not letter */
1419 if (d
== string1
/* Points to first char */
1420 || SYNTAX (d
[-1]) != Sword
) /* prev char not letter */
1422 if (d
== end2
/* Points to end */
1423 || (d
== end1
&& size2
== 0) /* Points to end */
1424 || SYNTAX (d
== end1
? *string2
: *d
) != Sword
) /* Next char not a letter */
1430 if (((d
- string2
<= (unsigned) size2
)
1431 ? d
- bf_p2
: d
- bf_p1
)
1437 if (((d
- string2
<= (unsigned) size2
)
1438 ? d
- bf_p2
: d
- bf_p1
)
1444 if (((d
- string2
<= (unsigned) size2
)
1445 ? d
- bf_p2
: d
- bf_p1
)
1458 if (SYNTAX (*d
++) != (enum syntaxcode
) mcnt
) goto fail
;
1463 goto matchnotsyntax
;
1469 if (SYNTAX (*d
++) == (enum syntaxcode
) mcnt
) goto fail
;
1474 if (SYNTAX (*d
++) == 0) goto fail
;
1479 if (SYNTAX (*d
++) != 0) goto fail
;
1481 #endif /* not emacs */
1484 if (d
== string1
) /* Note, d cannot equal string2 */
1485 break; /* unless string1 == string2. */
1489 if (d
== end2
|| (d
== end1
&& size2
== 0))
1494 /* Match the next few pattern characters exactly.
1495 mcnt is how many characters to match. */
1502 if (translate
[*d
++] != *p
++) goto fail
;
1511 if (*d
++ != *p
++) goto fail
;
1517 continue; /* Successfully matched one pattern command; keep matching */
1519 /* Jump here if any matching operation fails. */
1521 if (stackp
!= stackb
)
1522 /* A restart point is known. Restart there and pop it. */
1525 { /* If innermost failure point is dormant, flush it and keep looking */
1531 if (d
>= string1
&& d
<= end1
)
1534 else break; /* Matching at this starting point really fails! */
1536 return -1; /* Failure to match */
1540 bcmp_translate (s1
, s2
, len
, translate
)
1541 unsigned char *s1
, *s2
;
1543 unsigned char *translate
;
1545 register unsigned char *p1
= s1
, *p2
= s2
;
1548 if (translate
[*p1
++] != translate
[*p2
++]) return 1;
1554 /* Entry points compatible with bsd4.2 regex library */
1558 static struct re_pattern_buffer re_comp_buf
;
1566 if (!re_comp_buf
.buffer
)
1567 return "No previous regular expression";
1571 if (!re_comp_buf
.buffer
)
1573 if (!(re_comp_buf
.buffer
= (char *) malloc (200)))
1574 return "Memory exhausted";
1575 re_comp_buf
.allocated
= 200;
1576 if (!(re_comp_buf
.fastmap
= (char *) malloc (1 << BYTEWIDTH
)))
1577 return "Memory exhausted";
1579 return re_compile_pattern (s
, strlen (s
), &re_comp_buf
);
1586 int len
= strlen (s
);
1587 return 0 <= re_search (&re_comp_buf
, s
, len
, 0, len
, 0);
1596 /* Indexed by a character, gives the upper case equivalent of the character */
1598 static char upcase
[0400] =
1599 { 000, 001, 002, 003, 004, 005, 006, 007,
1600 010, 011, 012, 013, 014, 015, 016, 017,
1601 020, 021, 022, 023, 024, 025, 026, 027,
1602 030, 031, 032, 033, 034, 035, 036, 037,
1603 040, 041, 042, 043, 044, 045, 046, 047,
1604 050, 051, 052, 053, 054, 055, 056, 057,
1605 060, 061, 062, 063, 064, 065, 066, 067,
1606 070, 071, 072, 073, 074, 075, 076, 077,
1607 0100, 0101, 0102, 0103, 0104, 0105, 0106, 0107,
1608 0110, 0111, 0112, 0113, 0114, 0115, 0116, 0117,
1609 0120, 0121, 0122, 0123, 0124, 0125, 0126, 0127,
1610 0130, 0131, 0132, 0133, 0134, 0135, 0136, 0137,
1611 0140, 0101, 0102, 0103, 0104, 0105, 0106, 0107,
1612 0110, 0111, 0112, 0113, 0114, 0115, 0116, 0117,
1613 0120, 0121, 0122, 0123, 0124, 0125, 0126, 0127,
1614 0130, 0131, 0132, 0173, 0174, 0175, 0176, 0177,
1615 0200, 0201, 0202, 0203, 0204, 0205, 0206, 0207,
1616 0210, 0211, 0212, 0213, 0214, 0215, 0216, 0217,
1617 0220, 0221, 0222, 0223, 0224, 0225, 0226, 0227,
1618 0230, 0231, 0232, 0233, 0234, 0235, 0236, 0237,
1619 0240, 0241, 0242, 0243, 0244, 0245, 0246, 0247,
1620 0250, 0251, 0252, 0253, 0254, 0255, 0256, 0257,
1621 0260, 0261, 0262, 0263, 0264, 0265, 0266, 0267,
1622 0270, 0271, 0272, 0273, 0274, 0275, 0276, 0277,
1623 0300, 0301, 0302, 0303, 0304, 0305, 0306, 0307,
1624 0310, 0311, 0312, 0313, 0314, 0315, 0316, 0317,
1625 0320, 0321, 0322, 0323, 0324, 0325, 0326, 0327,
1626 0330, 0331, 0332, 0333, 0334, 0335, 0336, 0337,
1627 0340, 0341, 0342, 0343, 0344, 0345, 0346, 0347,
1628 0350, 0351, 0352, 0353, 0354, 0355, 0356, 0357,
1629 0360, 0361, 0362, 0363, 0364, 0365, 0366, 0367,
1630 0370, 0371, 0372, 0373, 0374, 0375, 0376, 0377
1638 struct re_pattern_buffer buf
;
1641 char fastmap
[(1 << BYTEWIDTH
)];
1643 /* Allow a command argument to specify the style of syntax. */
1645 obscure_syntax
= atoi (argv
[1]);
1648 buf
.buffer
= (char *) malloc (buf
.allocated
);
1649 buf
.fastmap
= fastmap
;
1650 buf
.translate
= upcase
;
1658 re_compile_pattern (pat
, strlen(pat
), &buf
);
1660 for (i
= 0; i
< buf
.used
; i
++)
1661 printchar (buf
.buffer
[i
]);
1665 printf ("%d allocated, %d used.\n", buf
.allocated
, buf
.used
);
1667 re_compile_fastmap (&buf
);
1668 printf ("Allowed by fastmap: ");
1669 for (i
= 0; i
< (1 << BYTEWIDTH
); i
++)
1670 if (fastmap
[i
]) printchar (i
);
1674 gets (pat
); /* Now read the string to match against */
1676 i
= re_match (&buf
, pat
, strlen (pat
), 0, 0);
1677 printf ("Match value %d.\n", i
);
1683 struct re_pattern_buffer
*bufp
;
1687 printf ("buf is :\n----------------\n");
1688 for (i
= 0; i
< bufp
->used
; i
++)
1689 printchar (bufp
->buffer
[i
]);
1691 printf ("\n%d allocated, %d used.\n", bufp
->allocated
, bufp
->used
);
1693 printf ("Allowed by fastmap: ");
1694 for (i
= 0; i
< (1 << BYTEWIDTH
); i
++)
1695 if (bufp
->fastmap
[i
])
1697 printf ("\nAllowed by translate: ");
1698 if (bufp
->translate
)
1699 for (i
= 0; i
< (1 << BYTEWIDTH
); i
++)
1700 if (bufp
->translate
[i
])
1702 printf ("\nfastmap is%s accurate\n", bufp
->fastmap_accurate
? "" : "n't");
1703 printf ("can %s be null\n----------", bufp
->can_be_null
? "" : "not");
1710 if (c
< 041 || c
>= 0177)
1713 putchar (((c
>> 6) & 3) + '0');
1714 putchar (((c
>> 3) & 7) + '0');
1715 putchar ((c
& 7) + '0');