Commit | Line | Data |
---|---|---|
970ed795 | 1 | /****************************************************************************** |
3abe9331 | 2 | * Copyright (c) 2000-2015 Ericsson Telecom AB |
970ed795 EL |
3 | * All rights reserved. This program and the accompanying materials |
4 | * are made available under the terms of the Eclipse Public License v1.0 | |
5 | * which accompanies this distribution, and is available at | |
6 | * http://www.eclipse.org/legal/epl-v10.html | |
7 | ******************************************************************************/ | |
8 | ||
9 | /** | |
10 | * Parser for TTCN-3 character patterns. | |
11 | * | |
12 | * \author Matyas Forstner (Matyas.Forstner@eth.ericsson.se) | |
13 | * | |
14 | * 20031121 | |
15 | */ | |
16 | ||
17 | %{ | |
18 | ||
19 | /********************************************************************* | |
20 | * C(++) declarations | |
21 | *********************************************************************/ | |
22 | ||
23 | #include <stdio.h> | |
24 | #include <string.h> | |
25 | #include <ctype.h> | |
26 | #if defined(__CYGWIN__) && defined(__clang__) | |
27 | /* Cygwin's clang 3.0 has its own limits.h, which does not bring in | |
28 | the system's limits.h unless we define this macro: */ | |
29 | #define __STDC_HOSTED__ 1 | |
30 | #define _GCC_NEXT_LIMITS_H | |
31 | #endif | |
32 | #include <limits.h> | |
33 | ||
34 | #include <regex.h> | |
35 | #if !defined(RE_DUP_MAX) | |
36 | /* RE_DUP_MAX is defined in limits.h or regex.h, except on Cygwin 1.5 */ | |
37 | # include <sys/syslimits.h> | |
38 | #endif | |
39 | ||
40 | #include "memory.h" | |
41 | #include "pattern.hh" | |
42 | ||
43 | /* defined in lexer c-file: */ | |
44 | ||
45 | union YYSTYPE; | |
46 | extern int pattern_yylex(); | |
47 | extern void init_pattern_yylex(YYSTYPE *p); | |
48 | struct yy_buffer_state; | |
49 | extern yy_buffer_state* pattern_yy_scan_string(const char*); | |
50 | extern int pattern_yylex_destroy(); | |
51 | extern unsigned int get_nof_parentheses(); | |
52 | ||
53 | /* defined in this file: */ | |
54 | ||
55 | /** The converted regexp. */ | |
56 | static char *ret_val; | |
3abe9331 | 57 | /** Turns error messages for extended ASCII characters on or off */ |
58 | static bool allow_ext_ascii = false; | |
970ed795 EL |
59 | /** The parser error reporting function. */ |
60 | static void pattern_yyerror(const char *error_str); | |
61 | /** Creates the POSIX equivalent of literal character \a c using the | |
62 | * appropriate escape sequence when needed. */ | |
63 | static char *translate_character(char c); | |
64 | /** Returns the printable equivalent of character \a c */ | |
65 | static char *print_character(char c); | |
66 | /** Returns the printable equivalent of range \a lower .. \a upper */ | |
67 | static char *print_range(char lower, char upper); | |
68 | /** structure for manipulating character sets */ | |
69 | struct character_set; | |
70 | /** allocates, initializes and returns a new empty set */ | |
71 | static character_set *set_init(); | |
72 | /** allocates and returns a copy of \a set */ | |
73 | static character_set *set_copy(const character_set *set); | |
74 | /** deallocates set \a set */ | |
75 | static void set_free(character_set *set); | |
76 | /** returns whether set \a set is empty */ | |
77 | static int set_is_empty(const character_set *set); | |
78 | /** returns whether set \a set contains all characters in range 1..127 */ | |
79 | static int set_is_full(const character_set *set); | |
80 | /** returns whether set \a set contains the character \a c */ | |
81 | static int set_has_char(const character_set *set, char c); | |
82 | /** adds character \a c to set \a set */ | |
83 | static void set_add_char(character_set *set, char c); | |
84 | /** removes character \a c from set \a set */ | |
85 | static void set_remove_char(character_set *set, char c); | |
86 | /** returns whether set \a set contains at least one character in the range | |
87 | * \a lower .. \a upper */ | |
88 | static int set_has_range(const character_set *set, char lower, char upper); | |
89 | /** adds range \a lower .. \a upper to set \a set */ | |
90 | static void set_add_range(character_set *set, char lower, char upper); | |
91 | /** returns whether sets \a set1 and \a set2 have a non-empty intersection */ | |
92 | static int set_has_intersect(const character_set *set1, | |
93 | const character_set *set2); | |
94 | /** joins sets \a dst and \a src into \a dst */ | |
95 | static void set_join(character_set *dst, const character_set *src); | |
96 | /** negates the set \a set */ | |
97 | static void set_negate(character_set *set); | |
98 | /** reports the duplicate occurrences of characters and ranges in \a set1 | |
99 | * and \a set2 */ | |
100 | static void set_report_duplicates(const character_set *set1, | |
101 | const character_set *set2); | |
102 | /** generates the POSIX equivalent of \a set */ | |
103 | static char *set_generate_posix(const character_set *set); | |
104 | ||
105 | #define YYERROR_VERBOSE | |
106 | ||
107 | static void yyprint(FILE *file, int type, const YYSTYPE& value); | |
108 | #define YYPRINT(f,t,v) yyprint(f,t,v) | |
109 | ||
110 | %} | |
111 | ||
112 | /********************************************************************* | |
113 | * Bison declarations | |
114 | *********************************************************************/ | |
115 | ||
116 | %name-prefix="pattern_yy" | |
117 | %output="pattern_p.cc" | |
118 | %defines | |
119 | %verbose | |
120 | %expect 0 | |
121 | %start Pattern | |
122 | %debug | |
123 | ||
124 | /********************************************************************* | |
125 | * The union-type | |
126 | * Must be kept in sync with the one in pattern_uni.y ! | |
127 | *********************************************************************/ | |
128 | ||
129 | %union { | |
130 | int b; /* boolean */ | |
131 | char c; /* single character */ | |
132 | char *s; /* character string */ | |
133 | unsigned long int u; /* unsigned integer */ | |
134 | struct character_set *set; // used by nonterminals in pattern_p.y | |
135 | ||
136 | union { | |
137 | unsigned int value; | |
138 | #if defined(__sparc__) || defined(__sparc) | |
139 | struct { | |
140 | unsigned char group; | |
141 | unsigned char plane; | |
142 | unsigned char row; | |
143 | unsigned char cell; | |
144 | } comp; | |
145 | #else | |
146 | struct { | |
147 | unsigned char cell; | |
148 | unsigned char row; | |
149 | unsigned char plane; | |
150 | unsigned char group; | |
151 | } comp; | |
152 | #endif | |
153 | } q; // single universal char, used by nonterminals in pattern_uni.y | |
154 | class QuadSet* qset; // used by nonterminals in pattern_uni.y | |
155 | } | |
156 | ||
157 | /********************************************************************* | |
158 | * Tokens | |
159 | *********************************************************************/ | |
160 | ||
161 | %token <c> TOK_Char "<ordinary character>" | |
162 | %token <u> TOK_Number "<number>" | |
163 | %token <u> TOK_Digit "<digit>" | |
164 | ||
165 | /********************************************************************* | |
166 | * Keywords | |
167 | *********************************************************************/ | |
168 | ||
169 | %token KW_BS_q "\\q" | |
170 | %token KW_BS_d "\\d" | |
171 | %token KW_BS_w "\\w" | |
172 | %token KW_BS_t "\\t" | |
173 | %token KW_BS_n "\\n" | |
174 | %token KW_BS_r "\\r" | |
175 | %token KW_BS_s "\\s" | |
176 | %token KW_BS_b "\\b" | |
177 | ||
178 | %token KW_Group_Begin "(" | |
179 | %token KW_Group_End ")" | |
180 | %token KW_Set_Begin "[" | |
181 | %token KW_Set_Begin_Neg "[^" | |
182 | %token KW_Set_Begin_Rsbrkt "[]" | |
183 | %token KW_Set_Begin_Neg_Rsbrkt "[^]" | |
184 | %token KW_Set_End "]" | |
185 | %token KW_Set_Dash_End "-]" | |
186 | ||
187 | /********************************************************************* | |
188 | * semantic types of nonterminals | |
189 | *********************************************************************/ | |
190 | ||
191 | %type <b> RE_Set_Begin RE_Set_Begin_Rsbrkt RE_Set_End | |
192 | %type <c> RE_Set_Range_Char RE_Quadruple | |
193 | %type <s> RE_Body RE_Elems RE_Alter_Elem RE_Concat_Elem | |
194 | RE_Multiply_Elem RE_Multiply_Statement RE_Group | |
195 | RE_OneCharPos | |
196 | %type <set> RE_Set RE_Set_Body RE_Set_Elem RE_Set_NoRange_Char | |
197 | ||
198 | /********************************************************************* | |
199 | * Destructors | |
200 | *********************************************************************/ | |
201 | ||
202 | %destructor { Free($$); } | |
203 | RE_Alter_Elem | |
204 | RE_Body | |
205 | RE_Concat_Elem | |
206 | RE_Elems | |
207 | RE_Group | |
208 | RE_Multiply_Elem | |
209 | RE_Multiply_Statement | |
210 | RE_OneCharPos | |
211 | ||
212 | %destructor { set_free($$); } | |
213 | RE_Set | |
214 | RE_Set_Body | |
215 | RE_Set_Elem | |
216 | RE_Set_NoRange_Char | |
217 | ||
218 | %% | |
219 | ||
220 | /********************************************************************* | |
221 | * Grammar | |
222 | *********************************************************************/ | |
223 | ||
/* Start symbol: stores the string produced by RE_Body in the file-level
 * variable ret_val, from which TTCN_pattern_to_regexp() returns it. */
224 | Pattern: | |
225 | RE_Body {ret_val=$1;} | |
226 | ; | |
227 | ||
/* Whole pattern: wraps the translated elements between the anchors `^' and
 * `$'.  An empty pattern, or an element list whose value is NULL, yields the
 * string "^$".  The string of RE_Elems is freed after it has been copied. */
228 | RE_Body: | |
229 | /* empty */ | |
230 | { | |
231 | $$ = mcopystr("^$"); | |
232 | } | |
233 | | RE_Elems | |
234 | { | |
235 | if ($1 != NULL) { | |
236 | $$ = mprintf("^%s$", $1); | |
237 | Free($1); | |
238 | } else $$ = mcopystr("^$"); | |
239 | } | |
240 | ; | |
241 | ||
/* Alternation.  nof_pars is the value of get_nof_parentheses() plus one when
 * the lookahead token is KW_Group_End.  When nof_pars is non-zero the
 * branches are joined with "|", otherwise with "$|^".  A branch whose value
 * is NULL is written as "()"; when both branches are NULL the result is
 * NULL. */
242 | RE_Elems: | |
243 | RE_Alter_Elem { $$ = $1; } | |
244 | | RE_Elems '|' RE_Alter_Elem | |
245 | { | |
246 | unsigned int nof_pars = get_nof_parentheses() + (yychar==KW_Group_End ? 1 : 0); | |
247 | if ($3 != NULL) { | |
248 | if ($1 != NULL) $$ = mputprintf($1, nof_pars ? "|%s" : "$|^%s", $3); | |
249 | else $$ = mprintf( nof_pars ? "()|%s" : "()$|^%s" , $3); | |
250 | Free($3); | |
251 | } else { | |
252 | if ($1 != NULL) $$ = mputstr($1, nof_pars ? "|()" : "$|^()"); | |
253 | else $$ = NULL; | |
254 | } | |
255 | } | |
256 | ; | |
257 | ||
/* Concatenation: appends the string of each further RE_Concat_Elem to the
 * string collected so far and frees the appended string. */
258 | RE_Alter_Elem: | |
259 | RE_Concat_Elem { $$ = $1; } | |
260 | | RE_Alter_Elem RE_Concat_Elem | |
261 | { | |
262 | $$ = mputstr($1, $2); | |
263 | Free($2); | |
264 | } | |
265 | ; | |
266 | ||
/* One element, optionally followed by a repetition statement.  When either
 * the element or the repetition has the value NULL both strings are freed
 * and the result is NULL; otherwise the repetition is appended to the
 * element.  A lone `*' is translated to ".*". */
267 | RE_Concat_Elem: | |
268 | RE_Multiply_Elem {$$=$1;} | |
269 | | RE_Multiply_Elem RE_Multiply_Statement | |
270 | { | |
271 | if ($1 != NULL && $2 != NULL) { | |
272 | $$ = mputstr($1, $2); | |
273 | Free($2); | |
274 | } else { | |
275 | Free($1); | |
276 | Free($2); | |
277 | $$ = NULL; | |
278 | } | |
279 | } | |
280 | | '*' {$$=mcopystr(".*");} | |
281 | ; | |
282 | ||
/* An element that a repetition statement may follow: either a group or a
 * single character position.  The value is passed through unchanged. */
283 | RE_Multiply_Elem: | |
284 | RE_Group {$$=$1;} | |
285 | | RE_OneCharPos {$$=$1;} | |
286 | ; | |
287 | ||
/* Parenthesized group: the content is wrapped in "(" and ")".  An empty
 * group, or a group whose content has the value NULL, yields "()". */
288 | RE_Group: | |
289 | KW_Group_Begin KW_Group_End | |
290 | { | |
291 | $$ = mcopystr("()"); | |
292 | } | |
293 | | KW_Group_Begin RE_Elems KW_Group_End | |
294 | { | |
295 | if ($2 != NULL) { | |
296 | $$ = mprintf("(%s)", $2); | |
297 | Free($2); | |
298 | } else { | |
299 | $$ = mcopystr("()"); | |
300 | } | |
301 | } | |
302 | ; | |
303 | ||
/* Repetition statement, translated to a quantifier string:
 *   `+'        -> "+"
 *   `#(,)'     -> "*"
 *   `#n', `#(n)'  -> "{n}"
 *   `#(n,m)'   -> "{n,m}" ("?" for 0,1; "{n}" when n equals m)
 *   `#(,m)'    -> "{0,m}" ("?" for m equal to 1)
 *   `#(n,)'    -> "{n,}"  ("*" for 0, "+" for 1)
 * A repetition count of exactly one yields an empty string.  A repetition
 * count of zero produces a warning and the value NULL.  Counts above
 * RE_DUP_MAX (when that macro is defined) only produce a warning; a lower
 * bound above the upper bound produces an error, after which the quantifier
 * string is still generated. */
304 | RE_Multiply_Statement: | |
305 | '+' | |
306 | { | |
307 | $$ = mcopystr("+"); | |
308 | } | |
309 | | '#' '(' ',' ')' | |
310 | { | |
311 | $$ = mcopystr("*"); | |
312 | } | |
313 | | '#' TOK_Digit | |
314 | { | |
315 | if ($2 == 0) { | |
316 | TTCN_pattern_warning("The number of repetitions is zero: `#0'."); | |
317 | $$ = NULL; | |
318 | } else if ($2 == 1) $$ = memptystr(); | |
319 | else { | |
320 | if ($2 > 9) TTCN_pattern_warning("Internal error: Invalid number of " | |
321 | "repetitions: `#%lu'.", $2); | |
322 | $$ = mprintf("{%lu}", $2); | |
323 | } | |
324 | } | |
325 | | '#' '(' TOK_Number ')' | |
326 | { | |
327 | if ($3 == 0) { | |
328 | TTCN_pattern_warning("The number of repetitions is zero: `#(0)'."); | |
329 | $$ = NULL; | |
330 | } else if ($3 == 1) $$ = memptystr(); | |
331 | else { | |
332 | #ifdef RE_DUP_MAX | |
333 | if ($3 > RE_DUP_MAX) TTCN_pattern_warning("The number of repetitions in " | |
334 | "`#(%lu)' exceeds the limit allowed by this system (%d).", $3, | |
335 | RE_DUP_MAX); | |
336 | #endif | |
337 | $$ = mprintf("{%lu}", $3); | |
338 | } | |
339 | } | |
340 | | '#' '(' TOK_Number ',' TOK_Number ')' | |
341 | { | |
342 | #ifdef RE_DUP_MAX | |
343 | if ($3 > RE_DUP_MAX) TTCN_pattern_warning("The minimum number of " | |
344 | "repetitions in `#(%lu,%lu)' exceeds the limit allowed by this system " | |
345 | "(%d).", $3, $5, RE_DUP_MAX); | |
346 | if ($5 > RE_DUP_MAX) TTCN_pattern_warning("The maximum number of " | |
347 | "repetitions in `#(%lu,%lu)' exceeds the limit allowed by this system " | |
348 | "(%d).", $3, $5, RE_DUP_MAX); | |
349 | #endif | |
350 | if ($3 > $5) TTCN_pattern_error("The lower bound is higher than the upper " | |
351 | "bound in the number of repetitions: `#(%lu,%lu)'.", $3, $5); | |
352 | if ($3 == $5) { | |
353 | if ($3 == 0) { | |
354 | TTCN_pattern_warning("The number of repetitions is zero: `#(0,0)'."); | |
355 | $$ = NULL; | |
356 | } else if ($3 == 1) $$ = memptystr(); | |
357 | else { | |
358 | $$ = mprintf("{%lu}", $3); | |
359 | } | |
360 | } else { | |
361 | if ($3 == 0 && $5 == 1) $$ = mcopystr("?"); | |
362 | else $$ = mprintf("{%lu,%lu}", $3, $5); | |
363 | } | |
364 | } | |
365 | | '#' '(' ',' TOK_Number ')' | |
366 | { | |
367 | if ($4 == 0) { | |
368 | TTCN_pattern_warning("The number of repetitions is zero: `#(,0)'."); | |
369 | $$ = NULL; | |
370 | } else { | |
371 | #ifdef RE_DUP_MAX | |
372 | if ($4 > RE_DUP_MAX) TTCN_pattern_warning("The maximum number of " | |
373 | "repetitions in `#(,%lu)' exceeds the limit allowed by this system " | |
374 | "(%d).", $4, RE_DUP_MAX); | |
375 | #endif | |
376 | if ($4 == 1) $$ = mcopystr("?"); | |
377 | else $$ = mprintf("{0,%lu}", $4); | |
378 | } | |
379 | } | |
380 | | '#' '(' TOK_Number ',' ')' | |
381 | { | |
382 | if ($3 == 0) $$ = mcopystr("*"); | |
383 | else { | |
384 | #ifdef RE_DUP_MAX | |
385 | if ($3 > RE_DUP_MAX) TTCN_pattern_warning("The minimum number of " | |
386 | "repetitions in `#(%lu,)' exceeds the limit allowed by this system " | |
387 | "(%d).", $3, RE_DUP_MAX); | |
388 | #endif | |
389 | if ($3 == 1) $$ = mcopystr("+"); | |
390 | else $$ = mprintf("{%lu,}", $3); | |
391 | } | |
392 | } | |
393 | ; | |
394 | ||
/* A pattern element that matches exactly one character position.
 * `?' becomes "."; the backslash keywords are written as literal characters
 * or bracket expressions; an ordinary character or a quadruple is passed
 * through translate_character(); a character set is converted with
 * set_generate_posix() and then freed.
 * `\b' produces a warning and the value NULL.  An empty set produces an
 * error and the value NULL.  For an ordinary character with code 0, or with
 * a code above 127 while allow_ext_ascii is false, an error is reported but
 * the character is still translated. */
395 | RE_OneCharPos: | |
396 | '?' {$$=mcopystr(".");} | |
397 | | KW_BS_d {$$=mcopystr("[0-9]");} | |
398 | | KW_BS_w {$$=mcopystr("[0-9A-Za-z]");} | |
399 | | KW_BS_t {$$=mcopystr("\t");} | |
400 | | KW_BS_n {$$=mcopystr("[\n-\r]");} | |
401 | | KW_BS_r {$$=mcopystr("\r");} | |
402 | | KW_BS_s {$$=mcopystr("[\t-\r ]");} | |
403 | | KW_BS_b | |
404 | { | |
405 | TTCN_pattern_warning("Metacharacter `\\b' is not supported yet."); | |
406 | $$ = NULL; | |
407 | } | |
408 | | TOK_Char | |
409 | { | |
410 | unsigned char c = $1; | |
3abe9331 | 411 | if (c == 0 || (c > 127 && !allow_ext_ascii)) TTCN_pattern_error("Character " |
412 | "with code %u (0x%02x) cannot be used in a pattern for type charstring.", c, c); | |
970ed795 EL |
413 | $$ = translate_character($1); |
414 | } | |
415 | | RE_Quadruple | |
416 | { | |
417 | $$ = translate_character($1); | |
418 | } | |
419 | | RE_Set | |
420 | { | |
421 | if (set_is_empty($1)) { | |
422 | TTCN_pattern_error("Empty character set."); | |
423 | $$ = NULL; | |
424 | } else $$ = set_generate_posix($1); | |
425 | set_free($1); | |
426 | } | |
427 | ; | |
428 | ||
/* Character set in square brackets; the value is a character_set.
 * The set built by RE_Set_Body is reused, or a new empty set is created when
 * the body is NULL.  Depending on the alternative, a leading `-', a leading
 * `]' and/or a trailing `-' are added as ordinary members, with a warning
 * when the character is already present.  In the last alternative the
 * opening `]' is the lower bound of a range: the range is checked (error if
 * the lower bound is higher, duplicate report if it overlaps the body) and
 * then added with set_add_range() in either case.  Negation for the `^'
 * forms is applied last. */
429 | RE_Set: | |
430 | /* RE_Set_Begin is 1 for "[^", 0 for "[" | |
431 | * RE_Set_Begin_Rsbrkt is 1 for "[^]", 0 for "[]" | |
432 | * RE_Set_End is 1 for "-]", 0 for "]" | |
433 | */ | |
434 | RE_Set_Begin RE_Set_Body RE_Set_End | |
435 | { | |
436 | if ($2 != NULL) $$ = $2; | |
437 | else $$ = set_init(); | |
438 | if ($3) { | |
439 | if (set_has_char($$, '-')) | |
440 | TTCN_pattern_warning("Duplicate character `-' in the character set."); | |
441 | else set_add_char($$, '-'); | |
442 | } | |
443 | if ($1) set_negate($$); | |
444 | } | |
445 | | RE_Set_Begin '-' RE_Set_Body RE_Set_End | |
446 | { | |
447 | if ($3 != NULL) $$ = $3; | |
448 | else $$ = set_init(); | |
449 | if (set_has_char($$, '-')) | |
450 | TTCN_pattern_warning("Duplicate character `-' in the character set."); | |
451 | else set_add_char($$, '-'); | |
452 | if ($4) { | |
453 | if (set_has_char($$, '-')) | |
454 | TTCN_pattern_warning("Duplicate character `-' in the character set."); | |
455 | else set_add_char($$, '-'); | |
456 | } | |
457 | if ($1) set_negate($$); | |
458 | } | |
459 | | RE_Set_Begin_Rsbrkt RE_Set_Body RE_Set_End | |
460 | { | |
461 | if ($2 != NULL) $$ = $2; | |
462 | else $$ = set_init(); | |
463 | if (set_has_char($$, ']')) | |
464 | TTCN_pattern_warning("Duplicate character `]' in the character set."); | |
465 | else set_add_char($$, ']'); | |
466 | if ($3) { | |
467 | if (set_has_char($$, '-')) | |
468 | TTCN_pattern_warning("Duplicate character `-' in the character set."); | |
469 | else set_add_char($$, '-'); | |
470 | } | |
471 | if ($1) set_negate($$); | |
472 | } | |
473 | | RE_Set_Begin_Rsbrkt '-' RE_Set_Range_Char RE_Set_Body RE_Set_End | |
474 | { | |
475 | if ($4 != NULL) $$ = $4; | |
476 | else $$ = set_init(); | |
477 | char *range_str = print_range(']', $3); | |
478 | if (']' > $3) { | |
479 | TTCN_pattern_error("Invalid range `%s' in the character set: the " | |
480 | "character code of the lower bound (%u) is higher than that of the " | |
481 | "upper bound (%u).", range_str, ']', (unsigned char)$3); | |
482 | } else { | |
483 | if (set_has_range($$, ']', $3)) { | |
484 | character_set *tmpset = set_init(); | |
485 | set_add_range(tmpset, ']', $3); | |
486 | set_report_duplicates($$, tmpset); | |
487 | set_free(tmpset); | |
488 | } | |
489 | } | |
490 | set_add_range($$, ']', $3); | |
491 | Free(range_str); | |
492 | if ($5) { | |
493 | if (set_has_char($$, '-')) | |
494 | TTCN_pattern_warning("Duplicate character `-' in the character set."); | |
495 | else set_add_char($$, '-'); | |
496 | } | |
497 | if ($1) set_negate($$); | |
498 | } | |
499 | ; | |
500 | ||
/* Opening of a set: value 0 for KW_Set_Begin, 1 for KW_Set_Begin_Neg.
 * RE_Set negates the finished set when the value is non-zero. */
501 | RE_Set_Begin: | |
502 | KW_Set_Begin { $$ = 0; } | |
503 | | KW_Set_Begin_Neg { $$ = 1; } | |
504 | ; | |
505 | ||
/* Opening of a set whose first member is `]': value 0 for
 * KW_Set_Begin_Rsbrkt, 1 for KW_Set_Begin_Neg_Rsbrkt (the negated form). */
506 | RE_Set_Begin_Rsbrkt: | |
507 | KW_Set_Begin_Rsbrkt { $$ = 0; } | |
508 | | KW_Set_Begin_Neg_Rsbrkt { $$ = 1; } | |
509 | ; | |
510 | ||
/* Closing of a set: value 0 for KW_Set_End, 1 for KW_Set_Dash_End.
 * RE_Set adds `-' as a member when the value is non-zero. */
511 | RE_Set_End: | |
512 | KW_Set_End { $$ = 0; } | |
513 | | KW_Set_Dash_End { $$ = 1; } | |
514 | ; | |
515 | ||
/* Sequence of set elements.  The value is NULL for an empty body.  The first
 * element's set becomes the accumulated set; every further element is
 * checked for overlap with it (duplicates are reported), joined into it and
 * then freed. */
516 | RE_Set_Body: | |
517 | /* empty */ { $$ = NULL; } | |
518 | | RE_Set_Body RE_Set_Elem | |
519 | { | |
520 | if ($1 != NULL) { | |
521 | $$ = $1; | |
522 | if (set_has_intersect($$, $2)) set_report_duplicates($$, $2); | |
523 | set_join($$, $2); | |
524 | set_free($2); | |
525 | } else $$ = $2; | |
526 | } | |
527 | ; | |
528 | ||
/* One element of a set: a single character, a keyword class, or a range
 * `lower-upper'.  Each alternative yields a newly allocated character_set.
 * For a range whose lower bound compares greater than its upper bound an
 * error is reported; set_add_range() is still called afterwards. */
529 | RE_Set_Elem: | |
530 | RE_Set_Range_Char | |
531 | { | |
532 | $$ = set_init(); | |
533 | set_add_char($$, $1); | |
534 | } | |
535 | | RE_Set_NoRange_Char { $$ = $1; } | |
536 | | RE_Set_Range_Char '-' RE_Set_Range_Char | |
537 | { | |
538 | if ($1 > $3) { | |
539 | char *range_str = print_range($1, $3); | |
540 | TTCN_pattern_error("Invalid range `%s' in the character set: the " | |
541 | "character code of the lower bound (%u) is higher than that of the " | |
542 | "upper bound (%u).", range_str, (unsigned char)$1, (unsigned char)$3); | |
543 | Free(range_str); | |
544 | } | |
545 | $$ = set_init(); | |
546 | set_add_range($$, $1, $3); | |
547 | } | |
548 | ; | |
549 | ||
/* A character that may stand alone in a set or serve as a range bound:
 * `\t', `\r', an ordinary character or a quadruple.  For an ordinary
 * character with code 0, or with a code above 127 while allow_ext_ascii is
 * false, an error is reported, but the character is still used as the
 * value. */
550 | RE_Set_Range_Char: | |
551 | KW_BS_t { $$ = '\t'; } | |
552 | | KW_BS_r { $$ = '\r'; } | |
553 | | TOK_Char | |
554 | { | |
555 | unsigned char c = $1; | |
3abe9331 | 556 | if (c == 0 || (c > 127 && !allow_ext_ascii)) TTCN_pattern_error("Character " |
557 | "with code %u (0x%02x) cannot be used in a pattern for type charstring.", c, c); | |
970ed795 EL |
558 | $$ = $1; |
559 | } | |
560 | | RE_Quadruple { $$ = $1; } | |
561 | ; | |
562 | ||
/* Keyword classes inside a set; they cannot be range bounds.  Each yields a
 * new character_set: `\d' is '0'..'9'; `\w' is digits plus upper and lower
 * case letters; `\n' is the range '\n'..'\r'; `\s' is '\t'..'\r' plus the
 * space.  `\b' reports an error and yields an empty set. */
563 | RE_Set_NoRange_Char: | |
564 | KW_BS_d | |
565 | { | |
566 | $$ = set_init(); | |
567 | set_add_range($$, '0', '9'); | |
568 | } | |
569 | | KW_BS_w | |
570 | { | |
571 | $$ = set_init(); | |
572 | set_add_range($$, '0', '9'); | |
573 | set_add_range($$, 'A', 'Z'); | |
574 | set_add_range($$, 'a', 'z'); | |
575 | } | |
576 | | KW_BS_n | |
577 | { | |
578 | $$ = set_init(); | |
579 | set_add_range($$, '\n', '\r'); | |
580 | } | |
581 | | KW_BS_s | |
582 | { | |
583 | $$ = set_init(); | |
584 | set_add_range($$, '\t', '\r'); | |
585 | set_add_char($$, ' '); | |
586 | } | |
587 | | KW_BS_b | |
588 | { | |
589 | TTCN_pattern_error("Metacharacter `\\b' does not make any sense in a " | |
590 | "character set."); | |
591 | $$ = set_init(); | |
592 | } | |
593 | ; | |
594 | ||
/* Quadruple notation \q{group,plane,row,cell}.  Errors are reported when a
 * component exceeds its range (group 0..127, the others 0..255), when the
 * quadruple is not valid for type charstring (any of the first three
 * components non-zero, or cell above 127), and for the all-zero quadruple.
 * None of these errors stops the action: the value of the rule is always
 * the fourth number (cell) converted to char. */
595 | RE_Quadruple: | |
596 | KW_BS_q '{' TOK_Number ',' TOK_Number ',' TOK_Number ',' TOK_Number '}' | |
597 | { | |
598 | if ($3 > 127) TTCN_pattern_error("The first number (group) of quadruple " | |
599 | "`\\q{%lu,%lu,%lu,%lu}' is too large. It should be in the range 0..127 " | |
600 | "instead of %lu.", $3, $5, $7, $9, $3); | |
601 | if ($5 > 255) TTCN_pattern_error("The second number (plane) of quadruple " | |
602 | "`\\q{%lu,%lu,%lu,%lu}' is too large. It should be in the range 0..255 " | |
603 | "instead of %lu.", $3, $5, $7, $9, $5); | |
604 | if ($7 > 255) TTCN_pattern_error("The third number (row) of quadruple " | |
605 | "`\\q{%lu,%lu,%lu,%lu}' is too large. It should be in the range 0..255 " | |
606 | "instead of %lu.", $3, $5, $7, $9, $7); | |
607 | if ($9 > 255) TTCN_pattern_error("The fourth number (cell) of quadruple " | |
608 | "`\\q{%lu,%lu,%lu,%lu}' is too large. It should be in the range 0..255 " | |
609 | "instead of %lu.", $3, $5, $7, $9, $9); | |
610 | if ($3 > 0 || $5 > 0 || $7 > 0 || $9 > 127) TTCN_pattern_error("Quadruple " | |
611 | "`\\q{%lu,%lu,%lu,%lu}' is not valid in a pattern for type charstring.", | |
612 | $3, $5, $7, $9); | |
613 | if ($3 == 0 && $5 == 0 && $7 == 0 && $9 == 0) TTCN_pattern_error("Zero " | |
614 | "character (i.e. quadruple `\\q{0,0,0,0}') is not supported in a " | |
615 | "pattern for type charstring."); | |
616 | $$ = $9; | |
617 | } | |
618 | ; | |
619 | ||
620 | %% | |
621 | ||
622 | /********************************************************************* | |
623 | * Interface | |
624 | *********************************************************************/ | |
625 | ||
3abe9331 | 626 | char* TTCN_pattern_to_regexp(const char* p_pattern, bool utf8) |
970ed795 EL |
627 | { |
628 | /* if you want to debug */ | |
629 | //pattern_yydebug=1; | |
630 | ||
631 | ret_val=NULL; | |
632 | ||
3abe9331 | 633 | /* allow extended ASCII characters if the pattern is in UTF-8 format */ |
634 | allow_ext_ascii = utf8; | |
635 | ||
970ed795 EL |
636 | yy_buffer_state *flex_buffer = pattern_yy_scan_string(p_pattern); |
637 | if(flex_buffer == NULL) { | |
638 | TTCN_pattern_error("Flex buffer creation failed."); | |
639 | return NULL; | |
640 | } | |
641 | init_pattern_yylex(&yylval); | |
642 | if(pattern_yyparse()) { | |
643 | Free(ret_val); | |
644 | ret_val=NULL; | |
645 | } | |
646 | pattern_yylex_destroy(); | |
647 | return ret_val; | |
648 | } | |
649 | ||
650 | // Backwards compatibility shim | |
651 | char* TTCN_pattern_to_regexp(const char* p_pattern, int ere) | |
652 | { | |
653 | TTCN_pattern_warning( | |
654 | "TTCN_pattern_to_regexp(const char* p_pattern, int ere) is deprecated"); | |
655 | if (ere != 1) TTCN_pattern_error( | |
656 | "BRE is not supported for TTCN_pattern_to_regexp"); | |
657 | return TTCN_pattern_to_regexp(p_pattern); | |
658 | } | |
659 | ||
660 | /********************************************************************* | |
661 | * Static functions | |
662 | *********************************************************************/ | |
663 | ||
664 | /// Error reporting function | |
665 | void pattern_yyerror(const char *error_str) | |
666 | { | |
667 | TTCN_pattern_error("%s", error_str); | |
668 | } | |
669 | ||
670 | /** Escape plain characters which would be metacharacters in a regex. | |
671 | * | |
672 | * @param c plain character | |
673 | * @return a newly allocated string which must be Free() 'd | |
674 | */ | |
675 | char *translate_character(char c) | |
676 | { | |
677 | int escape_needed = 0; | |
678 | switch (c) { | |
679 | case '|': | |
680 | case '+': | |
681 | case '?': | |
682 | case '{': | |
683 | case '}': | |
684 | case '(': | |
685 | case ')': | |
686 | case '.': | |
687 | case '^': | |
688 | case '$': | |
689 | case '[': | |
690 | case '*': | |
691 | case '\\': | |
692 | escape_needed = 1; | |
693 | break; | |
694 | } | |
695 | if (escape_needed) return mprintf("\\%c", c); | |
696 | else return mputc(NULL, c); | |
697 | } | |
698 | ||
699 | char *print_character(char c) | |
700 | { | |
701 | switch (c) { | |
702 | case '\t': | |
703 | return mcopystr("\\t"); | |
704 | case '\r': | |
705 | return mcopystr("\\r"); | |
706 | default: | |
707 | if (isprint((unsigned char)c)) return mprintf("%c", c); | |
708 | else return mprintf("\\q{0,0,0,%u}", (unsigned char)c); | |
709 | } | |
710 | } | |
711 | ||
712 | char *print_range(char lower, char upper) | |
713 | { | |
714 | char *range_str = print_character(lower); | |
715 | range_str = mputc(range_str, '-'); | |
716 | char *upper_str = print_character(upper); | |
717 | range_str = mputstr(range_str, upper_str); | |
718 | Free(upper_str); | |
719 | return range_str; | |
720 | } | |
721 | ||
/* number of bits held by one element of the membership array */
#define CS_BITS_PER_ELEM (sizeof(unsigned long) * 8)
/* number of elements needed to provide one bit for each of 128 codes */
#define CS_NOF_ELEMS ((128 + CS_BITS_PER_ELEM - 1) / CS_BITS_PER_ELEM)

/** Bit set of character codes: the set functions in this file treat bit
 * (c % CS_BITS_PER_ELEM) of set_members[c / CS_BITS_PER_ELEM] as the
 * membership flag of code c. */
struct character_set {
  unsigned long set_members[CS_NOF_ELEMS];
};
728 | ||
729 | character_set *set_init() | |
730 | { | |
731 | character_set *set = (character_set*)Malloc(sizeof(*set)); | |
732 | memset(set->set_members, 0, sizeof(set->set_members)); | |
733 | return set; | |
734 | } | |
735 | ||
736 | character_set *set_copy(const character_set *set) | |
737 | { | |
738 | character_set *set2 = (character_set*)Malloc(sizeof(*set2)); | |
739 | memcpy(set2, set, sizeof(*set2)); | |
740 | return set2; | |
741 | } | |
742 | ||
743 | void set_free(character_set *set) | |
744 | { | |
745 | Free(set); | |
746 | } | |
747 | ||
748 | int set_is_empty(const character_set *set) | |
749 | { | |
750 | if ((set->set_members[0] & ~1UL) != 0) return 0; | |
751 | for (size_t i = 1; i < CS_NOF_ELEMS; i++) | |
752 | if (set->set_members[i] != 0) return 0; | |
753 | return 1; | |
754 | } | |
755 | ||
756 | int set_is_full(const character_set *set) | |
757 | { | |
758 | if (~(set->set_members[0] | 1UL) != 0) return 0; | |
759 | for (size_t i = 1; i < CS_NOF_ELEMS; i++) | |
760 | if (~set->set_members[i] != 0) return 0; | |
761 | return 1; | |
762 | } | |
763 | ||
764 | int set_has_char(const character_set *set, char c) | |
765 | { | |
766 | if (set->set_members[c / CS_BITS_PER_ELEM] & 1UL << c % CS_BITS_PER_ELEM) | |
767 | return 1; | |
768 | else return 0; | |
769 | } | |
770 | ||
771 | void set_add_char(character_set *set, char c) | |
772 | { | |
773 | set->set_members[c / CS_BITS_PER_ELEM] |= 1UL << c % CS_BITS_PER_ELEM; | |
774 | } | |
775 | ||
776 | void set_remove_char(character_set *set, char c) | |
777 | { | |
778 | set->set_members[c / CS_BITS_PER_ELEM] &= ~(1UL << c % CS_BITS_PER_ELEM); | |
779 | } | |
780 | ||
781 | int set_has_range(const character_set *set, char lower, char upper) | |
782 | { | |
783 | for (size_t i = lower; i <= (unsigned char)upper; i++) | |
784 | if (set->set_members[i / CS_BITS_PER_ELEM] & 1UL << i % CS_BITS_PER_ELEM) | |
785 | return 1; | |
786 | return 0; | |
787 | } | |
788 | ||
789 | void set_add_range(character_set *set, char lower, char upper) | |
790 | { | |
791 | for (size_t i = lower; i <= (unsigned char)upper; i++) | |
792 | set->set_members[i / CS_BITS_PER_ELEM] |= 1UL << i % CS_BITS_PER_ELEM; | |
793 | } | |
794 | ||
795 | int set_has_intersect(const character_set *set1, const character_set *set2) | |
796 | { | |
797 | for (size_t i = 0; i < CS_NOF_ELEMS; i++) | |
798 | if (set1->set_members[i] & set2->set_members[i]) return 1; | |
799 | return 0; | |
800 | } | |
801 | ||
802 | void set_join(character_set *dst, const character_set *src) | |
803 | { | |
804 | for (size_t i = 0; i < CS_NOF_ELEMS; i++) | |
805 | dst->set_members[i] |= src->set_members[i]; | |
806 | } | |
807 | ||
808 | void set_negate(character_set *set) | |
809 | { | |
810 | for (size_t i = 0; i < CS_NOF_ELEMS; i++) | |
811 | set->set_members[i] = ~set->set_members[i]; | |
812 | } | |
813 | ||
814 | void set_report_duplicates(const character_set *set1, | |
815 | const character_set *set2) | |
816 | { | |
817 | for (unsigned char i = 0; i <= 127; ) { | |
818 | for (i++; i <= 127; i++) | |
819 | if (set_has_char(set2, i) && set_has_char(set1, i)) break; | |
820 | if (i > 127) break; | |
821 | char lower = i; | |
822 | for (i++; i <= 127; i++) | |
823 | if (!set_has_char(set2, i) || !set_has_char(set1, i)) break; | |
824 | char upper = i - 1; | |
825 | if (lower < upper) { | |
826 | char *range_str = print_range(lower, upper); | |
827 | TTCN_pattern_warning("Duplicate range `%s' in the character set.", | |
828 | range_str); | |
829 | Free(range_str); | |
830 | } else { | |
831 | char *char_str = print_character(lower); | |
832 | if(lower == '\r' ){ | |
833 | TTCN_pattern_warning("Duplicate character `%s' in the character " | |
834 | "set. Please note the \\n includes the \\r implicitly. " | |
835 | "Use \\q{0,0,0,10} if you would like to match the LF only.", char_str); | |
836 | } else { | |
837 | TTCN_pattern_warning("Duplicate character `%s' in the character " | |
838 | "set.", char_str); | |
839 | } | |
840 | Free(char_str); | |
841 | } | |
842 | } | |
843 | } | |
844 | ||
845 | static char *append_posix_body(char *set_body, const character_set *set) | |
846 | { | |
847 | for (unsigned char i = 0; i <= 127; ) { | |
848 | for (i++; i <= 127; i++) if (set_has_char(set, i)) break; | |
849 | if (i > 127) break; | |
850 | char lower = i; | |
851 | set_body = mputc(set_body, lower); | |
852 | for (i++; i <= 127; i++) if (!set_has_char(set, i)) break; | |
853 | char upper = i - 1; | |
854 | if (lower < upper) { | |
855 | if (lower + 1 < upper) set_body = mputc(set_body, '-'); | |
856 | set_body = mputc(set_body, upper); | |
857 | } | |
858 | } | |
859 | return set_body; | |
860 | } | |
861 | ||
862 | static char *generate_posix_body(character_set *set) | |
863 | { | |
864 | int has_caret; | |
865 | if (set_has_char(set, '^') && !(set_has_char(set, '^' - 1) && | |
866 | set_has_char(set, '^' + 1))) { | |
867 | set_remove_char(set, '^'); | |
868 | has_caret = 1; | |
869 | } else has_caret = 0; | |
870 | int has_dash; | |
871 | if (set_has_char(set, '-') && !(set_has_char(set, '-' - 1) && | |
872 | set_has_char(set, '-' + 1))) { | |
873 | set_remove_char(set, '-'); | |
874 | has_dash = 1; | |
875 | } else has_dash = 0; | |
876 | int has_rsbrkt; | |
877 | if (set_has_char(set, ']') && !(set_has_char(set, ']' - 1) && | |
878 | set_has_char(set, ']' + 1))) { | |
879 | set_remove_char(set, ']'); | |
880 | has_rsbrkt = 1; | |
881 | } else has_rsbrkt = 0; | |
882 | char *set_body = memptystr(); | |
883 | if (set_is_empty(set) && !has_rsbrkt) { | |
884 | /* the `-' must precede the `^' */ | |
885 | if (has_dash) set_body = mputc(set_body, '-'); | |
886 | if (has_caret) set_body = mputc(set_body, '^'); | |
887 | } else { | |
888 | /* order: ']', others, '^', '-' */ | |
889 | if (has_rsbrkt) set_body = mputc(set_body, ']'); | |
890 | set_body = append_posix_body(set_body, set); | |
891 | if (has_caret) set_body = mputc(set_body, '^'); | |
892 | if (has_dash) set_body = mputc(set_body, '-'); | |
893 | } | |
894 | return set_body; | |
895 | } | |
896 | ||
897 | static char *generate_posix_body_compl(character_set *set) | |
898 | { | |
899 | set_negate(set); | |
900 | int has_dash; | |
901 | if (set_has_char(set, '-') && !(set_has_char(set, '-' - 1) && | |
902 | set_has_char(set, '-' + 1))) { | |
903 | set_remove_char(set, '-'); | |
904 | has_dash = 1; | |
905 | } else has_dash = 0; | |
906 | int has_rsbrkt; | |
907 | if (set_has_char(set, ']') && !(set_has_char(set, ']' - 1) && | |
908 | set_has_char(set, ']' + 1))) { | |
909 | set_remove_char(set, ']'); | |
910 | has_rsbrkt = 1; | |
911 | } else has_rsbrkt = 0; | |
912 | char *set_body = mcopystr("^"); | |
913 | /* order: ']', others, '-' */ | |
914 | if (has_rsbrkt) set_body = mputc(set_body, ']'); | |
915 | set_body = append_posix_body(set_body, set); | |
916 | if (has_dash) set_body = mputc(set_body, '-'); | |
917 | return set_body; | |
918 | } | |
919 | ||
920 | char *set_generate_posix(const character_set *set) | |
921 | { | |
922 | /* a full set can only be represented in this way: */ | |
923 | if (set_is_full(set)) return mcopystr("."); | |
924 | character_set *tempset = set_copy(set); | |
925 | char *set_body = generate_posix_body(tempset); | |
926 | set_free(tempset); | |
927 | char *posix_str; | |
928 | if (set_body[0] == '\0') { | |
929 | Free(set_body); | |
930 | TTCN_pattern_error("Internal error: empty POSIX set."); | |
931 | return NULL; | |
932 | } | |
933 | /* do not use the set notation in POSIX if the set contains only one | |
934 | * character */ | |
935 | if (set_body[1] == '\0') posix_str = translate_character(set_body[0]); | |
936 | else { | |
937 | /* create the complemented version of the same set */ | |
938 | tempset = set_copy(set); | |
939 | char *compl_body = generate_posix_body_compl(tempset); | |
940 | set_free(tempset); | |
941 | if (compl_body[0] == '\0') { | |
942 | Free(set_body); | |
943 | Free(compl_body); | |
944 | TTCN_pattern_error("Internal error: empty complemented POSIX set."); | |
945 | return NULL; | |
946 | } | |
947 | /* use the complemented form in the POSIX equivalent if it is the shorter | |
948 | * one */ | |
949 | if (mstrlen(compl_body) < mstrlen(set_body)) | |
950 | posix_str = mprintf("[%s]", compl_body); | |
951 | else posix_str = mprintf("[%s]", set_body); | |
952 | Free(compl_body); | |
953 | } | |
954 | Free(set_body); | |
955 | return posix_str; | |
956 | } | |
957 | ||
958 | void yyprint(FILE *file, int type, const YYSTYPE& value) | |
959 | { | |
960 | switch (type) { | |
961 | case TOK_Char: | |
962 | fprintf(file, "'%c'", value.c); | |
963 | break; | |
964 | case TOK_Digit: case TOK_Number: | |
965 | fprintf(file, "'%lu'", value.u); | |
966 | break; | |
967 | default: | |
968 | break; | |
969 | } | |
970 | } | |
971 |