Sync with 5.1.0
[deliverable/titan.core.git] / compiler2 / PredefFunc.cc
CommitLineData
970ed795
EL
1///////////////////////////////////////////////////////////////////////////////
2// Copyright (c) 2000-2014 Ericsson Telecom AB
3// All rights reserved. This program and the accompanying materials
4// are made available under the terms of the Eclipse Public License v1.0
5// which accompanies this distribution, and is available at
6// http://www.eclipse.org/legal/epl-v10.html
7///////////////////////////////////////////////////////////////////////////////
8#include "PredefFunc.hh"
9#include "error.h"
10#include "Int.hh"
11#include "Real.hh"
12#include "Setting.hh"
13#include "string.hh"
14#include "ustring.hh"
15#include "CompilerError.hh"
16#include <stdio.h>
17#include <sys/types.h>
18#include <regex.h>
19#include <stdint.h>
20#include "../common/memory.h"
21#include "../common/pattern.hh"
22#include <iostream>
23
24// used by regex
25#define ERRMSG_BUFSIZE 512
26
27namespace Common {
28
// Byte order marks written as upper-case hexadecimal digit strings
// (two digits per octet); each array includes the terminating NUL.
static const char utf32be[] = "0000FEFF";
static const char utf32le[] = "FFFE0000";
static const char utf16be[] = "FEFF";
static const char utf16le[] = "FFFE";
static const char utf8[]    = "EFBBBF";
34
35 static inline unsigned char get_bit_value(char c, unsigned char bit_value)
36 {
37 switch (c) {
38 case '0':
39 return 0;
40 case '1':
41 return bit_value;
42 default:
43 FATAL_ERROR("Invalid binary digit (%c) in bitstring value", c);
44 return 0;
45 }
46 }
47
48 char toupper (const char c)
49 {
50 if (('A' <= c && 'F' >= c) ||
51 ('0' <= c && '9' >= c)) return c;
52 switch (c)
53 {
54 case 'a' : return 'A';
55 case 'b' : return 'B';
56 case 'c' : return 'C';
57 case 'd' : return 'D';
58 case 'e' : return 'E';
59 case 'f' : return 'F';
60 default:
61 FATAL_ERROR("%c cannot be converted to hex character", c);
62 break;
63 }
64 }
65
66 char hexdigit_to_char(unsigned char hexdigit)
67 {
68 if (hexdigit < 10) return '0' + hexdigit;
69 else if (hexdigit < 16) return 'A' + hexdigit - 10;
70 else {
71 FATAL_ERROR("hexdigit_to_char(): invalid argument: %d", hexdigit);
72 return '\0'; // to avoid warning
73 }
74 }
75
76 unsigned char char_to_hexdigit(char c)
77 {
78 if (c >= '0' && c <= '9') return c - '0';
79 else if (c >= 'A' && c <= 'F') return c - 'A' + 10;
80 else if (c >= 'a' && c <= 'f') return c - 'a' + 10;
81 else {
82 FATAL_ERROR("char_to_hexdigit(): invalid argument: %c", c);
83 return 0; // to avoid warning
84 }
85 }
86
87 string uchar2str(unsigned char uchar)
88 {
89 char str[2];
90 str[0] = hexdigit_to_char(uchar / 16);
91 str[1] = hexdigit_to_char(uchar % 16);
92 return string(2, str);
93 }
94
95 unsigned char str2uchar(const char& c1, const char& c2)
96 {
97 unsigned char uc = 0;
98 uc = char_to_hexdigit(c1);
99 uc <<= 4;
100 uc += char_to_hexdigit(c2);
101 return uc;
102 }
103
104 int_val_t rem(const int_val_t& left, const int_val_t& right)
105 {
106 return (left - right * (left / right));
107 }
108
109 int_val_t mod(const int_val_t& left, const int_val_t& right)
110 {
111 int_val_t r = right < 0 ? -right : right;
112 if (left > 0) {
113 return rem(left, r);
114 } else {
115 int_val_t result = rem(left, r);
116 return result == 0 ? result : result + r;
117 }
118 }
119
120 string* to_uppercase(const string& value)
121 {
122 string *s = new string(value);
123 for (size_t i = 0; i < s->size(); i++) {
124 char& c=(*s)[i];
125 if (c >= 'a' && c <= 'z') c = c - 'a' + 'A';
126 }
127 return s;
128 }
129
130 string* not4b_bit(const string& bstr)
131 {
132 string *s=new string(bstr);
133 for(size_t i=0; i<s->size(); i++) {
134 char& c=(*s)[i];
135 switch(c) {
136 case '0': c='1'; break;
137 case '1': c='0'; break;
138 default:
139 FATAL_ERROR("not4b_bit(): Invalid char in bitstring.");
140 } // switch c
141 } // for i
142 return s;
143 }
144
145 string* not4b_hex(const string& hstr)
146 {
147 string *s=new string(hstr);
148 for(size_t i=0; i<s->size(); i++) {
149 char& c=(*s)[i];
150 switch(c) {
151 case '0': c='F'; break;
152 case '1': c='E'; break;
153 case '2': c='D'; break;
154 case '3': c='C'; break;
155 case '4': c='B'; break;
156 case '5': c='A'; break;
157 case '6': c='9'; break;
158 case '7': c='8'; break;
159 case '8': c='7'; break;
160 case '9': c='6'; break;
161 case 'A': c='5'; break;
162 case 'B': c='4'; break;
163 case 'C': c='3'; break;
164 case 'D': c='2'; break;
165 case 'E': c='1'; break;
166 case 'F': c='0'; break;
167 case 'a': c='5'; break;
168 case 'b': c='4'; break;
169 case 'c': c='3'; break;
170 case 'd': c='2'; break;
171 case 'e': c='1'; break;
172 case 'f': c='0'; break;
173 default:
174 FATAL_ERROR("not4b_hex(): Invalid char in hexstring.");
175 } // switch c
176 } // for i
177 return s;
178 }
179
180 string* and4b(const string& left, const string& right)
181 {
182 string *s=new string(left);
183 for(size_t i=0; i<s->size(); i++) {
184 char& c=(*s)[i];
185 c=hexdigit_to_char(char_to_hexdigit(c) & char_to_hexdigit(right[i]));
186 } // for i
187 return s;
188 }
189
190 string* or4b(const string& left, const string& right)
191 {
192 string *s=new string(left);
193 for(size_t i=0; i<s->size(); i++) {
194 char& c=(*s)[i];
195 c=hexdigit_to_char(char_to_hexdigit(c) | char_to_hexdigit(right[i]));
196 } // for i
197 return s;
198 }
199
200 string* xor4b(const string& left, const string& right)
201 {
202 string *s=new string(left);
203 for(size_t i=0; i<s->size(); i++) {
204 char& c=(*s)[i];
205 c=hexdigit_to_char(char_to_hexdigit(c) ^ char_to_hexdigit(right[i]));
206 } // for i
207 return s;
208 }
209
210 string* shift_left(const string& value, const Int& count)
211 {
212 if (count > 0) {
213 string *s = new string;
214 if (count < static_cast<Int>(value.size())) *s = value.substr(count);
215 s->resize(value.size(), '0');
216 return s;
217 } else if (count < 0) return shift_right(value, -count);
218 else return new string(value);
219 }
220
221 string* shift_right(const string& value, const Int& count)
222 {
223 if (count > 0) {
224 string *s = new string;
225 if (count < static_cast<Int>(value.size())) {
226 s->resize(count, '0');
227 *s += value.substr(0, value.size()-count);
228 } else s->resize(value.size(), '0');
229 return s;
230 } else if (count < 0) return shift_left(value, -count);
231 else return new string(value);
232 }
233
234 string* rotate_left(const string& value, const Int& p_count)
235 {
236 size_t size = value.size();
237 if (size == 0) return new string(value);
238 else if (p_count < 0) return rotate_right(value, -p_count);
239 size_t count = p_count % size;
240 if (count == 0) return new string(value);
241 else return new string(value.substr(count) + value.substr(0, count));
242 }
243
244 string* rotate_right(const string& value, const Int& p_count)
245 {
246 size_t size = value.size();
247 if (size == 0) return new string(value);
248 else if (p_count < 0) return rotate_left(value, -p_count);
249 size_t count = p_count % size;
250 if (count == 0) return new string(value);
251 else return new string(value.substr(size - count) +
252 value.substr(0, size - count));
253 }
254
255
256 ustring* rotate_left(const ustring& value, const Int& p_count)
257 {
258 size_t size = value.size();
259 if (size == 0) return new ustring(value);
260 else if (p_count < 0) return rotate_right(value, -p_count);
261 size_t count = p_count % size;
262 if (count == 0) return new ustring(value);
263 else return new ustring(value.substr(count) + value.substr(0, count));
264 }
265
266 ustring* rotate_right(const ustring& value, const Int& p_count)
267 {
268 size_t size = value.size();
269 if (size == 0) return new ustring(value);
270 else if (p_count < 0) return rotate_left(value, -p_count);
271 size_t count = p_count % size;
272 if (count == 0) return new ustring(value);
273 else return new ustring(value.substr(size - count) +
274 value.substr(0, size - count));
275 }
276
277 int_val_t* bit2int(const string& bstr)
278 {
279 size_t nof_bits = bstr.size();
280 // skip the leading zeros
281 size_t start_index = 0;
282 while (start_index < nof_bits && bstr[start_index] == '0') start_index++;
283 int_val_t *ret_val = new int_val_t((Int)0);
284 for (size_t i = start_index; i < nof_bits; i++) {
285 *ret_val <<= 1;
286 if (bstr[i] == '1') *ret_val += 1;
287 }
288 return ret_val;
289 }
290
291 int_val_t* hex2int(const string& hstr)
292 {
293 size_t nof_digits = hstr.size();
294 size_t start_index = 0;
295 // Skip the leading zeros.
296 while (start_index < nof_digits && hstr[start_index] == '0')
297 start_index++;
298 int_val_t *ret_val = new int_val_t((Int)0);
299 for (size_t i = start_index; i < nof_digits; i++) {
300 *ret_val <<= 4;
301 *ret_val += char_to_hexdigit(hstr[i]);
302 }
303 return ret_val;
304 }
305
306 Int unichar2int(const ustring& ustr)
307 {
308 if (ustr.size() != 1) FATAL_ERROR("unichar2int(): invalid argument");
309 const ustring::universal_char& uchar = ustr.u_str()[0];
310 Int ret_val = (uchar.group << 24) | (uchar.plane << 16) | (uchar.row << 8) |
311 uchar.cell;
312 return ret_val;
313 }
314
315 string *int2bit(const int_val_t& value, const Int& length)
316 {
317 if (length < 0) FATAL_ERROR("int2bit(): negative length");
318 size_t string_length = static_cast<size_t>(length);
319 if (static_cast<Int>(string_length) != length ||
320 string_length > string::max_string_len)
321 FATAL_ERROR("int2bit(): length is too large");
322 if (value < 0) FATAL_ERROR("int2bit(): negative value");
323 string *bstr = new string;
324 bstr->resize(string_length);
325 int_val_t tmp_value = value;
326 for (size_t i = 1; i <= string_length; i++) {
327 (*bstr)[string_length - i] = (tmp_value & 1).get_val() ? '1' : '0';
328 tmp_value >>= 1;
329 }
330 if (tmp_value != 0)
331 FATAL_ERROR("int2bit(): %s does not fit in %lu bits", \
332 value.t_str().c_str(), (unsigned long)string_length);
333 return bstr;
334 }
335
// Upper-case hexadecimal digit characters, indexed by digit value (0..15).
static const char hdigits[16] = {
  '0', '1', '2', '3', '4', '5', '6', '7',
  '8', '9', 'A', 'B', 'C', 'D', 'E', 'F'
};
338
339 string *int2hex(const int_val_t& value, const Int& length)
340 {
341 if (length < 0)
342 FATAL_ERROR("int2hex(): negative length");
343 size_t string_length = static_cast<size_t>(length);
344 if (static_cast<Int>(string_length) != length ||
345 string_length > string::max_string_len)
346 FATAL_ERROR("int2hex(): length is too large");
347 if (value < 0) FATAL_ERROR("int2hex(): negative value");
348 string *hstr = new string;
349 hstr->resize(string_length);
350 int_val_t tmp_value = value;
351 for (size_t i = 1; i <= string_length; i++) {
352 (*hstr)[string_length - i] = hdigits[(tmp_value & 0x0f).get_val()];
353 tmp_value >>= 4;
354 }
355 if (tmp_value != 0) {
356 FATAL_ERROR("int2hex(): %s does not fit in %lu hexadecimal digits",
357 value.t_str().c_str(), (unsigned long)string_length);
358 }
359 return hstr;
360 }
361
362 ustring *int2unichar(const Int& value)
363 {
364 if (value < 0 || value > 2147483647)
365 FATAL_ERROR("int2unichar(): invalid argument");
366 unsigned char group = (value >> 24) & 0xFF,
367 plane = (value >> 16) & 0xFF,
368 row = (value >> 8) & 0xFF,
369 cell = value & 0xFF;
370 return new ustring(group, plane, row, cell);
371 }
372
373 string *oct2char(const string& ostr)
374 {
375 string *cstr = new string;
376 size_t ostr_size = ostr.size();
377 if (ostr_size % 2)
378 FATAL_ERROR("oct2char(): argument has odd length: %lu",
379 (unsigned long) ostr_size);
380 size_t cstr_size = ostr_size / 2;
381 cstr->resize(cstr_size);
382 const char *ostr_ptr = ostr.c_str();
383 for (size_t i = 0; i < cstr_size; i++) {
384 unsigned char c = 16 * char_to_hexdigit(ostr_ptr[2 * i]) +
385 char_to_hexdigit(ostr_ptr[2 * i + 1]);
386 if (c > 127) FATAL_ERROR("oct2char(): resulting charstring contains " \
387 "non-ascii character: %d", c);
388 (*cstr)[i] = c;
389 }
390 return cstr;
391 }
392
393 string *char2oct(const string& cstr)
394 {
395 string *ostr = new string;
396 size_t cstr_size = cstr.size();
397 ostr->resize(cstr_size * 2, '0');
398 const char *cstr_ptr = cstr.c_str();
399 for (size_t i = 0; i < cstr_size; i++) {
400 unsigned char c = cstr_ptr[i];
401 (*ostr)[2 * i] = hexdigit_to_char(c / 16);
402 (*ostr)[2 * i + 1] = hexdigit_to_char(c % 16);
403 }
404 return ostr;
405 }
406
407 string *bit2hex(const string& bstr)
408 {
409 size_t size=bstr.size();
410 size_t hsize=(size+3)/4;
411 string *hstr = new string;
412 string *bstr4=NULL;
413 if(size%4) {
414 bstr4=new string;
415 bstr4->resize(hsize*4,'0');
416 bstr4->replace(4-(size%4),size,bstr);
417 }
418 hstr->resize(hsize,'0');
419 string b4(4,"0000");
420 for(size_t i=0;i<hsize;i++) {
421 unsigned int u;
422 if(size%4)b4=bstr4->substr(i*4,4);
423 else b4=bstr.substr(i*4,4);
424 if(b4[0]=='1')u=8;else u=0;
425 if(b4[1]=='1')u+=4;
426 if(b4[2]=='1')u+=2;
427 if(b4[3]=='1')u++;
428 (*hstr)[i]=hdigits[u];
429 }
430 if(bstr4!=NULL)delete bstr4;
431 return hstr;
432 }
433
434 string *hex2oct(const string& hstr)
435 {
436 if(hstr.size()%2==0)return new string(hstr);
437 else {
438 string *ostr=new string("0");
439 (*ostr)+=hstr;
440 return ostr;
441 }
442 }
443
444 string *asn_hex2oct(const string& hstr)
445 {
446 string *ostr = new string(hstr);
447 size_t size = ostr->size();
448 if (size % 2) ostr->resize(size + 1, '0');
449 return ostr;
450 }
451
452 string *bit2oct(const string& bstr)
453 {
454 string *s1,*s2;
455 s1=bit2hex(bstr);
456 s2=hex2oct(*s1);
457 delete s1;
458 return s2;
459 }
460
461 string *asn_bit2oct(const string& bstr)
462 {
463 size_t size = bstr.size();
464 string *ostr = new string;
465 ostr->resize(((size+7)/8)*2);
466 for(size_t i=0, j=0; i<size; ) {
467 unsigned char digit1=0, digit2=0;
468 digit1 += get_bit_value(bstr[i++], 8);
469 if (i < size) {
470 digit1 += get_bit_value(bstr[i++], 4);
471 if (i < size) {
472 digit1 += get_bit_value(bstr[i++], 2);
473 if (i < size) {
474 digit1 += get_bit_value(bstr[i++], 1);
475 if (i < size) {
476 digit2 += get_bit_value(bstr[i++], 8);
477 if (i < size) {
478 digit2 += get_bit_value(bstr[i++], 4);
479 if (i < size) {
480 digit2 += get_bit_value(bstr[i++], 2);
481 if (i < size) digit2 += get_bit_value(bstr[i++], 1);
482 }
483 }
484 }
485 }
486 }
487 }
488 (*ostr)[j++] = hexdigit_to_char(digit1);
489 (*ostr)[j++] = hexdigit_to_char(digit2);
490 }
491 return ostr;
492 }
493
494 string *hex2bit(const string& hstr)
495 {
496 size_t size=hstr.size();
497 string *bstr = new string;
498 bstr->resize(4*size);
499 for(size_t i=0; i<size; i++) {
500 switch(hstr[i]) {
501 case '0':
502 bstr->replace(4*i, 4, "0000");
503 break;
504 case '1':
505 bstr->replace(4*i, 4, "0001");
506 break;
507 case '2':
508 bstr->replace(4*i, 4, "0010");
509 break;
510 case '3':
511 bstr->replace(4*i, 4, "0011");
512 break;
513 case '4':
514 bstr->replace(4*i, 4, "0100");
515 break;
516 case '5':
517 bstr->replace(4*i, 4, "0101");
518 break;
519 case '6':
520 bstr->replace(4*i, 4, "0110");
521 break;
522 case '7':
523 bstr->replace(4*i, 4, "0111");
524 break;
525 case '8':
526 bstr->replace(4*i, 4, "1000");
527 break;
528 case '9':
529 bstr->replace(4*i, 4, "1001");
530 break;
531 case 'A':
532 case 'a':
533 bstr->replace(4*i, 4, "1010");
534 break;
535 case 'B':
536 case 'b':
537 bstr->replace(4*i, 4, "1011");
538 break;
539 case 'C':
540 case 'c':
541 bstr->replace(4*i, 4, "1100");
542 break;
543 case 'D':
544 case 'd':
545 bstr->replace(4*i, 4, "1101");
546 break;
547 case 'E':
548 case 'e':
549 bstr->replace(4*i, 4, "1110");
550 break;
551 case 'F':
552 case 'f':
553 bstr->replace(4*i, 4, "1111");
554 break;
555 default:
556 FATAL_ERROR("Common::hex2bit(): invalid hexadecimal "
557 "digit in hexstring value");
558 }
559 }
560 return bstr;
561 }
562
563 int_val_t* float2int(const Real& value, const Location& loc)
564 {
565 // We shouldn't mimic generality with `Int'.
566 if (value >= (Real)LLONG_MIN && value <= (Real)LLONG_MAX)
567 return new int_val_t((Int)value);
568 char buf[512] = "";
569 snprintf(buf, 511, "%f", value);
570 char *dot = strchr(buf, '.');
571 if (!dot) FATAL_ERROR("Conversion of float value `%f' to integer failed", value);
572 else memset(dot, 0, sizeof(buf) - (dot - buf));
573 return new int_val_t(buf, loc);
574 }
575
576/* TTCN-3 float values that have absolute value smaller than this are
577 displayed in exponential notation. Same as in core/Float.hh */
578#ifndef MIN_DECIMAL_FLOAT
579#define MIN_DECIMAL_FLOAT 1.0E-4
580#endif
/* TTCN-3 float values whose absolute value is greater than or equal to
   this are displayed in exponential notation. Same as in
   core/Float.hh */
584#ifndef MAX_DECIMAL_FLOAT
585#define MAX_DECIMAL_FLOAT 1.0E+10
586#endif
587
588 string *float2str(const Real& value)
589 {
590 char str_buf[64];
591 if ( (value > -MAX_DECIMAL_FLOAT && value <= -MIN_DECIMAL_FLOAT)
592 || (value >= MIN_DECIMAL_FLOAT && value < MAX_DECIMAL_FLOAT)
593 || (value == 0.0))
594 snprintf(str_buf,64,"%f",value);
595 else snprintf(str_buf,64,"%e",value);
596 return new string(str_buf);
597 }
598
599 string* regexp(const string& instr, const string& expression,
600 const Int& groupno)
601 {
602 string *retval=0;
603
604 if(groupno<0) {
605 FATAL_ERROR("regexp(): groupno must be a non-negative integer");
606 return retval;
607 }
608 // do not report the warnings again
609 // they were already reported while checking the operands
610 unsigned orig_verb_level = verb_level;
611 verb_level &= ~(1|2);
612 char *posix_str=TTCN_pattern_to_regexp(expression.c_str());
613 verb_level = orig_verb_level;
614 if(posix_str==NULL) {
615 FATAL_ERROR("regexp(): Cannot convert pattern `%s' to POSIX-equivalent.",
616 expression.c_str());
617 return retval;
618 }
619
620 regex_t posix_regexp;
621 int ret_val=regcomp(&posix_regexp, posix_str, REG_EXTENDED);
622 Free(posix_str);
623 if(ret_val!=0) {
624 /* regexp error */
625 char msg[ERRMSG_BUFSIZE];
626 regerror(ret_val, &posix_regexp, msg, sizeof(msg));
627 FATAL_ERROR("regexp(): regcomp() failed: %s", msg);
628 return retval;
629 }
630
631 size_t nmatch=groupno+1;
632 if(nmatch>posix_regexp.re_nsub) {
633 FATAL_ERROR("regexp(): requested groupno is %lu, but this expression "
634 "contains only %lu group(s).", (unsigned long) (nmatch - 1),
635 (unsigned long) posix_regexp.re_nsub);
636 return retval;
637 }
638 regmatch_t* pmatch=(regmatch_t*)Malloc((nmatch+1)*sizeof(regmatch_t));
639 ret_val=regexec(&posix_regexp, instr.c_str(), nmatch+1, pmatch, 0);
640 if(ret_val==0) {
641 if(pmatch[nmatch].rm_so != -1 && pmatch[nmatch].rm_eo != -1)
642 retval = new string(instr.substr(pmatch[nmatch].rm_so,
643 pmatch[nmatch].rm_eo - pmatch[nmatch].rm_so));
644 else retval=new string();
645 }
646 Free(pmatch);
647 if(ret_val!=0) {
648 if(ret_val==REG_NOMATCH) {
649 regfree(&posix_regexp);
650 retval=new string();
651 }
652 else {
653 /* regexp error */
654 char msg[ERRMSG_BUFSIZE];
655 regerror(ret_val, &posix_regexp, msg, sizeof(msg));
656 FATAL_ERROR("regexp(): regexec() failed: %s", msg);
657 }
658 }
659 else regfree(&posix_regexp);
660
661 return retval;
662 }
663
664 ustring* regexp(const ustring& instr, const ustring& expression,
665 const Int& groupno)
666 {
667 ustring *retval=0;
668
669 if(groupno<0) {
670 FATAL_ERROR("regexp(): groupno must be a non-negative integer");
671 return retval;
672 }
673 // do not report the warnings again
674 // they were already reported while checking the operands
675 unsigned orig_verb_level = verb_level;
676 verb_level &= ~(1|2);
677 int* user_groups;
678 char *posix_str = TTCN_pattern_to_regexp_uni(
679 expression.get_stringRepr_for_pattern().c_str(), &user_groups);
680 if (user_groups == 0)
681 FATAL_ERROR("regexp(): Cannot find any groups in the second argument.");
682 verb_level = orig_verb_level;
683 if(posix_str==NULL) {
684 FATAL_ERROR("regexp(): Cannot convert pattern `%s' to POSIX-equivalent.",
685 expression.get_stringRepr().c_str());
686 return retval;
687 }
688
689 regex_t posix_regexp;
690 int ret_val=regcomp(&posix_regexp, posix_str, REG_EXTENDED);
691 Free(posix_str);
692 if(ret_val!=0) {
693 /* regexp error */
694 char msg[ERRMSG_BUFSIZE];
695 regerror(ret_val, &posix_regexp, msg, sizeof(msg));
696 FATAL_ERROR("regexp(): regcomp() failed: %s", msg);
697 return retval;
698 }
699
700 size_t nmatch=user_groups[groupno+1]+1;
701 if(nmatch>posix_regexp.re_nsub) {
702 FATAL_ERROR("regexp(): requested groupno is %lu, but this expression "
703 "contains only %lu group(s).", (unsigned long) (groupno),
704 (unsigned long) user_groups[0]);
705 return retval;
706 }
707
708 Free(user_groups);
709
710 regmatch_t* pmatch = (regmatch_t*)Malloc((nmatch+1)*sizeof(regmatch_t));
711 char* tmp = instr.convert_to_regexp_form();
712 string instr_conv(tmp);
713 Free(tmp);
714 ret_val = regexec(&posix_regexp, instr_conv.c_str(), nmatch+1, pmatch, 0);
715 if(ret_val == 0) {
716 if(pmatch[nmatch].rm_so != -1 && pmatch[nmatch].rm_eo != -1) {
717 retval = new ustring(
718 instr_conv.substr(pmatch[nmatch].rm_so,
719 pmatch[nmatch].rm_eo - pmatch[nmatch].rm_so)
720 .convert_stringRepr_for_pattern());
721 } else { retval = new ustring(); }
722 }
723 Free(pmatch);
724 if(ret_val!=0) {
725 if(ret_val==REG_NOMATCH) {
726 regfree(&posix_regexp);
727 retval=new ustring();
728 }
729 else {
730 /* regexp error */
731 char msg[ERRMSG_BUFSIZE];
732 regerror(ret_val, &posix_regexp, msg, sizeof(msg));
733 FATAL_ERROR("regexp(): regexec() failed: %s", msg);
734 }
735 }
736 else regfree(&posix_regexp);
737
738 return retval;
739 }
740
741string* remove_bom(const string& encoded_value)
742{
743 size_t length = encoded_value.size();
744 if (0 == length) return new string();
745 if (length % 2 || 0 > length) {
746 ERROR("remove_bom(): Wrong string. The number of nibbles (%d) in string "
747 "shall be divisible by 2", static_cast<int>(length));
748 return new string(encoded_value);
749 }
750
751 int length_of_BOM = 0;
752 string str_uppercase(encoded_value);
753 size_t enough = length > sizeof(utf32be)-1 ? sizeof(utf32be)-1 : length;
754 for (size_t i = 0; i < enough; ++i) {
755 str_uppercase[i] = toupper(encoded_value[i]);
756 }
757
758 if (str_uppercase.find(utf32be, 0) < length) length_of_BOM = sizeof(utf32be)-1;
759 else if (str_uppercase.find(utf32le, 0) < length) length_of_BOM = sizeof(utf32le)-1;
760 else if (str_uppercase.find(utf16be, 0) < length) length_of_BOM = sizeof(utf16be)-1;
761 else if (str_uppercase.find(utf16le, 0) < length) length_of_BOM = sizeof(utf16le)-1;
762 else if (str_uppercase.find(utf8, 0) < length) length_of_BOM = sizeof(utf8)-1;
763 else return new string(encoded_value); // no BOM found
764
765 return new string(encoded_value.substr(length_of_BOM, length));
766}
767
768static CharCoding::CharCodingType is_ascii (size_t length, const unsigned char* strptr)
769{
770 const unsigned char nonASCII = 1 << 7;// MSB is 1 in case of non ASCII character
771 CharCoding::CharCodingType ret = CharCoding::ASCII;
772 for (size_t i = 0; i < length; ++i) {
773 if ( strptr[i] & nonASCII) {
774 ret = CharCoding::UNKNOWN;
775 break;
776 }
777 }
778 return ret;
779}
780
781static CharCoding::CharCodingType is_utf8(size_t length, const unsigned char* strptr)
782{
783 const char MSB = 1 << 7; // MSB is 1 in case of non ASCII character
784 const char MSBmin1 = 1 << 6; // 0100 0000
785 size_t i = 0;
786 while (length > i) {
787 if ( strptr[i] & MSB) { // non ASCII char
788 char maskUTF8 = 1 << 6; // 111x xxxx shows how many additional bytes are there
789 if (!(strptr[i] & maskUTF8)) return CharCoding::UNKNOWN; // accepted 11xxx xxxx but received 10xx xxxx
790 unsigned int noofUTF8 = 0; // 11xx xxxxx -> 2 bytes, 111x xxxxx -> 3 bytes , 1111 xxxxx -> 4 bytes in UTF-8
791 while (strptr[i] & maskUTF8) {
792 ++noofUTF8;
793 maskUTF8 >>= 1; // shift right the mask
794 }
795 // the second and third (and so on) UTF-8 byte looks like 10xx xxxx
796 while (0 < noofUTF8 ) {
797 ++i;
798 if (!(strptr[i] & MSB) || (strptr[i] & MSBmin1) || i >= length) { // if not like this: 10xx xxxx
799 return CharCoding::UNKNOWN;
800 }
801 --noofUTF8;
802 }
803 }
804 ++i;
805 }
806 return CharCoding::UTF_8;
807}
808
809string* get_stringencoding(const string& encoded_value)
810{
811 size_t length = encoded_value.size();
812 if (0 == length) return new string("<unknown>");
813 if (length % 2 || 0 > length) {
814 ERROR("get_stringencoding(): Wrong string. The number of nibbles (%d) in string "
815 "shall be divisible by 2", static_cast<int>(length));
816 return new string("<unknown>");
817 }
818
819 string str_uppercase(encoded_value);
820 size_t enough = length > sizeof(utf32be)-1 ? sizeof(utf32be)-1 : length;
821 for (size_t i = 0; i < enough; ++i) {
822 str_uppercase[i] = toupper(encoded_value[i]);
823 }
824
825 if (str_uppercase.find(utf32be, 0) < length) return new string("UTF-32BE");
826 else if (str_uppercase.find(utf32le, 0) < length) return new string("UTF-32LE");
827 else if (str_uppercase.find(utf16be, 0) < length) return new string("UTF-16BE");
828 else if (str_uppercase.find(utf16le, 0) < length) return new string("UTF-16LE");
829 else if (str_uppercase.find(utf8, 0) < length) return new string("UTF-8");
830
831 unsigned char *uc_str = new unsigned char[length/2];
832 string ret;
833 for (size_t i = 0; i < length / 2; ++i) {
834 uc_str[i] = str2uchar(encoded_value[2 * i], encoded_value[2 * i + 1]);
835 }
836 if (is_ascii (length / 2, uc_str) == CharCoding::ASCII) ret = "ASCII";
837 else if (CharCoding::UTF_8 == is_utf8 (length / 2, uc_str)) ret = "UTF-8";
838 else ret = "<unknown>";
839
840 delete [] uc_str;
841 return new string(ret);
842}
843
844static size_t check_BOM(CharCoding::CharCodingType expected_coding, size_t n_uc, unsigned char* uc_str)
845{
846 if (0 == n_uc) return 0;
847
848 switch (expected_coding) {
849 case CharCoding::UTF32:
850 case CharCoding::UTF32BE:
851 case CharCoding::UTF32LE:
852 if (4 > n_uc) {
853 ERROR("decode_utf32(): The string is shorter than the expected BOM");
854 return 0;
855 }
856 break;
857 case CharCoding::UTF16:
858 case CharCoding::UTF16BE:
859 case CharCoding::UTF16LE:
860 if (2 > n_uc) {
861 ERROR("decode_utf16(): The string is shorter than the expected BOM");
862 return 0;
863 }
864 break;
865 default: break;
866 }
867
868 //BOM indicates that the byte order is determined by a byte order mark,
869 //if present at the beginning the length of BOM is returned.
870 bool badBOM = false;
871 string errmsg;
872 string caller;
873 switch (expected_coding) {
874 case CharCoding::UTF32BE:
875 case CharCoding::UTF32:
876 if (0x00 == uc_str[0] && 0x00 == uc_str[1] && 0xFE == uc_str[2] && 0xFF == uc_str[3])
877 return 4;
878 badBOM = true;
879 caller = "decode_utf32()";
880 errmsg = "UTF-32BE";
881 break;
882 case CharCoding::UTF32LE:
883 if (0xFF == uc_str[0] && 0xFE == uc_str[1] && 0x00 == uc_str[2] && 0x00 == uc_str[3])
884 return 4;
885 badBOM = true;
886 caller = "decode_utf32()";
887 errmsg = "UTF-32LE";
888 break;
889 case CharCoding::UTF16BE:
890 case CharCoding::UTF16:
891 if (0xFE == uc_str[0] && 0xFF == uc_str[1])
892 return 2;
893 badBOM = true;
894 caller = "decode_utf16()";
895 errmsg = "UTF-16BE";
896 break;
897 case CharCoding::UTF16LE:
898 if (0xFF == uc_str[0] && 0xFE == uc_str[1])
899 return 2;
900 badBOM = true;
901 caller = "decode_utf16()";
902 errmsg = "UTF-16LE";
903 break;
904 case CharCoding::UTF_8:
905 if (0xEF == uc_str[0] && 0xBB == uc_str[1] && 0xBF == uc_str[2])
906 return 3;
907 return 0;
908 default:
909 if (CharCoding::UTF32 == expected_coding || CharCoding::UTF16 == expected_coding) {
910 const char* str = CharCoding::UTF32 == expected_coding ? "UTF-32" : "UTF-16";
911 ERROR("Wrong %s string. No BOM detected, however the given coding type (%s) "
912 "expects it to define the endianness", str, str);
913 }
914 else {
915 ERROR("Wrong string. No BOM detected");
916 }
917 }
918 if (badBOM) ERROR("%s: Wrong %s string. The expected coding could not be verified",
919 caller.c_str(), errmsg.c_str());
920 return 0;
921}
922
923static void fill_continuing_octets(int n_continuing, unsigned char *continuing_ptr,
924 size_t n_uc, const unsigned char* uc_str, int start_pos,
925 int uchar_pos)
926{
927 for (int i = 0; i < n_continuing; i++) {
928 if (start_pos + i < static_cast<int>(n_uc)) {
929 unsigned char octet = uc_str[start_pos + i];
930 if ((octet & 0xC0) != 0x80) {
931 ERROR("decode_utf8(): Malformed: At character position %u, octet position %u: %02X is "
932 "not a valid continuing octet.", uchar_pos, start_pos + i, octet);
933 return;
934 }
935 continuing_ptr[i] = octet & 0x3F;
936 }
937 else {
938 if (start_pos + i == static_cast<int>(n_uc)) {
939 if (i > 0) {
940 // only a part of octets is missing
941 ERROR("decode_utf8(): Incomplete: At character position %d, octet position %d: %d out "
942 "of %d continuing octets %s missing from the end of the stream.",
943 uchar_pos, start_pos + i, n_continuing - i, n_continuing,
944 n_continuing - i > 1 ? "are" : "is");
945 return;
946 }
947 else {
948 // all octets are missing
949 ERROR("decode_utf8(): Incomplete: At character position %d, octet position %d: %d "
950 "continuing octet%s missing from the end of the stream.", uchar_pos,
951 start_pos, n_continuing, n_continuing > 1 ? "s are" : " is");
952 return;
953 }
954 }
955 continuing_ptr[i] = 0;
956 }
957 }
958}
959
960ustring decode_utf8(const string & ostr, CharCoding::CharCodingType expected_coding)
961{
962 size_t length = ostr.size();
963 if (0 == length) return ustring();
964 if (length % 2) {
965 ERROR("decode_utf8(): Wrong UTF-8 string. The number of nibbles (%d) in octetstring "
966 "shall be divisible by 2", static_cast<int>(length));
967 return ustring();
968 }
969
970 unsigned char *uc_str = new unsigned char[length/2];
971 for (size_t i = 0; i < length / 2; ++i) {
972 uc_str[i] = str2uchar(ostr[2 * i], ostr[2 * i + 1]);
973 }
974 ustring ucstr;
975 size_t start = check_BOM(CharCoding::UTF_8, length /2, uc_str);
976
977 for (size_t i = start; i < length / 2;) {
978 // perform the decoding character by character
979 if (uc_str[i] <= 0x7F) {
980 // character encoded on a single octet: 0xxxxxxx (7 useful bits)
981 unsigned char g = 0;
982 unsigned char p = 0;
983 unsigned char r = 0;
984 unsigned char c = uc_str[i];
985 ucstr += ustring(g, p, r, c);
986 ++i;
987 }
988 else if (uc_str[i] <= 0xBF) {
989 // continuing octet (10xxxxxx) without leading octet ==> malformed
990 ERROR("decode_utf8(): Malformed: At character position %d, octet position %d: continuing "
991 "octet %02X without leading octet.", static_cast<int>(ucstr.size()),
992 static_cast<int>(i), uc_str[i]);
993 goto dec_error;
994 }
995 else if (uc_str[i] <= 0xDF) {
996 // character encoded on 2 octets: 110xxxxx 10xxxxxx (11 useful bits)
997 unsigned char octets[2];
998 octets[0] = uc_str[i] & 0x1F;
999 fill_continuing_octets(1, octets + 1, length / 2, uc_str, i + 1, ucstr.size());
1000 unsigned char g = 0;
1001 unsigned char p = 0;
1002 unsigned char r = octets[0] >> 2;
1003 unsigned char c = octets[0] << 6 | octets[1];
1004 if (r == 0x00 && c < 0x80) {
1005 ERROR("decode_utf8(): Overlong: At character position %d, octet position %d: 2-octet "
1006 "encoding for quadruple (0, 0, 0, %u).", static_cast<int>(ucstr.size()),
1007 static_cast<int>(i), c);
1008 goto dec_error;
1009 }
1010 ucstr += ustring(g, p, r, c);
1011 i += 2;
1012 }
1013 else if (uc_str[i] <= 0xEF) {
1014 // character encoded on 3 octets: 1110xxxx 10xxxxxx 10xxxxxx
1015 // (16 useful bits)
1016 unsigned char octets[3];
1017 octets[0] = uc_str[i] & 0x0F;
1018 fill_continuing_octets(2, octets + 1, length / 2, uc_str, i + 1,ucstr.size());
1019 unsigned char g = 0;
1020 unsigned char p = 0;
1021 unsigned char r = octets[0] << 4 | octets[1] >> 2;
1022 unsigned char c = octets[1] << 6 | octets[2];
1023 if (r < 0x08) {
1024 ERROR("decode_utf8(): Overlong: At character position %d, octet position %d: 3-octet "
1025 "encoding for quadruple (0, 0, %u, %u).", static_cast<int>(ucstr.size()),
1026 static_cast<int>(i), r, c);
1027 goto dec_error;
1028 }
1029 ucstr += ustring(g, p, r, c);
1030 i += 3;
1031 }
1032 else if (uc_str[i] <= 0xF7) {
1033 // character encoded on 4 octets: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
1034 // (21 useful bits)
1035 unsigned char octets[4];
1036 octets[0] = uc_str[i] & 0x07;
1037 fill_continuing_octets(3, octets + 1, length / 2, uc_str, i + 1, ucstr.size());
1038 unsigned char g = 0;
1039 unsigned char p = octets[0] << 2 | octets[1] >> 4;
1040 unsigned char r = octets[1] << 4 | octets[2] >> 2;
1041 unsigned char c = octets[2] << 6 | octets[3];
1042 if (p == 0x00) {
1043 ERROR("decode_utf8(): Overlong: At character position %d, octet position %d: 4-octet "
1044 "encoding for quadruple (0, 0, %u, %u).", static_cast<int>(ucstr.size()),
1045 static_cast<int>(i), r, c);
1046 goto dec_error;
1047 }
1048 ucstr += ustring(g, p, r, c);
1049 i += 4;
1050 }
1051 else if (uc_str[i] <= 0xFB) {
1052 // character encoded on 5 octets: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx
1053 // 10xxxxxx (26 useful bits)
1054 unsigned char octets[5];
1055 octets[0] = uc_str[i] & 0x03;
1056 fill_continuing_octets(4, octets + 1, length / 2, uc_str, i + 1, ucstr.size());
1057 unsigned char g = octets[0];
1058 unsigned char p = octets[1] << 2 | octets[2] >> 4;
1059 unsigned char r = octets[2] << 4 | octets[3] >> 2;
1060 unsigned char c = octets[3] << 6 | octets[4];
1061 if (g == 0x00 && p < 0x20) {
1062 ERROR("decode_utf8(): Overlong: At character position %d, octet position %d: 5-octet "
1063 "encoding for quadruple (0, %u, %u, %u).", static_cast<int>(ucstr.size()),
1064 static_cast<int>(i), p, r, c);
1065 goto dec_error;
1066 }
1067 ucstr += ustring(g, p, r, c);
1068 i += 5;
1069 }
1070 else if (uc_str[i] <= 0xFD) {
1071 // character encoded on 6 octets: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx
1072 // 10xxxxxx 10xxxxxx (31 useful bits)
1073 unsigned char octets[6];
1074 octets[0] = uc_str[i] & 0x01;
1075 fill_continuing_octets(5, octets + 1, length / 2, uc_str, i + 1,ucstr.size());
1076 unsigned char g = octets[0] << 6 | octets[1];
1077 unsigned char p = octets[2] << 2 | octets[3] >> 4;
1078 unsigned char r = octets[3] << 4 | octets[4] >> 2;
1079 unsigned char c = octets[4] << 6 | octets[5];
1080 if (g < 0x04) {
1081 ERROR("decode_utf8(): Overlong: At character position %d, octet position %d: 6-octet "
1082 "encoding for quadruple (%u, %u, %u, %u).", static_cast<int>(ucstr.size()),
1083 static_cast<int>(i), g, p, r, c);
1084 goto dec_error;
1085 }
1086 ucstr += ustring(g, p, r, c);
1087 i += 6;
1088 }
1089 else {
1090 // not used code points: FE and FF => malformed
1091 ERROR("decode_utf8(): Malformed: At character position %d, octet position %d: "
1092 "unused/reserved octet %02X.", static_cast<int>(ucstr.size()),
1093 static_cast<int>(i), uc_str[i]);
1094 goto dec_error;
1095 }
1096 }
1097
1098 dec_error:
1099 delete[] uc_str;
1100 return ucstr;
1101}
1102
1103}
This page took 0.06153 seconds and 5 git commands to generate.