1 ///////////////////////////////////////////////////////////////////////////////
2 // Copyright (c) 2000-2015 Ericsson Telecom AB
3 // All rights reserved. This program and the accompanying materials
4 // are made available under the terms of the Eclipse Public License v1.0
5 // which accompanies this distribution, and is available at
6 // http://www.eclipse.org/legal/epl-v10.html
7 ///////////////////////////////////////////////////////////////////////////////
8 #include "PredefFunc.hh"
15 #include "CompilerError.hh"
17 #include <sys/types.h>
20 #include "../common/memory.h"
21 #include "../common/pattern.hh"
// Size of the buffers that receive regerror() messages in regexp().
#define ERRMSG_BUFSIZE 512
// Byte order marks of the recognised encodings, written as upper-case hex
// digits (one character per nibble). Every array is NUL-terminated, so
// sizeof(x) - 1 is the length of the BOM in nibbles.
static const char utf32be[] = "0000FEFF";
static const char utf32le[] = "FFFE0000";
static const char utf16be[] = "FEFF";
static const char utf16le[] = "FFFE";
static const char utf8[]    = "EFBBBF";
35 static inline unsigned char get_bit_value(char c
, unsigned char bit_value
)
43 FATAL_ERROR("Invalid binary digit (%c) in bitstring value", c
);
48 char toupper (const char c
)
50 if (('A' <= c
&& 'F' >= c
) ||
51 ('0' <= c
&& '9' >= c
)) return c
;
54 case 'a' : return 'A';
55 case 'b' : return 'B';
56 case 'c' : return 'C';
57 case 'd' : return 'D';
58 case 'e' : return 'E';
59 case 'f' : return 'F';
61 FATAL_ERROR("%c cannot be converted to hex character", c
);
66 char hexdigit_to_char(unsigned char hexdigit
)
68 if (hexdigit
< 10) return '0' + hexdigit
;
69 else if (hexdigit
< 16) return 'A' + hexdigit
- 10;
71 FATAL_ERROR("hexdigit_to_char(): invalid argument: %d", hexdigit
);
72 return '\0'; // to avoid warning
76 unsigned char char_to_hexdigit(char c
)
78 if (c
>= '0' && c
<= '9') return c
- '0';
79 else if (c
>= 'A' && c
<= 'F') return c
- 'A' + 10;
80 else if (c
>= 'a' && c
<= 'f') return c
- 'a' + 10;
82 FATAL_ERROR("char_to_hexdigit(): invalid argument: %c", c
);
83 return 0; // to avoid warning
87 string
uchar2str(unsigned char uchar
)
90 str
[0] = hexdigit_to_char(uchar
/ 16);
91 str
[1] = hexdigit_to_char(uchar
% 16);
92 return string(2, str
);
95 unsigned char str2uchar(const char& c1
, const char& c2
)
98 uc
= char_to_hexdigit(c1
);
100 uc
+= char_to_hexdigit(c2
);
104 int_val_t
rem(const int_val_t
& left
, const int_val_t
& right
)
106 return (left
- right
* (left
/ right
));
109 int_val_t
mod(const int_val_t
& left
, const int_val_t
& right
)
111 int_val_t r
= right
< 0 ? -right
: right
;
115 int_val_t result
= rem(left
, r
);
116 return result
== 0 ? result
: result
+ r
;
120 string
* to_uppercase(const string
& value
)
122 string
*s
= new string(value
);
123 for (size_t i
= 0; i
< s
->size(); i
++) {
125 if (c
>= 'a' && c
<= 'z') c
= c
- 'a' + 'A';
130 string
* not4b_bit(const string
& bstr
)
132 string
*s
=new string(bstr
);
133 for(size_t i
=0; i
<s
->size(); i
++) {
136 case '0': c
='1'; break;
137 case '1': c
='0'; break;
139 FATAL_ERROR("not4b_bit(): Invalid char in bitstring.");
145 string
* not4b_hex(const string
& hstr
)
147 string
*s
=new string(hstr
);
148 for(size_t i
=0; i
<s
->size(); i
++) {
151 case '0': c
='F'; break;
152 case '1': c
='E'; break;
153 case '2': c
='D'; break;
154 case '3': c
='C'; break;
155 case '4': c
='B'; break;
156 case '5': c
='A'; break;
157 case '6': c
='9'; break;
158 case '7': c
='8'; break;
159 case '8': c
='7'; break;
160 case '9': c
='6'; break;
161 case 'A': c
='5'; break;
162 case 'B': c
='4'; break;
163 case 'C': c
='3'; break;
164 case 'D': c
='2'; break;
165 case 'E': c
='1'; break;
166 case 'F': c
='0'; break;
167 case 'a': c
='5'; break;
168 case 'b': c
='4'; break;
169 case 'c': c
='3'; break;
170 case 'd': c
='2'; break;
171 case 'e': c
='1'; break;
172 case 'f': c
='0'; break;
174 FATAL_ERROR("not4b_hex(): Invalid char in hexstring.");
180 string
* and4b(const string
& left
, const string
& right
)
182 string
*s
=new string(left
);
183 for(size_t i
=0; i
<s
->size(); i
++) {
185 c
=hexdigit_to_char(char_to_hexdigit(c
) & char_to_hexdigit(right
[i
]));
190 string
* or4b(const string
& left
, const string
& right
)
192 string
*s
=new string(left
);
193 for(size_t i
=0; i
<s
->size(); i
++) {
195 c
=hexdigit_to_char(char_to_hexdigit(c
) | char_to_hexdigit(right
[i
]));
200 string
* xor4b(const string
& left
, const string
& right
)
202 string
*s
=new string(left
);
203 for(size_t i
=0; i
<s
->size(); i
++) {
205 c
=hexdigit_to_char(char_to_hexdigit(c
) ^ char_to_hexdigit(right
[i
]));
210 string
* shift_left(const string
& value
, const Int
& count
)
213 string
*s
= new string
;
214 if (count
< static_cast<Int
>(value
.size())) *s
= value
.substr(count
);
215 s
->resize(value
.size(), '0');
217 } else if (count
< 0) return shift_right(value
, -count
);
218 else return new string(value
);
221 string
* shift_right(const string
& value
, const Int
& count
)
224 string
*s
= new string
;
225 if (count
< static_cast<Int
>(value
.size())) {
226 s
->resize(count
, '0');
227 *s
+= value
.substr(0, value
.size()-count
);
228 } else s
->resize(value
.size(), '0');
230 } else if (count
< 0) return shift_left(value
, -count
);
231 else return new string(value
);
234 string
* rotate_left(const string
& value
, const Int
& p_count
)
236 size_t size
= value
.size();
237 if (size
== 0) return new string(value
);
238 else if (p_count
< 0) return rotate_right(value
, -p_count
);
239 size_t count
= p_count
% size
;
240 if (count
== 0) return new string(value
);
241 else return new string(value
.substr(count
) + value
.substr(0, count
));
244 string
* rotate_right(const string
& value
, const Int
& p_count
)
246 size_t size
= value
.size();
247 if (size
== 0) return new string(value
);
248 else if (p_count
< 0) return rotate_left(value
, -p_count
);
249 size_t count
= p_count
% size
;
250 if (count
== 0) return new string(value
);
251 else return new string(value
.substr(size
- count
) +
252 value
.substr(0, size
- count
));
256 ustring
* rotate_left(const ustring
& value
, const Int
& p_count
)
258 size_t size
= value
.size();
259 if (size
== 0) return new ustring(value
);
260 else if (p_count
< 0) return rotate_right(value
, -p_count
);
261 size_t count
= p_count
% size
;
262 if (count
== 0) return new ustring(value
);
263 else return new ustring(value
.substr(count
) + value
.substr(0, count
));
266 ustring
* rotate_right(const ustring
& value
, const Int
& p_count
)
268 size_t size
= value
.size();
269 if (size
== 0) return new ustring(value
);
270 else if (p_count
< 0) return rotate_left(value
, -p_count
);
271 size_t count
= p_count
% size
;
272 if (count
== 0) return new ustring(value
);
273 else return new ustring(value
.substr(size
- count
) +
274 value
.substr(0, size
- count
));
277 int_val_t
* bit2int(const string
& bstr
)
279 size_t nof_bits
= bstr
.size();
280 // skip the leading zeros
281 size_t start_index
= 0;
282 while (start_index
< nof_bits
&& bstr
[start_index
] == '0') start_index
++;
283 int_val_t
*ret_val
= new int_val_t((Int
)0);
284 for (size_t i
= start_index
; i
< nof_bits
; i
++) {
286 if (bstr
[i
] == '1') *ret_val
+= 1;
291 int_val_t
* hex2int(const string
& hstr
)
293 size_t nof_digits
= hstr
.size();
294 size_t start_index
= 0;
295 // Skip the leading zeros.
296 while (start_index
< nof_digits
&& hstr
[start_index
] == '0')
298 int_val_t
*ret_val
= new int_val_t((Int
)0);
299 for (size_t i
= start_index
; i
< nof_digits
; i
++) {
301 *ret_val
+= char_to_hexdigit(hstr
[i
]);
306 Int
unichar2int(const ustring
& ustr
)
308 if (ustr
.size() != 1) FATAL_ERROR("unichar2int(): invalid argument");
309 const ustring::universal_char
& uchar
= ustr
.u_str()[0];
310 Int ret_val
= (uchar
.group
<< 24) | (uchar
.plane
<< 16) | (uchar
.row
<< 8) |
315 string
*int2bit(const int_val_t
& value
, const Int
& length
)
317 if (length
< 0) FATAL_ERROR("int2bit(): negative length");
318 size_t string_length
= static_cast<size_t>(length
);
319 if (static_cast<Int
>(string_length
) != length
||
320 string_length
> string::max_string_len
)
321 FATAL_ERROR("int2bit(): length is too large");
322 if (value
< 0) FATAL_ERROR("int2bit(): negative value");
323 string
*bstr
= new string
;
324 bstr
->resize(string_length
);
325 int_val_t tmp_value
= value
;
326 for (size_t i
= 1; i
<= string_length
; i
++) {
327 (*bstr
)[string_length
- i
] = (tmp_value
& 1).get_val() ? '1' : '0';
331 FATAL_ERROR("int2bit(): %s does not fit in %lu bits", \
332 value
.t_str().c_str(), (unsigned long)string_length
);
// Upper-case hexadecimal digits indexed by their value (0..15).
static const char hdigits[16] = { '0', '1', '2', '3', '4', '5', '6', '7',
                                  '8', '9', 'A', 'B', 'C', 'D', 'E', 'F' };
339 string
*int2hex(const int_val_t
& value
, const Int
& length
)
342 FATAL_ERROR("int2hex(): negative length");
343 size_t string_length
= static_cast<size_t>(length
);
344 if (static_cast<Int
>(string_length
) != length
||
345 string_length
> string::max_string_len
)
346 FATAL_ERROR("int2hex(): length is too large");
347 if (value
< 0) FATAL_ERROR("int2hex(): negative value");
348 string
*hstr
= new string
;
349 hstr
->resize(string_length
);
350 int_val_t tmp_value
= value
;
351 for (size_t i
= 1; i
<= string_length
; i
++) {
352 (*hstr
)[string_length
- i
] = hdigits
[(tmp_value
& 0x0f).get_val()];
355 if (tmp_value
!= 0) {
356 FATAL_ERROR("int2hex(): %s does not fit in %lu hexadecimal digits",
357 value
.t_str().c_str(), (unsigned long)string_length
);
362 ustring
*int2unichar(const Int
& value
)
364 if (value
< 0 || value
> 2147483647)
365 FATAL_ERROR("int2unichar(): invalid argument");
366 unsigned char group
= (value
>> 24) & 0xFF,
367 plane
= (value
>> 16) & 0xFF,
368 row
= (value
>> 8) & 0xFF,
370 return new ustring(group
, plane
, row
, cell
);
373 string
*oct2char(const string
& ostr
)
375 string
*cstr
= new string
;
376 size_t ostr_size
= ostr
.size();
378 FATAL_ERROR("oct2char(): argument has odd length: %lu",
379 (unsigned long) ostr_size
);
380 size_t cstr_size
= ostr_size
/ 2;
381 cstr
->resize(cstr_size
);
382 const char *ostr_ptr
= ostr
.c_str();
383 for (size_t i
= 0; i
< cstr_size
; i
++) {
384 unsigned char c
= 16 * char_to_hexdigit(ostr_ptr
[2 * i
]) +
385 char_to_hexdigit(ostr_ptr
[2 * i
+ 1]);
386 if (c
> 127) FATAL_ERROR("oct2char(): resulting charstring contains " \
387 "non-ascii character: %d", c
);
393 string
*char2oct(const string
& cstr
)
395 string
*ostr
= new string
;
396 size_t cstr_size
= cstr
.size();
397 ostr
->resize(cstr_size
* 2, '0');
398 const char *cstr_ptr
= cstr
.c_str();
399 for (size_t i
= 0; i
< cstr_size
; i
++) {
400 unsigned char c
= cstr_ptr
[i
];
401 (*ostr
)[2 * i
] = hexdigit_to_char(c
/ 16);
402 (*ostr
)[2 * i
+ 1] = hexdigit_to_char(c
% 16);
407 string
*bit2hex(const string
& bstr
)
409 size_t size
=bstr
.size();
410 size_t hsize
=(size
+3)/4;
411 string
*hstr
= new string
;
415 bstr4
->resize(hsize
*4,'0');
416 bstr4
->replace(4-(size
%4),size
,bstr
);
418 hstr
->resize(hsize
,'0');
420 for(size_t i
=0;i
<hsize
;i
++) {
422 if(size
%4)b4
=bstr4
->substr(i
*4,4);
423 else b4
=bstr
.substr(i
*4,4);
424 if(b4
[0]=='1')u
=8;else u
=0;
428 (*hstr
)[i
]=hdigits
[u
];
430 if(bstr4
!=NULL
)delete bstr4
;
434 string
*hex2oct(const string
& hstr
)
436 if(hstr
.size()%2==0)return new string(hstr
);
438 string
*ostr
=new string("0");
444 string
*asn_hex2oct(const string
& hstr
)
446 string
*ostr
= new string(hstr
);
447 size_t size
= ostr
->size();
448 if (size
% 2) ostr
->resize(size
+ 1, '0');
// Takes a bitstring value and returns a heap-allocated string.
// NOTE(review): only the signature of this function survived the extraction;
// the body is not visible here. Presumably it chains bit2hex() and
// hex2oct() -- TODO confirm against the upstream file.
452 string
*bit2oct(const string
& bstr
)
461 string
*asn_bit2oct(const string
& bstr
)
463 size_t size
= bstr
.size();
464 string
*ostr
= new string
;
465 ostr
->resize(((size
+7)/8)*2);
466 for(size_t i
=0, j
=0; i
<size
; ) {
467 unsigned char digit1
=0, digit2
=0;
468 digit1
+= get_bit_value(bstr
[i
++], 8);
470 digit1
+= get_bit_value(bstr
[i
++], 4);
472 digit1
+= get_bit_value(bstr
[i
++], 2);
474 digit1
+= get_bit_value(bstr
[i
++], 1);
476 digit2
+= get_bit_value(bstr
[i
++], 8);
478 digit2
+= get_bit_value(bstr
[i
++], 4);
480 digit2
+= get_bit_value(bstr
[i
++], 2);
481 if (i
< size
) digit2
+= get_bit_value(bstr
[i
++], 1);
488 (*ostr
)[j
++] = hexdigit_to_char(digit1
);
489 (*ostr
)[j
++] = hexdigit_to_char(digit2
);
494 string
*hex2bit(const string
& hstr
)
496 size_t size
=hstr
.size();
497 string
*bstr
= new string
;
498 bstr
->resize(4*size
);
499 for(size_t i
=0; i
<size
; i
++) {
502 bstr
->replace(4*i
, 4, "0000");
505 bstr
->replace(4*i
, 4, "0001");
508 bstr
->replace(4*i
, 4, "0010");
511 bstr
->replace(4*i
, 4, "0011");
514 bstr
->replace(4*i
, 4, "0100");
517 bstr
->replace(4*i
, 4, "0101");
520 bstr
->replace(4*i
, 4, "0110");
523 bstr
->replace(4*i
, 4, "0111");
526 bstr
->replace(4*i
, 4, "1000");
529 bstr
->replace(4*i
, 4, "1001");
533 bstr
->replace(4*i
, 4, "1010");
537 bstr
->replace(4*i
, 4, "1011");
541 bstr
->replace(4*i
, 4, "1100");
545 bstr
->replace(4*i
, 4, "1101");
549 bstr
->replace(4*i
, 4, "1110");
553 bstr
->replace(4*i
, 4, "1111");
556 FATAL_ERROR("Common::hex2bit(): invalid hexadecimal "
557 "digit in hexstring value");
563 int_val_t
* float2int(const Real
& value
, const Location
& loc
)
565 // We shouldn't mimic generality with `Int'.
566 if (value
>= (Real
)LLONG_MIN
&& value
<= (Real
)LLONG_MAX
)
567 return new int_val_t((Int
)value
);
569 snprintf(buf
, 511, "%f", value
);
570 char *dot
= strchr(buf
, '.');
571 if (!dot
) FATAL_ERROR("Conversion of float value `%f' to integer failed", value
);
572 else memset(dot
, 0, sizeof(buf
) - (dot
- buf
));
573 return new int_val_t(buf
, loc
);
/* TTCN-3 float values that have absolute value smaller than this are
   displayed in exponential notation. Same as in core/Float.hh */
#ifndef MIN_DECIMAL_FLOAT
#define MIN_DECIMAL_FLOAT 1.0E-4
#endif
/* TTCN-3 float values that have absolute value larger or equal than
   this are displayed in exponential notation. Same as in
   core/Float.hh */
#ifndef MAX_DECIMAL_FLOAT
#define MAX_DECIMAL_FLOAT 1.0E+10
#endif
588 string
*float2str(const Real
& value
)
591 if ( (value
> -MAX_DECIMAL_FLOAT
&& value
<= -MIN_DECIMAL_FLOAT
)
592 || (value
>= MIN_DECIMAL_FLOAT
&& value
< MAX_DECIMAL_FLOAT
)
594 snprintf(str_buf
,64,"%f",value
);
595 else snprintf(str_buf
,64,"%e",value
);
596 return new string(str_buf
);
599 string
* regexp(const string
& instr
, const string
& expression
,
605 FATAL_ERROR("regexp(): groupno must be a non-negative integer");
608 // do not report the warnings again
609 // they were already reported while checking the operands
610 unsigned orig_verb_level
= verb_level
;
611 verb_level
&= ~(1|2);
612 char *posix_str
=TTCN_pattern_to_regexp(expression
.c_str());
613 verb_level
= orig_verb_level
;
614 if(posix_str
==NULL
) {
615 FATAL_ERROR("regexp(): Cannot convert pattern `%s' to POSIX-equivalent.",
620 regex_t posix_regexp
;
621 int ret_val
=regcomp(&posix_regexp
, posix_str
, REG_EXTENDED
);
625 char msg
[ERRMSG_BUFSIZE
];
626 regerror(ret_val
, &posix_regexp
, msg
, sizeof(msg
));
627 FATAL_ERROR("regexp(): regcomp() failed: %s", msg
);
631 size_t nmatch
=groupno
+1;
632 if(nmatch
>posix_regexp
.re_nsub
) {
633 FATAL_ERROR("regexp(): requested groupno is %lu, but this expression "
634 "contains only %lu group(s).", (unsigned long) (nmatch
- 1),
635 (unsigned long) posix_regexp
.re_nsub
);
638 regmatch_t
* pmatch
=(regmatch_t
*)Malloc((nmatch
+1)*sizeof(regmatch_t
));
639 ret_val
=regexec(&posix_regexp
, instr
.c_str(), nmatch
+1, pmatch
, 0);
641 if(pmatch
[nmatch
].rm_so
!= -1 && pmatch
[nmatch
].rm_eo
!= -1)
642 retval
= new string(instr
.substr(pmatch
[nmatch
].rm_so
,
643 pmatch
[nmatch
].rm_eo
- pmatch
[nmatch
].rm_so
));
644 else retval
=new string();
648 if(ret_val
==REG_NOMATCH
) {
649 regfree(&posix_regexp
);
654 char msg
[ERRMSG_BUFSIZE
];
655 regerror(ret_val
, &posix_regexp
, msg
, sizeof(msg
));
656 FATAL_ERROR("regexp(): regexec() failed: %s", msg
);
659 else regfree(&posix_regexp
);
664 ustring
* regexp(const ustring
& instr
, const ustring
& expression
,
670 FATAL_ERROR("regexp(): groupno must be a non-negative integer");
673 // do not report the warnings again
674 // they were already reported while checking the operands
675 unsigned orig_verb_level
= verb_level
;
676 verb_level
&= ~(1|2);
678 char *posix_str
= TTCN_pattern_to_regexp_uni(
679 expression
.get_stringRepr_for_pattern().c_str(), &user_groups
);
680 if (user_groups
== 0)
681 FATAL_ERROR("regexp(): Cannot find any groups in the second argument.");
682 verb_level
= orig_verb_level
;
683 if(posix_str
==NULL
) {
684 FATAL_ERROR("regexp(): Cannot convert pattern `%s' to POSIX-equivalent.",
685 expression
.get_stringRepr().c_str());
689 regex_t posix_regexp
;
690 int ret_val
=regcomp(&posix_regexp
, posix_str
, REG_EXTENDED
);
694 char msg
[ERRMSG_BUFSIZE
];
695 regerror(ret_val
, &posix_regexp
, msg
, sizeof(msg
));
696 FATAL_ERROR("regexp(): regcomp() failed: %s", msg
);
700 size_t nmatch
=user_groups
[groupno
+1]+1;
701 if(nmatch
>posix_regexp
.re_nsub
) {
702 FATAL_ERROR("regexp(): requested groupno is %lu, but this expression "
703 "contains only %lu group(s).", (unsigned long) (groupno
),
704 (unsigned long) user_groups
[0]);
710 regmatch_t
* pmatch
= (regmatch_t
*)Malloc((nmatch
+1)*sizeof(regmatch_t
));
711 char* tmp
= instr
.convert_to_regexp_form();
712 string
instr_conv(tmp
);
714 ret_val
= regexec(&posix_regexp
, instr_conv
.c_str(), nmatch
+1, pmatch
, 0);
716 if(pmatch
[nmatch
].rm_so
!= -1 && pmatch
[nmatch
].rm_eo
!= -1) {
717 retval
= new ustring(
718 instr_conv
.substr(pmatch
[nmatch
].rm_so
,
719 pmatch
[nmatch
].rm_eo
- pmatch
[nmatch
].rm_so
)
720 .convert_stringRepr_for_pattern());
721 } else { retval
= new ustring(); }
725 if(ret_val
==REG_NOMATCH
) {
726 regfree(&posix_regexp
);
727 retval
=new ustring();
731 char msg
[ERRMSG_BUFSIZE
];
732 regerror(ret_val
, &posix_regexp
, msg
, sizeof(msg
));
733 FATAL_ERROR("regexp(): regexec() failed: %s", msg
);
736 else regfree(&posix_regexp
);
741 string
* remove_bom(const string
& encoded_value
)
743 size_t length
= encoded_value
.size();
744 if (0 == length
) return new string();
745 if (length
% 2 || 0 > length
) {
746 ERROR("remove_bom(): Wrong string. The number of nibbles (%d) in string "
747 "shall be divisible by 2", static_cast<int>(length
));
748 return new string(encoded_value
);
751 int length_of_BOM
= 0;
752 string
str_uppercase(encoded_value
);
753 size_t enough
= length
> sizeof(utf32be
)-1 ? sizeof(utf32be
)-1 : length
;
754 for (size_t i
= 0; i
< enough
; ++i
) {
755 str_uppercase
[i
] = toupper(encoded_value
[i
]);
758 if (str_uppercase
.find(utf32be
, 0) < length
) length_of_BOM
= sizeof(utf32be
)-1;
759 else if (str_uppercase
.find(utf32le
, 0) < length
) length_of_BOM
= sizeof(utf32le
)-1;
760 else if (str_uppercase
.find(utf16be
, 0) < length
) length_of_BOM
= sizeof(utf16be
)-1;
761 else if (str_uppercase
.find(utf16le
, 0) < length
) length_of_BOM
= sizeof(utf16le
)-1;
762 else if (str_uppercase
.find(utf8
, 0) < length
) length_of_BOM
= sizeof(utf8
)-1;
763 else return new string(encoded_value
); // no BOM found
765 return new string(encoded_value
.substr(length_of_BOM
, length
));
768 static CharCoding::CharCodingType
is_ascii (size_t length
, const unsigned char* strptr
)
770 const unsigned char nonASCII
= 1 << 7;// MSB is 1 in case of non ASCII character
771 CharCoding::CharCodingType ret
= CharCoding::ASCII
;
772 for (size_t i
= 0; i
< length
; ++i
) {
773 if ( strptr
[i
] & nonASCII
) {
774 ret
= CharCoding::UNKNOWN
;
781 static CharCoding::CharCodingType
is_utf8(size_t length
, const unsigned char* strptr
)
783 const char MSB
= 1 << 7; // MSB is 1 in case of non ASCII character
784 const char MSBmin1
= 1 << 6; // 0100 0000
787 if ( strptr
[i
] & MSB
) { // non ASCII char
788 char maskUTF8
= 1 << 6; // 111x xxxx shows how many additional bytes are there
789 if (!(strptr
[i
] & maskUTF8
)) return CharCoding::UNKNOWN
; // accepted 11xxx xxxx but received 10xx xxxx
790 unsigned int noofUTF8
= 0; // 11xx xxxxx -> 2 bytes, 111x xxxxx -> 3 bytes , 1111 xxxxx -> 4 bytes in UTF-8
791 while (strptr
[i
] & maskUTF8
) {
793 maskUTF8
>>= 1; // shift right the mask
795 // the second and third (and so on) UTF-8 byte looks like 10xx xxxx
796 while (0 < noofUTF8
) {
798 if (!(strptr
[i
] & MSB
) || (strptr
[i
] & MSBmin1
) || i
>= length
) { // if not like this: 10xx xxxx
799 return CharCoding::UNKNOWN
;
806 return CharCoding::UTF_8
;
809 string
* get_stringencoding(const string
& encoded_value
)
811 size_t length
= encoded_value
.size();
812 if (0 == length
) return new string("<unknown>");
813 if (length
% 2 || 0 > length
) {
814 ERROR("get_stringencoding(): Wrong string. The number of nibbles (%d) in string "
815 "shall be divisible by 2", static_cast<int>(length
));
816 return new string("<unknown>");
819 string
str_uppercase(encoded_value
);
820 size_t enough
= length
> sizeof(utf32be
)-1 ? sizeof(utf32be
)-1 : length
;
821 for (size_t i
= 0; i
< enough
; ++i
) {
822 str_uppercase
[i
] = toupper(encoded_value
[i
]);
825 if (str_uppercase
.find(utf32be
, 0) < length
) return new string("UTF-32BE");
826 else if (str_uppercase
.find(utf32le
, 0) < length
) return new string("UTF-32LE");
827 else if (str_uppercase
.find(utf16be
, 0) < length
) return new string("UTF-16BE");
828 else if (str_uppercase
.find(utf16le
, 0) < length
) return new string("UTF-16LE");
829 else if (str_uppercase
.find(utf8
, 0) < length
) return new string("UTF-8");
831 unsigned char *uc_str
= new unsigned char[length
/2];
833 for (size_t i
= 0; i
< length
/ 2; ++i
) {
834 uc_str
[i
] = str2uchar(encoded_value
[2 * i
], encoded_value
[2 * i
+ 1]);
836 if (is_ascii (length
/ 2, uc_str
) == CharCoding::ASCII
) ret
= "ASCII";
837 else if (CharCoding::UTF_8
== is_utf8 (length
/ 2, uc_str
)) ret
= "UTF-8";
838 else ret
= "<unknown>";
841 return new string(ret
);
844 static size_t check_BOM(CharCoding::CharCodingType expected_coding
, size_t n_uc
, unsigned char* uc_str
)
846 if (0 == n_uc
) return 0;
848 switch (expected_coding
) {
849 case CharCoding::UTF32
:
850 case CharCoding::UTF32BE
:
851 case CharCoding::UTF32LE
:
853 ERROR("decode_utf32(): The string is shorter than the expected BOM");
857 case CharCoding::UTF16
:
858 case CharCoding::UTF16BE
:
859 case CharCoding::UTF16LE
:
861 ERROR("decode_utf16(): The string is shorter than the expected BOM");
868 //BOM indicates that the byte order is determined by a byte order mark,
869 //if present at the beginning the length of BOM is returned.
873 switch (expected_coding
) {
874 case CharCoding::UTF32BE
:
875 case CharCoding::UTF32
:
876 if (0x00 == uc_str
[0] && 0x00 == uc_str
[1] && 0xFE == uc_str
[2] && 0xFF == uc_str
[3])
879 caller
= "decode_utf32()";
882 case CharCoding::UTF32LE
:
883 if (0xFF == uc_str
[0] && 0xFE == uc_str
[1] && 0x00 == uc_str
[2] && 0x00 == uc_str
[3])
886 caller
= "decode_utf32()";
889 case CharCoding::UTF16BE
:
890 case CharCoding::UTF16
:
891 if (0xFE == uc_str
[0] && 0xFF == uc_str
[1])
894 caller
= "decode_utf16()";
897 case CharCoding::UTF16LE
:
898 if (0xFF == uc_str
[0] && 0xFE == uc_str
[1])
901 caller
= "decode_utf16()";
904 case CharCoding::UTF_8
:
905 if (0xEF == uc_str
[0] && 0xBB == uc_str
[1] && 0xBF == uc_str
[2])
909 if (CharCoding::UTF32
== expected_coding
|| CharCoding::UTF16
== expected_coding
) {
910 const char* str
= CharCoding::UTF32
== expected_coding
? "UTF-32" : "UTF-16";
911 ERROR("Wrong %s string. No BOM detected, however the given coding type (%s) "
912 "expects it to define the endianness", str
, str
);
915 ERROR("Wrong string. No BOM detected");
918 if (badBOM
) ERROR("%s: Wrong %s string. The expected coding could not be verified",
919 caller
.c_str(), errmsg
.c_str());
923 static void fill_continuing_octets(int n_continuing
, unsigned char *continuing_ptr
,
924 size_t n_uc
, const unsigned char* uc_str
, int start_pos
,
927 for (int i
= 0; i
< n_continuing
; i
++) {
928 if (start_pos
+ i
< static_cast<int>(n_uc
)) {
929 unsigned char octet
= uc_str
[start_pos
+ i
];
930 if ((octet
& 0xC0) != 0x80) {
931 ERROR("decode_utf8(): Malformed: At character position %u, octet position %u: %02X is "
932 "not a valid continuing octet.", uchar_pos
, start_pos
+ i
, octet
);
935 continuing_ptr
[i
] = octet
& 0x3F;
938 if (start_pos
+ i
== static_cast<int>(n_uc
)) {
940 // only a part of octets is missing
941 ERROR("decode_utf8(): Incomplete: At character position %d, octet position %d: %d out "
942 "of %d continuing octets %s missing from the end of the stream.",
943 uchar_pos
, start_pos
+ i
, n_continuing
- i
, n_continuing
,
944 n_continuing
- i
> 1 ? "are" : "is");
948 // all octets are missing
949 ERROR("decode_utf8(): Incomplete: At character position %d, octet position %d: %d "
950 "continuing octet%s missing from the end of the stream.", uchar_pos
,
951 start_pos
, n_continuing
, n_continuing
> 1 ? "s are" : " is");
955 continuing_ptr
[i
] = 0;
960 ustring
decode_utf8(const string
& ostr
, CharCoding::CharCodingType expected_coding
)
962 size_t length
= ostr
.size();
963 if (0 == length
) return ustring();
965 ERROR("decode_utf8(): Wrong UTF-8 string. The number of nibbles (%d) in octetstring "
966 "shall be divisible by 2", static_cast<int>(length
));
970 unsigned char *uc_str
= new unsigned char[length
/2];
971 for (size_t i
= 0; i
< length
/ 2; ++i
) {
972 uc_str
[i
] = str2uchar(ostr
[2 * i
], ostr
[2 * i
+ 1]);
975 size_t start
= check_BOM(CharCoding::UTF_8
, length
/2, uc_str
);
977 for (size_t i
= start
; i
< length
/ 2;) {
978 // perform the decoding character by character
979 if (uc_str
[i
] <= 0x7F) {
980 // character encoded on a single octet: 0xxxxxxx (7 useful bits)
984 unsigned char c
= uc_str
[i
];
985 ucstr
+= ustring(g
, p
, r
, c
);
988 else if (uc_str
[i
] <= 0xBF) {
989 // continuing octet (10xxxxxx) without leading octet ==> malformed
990 ERROR("decode_utf8(): Malformed: At character position %d, octet position %d: continuing "
991 "octet %02X without leading octet.", static_cast<int>(ucstr
.size()),
992 static_cast<int>(i
), uc_str
[i
]);
995 else if (uc_str
[i
] <= 0xDF) {
996 // character encoded on 2 octets: 110xxxxx 10xxxxxx (11 useful bits)
997 unsigned char octets
[2];
998 octets
[0] = uc_str
[i
] & 0x1F;
999 fill_continuing_octets(1, octets
+ 1, length
/ 2, uc_str
, i
+ 1, ucstr
.size());
1000 unsigned char g
= 0;
1001 unsigned char p
= 0;
1002 unsigned char r
= octets
[0] >> 2;
1003 unsigned char c
= octets
[0] << 6 | octets
[1];
1004 if (r
== 0x00 && c
< 0x80) {
1005 ERROR("decode_utf8(): Overlong: At character position %d, octet position %d: 2-octet "
1006 "encoding for quadruple (0, 0, 0, %u).", static_cast<int>(ucstr
.size()),
1007 static_cast<int>(i
), c
);
1010 ucstr
+= ustring(g
, p
, r
, c
);
1013 else if (uc_str
[i
] <= 0xEF) {
1014 // character encoded on 3 octets: 1110xxxx 10xxxxxx 10xxxxxx
1016 unsigned char octets
[3];
1017 octets
[0] = uc_str
[i
] & 0x0F;
1018 fill_continuing_octets(2, octets
+ 1, length
/ 2, uc_str
, i
+ 1,ucstr
.size());
1019 unsigned char g
= 0;
1020 unsigned char p
= 0;
1021 unsigned char r
= octets
[0] << 4 | octets
[1] >> 2;
1022 unsigned char c
= octets
[1] << 6 | octets
[2];
1024 ERROR("decode_utf8(): Overlong: At character position %d, octet position %d: 3-octet "
1025 "encoding for quadruple (0, 0, %u, %u).", static_cast<int>(ucstr
.size()),
1026 static_cast<int>(i
), r
, c
);
1029 ucstr
+= ustring(g
, p
, r
, c
);
1032 else if (uc_str
[i
] <= 0xF7) {
1033 // character encoded on 4 octets: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
1035 unsigned char octets
[4];
1036 octets
[0] = uc_str
[i
] & 0x07;
1037 fill_continuing_octets(3, octets
+ 1, length
/ 2, uc_str
, i
+ 1, ucstr
.size());
1038 unsigned char g
= 0;
1039 unsigned char p
= octets
[0] << 2 | octets
[1] >> 4;
1040 unsigned char r
= octets
[1] << 4 | octets
[2] >> 2;
1041 unsigned char c
= octets
[2] << 6 | octets
[3];
1043 ERROR("decode_utf8(): Overlong: At character position %d, octet position %d: 4-octet "
1044 "encoding for quadruple (0, 0, %u, %u).", static_cast<int>(ucstr
.size()),
1045 static_cast<int>(i
), r
, c
);
1048 ucstr
+= ustring(g
, p
, r
, c
);
1051 else if (uc_str
[i
] <= 0xFB) {
1052 // character encoded on 5 octets: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx
1053 // 10xxxxxx (26 useful bits)
1054 unsigned char octets
[5];
1055 octets
[0] = uc_str
[i
] & 0x03;
1056 fill_continuing_octets(4, octets
+ 1, length
/ 2, uc_str
, i
+ 1, ucstr
.size());
1057 unsigned char g
= octets
[0];
1058 unsigned char p
= octets
[1] << 2 | octets
[2] >> 4;
1059 unsigned char r
= octets
[2] << 4 | octets
[3] >> 2;
1060 unsigned char c
= octets
[3] << 6 | octets
[4];
1061 if (g
== 0x00 && p
< 0x20) {
1062 ERROR("decode_utf8(): Overlong: At character position %d, octet position %d: 5-octet "
1063 "encoding for quadruple (0, %u, %u, %u).", static_cast<int>(ucstr
.size()),
1064 static_cast<int>(i
), p
, r
, c
);
1067 ucstr
+= ustring(g
, p
, r
, c
);
1070 else if (uc_str
[i
] <= 0xFD) {
1071 // character encoded on 6 octets: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx
1072 // 10xxxxxx 10xxxxxx (31 useful bits)
1073 unsigned char octets
[6];
1074 octets
[0] = uc_str
[i
] & 0x01;
1075 fill_continuing_octets(5, octets
+ 1, length
/ 2, uc_str
, i
+ 1,ucstr
.size());
1076 unsigned char g
= octets
[0] << 6 | octets
[1];
1077 unsigned char p
= octets
[2] << 2 | octets
[3] >> 4;
1078 unsigned char r
= octets
[3] << 4 | octets
[4] >> 2;
1079 unsigned char c
= octets
[4] << 6 | octets
[5];
1081 ERROR("decode_utf8(): Overlong: At character position %d, octet position %d: 6-octet "
1082 "encoding for quadruple (%u, %u, %u, %u).", static_cast<int>(ucstr
.size()),
1083 static_cast<int>(i
), g
, p
, r
, c
);
1086 ucstr
+= ustring(g
, p
, r
, c
);
1090 // not used code points: FE and FF => malformed
1091 ERROR("decode_utf8(): Malformed: At character position %d, octet position %d: "
1092 "unused/reserved octet %02X.", static_cast<int>(ucstr
.size()),
1093 static_cast<int>(i
), uc_str
[i
]);