1 /* Character set conversion support for GDB.
3 Copyright (C) 2001, 2003, 2007, 2008, 2009 Free Software Foundation, Inc.
5 This file is part of GDB.
7 This program is free software; you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3 of the License, or
10 (at your option) any later version.
12 This program is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with this program. If not, see <http://www.gnu.org/licenses/>. */
23 #include "gdb_assert.h"
24 #include "gdb_obstack.h"
26 #include "charset-list.h"
31 #include "gdb_string.h"
35 /* How GDB's character set support works
37 GDB has three global settings:
39 - The `current host character set' is the character set GDB should
40 use in talking to the user, and which (hopefully) the user's
41 terminal knows how to display properly. Most users should not change this.
44 - The `current target character set' is the character set the
45 program being debugged uses.
47 - The `current target wide character set' is the wide character set
48 the program being debugged uses, that is, the encoding used for wchar_t.
51 There are commands to set each of these, and mechanisms for
52 choosing reasonable default values. GDB has a global list of
53 character sets that it can use as its host or target character sets.
56 The header file `charset.h' declares various functions that
57 different pieces of GDB need to perform tasks like:
59 - printing target strings and characters to the user's terminal
60 (mostly target->host conversions),
62 - building target-appropriate representations of strings and
63 characters the user enters in expressions (mostly host->target conversions).
68 To avoid excessive code duplication and maintenance efforts,
69 GDB simply requires a capable iconv function. Users on platforms
70 without a suitable iconv can use the GNU iconv library. */
75 /* Provide a phony iconv that does as little as possible. Also,
76 arrange for there to be a single available character set. */
78 #undef GDB_DEFAULT_HOST_CHARSET
79 #define GDB_DEFAULT_HOST_CHARSET "ISO-8859-1"
80 #define GDB_DEFAULT_TARGET_CHARSET "ISO-8859-1"
81 #define GDB_DEFAULT_TARGET_WIDE_CHARSET "ISO-8859-1"
82 #undef DEFAULT_CHARSET_NAMES
83 #define DEFAULT_CHARSET_NAMES GDB_DEFAULT_HOST_CHARSET ,
92 #define ICONV_CONST const
94 /* Some systems don't have EILSEQ, so we define it here, but not as
95 EINVAL, because callers of `iconv' want to distinguish EINVAL and
96 EILSEQ. This is what iconv.h from libiconv does as well. Note
97 that wchar.h may also define EILSEQ, so this needs to be after we
98 include wchar.h, which happens in defs.h through gdb_wchar.h. */
100 #define EILSEQ ENOENT
/* Phony replacement for iconv_open.  Note the argument order: the
   destination charset TO comes first, the source charset FROM second.
   FROM is compared against "UTF-32BE", "wchar_t" and
   GDB_DEFAULT_HOST_CHARSET, and TO against "wchar_t" and
   GDB_DEFAULT_HOST_CHARSET.  The statements guarded by those two
   checks are not part of this excerpt (presumably they reject the
   conversion -- confirm against the full file).  The final return
   value is nonzero exactly when FROM is "UTF-32BE".  */
104 iconv_open (const char *to
, const char *from
)
106 /* We allow conversions from UTF-32BE, wchar_t, and the host charset.
107 We allow conversions to wchar_t and the host charset. */
/* strcmp yields nonzero on a mismatch, so the condition below holds
   only when FROM matches none of the three accepted names.  */
108 if (strcmp (from
, "UTF-32BE") && strcmp (from
, "wchar_t")
109 && strcmp (from
, GDB_DEFAULT_HOST_CHARSET
))
/* Likewise, this holds only when TO matches neither accepted name.  */
111 if (strcmp (to
, "wchar_t") && strcmp (to
, GDB_DEFAULT_HOST_CHARSET
))
114 /* Return 1 if we are converting from UTF-32BE, 0 otherwise. This is
115 used as a flag in calls to iconv. */
116 return !strcmp (from
, "UTF-32BE");
120 iconv_close (iconv_t arg
)
/* Phony replacement for iconv.  UTF_FLAG is the descriptor produced by
   the phony iconv_open; per the comment there it is 1 when converting
   from UTF-32BE and 0 otherwise.  INBUF/INBYTESLEFT describe the
   remaining input and OUTBUF/OUTBYTESLEFT the remaining output space.

   Two paths are visible in this excerpt: a loop that runs while at
   least four input bytes remain and adds each of the next four bytes
   (masked to eight bits) into C, and a plain copy of AMT bytes from
   *INBUF to *OUTBUF with memcpy, after which *OUTBYTESLEFT is reduced
   by AMT.  The code that chooses between the paths, the handling of
   AMT > *OUTBYTESLEFT and of fewer than four remaining bytes, and the
   return statements are on lines not shown here.  */
126 iconv (iconv_t utf_flag
, const char **inbuf
, size_t *inbytesleft
,
127 char **outbuf
, size_t *outbytesleft
)
/* Consume the input in groups of four bytes.  */
131 while (*inbytesleft
>= 4)
/* Fold the four bytes of the current group into C, one at a time.  */
136 for (j
= 0; j
< 4; ++j
)
139 c
+= (*inbuf
)[j
] & 0xff;
154 if (*inbytesleft
< 4)
162 /* In all other cases we simply copy input bytes to the
164 size_t amt
= *inbytesleft
;
165 if (amt
> *outbytesleft
)
167 memcpy (*outbuf
, *inbuf
, amt
);
171 *outbytesleft
-= amt
;
180 /* The number of non-reversible conversions -- but they were all
189 /* The global lists of character sets and translations. */
192 #ifndef GDB_DEFAULT_TARGET_CHARSET
193 #define GDB_DEFAULT_TARGET_CHARSET "ISO-8859-1"
196 #ifndef GDB_DEFAULT_TARGET_WIDE_CHARSET
197 #define GDB_DEFAULT_TARGET_WIDE_CHARSET "UTF-32"
200 static const char *auto_host_charset_name
= GDB_DEFAULT_HOST_CHARSET
;
201 static const char *host_charset_name
= "auto";
/* Print the host character set setting to FILE using
   fprintf_filtered.  When VALUE is the literal "auto" the message
   also reports auto_host_charset_name, the name "auto" currently
   stands for; otherwise VALUE itself is printed.  FROM_TTY and C are
   not used by the visible code.  (The declaration of VALUE is on a
   line not shown in this excerpt.)  */
203 show_host_charset_name (struct ui_file
*file
, int from_tty
,
204 struct cmd_list_element
*c
,
/* strcmp returns 0 on equality, hence the negation.  */
207 if (!strcmp (value
, "auto"))
208 fprintf_filtered (file
,
209 _("The host character set is \"auto; currently %s\".\n"),
210 auto_host_charset_name
);
/* Any other setting is printed as-is.  */
212 fprintf_filtered (file
, _("The host character set is \"%s\".\n"), value
);
215 static const char *target_charset_name
= GDB_DEFAULT_TARGET_CHARSET
;
/* Print the target character set setting to FILE using
   fprintf_filtered.  The format has a single %s; its argument is on a
   line not shown in this excerpt (presumably VALUE -- confirm).
   FROM_TTY and C are not used by the visible code.  */
217 show_target_charset_name (struct ui_file
*file
, int from_tty
,
218 struct cmd_list_element
*c
, const char *value
)
220 fprintf_filtered (file
, _("The target character set is \"%s\".\n"),
224 static const char *target_wide_charset_name
= GDB_DEFAULT_TARGET_WIDE_CHARSET
;
/* Print the target wide character set setting to FILE using
   fprintf_filtered.  The format has a single %s; its argument is on a
   line not shown in this excerpt (presumably VALUE -- confirm).
   FROM_TTY and C are not used by the visible code.  */
226 show_target_wide_charset_name (struct ui_file
*file
, int from_tty
,
227 struct cmd_list_element
*c
, const char *value
)
229 fprintf_filtered (file
, _("The target wide character set is \"%s\".\n"),
233 static const char *default_charset_names
[] =
235 DEFAULT_CHARSET_NAMES
239 static const char **charset_enum
;
242 /* If the target wide character set has big- or little-endian
243 variants, these are the corresponding names. */
244 static const char *target_wide_charset_be_name
;
245 static const char *target_wide_charset_le_name
;
247 /* A helper function for validate which sets the target wide big- and
248 little-endian character set names, if possible. */
/* Both names are first reset to NULL.  The NULL-terminated list
   charset_enum is then scanned for entries that consist of
   target_wide_charset_name followed by exactly "BE" or "LE"; a match
   is recorded in target_wide_charset_be_name or
   target_wide_charset_le_name respectively.  A name is left NULL when
   no such variant is listed.  */
251 set_be_le_names (void)
255 target_wide_charset_le_name
= NULL
;
256 target_wide_charset_be_name
= NULL
;
/* LEN is the length of the base name; the suffix is examined at
   offsets LEN, LEN + 1 and LEN + 2 of each candidate.  */
258 len
= strlen (target_wide_charset_name
);
259 for (i
= 0; charset_enum
[i
]; ++i
)
/* Nonzero from strncmp means this entry does not start with the base
   name.  The statement this guards is on a line not shown
   (presumably it skips the entry).  */
261 if (strncmp (target_wide_charset_name
, charset_enum
[i
], len
))
/* Accept only a two-character suffix: 'B' or 'L', then 'E', then the
   terminating NUL.  */
263 if ((charset_enum
[i
][len
] == 'B'
264 || charset_enum
[i
][len
] == 'L')
265 && charset_enum
[i
][len
+ 1] == 'E'
266 && charset_enum
[i
][len
+ 2] == '\0')
/* The first suffix character selects which of the two names is set.  */
268 if (charset_enum
[i
][len
] == 'B')
269 target_wide_charset_be_name
= charset_enum
[i
];
271 target_wide_charset_le_name
= charset_enum
[i
];
276 /* 'Set charset', 'set host-charset', 'set target-charset', 'set
277 target-wide-charset', 'set charset' sfunc's. */
/* Check that conversions from the current host charset are possible.
   An iconv descriptor is opened from HOST_CSET to
   target_wide_charset_name, and then from HOST_CSET to
   target_charset_name (iconv_open takes the destination first).  If
   either open returns (iconv_t) -1, error is called with both names.
   The enclosing function header, and whatever is done with DESC after
   a successful open, are on lines not shown in this excerpt.  */
283 const char *host_cset
= host_charset ();
/* First the wide target charset.  */
285 desc
= iconv_open (target_wide_charset_name
, host_cset
);
286 if (desc
== (iconv_t
) -1)
287 error ("Cannot convert between character sets `%s' and `%s'",
288 target_wide_charset_name
, host_cset
);
/* Then the narrow target charset.  */
291 desc
= iconv_open (target_charset_name
, host_cset
);
292 if (desc
== (iconv_t
) -1)
293 error ("Cannot convert between character sets `%s' and `%s'",
294 target_charset_name
, host_cset
);
300 /* This is the sfunc for the 'set charset' command. */
/* The "charset" command is registered with &host_charset_name as its
   variable, so the new value is already in host_charset_name when
   this runs; the assignment below copies it to target_charset_name.
   CHARSET, FROM_TTY and C are not used by the visible code.  Any
   further statements are on lines not shown in this excerpt.  */
302 set_charset_sfunc (char *charset
, int from_tty
, struct cmd_list_element
*c
)
304 /* CAREFUL: set the target charset here as well. */
305 target_charset_name
= host_charset_name
;
309 /* 'set host-charset' command sfunc. We need a wrapper here because
310 the function needs to have a specific signature. */
312 set_host_charset_sfunc (char *charset
, int from_tty
,
313 struct cmd_list_element
*c
)
318 /* Wrapper for the 'set target-charset' command. */
320 set_target_charset_sfunc (char *charset
, int from_tty
,
321 struct cmd_list_element
*c
)
326 /* Wrapper for the 'set target-wide-charset' command. */
328 set_target_wide_charset_sfunc (char *charset
, int from_tty
,
329 struct cmd_list_element
*c
)
334 /* sfunc for the 'show charset' command. */
/* Reports all three settings by calling the individual show functions
   in turn, passing FILE, FROM_TTY and C through and supplying the
   current host, target and target-wide charset names as the value to
   print.  The last parameter of this function is on a line not shown
   and is not used by the visible code.  */
336 show_charset (struct ui_file
*file
, int from_tty
, struct cmd_list_element
*c
,
339 show_host_charset_name (file
, from_tty
, c
, host_charset_name
);
340 show_target_charset_name (file
, from_tty
, c
, target_charset_name
);
341 show_target_wide_charset_name (file
, from_tty
, c
, target_wide_charset_name
);
345 /* Accessor functions. */
/* Resolve the host charset setting: when host_charset_name is the
   literal "auto", return auto_host_charset_name instead; otherwise
   return the setting itself.  (The function header is on a line not
   shown in this excerpt.)  */
350 if (!strcmp (host_charset_name
, "auto"))
351 return auto_host_charset_name
;
352 return host_charset_name
;
/* Return the current target character set name,
   target_charset_name.  */
356 target_charset (void)
358 return target_charset_name
;
/* Return the name of the target wide character set to use for
   BYTE_ORDER.  The endian-specific names target_wide_charset_be_name
   and target_wide_charset_le_name are returned when they are non-NULL;
   the final return falls back to the plain target_wide_charset_name.
   Those two names are filled in by set_be_le_names and may be NULL.  */
362 target_wide_charset (enum bfd_endian byte_order
)
364 if (byte_order
== BFD_ENDIAN_BIG
)
/* Use the big-endian variant when one was found.  */
366 if (target_wide_charset_be_name
)
367 return target_wide_charset_be_name
;
/* Use the little-endian variant when one was found.  The line that
   ties this test to the byte order is not shown in this excerpt.  */
371 if (target_wide_charset_le_name
)
372 return target_wide_charset_le_name
;
375 return target_wide_charset_name
;
379 /* Host character set management. For the time being, we assume that
380 the host character set is some superset of ASCII. */
383 host_letter_to_control_character (char c
)
390 /* Convert a host character, C, to its hex value. C must already have
391 been validated using isxdigit. */
/* Only the range tests are visible in this excerpt; the value
   computed for each range is on lines not shown.  */
394 host_hex_value (char c
)
/* Lower-case hex letters.  */
398 if (c
>= 'a' && c
<= 'f')
/* The assertion states that the input handled here must be an
   upper-case hex letter.  */
400 gdb_assert (c
>= 'A' && c
<= 'F');
405 /* Public character management functions. */
407 /* A cleanup function which is run to close an iconv descriptor. */
/* convert_between_encodings registers this with the address of its
   descriptor, so the descriptor to close is obtained by dereferencing
   DESCP.  DESCP is declared on a line not shown in this excerpt
   (presumably P converted to a pointer to iconv_t -- confirm).  */
410 cleanup_iconv (void *p
)
413 iconv_close (*descp
);
/* Convert the NUM_BYTES bytes at BYTES from charset FROM to charset
   TO, appending the result to the obstack OUTPUT.

   When FROM and TO are the same string the input is appended
   unchanged with obstack_grow.  Otherwise an iconv descriptor is
   opened (perror_with_name is called if that fails) and a cleanup is
   registered to close it.

   TRANSLIT controls invalid input: with translit_none, error is
   called; in the other visible path, WIDTH input bytes are each
   appended as a backslash followed by three octal digits.  WIDTH is
   therefore the number of input bytes treated as one character when
   escaping.

   The loop structure, the errno tests that select among the error
   branches, and the updates to INLEFT and SPACE_REQUEST between
   passes are on lines not shown in this excerpt.  */
417 convert_between_encodings (const char *from
, const char *to
,
418 const gdb_byte
*bytes
, unsigned int num_bytes
,
419 int width
, struct obstack
*output
,
420 enum transliterations translit
)
423 struct cleanup
*cleanups
;
426 unsigned int space_request
;
428 /* Often, the host and target charsets will be the same. */
429 if (!strcmp (from
, to
))
431 obstack_grow (output
, bytes
, num_bytes
);
/* iconv_open takes the destination charset first.  */
435 desc
= iconv_open (to
, from
);
436 if (desc
== (iconv_t
) -1)
437 perror_with_name ("Converting character sets");
/* The address of DESC is passed so that cleanup_iconv can close the
   descriptor when the cleanups are run.  */
438 cleanups
= make_cleanup (cleanup_iconv
, &desc
);
441 inp
= (char *) bytes
;
/* The initial output space request equals the input size.  */
443 space_request
= num_bytes
;
/* Extend the object on the obstack by SPACE_REQUEST bytes and point
   OUTP at the start of the newly added region; OLD_SIZE is the object
   size before the extension.  */
451 old_size
= obstack_object_size (output
);
452 obstack_blank (output
, space_request
);
454 outp
= obstack_base (output
) + old_size
;
455 outleft
= space_request
;
457 r
= iconv (desc
, (ICONV_CONST
char **) &inp
, &inleft
, &outp
, &outleft
);
459 /* Now make sure that the object on the obstack only includes
460 bytes we have converted. */
/* OUTLEFT is the space iconv left unused; a negative size passed to
   obstack_blank shrinks the object by that amount.  */
461 obstack_blank (output
, - (int) outleft
);
/* iconv reports failure by returning (size_t) -1.  */
463 if (r
== (size_t) -1)
471 /* Invalid input sequence. */
472 if (translit
== translit_none
)
473 error (_("Could not convert character to `%s' character set"),
476 /* We emit escape sequence for the bytes, skip them,
478 for (i
= 0; i
< width
; ++i
)
482 sprintf (octal
, "\\%.3o", *inp
& 0xff);
483 obstack_grow_str (output
, octal
);
492 /* We ran out of space in the output buffer. Make it
493 bigger next time around. */
498 /* Incomplete input sequence. FIXME: ought to report this
499 to the caller somehow. */
504 perror_with_name ("Internal error while converting character sets");
/* Run the registered cleanups, which closes the descriptor through
   cleanup_iconv.  */
509 do_cleanups (cleanups
);
514 /* An iterator that returns host wchar_t's from a target string. */
515 struct wchar_iterator
517 /* The underlying iconv descriptor. */
520 /* The input string. This is updated as we convert characters. */
522 /* The number of bytes remaining in the input. */
525 /* The width of an input character. */
528 /* The output buffer and its size. */
533 /* Create a new iterator. */
/* INPUT and BYTES describe the buffer to iterate over, encoded in
   CHARSET.  An iconv descriptor converting from CHARSET to
   INTERMEDIATE_ENCODING is opened; perror_with_name is called if that
   fails.  The iterator is allocated with XNEW and records the input
   pointer, the byte count and WIDTH (declared on a line not shown in
   this excerpt).  Its output buffer starts with room for a single
   gdb_wchar_t.  Storing DESC in the iterator and returning RESULT
   happen on lines not shown here.  */
534 struct wchar_iterator
*
535 make_wchar_iterator (const gdb_byte
*input
, size_t bytes
, const char *charset
,
538 struct wchar_iterator
*result
;
/* iconv_open takes the destination encoding first.  */
541 desc
= iconv_open (INTERMEDIATE_ENCODING
, charset
);
542 if (desc
== (iconv_t
) -1)
543 perror_with_name ("Converting character sets");
545 result
= XNEW (struct wchar_iterator
);
/* The cast drops the const qualifier and the gdb_byte type; the
   visible code in this file only passes this pointer to iconv.  */
547 result
->input
= (char *) input
;
548 result
->bytes
= bytes
;
549 result
->width
= width
;
/* Start with a one-element output buffer; wchar_iterate enlarges it
   with xrealloc when it needs more room.  */
551 result
->out
= XNEW (gdb_wchar_t
);
552 result
->out_size
= 1;
/* Cleanup callback for a wchar_iterator.  P is the iterator; the
   visible code closes its iconv descriptor.  Any release of the
   iterator's memory is on lines not shown in this excerpt.  */
558 do_cleanup_iterator (void *p
)
560 struct wchar_iterator
*iter
= p
;
562 iconv_close (iter
->desc
);
/* Register a cleanup that runs do_cleanup_iterator on ITER, and
   return the value make_cleanup produces.  */
568 make_cleanup_wchar_iterator (struct wchar_iterator
*iter
)
570 return make_cleanup (do_cleanup_iterator
, iter
);
/* Convert the next piece of ITER's input and report what happened
   through the out-parameters.

   *OUT_RESULT is set to one of wchar_iterate_ok,
   wchar_iterate_invalid, wchar_iterate_incomplete or
   wchar_iterate_eof.  On wchar_iterate_ok, *OUT_CHARS points at ITER's
   output buffer, NUM is the count of wide characters produced, and
   *LEN is the number of input bytes consumed.  On
   wchar_iterate_invalid, ITER is advanced by ITER->width bytes.

   OUT_REQUEST, NUM and LEN are declared on lines not shown in this
   excerpt, as are the errno tests that select among the failure
   branches, the return statements, and how *PTR is set.  */
574 wchar_iterate (struct wchar_iterator
*iter
,
575 enum wchar_iterate_result
*out_result
,
576 gdb_wchar_t
**out_chars
,
577 const gdb_byte
**ptr
,
582 /* Try to convert some characters. At first we try to convert just
583 a single character. The reason for this is that iconv does not
584 necessarily update its outgoing arguments when it encounters an
585 invalid input sequence -- but we want to reliably report this to
586 our caller so it can emit an escape sequence. */
588 while (iter
->bytes
> 0)
/* Remember where the output and input start so that progress can be
   measured after the iconv call.  */
590 char *outptr
= (char *) &iter
->out
[0];
591 char *orig_inptr
= iter
->input
;
592 size_t orig_in
= iter
->bytes
;
/* iconv counts output space in bytes, not wide characters.  */
593 size_t out_avail
= out_request
* sizeof (gdb_wchar_t
);
/* iconv advances ITER->input and decrements ITER->bytes itself.  */
597 size_t r
= iconv (iter
->desc
,
598 (ICONV_CONST
char **) &iter
->input
, &iter
->bytes
,
599 &outptr
, &out_avail
);
600 if (r
== (size_t) -1)
605 /* Invalid input sequence. Skip it, and let the caller
607 *out_result
= wchar_iterate_invalid
;
610 iter
->input
+= iter
->width
;
611 iter
->bytes
-= iter
->width
;
615 /* We ran out of space. We still might have converted a
616 character; if so, return it. Otherwise, grow the
617 buffer and try again. */
/* Less space left than was offered means iconv wrote something.  */
618 if (out_avail
< out_request
* sizeof (gdb_wchar_t
))
/* Enlarge the output buffer only when the request exceeds the size
   recorded in the iterator.  */
622 if (out_request
> iter
->out_size
)
624 iter
->out_size
= out_request
;
625 iter
->out
= xrealloc (iter
->out
,
626 out_request
* sizeof (gdb_wchar_t
));
631 /* Incomplete input sequence. Let the caller know, and
632 arrange for future calls to see EOF. */
633 *out_result
= wchar_iterate_incomplete
;
640 perror_with_name ("Internal error while converting character sets");
644 /* We converted something. */
/* Division binds tighter than subtraction here: OUT_AVAIL is a byte
   count, so OUT_AVAIL / sizeof (gdb_wchar_t) is the number of unused
   wide-character slots, and NUM is the number of slots filled.  */
645 num
= out_request
- out_avail
/ sizeof (gdb_wchar_t
);
646 *out_result
= wchar_iterate_ok
;
647 *out_chars
= iter
->out
;
/* Input bytes consumed by this conversion.  */
649 *len
= orig_in
- iter
->bytes
;
/* Presumably reached once the loop ends because no input remains.  */
654 *out_result
= wchar_iterate_eof
;
659 /* The charset.c module initialization function. */
661 extern initialize_file_ftype _initialize_charset
; /* -Wmissing-prototype */
663 typedef char *char_ptr
;
664 DEF_VEC_P (char_ptr
);
666 static VEC (char_ptr
) *charsets
;
/* Variant of find_charset_names for the phony iconv configuration
   (this definition sits before the PHONY_ICONV #else).  The only
   charset offered is GDB_DEFAULT_HOST_CHARSET; a NULL entry is then
   pushed to terminate the list.  */
671 find_charset_names (void)
673 VEC_safe_push (char_ptr
, charsets
, GDB_DEFAULT_HOST_CHARSET
);
674 VEC_safe_push (char_ptr
, charsets
, NULL
);
677 #else /* PHONY_ICONV */
679 /* Sometimes, libiconv redefines iconvlist as libiconvlist -- but
680 provides different symbols in the static and dynamic libraries.
681 So, configure may see libiconvlist but not iconvlist. But, calling
682 iconvlist is the right thing to do and will work. Hence we do a
683 check here but unconditionally call iconvlist below. */
684 #if defined (HAVE_ICONVLIST) || defined (HAVE_LIBICONVLIST)
686 /* A helper function that adds some character sets to the vector of
687 all character sets. This is a callback function for iconvlist. */
/* NAMES holds COUNT entries; a copy of each (made with xstrdup) is
   pushed onto CHARSETS.  DATA is not used by the visible code.  The
   return statement is on a line not shown in this excerpt.  */
690 add_one (unsigned int count
, const char *const *names
, void *data
)
694 for (i
= 0; i
< count
; ++i
)
695 VEC_safe_push (char_ptr
, charsets
, xstrdup (names
[i
]));
/* Variant of find_charset_names used when iconvlist is available.
   iconvlist is called with add_one as its callback and NULL as the
   callback data, so every name it reports is appended to CHARSETS; a
   NULL entry is then pushed to terminate the list.  */
701 find_charset_names (void)
703 iconvlist (add_one
, NULL
);
704 VEC_safe_push (char_ptr
, charsets
, NULL
);
709 /* Return non-zero if LINE (output from iconv) should be ignored.
710 Older iconv programs (e.g. 2.2.2) include the human readable
711 introduction even when stdout is not a tty. Newer versions omit
712 the intro if stdout is not a tty. */
/* LINE is matched by substring, not by whole-line comparison: it is
   ignored when any entry of the table below occurs anywhere in it.  */
715 ignore_line_p (const char *line
)
717 /* This table is used to filter the output. If this text appears
718 anywhere in the line, it is ignored (strstr is used). */
/* Only one table entry is visible in this excerpt; the loop below
   relies on the table ending with a NULL entry.  */
719 static const char * const ignore_lines
[] =
724 "listed with several",
729 for (i
= 0; ignore_lines
[i
] != NULL
; ++i
)
/* A non-NULL result from strstr means the entry occurs in LINE.  */
731 if (strstr (line
, ignore_lines
[i
]) != NULL
)
/* Variant of find_charset_names used when iconvlist is not available.
   It runs the external "iconv" program through the pex_* process
   functions, with LANGUAGE and LC_ALL forced to "C" in the child's
   environment, and reads the program's output a line at a time with
   fgets.  Lines for which ignore_line_p returns non-zero are skipped;
   the remaining lines are split into names, and a copy of each name
   is pushed onto CHARSETS.  A NULL entry is pushed at the end to
   terminate the list.

   The ARGS passed to the program, the buffer declarations, the
   newline and '/' stripping statements and the failure bookkeeping
   are on lines not shown in this excerpt.  */
739 find_charset_names (void)
741 struct pex_obj
*child
;
745 struct gdb_environ
*iconv_env
;
747 /* Older iconvs, e.g. 2.2.2, don't omit the intro text if stdout is not
748 a tty. We need to recognize it and ignore it. This text is subject
749 to translation, so force LANGUAGE=C. */
750 iconv_env
= make_environ ();
751 init_environ (iconv_env
);
752 set_in_environ (iconv_env
, "LANGUAGE", "C");
753 set_in_environ (iconv_env
, "LC_ALL", "C");
755 child
= pex_init (0, "iconv", NULL
);
760 /* Note that we simply ignore errors here. */
/* PEX_STDERR_TO_STDOUT sends the child's error output down the same
   pipe that is read below.  */
761 if (!pex_run_in_environment (child
, PEX_SEARCH
| PEX_STDERR_TO_STDOUT
,
762 "iconv", args
, environ_vector (iconv_env
),
765 FILE *in
= pex_read_output (child
, 0);
767 /* POSIX says that iconv -l uses an unspecified format. We
768 parse the glibc and libiconv formats; feel free to add others
773 /* The size of buf is chosen arbitrarily. */
/* R is the fgets result for the current line.  */
778 r
= fgets (buf
, sizeof (buf
), in
);
784 if (ignore_line_p (r
))
787 /* Strip off the newline. */
789 /* Strip off one or two '/'s. glibc will print lines like
790 "8859_7//", but also "10646-1:1993/UCS4/". */
791 if (buf
[len
- 1] == '/')
793 if (buf
[len
- 1] == '/')
797 /* libiconv will print multiple entries per line, separated
798 by spaces. Older iconvs will print multiple entries per line,
799 indented by two spaces, and separated by ", "
800 (i.e. the human readable form). */
807 /* Skip leading blanks. */
808 for (p
= start
; *p
&& *p
== ' '; ++p
)
811 /* Find the next space, comma, or end-of-line. */
812 for ( ; *p
&& *p
!= ' ' && *p
!= ','; ++p
)
814 /* Ignore an empty result. */
/* Each name is copied with xstrdup before being stored.  */
819 VEC_safe_push (char_ptr
, charsets
, xstrdup (start
));
822 /* Skip any extra spaces. */
823 for (start
= p
+ 1; *start
&& *start
== ' '; ++start
)
/* This condition holds only when the status was fetched and the
   child exited normally with exit status zero.  */
828 if (pex_get_status (child
, 1, &status
)
829 && WIFEXITED (status
) && !WEXITSTATUS (status
))
835 free_environ (iconv_env
);
839 /* Some error occurred, so drop the vector. */
/* Visit every collected element, then empty the vector.  What is done
   with each element is on a line not shown in this excerpt.  */
842 for (ix
= 0; VEC_iterate (char_ptr
, charsets
, ix
, elt
); ++ix
)
844 VEC_truncate (char_ptr
, charsets
, 0);
/* The list is always NULL-terminated, even when it is empty.  */
847 VEC_safe_push (char_ptr
, charsets
, NULL
);
850 #endif /* HAVE_ICONVLIST || HAVE_LIBICONVLIST */
851 #endif /* PHONY_ICONV */
/* Module initialization: build the list of known character sets,
   choose the automatic host charset, and register the set/show
   commands.

   The list starts with "auto", followed by whatever
   find_charset_names adds.  When the vector holds more than one
   element, charset_enum points at the vector's storage;
   default_charset_names is the alternative (the line selecting
   between the two assignments is not shown in this excerpt --
   presumably an else).

   Four enum commands are registered: "charset", "host-charset",
   "target-charset" and "target-wide-charset".  Only "host-charset" is
   given the full list; the others are given &charset_enum[1], which
   leaves out the leading "auto" entry.  */
854 _initialize_charset (void)
856 struct cmd_list_element
*new_cmd
;
858 /* The first element is always "auto"; then we skip it for the
859 commands where it is not allowed. */
860 VEC_safe_push (char_ptr
, charsets
, xstrdup ("auto"));
861 find_charset_names ();
/* More than one element means the list holds more than "auto".  */
863 if (VEC_length (char_ptr
, charsets
) > 1)
864 charset_enum
= (const char **) VEC_address (char_ptr
, charsets
);
866 charset_enum
= default_charset_names
;
/* When nl_langinfo is available, ask it for the locale's codeset and
   use that as the name "auto" stands for.  */
869 #ifdef HAVE_LANGINFO_CODESET
870 auto_host_charset_name
= nl_langinfo (CODESET
);
871 /* Solaris will return `646' here -- but the Solaris iconv then
872 does not accept this. Darwin (and maybe FreeBSD) may return "" here,
873 which GNU libiconv doesn't like (infinite loop). */
874 if (!strcmp (auto_host_charset_name
, "646") || !*auto_host_charset_name
)
875 auto_host_charset_name
= "ASCII";
/* The target charset default follows the detected host charset.  */
876 target_charset_name
= auto_host_charset_name
;
/* "charset" stores into host_charset_name; its set function is
   expected to update target_charset_name too (see the note inside the
   call).  */
882 add_setshow_enum_cmd ("charset", class_support
,
883 &charset_enum
[1], &host_charset_name
, _("\
884 Set the host and target character sets."), _("\
885 Show the host and target character sets."), _("\
886 The `host character set' is the one used by the system GDB is running on.\n\
887 The `target character set' is the one used by the program being debugged.\n\
888 You may only use supersets of ASCII for your host character set; GDB does\n\
889 not support any others.\n\
890 To see a list of the character sets GDB supports, type `set charset <TAB>'."),
891 /* Note that the sfunc below needs to set
892 target_charset_name, because the 'set
893 charset' command sets two variables. */
896 &setlist
, &showlist
);
/* "host-charset" is the only command given the full list, so "auto"
   is an accepted value for it.  */
898 add_setshow_enum_cmd ("host-charset", class_support
,
899 charset_enum
, &host_charset_name
, _("\
900 Set the host character set."), _("\
901 Show the host character set."), _("\
902 The `host character set' is the one used by the system GDB is running on.\n\
903 You may only use supersets of ASCII for your host character set; GDB does\n\
904 not support any others.\n\
905 To see a list of the character sets GDB supports, type `set host-charset <TAB>'."),
906 set_host_charset_sfunc
,
907 show_host_charset_name
,
908 &setlist
, &showlist
);
910 add_setshow_enum_cmd ("target-charset", class_support
,
911 &charset_enum
[1], &target_charset_name
, _("\
912 Set the target character set."), _("\
913 Show the target character set."), _("\
914 The `target character set' is the one used by the program being debugged.\n\
915 GDB translates characters and strings between the host and target\n\
916 character sets as needed.\n\
917 To see a list of the character sets GDB supports, type `set target-charset'<TAB>"),
918 set_target_charset_sfunc
,
919 show_target_charset_name
,
920 &setlist
, &showlist
);
922 add_setshow_enum_cmd ("target-wide-charset", class_support
,
923 &charset_enum
[1], &target_wide_charset_name
,
925 Set the target wide character set."), _("\
926 Show the target wide character set."), _("\
927 The `target wide character set' is the one used by the program being debugged.\n\
928 In particular it is the encoding used by `wchar_t'.\n\
929 GDB translates characters and strings between the host and target\n\
930 character sets as needed.\n\
931 To see a list of the character sets GDB supports, type\n\
932 `set target-wide-charset'<TAB>"),
933 set_target_wide_charset_sfunc
,
934 show_target_wide_charset_name
,
935 &setlist
, &showlist
);