Commit | Line | Data |
---|---|---|
535aade6 | 1 | /* Demangler for the Rust programming language |
82704155 | 2 | Copyright (C) 2016-2019 Free Software Foundation, Inc. |
535aade6 DT |
3 | Written by David Tolnay (dtolnay@gmail.com). |
4 | ||
5 | This file is part of the libiberty library. | |
6 | Libiberty is free software; you can redistribute it and/or | |
7 | modify it under the terms of the GNU Library General Public | |
8 | License as published by the Free Software Foundation; either | |
9 | version 2 of the License, or (at your option) any later version. | |
10 | ||
11 | In addition to the permissions in the GNU Library General Public | |
12 | License, the Free Software Foundation gives you unlimited permission | |
13 | to link the compiled version of this file into combinations with other | |
14 | programs, and to distribute those combinations without any restriction | |
15 | coming from the use of this file. (The Library Public License | |
16 | restrictions do apply in other respects; for example, they cover | |
17 | modification of the file, and distribution when not linked into a | |
18 | combined executable.) | |
19 | ||
20 | Libiberty is distributed in the hope that it will be useful, | |
21 | but WITHOUT ANY WARRANTY; without even the implied warranty of | |
22 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
23 | Library General Public License for more details. | |
24 | ||
25 | You should have received a copy of the GNU Library General Public | |
26 | License along with libiberty; see the file COPYING.LIB. | |
27 | If not, see <http://www.gnu.org/licenses/>. */ | |
28 | ||
29 | ||
30 | #ifdef HAVE_CONFIG_H | |
31 | #include "config.h" | |
32 | #endif | |
33 | ||
34 | #include "safe-ctype.h" | |
35 | ||
36 | #include <sys/types.h> | |
37 | #include <string.h> | |
38 | #include <stdio.h> | |
39 | ||
40 | #ifdef HAVE_STRING_H | |
41 | #include <string.h> | |
42 | #else | |
43 | extern size_t strlen(const char *s); | |
44 | extern int strncmp(const char *s1, const char *s2, size_t n); | |
45 | extern void *memset(void *s, int c, size_t n); | |
46 | #endif | |
47 | ||
48 | #include <demangle.h> | |
49 | #include "libiberty.h" | |
f211b8c0 | 50 | #include "rust-demangle.h" |
535aade6 DT |
51 | |
52 | ||
53 | /* Mangled Rust symbols look like this: | |
54 | _$LT$std..sys..fd..FileDesc$u20$as$u20$core..ops..Drop$GT$::drop::hc68340e1baa4987a | |
55 | ||
56 | The original symbol is: | |
57 | <std::sys::fd::FileDesc as core::ops::Drop>::drop | |
58 | ||
59 | The last component of the path is a 64-bit hash in lowercase hex, | |
60 | prefixed with "h". Rust does not have a global namespace between | |
61 | crates, an illusion which Rust maintains by using the hash to | |
62 | distinguish things that would otherwise have the same symbol. | |
63 | ||
64 | Any path component not starting with a XID_Start character is | |
65 | prefixed with "_". | |
66 | ||
67 | The following escape sequences are used: | |
68 | ||
69 | "," => $C$ | |
70 | "@" => $SP$ | |
71 | "*" => $BP$ | |
72 | "&" => $RF$ | |
73 | "<" => $LT$ | |
74 | ">" => $GT$ | |
75 | "(" => $LP$ | |
76 | ")" => $RP$ | |
77 | " " => $u20$ | |
78 | "\"" => $u22$ | |
79 | "'" => $u27$ | |
80 | "+" => $u2b$ | |
81 | ";" => $u3b$ | |
82 | "[" => $u5b$ | |
83 | "]" => $u5d$ | |
84 | "{" => $u7b$ | |
85 | "}" => $u7d$ | |
86 | "~" => $u7e$ | |
87 | ||
88 | A double ".." means "::" and a single "." means "-". | |
89 | ||
90 | The only characters allowed in the mangled symbol are a-zA-Z0-9 and _.:$ */ | |
91 | ||
/* Every symbol handled here ends in a hash component: the text "::h"
   followed by HASH_LEN hexadecimal digits.  HASH_PREFIX_LEN is the
   length of HASH_PREFIX.  */
static const char *hash_prefix = "::h";
static const size_t hash_prefix_len = 3;
static const size_t hash_len = 16;

/* Helpers defined further down in this file.  */
static int unescape (const char **in, char **out, const char *seq, char value);
static int looks_like_rust (const char *str, size_t len);
static int is_prefixed_hash (const char *str);
99 | ||
100 | /* INPUT: sym: symbol that has been through C++ (gnu v3) demangling | |
101 | ||
102 | This function looks for the following indicators: | |
103 | ||
104 | 1. The hash must consist of "h" followed by 16 lowercase hex digits. | |
105 | ||
106 | 2. As a sanity check, the hash must use between 5 and 15 of the 16 | |
107 | possible hex digits. This is true of 99.9998% of hashes so once | |
108 | in your life you may see a false negative. The point is to | |
109 | notice path components that could be Rust hashes but are | |
110 | probably not, like "haaaaaaaaaaaaaaaa". In this case a false | |
111 | positive (non-Rust symbol has an important path component | |
112 | removed because it looks like a Rust hash) is worse than a false | |
113 | negative (the rare Rust symbol is not demangled) so this sets | |
114 | the balance in favor of false negatives. | |
115 | ||
116 | 3. There must be no characters other than a-zA-Z0-9 and _.:$ | |
117 | ||
118 | 4. There must be no unrecognized $-sign sequences. | |
119 | ||
120 | 5. There must be no sequence of three or more dots in a row ("..."). */ | |
121 | ||
122 | int | |
123 | rust_is_mangled (const char *sym) | |
124 | { | |
125 | size_t len, len_without_hash; | |
126 | ||
127 | if (!sym) | |
128 | return 0; | |
129 | ||
130 | len = strlen (sym); | |
131 | if (len <= hash_prefix_len + hash_len) | |
132 | /* Not long enough to contain "::h" + hash + something else */ | |
133 | return 0; | |
134 | ||
135 | len_without_hash = len - (hash_prefix_len + hash_len); | |
136 | if (!is_prefixed_hash (sym + len_without_hash)) | |
137 | return 0; | |
138 | ||
139 | return looks_like_rust (sym, len_without_hash); | |
140 | } | |
141 | ||
142 | /* A hash is the prefix "::h" followed by 16 lowercase hex digits. The | |
143 | hex digits must comprise between 5 and 15 (inclusive) distinct | |
144 | digits. */ | |
145 | ||
146 | static int | |
147 | is_prefixed_hash (const char *str) | |
148 | { | |
149 | const char *end; | |
150 | char seen[16]; | |
151 | size_t i; | |
152 | int count; | |
153 | ||
154 | if (strncmp (str, hash_prefix, hash_prefix_len)) | |
155 | return 0; | |
156 | str += hash_prefix_len; | |
157 | ||
158 | memset (seen, 0, sizeof(seen)); | |
159 | for (end = str + hash_len; str < end; str++) | |
160 | if (*str >= '0' && *str <= '9') | |
161 | seen[*str - '0'] = 1; | |
162 | else if (*str >= 'a' && *str <= 'f') | |
163 | seen[*str - 'a' + 10] = 1; | |
164 | else | |
165 | return 0; | |
166 | ||
167 | /* Count how many distinct digits seen */ | |
168 | count = 0; | |
169 | for (i = 0; i < 16; i++) | |
170 | if (seen[i]) | |
171 | count++; | |
172 | ||
173 | return count >= 5 && count <= 15; | |
174 | } | |
175 | ||
/* Return 1 if the first LEN characters of STR could belong to a Rust
   symbol, 0 otherwise.  Accepted text consists only of a-zA-Z0-9,
   '_', ':', '.', and the recognized $-escape sequences; a run of three
   or more dots is rejected.  STR must be NUL-terminated: the escape
   and dot checks use strncmp and may look a few characters past
   STR + LEN.  */

static int
looks_like_rust (const char *str, size_t len)
{
  static const char *const escapes[] = {
    "$C$",
    "$SP$", "$BP$", "$RF$", "$LT$", "$GT$", "$LP$", "$RP$",
    "$u20$", "$u22$", "$u27$", "$u2b$", "$u3b$",
    "$u5b$", "$u5d$", "$u7b$", "$u7d$", "$u7e$"
  };
  static const char plain_chars[] =
    "abcdefghijklmnopqrstuvwxyz"
    "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    "0123456789"
    "_:";
  const size_t n_escapes = sizeof escapes / sizeof escapes[0];
  const char *const limit = str + len;

  while (str < limit)
    {
      const char c = *str;

      if (c == '$')
	{
	  size_t matched = 0;
	  size_t i;

	  /* Find the escape sequence that starts here, if any.  */
	  for (i = 0; i < n_escapes && matched == 0; i++)
	    {
	      size_t seq_len = strlen (escapes[i]);

	      if (strncmp (str, escapes[i], seq_len) == 0)
		matched = seq_len;
	    }

	  if (matched == 0)
	    return 0;
	  str += matched;
	}
      else if (c == '.')
	{
	  /* Do not allow three or more consecutive dots.  */
	  if (strncmp (str, "...", 3) == 0)
	    return 0;
	  str++;
	}
      /* strchr would match the terminating NUL of PLAIN_CHARS, so
	 exclude it explicitly.  */
      else if (c != '\0' && strchr (plain_chars, c) != NULL)
	str++;
      else
	return 0;
    }

  return 1;
}
236 | ||
237 | /* | |
238 | INPUT: sym: symbol for which rust_is_mangled(sym) returned 1. | |
239 | ||
240 | The input is demangled in-place because the mangled name is always | |
241 | longer than the demangled one. */ | |
242 | ||
243 | void | |
244 | rust_demangle_sym (char *sym) | |
245 | { | |
246 | const char *in; | |
247 | char *out; | |
248 | const char *end; | |
249 | ||
250 | if (!sym) | |
251 | return; | |
252 | ||
253 | in = sym; | |
254 | out = sym; | |
255 | end = sym + strlen (sym) - (hash_prefix_len + hash_len); | |
256 | ||
257 | while (in < end) | |
258 | switch (*in) | |
259 | { | |
260 | case '$': | |
261 | if (!(unescape (&in, &out, "$C$", ',') | |
262 | || unescape (&in, &out, "$SP$", '@') | |
263 | || unescape (&in, &out, "$BP$", '*') | |
264 | || unescape (&in, &out, "$RF$", '&') | |
265 | || unescape (&in, &out, "$LT$", '<') | |
266 | || unescape (&in, &out, "$GT$", '>') | |
267 | || unescape (&in, &out, "$LP$", '(') | |
268 | || unescape (&in, &out, "$RP$", ')') | |
269 | || unescape (&in, &out, "$u20$", ' ') | |
270 | || unescape (&in, &out, "$u22$", '\"') | |
271 | || unescape (&in, &out, "$u27$", '\'') | |
272 | || unescape (&in, &out, "$u2b$", '+') | |
273 | || unescape (&in, &out, "$u3b$", ';') | |
274 | || unescape (&in, &out, "$u5b$", '[') | |
275 | || unescape (&in, &out, "$u5d$", ']') | |
276 | || unescape (&in, &out, "$u7b$", '{') | |
277 | || unescape (&in, &out, "$u7d$", '}') | |
278 | || unescape (&in, &out, "$u7e$", '~'))) { | |
279 | /* unexpected escape sequence, not looks_like_rust. */ | |
280 | goto fail; | |
281 | } | |
282 | break; | |
283 | case '_': | |
284 | /* If this is the start of a path component and the next | |
285 | character is an escape sequence, ignore the underscore. The | |
286 | mangler inserts an underscore to make sure the path | |
287 | component begins with a XID_Start character. */ | |
288 | if ((in == sym || in[-1] == ':') && in[1] == '$') | |
289 | in++; | |
290 | else | |
291 | *out++ = *in++; | |
292 | break; | |
293 | case '.': | |
294 | if (in[1] == '.') | |
295 | { | |
296 | /* ".." becomes "::" */ | |
297 | *out++ = ':'; | |
298 | *out++ = ':'; | |
299 | in += 2; | |
300 | } | |
301 | else | |
302 | { | |
303 | /* "." becomes "-" */ | |
304 | *out++ = '-'; | |
305 | in++; | |
306 | } | |
307 | break; | |
308 | case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': | |
309 | case 'g': case 'h': case 'i': case 'j': case 'k': case 'l': | |
310 | case 'm': case 'n': case 'o': case 'p': case 'q': case 'r': | |
311 | case 's': case 't': case 'u': case 'v': case 'w': case 'x': | |
312 | case 'y': case 'z': | |
313 | case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': | |
314 | case 'G': case 'H': case 'I': case 'J': case 'K': case 'L': | |
315 | case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R': | |
316 | case 'S': case 'T': case 'U': case 'V': case 'W': case 'X': | |
317 | case 'Y': case 'Z': | |
318 | case '0': case '1': case '2': case '3': case '4': case '5': | |
319 | case '6': case '7': case '8': case '9': | |
320 | case ':': | |
321 | *out++ = *in++; | |
322 | break; | |
323 | default: | |
324 | /* unexpected character in symbol, not looks_like_rust. */ | |
325 | goto fail; | |
326 | } | |
327 | goto done; | |
328 | ||
329 | fail: | |
330 | *out++ = '?'; /* This is pretty lame, but it's hard to do better. */ | |
331 | done: | |
332 | *out = '\0'; | |
333 | } | |
334 | ||
/* Try to consume the escape sequence SEQ at the input cursor *IN.

   On a match, store VALUE at the output cursor *OUT, advance *IN past
   the whole sequence and *OUT by one character, and return 1.  On a
   mismatch, leave both cursors and the output untouched and return 0.  */

static int
unescape (const char **in, char **out, const char *seq, char value)
{
  const size_t seq_len = strlen (seq);
  const char *src = *in;
  char *dst = *out;

  if (strncmp (src, seq, seq_len) == 0)
    {
      *dst = value;
      *out = dst + 1;
      *in = src + seq_len;
      return 1;
    }

  return 0;
}