Commit | Line | Data |
---|---|---|
535aade6 | 1 | /* Demangler for the Rust programming language |
2a8ae714 | 2 | Copyright (C) 2016-2018 Free Software Foundation, Inc. |
535aade6 DT |
3 | Written by David Tolnay (dtolnay@gmail.com). |
4 | ||
5 | This file is part of the libiberty library. | |
6 | Libiberty is free software; you can redistribute it and/or | |
7 | modify it under the terms of the GNU Library General Public | |
8 | License as published by the Free Software Foundation; either | |
9 | version 2 of the License, or (at your option) any later version. | |
10 | ||
11 | In addition to the permissions in the GNU Library General Public | |
12 | License, the Free Software Foundation gives you unlimited permission | |
13 | to link the compiled version of this file into combinations with other | |
14 | programs, and to distribute those combinations without any restriction | |
15 | coming from the use of this file. (The Library Public License | |
16 | restrictions do apply in other respects; for example, they cover | |
17 | modification of the file, and distribution when not linked into a | |
18 | combined executable.) | |
19 | ||
20 | Libiberty is distributed in the hope that it will be useful, | |
21 | but WITHOUT ANY WARRANTY; without even the implied warranty of | |
22 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
23 | Library General Public License for more details. | |
24 | ||
25 | You should have received a copy of the GNU Library General Public | |
26 | License along with libiberty; see the file COPYING.LIB. | |
27 | If not, see <http://www.gnu.org/licenses/>. */ | |
28 | ||
29 | ||
30 | #ifdef HAVE_CONFIG_H | |
31 | #include "config.h" | |
32 | #endif | |
33 | ||
34 | #include "safe-ctype.h" | |
35 | ||
36 | #include <sys/types.h> | |
37 | #include <string.h> | |
38 | #include <stdio.h> | |
39 | ||
40 | #ifdef HAVE_STRING_H | |
41 | #include <string.h> | |
42 | #else | |
43 | extern size_t strlen(const char *s); | |
44 | extern int strncmp(const char *s1, const char *s2, size_t n); | |
45 | extern void *memset(void *s, int c, size_t n); | |
46 | #endif | |
47 | ||
48 | #include <demangle.h> | |
49 | #include "libiberty.h" | |
50 | ||
51 | ||
52 | /* Mangled Rust symbols look like this: | |
53 | _$LT$std..sys..fd..FileDesc$u20$as$u20$core..ops..Drop$GT$::drop::hc68340e1baa4987a | |
54 | ||
55 | The original symbol is: | |
56 | <std::sys::fd::FileDesc as core::ops::Drop>::drop | |
57 | ||
58 | The last component of the path is a 64-bit hash in lowercase hex, | |
59 | prefixed with "h". Rust does not have a global namespace between | |
60 | crates, an illusion which Rust maintains by using the hash to | |
61 | distinguish things that would otherwise have the same symbol. | |
62 | ||
63 | Any path component not starting with a XID_Start character is | |
64 | prefixed with "_". | |
65 | ||
66 | The following escape sequences are used: | |
67 | ||
68 | "," => $C$ | |
69 | "@" => $SP$ | |
70 | "*" => $BP$ | |
71 | "&" => $RF$ | |
72 | "<" => $LT$ | |
73 | ">" => $GT$ | |
74 | "(" => $LP$ | |
75 | ")" => $RP$ | |
76 | " " => $u20$ | |
77 | "\"" => $u22$ | |
78 | "'" => $u27$ | |
79 | "+" => $u2b$ | |
80 | ";" => $u3b$ | |
81 | "[" => $u5b$ | |
82 | "]" => $u5d$ | |
83 | "{" => $u7b$ | |
84 | "}" => $u7d$ | |
85 | "~" => $u7e$ | |
86 | ||
87 | A double ".." means "::" and a single "." means "-". | |
88 | ||
89 | The only characters allowed in the mangled symbol are a-zA-Z0-9 and _.:$ */ | |
90 | ||
/* Every legacy Rust symbol ends in HASH_PREFIX followed by HASH_LEN
   lowercase hex digits.  HASH_PREFIX_LEN is the length of HASH_PREFIX
   without its terminating NUL and must be kept in sync with it.  */
static const char *const hash_prefix = "::h";
static const size_t hash_prefix_len = 3;
static const size_t hash_len = 16;

/* Helpers defined later in this file.  */
static int is_prefixed_hash (const char *str);
static int looks_like_rust (const char *sym, size_t len);
static int unescape (const char **in, char **out, const char *seq, char value);
98 | ||
99 | /* INPUT: sym: symbol that has been through C++ (gnu v3) demangling | |
100 | ||
101 | This function looks for the following indicators: | |
102 | ||
103 | 1. The hash must consist of "h" followed by 16 lowercase hex digits. | |
104 | ||
105 | 2. As a sanity check, the hash must use between 5 and 15 of the 16 | |
106 | possible hex digits. This is true of 99.9998% of hashes so once | |
107 | in your life you may see a false negative. The point is to | |
108 | notice path components that could be Rust hashes but are | |
109 | probably not, like "haaaaaaaaaaaaaaaa". In this case a false | |
110 | positive (non-Rust symbol has an important path component | |
111 | removed because it looks like a Rust hash) is worse than a false | |
112 | negative (the rare Rust symbol is not demangled) so this sets | |
113 | the balance in favor of false negatives. | |
114 | ||
115 | 3. There must be no characters other than a-zA-Z0-9 and _.:$ | |
116 | ||
117 | 4. There must be no unrecognized $-sign sequences. | |
118 | ||
119 | 5. There must be no sequence of three or more dots in a row ("..."). */ | |
120 | ||
121 | int | |
122 | rust_is_mangled (const char *sym) | |
123 | { | |
124 | size_t len, len_without_hash; | |
125 | ||
126 | if (!sym) | |
127 | return 0; | |
128 | ||
129 | len = strlen (sym); | |
130 | if (len <= hash_prefix_len + hash_len) | |
131 | /* Not long enough to contain "::h" + hash + something else */ | |
132 | return 0; | |
133 | ||
134 | len_without_hash = len - (hash_prefix_len + hash_len); | |
135 | if (!is_prefixed_hash (sym + len_without_hash)) | |
136 | return 0; | |
137 | ||
138 | return looks_like_rust (sym, len_without_hash); | |
139 | } | |
140 | ||
141 | /* A hash is the prefix "::h" followed by 16 lowercase hex digits. The | |
142 | hex digits must comprise between 5 and 15 (inclusive) distinct | |
143 | digits. */ | |
144 | ||
145 | static int | |
146 | is_prefixed_hash (const char *str) | |
147 | { | |
148 | const char *end; | |
149 | char seen[16]; | |
150 | size_t i; | |
151 | int count; | |
152 | ||
153 | if (strncmp (str, hash_prefix, hash_prefix_len)) | |
154 | return 0; | |
155 | str += hash_prefix_len; | |
156 | ||
157 | memset (seen, 0, sizeof(seen)); | |
158 | for (end = str + hash_len; str < end; str++) | |
159 | if (*str >= '0' && *str <= '9') | |
160 | seen[*str - '0'] = 1; | |
161 | else if (*str >= 'a' && *str <= 'f') | |
162 | seen[*str - 'a' + 10] = 1; | |
163 | else | |
164 | return 0; | |
165 | ||
166 | /* Count how many distinct digits seen */ | |
167 | count = 0; | |
168 | for (i = 0; i < 16; i++) | |
169 | if (seen[i]) | |
170 | count++; | |
171 | ||
172 | return count >= 5 && count <= 15; | |
173 | } | |
174 | ||
/* Return nonzero if the LEN characters starting at STR could be the
   path part of a legacy-mangled Rust symbol:

   - only a-z, A-Z, 0-9 and _ . : $ occur;
   - every '$' starts one of the recognized escape sequences;
   - there is no run of three or more dots.

   Only the LEN characters of the range are examined: an escape
   sequence or dot run that would extend past STR + LEN is never
   matched using characters outside the range.  An empty range is
   accepted.  */

static int
looks_like_rust (const char *str, size_t len)
{
  static const char *const escapes[] = {
    "$C$",
    "$SP$", "$BP$", "$RF$", "$LT$", "$GT$", "$LP$", "$RP$",
    "$u20$", "$u22$", "$u27$", "$u2b$", "$u3b$",
    "$u5b$", "$u5d$", "$u7b$", "$u7d$", "$u7e$"
  };
  const size_t n_escapes = sizeof (escapes) / sizeof (escapes[0]);
  const char *end = str + len;

  while (str < end)
    {
      const size_t remaining = (size_t) (end - str);

      switch (*str)
        {
        case '$':
          {
            size_t matched = 0;
            size_t i;

            /* No escape is a prefix of another, so order is irrelevant.  */
            for (i = 0; i < n_escapes && matched == 0; i++)
              {
                const size_t seq_len = strlen (escapes[i]);

                if (seq_len <= remaining
                    && strncmp (str, escapes[i], seq_len) == 0)
                  matched = seq_len;
              }

            if (matched == 0)
              /* Unrecognized or truncated $-sign sequence.  */
              return 0;
            str += matched;
          }
          break;

        case '.':
          /* Do not allow three or more consecutive dots.  */
          if (remaining >= 3 && str[1] == '.' && str[2] == '.')
            return 0;
          str++;
          break;

        case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
        case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
        case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
        case 's': case 't': case 'u': case 'v': case 'w': case 'x':
        case 'y': case 'z':
        case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
        case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
        case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
        case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
        case 'Y': case 'Z':
        case '0': case '1': case '2': case '3': case '4': case '5':
        case '6': case '7': case '8': case '9':
        case '_':
        case ':':
          str++;
          break;

        default:
          /* Character outside the mangled alphabet.  */
          return 0;
        }
    }

  return 1;
}
235 | ||
236 | /* | |
237 | INPUT: sym: symbol for which rust_is_mangled(sym) returned 1. | |
238 | ||
239 | The input is demangled in-place because the mangled name is always | |
240 | longer than the demangled one. */ | |
241 | ||
242 | void | |
243 | rust_demangle_sym (char *sym) | |
244 | { | |
245 | const char *in; | |
246 | char *out; | |
247 | const char *end; | |
248 | ||
249 | if (!sym) | |
250 | return; | |
251 | ||
252 | in = sym; | |
253 | out = sym; | |
254 | end = sym + strlen (sym) - (hash_prefix_len + hash_len); | |
255 | ||
256 | while (in < end) | |
257 | switch (*in) | |
258 | { | |
259 | case '$': | |
260 | if (!(unescape (&in, &out, "$C$", ',') | |
261 | || unescape (&in, &out, "$SP$", '@') | |
262 | || unescape (&in, &out, "$BP$", '*') | |
263 | || unescape (&in, &out, "$RF$", '&') | |
264 | || unescape (&in, &out, "$LT$", '<') | |
265 | || unescape (&in, &out, "$GT$", '>') | |
266 | || unescape (&in, &out, "$LP$", '(') | |
267 | || unescape (&in, &out, "$RP$", ')') | |
268 | || unescape (&in, &out, "$u20$", ' ') | |
269 | || unescape (&in, &out, "$u22$", '\"') | |
270 | || unescape (&in, &out, "$u27$", '\'') | |
271 | || unescape (&in, &out, "$u2b$", '+') | |
272 | || unescape (&in, &out, "$u3b$", ';') | |
273 | || unescape (&in, &out, "$u5b$", '[') | |
274 | || unescape (&in, &out, "$u5d$", ']') | |
275 | || unescape (&in, &out, "$u7b$", '{') | |
276 | || unescape (&in, &out, "$u7d$", '}') | |
277 | || unescape (&in, &out, "$u7e$", '~'))) { | |
278 | /* unexpected escape sequence, not looks_like_rust. */ | |
279 | goto fail; | |
280 | } | |
281 | break; | |
282 | case '_': | |
283 | /* If this is the start of a path component and the next | |
284 | character is an escape sequence, ignore the underscore. The | |
285 | mangler inserts an underscore to make sure the path | |
286 | component begins with a XID_Start character. */ | |
287 | if ((in == sym || in[-1] == ':') && in[1] == '$') | |
288 | in++; | |
289 | else | |
290 | *out++ = *in++; | |
291 | break; | |
292 | case '.': | |
293 | if (in[1] == '.') | |
294 | { | |
295 | /* ".." becomes "::" */ | |
296 | *out++ = ':'; | |
297 | *out++ = ':'; | |
298 | in += 2; | |
299 | } | |
300 | else | |
301 | { | |
302 | /* "." becomes "-" */ | |
303 | *out++ = '-'; | |
304 | in++; | |
305 | } | |
306 | break; | |
307 | case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': | |
308 | case 'g': case 'h': case 'i': case 'j': case 'k': case 'l': | |
309 | case 'm': case 'n': case 'o': case 'p': case 'q': case 'r': | |
310 | case 's': case 't': case 'u': case 'v': case 'w': case 'x': | |
311 | case 'y': case 'z': | |
312 | case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': | |
313 | case 'G': case 'H': case 'I': case 'J': case 'K': case 'L': | |
314 | case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R': | |
315 | case 'S': case 'T': case 'U': case 'V': case 'W': case 'X': | |
316 | case 'Y': case 'Z': | |
317 | case '0': case '1': case '2': case '3': case '4': case '5': | |
318 | case '6': case '7': case '8': case '9': | |
319 | case ':': | |
320 | *out++ = *in++; | |
321 | break; | |
322 | default: | |
323 | /* unexpected character in symbol, not looks_like_rust. */ | |
324 | goto fail; | |
325 | } | |
326 | goto done; | |
327 | ||
328 | fail: | |
329 | *out++ = '?'; /* This is pretty lame, but it's hard to do better. */ | |
330 | done: | |
331 | *out = '\0'; | |
332 | } | |
333 | ||
/* Try to consume the escape sequence SEQ at the read position *IN.

   If the text at *IN starts with SEQ, store VALUE at the write position
   *OUT, advance *IN past the sequence and *OUT by one character, and
   return 1.  Otherwise leave both positions untouched and return 0.
   *IN must point into a NUL-terminated string.  */

static int
unescape (const char **in, char **out, const char *seq, char value)
{
  const size_t seq_len = strlen (seq);

  if (strncmp (*in, seq, seq_len) != 0)
    return 0;

  **out = value;
  *out += 1;
  *in += seq_len;

  return 1;
}