Commit | Line | Data |
---|---|---|
7de828df KC |
1 | /* |
2 | * kaslr.c | |
3 | * | |
4 | * This contains the routines needed to generate a reasonable level of | |
5 | * entropy to choose a randomized kernel base address offset in support | |
6 | * of Kernel Address Space Layout Randomization (KASLR). Additionally | |
7 | * handles walking the physical memory maps (and tracking memory regions | |
8 | * to avoid) in order to select a physical memory location that can | |
9 | * contain the entire properly aligned running kernel image. | |
10 | * | |
11 | */ | |
8ab3820f | 12 | #include "misc.h" |
dc425a6e | 13 | #include "error.h" |
8ab3820f | 14 | |
5bfce5ef KC |
15 | #include <asm/msr.h> |
16 | #include <asm/archrandom.h> | |
82fa9637 | 17 | #include <asm/e820.h> |
5bfce5ef | 18 | |
a653f356 KC |
19 | #include <generated/compile.h> |
20 | #include <linux/module.h> | |
21 | #include <linux/uts.h> | |
22 | #include <linux/utsname.h> | |
23 | #include <generated/utsrelease.h> | |
a653f356 KC |
24 | |
/*
 * Simplified build-specific string for starting entropy.
 *
 * Assembled at compile time from the UTS_RELEASE, LINUX_COMPILE_BY,
 * LINUX_COMPILE_HOST, LINUX_COMPILER and UTS_VERSION string macros.
 * get_random_boot() hashes it into the initial entropy value.
 */
static const char build_str[] = UTS_RELEASE " (" LINUX_COMPILE_BY "@"
		LINUX_COMPILE_HOST ") (" LINUX_COMPILER ") " UTS_VERSION;
28 | ||
5bfce5ef KC |
29 | #define I8254_PORT_CONTROL 0x43 |
30 | #define I8254_PORT_COUNTER0 0x40 | |
31 | #define I8254_CMD_READBACK 0xC0 | |
32 | #define I8254_SELECT_COUNTER0 0x02 | |
33 | #define I8254_STATUS_NOTREADY 0x40 | |
34 | static inline u16 i8254(void) | |
35 | { | |
36 | u16 status, timer; | |
37 | ||
38 | do { | |
39 | outb(I8254_PORT_CONTROL, | |
40 | I8254_CMD_READBACK | I8254_SELECT_COUNTER0); | |
41 | status = inb(I8254_PORT_COUNTER0); | |
42 | timer = inb(I8254_PORT_COUNTER0); | |
43 | timer |= inb(I8254_PORT_COUNTER0) << 8; | |
44 | } while (status & I8254_STATUS_NOTREADY); | |
45 | ||
46 | return timer; | |
47 | } | |
48 | ||
a653f356 KC |
/*
 * Fold a memory area into a running hash, one unsigned long at a time.
 *
 * @hash: starting hash value
 * @area: memory to mix in; only read, never written
 * @size: size of @area in bytes
 *
 * For each whole unsigned long in @area the hash is rotated right by 7
 * bits and XORed with that word.  Trailing bytes that do not fill a whole
 * unsigned long are not mixed in.
 *
 * Returns the updated hash (unchanged when @size is smaller than one word).
 */
static unsigned long rotate_xor(unsigned long hash, const void *area,
				size_t size)
{
	/* Keep the const qualifier: the area is only ever read. */
	const unsigned long *words = area;
	size_t i;

	for (i = 0; i < size / sizeof(hash); i++) {
		/* Rotate by odd number of bits and XOR. */
		hash = (hash << ((sizeof(hash) * 8) - 7)) | (hash >> 7);
		hash ^= words[i];
	}

	return hash;
}
63 | ||
64 | /* Attempt to create a simple but unpredictable starting entropy. */ | |
65 | static unsigned long get_random_boot(void) | |
66 | { | |
67 | unsigned long hash = 0; | |
68 | ||
69 | hash = rotate_xor(hash, build_str, sizeof(build_str)); | |
6655e0aa | 70 | hash = rotate_xor(hash, boot_params, sizeof(*boot_params)); |
a653f356 KC |
71 | |
72 | return hash; | |
73 | } | |
74 | ||
5bfce5ef KC |
75 | static unsigned long get_random_long(void) |
76 | { | |
e8236c4d PA |
77 | #ifdef CONFIG_X86_64 |
78 | const unsigned long mix_const = 0x5d6008cbf3848dd3UL; | |
79 | #else | |
80 | const unsigned long mix_const = 0x3f39e593UL; | |
81 | #endif | |
a653f356 KC |
82 | unsigned long raw, random = get_random_boot(); |
83 | bool use_i8254 = true; | |
84 | ||
85 | debug_putstr("KASLR using"); | |
5bfce5ef KC |
86 | |
87 | if (has_cpuflag(X86_FEATURE_RDRAND)) { | |
a653f356 KC |
88 | debug_putstr(" RDRAND"); |
89 | if (rdrand_long(&raw)) { | |
90 | random ^= raw; | |
91 | use_i8254 = false; | |
92 | } | |
5bfce5ef KC |
93 | } |
94 | ||
95 | if (has_cpuflag(X86_FEATURE_TSC)) { | |
a653f356 | 96 | debug_putstr(" RDTSC"); |
4ea1636b | 97 | raw = rdtsc(); |
5bfce5ef | 98 | |
a653f356 KC |
99 | random ^= raw; |
100 | use_i8254 = false; | |
101 | } | |
5bfce5ef | 102 | |
a653f356 KC |
103 | if (use_i8254) { |
104 | debug_putstr(" i8254"); | |
105 | random ^= i8254(); | |
5bfce5ef KC |
106 | } |
107 | ||
e8236c4d PA |
108 | /* Circular multiply for better bit diffusion */ |
109 | asm("mul %3" | |
110 | : "=a" (random), "=d" (raw) | |
111 | : "a" (random), "rm" (mix_const)); | |
112 | random += raw; | |
113 | ||
a653f356 KC |
114 | debug_putstr("...\n"); |
115 | ||
5bfce5ef KC |
116 | return random; |
117 | } | |
8ab3820f | 118 | |
82fa9637 KC |
/* A contiguous range of memory covering [start, start + size). */
struct mem_vector {
	unsigned long start;	/* first address of the range */
	unsigned long size;	/* length of the range in bytes */
};
123 | ||
ed09acde KC |
/* Indices into mem_avoid[]; each names one region filled by mem_avoid_init(). */
enum mem_avoid_index {
	MEM_AVOID_ZO_RANGE = 0,	/* [input, output + init_size) */
	MEM_AVOID_INITRD,	/* the initrd */
	MEM_AVOID_CMDLINE,	/* the kernel command line */
	MEM_AVOID_BOOTPARAMS,	/* the boot_params structure */
	MEM_AVOID_MAX,		/* number of entries, not a region itself */
};

/* Regions that a candidate kernel location must not overlap. */
static struct mem_vector mem_avoid[MEM_AVOID_MAX];
82fa9637 KC |
133 | |
134 | static bool mem_contains(struct mem_vector *region, struct mem_vector *item) | |
135 | { | |
136 | /* Item at least partially before region. */ | |
137 | if (item->start < region->start) | |
138 | return false; | |
139 | /* Item at least partially after region. */ | |
140 | if (item->start + item->size > region->start + region->size) | |
141 | return false; | |
142 | return true; | |
143 | } | |
144 | ||
145 | static bool mem_overlaps(struct mem_vector *one, struct mem_vector *two) | |
146 | { | |
147 | /* Item one is entirely before item two. */ | |
148 | if (one->start + one->size <= two->start) | |
149 | return false; | |
150 | /* Item one is entirely after item two. */ | |
151 | if (one->start >= two->start + two->size) | |
152 | return false; | |
153 | return true; | |
154 | } | |
155 | ||
9dc1969c | 156 | /* |
ed09acde KC |
157 | * In theory, KASLR can put the kernel anywhere in the range of [16M, 64T). |
158 | * The mem_avoid array is used to store the ranges that need to be avoided | |
159 | * when KASLR searches for an appropriate random address. We must avoid any | |
9dc1969c | 160 | * regions that are unsafe to overlap with during decompression, and other |
ed09acde KC |
161 | * things like the initrd, cmdline and boot_params. This comment seeks to |
162 | * explain mem_avoid as clearly as possible since incorrect mem_avoid | |
163 | * memory ranges lead to really hard to debug boot failures. | |
164 | * | |
165 | * The initrd, cmdline, and boot_params are trivial to identify for | |
cb18ef0d | 166 | * avoiding. They are MEM_AVOID_INITRD, MEM_AVOID_CMDLINE, and |
ed09acde KC |
167 | * MEM_AVOID_BOOTPARAMS respectively below. |
168 | * | |
169 | * What is not obvious how to avoid is the range of memory that is used | |
170 | * during decompression (MEM_AVOID_ZO_RANGE below). This range must cover | |
171 | * the compressed kernel (ZO) and its run space, which is used to extract | |
172 | * the uncompressed kernel (VO) and relocs. | |
173 | * | |
174 | * ZO's full run size sits against the end of the decompression buffer, so | |
175 | * we can calculate where text, data, bss, etc of ZO are positioned more | |
176 | * easily. | |
177 | * | |
178 | * For additional background, the decompression calculations can be found | |
179 | * in header.S, and the memory diagram is based on the one found in misc.c. | |
180 | * | |
181 | * The following conditions are already enforced by the image layouts and | |
182 | * associated code: | |
183 | * - input + input_size >= output + output_size | |
184 | * - kernel_total_size <= init_size | |
185 | * - kernel_total_size <= output_size (see Note below) | |
186 | * - output + init_size >= output + output_size | |
9dc1969c | 187 | * |
ed09acde KC |
188 | * (Note that kernel_total_size and output_size have no fundamental |
189 | * relationship, but output_size is passed to choose_random_location | |
190 | * as a maximum of the two. The diagram is showing a case where | |
191 | * kernel_total_size is larger than output_size, but this case is | |
192 | * handled by bumping output_size.) | |
9dc1969c | 193 | * |
ed09acde | 194 | * The above conditions can be illustrated by a diagram: |
9dc1969c | 195 | * |
ed09acde KC |
 * 0   output            input              input+input_size    output+init_size
 * |     |                 |                             |             |
 * |     |                 |                             |             |
 * |-----|--------|--------|--------------|-----------|--|-------------|
 *                |                       |           |
 *                |                       |           |
 * output+init_size-ZO_INIT_SIZE  output+output_size  output+kernel_total_size
9dc1969c | 203 | * |
ed09acde KC |
204 | * [output, output+init_size) is the entire memory range used for |
205 | * extracting the compressed image. | |
9dc1969c | 206 | * |
ed09acde KC |
207 | * [output, output+kernel_total_size) is the range needed for the |
208 | * uncompressed kernel (VO) and its run size (bss, brk, etc). | |
9dc1969c | 209 | * |
ed09acde KC |
210 | * [output, output+output_size) is VO plus relocs (i.e. the entire |
211 | * uncompressed payload contained by ZO). This is the area of the buffer | |
212 | * written to during decompression. | |
9dc1969c | 213 | * |
ed09acde KC |
214 | * [output+init_size-ZO_INIT_SIZE, output+init_size) is the worst-case |
215 | * range of the copied ZO and decompression code. (i.e. the range | |
216 | * covered backwards of size ZO_INIT_SIZE, starting from output+init_size.) | |
9dc1969c | 217 | * |
ed09acde KC |
218 | * [input, input+input_size) is the original copied compressed image (ZO) |
219 | * (i.e. it does not include its run size). This range must be avoided | |
220 | * because it contains the data used for decompression. | |
9dc1969c | 221 | * |
ed09acde KC |
222 | * [input+input_size, output+init_size) is [_text, _end) for ZO. This |
223 | * range includes ZO's heap and stack, and must be avoided since it | |
224 | * performs the decompression. | |
9dc1969c | 225 | * |
ed09acde KC |
226 | * Since the above two ranges need to be avoided and they are adjacent, |
227 | * they can be merged, resulting in: [input, output+init_size) which | |
228 | * becomes the MEM_AVOID_ZO_RANGE below. | |
9dc1969c | 229 | */ |
82fa9637 | 230 | static void mem_avoid_init(unsigned long input, unsigned long input_size, |
9dc1969c | 231 | unsigned long output) |
82fa9637 | 232 | { |
9dc1969c | 233 | unsigned long init_size = boot_params->hdr.init_size; |
82fa9637 KC |
234 | u64 initrd_start, initrd_size; |
235 | u64 cmd_line, cmd_line_size; | |
82fa9637 KC |
236 | char *ptr; |
237 | ||
238 | /* | |
239 | * Avoid the region that is unsafe to overlap during | |
9dc1969c | 240 | * decompression. |
82fa9637 | 241 | */ |
ed09acde KC |
242 | mem_avoid[MEM_AVOID_ZO_RANGE].start = input; |
243 | mem_avoid[MEM_AVOID_ZO_RANGE].size = (output + init_size) - input; | |
3a94707d KC |
244 | add_identity_map(mem_avoid[MEM_AVOID_ZO_RANGE].start, |
245 | mem_avoid[MEM_AVOID_ZO_RANGE].size); | |
82fa9637 KC |
246 | |
247 | /* Avoid initrd. */ | |
6655e0aa KC |
248 | initrd_start = (u64)boot_params->ext_ramdisk_image << 32; |
249 | initrd_start |= boot_params->hdr.ramdisk_image; | |
250 | initrd_size = (u64)boot_params->ext_ramdisk_size << 32; | |
251 | initrd_size |= boot_params->hdr.ramdisk_size; | |
ed09acde KC |
252 | mem_avoid[MEM_AVOID_INITRD].start = initrd_start; |
253 | mem_avoid[MEM_AVOID_INITRD].size = initrd_size; | |
3a94707d | 254 | /* No need to set mapping for initrd, it will be handled in VO. */ |
82fa9637 KC |
255 | |
256 | /* Avoid kernel command line. */ | |
6655e0aa KC |
257 | cmd_line = (u64)boot_params->ext_cmd_line_ptr << 32; |
258 | cmd_line |= boot_params->hdr.cmd_line_ptr; | |
82fa9637 KC |
259 | /* Calculate size of cmd_line. */ |
260 | ptr = (char *)(unsigned long)cmd_line; | |
261 | for (cmd_line_size = 0; ptr[cmd_line_size++]; ) | |
262 | ; | |
ed09acde KC |
263 | mem_avoid[MEM_AVOID_CMDLINE].start = cmd_line; |
264 | mem_avoid[MEM_AVOID_CMDLINE].size = cmd_line_size; | |
3a94707d KC |
265 | add_identity_map(mem_avoid[MEM_AVOID_CMDLINE].start, |
266 | mem_avoid[MEM_AVOID_CMDLINE].size); | |
82fa9637 | 267 | |
ed09acde KC |
268 | /* Avoid boot parameters. */ |
269 | mem_avoid[MEM_AVOID_BOOTPARAMS].start = (unsigned long)boot_params; | |
270 | mem_avoid[MEM_AVOID_BOOTPARAMS].size = sizeof(*boot_params); | |
3a94707d KC |
271 | add_identity_map(mem_avoid[MEM_AVOID_BOOTPARAMS].start, |
272 | mem_avoid[MEM_AVOID_BOOTPARAMS].size); | |
273 | ||
274 | /* We don't need to set a mapping for setup_data. */ | |
275 | ||
276 | #ifdef CONFIG_X86_VERBOSE_BOOTUP | |
277 | /* Make sure video RAM can be used. */ | |
278 | add_identity_map(0, PMD_SIZE); | |
279 | #endif | |
82fa9637 KC |
280 | } |
281 | ||
282 | /* Does this memory vector overlap a known avoided area? */ | |
e290e8c5 | 283 | static bool mem_avoid_overlap(struct mem_vector *img) |
82fa9637 KC |
284 | { |
285 | int i; | |
0cacbfbe | 286 | struct setup_data *ptr; |
82fa9637 KC |
287 | |
288 | for (i = 0; i < MEM_AVOID_MAX; i++) { | |
289 | if (mem_overlaps(img, &mem_avoid[i])) | |
290 | return true; | |
291 | } | |
292 | ||
0cacbfbe | 293 | /* Avoid all entries in the setup_data linked list. */ |
6655e0aa | 294 | ptr = (struct setup_data *)(unsigned long)boot_params->hdr.setup_data; |
0cacbfbe KC |
295 | while (ptr) { |
296 | struct mem_vector avoid; | |
297 | ||
20cc2888 | 298 | avoid.start = (unsigned long)ptr; |
0cacbfbe KC |
299 | avoid.size = sizeof(*ptr) + ptr->len; |
300 | ||
301 | if (mem_overlaps(img, &avoid)) | |
302 | return true; | |
303 | ||
304 | ptr = (struct setup_data *)(unsigned long)ptr->next; | |
305 | } | |
306 | ||
82fa9637 KC |
307 | return false; |
308 | } | |
309 | ||
e8581e3d | 310 | static unsigned long slots[KERNEL_IMAGE_SIZE / CONFIG_PHYSICAL_ALIGN]; |
e290e8c5 | 311 | static unsigned long slot_max; |
82fa9637 KC |
312 | |
313 | static void slots_append(unsigned long addr) | |
314 | { | |
315 | /* Overflowing the slots list should be impossible. */ | |
e8581e3d | 316 | if (slot_max >= KERNEL_IMAGE_SIZE / CONFIG_PHYSICAL_ALIGN) |
82fa9637 KC |
317 | return; |
318 | ||
319 | slots[slot_max++] = addr; | |
320 | } | |
321 | ||
322 | static unsigned long slots_fetch_random(void) | |
323 | { | |
324 | /* Handle case of no slots stored. */ | |
325 | if (slot_max == 0) | |
326 | return 0; | |
327 | ||
328 | return slots[get_random_long() % slot_max]; | |
329 | } | |
330 | ||
331 | static void process_e820_entry(struct e820entry *entry, | |
332 | unsigned long minimum, | |
333 | unsigned long image_size) | |
334 | { | |
335 | struct mem_vector region, img; | |
336 | ||
337 | /* Skip non-RAM entries. */ | |
338 | if (entry->type != E820_RAM) | |
339 | return; | |
340 | ||
341 | /* Ignore entries entirely above our maximum. */ | |
e8581e3d | 342 | if (entry->addr >= KERNEL_IMAGE_SIZE) |
82fa9637 KC |
343 | return; |
344 | ||
345 | /* Ignore entries entirely below our minimum. */ | |
346 | if (entry->addr + entry->size < minimum) | |
347 | return; | |
348 | ||
349 | region.start = entry->addr; | |
350 | region.size = entry->size; | |
351 | ||
352 | /* Potentially raise address to minimum location. */ | |
353 | if (region.start < minimum) | |
354 | region.start = minimum; | |
355 | ||
356 | /* Potentially raise address to meet alignment requirements. */ | |
357 | region.start = ALIGN(region.start, CONFIG_PHYSICAL_ALIGN); | |
358 | ||
359 | /* Did we raise the address above the bounds of this e820 region? */ | |
360 | if (region.start > entry->addr + entry->size) | |
361 | return; | |
362 | ||
363 | /* Reduce size by any delta from the original address. */ | |
364 | region.size -= region.start - entry->addr; | |
365 | ||
366 | /* Reduce maximum size to fit end of image within maximum limit. */ | |
e8581e3d BH |
367 | if (region.start + region.size > KERNEL_IMAGE_SIZE) |
368 | region.size = KERNEL_IMAGE_SIZE - region.start; | |
82fa9637 KC |
369 | |
370 | /* Walk each aligned slot and check for avoided areas. */ | |
371 | for (img.start = region.start, img.size = image_size ; | |
372 | mem_contains(®ion, &img) ; | |
373 | img.start += CONFIG_PHYSICAL_ALIGN) { | |
374 | if (mem_avoid_overlap(&img)) | |
375 | continue; | |
376 | slots_append(img.start); | |
377 | } | |
378 | } | |
379 | ||
380 | static unsigned long find_random_addr(unsigned long minimum, | |
381 | unsigned long size) | |
382 | { | |
383 | int i; | |
384 | unsigned long addr; | |
385 | ||
386 | /* Make sure minimum is aligned. */ | |
387 | minimum = ALIGN(minimum, CONFIG_PHYSICAL_ALIGN); | |
388 | ||
389 | /* Verify potential e820 positions, appending to slots list. */ | |
6655e0aa KC |
390 | for (i = 0; i < boot_params->e820_entries; i++) { |
391 | process_e820_entry(&boot_params->e820_map[i], minimum, size); | |
82fa9637 KC |
392 | } |
393 | ||
394 | return slots_fetch_random(); | |
395 | } | |
396 | ||
549f90db BP |
397 | /* |
398 | * Since this function examines addresses much more numerically, | |
399 | * it takes the input and output pointers as 'unsigned long'. | |
400 | */ | |
401 | unsigned char *choose_random_location(unsigned long input, | |
8ab3820f | 402 | unsigned long input_size, |
549f90db | 403 | unsigned long output, |
8ab3820f KC |
404 | unsigned long output_size) |
405 | { | |
2bc1cd39 | 406 | unsigned long choice = output; |
9016875d | 407 | unsigned long random_addr; |
8ab3820f | 408 | |
24f2e027 KC |
409 | #ifdef CONFIG_HIBERNATION |
410 | if (!cmdline_find_option_bool("kaslr")) { | |
0f8ede1b | 411 | warn("KASLR disabled: 'kaslr' not on cmdline (hibernation selected)."); |
24f2e027 KC |
412 | goto out; |
413 | } | |
414 | #else | |
8ab3820f | 415 | if (cmdline_find_option_bool("nokaslr")) { |
0f8ede1b | 416 | warn("KASLR disabled: 'nokaslr' on cmdline."); |
8ab3820f KC |
417 | goto out; |
418 | } | |
24f2e027 | 419 | #endif |
8ab3820f | 420 | |
6655e0aa | 421 | boot_params->hdr.loadflags |= KASLR_FLAG; |
78cac48c | 422 | |
82fa9637 | 423 | /* Record the various known unsafe memory ranges. */ |
9dc1969c | 424 | mem_avoid_init(input, input_size, output); |
82fa9637 KC |
425 | |
426 | /* Walk e820 and find a random address. */ | |
2bc1cd39 | 427 | random_addr = find_random_addr(output, output_size); |
9016875d | 428 | if (!random_addr) { |
0f8ede1b | 429 | warn("KASLR disabled: could not find suitable E820 region!"); |
82fa9637 KC |
430 | goto out; |
431 | } | |
432 | ||
433 | /* Always enforce the minimum. */ | |
9016875d | 434 | if (random_addr < choice) |
82fa9637 | 435 | goto out; |
8ab3820f | 436 | |
9016875d | 437 | choice = random_addr; |
3a94707d KC |
438 | |
439 | add_identity_map(choice, output_size); | |
36a39ac9 BP |
440 | |
441 | /* This actually loads the identity pagetable on x86_64. */ | |
3a94707d | 442 | finalize_identity_maps(); |
8ab3820f KC |
443 | out: |
444 | return (unsigned char *)choice; | |
445 | } |