/*
 * kexec.c - kexec system call
 * Copyright (C) 2002-2004 Eric Biederman <ebiederm@xmission.com>
 *
 * This source code is licensed under the GNU General Public License,
 * Version 2. See the file COPYING for more details.
 */

#define pr_fmt(fmt)	"kexec: " fmt

#include <linux/capability.h>
#include <linux/mm.h>
#include <linux/file.h>
#include <linux/slab.h>
#include <linux/fs.h>
#include <linux/kexec.h>
#include <linux/mutex.h>
#include <linux/list.h>
#include <linux/highmem.h>
#include <linux/syscalls.h>
#include <linux/reboot.h>
#include <linux/ioport.h>
#include <linux/hardirq.h>
#include <linux/elf.h>
#include <linux/elfcore.h>
#include <linux/utsname.h>
#include <linux/numa.h>
#include <linux/suspend.h>
#include <linux/device.h>
#include <linux/freezer.h>
#include <linux/pm.h>
#include <linux/cpu.h>
#include <linux/console.h>
#include <linux/vmalloc.h>
#include <linux/swap.h>
#include <linux/syscore_ops.h>
#include <linux/compiler.h>
#include <linux/hugetlb.h>

#include <asm/page.h>
#include <asm/uaccess.h>
#include <asm/io.h>
#include <asm/sections.h>

/* Per cpu memory for storing cpu states in case of system crash. */
note_buf_t __percpu *crash_notes;

/* vmcoreinfo stuff */
static unsigned char vmcoreinfo_data[VMCOREINFO_BYTES];
u32 vmcoreinfo_note[VMCOREINFO_NOTE_SIZE/4];
size_t vmcoreinfo_size;
size_t vmcoreinfo_max_size = sizeof(vmcoreinfo_data);

/* Flag to indicate we are going to kexec a new kernel */
bool kexec_in_progress = false;

/* Location of the reserved area for the crash kernel */
struct resource crashk_res = {
	.name  = "Crash kernel",
	.start = 0,
	.end   = 0,
	.flags = IORESOURCE_BUSY | IORESOURCE_MEM
};
struct resource crashk_low_res = {
	.name  = "Crash kernel",
	.start = 0,
	.end   = 0,
	.flags = IORESOURCE_BUSY | IORESOURCE_MEM
};

int kexec_should_crash(struct task_struct *p)
{
	if (in_interrupt() || !p->pid || is_global_init(p) || panic_on_oops)
		return 1;
	return 0;
}

/*
 * When kexec transitions to the new kernel there is a one-to-one
 * mapping between physical and virtual addresses. On processors
 * where you can disable the MMU this is trivial, and easy. For
 * others it is still a simple predictable page table to setup.
 *
 * In that environment kexec copies the new kernel to its final
 * resting place. This means I can only support memory whose
 * physical address can fit in an unsigned long. In particular
 * addresses where (pfn << PAGE_SHIFT) > ULONG_MAX cannot be handled.
 * If the assembly stub has more restrictive requirements
 * KEXEC_SOURCE_MEMORY_LIMIT and KEXEC_DEST_MEMORY_LIMIT can be
 * defined more restrictively in <asm/kexec.h>.
 *
 * The code for the transition from the current kernel to the new
 * kernel is placed in the control_code_buffer, whose size
 * is given by KEXEC_CONTROL_PAGE_SIZE. In the best case only a single
 * page of memory is necessary, but some architectures require more.
 * Because this memory must be identity mapped in the transition from
 * virtual to physical addresses it must live in the range
 * 0 - TASK_SIZE, as only the user space mappings are arbitrarily
 * modifiable.
 *
 * The assembly stub in the control code buffer is passed a linked list
 * of descriptor pages detailing the source pages of the new kernel,
 * and the destination addresses of those source pages. As this data
 * structure is not used in the context of the current OS, it must
 * be self-contained.
 *
 * The code has been made to work with highmem pages and will use a
 * destination page in its final resting place (if it happens
 * to allocate it). The end product of this is that most of the
 * physical address space, and most of RAM can be used.
 *
 * Future directions include:
 *  - allocating a page table with the control code buffer identity
 *    mapped, to simplify machine_kexec and make kexec_on_panic more
 *    reliable.
 */
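
/*
 * A rough sketch of the load path implemented below (illustrative,
 * not a contract):
 *
 *	kimage_alloc_init()	allocate the struct kimage and control pages
 *	kimage_load_segment()	copy each segment into scattered source pages
 *	kimage_terminate()	close the entry list with IND_DONE
 *	machine_kexec()		(arch code) later runs the control code buffer
 */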

/*
 * KIMAGE_NO_DEST is an impossible destination address, for
 * allocating pages whose destination address we do not care about.
 */
#define KIMAGE_NO_DEST (-1UL)

static int kimage_is_destination_range(struct kimage *image,
				       unsigned long start, unsigned long end);
static struct page *kimage_alloc_page(struct kimage *image,
				       gfp_t gfp_mask,
				       unsigned long dest);

static int copy_user_segment_list(struct kimage *image,
				  unsigned long nr_segments,
				  struct kexec_segment __user *segments)
{
	int ret;
	size_t segment_bytes;

	/* Read in the segments */
	image->nr_segments = nr_segments;
	segment_bytes = nr_segments * sizeof(*segments);
	ret = copy_from_user(image->segment, segments, segment_bytes);
	if (ret)
		ret = -EFAULT;

	return ret;
}

static int sanity_check_segment_list(struct kimage *image)
{
	int result, i;
	unsigned long nr_segments = image->nr_segments;

	/*
	 * Verify we have good destination addresses. The caller is
	 * responsible for making certain we don't attempt to load
	 * the new image into invalid or reserved areas of RAM. This
	 * just verifies it is an address we can use.
	 *
	 * Since the kernel does everything in page size chunks ensure
	 * the destination addresses are page aligned. Too many
	 * special cases crop up when we don't do this. The most
	 * insidious is getting overlapping destination addresses
	 * simply because addresses are changed to page size
	 * granularity.
	 */
	result = -EADDRNOTAVAIL;
	for (i = 0; i < nr_segments; i++) {
		unsigned long mstart, mend;

		mstart = image->segment[i].mem;
		mend   = mstart + image->segment[i].memsz;
		if ((mstart & ~PAGE_MASK) || (mend & ~PAGE_MASK))
			return result;
		if (mend >= KEXEC_DESTINATION_MEMORY_LIMIT)
			return result;
	}

	/* Verify our destination addresses do not overlap.
	 * If we allowed overlapping destination addresses
	 * through, very weird things can happen with no
	 * easy explanation as one segment stops on another.
	 */
	result = -EINVAL;
	for (i = 0; i < nr_segments; i++) {
		unsigned long mstart, mend;
		unsigned long j;

		mstart = image->segment[i].mem;
		mend   = mstart + image->segment[i].memsz;
		for (j = 0; j < i; j++) {
			unsigned long pstart, pend;
			pstart = image->segment[j].mem;
			pend   = pstart + image->segment[j].memsz;
			/* Do the segments overlap ? */
			if ((mend > pstart) && (mstart < pend))
				return result;
		}
	}

	/* Ensure our buffer sizes are strictly less than
	 * our memory sizes. This should always be the case,
	 * and it is easier to check up front than to be surprised
	 * later on.
	 */
	result = -EINVAL;
	for (i = 0; i < nr_segments; i++) {
		if (image->segment[i].bufsz > image->segment[i].memsz)
			return result;
	}

	/*
	 * Verify we have good destination addresses. Normally
	 * the caller is responsible for making certain we don't
	 * attempt to load the new image into invalid or reserved
	 * areas of RAM. But crash kernels are preloaded into a
	 * reserved area of ram. We must ensure the addresses
	 * are in the reserved area otherwise preloading the
	 * kernel could corrupt things.
	 */

	if (image->type == KEXEC_TYPE_CRASH) {
		result = -EADDRNOTAVAIL;
		for (i = 0; i < nr_segments; i++) {
			unsigned long mstart, mend;

			mstart = image->segment[i].mem;
			mend = mstart + image->segment[i].memsz - 1;
			/* Ensure we are within the crash kernel limits */
			if ((mstart < crashk_res.start) ||
			    (mend > crashk_res.end))
				return result;
		}
	}

	return 0;
}

static struct kimage *do_kimage_alloc_init(void)
{
	struct kimage *image;

	/* Allocate a controlling structure */
	image = kzalloc(sizeof(*image), GFP_KERNEL);
	if (!image)
		return NULL;

	image->head = 0;
	image->entry = &image->head;
	image->last_entry = &image->head;
	image->control_page = ~0; /* By default this does not apply */
	image->type = KEXEC_TYPE_DEFAULT;

	/* Initialize the list of control pages */
	INIT_LIST_HEAD(&image->control_pages);

	/* Initialize the list of destination pages */
	INIT_LIST_HEAD(&image->dest_pages);

	/* Initialize the list of unusable pages */
	INIT_LIST_HEAD(&image->unusable_pages);

	return image;
}

static void kimage_free_page_list(struct list_head *list);

static int kimage_alloc_init(struct kimage **rimage, unsigned long entry,
			     unsigned long nr_segments,
			     struct kexec_segment __user *segments,
			     unsigned long flags)
{
	int ret;
	struct kimage *image;
	bool kexec_on_panic = flags & KEXEC_ON_CRASH;

	if (kexec_on_panic) {
		/* Verify we have a valid entry point */
		if ((entry < crashk_res.start) || (entry > crashk_res.end))
			return -EADDRNOTAVAIL;
	}

	/* Allocate and initialize a controlling structure */
	image = do_kimage_alloc_init();
	if (!image)
		return -ENOMEM;

	image->start = entry;

	ret = copy_user_segment_list(image, nr_segments, segments);
	if (ret)
		goto out_free_image;

	ret = sanity_check_segment_list(image);
	if (ret)
		goto out_free_image;

	/* Enable the special crash kernel control page allocation policy. */
	if (kexec_on_panic) {
		image->control_page = crashk_res.start;
		image->type = KEXEC_TYPE_CRASH;
	}

	/*
	 * Find a location for the control code buffer, and add it to
	 * the vector of segments so that its pages will also be
	 * counted as destination pages.
	 */
	ret = -ENOMEM;
	image->control_code_page = kimage_alloc_control_pages(image,
					   get_order(KEXEC_CONTROL_PAGE_SIZE));
	if (!image->control_code_page) {
		pr_err("Could not allocate control_code_buffer\n");
		goto out_free_image;
	}

	if (!kexec_on_panic) {
		image->swap_page = kimage_alloc_control_pages(image, 0);
		if (!image->swap_page) {
			pr_err("Could not allocate swap buffer\n");
			goto out_free_control_pages;
		}
	}

	*rimage = image;
	return 0;
out_free_control_pages:
	kimage_free_page_list(&image->control_pages);
out_free_image:
	kfree(image);
	return ret;
}

static int copy_file_from_fd(int fd, void **buf, unsigned long *buf_len)
{
	struct fd f = fdget(fd);
	int ret;
	struct kstat stat;
	loff_t pos;
	ssize_t bytes = 0;

	if (!f.file)
		return -EBADF;

	ret = vfs_getattr(&f.file->f_path, &stat);
	if (ret)
		goto out;

	if (stat.size > INT_MAX) {
		ret = -EFBIG;
		goto out;
	}

	/* Don't hand 0 to vmalloc, it whines. */
	if (stat.size == 0) {
		ret = -EINVAL;
		goto out;
	}

	*buf = vmalloc(stat.size);
	if (!*buf) {
		ret = -ENOMEM;
		goto out;
	}

	pos = 0;
	while (pos < stat.size) {
		bytes = kernel_read(f.file, pos, (char *)(*buf) + pos,
				    stat.size - pos);
		if (bytes < 0) {
			vfree(*buf);
			ret = bytes;
			goto out;
		}

		if (bytes == 0)
			break;
		pos += bytes;
	}

	if (pos != stat.size) {
		ret = -EBADF;
		vfree(*buf);
		goto out;
	}

	*buf_len = pos;
out:
	fdput(f);
	return ret;
}

/* Architectures can provide this probe function */
int __weak arch_kexec_kernel_image_probe(struct kimage *image, void *buf,
					 unsigned long buf_len)
{
	return -ENOEXEC;
}

void * __weak arch_kexec_kernel_image_load(struct kimage *image)
{
	return ERR_PTR(-ENOEXEC);
}

void __weak arch_kimage_file_post_load_cleanup(struct kimage *image)
{
}

/*
 * Free up memory used by kernel, initrd, and command line. These are
 * temporary allocations which are not needed any more after the buffers
 * have been loaded into separate segments and have been copied elsewhere.
 */
static void kimage_file_post_load_cleanup(struct kimage *image)
{
	vfree(image->kernel_buf);
	image->kernel_buf = NULL;

	vfree(image->initrd_buf);
	image->initrd_buf = NULL;

	kfree(image->cmdline_buf);
	image->cmdline_buf = NULL;

	/* See if architecture has anything to cleanup post load */
	arch_kimage_file_post_load_cleanup(image);
}

/*
 * In file mode the list of segments is prepared by the kernel. Copy
 * relevant data from user space, do error checking, and prepare the
 * segment list.
 */
static int
kimage_file_prepare_segments(struct kimage *image, int kernel_fd, int initrd_fd,
			     const char __user *cmdline_ptr,
			     unsigned long cmdline_len, unsigned flags)
{
	int ret = 0;
	void *ldata;

	ret = copy_file_from_fd(kernel_fd, &image->kernel_buf,
				&image->kernel_buf_len);
	if (ret)
		return ret;

	/* Call arch image probe handlers */
	ret = arch_kexec_kernel_image_probe(image, image->kernel_buf,
					    image->kernel_buf_len);

	if (ret)
		goto out;

	/* It is possible that no initramfs is being loaded */
	if (!(flags & KEXEC_FILE_NO_INITRAMFS)) {
		ret = copy_file_from_fd(initrd_fd, &image->initrd_buf,
					&image->initrd_buf_len);
		if (ret)
			goto out;
	}

	if (cmdline_len) {
		image->cmdline_buf = kzalloc(cmdline_len, GFP_KERNEL);
		if (!image->cmdline_buf) {
			ret = -ENOMEM;
			goto out;
		}

		ret = copy_from_user(image->cmdline_buf, cmdline_ptr,
				     cmdline_len);
		if (ret) {
			ret = -EFAULT;
			goto out;
		}

		image->cmdline_buf_len = cmdline_len;

		/* command line should be a NUL terminated string */
		if (image->cmdline_buf[cmdline_len - 1] != '\0') {
			ret = -EINVAL;
			goto out;
		}
	}

	/* Call arch image load handlers */
	ldata = arch_kexec_kernel_image_load(image);

	if (IS_ERR(ldata)) {
		ret = PTR_ERR(ldata);
		goto out;
	}

	image->image_loader_data = ldata;
out:
	/* In case of error, free up all allocated memory in this function */
	if (ret)
		kimage_file_post_load_cleanup(image);
	return ret;
}

static int
kimage_file_alloc_init(struct kimage **rimage, int kernel_fd,
		       int initrd_fd, const char __user *cmdline_ptr,
		       unsigned long cmdline_len, unsigned long flags)
{
	int ret;
	struct kimage *image;

	image = do_kimage_alloc_init();
	if (!image)
		return -ENOMEM;

	image->file_mode = 1;

	ret = kimage_file_prepare_segments(image, kernel_fd, initrd_fd,
					   cmdline_ptr, cmdline_len, flags);
	if (ret)
		goto out_free_image;

	ret = sanity_check_segment_list(image);
	if (ret)
		goto out_free_post_load_bufs;

	ret = -ENOMEM;
	image->control_code_page = kimage_alloc_control_pages(image,
					   get_order(KEXEC_CONTROL_PAGE_SIZE));
	if (!image->control_code_page) {
		pr_err("Could not allocate control_code_buffer\n");
		goto out_free_post_load_bufs;
	}

	image->swap_page = kimage_alloc_control_pages(image, 0);
	if (!image->swap_page) {
		pr_err("Could not allocate swap buffer\n");
		goto out_free_control_pages;
	}

	*rimage = image;
	return 0;
out_free_control_pages:
	kimage_free_page_list(&image->control_pages);
out_free_post_load_bufs:
	kimage_file_post_load_cleanup(image);
	kfree(image->image_loader_data);
out_free_image:
	kfree(image);
	return ret;
}

static int kimage_is_destination_range(struct kimage *image,
				       unsigned long start,
				       unsigned long end)
{
	unsigned long i;

	for (i = 0; i < image->nr_segments; i++) {
		unsigned long mstart, mend;

		mstart = image->segment[i].mem;
		mend = mstart + image->segment[i].memsz;
		if ((end > mstart) && (start < mend))
			return 1;
	}

	return 0;
}

static struct page *kimage_alloc_pages(gfp_t gfp_mask, unsigned int order)
{
	struct page *pages;

	pages = alloc_pages(gfp_mask, order);
	if (pages) {
		unsigned int count, i;
		pages->mapping = NULL;
		set_page_private(pages, order);
		count = 1 << order;
		for (i = 0; i < count; i++)
			SetPageReserved(pages + i);
	}

	return pages;
}

static void kimage_free_pages(struct page *page)
{
	unsigned int order, count, i;

	order = page_private(page);
	count = 1 << order;
	for (i = 0; i < count; i++)
		ClearPageReserved(page + i);
	__free_pages(page, order);
}

static void kimage_free_page_list(struct list_head *list)
{
	struct list_head *pos, *next;

	list_for_each_safe(pos, next, list) {
		struct page *page;

		page = list_entry(pos, struct page, lru);
		list_del(&page->lru);
		kimage_free_pages(page);
	}
}

static struct page *kimage_alloc_normal_control_pages(struct kimage *image,
						       unsigned int order)
{
	/* Control pages are special, they are the intermediaries
	 * that are needed while we copy the rest of the pages
	 * to their final resting place. As such they must
	 * not conflict with either the destination addresses
	 * or memory the kernel is already using.
	 *
	 * The only case where we really need more than one of
	 * these are for architectures where we cannot disable
	 * the MMU and must instead generate an identity mapped
	 * page table for all of the memory.
	 *
	 * At worst this runs in O(N) of the image size.
	 */
	struct list_head extra_pages;
	struct page *pages;
	unsigned int count;

	count = 1 << order;
	INIT_LIST_HEAD(&extra_pages);

	/* Loop while I can allocate a page and the page allocated
	 * is a destination page.
	 */
	do {
		unsigned long pfn, epfn, addr, eaddr;

		pages = kimage_alloc_pages(GFP_KERNEL, order);
		if (!pages)
			break;
		pfn   = page_to_pfn(pages);
		epfn  = pfn + count;
		addr  = pfn << PAGE_SHIFT;
		eaddr = epfn << PAGE_SHIFT;
		if ((epfn >= (KEXEC_CONTROL_MEMORY_LIMIT >> PAGE_SHIFT)) ||
			      kimage_is_destination_range(image, addr, eaddr)) {
			list_add(&pages->lru, &extra_pages);
			pages = NULL;
		}
	} while (!pages);

	if (pages) {
		/* Remember the allocated page... */
		list_add(&pages->lru, &image->control_pages);

		/* Because the page is already in its destination
		 * location we will never allocate another page at
		 * that address. Therefore kimage_alloc_pages
		 * will not return it (again) and we don't need
		 * to give it an entry in image->segment[].
		 */
	}
	/* Deal with the destination pages I have inadvertently allocated.
	 *
	 * Ideally I would convert multi-page allocations into single
	 * page allocations, and add everything to image->dest_pages.
	 *
	 * For now it is simpler to just free the pages.
	 */
	kimage_free_page_list(&extra_pages);

	return pages;
}

static struct page *kimage_alloc_crash_control_pages(struct kimage *image,
						      unsigned int order)
{
	/* Control pages are special, they are the intermediaries
	 * that are needed while we copy the rest of the pages
	 * to their final resting place. As such they must
	 * not conflict with either the destination addresses
	 * or memory the kernel is already using.
	 *
	 * Control pages are also the only pages we must allocate
	 * when loading a crash kernel. All of the other pages
	 * are specified by the segments and we just memcpy
	 * into them directly.
	 *
	 * The only case where we really need more than one of
	 * these are for architectures where we cannot disable
	 * the MMU and must instead generate an identity mapped
	 * page table for all of the memory.
	 *
	 * Given the low demand this implements a very simple
	 * allocator that finds the first hole of the appropriate
	 * size in the reserved memory region, and allocates all
	 * of the memory up to and including the hole.
	 */
	unsigned long hole_start, hole_end, size;
	struct page *pages;

	pages = NULL;
	size = (1 << order) << PAGE_SHIFT;
	hole_start = (image->control_page + (size - 1)) & ~(size - 1);
	hole_end   = hole_start + size - 1;
	while (hole_end <= crashk_res.end) {
		unsigned long i;

		if (hole_end > KEXEC_CRASH_CONTROL_MEMORY_LIMIT)
			break;
		/* See if I overlap any of the segments */
		for (i = 0; i < image->nr_segments; i++) {
			unsigned long mstart, mend;

			mstart = image->segment[i].mem;
			mend   = mstart + image->segment[i].memsz - 1;
			if ((hole_end >= mstart) && (hole_start <= mend)) {
				/* Advance the hole to the end of the segment */
				hole_start = (mend + (size - 1)) & ~(size - 1);
				hole_end   = hole_start + size - 1;
				break;
			}
		}
		/* If I don't overlap any segments I have found my hole! */
		if (i == image->nr_segments) {
			pages = pfn_to_page(hole_start >> PAGE_SHIFT);
			break;
		}
	}
	if (pages)
		image->control_page = hole_end;

	return pages;
}


struct page *kimage_alloc_control_pages(struct kimage *image,
					 unsigned int order)
{
	struct page *pages = NULL;

	switch (image->type) {
	case KEXEC_TYPE_DEFAULT:
		pages = kimage_alloc_normal_control_pages(image, order);
		break;
	case KEXEC_TYPE_CRASH:
		pages = kimage_alloc_crash_control_pages(image, order);
		break;
	}

	return pages;
}

static int kimage_add_entry(struct kimage *image, kimage_entry_t entry)
{
	if (*image->entry != 0)
		image->entry++;

	if (image->entry == image->last_entry) {
		kimage_entry_t *ind_page;
		struct page *page;

		page = kimage_alloc_page(image, GFP_KERNEL, KIMAGE_NO_DEST);
		if (!page)
			return -ENOMEM;

		ind_page = page_address(page);
		*image->entry = virt_to_phys(ind_page) | IND_INDIRECTION;
		image->entry = ind_page;
		image->last_entry = ind_page +
				      ((PAGE_SIZE/sizeof(kimage_entry_t)) - 1);
	}
	*image->entry = entry;
	image->entry++;
	*image->entry = 0;

	return 0;
}

static int kimage_set_destination(struct kimage *image,
				   unsigned long destination)
{
	int result;

	destination &= PAGE_MASK;
	result = kimage_add_entry(image, destination | IND_DESTINATION);
	if (result == 0)
		image->destination = destination;

	return result;
}


static int kimage_add_page(struct kimage *image, unsigned long page)
{
	int result;

	page &= PAGE_MASK;
	result = kimage_add_entry(image, page | IND_SOURCE);
	if (result == 0)
		image->destination += PAGE_SIZE;

	return result;
}


static void kimage_free_extra_pages(struct kimage *image)
{
	/* Walk through and free any extra destination pages I may have */
	kimage_free_page_list(&image->dest_pages);

	/* Walk through and free any unusable pages I have cached */
	kimage_free_page_list(&image->unusable_pages);

}
static void kimage_terminate(struct kimage *image)
{
	if (*image->entry != 0)
		image->entry++;

	*image->entry = IND_DONE;
}

#define for_each_kimage_entry(image, ptr, entry) \
	for (ptr = &image->head; (entry = *ptr) && !(entry & IND_DONE); \
		ptr = (entry & IND_INDIRECTION) ? \
			phys_to_virt((entry & PAGE_MASK)) : ptr + 1)
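
/*
 * Illustrative example of the entry list built by kimage_set_destination(),
 * kimage_add_page() and kimage_terminate(), and walked by
 * for_each_kimage_entry(). A two page image destined for 0x100000
 * (addresses made up) would be encoded as:
 *
 *	0x0000000000100000 | IND_DESTINATION
 *	<phys of source 0> | IND_SOURCE
 *	<phys of source 1> | IND_SOURCE
 *	IND_DONE
 *
 * An IND_INDIRECTION entry points to the next page of entries, which is
 * how the list grows beyond a single page.
 */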

static void kimage_free_entry(kimage_entry_t entry)
{
	struct page *page;

	page = pfn_to_page(entry >> PAGE_SHIFT);
	kimage_free_pages(page);
}

static void kimage_free(struct kimage *image)
{
	kimage_entry_t *ptr, entry;
	kimage_entry_t ind = 0;

	if (!image)
		return;

	kimage_free_extra_pages(image);
	for_each_kimage_entry(image, ptr, entry) {
		if (entry & IND_INDIRECTION) {
			/* Free the previous indirection page */
			if (ind & IND_INDIRECTION)
				kimage_free_entry(ind);
			/* Save this indirection page until we are
			 * done with it.
			 */
			ind = entry;
		} else if (entry & IND_SOURCE)
			kimage_free_entry(entry);
	}
	/* Free the final indirection page */
	if (ind & IND_INDIRECTION)
		kimage_free_entry(ind);

	/* Handle any machine specific cleanup */
	machine_kexec_cleanup(image);

	/* Free the kexec control pages... */
	kimage_free_page_list(&image->control_pages);

	kfree(image->image_loader_data);

	/*
	 * Free up any temporary buffers allocated. This might be reached
	 * if an error occurred much later, after buffer allocation.
	 */
	if (image->file_mode)
		kimage_file_post_load_cleanup(image);

	kfree(image);
}

static kimage_entry_t *kimage_dst_used(struct kimage *image,
				       unsigned long page)
{
	kimage_entry_t *ptr, entry;
	unsigned long destination = 0;

	for_each_kimage_entry(image, ptr, entry) {
		if (entry & IND_DESTINATION)
			destination = entry & PAGE_MASK;
		else if (entry & IND_SOURCE) {
			if (page == destination)
				return ptr;
			destination += PAGE_SIZE;
		}
	}

	return NULL;
}

static struct page *kimage_alloc_page(struct kimage *image,
					gfp_t gfp_mask,
					unsigned long destination)
{
	/*
	 * Here we implement safeguards to ensure that a source page
	 * is not copied to its destination page before the data on
	 * the destination page is no longer useful.
	 *
	 * To do this we maintain the invariant that a source page is
	 * either its own destination page, or it is not a
	 * destination page at all.
	 *
	 * That is slightly stronger than required, but the proof
	 * that no problems will occur is trivial, and the
	 * implementation is simple to verify.
	 *
	 * When allocating all pages normally this algorithm will run
	 * in O(N) time, but in the worst case it will run in O(N^2)
	 * time. If the runtime is a problem the data structures can
	 * be fixed.
	 */
	struct page *page;
	unsigned long addr;

	/*
	 * Walk through the list of destination pages, and see if I
	 * have a match.
	 */
	list_for_each_entry(page, &image->dest_pages, lru) {
		addr = page_to_pfn(page) << PAGE_SHIFT;
		if (addr == destination) {
			list_del(&page->lru);
			return page;
		}
	}
	page = NULL;
	while (1) {
		kimage_entry_t *old;

		/* Allocate a page, if we run out of memory give up */
		page = kimage_alloc_pages(gfp_mask, 0);
		if (!page)
			return NULL;
		/* If the page cannot be used, file it away */
		if (page_to_pfn(page) >
				(KEXEC_SOURCE_MEMORY_LIMIT >> PAGE_SHIFT)) {
			list_add(&page->lru, &image->unusable_pages);
			continue;
		}
		addr = page_to_pfn(page) << PAGE_SHIFT;

		/* If it is the destination page we want use it */
		if (addr == destination)
			break;

		/* If the page is not a destination page use it */
		if (!kimage_is_destination_range(image, addr,
						  addr + PAGE_SIZE))
			break;

		/*
		 * I know that the page is someone's destination page.
		 * See if there is already a source page for this
		 * destination page. And if so swap the source pages.
		 */
		old = kimage_dst_used(image, addr);
		if (old) {
			/* If so move it */
			unsigned long old_addr;
			struct page *old_page;

			old_addr = *old & PAGE_MASK;
			old_page = pfn_to_page(old_addr >> PAGE_SHIFT);
			copy_highpage(page, old_page);
			*old = addr | (*old & ~PAGE_MASK);

			/* The old page I have found cannot be a
			 * destination page, so return it if its
			 * gfp_flags honor the ones passed in.
			 */
			if (!(gfp_mask & __GFP_HIGHMEM) &&
			    PageHighMem(old_page)) {
				kimage_free_pages(old_page);
				continue;
			}
			addr = old_addr;
			page = old_page;
			break;
		} else {
			/* Place the page on the destination list; I
			 * will use it later.
			 */
			list_add(&page->lru, &image->dest_pages);
		}
	}

	return page;
}

static int kimage_load_normal_segment(struct kimage *image,
					 struct kexec_segment *segment)
{
	unsigned long maddr;
	size_t ubytes, mbytes;
	int result;
	unsigned char __user *buf = NULL;
	unsigned char *kbuf = NULL;

	result = 0;
	if (image->file_mode)
		kbuf = segment->kbuf;
	else
		buf = segment->buf;
	ubytes = segment->bufsz;
	mbytes = segment->memsz;
	maddr = segment->mem;

	result = kimage_set_destination(image, maddr);
	if (result < 0)
		goto out;

	while (mbytes) {
		struct page *page;
		char *ptr;
		size_t uchunk, mchunk;

		page = kimage_alloc_page(image, GFP_HIGHUSER, maddr);
		if (!page) {
			result  = -ENOMEM;
			goto out;
		}
		result = kimage_add_page(image, page_to_pfn(page)
								<< PAGE_SHIFT);
		if (result < 0)
			goto out;

		ptr = kmap(page);
		/* Start with a clear page */
		clear_page(ptr);
		ptr += maddr & ~PAGE_MASK;
		mchunk = min_t(size_t, mbytes,
				PAGE_SIZE - (maddr & ~PAGE_MASK));
		uchunk = min(ubytes, mchunk);

		/* For file based kexec, source pages are in kernel memory */
		if (image->file_mode)
			memcpy(ptr, kbuf, uchunk);
		else
			result = copy_from_user(ptr, buf, uchunk);
		kunmap(page);
		if (result) {
			result = -EFAULT;
			goto out;
		}
		ubytes -= uchunk;
		maddr  += mchunk;
		if (image->file_mode)
			kbuf += mchunk;
		else
			buf += mchunk;
		mbytes -= mchunk;
	}
out:
	return result;
}

static int kimage_load_crash_segment(struct kimage *image,
					struct kexec_segment *segment)
{
	/* For crash dump kernels we simply copy the data from
	 * user space to its destination.
	 * We do things a page at a time for the sake of kmap.
	 */
	unsigned long maddr;
	size_t ubytes, mbytes;
	int result;
	unsigned char __user *buf;

	result = 0;
	buf = segment->buf;
	ubytes = segment->bufsz;
	mbytes = segment->memsz;
	maddr = segment->mem;
	while (mbytes) {
		struct page *page;
		char *ptr;
		size_t uchunk, mchunk;

		page = pfn_to_page(maddr >> PAGE_SHIFT);
		if (!page) {
			result  = -ENOMEM;
			goto out;
		}
		ptr = kmap(page);
		ptr += maddr & ~PAGE_MASK;
		mchunk = min_t(size_t, mbytes,
				PAGE_SIZE - (maddr & ~PAGE_MASK));
		uchunk = min(ubytes, mchunk);
		if (mchunk > uchunk) {
			/* Zero the trailing part of the page */
			memset(ptr + uchunk, 0, mchunk - uchunk);
		}
		result = copy_from_user(ptr, buf, uchunk);
		kexec_flush_icache_page(page);
		kunmap(page);
		if (result) {
			result = -EFAULT;
			goto out;
		}
		ubytes -= uchunk;
		maddr  += mchunk;
		buf    += mchunk;
		mbytes -= mchunk;
	}
out:
	return result;
}

static int kimage_load_segment(struct kimage *image,
				struct kexec_segment *segment)
{
	int result = -ENOMEM;

	switch (image->type) {
	case KEXEC_TYPE_DEFAULT:
		result = kimage_load_normal_segment(image, segment);
		break;
	case KEXEC_TYPE_CRASH:
		result = kimage_load_crash_segment(image, segment);
		break;
	}

	return result;
}

/*
 * Exec Kernel system call: for obvious reasons only root may call it.
 *
 * This call breaks up into three pieces.
 * - A generic part which loads the new kernel from the current
 *   address space, and very carefully places the data in the
 *   allocated pages.
 *
 * - A generic part that interacts with the kernel and tells all of
 *   the devices to shut down, preventing on-going DMAs and placing
 *   the devices in a consistent state so a later kernel can
 *   reinitialize them.
 *
 * - A machine specific part that includes the syscall number
 *   and then copies the image to its final destination and
 *   jumps into the image at entry.
 *
 * kexec does not sync or unmount filesystems, so if you need
 * that to happen you need to do that yourself.
 */
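
/*
 * A minimal sketch of how user space drives this interface (normally
 * done by kexec-tools); the buffer, size, and addresses below are
 * purely hypothetical:
 *
 *	struct kexec_segment seg = {
 *		.buf   = image_buf,
 *		.bufsz = image_size,
 *		.mem   = (void *)0x100000,
 *		.memsz = image_size_rounded_to_page,
 *	};
 *	syscall(__NR_kexec_load, entry_addr, 1, &seg, KEXEC_ARCH_DEFAULT);
 */
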
struct kimage *kexec_image;
struct kimage *kexec_crash_image;
int kexec_load_disabled;

static DEFINE_MUTEX(kexec_mutex);

SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments,
		struct kexec_segment __user *, segments, unsigned long, flags)
{
	struct kimage **dest_image, *image;
	int result;

	/* We only trust the superuser with rebooting the system. */
	if (!capable(CAP_SYS_BOOT) || kexec_load_disabled)
		return -EPERM;

	/*
	 * Verify we have a legal set of flags
	 * This leaves us room for future extensions.
	 */
	if ((flags & KEXEC_FLAGS) != (flags & ~KEXEC_ARCH_MASK))
		return -EINVAL;

	/* Verify we are on the appropriate architecture */
	if (((flags & KEXEC_ARCH_MASK) != KEXEC_ARCH) &&
		((flags & KEXEC_ARCH_MASK) != KEXEC_ARCH_DEFAULT))
		return -EINVAL;

	/* Put an artificial cap on the number
	 * of segments passed to kexec_load.
	 */
	if (nr_segments > KEXEC_SEGMENT_MAX)
		return -EINVAL;

	image = NULL;
	result = 0;

	/* Because we write directly to the reserved memory
	 * region when loading crash kernels we need a mutex here to
	 * prevent multiple crash kernels from attempting to load
	 * simultaneously, and to prevent a crash kernel from loading
	 * over the top of an in-use crash kernel.
	 *
	 * KISS: always take the mutex.
	 */
	if (!mutex_trylock(&kexec_mutex))
		return -EBUSY;

	dest_image = &kexec_image;
	if (flags & KEXEC_ON_CRASH)
		dest_image = &kexec_crash_image;
	if (nr_segments > 0) {
		unsigned long i;

		/* Loading another kernel to reboot into */
		if ((flags & KEXEC_ON_CRASH) == 0)
			result = kimage_alloc_init(&image, entry, nr_segments,
						   segments, flags);
		/* Loading another kernel to switch to if this one crashes */
		else if (flags & KEXEC_ON_CRASH) {
			/* Free any current crash dump kernel before
			 * we corrupt it.
			 */
			kimage_free(xchg(&kexec_crash_image, NULL));
			result = kimage_alloc_init(&image, entry, nr_segments,
						   segments, flags);
			crash_map_reserved_pages();
		}
		if (result)
			goto out;

		if (flags & KEXEC_PRESERVE_CONTEXT)
			image->preserve_context = 1;
		result = machine_kexec_prepare(image);
		if (result)
			goto out;

		for (i = 0; i < nr_segments; i++) {
			result = kimage_load_segment(image, &image->segment[i]);
			if (result)
				goto out;
		}
		kimage_terminate(image);
		if (flags & KEXEC_ON_CRASH)
			crash_unmap_reserved_pages();
	}
	/* Install the new kernel and uninstall the old */
	image = xchg(dest_image, image);

out:
	mutex_unlock(&kexec_mutex);
	kimage_free(image);

	return result;
}

/*
 * Add and remove page tables for crashkernel memory
 *
 * Provide an empty default implementation here -- architecture
 * code may override this
 */
void __weak crash_map_reserved_pages(void)
{}

void __weak crash_unmap_reserved_pages(void)
{}

#ifdef CONFIG_COMPAT
COMPAT_SYSCALL_DEFINE4(kexec_load, compat_ulong_t, entry,
		       compat_ulong_t, nr_segments,
		       struct compat_kexec_segment __user *, segments,
		       compat_ulong_t, flags)
{
	struct compat_kexec_segment in;
	struct kexec_segment out, __user *ksegments;
	unsigned long i, result;

	/* Don't allow clients that don't understand the native
	 * architecture to do anything.
	 */
	if ((flags & KEXEC_ARCH_MASK) == KEXEC_ARCH_DEFAULT)
		return -EINVAL;

	if (nr_segments > KEXEC_SEGMENT_MAX)
		return -EINVAL;

	ksegments = compat_alloc_user_space(nr_segments * sizeof(out));
	for (i = 0; i < nr_segments; i++) {
		result = copy_from_user(&in, &segments[i], sizeof(in));
		if (result)
			return -EFAULT;

		out.buf   = compat_ptr(in.buf);
		out.bufsz = in.bufsz;
		out.mem   = in.mem;
		out.memsz = in.memsz;

		result = copy_to_user(&ksegments[i], &out, sizeof(out));
		if (result)
			return -EFAULT;
	}

	return sys_kexec_load(entry, nr_segments, ksegments, flags);
}
#endif

SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, initrd_fd,
		unsigned long, cmdline_len, const char __user *, cmdline_ptr,
		unsigned long, flags)
{
	int ret = 0, i;
	struct kimage **dest_image, *image;

	/* We only trust the superuser with rebooting the system. */
	if (!capable(CAP_SYS_BOOT) || kexec_load_disabled)
		return -EPERM;

	/* Make sure we have a legal set of flags */
	if (flags != (flags & KEXEC_FILE_FLAGS))
		return -EINVAL;

	image = NULL;

	if (!mutex_trylock(&kexec_mutex))
		return -EBUSY;

	dest_image = &kexec_image;
	if (flags & KEXEC_FILE_ON_CRASH)
		dest_image = &kexec_crash_image;

	if (flags & KEXEC_FILE_UNLOAD)
		goto exchange;

	/*
	 * In case of crash, the new kernel gets loaded in the reserved
	 * region. It is the same memory where an old crash kernel might be
	 * loaded. Free any current crash dump kernel before we corrupt it.
	 */
	if (flags & KEXEC_FILE_ON_CRASH)
		kimage_free(xchg(&kexec_crash_image, NULL));

	ret = kimage_file_alloc_init(&image, kernel_fd, initrd_fd, cmdline_ptr,
				     cmdline_len, flags);
	if (ret)
		goto out;

	ret = machine_kexec_prepare(image);
	if (ret)
		goto out;

	for (i = 0; i < image->nr_segments; i++) {
		struct kexec_segment *ksegment;

		ksegment = &image->segment[i];
		pr_debug("Loading segment %d: buf=0x%p bufsz=0x%zx mem=0x%lx memsz=0x%zx\n",
			 i, ksegment->buf, ksegment->bufsz, ksegment->mem,
			 ksegment->memsz);

		ret = kimage_load_segment(image, &image->segment[i]);
		if (ret)
			goto out;
	}

	kimage_terminate(image);

	/*
	 * Free up any temporary buffers allocated which are not needed
	 * after the image has been loaded.
	 */
	kimage_file_post_load_cleanup(image);
exchange:
	image = xchg(dest_image, image);
out:
	mutex_unlock(&kexec_mutex);
	kimage_free(image);
	return ret;
}

void crash_kexec(struct pt_regs *regs)
{
	/* Take the kexec_mutex here to prevent sys_kexec_load
	 * running on one cpu from replacing the crash kernel
	 * we are using after a panic on a different cpu.
	 *
	 * If the crash kernel was not located in a fixed area
	 * of memory the xchg(&kexec_crash_image) would be
	 * sufficient. But since I reuse the memory...
	 */
	if (mutex_trylock(&kexec_mutex)) {
		if (kexec_crash_image) {
			struct pt_regs fixed_regs;

			crash_setup_regs(&fixed_regs, regs);
			crash_save_vmcoreinfo();
			machine_crash_shutdown(&fixed_regs);
			machine_kexec(kexec_crash_image);
		}
		mutex_unlock(&kexec_mutex);
	}
}

size_t crash_get_memory_size(void)
{
	size_t size = 0;
	mutex_lock(&kexec_mutex);
	if (crashk_res.end != crashk_res.start)
		size = resource_size(&crashk_res);
	mutex_unlock(&kexec_mutex);
	return size;
}

void __weak crash_free_reserved_phys_range(unsigned long begin,
					   unsigned long end)
{
	unsigned long addr;

	for (addr = begin; addr < end; addr += PAGE_SIZE)
		free_reserved_page(pfn_to_page(addr >> PAGE_SHIFT));
}

int crash_shrink_memory(unsigned long new_size)
{
	int ret = 0;
	unsigned long start, end;
	unsigned long old_size;
	struct resource *ram_res;

	mutex_lock(&kexec_mutex);

	if (kexec_crash_image) {
		ret = -ENOENT;
		goto unlock;
	}
	start = crashk_res.start;
	end = crashk_res.end;
	old_size = (end == 0) ? 0 : end - start + 1;
	if (new_size >= old_size) {
		ret = (new_size == old_size) ? 0 : -EINVAL;
		goto unlock;
	}

	ram_res = kzalloc(sizeof(*ram_res), GFP_KERNEL);
	if (!ram_res) {
		ret = -ENOMEM;
		goto unlock;
	}

	start = roundup(start, KEXEC_CRASH_MEM_ALIGN);
	end = roundup(start + new_size, KEXEC_CRASH_MEM_ALIGN);

	crash_map_reserved_pages();
	crash_free_reserved_phys_range(end, crashk_res.end);

	if ((start == end) && (crashk_res.parent != NULL))
		release_resource(&crashk_res);

	ram_res->start = end;
	ram_res->end = crashk_res.end;
	ram_res->flags = IORESOURCE_BUSY | IORESOURCE_MEM;
	ram_res->name = "System RAM";

	crashk_res.end = end - 1;

	insert_resource(&iomem_resource, ram_res);
	crash_unmap_reserved_pages();

unlock:
	mutex_unlock(&kexec_mutex);
	return ret;
}

static u32 *append_elf_note(u32 *buf, char *name, unsigned type, void *data,
			    size_t data_len)
{
	struct elf_note note;

	note.n_namesz = strlen(name) + 1;
	note.n_descsz = data_len;
	note.n_type   = type;
	memcpy(buf, &note, sizeof(note));
	buf += (sizeof(note) + 3)/4;
	memcpy(buf, name, note.n_namesz);
	buf += (note.n_namesz + 3)/4;
	memcpy(buf, data, note.n_descsz);
	buf += (note.n_descsz + 3)/4;

	return buf;
}
1482static void final_note(u32 *buf)
1483{
1484 struct elf_note note;
1485
1486 note.n_namesz = 0;
1487 note.n_descsz = 0;
1488 note.n_type = 0;
1489 memcpy(buf, &note, sizeof(note));
1490}
1491
1492void crash_save_cpu(struct pt_regs *regs, int cpu)
1493{
1494 struct elf_prstatus prstatus;
1495 u32 *buf;
1496
4f4b6c1a 1497 if ((cpu < 0) || (cpu >= nr_cpu_ids))
85916f81
MD
1498 return;
1499
1500 /* Using ELF notes here is opportunistic.
1501 * I need a well defined structure format
1502 * for the data I pass, and I need tags
1503 * on the data to indicate what information I have
1504 * squirrelled away. ELF notes happen to provide
1505 * all of that, so there is no need to invent something new.
1506 */
e1bebcf4 1507 buf = (u32 *)per_cpu_ptr(crash_notes, cpu);
85916f81
MD
1508 if (!buf)
1509 return;
1510 memset(&prstatus, 0, sizeof(prstatus));
1511 prstatus.pr_pid = current->pid;
6cd61c0b 1512 elf_core_copy_kernel_regs(&prstatus.pr_reg, regs);
6672f76a 1513 buf = append_elf_note(buf, KEXEC_CORE_NOTE_NAME, NT_PRSTATUS,
e1bebcf4 1514 &prstatus, sizeof(prstatus));
85916f81
MD
1515 final_note(buf);
1516}
1517
cc571658
VG
1518static int __init crash_notes_memory_init(void)
1519{
1520 /* Allocate memory for saving cpu registers. */
1521 crash_notes = alloc_percpu(note_buf_t);
1522 if (!crash_notes) {
e1bebcf4 1523 pr_warn("Kexec: Memory allocation for saving cpu register states failed\n");
cc571658
VG
1524 return -ENOMEM;
1525 }
1526 return 0;
1527}
c96d6660 1528subsys_initcall(crash_notes_memory_init);
fd59d231 1529
cba63c30
BW
1530
1531/*
1532 * parsing the "crashkernel" commandline
1533 *
1534 * this code is intended to be called from architecture specific code
1535 */
1536
1537
1538/*
1539 * This function parses command lines in the format
1540 *
1541 * crashkernel=ramsize-range:size[,...][@offset]
1542 *
1543 * The function returns 0 on success and -EINVAL on failure.
1544 */
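/*
 * For example (values purely illustrative):
 *
 *	crashkernel=512M-2G:64M,2G-:128M@16M
 *
 * reserves 64M when total RAM is in [512M, 2G), 128M when it is 2G or
 * more, and requests a base address of 16M.
 */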
static int __init parse_crashkernel_mem(char *cmdline,
					unsigned long long system_ram,
					unsigned long long *crash_size,
					unsigned long long *crash_base)
{
	char *cur = cmdline, *tmp;

	/* for each entry of the comma-separated list */
	do {
		unsigned long long start, end = ULLONG_MAX, size;

		/* get the start of the range */
		start = memparse(cur, &tmp);
		if (cur == tmp) {
			pr_warn("crashkernel: Memory value expected\n");
			return -EINVAL;
		}
		cur = tmp;
		if (*cur != '-') {
			pr_warn("crashkernel: '-' expected\n");
			return -EINVAL;
		}
		cur++;

		/* if no ':' is here, then we read the end */
		if (*cur != ':') {
			end = memparse(cur, &tmp);
			if (cur == tmp) {
				pr_warn("crashkernel: Memory value expected\n");
				return -EINVAL;
			}
			cur = tmp;
			if (end <= start) {
				pr_warn("crashkernel: end <= start\n");
				return -EINVAL;
			}
		}

		if (*cur != ':') {
			pr_warn("crashkernel: ':' expected\n");
			return -EINVAL;
		}
		cur++;

		size = memparse(cur, &tmp);
		if (cur == tmp) {
			pr_warn("Memory value expected\n");
			return -EINVAL;
		}
		cur = tmp;
		if (size >= system_ram) {
			pr_warn("crashkernel: invalid size\n");
			return -EINVAL;
		}

		/* match ? */
		if (system_ram >= start && system_ram < end) {
			*crash_size = size;
			break;
		}
	} while (*cur++ == ',');

	if (*crash_size > 0) {
		while (*cur && *cur != ' ' && *cur != '@')
			cur++;
		if (*cur == '@') {
			cur++;
			*crash_base = memparse(cur, &tmp);
			if (cur == tmp) {
				pr_warn("Memory value expected after '@'\n");
				return -EINVAL;
			}
		}
	}

	return 0;
}

/*
 * This function parses "simple" (old) crashkernel command lines like
 *
 *	crashkernel=size[@offset]
 *
 * It returns 0 on success and -EINVAL on failure.
 */
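/*
 * For example (illustrative), "crashkernel=128M@16M" asks for a 128M
 * reservation starting at physical address 16M.
 */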
static int __init parse_crashkernel_simple(char *cmdline,
					   unsigned long long *crash_size,
					   unsigned long long *crash_base)
{
	char *cur = cmdline;

	*crash_size = memparse(cmdline, &cur);
	if (cmdline == cur) {
		pr_warn("crashkernel: memory value expected\n");
		return -EINVAL;
	}

	if (*cur == '@')
		*crash_base = memparse(cur+1, &cur);
	else if (*cur != ' ' && *cur != '\0') {
		pr_warn("crashkernel: unrecognized char\n");
		return -EINVAL;
	}

	return 0;
}

#define SUFFIX_HIGH 0
#define SUFFIX_LOW  1
#define SUFFIX_NULL 2
static __initdata char *suffix_tbl[] = {
	[SUFFIX_HIGH] = ",high",
	[SUFFIX_LOW]  = ",low",
	[SUFFIX_NULL] = NULL,
};

/*
 * This function parses "suffix" crashkernel command lines like
 *
 *	crashkernel=size,[high|low]
 *
 * It returns 0 on success and -EINVAL on failure.
 */
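/*
 * For example (illustrative), "crashkernel=256M,high" asks for a 256M
 * reservation; only the size and the ",high"/",low" suffix are parsed
 * here, the actual placement is up to the architecture's reservation code.
 */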
adbc742b
YL
1668static int __init parse_crashkernel_suffix(char *cmdline,
1669 unsigned long long *crash_size,
1670 unsigned long long *crash_base,
1671 const char *suffix)
1672{
1673 char *cur = cmdline;
1674
1675 *crash_size = memparse(cmdline, &cur);
1676 if (cmdline == cur) {
1677 pr_warn("crashkernel: memory value expected\n");
1678 return -EINVAL;
1679 }
1680
1681 /* check with suffix */
1682 if (strncmp(cur, suffix, strlen(suffix))) {
1683 pr_warn("crashkernel: unrecognized char\n");
1684 return -EINVAL;
1685 }
1686 cur += strlen(suffix);
1687 if (*cur != ' ' && *cur != '\0') {
1688 pr_warn("crashkernel: unrecognized char\n");
1689 return -EINVAL;
1690 }
1691
1692 return 0;
1693}
1694
1695static __init char *get_last_crashkernel(char *cmdline,
1696 const char *name,
1697 const char *suffix)
1698{
1699 char *p = cmdline, *ck_cmdline = NULL;
1700
1701 /* find crashkernel and use the last one if there are more */
1702 p = strstr(p, name);
1703 while (p) {
1704 char *end_p = strchr(p, ' ');
1705 char *q;
1706
1707 if (!end_p)
1708 end_p = p + strlen(p);
1709
1710 if (!suffix) {
1711 int i;
1712
1713 /* skip the one with any known suffix */
1714 for (i = 0; suffix_tbl[i]; i++) {
1715 q = end_p - strlen(suffix_tbl[i]);
1716 if (!strncmp(q, suffix_tbl[i],
1717 strlen(suffix_tbl[i])))
1718 goto next;
1719 }
1720 ck_cmdline = p;
1721 } else {
1722 q = end_p - strlen(suffix);
1723 if (!strncmp(q, suffix, strlen(suffix)))
1724 ck_cmdline = p;
1725 }
1726next:
1727 p = strstr(p+1, name);
1728 }
1729
1730 if (!ck_cmdline)
1731 return NULL;
1732
1733 return ck_cmdline;
1734}
1735
0212f915 1736static int __init __parse_crashkernel(char *cmdline,
cba63c30
BW
1737 unsigned long long system_ram,
1738 unsigned long long *crash_size,
0212f915 1739 unsigned long long *crash_base,
adbc742b
YL
1740 const char *name,
1741 const char *suffix)
cba63c30 1742{
cba63c30 1743 char *first_colon, *first_space;
adbc742b 1744 char *ck_cmdline;
cba63c30
BW
1745
1746 BUG_ON(!crash_size || !crash_base);
1747 *crash_size = 0;
1748 *crash_base = 0;
1749
adbc742b 1750 ck_cmdline = get_last_crashkernel(cmdline, name, suffix);
cba63c30
BW
1751
1752 if (!ck_cmdline)
1753 return -EINVAL;
1754
0212f915 1755 ck_cmdline += strlen(name);
cba63c30 1756
adbc742b
YL
1757 if (suffix)
1758 return parse_crashkernel_suffix(ck_cmdline, crash_size,
1759 crash_base, suffix);
cba63c30
BW
1760 /*
1761 * if the commandline contains a ':', then that's the extended
1762 * syntax -- if not, it must be the classic syntax
1763 */
1764 first_colon = strchr(ck_cmdline, ':');
1765 first_space = strchr(ck_cmdline, ' ');
1766 if (first_colon && (!first_space || first_colon < first_space))
1767 return parse_crashkernel_mem(ck_cmdline, system_ram,
1768 crash_size, crash_base);
cba63c30 1769
80c74f6a 1770 return parse_crashkernel_simple(ck_cmdline, crash_size, crash_base);
cba63c30
BW
1771}
1772
adbc742b
YL
1773/*
1774 * That function is the entry point for command line parsing and should be
1775 * called from the arch-specific code.
1776 */
0212f915
YL
1777int __init parse_crashkernel(char *cmdline,
1778 unsigned long long system_ram,
1779 unsigned long long *crash_size,
1780 unsigned long long *crash_base)
1781{
1782 return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base,
adbc742b 1783 "crashkernel=", NULL);
0212f915 1784}
55a20ee7
YL
1785
1786int __init parse_crashkernel_high(char *cmdline,
1787 unsigned long long system_ram,
1788 unsigned long long *crash_size,
1789 unsigned long long *crash_base)
1790{
1791 return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base,
adbc742b 1792 "crashkernel=", suffix_tbl[SUFFIX_HIGH]);
55a20ee7 1793}
0212f915
YL
1794
1795int __init parse_crashkernel_low(char *cmdline,
1796 unsigned long long system_ram,
1797 unsigned long long *crash_size,
1798 unsigned long long *crash_base)
1799{
1800 return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base,
adbc742b 1801 "crashkernel=", suffix_tbl[SUFFIX_LOW]);
0212f915 1802}
cba63c30 1803
fa8ff292 1804static void update_vmcoreinfo_note(void)
fd59d231 1805{
fa8ff292 1806 u32 *buf = vmcoreinfo_note;
fd59d231
KO
1807
1808 if (!vmcoreinfo_size)
1809 return;
fd59d231
KO
1810 buf = append_elf_note(buf, VMCOREINFO_NOTE_NAME, 0, vmcoreinfo_data,
1811 vmcoreinfo_size);
fd59d231
KO
1812 final_note(buf);
1813}
1814
fa8ff292
MH
1815void crash_save_vmcoreinfo(void)
1816{
63dca8d5 1817 vmcoreinfo_append_str("CRASHTIME=%ld\n", get_seconds());
fa8ff292
MH
1818 update_vmcoreinfo_note();
1819}
1820
fd59d231
KO
1821void vmcoreinfo_append_str(const char *fmt, ...)
1822{
1823 va_list args;
1824 char buf[0x50];
310faaa9 1825 size_t r;
fd59d231
KO
1826
1827 va_start(args, fmt);
a19428e5 1828 r = vscnprintf(buf, sizeof(buf), fmt, args);
fd59d231
KO
1829 va_end(args);
1830
31c3a3fe 1831 r = min(r, vmcoreinfo_max_size - vmcoreinfo_size);
fd59d231
KO
1832
1833 memcpy(&vmcoreinfo_data[vmcoreinfo_size], buf, r);
1834
1835 vmcoreinfo_size += r;
1836}
1837
1838/*
1839 * provide an empty default implementation here -- architecture
1840 * code may override this
1841 */
52f5684c 1842void __weak arch_crash_save_vmcoreinfo(void)
fd59d231
KO
1843{}
1844
52f5684c 1845unsigned long __weak paddr_vmcoreinfo_note(void)
fd59d231
KO
1846{
1847 return __pa((unsigned long)(char *)&vmcoreinfo_note);
1848}
1849
1850static int __init crash_save_vmcoreinfo_init(void)
1851{
bba1f603
KO
1852 VMCOREINFO_OSRELEASE(init_uts_ns.name.release);
1853 VMCOREINFO_PAGESIZE(PAGE_SIZE);
fd59d231 1854
bcbba6c1
KO
1855 VMCOREINFO_SYMBOL(init_uts_ns);
1856 VMCOREINFO_SYMBOL(node_online_map);
d034cfab 1857#ifdef CONFIG_MMU
bcbba6c1 1858 VMCOREINFO_SYMBOL(swapper_pg_dir);
d034cfab 1859#endif
bcbba6c1 1860 VMCOREINFO_SYMBOL(_stext);
f1c4069e 1861 VMCOREINFO_SYMBOL(vmap_area_list);
fd59d231
KO
1862
1863#ifndef CONFIG_NEED_MULTIPLE_NODES
bcbba6c1
KO
1864 VMCOREINFO_SYMBOL(mem_map);
1865 VMCOREINFO_SYMBOL(contig_page_data);
fd59d231
KO
1866#endif
1867#ifdef CONFIG_SPARSEMEM
bcbba6c1
KO
1868 VMCOREINFO_SYMBOL(mem_section);
1869 VMCOREINFO_LENGTH(mem_section, NR_SECTION_ROOTS);
c76f860c 1870 VMCOREINFO_STRUCT_SIZE(mem_section);
bcbba6c1 1871 VMCOREINFO_OFFSET(mem_section, section_mem_map);
fd59d231 1872#endif
c76f860c
KO
1873 VMCOREINFO_STRUCT_SIZE(page);
1874 VMCOREINFO_STRUCT_SIZE(pglist_data);
1875 VMCOREINFO_STRUCT_SIZE(zone);
1876 VMCOREINFO_STRUCT_SIZE(free_area);
1877 VMCOREINFO_STRUCT_SIZE(list_head);
1878 VMCOREINFO_SIZE(nodemask_t);
bcbba6c1
KO
1879 VMCOREINFO_OFFSET(page, flags);
1880 VMCOREINFO_OFFSET(page, _count);
1881 VMCOREINFO_OFFSET(page, mapping);
1882 VMCOREINFO_OFFSET(page, lru);
8d67091e
AK
1883 VMCOREINFO_OFFSET(page, _mapcount);
1884 VMCOREINFO_OFFSET(page, private);
bcbba6c1
KO
1885 VMCOREINFO_OFFSET(pglist_data, node_zones);
1886 VMCOREINFO_OFFSET(pglist_data, nr_zones);
fd59d231 1887#ifdef CONFIG_FLAT_NODE_MEM_MAP
bcbba6c1 1888 VMCOREINFO_OFFSET(pglist_data, node_mem_map);
fd59d231 1889#endif
bcbba6c1
KO
1890 VMCOREINFO_OFFSET(pglist_data, node_start_pfn);
1891 VMCOREINFO_OFFSET(pglist_data, node_spanned_pages);
1892 VMCOREINFO_OFFSET(pglist_data, node_id);
1893 VMCOREINFO_OFFSET(zone, free_area);
1894 VMCOREINFO_OFFSET(zone, vm_stat);
1895 VMCOREINFO_OFFSET(zone, spanned_pages);
1896 VMCOREINFO_OFFSET(free_area, free_list);
1897 VMCOREINFO_OFFSET(list_head, next);
1898 VMCOREINFO_OFFSET(list_head, prev);
13ba3fcb
AK
1899 VMCOREINFO_OFFSET(vmap_area, va_start);
1900 VMCOREINFO_OFFSET(vmap_area, list);
bcbba6c1 1901 VMCOREINFO_LENGTH(zone.free_area, MAX_ORDER);
04d491ab 1902 log_buf_kexec_setup();
83a08e7c 1903 VMCOREINFO_LENGTH(free_area.free_list, MIGRATE_TYPES);
bcbba6c1 1904 VMCOREINFO_NUMBER(NR_FREE_PAGES);
122c7a59
KO
1905 VMCOREINFO_NUMBER(PG_lru);
1906 VMCOREINFO_NUMBER(PG_private);
1907 VMCOREINFO_NUMBER(PG_swapcache);
8d67091e 1908 VMCOREINFO_NUMBER(PG_slab);
0d0bf667
MT
1909#ifdef CONFIG_MEMORY_FAILURE
1910 VMCOREINFO_NUMBER(PG_hwpoison);
1911#endif
b3acc56b 1912 VMCOREINFO_NUMBER(PG_head_mask);
8d67091e 1913 VMCOREINFO_NUMBER(PAGE_BUDDY_MAPCOUNT_VALUE);
3a1122d2 1914#ifdef CONFIG_HUGETLBFS
8f1d26d0 1915 VMCOREINFO_SYMBOL(free_huge_page);
3a1122d2 1916#endif
fd59d231
KO
1917
1918 arch_crash_save_vmcoreinfo();
fa8ff292 1919 update_vmcoreinfo_note();
fd59d231
KO
1920
1921 return 0;
1922}
1923
c96d6660 1924subsys_initcall(crash_save_vmcoreinfo_init);
3ab83521 1925
cb105258
VG
1926static int __kexec_add_segment(struct kimage *image, char *buf,
1927 unsigned long bufsz, unsigned long mem,
1928 unsigned long memsz)
1929{
1930 struct kexec_segment *ksegment;
1931
1932 ksegment = &image->segment[image->nr_segments];
1933 ksegment->kbuf = buf;
1934 ksegment->bufsz = bufsz;
1935 ksegment->mem = mem;
1936 ksegment->memsz = memsz;
1937 image->nr_segments++;
1938
1939 return 0;
1940}
1941
1942static int locate_mem_hole_top_down(unsigned long start, unsigned long end,
1943 struct kexec_buf *kbuf)
1944{
1945 struct kimage *image = kbuf->image;
1946 unsigned long temp_start, temp_end;
1947
1948 temp_end = min(end, kbuf->buf_max);
1949 temp_start = temp_end - kbuf->memsz;
1950
1951 do {
1952 /* align down start */
1953 temp_start = temp_start & (~(kbuf->buf_align - 1));
1954
1955 if (temp_start < start || temp_start < kbuf->buf_min)
1956 return 0;
1957
1958 temp_end = temp_start + kbuf->memsz - 1;
1959
1960 /*
1961 * Make sure this does not conflict with any existing
1962 * segments
1963 */
1964 if (kimage_is_destination_range(image, temp_start, temp_end)) {
1965 temp_start = temp_start - PAGE_SIZE;
1966 continue;
1967 }
1968
1969 /* We found a suitable memory range */
1970 break;
1971 } while (1);
1972
1973 /* If we are here, we found a suitable memory range */
1974 __kexec_add_segment(image, kbuf->buffer, kbuf->bufsz, temp_start,
1975 kbuf->memsz);
1976
1977 /* Success, stop navigating through remaining System RAM ranges */
1978 return 1;
1979}
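Both hole-finding loops rely on power-of-two alignments: the top-down variant above aligns the candidate start downward by masking with ~(align - 1), while the bottom-up variant below rounds upward via ALIGN(). A standalone restatement of that arithmetic (the values in the comments are arbitrary):

/* Only meaningful when align is a power of two. */
static unsigned long align_down(unsigned long addr, unsigned long align)
{
	return addr & ~(align - 1);		/* 0x12345 -> 0x12000 for align 0x1000 */
}

static unsigned long align_up(unsigned long addr, unsigned long align)
{
	return (addr + align - 1) & ~(align - 1);	/* what ALIGN() does: 0x12345 -> 0x13000 */
}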
1980
1981static int locate_mem_hole_bottom_up(unsigned long start, unsigned long end,
1982 struct kexec_buf *kbuf)
1983{
1984 struct kimage *image = kbuf->image;
1985 unsigned long temp_start, temp_end;
1986
1987 temp_start = max(start, kbuf->buf_min);
1988
1989 do {
1990 temp_start = ALIGN(temp_start, kbuf->buf_align);
1991 temp_end = temp_start + kbuf->memsz - 1;
1992
1993 if (temp_end > end || temp_end > kbuf->buf_max)
1994 return 0;
1995 /*
1996 * Make sure this does not conflict with any existing
1997 * segments
1998 */
1999 if (kimage_is_destination_range(image, temp_start, temp_end)) {
2000 temp_start = temp_start + PAGE_SIZE;
2001 continue;
2002 }
2003
2004 /* We found a suitable memory range */
2005 break;
2006 } while (1);
2007
2008 /* If we are here, we found a suitable memory range */
2009 __kexec_add_segment(image, kbuf->buffer, kbuf->bufsz, temp_start,
2010 kbuf->memsz);
2011
2012 /* Success, stop navigating through remaining System RAM ranges */
2013 return 1;
2014}
2015
2016static int locate_mem_hole_callback(u64 start, u64 end, void *arg)
2017{
2018 struct kexec_buf *kbuf = (struct kexec_buf *)arg;
2019 unsigned long sz = end - start + 1;
2020
2021 /* Returning 0 moves us on to the next memory range */
2022 if (sz < kbuf->memsz)
2023 return 0;
2024
2025 if (end < kbuf->buf_min || start > kbuf->buf_max)
2026 return 0;
2027
2028 /*
2029 * Allocate memory top down within the RAM range; otherwise allocate
2030 * bottom up.
2031 */
2032 if (kbuf->top_down)
2033 return locate_mem_hole_top_down(start, end, kbuf);
2034 return locate_mem_hole_bottom_up(start, end, kbuf);
2035}
2036
2037/*
2038 * Helper function for placing a buffer in a kexec segment. This assumes
2039 * that kexec_mutex is held.
2040 */
2041int kexec_add_buffer(struct kimage *image, char *buffer, unsigned long bufsz,
2042 unsigned long memsz, unsigned long buf_align,
2043 unsigned long buf_min, unsigned long buf_max,
2044 bool top_down, unsigned long *load_addr)
2045{
2046
2047 struct kexec_segment *ksegment;
2048 struct kexec_buf buf, *kbuf;
2049 int ret;
2050
2051 /* Currently, adding a segment this way is allowed only in file mode */
2052 if (!image->file_mode)
2053 return -EINVAL;
2054
2055 if (image->nr_segments >= KEXEC_SEGMENT_MAX)
2056 return -EINVAL;
2057
2058 /*
2059 * Make sure we are not trying to add a buffer after allocating
2060 * control pages. All segments need to be placed before any
2061 * control pages are allocated, as the control page allocation
2062 * logic walks the list of segments to ensure there are no
2063 * destination overlaps.
2064 */
2065 if (!list_empty(&image->control_pages)) {
2066 WARN_ON(1);
2067 return -EINVAL;
2068 }
2069
2070 memset(&buf, 0, sizeof(struct kexec_buf));
2071 kbuf = &buf;
2072 kbuf->image = image;
2073 kbuf->buffer = buffer;
2074 kbuf->bufsz = bufsz;
2075
2076 kbuf->memsz = ALIGN(memsz, PAGE_SIZE);
2077 kbuf->buf_align = max(buf_align, PAGE_SIZE);
2078 kbuf->buf_min = buf_min;
2079 kbuf->buf_max = buf_max;
2080 kbuf->top_down = top_down;
2081
2082 /* Walk the RAM ranges and allocate a suitable range for the buffer */
2083 ret = walk_system_ram_res(0, -1, kbuf, locate_mem_hole_callback);
2084 if (ret != 1) {
2085 /* A suitable memory range could not be found for the buffer */
2086 return -EADDRNOTAVAIL;
2087 }
2088
2089 /* Found a suitable memory range */
2090 ksegment = &image->segment[image->nr_segments - 1];
2091 *load_addr = ksegment->mem;
2092 return 0;
2093}
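A hedged sketch of a caller: a kexec_file_load() image loader placing an in-memory blob with this helper (the blob, the limits and the debug message are illustrative, not taken from any real loader):

static int place_blob_sketch(struct kimage *image, char *blob,
			     unsigned long blob_len)
{
	unsigned long load_addr;
	int ret;

	/* page-aligned, anywhere in RAM, searching the ranges top down */
	ret = kexec_add_buffer(image, blob, blob_len, blob_len,
			       PAGE_SIZE, 0, ULONG_MAX, true, &load_addr);
	if (ret)
		return ret;

	pr_debug("blob placed at 0x%lx\n", load_addr);
	return 0;
}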
2094
2095
7ade3fcc
HY
2096/*
2097 * Move into place and start executing a preloaded standalone
2098 * executable. If nothing was preloaded, return an error.
3ab83521
HY
2099 */
2100int kernel_kexec(void)
2101{
2102 int error = 0;
2103
8c5a1cf0 2104 if (!mutex_trylock(&kexec_mutex))
3ab83521
HY
2105 return -EBUSY;
2106 if (!kexec_image) {
2107 error = -EINVAL;
2108 goto Unlock;
2109 }
2110
3ab83521 2111#ifdef CONFIG_KEXEC_JUMP
7ade3fcc 2112 if (kexec_image->preserve_context) {
bcda53fa 2113 lock_system_sleep();
89081d17
HY
2114 pm_prepare_console();
2115 error = freeze_processes();
2116 if (error) {
2117 error = -EBUSY;
2118 goto Restore_console;
2119 }
2120 suspend_console();
d1616302 2121 error = dpm_suspend_start(PMSG_FREEZE);
89081d17
HY
2122 if (error)
2123 goto Resume_console;
d1616302 2124 /* At this point, dpm_suspend_start() has been called,
cf579dfb
RW
2125 * but *not* dpm_suspend_end(). We *must* call
2126 * dpm_suspend_end() now. Otherwise, drivers for
89081d17
HY
2127 * some devices (e.g. interrupt controllers) become
2128 * desynchronized with the actual state of the
2129 * hardware at resume time, and evil weirdness ensues.
2130 */
cf579dfb 2131 error = dpm_suspend_end(PMSG_FREEZE);
89081d17 2132 if (error)
749b0afc
RW
2133 goto Resume_devices;
2134 error = disable_nonboot_cpus();
2135 if (error)
2136 goto Enable_cpus;
2ed8d2b3 2137 local_irq_disable();
2e711c04 2138 error = syscore_suspend();
770824bd 2139 if (error)
749b0afc 2140 goto Enable_irqs;
7ade3fcc 2141 } else
3ab83521 2142#endif
7ade3fcc 2143 {
4fc9bbf9 2144 kexec_in_progress = true;
ca195b7f 2145 kernel_restart_prepare(NULL);
c97102ba 2146 migrate_to_reboot_cpu();
011e4b02
SB
2147
2148 /*
2149 * migrate_to_reboot_cpu() disables CPU hotplug assuming that
2150 * no further code needs to use CPU hotplug (which is true in
2151 * the reboot case). However, the kexec path depends on using
2152 * CPU hotplug again; so re-enable it here.
2153 */
2154 cpu_hotplug_enable();
e1bebcf4 2155 pr_emerg("Starting new kernel\n");
3ab83521
HY
2156 machine_shutdown();
2157 }
2158
2159 machine_kexec(kexec_image);
2160
3ab83521 2161#ifdef CONFIG_KEXEC_JUMP
7ade3fcc 2162 if (kexec_image->preserve_context) {
19234c08 2163 syscore_resume();
749b0afc 2164 Enable_irqs:
3ab83521 2165 local_irq_enable();
749b0afc 2166 Enable_cpus:
89081d17 2167 enable_nonboot_cpus();
cf579dfb 2168 dpm_resume_start(PMSG_RESTORE);
89081d17 2169 Resume_devices:
d1616302 2170 dpm_resume_end(PMSG_RESTORE);
89081d17
HY
2171 Resume_console:
2172 resume_console();
2173 thaw_processes();
2174 Restore_console:
2175 pm_restore_console();
bcda53fa 2176 unlock_system_sleep();
3ab83521 2177 }
7ade3fcc 2178#endif
3ab83521
HY
2179
2180 Unlock:
8c5a1cf0 2181 mutex_unlock(&kexec_mutex);
3ab83521
HY
2182 return error;
2183}
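For context, kernel_kexec() is normally reached from the reboot(2) syscall once an image has been loaded; a minimal userspace trigger looks like this (it assumes something like `kexec -l` already loaded the image):

#include <unistd.h>
#include <sys/syscall.h>
#include <linux/reboot.h>

int main(void)
{
	/* The fourth argument is unused for this command. */
	return syscall(SYS_reboot, LINUX_REBOOT_MAGIC1, LINUX_REBOOT_MAGIC2,
		       LINUX_REBOOT_CMD_KEXEC, NULL);
}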