Commit | Line | Data |
---|---|---|
5234f5eb EB |
1 | /* |
2 | * machine_kexec.c - handle transition of Linux booting another kernel | |
3 | * Copyright (C) 2002-2005 Eric Biederman <ebiederm@xmission.com> | |
4 | * | |
5 | * This source code is licensed under the GNU General Public License, | |
6 | * Version 2. See the file COPYING for more details. | |
7 | */ | |
8 | ||
9 | #include <linux/mm.h> | |
10 | #include <linux/kexec.h> | |
11 | #include <linux/delay.h> | |
12 | #include <linux/string.h> | |
13 | #include <linux/reboot.h> | |
14 | #include <asm/pda.h> | |
15 | #include <asm/pgtable.h> | |
16 | #include <asm/pgalloc.h> | |
17 | #include <asm/tlbflush.h> | |
18 | #include <asm/mmu_context.h> | |
19 | #include <asm/io.h> | |
20 | #include <asm/apic.h> | |
21 | #include <asm/cpufeature.h> | |
22 | #include <asm/hw_irq.h> | |
23 | ||
24 | #define LEVEL0_SIZE (1UL << 12UL) | |
25 | #define LEVEL1_SIZE (1UL << 21UL) | |
26 | #define LEVEL2_SIZE (1UL << 30UL) | |
27 | #define LEVEL3_SIZE (1UL << 39UL) | |
28 | #define LEVEL4_SIZE (1UL << 48UL) | |
29 | ||
30 | #define L0_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY) | |
31 | #define L1_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_PSE) | |
32 | #define L2_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY) | |
33 | #define L3_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY) | |
34 | ||
72414d3f | 35 | static void init_level2_page(u64 *level2p, unsigned long addr) |
5234f5eb EB |
36 | { |
37 | unsigned long end_addr; | |
72414d3f | 38 | |
5234f5eb EB |
39 | addr &= PAGE_MASK; |
40 | end_addr = addr + LEVEL2_SIZE; | |
72414d3f | 41 | while (addr < end_addr) { |
5234f5eb EB |
42 | *(level2p++) = addr | L1_ATTR; |
43 | addr += LEVEL1_SIZE; | |
44 | } | |
45 | } | |
46 | ||
72414d3f MS |
47 | static int init_level3_page(struct kimage *image, u64 *level3p, |
48 | unsigned long addr, unsigned long last_addr) | |
5234f5eb EB |
49 | { |
50 | unsigned long end_addr; | |
51 | int result; | |
72414d3f | 52 | |
5234f5eb EB |
53 | result = 0; |
54 | addr &= PAGE_MASK; | |
55 | end_addr = addr + LEVEL3_SIZE; | |
72414d3f | 56 | while ((addr < last_addr) && (addr < end_addr)) { |
5234f5eb EB |
57 | struct page *page; |
58 | u64 *level2p; | |
72414d3f | 59 | |
5234f5eb EB |
60 | page = kimage_alloc_control_pages(image, 0); |
61 | if (!page) { | |
62 | result = -ENOMEM; | |
63 | goto out; | |
64 | } | |
65 | level2p = (u64 *)page_address(page); | |
66 | init_level2_page(level2p, addr); | |
67 | *(level3p++) = __pa(level2p) | L2_ATTR; | |
68 | addr += LEVEL2_SIZE; | |
69 | } | |
70 | /* clear the unused entries */ | |
72414d3f | 71 | while (addr < end_addr) { |
5234f5eb EB |
72 | *(level3p++) = 0; |
73 | addr += LEVEL2_SIZE; | |
74 | } | |
75 | out: | |
76 | return result; | |
77 | } | |
78 | ||
79 | ||
72414d3f MS |
80 | static int init_level4_page(struct kimage *image, u64 *level4p, |
81 | unsigned long addr, unsigned long last_addr) | |
5234f5eb EB |
82 | { |
83 | unsigned long end_addr; | |
84 | int result; | |
72414d3f | 85 | |
5234f5eb EB |
86 | result = 0; |
87 | addr &= PAGE_MASK; | |
88 | end_addr = addr + LEVEL4_SIZE; | |
72414d3f | 89 | while ((addr < last_addr) && (addr < end_addr)) { |
5234f5eb EB |
90 | struct page *page; |
91 | u64 *level3p; | |
72414d3f | 92 | |
5234f5eb EB |
93 | page = kimage_alloc_control_pages(image, 0); |
94 | if (!page) { | |
95 | result = -ENOMEM; | |
96 | goto out; | |
97 | } | |
98 | level3p = (u64 *)page_address(page); | |
99 | result = init_level3_page(image, level3p, addr, last_addr); | |
100 | if (result) { | |
101 | goto out; | |
102 | } | |
103 | *(level4p++) = __pa(level3p) | L3_ATTR; | |
104 | addr += LEVEL3_SIZE; | |
105 | } | |
106 | /* clear the unused entries */ | |
72414d3f | 107 | while (addr < end_addr) { |
5234f5eb EB |
108 | *(level4p++) = 0; |
109 | addr += LEVEL3_SIZE; | |
110 | } | |
72414d3f | 111 | out: |
5234f5eb EB |
112 | return result; |
113 | } | |
114 | ||
115 | ||
116 | static int init_pgtable(struct kimage *image, unsigned long start_pgtable) | |
117 | { | |
118 | u64 *level4p; | |
119 | level4p = (u64 *)__va(start_pgtable); | |
72414d3f | 120 | return init_level4_page(image, level4p, 0, end_pfn << PAGE_SHIFT); |
5234f5eb EB |
121 | } |
122 | ||
123 | static void set_idt(void *newidt, u16 limit) | |
124 | { | |
36c4fd23 | 125 | struct desc_ptr curidt; |
5234f5eb EB |
126 | |
127 | /* x86-64 supports unaliged loads & stores */ | |
36c4fd23 EB |
128 | curidt.size = limit; |
129 | curidt.address = (unsigned long)newidt; | |
5234f5eb EB |
130 | |
131 | __asm__ __volatile__ ( | |
36c4fd23 EB |
132 | "lidtq %0\n" |
133 | : : "m" (curidt) | |
5234f5eb EB |
134 | ); |
135 | }; | |
136 | ||
137 | ||
138 | static void set_gdt(void *newgdt, u16 limit) | |
139 | { | |
36c4fd23 | 140 | struct desc_ptr curgdt; |
5234f5eb EB |
141 | |
142 | /* x86-64 supports unaligned loads & stores */ | |
36c4fd23 EB |
143 | curgdt.size = limit; |
144 | curgdt.address = (unsigned long)newgdt; | |
5234f5eb EB |
145 | |
146 | __asm__ __volatile__ ( | |
36c4fd23 EB |
147 | "lgdtq %0\n" |
148 | : : "m" (curgdt) | |
5234f5eb EB |
149 | ); |
150 | }; | |
151 | ||
/*
 * Reload every data segment register with __KERNEL_DS.  This refreshes
 * the CPU's hidden segment-descriptor state so that the descriptor
 * table itself is no longer needed — the caller (machine_kexec) zaps
 * the GDT immediately afterwards.
 */
static void load_segments(void)
{
	__asm__ __volatile__ (
		"\tmovl %0,%%ds\n"
		"\tmovl %0,%%es\n"
		"\tmovl %0,%%ss\n"
		"\tmovl %0,%%fs\n"
		"\tmovl %0,%%gs\n"
		: : "a" (__KERNEL_DS)
		);
}
163 | ||
72414d3f MS |
164 | typedef NORET_TYPE void (*relocate_new_kernel_t)(unsigned long indirection_page, |
165 | unsigned long control_code_buffer, | |
166 | unsigned long start_address, | |
167 | unsigned long pgtable) ATTRIB_NORET; | |
5234f5eb EB |
168 | |
169 | const extern unsigned char relocate_new_kernel[]; | |
170 | const extern unsigned long relocate_new_kernel_size; | |
171 | ||
172 | int machine_kexec_prepare(struct kimage *image) | |
173 | { | |
174 | unsigned long start_pgtable, control_code_buffer; | |
175 | int result; | |
176 | ||
177 | /* Calculate the offsets */ | |
72414d3f | 178 | start_pgtable = page_to_pfn(image->control_code_page) << PAGE_SHIFT; |
5234f5eb EB |
179 | control_code_buffer = start_pgtable + 4096UL; |
180 | ||
181 | /* Setup the identity mapped 64bit page table */ | |
182 | result = init_pgtable(image, start_pgtable); | |
72414d3f | 183 | if (result) |
5234f5eb | 184 | return result; |
5234f5eb EB |
185 | |
186 | /* Place the code in the reboot code buffer */ | |
72414d3f MS |
187 | memcpy(__va(control_code_buffer), relocate_new_kernel, |
188 | relocate_new_kernel_size); | |
5234f5eb EB |
189 | |
190 | return 0; | |
191 | } | |
192 | ||
/* No architecture-specific state to tear down after a kexec load. */
void machine_kexec_cleanup(struct kimage *image)
{
}
197 | ||
/*
 * Do not allocate memory (or fail in any way) in machine_kexec().
 * We are past the point of no return, committed to rebooting now.
 *
 * The statement order below is load-bearing: interrupts off, splice the
 * identity map into the live page table, refresh segment registers,
 * invalidate GDT/IDT, then jump to the trampoline.  This function does
 * not return.
 */
NORET_TYPE void machine_kexec(struct kimage *image)
{
	unsigned long page_list;
	unsigned long control_code_buffer;
	unsigned long start_pgtable;
	relocate_new_kernel_t rnk;

	/* Interrupts aren't acceptable while we reboot */
	local_irq_disable();

	/* Calculate the offsets (same layout machine_kexec_prepare set up:
	 * page table first, trampoline in the following page). */
	page_list = image->head;
	start_pgtable = page_to_pfn(image->control_code_page) << PAGE_SHIFT;
	control_code_buffer = start_pgtable + 4096UL;

	/* Set the low half of the page table to my identity mapped
	 * page table for kexec. Leave the high half pointing at the
	 * kernel pages. Don't bother to flush the global pages
	 * as that will happen when I fully switch to my identity mapped
	 * page table anyway.
	 */
	memcpy(__va(read_cr3()), __va(start_pgtable), PAGE_SIZE/2);
	__flush_tlb();


	/* The segment registers are funny things, they are
	 * automatically loaded from a table, in memory wherever you
	 * set them to a specific selector, but this table is never
	 * accessed again unless you set the segment to a different selector.
	 *
	 * The more common model are caches where the behind
	 * the scenes work is done, but is also dropped at arbitrary
	 * times.
	 *
	 * I take advantage of this here by force loading the
	 * segments, before I zap the gdt with an invalid value.
	 */
	load_segments();
	/* The gdt & idt are now invalid.
	 * If you want to load them you must set up your own idt & gdt.
	 */
	set_gdt(phys_to_virt(0),0);
	set_idt(phys_to_virt(0),0);
	/* Jump to the trampoline through its physical address; this works
	 * because low memory is now identity mapped. */
	rnk = (relocate_new_kernel_t) control_code_buffer;
	(*rnk)(page_list, control_code_buffer, image->start, start_pgtable);
}