| 1 | /* |
| 2 | * linux/arch/x86/kernel/head_64.S -- start in 32bit and switch to 64bit |
| 3 | * |
| 4 | * Copyright (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE |
| 5 | * Copyright (C) 2000 Pavel Machek <pavel@suse.cz> |
| 6 | * Copyright (C) 2000 Karsten Keil <kkeil@suse.de> |
| 7 | * Copyright (C) 2001,2002 Andi Kleen <ak@suse.de> |
| 8 | * Copyright (C) 2005 Eric Biederman <ebiederm@xmission.com> |
| 9 | */ |
| 10 | |
| 11 | |
| 12 | #include <linux/linkage.h> |
| 13 | #include <linux/threads.h> |
| 14 | #include <linux/init.h> |
| 15 | #include <asm/segment.h> |
| 16 | #include <asm/pgtable.h> |
| 17 | #include <asm/page.h> |
| 18 | #include <asm/msr.h> |
| 19 | #include <asm/cache.h> |
| 20 | #include <asm/processor-flags.h> |
| 21 | #include <asm/percpu.h> |
| 22 | #include <asm/nops.h> |
| 23 | #include "../entry/calling.h" |
| 24 | |
| 25 | #ifdef CONFIG_PARAVIRT /* paravirt build: CR2 is obtained through GET_CR2_INTO_RAX */ |
| 26 | #include <asm/asm-offsets.h> |
| 27 | #include <asm/paravirt.h> |
| 28 | #define GET_CR2_INTO(reg) GET_CR2_INTO_RAX ; movq %rax, reg /* result passes through %rax, so %rax is overwritten */ |
| 29 | #else /* !CONFIG_PARAVIRT: read %cr2 directly into reg */ |
| 30 | #define GET_CR2_INTO(reg) movq %cr2, reg |
| 31 | #define INTERRUPT_RETURN iretq /* plain iretq; not referenced in the visible part of this file */ |
| 32 | #endif /* CONFIG_PARAVIRT */ |
| 33 | |
| 34 | /* We cannot switch to the final kernel address space in a single step, |
| 35 | * because the code performing the switch must itself run from identity-mapped pages. |
| 36 | * |
| 37 | */ |
| 38 | |
| 39 | #define pud_index(x) (((x) >> PUD_SHIFT) & (PTRS_PER_PUD-1)) /* index of address x within a PUD (level 3) table */ |
| 40 | |
| 41 | L4_PAGE_OFFSET = pgd_index(__PAGE_OFFSET) /* level 4 slot for __PAGE_OFFSET; used by the Xen init_level4_pgt below */ |
| 42 | L4_START_KERNEL = pgd_index(__START_KERNEL_map) /* level 4 slot for __START_KERNEL_map */ |
| 43 | L3_START_KERNEL = pud_index(__START_KERNEL_map) /* level 3 slot for __START_KERNEL_map; sets the layout of level3_kernel_pgt */ |
| 44 | |
| 45 | .text |
| 46 | __HEAD |
| 47 | .code64 |
| 48 | .globl startup_64 |
| 49 | startup_64: |
| 50 | /* |
| 51 | * At this point the CPU runs in 64bit mode CS.L = 1 CS.D = 0, |
| 52 | * and someone has loaded an identity mapped page table |
| 53 | * for us. These identity mapped page tables map all of the |
| 54 | * kernel pages and possibly all of memory. |
| 55 | * |
| 56 | * %rsi holds a physical pointer to real_mode_data. |
| 57 | * |
| 58 | * We come here either directly from a 64bit bootloader, or from |
| 59 | * arch/x86/boot/compressed/head_64.S. |
| 60 | * |
| 61 | * We only come here initially at boot; nothing else comes here. |
| 62 | * |
| 63 | * Since we may be loaded at an address different from what we were |
| 64 | * compiled to run at we first fixup the physical addresses in our page |
| 65 | * tables and then reload them. The load delta is kept in %rbp throughout. |
| 66 | */ |
| 67 | |
| 68 | /* |
| 69 | * Setup stack for verify_cpu(). "-8" because stack_start is defined |
| 70 | * this way, see below. Our best guess is a NULL ptr for stack |
| 71 | * termination heuristics and we don't want to break anything which |
| 72 | * might depend on it (kgdb, ...). |
| 73 | */ |
| 74 | leaq (__end_init_task - 8)(%rip), %rsp |
| 75 | |
| 76 | /* Sanitize CPU configuration */ |
| 77 | call verify_cpu |
| 78 | |
| 79 | /* |
| 80 | * Compute the delta between the address I am compiled to run at and the |
| 81 | * address I am actually running at. The result lives in %rbp. |
| 82 | */ |
| 83 | leaq _text(%rip), %rbp /* %rbp = address _text is actually running at */ |
| 84 | subq $_text - __START_KERNEL_map, %rbp /* minus its compiled offset: %rbp = load delta */ |
| 85 | |
| 86 | /* Is the address not 2M aligned? */ |
| 87 | testl $~PMD_PAGE_MASK, %ebp /* any bit below the large-page boundary set? */ |
| 88 | jnz bad_address |
| 89 | |
| 90 | /* |
| 91 | * Is the address too large? (any bit at or above MAX_PHYSMEM_BITS set) |
| 92 | */ |
| 93 | leaq _text(%rip), %rax |
| 94 | shrq $MAX_PHYSMEM_BITS, %rax |
| 95 | jnz bad_address |
| 96 | |
| 97 | /* |
| 98 | * Fixup the physical addresses in the page table: add the load delta to |
| 99 | * each statically built entry. Slots are named via L4_START_KERNEL / L3_START_KERNEL so they follow the table layout. |
| 100 | */ |
| 101 | addq %rbp, early_level4_pgt + (L4_START_KERNEL*8)(%rip) |
| 102 | |
| 103 | addq %rbp, level3_kernel_pgt + (L3_START_KERNEL*8)(%rip) /* entry pointing at level2_kernel_pgt */ |
| 104 | addq %rbp, level3_kernel_pgt + ((L3_START_KERNEL+1)*8)(%rip) /* entry pointing at level2_fixmap_pgt */ |
| 105 | |
| 106 | addq %rbp, level2_fixmap_pgt + (506*8)(%rip) /* entry pointing at level1_fixmap_pgt */ |
| 107 | |
| 108 | /* |
| 109 | * Set up the identity mapping for the switchover. These |
| 110 | * entries should *NOT* have the global bit set! This also |
| 111 | * creates a bunch of nonsense entries but that is fine -- |
| 112 | * it avoids problems around wraparound. |
| 113 | */ |
| 114 | leaq _text(%rip), %rdi /* %rdi = running address of _text */ |
| 115 | leaq early_level4_pgt(%rip), %rbx /* %rbx = running address of the level 4 table */ |
| 116 | |
| 117 | movq %rdi, %rax |
| 118 | shrq $PGDIR_SHIFT, %rax /* %rax = level 4 index of _text */ |
| 119 | |
| 120 | leaq (4096 + _KERNPG_TABLE)(%rbx), %rdx /* entry for the page 4096 bytes past the level 4 table (presumably early_dynamic_pgts) */ |
| 121 | movq %rdx, 0(%rbx,%rax,8) |
| 122 | movq %rdx, 8(%rbx,%rax,8) /* same entry in the following slot too */ |
| 123 | |
| 124 | addq $4096, %rdx /* %rdx now refers to the page 8192 bytes past the level 4 table */ |
| 125 | movq %rdi, %rax |
| 126 | shrq $PUD_SHIFT, %rax |
| 127 | andl $(PTRS_PER_PUD-1), %eax /* %eax = level 3 index of _text */ |
| 128 | movq %rdx, 4096(%rbx,%rax,8) |
| 129 | incl %eax |
| 130 | andl $(PTRS_PER_PUD-1), %eax /* next slot, wrapping within the table */ |
| 131 | movq %rdx, 4096(%rbx,%rax,8) |
| 132 | |
| 133 | addq $8192, %rbx /* %rbx = the page used as the level 2 (pmd) table */ |
| 134 | movq %rdi, %rax |
| 135 | shrq $PMD_SHIFT, %rdi /* %rdi = pmd number of _text (masked inside the loop) */ |
| 136 | addq $(__PAGE_KERNEL_LARGE_EXEC & ~_PAGE_GLOBAL), %rax /* %rax = first pmd entry: _text address + flags, global bit cleared */ |
| 137 | leaq (_end - 1)(%rip), %rcx |
| 138 | shrq $PMD_SHIFT, %rcx |
| 139 | subq %rdi, %rcx |
| 140 | incl %ecx /* %ecx = number of pmd entries covering _text .. _end - 1 */ |
| 141 | |
| 142 | 1: |
| 143 | andq $(PTRS_PER_PMD - 1), %rdi /* wrap the index within the table */ |
| 144 | movq %rax, (%rbx,%rdi,8) |
| 145 | incq %rdi |
| 146 | addq $PMD_SIZE, %rax /* next large page */ |
| 147 | decl %ecx |
| 148 | jnz 1b |
| 149 | |
| 150 | /* |
| 151 | * Fixup the kernel text+data virtual addresses. Note that |
| 152 | * we might write invalid pmds, when the kernel is relocated |
| 153 | * cleanup_highmap() fixes this up along with the mappings |
| 154 | * beyond _end. |
| 155 | */ |
| 156 | leaq level2_kernel_pgt(%rip), %rdi |
| 157 | leaq 4096(%rdi), %r8 /* %r8 = end of the table (4096 bytes past its start) */ |
| 158 | /* See if it is a valid page table entry (bit 0 set) */ |
| 159 | 1: testb $1, 0(%rdi) |
| 160 | jz 2f |
| 161 | addq %rbp, 0(%rdi) /* relocate the entry by the load delta */ |
| 162 | /* Go to the next entry (8 bytes each) */ |
| 163 | 2: addq $8, %rdi |
| 164 | cmp %r8, %rdi |
| 165 | jne 1b |
| 166 | |
| 167 | /* Fixup phys_base */ |
| 168 | addq %rbp, phys_base(%rip) |
| 169 | |
| 170 | movq $(early_level4_pgt - __START_KERNEL_map), %rax /* table for %cr3; phys_base is added after the jump */ |
| 171 | jmp 1f /* join the common path inside secondary_startup_64 */ |
| 171 | ENTRY(secondary_startup_64) |
| 172 | /* |
| 173 | * At this point the CPU runs in 64bit mode CS.L = 1 CS.D = 0, |
| 174 | * and someone has loaded a mapped page table. |
| 175 | * |
| 176 | * %rsi holds a physical pointer to real_mode_data. |
| 177 | * |
| 178 | * We come here either from startup_64 (using physical addresses; it joins at the first "1:" label below with its own table in %rax) |
| 179 | * or from trampoline.S (using virtual addresses). |
| 180 | * |
| 181 | * Using virtual addresses from trampoline.S removes the need |
| 182 | * to have any identity mapped pages in the kernel page table |
| 183 | * after the boot processor executes this code. |
| 184 | */ |
| 185 | |
| 186 | /* Sanitize CPU configuration */ |
| 187 | call verify_cpu |
| 188 | |
| 189 | movq $(init_level4_pgt - __START_KERNEL_map), %rax /* table for %cr3; phys_base is added below */ |
| 190 | 1: |
| 191 | |
| 192 | /* Enable PAE mode and PGE (the whole of %cr4 is written with just these two bits) */ |
| 193 | movl $(X86_CR4_PAE | X86_CR4_PGE), %ecx |
| 194 | movq %rcx, %cr4 |
| 195 | |
| 196 | /* Setup early boot stage 4 level pagetables: %cr3 = %rax + phys_base. */ |
| 197 | addq phys_base(%rip), %rax |
| 198 | movq %rax, %cr3 |
| 199 | |
| 200 | /* Ensure I am executing from virtual addresses: jump through the absolute address of the next label */ |
| 201 | movq $1f, %rax |
| 202 | jmp *%rax |
| 203 | 1: |
| 204 | |
| 205 | /* Check if nx is implemented */ |
| 206 | movl $0x80000001, %eax |
| 207 | cpuid |
| 208 | movl %edx,%edi /* keep the feature bits in %edi; bit 20 is tested below */ |
| 209 | |
| 210 | /* Setup EFER (Extended Feature Enable Register) */ |
| 211 | movl $MSR_EFER, %ecx |
| 212 | rdmsr |
| 213 | btsl $_EFER_SCE, %eax /* Enable System Call */ |
| 214 | btl $20,%edi /* No Execute supported? */ |
| 215 | jnc 1f |
| 216 | btsl $_EFER_NX, %eax /* NX available: enable it in EFER ... */ |
| 217 | btsq $_PAGE_BIT_NX,early_pmd_flags(%rip) /* ... and set the NX bit in early_pmd_flags */ |
| 218 | 1: wrmsr /* Make changes effective */ |
| 219 | |
| 220 | /* Setup cr0 */ |
| 221 | #define CR0_STATE (X86_CR0_PE | X86_CR0_MP | X86_CR0_ET | \ |
| 222 | X86_CR0_NE | X86_CR0_WP | X86_CR0_AM | \ |
| 223 | X86_CR0_PG) |
| 224 | movl $CR0_STATE, %eax |
| 225 | /* Make changes effective */ |
| 226 | movq %rax, %cr0 |
| 227 | |
| 228 | /* Setup a boot time stack: %rsp = value stored in stack_start */ |
| 229 | movq stack_start(%rip), %rsp |
| 230 | |
| 231 | /* zero EFLAGS after setting rsp (the push/pop pair needs a usable stack) */ |
| 232 | pushq $0 |
| 233 | popfq |
| 234 | |
| 235 | /* |
| 236 | * We must switch to a new descriptor in kernel space for the GDT |
| 237 | * because soon the kernel won't have access anymore to the userspace |
| 238 | * addresses where we're currently running on. We have to do that here |
| 239 | * because in 32bit we couldn't load a 64bit linear address. |
| 240 | */ |
| 241 | lgdt early_gdt_descr(%rip) |
| 242 | |
| 243 | /* set up data segments: load the null selector into %ds, %ss and %es */ |
| 244 | xorl %eax,%eax |
| 245 | movl %eax,%ds |
| 246 | movl %eax,%ss |
| 247 | movl %eax,%es |
| 248 | |
| 249 | /* |
| 250 | * We don't really need to load %fs or %gs, but load them anyway |
| 251 | * to kill any stale realmode selectors. This allows execution |
| 252 | * under VT hardware. |
| 253 | */ |
| 254 | movl %eax,%fs |
| 255 | movl %eax,%gs |
| 256 | |
| 257 | /* Set up %gs. |
| 258 | * |
| 259 | * The base of %gs always points to the bottom of the irqstack |
| 260 | * union. If the stack protector canary is enabled, it is |
| 261 | * located at %gs:40. Note that, on SMP, the boot cpu uses |
| 262 | * init data section till per cpu areas are set up. |
| 263 | * The 64-bit value stored in initial_gs is written to MSR_GS_BASE as %edx:%eax. |
| 264 | */ |
| 265 | movl $MSR_GS_BASE,%ecx |
| 266 | movl initial_gs(%rip),%eax |
| 267 | movl initial_gs+4(%rip),%edx |
| 268 | wrmsr |
| 269 | |
| 270 | /* rsi is pointer to real mode structure with interesting info. |
| 271 | pass it to C (as the first argument, in %rdi) */ |
| 272 | movq %rsi, %rdi |
| 273 | |
| 274 | /* Finally jump to run C code and to be on real kernel address |
| 275 | * Since we are running on identity-mapped space we have to jump |
| 276 | * to the full 64bit address, this is only possible as indirect |
| 277 | * jump. In addition we need to ensure %cs is set so we make this |
| 278 | * a far return. |
| 279 | * |
| 280 | * Note: do not change to far jump indirect with 64bit offset. |
| 281 | * |
| 282 | * AMD does not support far jump indirect with 64bit offset. |
| 283 | * AMD64 Architecture Programmer's Manual, Volume 3: states only |
| 284 | * JMP FAR mem16:16 FF /5 Far jump indirect, |
| 285 | * with the target specified by a far pointer in memory. |
| 286 | * JMP FAR mem16:32 FF /5 Far jump indirect, |
| 287 | * with the target specified by a far pointer in memory. |
| 288 | * |
| 289 | * Intel64 does support 64bit offset. |
| 290 | * Software Developer Manual Vol 2: states: |
| 291 | * FF /5 JMP m16:16 Jump far, absolute indirect, |
| 292 | * address given in m16:16 |
| 293 | * FF /5 JMP m16:32 Jump far, absolute indirect, |
| 294 | * address given in m16:32. |
| 295 | * REX.W + FF /5 JMP m16:64 Jump far, absolute indirect, |
| 296 | * address given in m16:64. |
| 297 | */ |
| 298 | movq initial_code(%rip),%rax |
| 299 | pushq $0 # fake return address to stop unwinder |
| 300 | pushq $__KERNEL_CS # set correct cs |
| 301 | pushq %rax # target address in negative space |
| 302 | lretq |
| 302 | |
| 303 | #include "verify_cpu.S" |
| 304 | |
| 305 | #ifdef CONFIG_HOTPLUG_CPU |
| 306 | /* |
| 307 | * Boot CPU0 entry point. It's called from play_dead(). Everything has been set |
| 308 | * up already except stack. We just load %rsp from stack_start here. Then far-return |
| 309 | * to the address stored in initial_code (start_secondary(), per the original note; that value is set outside this file). |
| 310 | */ |
| 311 | ENTRY(start_cpu0) |
| 312 | movq stack_start(%rip),%rsp |
| 313 | movq initial_code(%rip),%rax |
| 314 | pushq $0 # fake return address to stop unwinder |
| 315 | pushq $__KERNEL_CS # set correct cs |
| 316 | pushq %rax # target address in negative space |
| 317 | lretq /* pops the target address and __KERNEL_CS pushed above */ |
| 318 | ENDPROC(start_cpu0) |
| 319 | #endif /* CONFIG_HOTPLUG_CPU */ |
| 320 | |
| 321 | /* SMP bootup changes these two. All three values below are read by secondary_startup_64 (initial_code and stack_start also by start_cpu0). */ |
| 322 | __REFDATA |
| 323 | .balign 8 |
| 324 | GLOBAL(initial_code) /* address the startup code far-returns to */ |
| 325 | .quad x86_64_start_kernel |
| 326 | GLOBAL(initial_gs) /* value written to MSR_GS_BASE */ |
| 327 | .quad INIT_PER_CPU_VAR(irq_stack_union) |
| 328 | |
| 329 | GLOBAL(stack_start) /* initial %rsp; 8 bytes below the end of init_thread_union */ |
| 330 | .quad init_thread_union+THREAD_SIZE-8 |
| 331 | .word 0 /* NOTE(review): nothing in the visible part of this file reads this word -- confirm whether it is still needed */ |
| 332 | __FINITDATA |
| 333 | |
| 334 | bad_address: /* reached from startup_64 when the load address is misaligned or too large */ |
| 335 | jmp bad_address /* spin here forever */ |
| 336 | |
| 337 | __INIT |
| 338 | ENTRY(early_idt_handler_array) |
| 339 | # 32(%rsp) %rflags (offsets as seen on arrival at early_idt_handler_common, before it pushes anything) |
| 340 | # 24(%rsp) %cs |
| 341 | # 16(%rsp) %rip |
| 342 | # 8(%rsp) error code |
| 343 | i = 0 |
| 344 | .rept NUM_EXCEPTION_VECTORS |
| 345 | .ifeq (EXCEPTION_ERRCODE_MASK >> i) & 1 |
| 346 | pushq $0 # Dummy error code, to make stack frame uniform |
| 347 | .endif |
| 348 | pushq $i # 0(%rsp) Vector number |
| 349 | jmp early_idt_handler_common |
| 350 | i = i + 1 |
| 351 | .fill early_idt_handler_array + i*EARLY_IDT_HANDLER_SIZE - ., 1, 0xcc /* pad with 0xcc so each stub occupies exactly EARLY_IDT_HANDLER_SIZE bytes */ |
| 352 | .endr |
| 353 | ENDPROC(early_idt_handler_array) |
| 354 | |
| 355 | early_idt_handler_common: |
| 356 | /* |
| 357 | * The stack is the hardware frame, an error code or zero, and the |
| 358 | * vector number. Every stub in early_idt_handler_array jumps here. |
| 359 | */ |
| 360 | cld |
| 361 | |
| 362 | incl early_recursion_flag(%rip) /* raised for the duration of the handler; lowered again at label 20 */ |
| 363 | |
| 364 | /* The vector number is currently in the pt_regs->di slot. */ |
| 365 | pushq %rsi /* pt_regs->si */ |
| 366 | movq 8(%rsp), %rsi /* RSI = vector number */ |
| 367 | movq %rdi, 8(%rsp) /* pt_regs->di = RDI */ |
| 368 | pushq %rdx /* pt_regs->dx */ |
| 369 | pushq %rcx /* pt_regs->cx */ |
| 370 | pushq %rax /* pt_regs->ax */ |
| 371 | pushq %r8 /* pt_regs->r8 */ |
| 372 | pushq %r9 /* pt_regs->r9 */ |
| 373 | pushq %r10 /* pt_regs->r10 */ |
| 374 | pushq %r11 /* pt_regs->r11 */ |
| 375 | pushq %rbx /* pt_regs->bx */ |
| 376 | pushq %rbp /* pt_regs->bp */ |
| 377 | pushq %r12 /* pt_regs->r12 */ |
| 378 | pushq %r13 /* pt_regs->r13 */ |
| 379 | pushq %r14 /* pt_regs->r14 */ |
| 380 | pushq %r15 /* pt_regs->r15 */ |
| 381 | |
| 382 | cmpq $14,%rsi /* Page fault? */ |
| 383 | jnz 10f |
| 384 | GET_CR2_INTO(%rdi) /* Can clobber any volatile register if pv */ |
| 385 | call early_make_pgtable |
| 386 | andl %eax,%eax /* sets ZF when early_make_pgtable returned 0 */ |
| 387 | jz 20f /* All good */ |
| 388 | |
| 389 | 10: /* not a page fault, or early_make_pgtable returned nonzero */ |
| 390 | movq %rsp,%rdi /* RDI = pt_regs; RSI is already trapnr */ |
| 391 | call early_fixup_exception |
| 392 | |
| 393 | 20: |
| 394 | decl early_recursion_flag(%rip) |
| 395 | jmp restore_regs_and_iret /* defined outside this file */ |
| 396 | ENDPROC(early_idt_handler_common) |
| 397 | |
| 398 | __INITDATA |
| 399 | |
| 400 | .balign 4 |
| 401 | GLOBAL(early_recursion_flag) /* 32-bit counter: incremented on entry to early_idt_handler_common, decremented on exit */ |
| 402 | .long 0 |
| 403 | |
| 404 | #define NEXT_PAGE(name) \ |
| 405 | .balign PAGE_SIZE; \ |
| 406 | GLOBAL(name) /* NEXT_PAGE: align to PAGE_SIZE, then define the global label name */ |
| 407 | |
| 408 | /* Automate the creation of 1 to 1 mapping pmd entries: emits COUNT quads, entry i being START + (i << PMD_SHIFT) + PERM. Resets the assembler symbol i. */ |
| 409 | #define PMDS(START, PERM, COUNT) \ |
| 410 | i = 0 ; \ |
| 411 | .rept (COUNT) ; \ |
| 412 | .quad (START) + (i << PMD_SHIFT) + (PERM) ; \ |
| 413 | i = i + 1 ; \ |
| 414 | .endr |
| 415 | |
| 416 | __INITDATA |
| 417 | NEXT_PAGE(early_level4_pgt) /* level 4 table that startup_64 fills in and hands over for loading into %cr3 */ |
| 418 | .fill 511,8,0 /* 511 empty slots ... */ |
| 419 | .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE /* ... then one slot pointing at level3_kernel_pgt (relocated by startup_64) */ |
| 420 | |
| 421 | NEXT_PAGE(early_dynamic_pgts) /* zero-filled pool of EARLY_DYNAMIC_PAGE_TABLES tables; presumably the pages startup_64 addresses at early_level4_pgt + 4096 and + 8192 (assumes PAGE_SIZE is 4096 -- confirm) */ |
| 422 | .fill 512*EARLY_DYNAMIC_PAGE_TABLES,8,0 |
| 423 | |
| 424 | .data |
| 425 | |
| 426 | #ifndef CONFIG_XEN |
| 427 | NEXT_PAGE(init_level4_pgt) /* level 4 table used by secondary_startup_64; all 512 slots start out zero here */ |
| 428 | .fill 512,8,0 |
| 429 | #else /* CONFIG_XEN: statically populated level 4 table plus identity-mapping tables */ |
| 430 | NEXT_PAGE(init_level4_pgt) |
| 431 | .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE /* slot 0 */ |
| 432 | .org init_level4_pgt + L4_PAGE_OFFSET*8, 0 /* zero-fill up to slot L4_PAGE_OFFSET */ |
| 433 | .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE |
| 434 | .org init_level4_pgt + L4_START_KERNEL*8, 0 /* zero-fill up to slot L4_START_KERNEL */ |
| 435 | /* (2^48-(2*1024*1024*1024))/(2^39) = 511 */ |
| 436 | .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE |
| 437 | |
| 438 | NEXT_PAGE(level3_ident_pgt) |
| 439 | .quad level2_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE /* slot 0 points at level2_ident_pgt; the other 511 slots are empty */ |
| 440 | .fill 511, 8, 0 |
| 441 | NEXT_PAGE(level2_ident_pgt) |
| 442 | /* Since I easily can, map the first 1G. |
| 443 | * Don't set NX because code runs from these pages. |
| 444 | * PTRS_PER_PMD entries are emitted, starting at physical address 0. |
| 445 | */ |
| 446 | PMDS(0, __PAGE_KERNEL_IDENT_LARGE_EXEC, PTRS_PER_PMD) |
| 447 | #endif |
| 447 | |
| 448 | NEXT_PAGE(level3_kernel_pgt) /* level 3 table for the kernel mapping; startup_64 adds the load delta to the two entries below */ |
| 449 | .fill L3_START_KERNEL,8,0 /* empty slots below index L3_START_KERNEL */ |
| 450 | /* (2^48-(2*1024*1024*1024)-((2^39)*511))/(2^30) = 510 */ |
| 451 | .quad level2_kernel_pgt - __START_KERNEL_map + _KERNPG_TABLE /* slot L3_START_KERNEL */ |
| 452 | .quad level2_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE /* slot L3_START_KERNEL + 1 */ |
| 453 | |
| 454 | NEXT_PAGE(level2_kernel_pgt) |
| 455 | /* |
| 456 | * 512 MB kernel mapping. We spend a full page on this pagetable |
| 457 | * anyway. The number of entries emitted is KERNEL_IMAGE_SIZE/PMD_SIZE, starting at physical address 0. |
| 458 | * |
| 459 | * The kernel code+data+bss must not be bigger than that. |
| 460 | * startup_64 adds the load delta to every entry here that has bit 0 set. |
| 461 | * (NOTE: at +512MB starts the module area, see MODULES_VADDR. |
| 462 | * If you want to increase this then increase MODULES_VADDR |
| 463 | * too.) |
| 464 | */ |
| 465 | PMDS(0, __PAGE_KERNEL_LARGE_EXEC, |
| 466 | KERNEL_IMAGE_SIZE/PMD_SIZE) |
| 467 | |
| 468 | NEXT_PAGE(level2_fixmap_pgt) /* 506 empty + 1 pointer + 5 empty = 512 entries */ |
| 469 | .fill 506,8,0 |
| 470 | .quad level1_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE /* slot 506: startup_64 relocates this slot by the same hard-coded index, keep the two in sync */ |
| 471 | /* 8MB reserved for vsyscalls + a 2MB hole = 4 + 1 entries */ |
| 472 | .fill 5,8,0 |
| 473 | |
| 474 | NEXT_PAGE(level1_fixmap_pgt) /* 512 empty entries; pointed to by slot 506 of level2_fixmap_pgt */ |
| 475 | .fill 512,8,0 |
| 476 | |
| 477 | #undef PMDS |
| 478 | |
| 479 | .data |
| 480 | .align 16 |
| 481 | .globl early_gdt_descr |
| 482 | early_gdt_descr: /* operand of the lgdt in secondary_startup_64: 16-bit limit followed by 64-bit base */ |
| 483 | .word GDT_ENTRIES*8-1 /* limit: GDT_ENTRIES descriptors of 8 bytes each, minus one */ |
| 484 | early_gdt_descr_base: |
| 485 | .quad INIT_PER_CPU_VAR(gdt_page) /* base address */ |
| 486 | |
| 487 | ENTRY(phys_base) |
| 488 | /* This must match the first entry in level2_kernel_pgt. startup_64 adds the load delta to it; it is added to the page table address before %cr3 is loaded. */ |
| 489 | .quad 0x0000000000000000 |
| 490 | |
| 491 | #include "../../x86/xen/xen-head.S" |
| 492 | |
| 493 | __PAGE_ALIGNED_BSS |
| 494 | NEXT_PAGE(empty_zero_page) /* one page-aligned page of reserved space */ |
| 495 | .skip PAGE_SIZE |
| 496 | |