arch/x86/xen/setup.c

   1 /*
   2  * Machine specific setup for xen
   3  *
   4  * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
   5  */
   6
   7 #include <linux/module.h>
   8 #include <linux/sched.h>
   9 #include <linux/mm.h>
  10 #include <linux/pm.h>
  11 #include <linux/memblock.h>
  12 #include <linux/cpuidle.h>
  13 #include <linux/cpufreq.h>
  14
  15 #include <asm/elf.h>
  16 #include <asm/vdso.h>
  17 #include <asm/e820.h>
  18 #include <asm/setup.h>
  19 #include <asm/acpi.h>
  20 #include <asm/xen/hypervisor.h>
  21 #include <asm/xen/hypercall.h>
  22
  23 #include <xen/xen.h>
  24 #include <xen/page.h>
  25 #include <xen/interface/callback.h>
  26 #include <xen/interface/memory.h>
  27 #include <xen/interface/physdev.h>
  28 #include <xen/features.h>
  29 #include "xen-ops.h"
  30 #include "vdso.h"
  31
/*
 * These are code, but not functions.  Defined in entry.S.
 *
 * xen_arch_setup() registers xen_hypervisor_callback and
 * xen_failsafe_callback as the event and failsafe callbacks; the
 * sysenter/syscall targets are registered by xen_enable_sysenter() and
 * xen_enable_syscall().
 */
extern const char xen_hypervisor_callback[];
extern const char xen_failsafe_callback[];
extern void xen_sysenter_target(void);
extern void xen_syscall_target(void);
extern void xen_syscall32_target(void);

/*
 * Extra memory regions (start/size pairs) that we add to the e820
 * ranges.  Filled in by xen_add_extra_mem(); a slot with size == 0 is
 * unused.
 */
struct xen_memory_region xen_extra_mem[XEN_EXTRA_MEM_MAX_REGIONS] __initdata;

/*
 * Number of pages released from the initial allocation.  Set by
 * xen_memory_setup() to the pages released minus the pages that were
 * populated back.
 */
unsigned long xen_released_pages;
  44
  45 /*
  46  * The maximum amount of extra memory compared to the base size.  The
  47  * main scaling factor is the size of struct page.  At extreme ratios
  48  * of base:extra, all the base memory can be filled with page
  49  * structures for the extra memory, leaving no space for anything
  50  * else.
  51  *
  52  * 10x seems like a reasonable balance between scaling flexibility and
  53  * leaving a practically usable system.
  54  */
  55 #define EXTRA_MEM_RATIO         (10)
  56
  57 static void __init xen_add_extra_mem(u64 start, u64 size)
  58 {
  59         unsigned long pfn;
  60         int i;
  61
  62         for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) {
  63                 /* Add new region. */
  64                 if (xen_extra_mem[i].size == 0) {
  65                         xen_extra_mem[i].start = start;
  66                         xen_extra_mem[i].size  = size;
  67                         break;
  68                 }
  69                 /* Append to existing region. */
  70                 if (xen_extra_mem[i].start + xen_extra_mem[i].size == start) {
  71                         xen_extra_mem[i].size += size;
  72                         break;
  73                 }
  74         }
  75         if (i == XEN_EXTRA_MEM_MAX_REGIONS)
  76                 printk(KERN_WARNING "Warning: not enough extra memory regions\n");
  77
  78         memblock_reserve(start, size);
  79
  80         xen_max_p2m_pfn = PFN_DOWN(start + size);
  81
  82         for (pfn = PFN_DOWN(start); pfn <= xen_max_p2m_pfn; pfn++)
  83                 __set_phys_to_machine(pfn, INVALID_P2M_ENTRY);
  84 }
  85
  86 static unsigned long __init xen_do_chunk(unsigned long start,
  87                                          unsigned long end, bool release)
  88 {
  89         struct xen_memory_reservation reservation = {
  90                 .address_bits = 0,
  91                 .extent_order = 0,
  92                 .domid        = DOMID_SELF
  93         };
  94         unsigned long len = 0;
  95         unsigned long pfn;
  96         int ret;
  97
  98         for (pfn = start; pfn < end; pfn++) {
  99                 unsigned long frame;
 100                 unsigned long mfn = pfn_to_mfn(pfn);
 101
 102                 if (release) {
 103                         /* Make sure pfn exists to start with */
 104                         if (mfn == INVALID_P2M_ENTRY || mfn_to_pfn(mfn) != pfn)
 105                                 continue;
 106                         frame = mfn;
 107                 } else {
 108                         if (mfn != INVALID_P2M_ENTRY)
 109                                 continue;
 110                         frame = pfn;
 111                 }
 112                 set_xen_guest_handle(reservation.extent_start, &frame);
 113                 reservation.nr_extents = 1;
 114
 115                 ret = HYPERVISOR_memory_op(release ? XENMEM_decrease_reservation : XENMEM_populate_physmap,
 116                                            &reservation);
 117                 WARN(ret != 1, "Failed to %s pfn %lx err=%d\n",
 118                      release ? "release" : "populate", pfn, ret);
 119
 120                 if (ret == 1) {
 121                         if (!early_set_phys_to_machine(pfn, release ? INVALID_P2M_ENTRY : frame)) {
 122                                 if (release)
 123                                         break;
 124                                 set_xen_guest_handle(reservation.extent_start, &frame);
 125                                 reservation.nr_extents = 1;
 126                                 ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation,
 127                                                            &reservation);
 128                                 break;
 129                         }
 130                         len++;
 131                 } else
 132                         break;
 133         }
 134         if (len)
 135                 printk(KERN_INFO "%s %lx-%lx pfn range: %lu pages %s\n",
 136                        release ? "Freeing" : "Populating",
 137                        start, end, len,
 138                        release ? "freed" : "added");
 139
 140         return len;
 141 }
 142
 143 static unsigned long __init xen_release_chunk(unsigned long start,
 144                                               unsigned long end)
 145 {
 146         return xen_do_chunk(start, end, true);
 147 }
 148
 149 static unsigned long __init xen_populate_chunk(
 150         const struct e820entry *list, size_t map_size,
 151         unsigned long max_pfn, unsigned long *last_pfn,
 152         unsigned long credits_left)
 153 {
 154         const struct e820entry *entry;
 155         unsigned int i;
 156         unsigned long done = 0;
 157         unsigned long dest_pfn;
 158
 159         for (i = 0, entry = list; i < map_size; i++, entry++) {
 160                 unsigned long credits = credits_left;
 161                 unsigned long s_pfn;
 162                 unsigned long e_pfn;
 163                 unsigned long pfns;
 164                 long capacity;
 165
 166                 if (credits <= 0)
 167                         break;
 168
 169                 if (entry->type != E820_RAM)
 170                         continue;
 171
 172                 e_pfn = PFN_UP(entry->addr + entry->size);
 173
 174                 /* We only care about E820 after the xen_start_info->nr_pages */
 175                 if (e_pfn <= max_pfn)
 176                         continue;
 177
 178                 s_pfn = PFN_DOWN(entry->addr);
 179                 /* If the E820 falls within the nr_pages, we want to start
 180                  * at the nr_pages PFN.
 181                  * If that would mean going past the E820 entry, skip it
 182                  */
 183                 if (s_pfn <= max_pfn) {
 184                         capacity = e_pfn - max_pfn;
 185                         dest_pfn = max_pfn;
 186                 } else {
 187                         /* last_pfn MUST be within E820_RAM regions */
 188                         if (*last_pfn && e_pfn >= *last_pfn)
 189                                 s_pfn = *last_pfn;
 190                         capacity = e_pfn - s_pfn;
 191                         dest_pfn = s_pfn;
 192                 }
 193                 /* If we had filled this E820_RAM entry, go to the next one. */
 194                 if (capacity <= 0)
 195                         continue;
 196
 197                 if (credits > capacity)
 198                         credits = capacity;
 199
 200                 pfns = xen_do_chunk(dest_pfn, dest_pfn + credits, false);
 201                 done += pfns;
 202                 credits_left -= pfns;
 203                 *last_pfn = (dest_pfn + pfns);
 204         }
 205         return done;
 206 }
 207
 208 static void __init xen_set_identity_and_release_chunk(
 209         unsigned long start_pfn, unsigned long end_pfn, unsigned long nr_pages,
 210         unsigned long *released, unsigned long *identity)
 211 {
 212         unsigned long pfn;
 213
 214         /*
 215          * If the PFNs are currently mapped, the VA mapping also needs
 216          * to be updated to be 1:1.
 217          */
 218         for (pfn = start_pfn; pfn <= max_pfn_mapped && pfn < end_pfn; pfn++)
 219                 (void)HYPERVISOR_update_va_mapping(
 220                         (unsigned long)__va(pfn << PAGE_SHIFT),
 221                         mfn_pte(pfn, PAGE_KERNEL_IO), 0);
 222
 223         if (start_pfn < nr_pages)
 224                 *released += xen_release_chunk(
 225                         start_pfn, min(end_pfn, nr_pages));
 226
 227         *identity += set_phys_range_identity(start_pfn, end_pfn);
 228 }
 229
 230 static unsigned long __init xen_set_identity_and_release(
 231         const struct e820entry *list, size_t map_size, unsigned long nr_pages)
 232 {
 233         phys_addr_t start = 0;
 234         unsigned long released = 0;
 235         unsigned long identity = 0;
 236         const struct e820entry *entry;
 237         int i;
 238
 239         /*
 240          * Combine non-RAM regions and gaps until a RAM region (or the
 241          * end of the map) is reached, then set the 1:1 map and
 242          * release the pages (if available) in those non-RAM regions.
 243          *
 244          * The combined non-RAM regions are rounded to a whole number
 245          * of pages so any partial pages are accessible via the 1:1
 246          * mapping.  This is needed for some BIOSes that put (for
 247          * example) the DMI tables in a reserved region that begins on
 248          * a non-page boundary.
 249          */
 250         for (i = 0, entry = list; i < map_size; i++, entry++) {
 251                 phys_addr_t end = entry->addr + entry->size;
 252                 if (entry->type == E820_RAM || i == map_size - 1) {
 253                         unsigned long start_pfn = PFN_DOWN(start);
 254                         unsigned long end_pfn = PFN_UP(end);
 255
 256                         if (entry->type == E820_RAM)
 257                                 end_pfn = PFN_UP(entry->addr);
 258
 259                         if (start_pfn < end_pfn)
 260                                 xen_set_identity_and_release_chunk(
 261                                         start_pfn, end_pfn, nr_pages,
 262                                         &released, &identity);
 263
 264                         start = end;
 265                 }
 266         }
 267
 268         if (released)
 269                 printk(KERN_INFO "Released %lu pages of unused memory\n", released);
 270         if (identity)
 271                 printk(KERN_INFO "Set %ld page(s) to 1-1 mapping\n", identity);
 272
 273         return released;
 274 }
 275
 276 static unsigned long __init xen_get_max_pages(void)
 277 {
 278         unsigned long max_pages = MAX_DOMAIN_PAGES;
 279         domid_t domid = DOMID_SELF;
 280         int ret;
 281
 282         /*
 283          * For the initial domain we use the maximum reservation as
 284          * the maximum page.
 285          *
 286          * For guest domains the current maximum reservation reflects
 287          * the current maximum rather than the static maximum. In this
 288          * case the e820 map provided to us will cover the static
 289          * maximum region.
 290          */
 291         if (xen_initial_domain()) {
 292                 ret = HYPERVISOR_memory_op(XENMEM_maximum_reservation, &domid);
 293                 if (ret > 0)
 294                         max_pages = ret;
 295         }
 296
 297         return min(max_pages, MAX_DOMAIN_PAGES);
 298 }
 299
 300 static void xen_align_and_add_e820_region(u64 start, u64 size, int type)
 301 {
 302         u64 end = start + size;
 303
 304         /* Align RAM regions to page boundaries. */
 305         if (type == E820_RAM) {
 306                 start = PAGE_ALIGN(start);
 307                 end &= ~((u64)PAGE_SIZE - 1);
 308         }
 309
 310         e820_add_region(start, end - start, type);
 311 }
 312
/**
 * xen_memory_setup - Xen hook for machine specific memory setup.
 *
 * Fetches the memory map from the hypervisor, sets non-RAM ranges and
 * gaps to identity in the P2M (releasing any initial pages that lie
 * there), populates released pages back into RAM ranges above the
 * initial allocation, and then builds the kernel's e820 map from the
 * result.  Remaining released pages and pages up to the domain maximum
 * are registered as extra memory.
 *
 * Returns the string "Xen".
 **/
char * __init xen_memory_setup(void)
{
	static struct e820entry map[E820MAX] __initdata;

	unsigned long max_pfn = xen_start_info->nr_pages;
	unsigned long long mem_end;
	int rc;
	struct xen_memory_map memmap;
	unsigned long max_pages;
	unsigned long last_pfn = 0;
	unsigned long extra_pages = 0;
	unsigned long populated;
	int i;
	int op;

	/* The initial allocation is capped at MAX_DOMAIN_PAGES. */
	max_pfn = min(MAX_DOMAIN_PAGES, max_pfn);
	mem_end = PFN_PHYS(max_pfn);

	memmap.nr_entries = E820MAX;
	set_xen_guest_handle(memmap.buffer, map);

	/*
	 * The initial domain asks for XENMEM_machine_memory_map, every
	 * other domain for XENMEM_memory_map.
	 */
	op = xen_initial_domain() ?
		XENMEM_machine_memory_map :
		XENMEM_memory_map;
	rc = HYPERVISOR_memory_op(op, &memmap);
	if (rc == -ENOSYS) {
		/*
		 * No memory map available: fake a single RAM entry
		 * covering the initial allocation.  Not acceptable for
		 * the initial domain.
		 */
		BUG_ON(xen_initial_domain());
		memmap.nr_entries = 1;
		map[0].addr = 0ULL;
		map[0].size = mem_end;
		/* 8MB slack (to balance backend allocations). */
		map[0].size += 8ULL << 20;
		map[0].type = E820_RAM;
		rc = 0;
	}
	/* Any other hypercall error is fatal. */
	BUG_ON(rc);

	/* Make sure the Xen-supplied memory map is well-ordered. */
	sanitize_e820_map(map, memmap.nr_entries, &memmap.nr_entries);

	/* Pages between the initial allocation and the maximum are extra. */
	max_pages = xen_get_max_pages();
	if (max_pages > max_pfn)
		extra_pages += max_pages - max_pfn;

	/*
	 * Set P2M for all non-RAM pages and E820 gaps to be identity
	 * type PFNs.  Any RAM pages that would be made inaccessible by
	 * this are first released.
	 */
	xen_released_pages = xen_set_identity_and_release(
		map, memmap.nr_entries, max_pfn);

	/*
	 * Populate back the non-RAM pages and E820 gaps that had been
	 * released. */
	populated = xen_populate_chunk(map, memmap.nr_entries,
			max_pfn, &last_pfn, xen_released_pages);

	/* Whatever could not be populated back becomes extra memory. */
	xen_released_pages -= populated;
	extra_pages += xen_released_pages;

	/* Populating pages above max_pfn extends the end of base memory. */
	if (last_pfn > max_pfn) {
		max_pfn = min(MAX_DOMAIN_PAGES, last_pfn);
		mem_end = PFN_PHYS(max_pfn);
	}
	/*
	 * Clamp the amount of extra memory to a EXTRA_MEM_RATIO
	 * factor the base size.  On non-highmem systems, the base
	 * size is the full initial memory allocation; on highmem it
	 * is limited to the max size of lowmem, so that it doesn't
	 * get completely filled.
	 *
	 * In principle there could be a problem in lowmem systems if
	 * the initial memory is also very large with respect to
	 * lowmem, but we won't try to deal with that here.
	 */
	extra_pages = min(EXTRA_MEM_RATIO * min(max_pfn, PFN_DOWN(MAXMEM)),
			  extra_pages);
	/*
	 * Copy the Xen map into the kernel e820 map.  An entry may be
	 * consumed in several pieces (e.g. a RAM entry that straddles
	 * mem_end), so i only advances once the entry's remaining size
	 * reaches zero.
	 */
	i = 0;
	while (i < memmap.nr_entries) {
		u64 addr = map[i].addr;
		u64 size = map[i].size;
		u32 type = map[i].type;

		if (type == E820_RAM) {
			if (addr < mem_end) {
				/* Base memory: keep only the part below mem_end. */
				size = min(size, mem_end - addr);
			} else if (extra_pages) {
				/* Beyond base memory: hand out extra pages. */
				size = min(size, (u64)extra_pages * PAGE_SIZE);
				extra_pages -= size / PAGE_SIZE;
				xen_add_extra_mem(addr, size);
			} else
				/* No extra pages left: RAM we cannot use. */
				type = E820_UNUSABLE;
		}

		xen_align_and_add_e820_region(addr, size, type);

		/* Consume the piece just added from the source entry. */
		map[i].addr += size;
		map[i].size -= size;
		if (map[i].size == 0)
			i++;
	}

	/*
	 * In domU, the ISA region is normal, usable memory, but we
	 * reserve ISA memory anyway because too many things poke
	 * about in there.
	 */
	e820_add_region(ISA_START_ADDRESS, ISA_END_ADDRESS - ISA_START_ADDRESS,
			E820_RESERVED);

	/*
	 * Reserve Xen bits:
	 *  - mfn_list
	 *  - xen_start_info
	 * See comment above "struct start_info" in <xen/interface/xen.h>
	 */
	memblock_reserve(__pa(xen_start_info->mfn_list),
			 xen_start_info->pt_base - xen_start_info->mfn_list);

	/* Clean up the e820 map assembled above. */
	sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);

	return "Xen";
}
 440
 441 /*
 442  * Set the bit indicating "nosegneg" library variants should be used.
 443  * We only need to bother in pure 32-bit mode; compat 32-bit processes
 444  * can have un-truncated segments, so wrapping around is allowed.
 445  */
 446 static void __init fiddle_vdso(void)
 447 {
 448 #ifdef CONFIG_X86_32
 449         u32 *mask;
 450         mask = VDSO32_SYMBOL(&vdso32_int80_start, NOTE_MASK);
 451         *mask |= 1 << VDSO_NOTE_NONEGSEG_BIT;
 452         mask = VDSO32_SYMBOL(&vdso32_sysenter_start, NOTE_MASK);
 453         *mask |= 1 << VDSO_NOTE_NONEGSEG_BIT;
 454 #endif
 455 }
 456
 457 static int __cpuinit register_callback(unsigned type, const void *func)
 458 {
 459         struct callback_register callback = {
 460                 .type = type,
 461                 .address = XEN_CALLBACK(__KERNEL_CS, func),
 462                 .flags = CALLBACKF_mask_events,
 463         };
 464
 465         return HYPERVISOR_callback_op(CALLBACKOP_register, &callback);
 466 }
 467
 468 void __cpuinit xen_enable_sysenter(void)
 469 {
 470         int ret;
 471         unsigned sysenter_feature;
 472
 473 #ifdef CONFIG_X86_32
 474         sysenter_feature = X86_FEATURE_SEP;
 475 #else
 476         sysenter_feature = X86_FEATURE_SYSENTER32;
 477 #endif
 478
 479         if (!boot_cpu_has(sysenter_feature))
 480                 return;
 481
 482         ret = register_callback(CALLBACKTYPE_sysenter, xen_sysenter_target);
 483         if(ret != 0)
 484                 setup_clear_cpu_cap(sysenter_feature);
 485 }
 486
 487 void __cpuinit xen_enable_syscall(void)
 488 {
 489 #ifdef CONFIG_X86_64
 490         int ret;
 491
 492         ret = register_callback(CALLBACKTYPE_syscall, xen_syscall_target);
 493         if (ret != 0) {
 494                 printk(KERN_ERR "Failed to set syscall callback: %d\n", ret);
 495                 /* Pretty fatal; 64-bit userspace has no other
 496                    mechanism for syscalls. */
 497         }
 498
 499         if (boot_cpu_has(X86_FEATURE_SYSCALL32)) {
 500                 ret = register_callback(CALLBACKTYPE_syscall32,
 501                                         xen_syscall32_target);
 502                 if (ret != 0)
 503                         setup_clear_cpu_cap(X86_FEATURE_SYSCALL32);
 504         }
 505 #endif /* CONFIG_X86_64 */
 506 }
 507
 508 void __init xen_arch_setup(void)
 509 {
 510         xen_panic_handler_init();
 511
 512         HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_4gb_segments);
 513         HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_writable_pagetables);
 514
 515         if (!xen_feature(XENFEAT_auto_translated_physmap))
 516                 HYPERVISOR_vm_assist(VMASST_CMD_enable,
 517                                      VMASST_TYPE_pae_extended_cr3);
 518
 519         if (register_callback(CALLBACKTYPE_event, xen_hypervisor_callback) ||
 520             register_callback(CALLBACKTYPE_failsafe, xen_failsafe_callback))
 521                 BUG();
 522
 523         xen_enable_sysenter();
 524         xen_enable_syscall();
 525
 526 #ifdef CONFIG_ACPI
 527         if (!(xen_start_info->flags & SIF_INITDOMAIN)) {
 528                 printk(KERN_INFO "ACPI in unprivileged domain disabled\n");
 529                 disable_acpi();
 530         }
 531 #endif
 532
 533         memcpy(boot_command_line, xen_start_info->cmd_line,
 534                MAX_GUEST_CMDLINE > COMMAND_LINE_SIZE ?
 535                COMMAND_LINE_SIZE : MAX_GUEST_CMDLINE);
 536
 537         /* Set up idle, making sure it calls safe_halt() pvop */
 538 #ifdef CONFIG_X86_32
 539         boot_cpu_data.hlt_works_ok = 1;
 540 #endif
 541         disable_cpuidle();
 542         disable_cpufreq();
 543         WARN_ON(set_pm_idle_to_default());
 544         fiddle_vdso();
 545 }