Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mason/linux...
[deliverable/linux.git] / mm / memory_hotplug.c
index 46c58be2fdc4cdc84ea1f17c788b5a438d74b65d..b81a367b9f39bb056cc4127a674e8a3dd4d2567d 100644
@@ -29,6 +29,7 @@
 #include <linux/suspend.h>
 #include <linux/mm_inline.h>
 #include <linux/firmware-map.h>
+#include <linux/stop_machine.h>
 
 #include <asm/tlbflush.h>
 
@@ -123,6 +124,7 @@ void __ref put_page_bootmem(struct page *page)
                mutex_lock(&ppb_lock);
                __free_pages_bootmem(page, 0);
                mutex_unlock(&ppb_lock);
+               totalram_pages++;
        }
 
 }
@@ -216,7 +218,7 @@ void register_page_bootmem_info_node(struct pglist_data *pgdat)
        }
 
        pfn = pgdat->node_start_pfn;
-       end_pfn = pfn + pgdat->node_spanned_pages;
+       end_pfn = pgdat_end_pfn(pgdat);
 
        /* register_section info */
        for (; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
@@ -280,6 +282,17 @@ static void fix_zone_id(struct zone *zone, unsigned long start_pfn,
                set_page_links(pfn_to_page(pfn), zid, nid, pfn);
 }
 
+/* Can fail with -ENOMEM from allocating a wait table with vmalloc() or
+ * alloc_bootmem_node_nopanic() */
+static int __ref ensure_zone_is_initialized(struct zone *zone,
+                       unsigned long start_pfn, unsigned long num_pages)
+{
+       if (!zone_is_initialized(zone))
+               return init_currently_empty_zone(zone, start_pfn, num_pages,
+                                                MEMMAP_HOTPLUG);
+       return 0;
+}
+
 static int __meminit move_pfn_range_left(struct zone *z1, struct zone *z2,
                unsigned long start_pfn, unsigned long end_pfn)
 {
@@ -287,17 +300,14 @@ static int __meminit move_pfn_range_left(struct zone *z1, struct zone *z2,
        unsigned long flags;
        unsigned long z1_start_pfn;
 
-       if (!z1->wait_table) {
-               ret = init_currently_empty_zone(z1, start_pfn,
-                       end_pfn - start_pfn, MEMMAP_HOTPLUG);
-               if (ret)
-                       return ret;
-       }
+       ret = ensure_zone_is_initialized(z1, start_pfn, end_pfn - start_pfn);
+       if (ret)
+               return ret;
 
        pgdat_resize_lock(z1->zone_pgdat, &flags);
 
        /* can't move pfns which are higher than @z2 */
-       if (end_pfn > z2->zone_start_pfn + z2->spanned_pages)
+       if (end_pfn > zone_end_pfn(z2))
                goto out_fail;
        /* the part being moved out must be at the left edge of @z2 */
        if (start_pfn > z2->zone_start_pfn)
@@ -313,7 +323,7 @@ static int __meminit move_pfn_range_left(struct zone *z1, struct zone *z2,
                z1_start_pfn = start_pfn;
 
        resize_zone(z1, z1_start_pfn, end_pfn);
-       resize_zone(z2, end_pfn, z2->zone_start_pfn + z2->spanned_pages);
+       resize_zone(z2, end_pfn, zone_end_pfn(z2));
 
        pgdat_resize_unlock(z1->zone_pgdat, &flags);
 
@@ -332,12 +342,9 @@ static int __meminit move_pfn_range_right(struct zone *z1, struct zone *z2,
        unsigned long flags;
        unsigned long z2_end_pfn;
 
-       if (!z2->wait_table) {
-               ret = init_currently_empty_zone(z2, start_pfn,
-                       end_pfn - start_pfn, MEMMAP_HOTPLUG);
-               if (ret)
-                       return ret;
-       }
+       ret = ensure_zone_is_initialized(z2, start_pfn, end_pfn - start_pfn);
+       if (ret)
+               return ret;
 
        pgdat_resize_lock(z1->zone_pgdat, &flags);
 
@@ -345,15 +352,15 @@ static int __meminit move_pfn_range_right(struct zone *z1, struct zone *z2,
        if (z1->zone_start_pfn > start_pfn)
                goto out_fail;
        /* the part being moved out must be at the right edge of @z1 */
-       if (z1->zone_start_pfn + z1->spanned_pages >  end_pfn)
+       if (zone_end_pfn(z1) >  end_pfn)
                goto out_fail;
        /* the range must be included in or overlap @z1 */
-       if (start_pfn >= z1->zone_start_pfn + z1->spanned_pages)
+       if (start_pfn >= zone_end_pfn(z1))
                goto out_fail;
 
        /* use end_pfn for z2's end_pfn if z2 is empty */
        if (z2->spanned_pages)
-               z2_end_pfn = z2->zone_start_pfn + z2->spanned_pages;
+               z2_end_pfn = zone_end_pfn(z2);
        else
                z2_end_pfn = end_pfn;
 
@@ -390,16 +397,13 @@ static int __meminit __add_zone(struct zone *zone, unsigned long phys_start_pfn)
        int nid = pgdat->node_id;
        int zone_type;
        unsigned long flags;
+       int ret;
 
        zone_type = zone - pgdat->node_zones;
-       if (!zone->wait_table) {
-               int ret;
+       ret = ensure_zone_is_initialized(zone, phys_start_pfn, nr_pages);
+       if (ret)
+               return ret;
 
-               ret = init_currently_empty_zone(zone, phys_start_pfn,
-                                               nr_pages, MEMMAP_HOTPLUG);
-               if (ret)
-                       return ret;
-       }
        pgdat_resize_lock(zone->zone_pgdat, &flags);
        grow_zone_span(zone, phys_start_pfn, phys_start_pfn + nr_pages);
        grow_pgdat_span(zone->zone_pgdat, phys_start_pfn,
@@ -432,8 +436,211 @@ static int __meminit __add_section(int nid, struct zone *zone,
        return register_new_memory(nid, __pfn_to_section(phys_start_pfn));
 }
 
+/* find the smallest valid pfn in the range [start_pfn, end_pfn) */
+static unsigned long find_smallest_section_pfn(int nid, struct zone *zone,
+                                               unsigned long start_pfn,
+                                               unsigned long end_pfn)
+{
+       struct mem_section *ms;
+
+       for (; start_pfn < end_pfn; start_pfn += PAGES_PER_SECTION) {
+               ms = __pfn_to_section(start_pfn);
+
+               if (unlikely(!valid_section(ms)))
+                       continue;
+
+               if (unlikely(pfn_to_nid(start_pfn) != nid))
+                       continue;
+
+               if (zone && zone != page_zone(pfn_to_page(start_pfn)))
+                       continue;
+
+               return start_pfn;
+       }
+
+       return 0;
+}
+
+/* find the biggest valid pfn in the range [start_pfn, end_pfn). */
+static unsigned long find_biggest_section_pfn(int nid, struct zone *zone,
+                                              unsigned long start_pfn,
+                                              unsigned long end_pfn)
+{
+       struct mem_section *ms;
+       unsigned long pfn;
+
+       /* pfn is the last pfn in the range [start_pfn, end_pfn). */
+       pfn = end_pfn - 1;
+       for (; pfn >= start_pfn; pfn -= PAGES_PER_SECTION) {
+               ms = __pfn_to_section(pfn);
+
+               if (unlikely(!valid_section(ms)))
+                       continue;
+
+               if (unlikely(pfn_to_nid(pfn) != nid))
+                       continue;
+
+               if (zone && zone != page_zone(pfn_to_page(pfn)))
+                       continue;
+
+               return pfn;
+       }
+
+       return 0;
+}
+
+static void shrink_zone_span(struct zone *zone, unsigned long start_pfn,
+                            unsigned long end_pfn)
+{
+       unsigned long zone_start_pfn =  zone->zone_start_pfn;
+       unsigned long zone_end_pfn = zone->zone_start_pfn + zone->spanned_pages;
+       unsigned long pfn;
+       struct mem_section *ms;
+       int nid = zone_to_nid(zone);
+
+       zone_span_writelock(zone);
+       if (zone_start_pfn == start_pfn) {
+               /*
+                * If the removed section is the smallest section in the
+                * zone, we need to shrink zone->zone_start_pfn and
+                * zone->spanned_pages. In this case, find the next
+                * smallest valid mem_section and shrink the zone to it.
+                */
+               pfn = find_smallest_section_pfn(nid, zone, end_pfn,
+                                               zone_end_pfn);
+               if (pfn) {
+                       zone->zone_start_pfn = pfn;
+                       zone->spanned_pages = zone_end_pfn - pfn;
+               }
+       } else if (zone_end_pfn == end_pfn) {
+               /*
+                * If the removed section is the biggest section in the
+                * zone, we only need to shrink zone->spanned_pages. In
+                * this case, find the next biggest valid mem_section and
+                * shrink the zone to it.
+                */
+               pfn = find_biggest_section_pfn(nid, zone, zone_start_pfn,
+                                              start_pfn);
+               if (pfn)
+                       zone->spanned_pages = pfn - zone_start_pfn + 1;
+       }
+
+       /*
+        * If the removed section is neither the smallest nor the biggest
+        * mem_section in the zone, it only creates a hole, so the zone's
+        * span need not change. However, the zone may now contain nothing
+        * but holes, so check whether any valid section is left.
+        */
+       pfn = zone_start_pfn;
+       for (; pfn < zone_end_pfn; pfn += PAGES_PER_SECTION) {
+               ms = __pfn_to_section(pfn);
+
+               if (unlikely(!valid_section(ms)))
+                       continue;
+
+               if (page_zone(pfn_to_page(pfn)) != zone)
+                       continue;
+
+               /* Skip the section that is being removed */
+               if (start_pfn == pfn)
+                       continue;
+
+               /* If we find a valid section, we have nothing to do */
+               zone_span_writeunlock(zone);
+               return;
+       }
+
+       /* The zone has no valid section */
+       zone->zone_start_pfn = 0;
+       zone->spanned_pages = 0;
+       zone_span_writeunlock(zone);
+}
+
+static void shrink_pgdat_span(struct pglist_data *pgdat,
+                             unsigned long start_pfn, unsigned long end_pfn)
+{
+       unsigned long pgdat_start_pfn =  pgdat->node_start_pfn;
+       unsigned long pgdat_end_pfn =
+               pgdat->node_start_pfn + pgdat->node_spanned_pages;
+       unsigned long pfn;
+       struct mem_section *ms;
+       int nid = pgdat->node_id;
+
+       if (pgdat_start_pfn == start_pfn) {
+               /*
+                * If the removed section is the smallest section in the
+                * pgdat, we need to shrink pgdat->node_start_pfn and
+                * pgdat->node_spanned_pages. In this case, find the next
+                * smallest valid mem_section and shrink the pgdat to it.
+                */
+               pfn = find_smallest_section_pfn(nid, NULL, end_pfn,
+                                               pgdat_end_pfn);
+               if (pfn) {
+                       pgdat->node_start_pfn = pfn;
+                       pgdat->node_spanned_pages = pgdat_end_pfn - pfn;
+               }
+       } else if (pgdat_end_pfn == end_pfn) {
+               /*
+                * If the removed section is the biggest section in the
+                * pgdat, we only need to shrink pgdat->node_spanned_pages.
+                * In this case, find the next biggest valid mem_section
+                * and shrink the pgdat to it.
+                */
+               pfn = find_biggest_section_pfn(nid, NULL, pgdat_start_pfn,
+                                              start_pfn);
+               if (pfn)
+                       pgdat->node_spanned_pages = pfn - pgdat_start_pfn + 1;
+       }
+
+       /*
+        * If the removed section is neither the smallest nor the biggest
+        * mem_section in the pgdat, it only creates a hole, so the
+        * pgdat's span need not change.
+        * However, the pgdat may now contain nothing but holes, so check
+        * whether any valid section is left.
+        */
+       pfn = pgdat_start_pfn;
+       for (; pfn < pgdat_end_pfn; pfn += PAGES_PER_SECTION) {
+               ms = __pfn_to_section(pfn);
+
+               if (unlikely(!valid_section(ms)))
+                       continue;
+
+               if (pfn_to_nid(pfn) != nid)
+                       continue;
+
+               /* Skip the section that is being removed */
+               if (start_pfn == pfn)
+                       continue;
+
+               /* If we find a valid section, we have nothing to do */
+               return;
+       }
+
+       /* The pgdat has no valid section */
+       pgdat->node_start_pfn = 0;
+       pgdat->node_spanned_pages = 0;
+}
+
+static void __remove_zone(struct zone *zone, unsigned long start_pfn)
+{
+       struct pglist_data *pgdat = zone->zone_pgdat;
+       int nr_pages = PAGES_PER_SECTION;
+       int zone_type;
+       unsigned long flags;
+
+       zone_type = zone - pgdat->node_zones;
+
+       pgdat_resize_lock(zone->zone_pgdat, &flags);
+       shrink_zone_span(zone, start_pfn, start_pfn + nr_pages);
+       shrink_pgdat_span(pgdat, start_pfn, start_pfn + nr_pages);
+       pgdat_resize_unlock(zone->zone_pgdat, &flags);
+}
+
 static int __remove_section(struct zone *zone, struct mem_section *ms)
 {
+       unsigned long start_pfn;
+       int scn_nr;
        int ret = -EINVAL;
 
        if (!valid_section(ms))
@@ -443,6 +650,10 @@ static int __remove_section(struct zone *zone, struct mem_section *ms)
        if (ret)
                return ret;
 
+       scn_nr = __section_nr(ms);
+       start_pfn = section_nr_to_pfn(scn_nr);
+       __remove_zone(zone, start_pfn);
+
        sparse_remove_one_section(zone, ms);
        return 0;
 }
@@ -809,11 +1020,14 @@ static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start)
        unsigned long zholes_size[MAX_NR_ZONES] = {0};
        unsigned long start_pfn = start >> PAGE_SHIFT;
 
-       pgdat = arch_alloc_nodedata(nid);
-       if (!pgdat)
-               return NULL;
+       pgdat = NODE_DATA(nid);
+       if (!pgdat) {
+               pgdat = arch_alloc_nodedata(nid);
+               if (!pgdat)
+                       return NULL;
 
-       arch_refresh_nodedata(nid, pgdat);
+               arch_refresh_nodedata(nid, pgdat);
+       }
 
        /* we can use NODE_DATA(nid) from here */
 
@@ -866,7 +1080,8 @@ out:
 int __ref add_memory(int nid, u64 start, u64 size)
 {
        pg_data_t *pgdat = NULL;
-       int new_pgdat = 0;
+       bool new_pgdat;
+       bool new_node;
        struct resource *res;
        int ret;
 
@@ -877,12 +1092,16 @@ int __ref add_memory(int nid, u64 start, u64 size)
        if (!res)
                goto out;
 
-       if (!node_online(nid)) {
+       {       /* Stupid hack to suppress address-never-null warning */
+               void *p = NODE_DATA(nid);
+               new_pgdat = !p;
+       }
+       new_node = !node_online(nid);
+       if (new_node) {
                pgdat = hotadd_new_pgdat(nid, start);
                ret = -ENOMEM;
                if (!pgdat)
                        goto error;
-               new_pgdat = 1;
        }
 
        /* call arch's memory hotadd */
@@ -894,7 +1113,7 @@ int __ref add_memory(int nid, u64 start, u64 size)
        /* we online node here. we can't roll back from here. */
        node_set_online(nid);
 
-       if (new_pgdat) {
+       if (new_node) {
                ret = register_one_node(nid);
                /*
                 * If sysfs file of new node can't create, cpu on the node
@@ -1069,8 +1288,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
                 * migrate_pages returns # of failed pages.
                 */
                ret = migrate_pages(&source, alloc_migrate_target, 0,
-                                                       true, MIGRATE_SYNC,
-                                                       MR_MEMORY_HOTPLUG);
+                                       MIGRATE_SYNC, MR_MEMORY_HOTPLUG);
                if (ret)
                        putback_lru_pages(&source);
        }
@@ -1472,7 +1690,111 @@ static int is_memblock_offlined_cb(struct memory_block *mem, void *arg)
        return ret;
 }
 
-int __ref remove_memory(u64 start, u64 size)
+static int check_cpu_on_node(void *data)
+{
+       struct pglist_data *pgdat = data;
+       int cpu;
+
+       for_each_present_cpu(cpu) {
+               if (cpu_to_node(cpu) == pgdat->node_id)
+                       /*
+                        * A cpu on this node has not been removed, so we
+                        * cannot offline this node.
+                        */
+                       return -EBUSY;
+       }
+
+       return 0;
+}
+
+static void unmap_cpu_on_node(void *data)
+{
+#ifdef CONFIG_ACPI_NUMA
+       struct pglist_data *pgdat = data;
+       int cpu;
+
+       for_each_possible_cpu(cpu)
+               if (cpu_to_node(cpu) == pgdat->node_id)
+                       numa_clear_node(cpu);
+#endif
+}
+
+static int check_and_unmap_cpu_on_node(void *data)
+{
+       int ret = check_cpu_on_node(data);
+
+       if (ret)
+               return ret;
+
+       /*
+        * The node is being offlined when we get here, so it is safe
+        * to clear cpu_to_node() now.
+        */
+
+       unmap_cpu_on_node(data);
+       return 0;
+}
+
+/* offline the node if all memory sections of this node are removed */
+void try_offline_node(int nid)
+{
+       pg_data_t *pgdat = NODE_DATA(nid);
+       unsigned long start_pfn = pgdat->node_start_pfn;
+       unsigned long end_pfn = start_pfn + pgdat->node_spanned_pages;
+       unsigned long pfn;
+       struct page *pgdat_page = virt_to_page(pgdat);
+       int i;
+
+       for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
+               unsigned long section_nr = pfn_to_section_nr(pfn);
+
+               if (!present_section_nr(section_nr))
+                       continue;
+
+               if (pfn_to_nid(pfn) != nid)
+                       continue;
+
+               /*
+                * Some memory sections of this node have not been removed,
+                * so we cannot offline the node now.
+                */
+               return;
+       }
+
+       if (stop_machine(check_and_unmap_cpu_on_node, pgdat, NULL))
+               return;
+
+       /*
+        * All memory and cpus of this node have been removed, so we can
+        * offline the node now.
+        */
+       node_set_offline(nid);
+       unregister_one_node(nid);
+
+       if (!PageSlab(pgdat_page) && !PageCompound(pgdat_page))
+               /* node data is allocated from boot memory */
+               return;
+
+       /* free the wait table in each zone */
+       for (i = 0; i < MAX_NR_ZONES; i++) {
+               struct zone *zone = pgdat->node_zones + i;
+
+               if (zone->wait_table)
+                       vfree(zone->wait_table);
+       }
+
+       /*
+        * Since there is no way to guarantee that the address of pgdat/zone
+        * is not on the stack of any kernel thread or used by other kernel
+        * objects without reference counting or another synchronizing
+        * method, do not reset node_data and free pgdat here. Just reset
+        * it to 0 and reuse the memory when the node is onlined again.
+        */
+       memset(pgdat, 0, sizeof(*pgdat));
+}
+EXPORT_SYMBOL(try_offline_node);
+
+int __ref remove_memory(int nid, u64 start, u64 size)
 {
        unsigned long start_pfn, end_pfn;
        int ret = 0;
@@ -1527,6 +1849,8 @@ repeat:
 
        arch_remove_memory(start, size);
 
+       try_offline_node(nid);
+
        unlock_memory_hotplug();
 
        return 0;
@@ -1536,7 +1860,7 @@ int offline_pages(unsigned long start_pfn, unsigned long nr_pages)
 {
        return -EINVAL;
 }
-int remove_memory(u64 start, u64 size)
+int remove_memory(int nid, u64 start, u64 size)
 {
        return -EINVAL;
 }