From: Dave Hansen <haveblue@us.ibm.com>
This patch allows the zone's freelist bitmaps to be resized.  Only growing
is supported for now; shrinking will come later.

Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
---

 memhotplug-dave/include/linux/mmzone.h |    7 +
 memhotplug-dave/mm/page_alloc.c        |  133 +++++++++++++++++++++++++++++++++
 2 files changed, 140 insertions(+)

diff -puN mm/page_alloc.c~J-free_areas-resize mm/page_alloc.c
--- memhotplug/mm/page_alloc.c~J-free_areas-resize	2004-09-27 10:32:32.000000000 -0700
+++ memhotplug-dave/mm/page_alloc.c	2004-09-27 10:32:32.000000000 -0700
@@ -1570,9 +1570,139 @@ void zone_init_free_lists(struct pglist_
 		bitmap_size = pages_to_bitmap_size(order, size);
 		zone->free_area[order].map =
 		  (unsigned long *) alloc_bootmem_node(pgdat, bitmap_size);
+		zone->free_area[order].alloc_type = FROM_BOOTMEM;
+		zone->free_area[order].capacity_pages = bitmap_size;
 	}
 }
 
+/*
+ * Do not call alloc_zone_bitmap()/free_zone_bitmap() holding zone->lock.
+ */
+static void alloc_zone_bitmap(struct free_area *fa, unsigned long size_bytes)
+{
+	struct page *new_map_page;
+	/*
+	 * It's possible that this doesn't *have* to be a GFP_ATOMIC
+	 * allocation, but we're on the way to get new memory so we
+	 * should go into the reserve to get it.  It will get better
+	 * soon.
+	 */
+	fa->map = kmalloc(size_bytes, GFP_ATOMIC);
+	fa->alloc_type = FROM_KMALLOC;
+
+	if (fa->map)
+		return;
+
+	/*
+	 * Might be too big for kmalloc, so fall back to whole pages.
+	 * If that fails too, fa->map is left NULL for the caller to check.
+	 * Could use vmalloc, but these are quite often accessed
+	 * in a sparse manner, and it could have bad TLB effects
+	 */
+	new_map_page = alloc_pages(GFP_ATOMIC, get_order(size_bytes));
+	fa->alloc_type = FROM_ALLOC_PAGES;
+
+	if (new_map_page)
+		fa->map = page_address(new_map_page);
+}
+
+static void free_zone_bitmap(unsigned long *old_map, unsigned long size_bytes,
+			      int old_alloc_type)
+{
+	switch (old_alloc_type) {
+		case FROM_BOOTMEM:
+			/*
+			 * Freeing bootmem at runtime is too much of a pain,
+			 * so the old map is leaked.  Fix later if it matters.
+			 */
+			break;
+		case FROM_KMALLOC:
+			kfree(old_map);
+			break;
+		case FROM_ALLOC_PAGES:
+			free_pages((unsigned long)old_map, get_order(size_bytes));
+			break;
+	}
+}
+
+/*
+ * Grow one order's bitmap to cover new_nr_pages.  zone->resize_sem must
+ * be held to keep the sizes from changing under a concurrent add/remove.
+ */
+static int zone_grow_one_free_list(struct zone *zone, unsigned long order,
+				   unsigned long new_nr_pages)
+{
+	unsigned long *old_map = zone->free_area[order].map;
+	struct free_area *fa = &zone->free_area[order];
+	unsigned long old_map_size_bytes, new_map_size_bytes;
+	unsigned long freed_bytes;
+	unsigned long flags;
+
+	new_map_size_bytes = pages_to_bitmap_size(order, new_nr_pages);
+	old_map_size_bytes = pages_to_bitmap_size(order, zone->spanned_pages);
+
+	/* tried to grow a currently empty zone - not supported yet */
+	if (!old_map || !old_map_size_bytes)
+		return -EINVAL;
+
+	/*
+	 * If a later order's allocation fails, the maps already grown stay
+	 * bigger than zone->spanned_pages needs and are reused next time;
+	 * this also allows the maps to be oversized on purpose.  Because of
+	 * that, an old map must be freed using the size it was allocated
+	 * with (its capacity), not the spanned_pages size: get_order() of
+	 * the two can differ and free_pages() needs the allocation order.
+	 */
+	if (fa->capacity_pages < new_nr_pages) {
+		struct free_area new_fa;
+		int old_alloc_type = fa->alloc_type;
+
+		alloc_zone_bitmap(&new_fa, new_map_size_bytes);
+		if (!new_fa.map)
+			return -ENOMEM;
+
+		freed_bytes = pages_to_bitmap_size(order, fa->capacity_pages);
+		spin_lock_irqsave(&zone->lock, flags);
+		memcpy(new_fa.map, old_map, old_map_size_bytes);
+		fa->capacity_pages = new_nr_pages;
+		fa->map = new_fa.map;
+		fa->alloc_type = new_fa.alloc_type;
+		memset(&fa->map[old_map_size_bytes/sizeof(unsigned long)],
+			0, new_map_size_bytes - old_map_size_bytes);
+		spin_unlock_irqrestore(&zone->lock, flags);
+
+		free_zone_bitmap(old_map, freed_bytes, old_alloc_type);
+	}
+
+	return 0;
+}
+
+/*
+ * new_nr_pages is the _total_ number of pages the zone will span, not the
+ * number being added.  The caller must hold zone->resize_sem.
+ */
+int zone_grow_free_lists(struct zone *zone, unsigned long new_nr_pages)
+{
+	unsigned long order;
+	int err = 0;
+
+	if (new_nr_pages < zone->spanned_pages) {
+		printk("%s(): can't shrink zone\n", __func__);
+		BUG();
+	}
+
+	for (order=0; order < MAX_ORDER-1; order++) {
+		err = zone_grow_one_free_list(zone, order, new_nr_pages);
+		if (err)
+			goto out;
+	}
+
+	zone->free_area[order].map = NULL;
+	zone->spanned_pages = new_nr_pages;
+out:
+	return err;
+}
+
 #ifndef __HAVE_ARCH_MEMMAP_INIT
 #define memmap_init(size, nid, zone, start_pfn) \
 	memmap_init_zone((size), (nid), (zone), (start_pfn))
@@ -1614,6 +1744,7 @@ static void __init free_area_init_core(s
 		zone->name = zone_names[j];
 		spin_lock_init(&zone->lock);
 		spin_lock_init(&zone->lru_lock);
+		init_MUTEX(&zone->resize_sem);
 		zone->zone_pgdat = pgdat;
 		zone->free_pages = 0;
 
@@ -1664,6 +1795,8 @@ static void __init free_area_init_core(s
 		/*
 		 * The per-page waitqueue mechanism uses hashed waitqueues
 		 * per zone.
+		 *
+		 * this will need to be resized at hot-add time eventually
 		 */
 		zone->wait_table_size = wait_table_size(size);
 		zone->wait_table_bits =
diff -puN include/linux/mmzone.h~J-free_areas-resize include/linux/mmzone.h
--- memhotplug/include/linux/mmzone.h~J-free_areas-resize	2004-09-27 10:32:32.000000000 -0700
+++ memhotplug-dave/include/linux/mmzone.h	2004-09-27 10:32:32.000000000 -0700
@@ -13,6 +13,7 @@
 #include <linux/numa.h>
 #include <linux/nonlinear.h>
 #include <asm/atomic.h>
+#include <asm/semaphore.h>
 
 /* Free memory management - zoned buddy allocator.  */
 #ifndef CONFIG_FORCE_MAX_ZONEORDER
@@ -24,6 +25,11 @@
 struct free_area {
 	struct list_head	free_list;
 	unsigned long		*map;
+	unsigned long		capacity_pages;	/* pages ->map can cover */
+	enum { FROM_BOOTMEM,
+	       FROM_KMALLOC,
+	       FROM_ALLOC_PAGES
+	} alloc_type;			/* allocator ->map came from */
 };
 
 struct pglist_data;
@@ -215,6 +221,7 @@ struct zone {
 	char			*name;
 	unsigned long		spanned_pages;	/* total size, including holes */
 	unsigned long		present_pages;	/* amount of memory (excluding holes) */
+	struct semaphore	resize_sem;
 } ____cacheline_maxaligned_in_smp;
 
 
_