

Here's a consolidated patch that replaces the three
x86-64 patches currently in the patchset and adds
the ones that are missing.  This restores x86-64
functionality in terms of sparsemem and memory hot-plug
(well, hot-add).

This applies cleanly after first reverting the
H-* patches.  I haven't yet looked at the complete
impact on DISCONTIGMEM, but hot-add does work for
logical and physical operations.  Further, this
patch only affects x86-64-specific parts of the tree.
Please apply.

 include/asm-x86_64/bitops.h |    2 
 include/asm-x86_64/io.h     |    2 
 include/asm-x86_64/mman.h   |    1 
 include/asm-x86_64/page.h   |    4 

Signed-off-by: Matt Tolentino <matthew.e.tolentino@intel.com>

Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
---

 memhotplug-dave/arch/x86_64/Kconfig         |   14 +-
 memhotplug-dave/arch/x86_64/kernel/setup.c  |    7 +
 memhotplug-dave/arch/x86_64/mm/init.c       |  196 +++++++++++++++++++++++-----
 memhotplug-dave/include/asm-x86_64/bitops.h |    2 
 memhotplug-dave/include/asm-x86_64/io.h     |    2 
 memhotplug-dave/include/asm-x86_64/mman.h   |    1 
 memhotplug-dave/include/asm-x86_64/mmzone.h |   20 ++
 memhotplug-dave/include/asm-x86_64/page.h   |    4 
 8 files changed, 205 insertions(+), 41 deletions(-)

diff -puN arch/x86_64/Kconfig~H-sparsemem-x86_64 arch/x86_64/Kconfig
--- memhotplug/arch/x86_64/Kconfig~H-sparsemem-x86_64	2005-03-03 09:05:13.000000000 -0800
+++ memhotplug-dave/arch/x86_64/Kconfig	2005-03-03 09:05:13.000000000 -0800
@@ -282,15 +282,19 @@ config NUMA_EMU
 	  into virtual nodes when booted with "numa=fake=N", where N is the
 	  number of nodes. This is only useful for debugging.
 
-config DISCONTIGMEM
-       bool
-       depends on NUMA
-       default y
-
 config NUMA
        bool
        default n
 
+config ARCH_SPARSEMEM_DEFAULT
+	bool
+	depends on NUMA
+
+config ARCH_DISCONTIGMEM_DISABLE
+	depends on !NUMA
+
+source "mm/Kconfig"
+
 config HAVE_DEC_LOCK
 	bool
 	depends on SMP
diff -puN arch/x86_64/kernel/setup.c~H-sparsemem-x86_64 arch/x86_64/kernel/setup.c
--- memhotplug/arch/x86_64/kernel/setup.c~H-sparsemem-x86_64	2005-03-03 09:05:13.000000000 -0800
+++ memhotplug-dave/arch/x86_64/kernel/setup.c	2005-03-03 09:05:13.000000000 -0800
@@ -40,6 +40,7 @@
 #include <linux/acpi.h>
 #include <linux/kallsyms.h>
 #include <linux/edd.h>
+#include <linux/mmzone.h>
 #include <asm/mtrr.h>
 #include <asm/uaccess.h>
 #include <asm/system.h>
@@ -539,6 +540,10 @@ void __init setup_arch(char **cmdline_p)
 	 */
 	end_pfn = e820_end_of_ram();
 
+#ifdef CONFIG_SPARSEMEM
+	memory_present(0, 0, end_pfn);
+#endif
+
 	check_efer();
 
 	init_memory_mapping(0, (end_pfn_map << PAGE_SHIFT));
@@ -622,6 +627,8 @@ void __init setup_arch(char **cmdline_p)
 		}
 	}
 #endif
+
+	sparse_init();
 	paging_init();
 
 	check_ioapic();
diff -puN arch/x86_64/mm/init.c~H-sparsemem-x86_64 arch/x86_64/mm/init.c
--- memhotplug/arch/x86_64/mm/init.c~H-sparsemem-x86_64	2005-03-03 09:05:13.000000000 -0800
+++ memhotplug-dave/arch/x86_64/mm/init.c	2005-03-03 09:05:13.000000000 -0800
@@ -22,6 +22,8 @@
 #include <linux/pagemap.h>
 #include <linux/bootmem.h>
 #include <linux/proc_fs.h>
+#include <linux/module.h>
+#include <linux/memory_hotplug.h>
 
 #include <asm/processor.h>
 #include <asm/system.h>
@@ -179,13 +181,19 @@ static  struct temp_map { 
 	{}
 }; 
 
-static __init void *alloc_low_page(int *index, unsigned long *phys) 
+static __devinit void *alloc_low_page(int *index, unsigned long *phys)
 { 
 	struct temp_map *ti;
 	int i; 
 	unsigned long pfn = table_end++, paddr; 
 	void *adr;
 
+	if (after_bootmem) {
+		adr = (void *)get_zeroed_page(GFP_ATOMIC);
+		*phys = __pa(adr);
+		return adr;
+	}
+
 	if (pfn >= end_pfn) 
 		panic("alloc_low_page: ran out of memory"); 
 	for (i = 0; temp_mappings[i].allocated; i++) {
@@ -198,55 +206,95 @@ static __init void *alloc_low_page(int *
 	ti->allocated = 1; 
 	__flush_tlb(); 	       
 	adr = ti->address + ((pfn << PAGE_SHIFT) & ~PMD_MASK); 
+	memset(adr, 0, PAGE_SIZE);
 	*index = i; 
 	*phys  = pfn * PAGE_SIZE;  
 	return adr; 
 } 
 
-static __init void unmap_low_page(int i)
+static __devinit void unmap_low_page(int i)
 { 
-	struct temp_map *ti = &temp_mappings[i];
+	struct temp_map *ti;
+
+	if (after_bootmem)
+		return;
+	ti = &temp_mappings[i];
 	set_pmd(ti->pmd, __pmd(0));
 	ti->allocated = 0; 
 } 
 
-static void __init phys_pud_init(pud_t *pud, unsigned long address, unsigned long end)
+
+static void __devinit
+phys_pmd_init(pmd_t *pmd, unsigned long address, unsigned long end)
+{
+	int i;
+
+	printk("%s: pmd: 0x%p, address: 0x%lx end: 0x%lx\n",
+		__func__, pmd, address, end);
+
+	for (i = 0; i < PTRS_PER_PMD; pmd++, i++, address += PMD_SIZE) {
+		unsigned long entry;
+
+		if (address > end) {
+			for (; i < PTRS_PER_PMD; i++, pmd++)
+				set_pmd(pmd, __pmd(0));
+			break;
+		}
+		entry = _PAGE_NX|_PAGE_PSE | _KERNPG_TABLE | _PAGE_GLOBAL | address;
+		entry &= __supported_pte_mask;
+		set_pmd(pmd, __pmd(entry));
+	}
+}
+
+
+static void __devinit
+phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end)
+{
+	pmd_t *pmd = pmd_offset(pud, (unsigned long)__va(address));
+
+	printk("%s: addr: 0x%lx end: 0x%lx pmd: 0x%p\n",
+		__func__, address, end, pmd);
+
+	if (pmd_none(*pmd)) {
+		spin_lock(&init_mm.page_table_lock);
+		phys_pmd_init(pmd, address, end);
+		spin_unlock(&init_mm.page_table_lock);
+		__flush_tlb_all();
+	}
+}
+
+
+
+static void __devinit phys_pud_init(pud_t *pud, unsigned long address, unsigned long end)
 { 
-	long i, j; 
+	long i = pud_index(address);
 
-	i = pud_index(address);
 	pud = pud + i;
+
+	if (after_bootmem && pud_val(*pud)) {
+		phys_pmd_update(pud, address, end);
+		return;
+	}
+
 	for (; i < PTRS_PER_PUD; pud++, i++) {
 		int map; 
 		unsigned long paddr, pmd_phys;
 		pmd_t *pmd;
 
-		paddr = address + i*PUD_SIZE;
-		if (paddr >= end) { 
-			for (; i < PTRS_PER_PUD; i++, pud++) 
-				set_pud(pud, __pud(0)); 
+		paddr = (address & PGDIR_MASK) + i*PUD_SIZE;
+		if (paddr >= end)
 			break;
-		} 
 
-		if (!e820_mapped(paddr, paddr+PUD_SIZE, 0)) { 
+		if (!after_bootmem && !e820_mapped(paddr, paddr+PUD_SIZE, 0)) {
 			set_pud(pud, __pud(0)); 
 			continue;
 		} 
 
 		pmd = alloc_low_page(&map, &pmd_phys);
+		if (after_bootmem) spin_lock(&init_mm.page_table_lock);
 		set_pud(pud, __pud(pmd_phys | _KERNPG_TABLE));
-		for (j = 0; j < PTRS_PER_PMD; pmd++, j++, paddr += PMD_SIZE) {
-			unsigned long pe;
-
-			if (paddr >= end) { 
-				for (; j < PTRS_PER_PMD; j++, pmd++)
-					set_pmd(pmd,  __pmd(0)); 
-				break;
-		}
-			pe = _PAGE_NX|_PAGE_PSE | _KERNPG_TABLE | _PAGE_GLOBAL | paddr;
-			pe &= __supported_pte_mask;
-			set_pmd(pmd, __pmd(pe));
-		}
+		phys_pmd_init(pmd, paddr, end);
+		if (after_bootmem) spin_unlock(&init_mm.page_table_lock);
 		unmap_low_page(map);
 	}
 	__flush_tlb();
@@ -267,12 +315,16 @@ static void __init find_early_table_spac
 
 	table_start >>= PAGE_SHIFT;
 	table_end = table_start;
+
+	early_printk("kernel direct mapping tables upto %lx @ %lx-%lx\n", end,
+	       table_start<<PAGE_SHIFT,
+	       table_end<<PAGE_SHIFT);
 }
 
 /* Setup the direct mapping of the physical memory at PAGE_OFFSET.
    This runs before bootmem is initialized and gets pages directly from the 
    physical memory. To access them they are temporarily mapped. */
-void __init init_memory_mapping(unsigned long start, unsigned long end)
+void __devinit init_memory_mapping(unsigned long start, unsigned long end)
 { 
 	unsigned long next; 
 
@@ -284,7 +336,8 @@ void __init init_memory_mapping(unsigned
 	 * mapped.  Unfortunately this is done currently before the nodes are 
 	 * discovered.
 	 */
-	find_early_table_space(end);
+	if (!after_bootmem)
+		find_early_table_space(end);
 
 	start = (unsigned long)__va(start);
 	end = (unsigned long)__va(end);
@@ -292,20 +345,26 @@ void __init init_memory_mapping(unsigned
 	for (; start < end; start = next) {
 		int map;
 		unsigned long pud_phys; 
-		pud_t *pud = alloc_low_page(&map, &pud_phys);
+		pgd_t *pgd = pgd_offset_k(start);
+		pud_t *pud;
+
+		if (after_bootmem)
+			pud = pud_offset_k(pgd, __PAGE_OFFSET);
+		else
+			pud = alloc_low_page(&map, &pud_phys);
+
 		next = start + PGDIR_SIZE;
 		if (next > end) 
 			next = end; 
 		phys_pud_init(pud, __pa(start), __pa(next));
-		set_pgd(pgd_offset_k(start), mk_kernel_pgd(pud_phys));
+		if (!after_bootmem)
+			set_pgd(pgd_offset_k(start), mk_kernel_pgd(pud_phys));
 		unmap_low_page(map);   
 	} 
 
-	asm volatile("movq %%cr4,%0" : "=r" (mmu_cr4_features));
+	if (!after_bootmem)
+		asm volatile("movq %%cr4,%0" : "=r" (mmu_cr4_features));
 	__flush_tlb_all();
-	early_printk("kernel direct mapping tables upto %lx @ %lx-%lx\n", end, 
-	       table_start<<PAGE_SHIFT, 
-	       table_end<<PAGE_SHIFT);
 }
 
 extern struct x8664_pda cpu_pda[NR_CPUS];
@@ -395,6 +454,76 @@ static inline int page_is_ram (unsigned 
 	return 0;
 }
 
+/*
+ * Memory hotplug specific functions
+ */
+
+#ifdef CONFIG_MEMORY_HOTPLUG
+
+void online_page(struct page *page)
+{
+	ClearPageReserved(page);
+	set_page_count(page, 1);
+	__free_page(page);
+	totalram_pages++;
+	num_physpages++;
+}
+
+int add_memory(u64 start, u64 size, unsigned long attr)
+{
+	struct pglist_data *pgdat = &contig_page_data;
+	struct zone *zone = pgdat->node_zones + MAX_NR_ZONES - 2;
+	unsigned long start_pfn = start >> PAGE_SHIFT;
+	unsigned long nr_pages = size >> PAGE_SHIFT;
+	int ret;
+
+	ret = __add_pages(zone, start_pfn, nr_pages, attr);
+	if (ret)
+		goto error;
+
+	init_memory_mapping(start, (start + size - 1));
+
+	return ret;
+error:
+	printk("%s: Problem encountered in __add_pages!\n", __func__);
+	return ret;
+}
+EXPORT_SYMBOL(add_memory);
+
+int remove_memory(u64 start, u64 size, unsigned long attr)
+{
+	struct zone *zone;
+	unsigned long start_pfn, end_pfn, nr_pages;
+
+	printk("%s: start: 0x%llx size: 0x%llx attr: 0x%lx\n",
+		__func__, start, size, attr);
+
+	start_pfn = start >> PAGE_SHIFT;
+	nr_pages = size >> PAGE_SHIFT;
+	/* end_pfn is the last *valid* pfn */
+	end_pfn = start_pfn + nr_pages - 1;
+
+	zone = page_zone(pfn_to_page(start_pfn));
+
+	printk("%s: memory will be removed from the %s zone\n",
+		__func__, zone->name);
+	printk("%s: start_pfn: 0x%lx nr_pages: 0x%lx end_pfn: 0x%lx\n",
+		__func__, start_pfn, nr_pages, end_pfn);
+
+	if (zone != page_zone(pfn_to_page(end_pfn)))
+		goto overlap;
+
+	printk("%s: just before remove pages\n", __func__);
+
+	return __remove_pages(zone, start_pfn, nr_pages, attr);
+overlap:
+	printk("%s: memory range overlaps multiple zones?\n", __func__);
+	return -ENOSYS;
+}
+EXPORT_SYMBOL(remove_memory);
+
+#endif
+
 extern int swiotlb_force;
 
 static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel, kcore_modules,
@@ -432,8 +561,11 @@ void __init mem_init(void)
 	tmp = 0;
 	/* should count reserved pages here for all nodes */ 
 #else
+
+#ifdef CONFIG_FLATMEM
 	max_mapnr = end_pfn;
 	if (!mem_map) BUG();
+#endif
 
 	totalram_pages += free_all_bootmem();
 
diff -puN include/asm-x86_64/bitops.h~H-sparsemem-x86_64 include/asm-x86_64/bitops.h
--- memhotplug/include/asm-x86_64/bitops.h~H-sparsemem-x86_64	2005-03-03 09:05:13.000000000 -0800
+++ memhotplug-dave/include/asm-x86_64/bitops.h	2005-03-03 09:05:13.000000000 -0800
@@ -411,8 +411,6 @@ static __inline__ int ffs(int x)
 /* find last set bit */
 #define fls(x) generic_fls(x)
 
-#define ARCH_HAS_ATOMIC_UNSIGNED 1
-
 #endif /* __KERNEL__ */
 
 #endif /* _X86_64_BITOPS_H */
diff -puN include/asm-x86_64/io.h~H-sparsemem-x86_64 include/asm-x86_64/io.h
--- memhotplug/include/asm-x86_64/io.h~H-sparsemem-x86_64	2005-03-03 09:05:13.000000000 -0800
+++ memhotplug-dave/include/asm-x86_64/io.h	2005-03-03 09:05:13.000000000 -0800
@@ -132,7 +132,7 @@ extern inline void * phys_to_virt(unsign
 #include <asm/mmzone.h>
 #define page_to_phys(page)    ((dma_addr_t)page_to_pfn(page) << PAGE_SHIFT)
 #else
-#define page_to_phys(page)	((page - mem_map) << PAGE_SHIFT)
+#define page_to_phys(page)	(page_to_pfn(page) << PAGE_SHIFT)
 #endif
 
 #include <asm-generic/iomap.h>
diff -puN include/asm-x86_64/mman.h~H-sparsemem-x86_64 include/asm-x86_64/mman.h
--- memhotplug/include/asm-x86_64/mman.h~H-sparsemem-x86_64	2005-03-03 09:05:13.000000000 -0800
+++ memhotplug-dave/include/asm-x86_64/mman.h	2005-03-03 09:05:13.000000000 -0800
@@ -23,6 +23,7 @@
 #define MAP_NORESERVE	0x4000		/* don't check for reservations */
 #define MAP_POPULATE	0x8000		/* populate (prefault) pagetables */
 #define MAP_NONBLOCK	0x10000		/* do not block on IO */
+#define MAP_IMMOVABLE	0x20000
 
 #define MS_ASYNC	1		/* sync memory asynchronously */
 #define MS_INVALIDATE	2		/* invalidate the caches */
diff -puN include/asm-x86_64/mmzone.h~H-sparsemem-x86_64 include/asm-x86_64/mmzone.h
--- memhotplug/include/asm-x86_64/mmzone.h~H-sparsemem-x86_64	2005-03-03 09:05:13.000000000 -0800
+++ memhotplug-dave/include/asm-x86_64/mmzone.h	2005-03-03 09:05:13.000000000 -0800
@@ -60,4 +60,24 @@ static inline __attribute__((pure)) int 
 			({ u8 nid__ = pfn_to_nid(pfn); \
 			   nid__ != 0xff && (pfn) >= node_start_pfn(nid__) && (pfn) <= node_end_pfn(nid__); }))
 #endif
+
+#ifdef CONFIG_SPARSEMEM
+
+/* generic non-linear memory support:
+ *
+ * 1) we will not split memory into more chunks than will fit into the
+ *    flags field of the struct page
+ */
+
+  /*
+ * SECTION_SIZE_BITS		2^N: how big each section will be
+ * MAX_PHYSADDR_BITS		2^N: how much physical address space we have
+ * MAX_PHYSMEM_BITS		2^N: how much memory we can have in that space
+   */
+#define SECTION_SIZE_BITS	27 /* matt - 128MB is convenient right now */
+#define MAX_PHYSADDR_BITS	40
+#define MAX_PHYSMEM_BITS	40
+
+#endif /* CONFIG_SPARSEMEM */
+
 #endif
diff -puN include/asm-x86_64/page.h~H-sparsemem-x86_64 include/asm-x86_64/page.h
--- memhotplug/include/asm-x86_64/page.h~H-sparsemem-x86_64	2005-03-03 09:05:13.000000000 -0800
+++ memhotplug-dave/include/asm-x86_64/page.h	2005-03-03 09:05:13.000000000 -0800
@@ -122,7 +122,9 @@ extern __inline__ int get_order(unsigned
 	  __pa(v); })
 
 #define __va(x)			((void *)((unsigned long)(x)+PAGE_OFFSET))
-#ifndef CONFIG_DISCONTIGMEM
+#define __boot_va(x)	__va(x)
+#define __boot_pa(x)	__pa(x)
+#ifdef CONFIG_FLATMEM
 #define pfn_to_page(pfn)	(mem_map + (pfn))
 #define page_to_pfn(page)	((unsigned long)((page) - mem_map))
 #define pfn_valid(pfn)		((pfn) < max_mapnr)
_
