--- linux.orig/arch/alpha/mm/numa.c~C5.2-pgdat_size_lock 2005-07-28 13:50:20.000000000 -0700 +++ linux/arch/alpha/mm/numa.c 2005-07-28 13:50:20.000000000 -0700 @@ -371,6 +371,8 @@ show_free_areas(); printk("Free swap: %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10)); for_each_online_node(nid) { + unsigned long flags; + pgdat_resize_lock(NODE_DATA(nid), &flags); i = node_spanned_pages(nid); while (i-- > 0) { struct page *page = nid_page_nr(nid, i); @@ -384,6 +386,7 @@ else shared += page_count(page) - 1; } + pgdat_resize_unlock(NODE_DATA(nid), &flags); } printk("%ld pages of RAM\n",total); printk("%ld free pages\n",free); --- linux.orig/arch/i386/Kconfig~A4-sparsemem-extreme-as-default 2005-07-28 13:50:14.000000000 -0700 +++ linux/arch/i386/Kconfig 2005-07-28 14:29:39.000000000 -0700 @@ -753,6 +753,7 @@ depends on SMP && HIGHMEM64G && (X86_NUMAQ || X86_GENERICARCH || (X86_SUMMIT && ACPI)) default n if X86_PC default y if (X86_NUMAQ || X86_SUMMIT) + select SPARSEMEM_STATIC # Need comments to help the hapless user trying to turn on NUMA support comment "NUMA (NUMA-Q) requires SMP, 64GB highmem support" @@ -785,13 +786,25 @@ def_bool y depends on NUMA -config ARCH_DISCONTIGMEM_DEFAULT +config SIMULATED_MEM_HOTPLUG + bool "Simulate memory hotplug on non-hotplug hardware" + depends on EXPERIMENTAL && HIGHMEM + +config X86_SPARSEMEM_DEBUG_NONUMA + bool "Enable SPARSEMEM on flat systems (debugging only)" + depends on !NUMA + +config ARCH_MEMORY_PROBE def_bool y - depends on NUMA + depends on X86_SPARSEMEM_DEBUG_NONUMA + +config ARCH_SPARSEMEM_DEFAULT + def_bool y + depends on X86_SPARSEMEM_DEBUG_NONUMA config ARCH_SPARSEMEM_ENABLE def_bool y - depends on NUMA + depends on NUMA || X86_SPARSEMEM_DEBUG_NONUMA config ARCH_SELECT_MEMORY_MODEL def_bool y @@ -908,7 +921,7 @@ config BOOT_IOREMAP bool depends on (((X86_SUMMIT || X86_GENERICARCH) && NUMA) || (X86 && EFI)) - default y + default y || SIMULATED_MEM_HOTPLUG config REGPARM bool "Use register arguments (EXPERIMENTAL)" --- linux.orig/arch/i386/kernel/setup.c~E0-for-debugging-page_is_ram_hotplug 2005-07-28 14:29:37.000000000 -0700 +++ linux/arch/i386/kernel/setup.c 2005-07-28 14:29:38.000000000 -0700 @@ -145,6 +145,7 @@ EXPORT_SYMBOL(ist_info); #endif struct e820map e820; +struct e820map bios_e820; extern void early_cpu_init(void); extern void dmi_scan_machine(void); @@ -1120,11 +1121,19 @@ printk(KERN_NOTICE "%ldMB HIGHMEM available.\n", pages_to_mb(highend_pfn - highstart_pfn)); #endif + /* + * This will only work for contiguous memory systems. 
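+	 * memory_present() below is handed the single range [0, max_pfn)
+	 * and attributes all of it to node 0, so sections covering any
+	 * holes in the physical map are marked present as well.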
+ * + * Leave the evil #ifdef as a big FIXME until you do + * this properly + */ +#ifdef CONFIG_SPARSEMEM + memory_present(/*node*/0, /*start_pfn*/0, max_pfn); +#endif printk(KERN_NOTICE "%ldMB LOWMEM available.\n", pages_to_mb(max_low_pfn)); setup_bootmem_allocator(); - return max_low_pfn; } @@ -1512,6 +1521,7 @@ else { printk(KERN_INFO "BIOS-provided physical RAM map:\n"); print_memory_map(machine_specific_memory_setup()); + bios_e820 = e820; } copy_edd(); --- linux.orig/arch/i386/mm/discontig.c~D1-i386-hotplug-functions 2005-07-28 14:29:36.000000000 -0700 +++ linux/arch/i386/mm/discontig.c 2005-07-28 14:29:36.000000000 -0700 @@ -98,7 +98,7 @@ extern unsigned long find_max_low_pfn(void); extern void find_max_pfn(void); -extern void one_highpage_init(struct page *, int, int); +extern void add_one_highpage_init(struct page *, int, int); extern struct e820map e820; extern unsigned long init_pg_tables_end; @@ -416,7 +416,7 @@ if (!pfn_valid(node_pfn)) continue; page = pfn_to_page(node_pfn); - one_highpage_init(page, node_pfn, bad_ppro); + add_one_highpage_init(page, node_pfn, bad_ppro); } } totalram_pages += totalhigh_pages; --- linux.orig/arch/i386/mm/init.c~D1-i386-hotplug-functions 2005-07-28 14:29:36.000000000 -0700 +++ linux/arch/i386/mm/init.c 2005-07-28 14:29:37.000000000 -0700 @@ -27,6 +27,7 @@ #include #include #include +#include #include #include @@ -191,38 +192,42 @@ extern int is_available_memory(efi_memory_desc_t *); -int page_is_ram(unsigned long pagenr) +static int page_is_ram_efi(unsigned long pagenr) { +#ifdef CONFIG_EFI int i; unsigned long addr, end; + efi_memory_desc_t *md; - if (efi_enabled) { - efi_memory_desc_t *md; - - for (i = 0; i < memmap.nr_map; i++) { - md = &memmap.map[i]; - if (!is_available_memory(md)) - continue; - addr = (md->phys_addr+PAGE_SIZE-1) >> PAGE_SHIFT; - end = (md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT)) >> PAGE_SHIFT; - - if ((pagenr >= addr) && (pagenr < end)) - return 1; - } - return 0; + for (i = 0; i < memmap.nr_map; i++) { + md = &memmap.map[i]; + if (!is_available_memory(md)) + continue; + addr = (md->phys_addr+PAGE_SIZE-1) >> PAGE_SHIFT; + end = (md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT)) >> PAGE_SHIFT; + if ((pagenr >= addr) && (pagenr < end)) + return 1; } +#endif /* CONFIG_EFI */ + return 0; +} - for (i = 0; i < e820.nr_map; i++) { +int page_is_ram_e820(unsigned long pagenr, struct e820map *local_e820) +{ + int i; + unsigned long addr, end; - if (e820.map[i].type != E820_RAM) /* not usable memory */ + for (i = 0; i < local_e820->nr_map; i++) { + + if (local_e820->map[i].type != E820_RAM) /* not usable memory */ continue; /* * !!!FIXME!!! Some BIOSen report areas as RAM that * are not. Notably the 640->1Mb area. We need a sanity * check here. 
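+		 * Whatever check gets added should work against any
+		 * e820map passed in here, including the saved bios_e820
+		 * used by the hotplug probe path.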
*/ - addr = (e820.map[i].addr+PAGE_SIZE-1) >> PAGE_SHIFT; - end = (e820.map[i].addr+e820.map[i].size) >> PAGE_SHIFT; + addr = (local_e820->map[i].addr+PAGE_SIZE-1) >> PAGE_SHIFT; + end = (local_e820->map[i].addr+local_e820->map[i].size) >> PAGE_SHIFT; if ((pagenr >= addr) && (pagenr < end)) return 1; } @@ -265,17 +270,45 @@ pkmap_page_table = pte; } -void __init one_highpage_init(struct page *page, int pfn, int bad_ppro) +void __devinit free_new_highpage(struct page *page) +{ + set_page_count(page, 1); + __free_page(page); + totalhigh_pages++; +} + +void __init add_one_highpage_init(struct page *page, int pfn, int bad_ppro) { if (page_is_ram(pfn) && !(bad_ppro && page_kills_ppro(pfn))) { ClearPageReserved(page); - set_page_count(page, 1); - __free_page(page); - totalhigh_pages++; } else SetPageReserved(page); } +int add_one_highpage_hotplug(struct page *page, int pfn) +{ + free_new_highpage(page); + totalram_pages++; +#ifdef CONFIG_FLATMEM + max_mapnr++; +#endif + num_physpages++; + return 0; +} + +/* + * Not currently handling the NUMA case. + * Assuming single node and all memory that + * has been added dynamically that would be + * onlined here is in HIGHMEM + */ +void online_page(struct page *page) +{ + ClearPageReserved(page); + add_one_highpage_hotplug(page, page_to_pfn(page)); +} + + #ifdef CONFIG_NUMA extern void set_highmem_pages_init(int); #else @@ -283,7 +316,7 @@ { int pfn; for (pfn = highstart_pfn; pfn < highend_pfn; pfn++) - one_highpage_init(pfn_to_page(pfn), pfn, bad_ppro); + add_one_highpage_init(pfn_to_page(pfn), pfn, bad_ppro); totalram_pages += totalhigh_pages; } #endif /* CONFIG_FLATMEM */ @@ -614,6 +647,28 @@ #endif } +/* + * this is for the non-NUMA, single node SMP system case. + * Specifically, in the case of x86, we will always add + * memory to the highmem for now. + */ +#ifndef CONFIG_NEED_MULTIPLE_NODES +int add_memory(u64 start, u64 size) +{ + struct pglist_data *pgdata = &contig_page_data; + struct zone *zone = pgdata->node_zones + MAX_NR_ZONES-1; + unsigned long start_pfn = start >> PAGE_SHIFT; + unsigned long nr_pages = size >> PAGE_SHIFT; + + return __add_pages(zone, start_pfn, nr_pages); +} + +int remove_memory(u64 start, u64 size) +{ + return -EINVAL; +} +#endif + kmem_cache_t *pgd_cache; kmem_cache_t *pmd_cache; @@ -694,3 +749,10 @@ } } #endif + +int page_is_ram(unsigned long pagenr) +{ + if (efi_enabled) + return page_is_ram_efi(pagenr); + return page_is_ram_e820(pagenr, &e820); +} --- linux.orig/arch/i386/mm/pgtable.c~C5.2-pgdat_size_lock 2005-07-28 13:50:20.000000000 -0700 +++ linux/arch/i386/mm/pgtable.c 2005-07-28 13:50:20.000000000 -0700 @@ -31,11 +31,13 @@ pg_data_t *pgdat; unsigned long i; struct page_state ps; + unsigned long flags; printk(KERN_INFO "Mem-info:\n"); show_free_areas(); printk(KERN_INFO "Free swap: %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10)); for_each_pgdat(pgdat) { + pgdat_resize_lock(pgdat, &flags); for (i = 0; i < pgdat->node_spanned_pages; ++i) { page = pgdat_page_nr(pgdat, i); total++; @@ -48,6 +50,7 @@ else if (page_count(page)) shared += page_count(page) - 1; } + pgdat_resize_unlock(pgdat, &flags); } printk(KERN_INFO "%d pages of RAM\n", total); printk(KERN_INFO "%d pages of HIGHMEM\n", highmem); --- linux.orig/arch/ia64/Kconfig~B-sparse-180-sparsemem-ia64 2005-07-28 13:50:11.000000000 -0700 +++ linux/arch/ia64/Kconfig 2005-07-28 14:28:23.000000000 -0700 @@ -62,8 +62,6 @@ bool "generic" select NUMA select ACPI_NUMA - select VIRTUAL_MEM_MAP - select DISCONTIGMEM help This selects the system type of your hardware. 
A "generic" kernel will run on any supported IA-64 system. However, if you configure @@ -187,6 +185,7 @@ config VIRTUAL_MEM_MAP bool "Virtual mem map" + depends on !SPARSEMEM default y if !IA64_HP_SIM help Say Y to compile the kernel with support for a virtual mem map. @@ -199,16 +198,6 @@ bool default y if VIRTUAL_MEM_MAP -config ARCH_DISCONTIGMEM_ENABLE - bool "Discontiguous memory support" - depends on (IA64_DIG || IA64_SGI_SN2 || IA64_GENERIC || IA64_HP_ZX1 || IA64_HP_ZX1_SWIOTLB) && NUMA && VIRTUAL_MEM_MAP - default y if (IA64_SGI_SN2 || IA64_GENERIC) && NUMA - help - Say Y to support efficient handling of discontiguous physical memory, - for architectures which are either NUMA (Non-Uniform Memory Access) - or have huge holes in the physical address space for other reasons. - See for more. - config IA64_CYCLONE bool "Cyclone (EXA) Time Source support" help @@ -231,8 +220,10 @@ based on a network adapter and DMA messaging. config FORCE_MAX_ZONEORDER - int - default "18" + int "MAX_ORDER (11 - 20)" if !HUGETLB_PAGE + range 11 20 if !HUGETLB_PAGE + default "18" if HUGETLB_PAGE + default "11" config SMP bool "Symmetric multi-processing support" @@ -297,6 +288,35 @@ source "mm/Kconfig" +config SECTION_BITS + int + depends on SPARSEMEM + range 28 32 if !HUGETLB_PAGE + default "32" if HUGETLB_PAGE + default "28" + help + Size of memory section in bits. + +config PHYSICAL_MEMORY_BITS + int + depends on SPARSEMEM + range 44 50 + default 44 + help + Maximum physical memory address bits. + +config ARCH_SPARSEMEM_ENABLE + def_bool y + depends on NUMA + +config ARCH_DISCONTIGMEM_DEFAULT + def_bool y + depends on NUMA + +config ARCH_DISCONTIGMEM_ENABLE + def_bool y + depends NUMA + config HAVE_DEC_LOCK bool depends on (SMP || PREEMPT) --- linux.orig/arch/ia64/mm/Makefile~B-sparse-180-sparsemem-ia64 2005-07-28 13:50:11.000000000 -0700 +++ linux/arch/ia64/mm/Makefile 2005-07-28 13:50:11.000000000 -0700 @@ -7,6 +7,5 @@ obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o obj-$(CONFIG_NUMA) += numa.o obj-$(CONFIG_DISCONTIGMEM) += discontig.o -ifndef CONFIG_DISCONTIGMEM -obj-y += contig.o -endif +obj-$(CONFIG_SPARSEMEM) += discontig.o +obj-$(CONFIG_FLATMEM) += contig.o --- linux.orig/arch/ia64/mm/discontig.c~B-sparse-180-sparsemem-ia64 2005-07-28 13:50:11.000000000 -0700 +++ linux/arch/ia64/mm/discontig.c 2005-07-28 13:50:20.000000000 -0700 @@ -421,6 +421,21 @@ return; } +#ifdef CONFIG_SPARSEMEM +static int __init register_sparse_mem(unsigned long start, unsigned long end, + void *arg) +{ + int nid; + + start = __pa(start) >> PAGE_SHIFT; + end = __pa(end) >> PAGE_SHIFT; + nid = early_pfn_to_nid(start); + (void) memory_present(nid, start, end); + + return 0; +} +#endif + /** * find_memory - walk the EFI memory map and setup the bootmem allocator * @@ -443,6 +458,9 @@ max_low_pfn = 0; /* These actually end up getting called by call_pernode_memory() */ +#ifdef CONFIG_SPARSEMEM + efi_memmap_walk(register_sparse_mem, (void *) 0); +#endif efi_memmap_walk(filter_rsvd_memory, build_node_maps); efi_memmap_walk(filter_rsvd_memory, find_pernode_space); @@ -524,12 +542,19 @@ show_free_areas(); printk("Free swap: %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10)); for_each_pgdat(pgdat) { - unsigned long present = pgdat->node_present_pages; + unsigned long present; + unsigned long flags; int shared = 0, cached = 0, reserved = 0; + printk("Node ID: %d\n", pgdat->node_id); + pgdat_resize_lock(pgdat, &flags); + present = pgdat->node_present_pages; for(i = 0; i < pgdat->node_spanned_pages; i++) { - struct page *page = 
pgdat_page_nr(pgdat, i); - if (!ia64_pfn_valid(pgdat->node_start_pfn+i)) + unsigned long pfn = pgdat->node_start_pfn + i; + struct page *page; + if (pfn_valid(pfn)) + page = pfn_to_page(pfn); + else continue; if (PageReserved(page)) reserved++; @@ -538,6 +563,7 @@ else if (page_count(page)) shared += page_count(page)-1; } + pgdat_resize_unlock(pgdat, &flags); total_present += present; total_reserved += reserved; total_cached += cached; @@ -648,6 +674,8 @@ max_dma = virt_to_phys((void *) MAX_DMA_ADDRESS) >> PAGE_SHIFT; + sparse_init(); + efi_memmap_walk(filter_rsvd_memory, count_node_pages); vmalloc_end -= PAGE_ALIGN(max_low_pfn * sizeof(struct page)); @@ -687,10 +715,13 @@ (mem_data[node].num_physpages - mem_data[node].num_dma_physpages); } - pfn_offset = mem_data[node].min_pfn; +#ifndef CONFIG_SPARSEMEM NODE_DATA(node)->node_mem_map = vmem_map + pfn_offset; +#endif + + free_area_init_node(node, NODE_DATA(node), zones_size, pfn_offset, zholes_size); } --- linux.orig/arch/ia64/mm/init.c~B-sparse-180-sparsemem-ia64 2005-07-28 13:50:11.000000000 -0700 +++ linux/arch/ia64/mm/init.c 2005-07-28 14:29:37.000000000 -0700 @@ -584,7 +584,7 @@ platform_dma_init(); #endif -#ifndef CONFIG_DISCONTIGMEM +#if !defined(CONFIG_DISCONTIGMEM) && !defined(CONFIG_SPARSEMEM) if (!mem_map) BUG(); max_mapnr = max_low_pfn; @@ -631,3 +631,24 @@ ia32_mem_init(); #endif } + +#ifdef CONFIG_MEMORY_HOTPLUG +void online_page(struct page *page) +{ + ClearPageReserved(page); + set_page_count(page, 1); + __free_page(page); + totalram_pages++; + num_physpages++; +} + +int add_memory(u64 start, u64 size, unsigned long attr) +{ + return -ENOSYS; +} + +int remove_memory(u64 start, u64 size, unsigned long attr) +{ + return -ENOSYS; +} +#endif --- linux.orig/arch/ia64/mm/numa.c~B-sparse-180-sparsemem-ia64 2005-07-28 13:50:11.000000000 -0700 +++ linux/arch/ia64/mm/numa.c 2005-07-28 13:50:11.000000000 -0700 @@ -47,3 +47,26 @@ return (i < num_node_memblks) ? node_memblk[i].nid : (num_node_memblks ? -1 : 0); } + +#if defined(CONFIG_SPARSEMEM) && defined(CONFIG_NUMA) +/* + * Because of holes evaluate on section limits. 
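+ * The pfn is mapped to its section (pfn >> PFN_SECTION_SHIFT) and
+ * compared against each node_memblk range rounded out to whole
+ * sections; a pfn that matches no memblk falls back to node 0.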
+ */ +int early_pfn_to_nid(unsigned long pfn) +{ + int i, section = pfn >> PFN_SECTION_SHIFT, ssec, esec; + + for (i = 0; i < num_node_memblks; i++) { + ssec = node_memblk[i].start_paddr >> PA_SECTION_SHIFT; + esec = (node_memblk[i].start_paddr + node_memblk[i].size + + ((1L << PA_SECTION_SHIFT) - 1)) >> PA_SECTION_SHIFT; + if (section >= ssec && section < esec) + break; + } + + if (i == num_node_memblks) + return 0; + else + return node_memblk[i].nid; +} +#endif --- linux.orig/arch/m32r/mm/init.c~C5.2-pgdat_size_lock 2005-07-28 13:50:20.000000000 -0700 +++ linux/arch/m32r/mm/init.c 2005-07-28 13:50:20.000000000 -0700 @@ -48,6 +48,8 @@ show_free_areas(); printk("Free swap: %6ldkB\n",nr_swap_pages<<(PAGE_SHIFT-10)); for_each_pgdat(pgdat) { + unsigned long flags; + pgdat_resize_lock(pgdat, &flags); for (i = 0; i < pgdat->node_spanned_pages; ++i) { page = pgdat_page_nr(pgdat, i); total++; @@ -60,6 +62,7 @@ else if (page_count(page)) shared += page_count(page) - 1; } + pgdat_resize_unlock(pgdat, &flags); } printk("%d pages of RAM\n", total); printk("%d pages of HIGHMEM\n",highmem); @@ -150,10 +153,14 @@ int reservedpages, nid, i; reservedpages = 0; - for_each_online_node(nid) + for_each_online_node(nid) { + unsigned long flags; + pgdat_resize_lock(NODE_DATA(nid), &flags); for (i = 0 ; i < MAX_LOW_PFN(nid) - START_PFN(nid) ; i++) if (PageReserved(nid_page_nr(nid, i))) reservedpages++; + pgdat_resize_unlock(NODE_DATA(nid), &flags); + } return reservedpages; } --- linux.orig/arch/parisc/mm/init.c~C5.2-pgdat_size_lock 2005-07-28 13:50:20.000000000 -0700 +++ linux/arch/parisc/mm/init.c 2005-07-28 13:50:20.000000000 -0700 @@ -505,7 +505,9 @@ for (j = node_start_pfn(i); j < node_end_pfn(i); j++) { struct page *p; + unsigned long flags; + pgdat_resize_lock(NODE_DATA(i), &flags); p = nid_page_nr(i, j) - node_start_pfn(i); total++; @@ -517,6 +519,7 @@ free++; else shared += page_count(p) - 1; + pgdat_resize_unlock(NODE_DATA(i), &flags); } } #endif --- linux.orig/arch/ppc64/mm/init.c~C5.2-pgdat_size_lock 2005-07-28 13:50:20.000000000 -0700 +++ linux/arch/ppc64/mm/init.c 2005-07-28 14:29:36.000000000 -0700 @@ -97,6 +97,8 @@ show_free_areas(); printk("Free swap: %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10)); for_each_pgdat(pgdat) { + unsigned long flags; + pgdat_resize_lock(pgdat, &flags); for (i = 0; i < pgdat->node_spanned_pages; i++) { page = pgdat_page_nr(pgdat, i); total++; @@ -107,6 +109,7 @@ else if (page_count(page)) shared += page_count(page) - 1; } + pgdat_resize_unlock(pgdat, &flags); } printk("%ld pages of RAM\n", total); printk("%ld reserved pages\n", reserved); @@ -662,11 +665,14 @@ #endif for_each_pgdat(pgdat) { + unsigned long flags; + pgdat_resize_lock(pgdat, &flags); for (i = 0; i < pgdat->node_spanned_pages; i++) { page = pgdat_page_nr(pgdat, i); if (PageReserved(page)) reservedpages++; } + pgdat_resize_unlock(pgdat, &flags); } codesize = (unsigned long)&_etext - (unsigned long)&_stext; @@ -864,3 +870,80 @@ return vma_prot; } EXPORT_SYMBOL(phys_mem_access_prot); + +#ifdef CONFIG_MEMORY_HOTPLUG + +void online_page(struct page *page) +{ + ClearPageReserved(page); + free_cold_page(page); + totalram_pages++; + num_physpages++; +} + +/* + * This works only for the non-NUMA case. Later, we'll need a lookup + * to convert from real physical addresses to nid, that doesn't use + * pfn_to_nid(). 
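+ * For now everything is added to node 0 and placed in the node's
+ * first zone (node_zones[0]), which should be fine for the
+ * non-highmem ppc64 case.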
+ */ +int __devinit add_memory(u64 start, u64 size) +{ + struct pglist_data *pgdata = NODE_DATA(0); + struct zone *zone; + unsigned long start_pfn = start >> PAGE_SHIFT; + unsigned long nr_pages = size >> PAGE_SHIFT; + + /* this should work for most non-highmem platforms */ + zone = pgdata->node_zones; + + return __add_pages(zone, start_pfn, nr_pages); + + return 0; +} + +/* + * First pass at this code will check to determine if the remove + * request is within the RMO. Do not allow removal within the RMO. + */ +int __devinit remove_memory(u64 start, u64 size) +{ + struct zone *zone; + unsigned long start_pfn, end_pfn, nr_pages; + + start_pfn = start >> PAGE_SHIFT; + nr_pages = size >> PAGE_SHIFT; + end_pfn = start_pfn + nr_pages; + + printk("%s(): Attempting to remove memoy in range " + "%lx to %lx\n", __func__, start, start+size); + /* + * check for range within RMO + */ + zone = page_zone(pfn_to_page(start_pfn)); + + printk("%s(): memory will be removed from " + "the %s zone\n", __func__, zone->name); + + /* + * not handling removing memory ranges that + * overlap multiple zones yet + */ + if (end_pfn > (zone->zone_start_pfn + zone->spanned_pages)) + goto overlap; + + /* make sure it is NOT in RMO */ + if ((start < lmb.rmo_size) || ((start+size) < lmb.rmo_size)) { + printk("%s(): range to be removed must NOT be in RMO!\n", + __func__); + goto in_rmo; + } + + return __remove_pages(zone, start_pfn, nr_pages); + +overlap: + printk("%s(): memory range to be removed overlaps " + "multiple zones!!!\n", __func__); +in_rmo: + return -1; +} +#endif /* CONFIG_MEMORY_HOTPLUG */ --- linux.orig/arch/x86_64/kernel/setup.c~B-sparse-182-x86_64-later-sparse-init 2005-07-28 13:50:12.000000000 -0700 +++ linux/arch/x86_64/kernel/setup.c 2005-07-28 13:50:12.000000000 -0700 @@ -646,8 +646,6 @@ } #endif - sparse_init(); - #ifdef CONFIG_KEXEC if (crashk_res.start != crashk_res.end) { reserve_bootmem(crashk_res.start, --- linux.orig/arch/x86_64/mm/init.c~D3-x86_64-hotplug-functions 2005-07-28 14:29:37.000000000 -0700 +++ linux/arch/x86_64/mm/init.c 2005-07-28 14:29:37.000000000 -0700 @@ -22,6 +22,8 @@ #include #include #include +#include +#include #include #include @@ -179,13 +181,19 @@ {} }; -static __init void *alloc_low_page(int *index, unsigned long *phys) +static __devinit void *alloc_low_page(int *index, unsigned long *phys) { struct temp_map *ti; int i; unsigned long pfn = table_end++, paddr; void *adr; + if (after_bootmem) { + adr = (void *)get_zeroed_page(GFP_ATOMIC); + *phys = __pa(adr); + return adr; + } + if (pfn >= end_pfn) panic("alloc_low_page: ran out of memory"); for (i = 0; temp_mappings[i].allocated; i++) { @@ -198,55 +206,95 @@ ti->allocated = 1; __flush_tlb(); adr = ti->address + ((pfn << PAGE_SHIFT) & ~PMD_MASK); + memset(adr, 0, PAGE_SIZE); *index = i; *phys = pfn * PAGE_SIZE; return adr; } -static __init void unmap_low_page(int i) +static __devinit void unmap_low_page(int i) { - struct temp_map *ti = &temp_mappings[i]; + struct temp_map *ti; + + if (after_bootmem) + return; + ti = &temp_mappings[i]; set_pmd(ti->pmd, __pmd(0)); ti->allocated = 0; } -static void __init phys_pud_init(pud_t *pud, unsigned long address, unsigned long end) + +static void __devinit +phys_pmd_init(pmd_t *pmd, unsigned long address, unsigned long end) +{ + int i; + + printk("%s: pmd: 0x%p, address: 0x%lx end: 0x%lx\n", + __func__, pmd, address, end); + + for (i = 0; i < PTRS_PER_PMD; pmd++, i++, address += PMD_SIZE) { + unsigned long entry; + + if (address > end) { + for (; i < PTRS_PER_PMD; i++, pmd++) + 
set_pmd(pmd, __pmd(0)); + break; + } + entry = _PAGE_NX|_PAGE_PSE | _KERNPG_TABLE | _PAGE_GLOBAL | address; + entry &= __supported_pte_mask; + set_pmd(pmd, __pmd(entry)); + } +} + + +static void __devinit +phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end) +{ + pmd_t *pmd = pmd_offset(pud, (unsigned long)__va(address)); + + printk("%s: addr: 0x%lx end: 0x%lx pmd: 0x%p\n", + __func__, address, end, pmd); + + if (pmd_none(*pmd)) { + spin_lock(&init_mm.page_table_lock); + phys_pmd_init(pmd, address, end); + spin_unlock(&init_mm.page_table_lock); + __flush_tlb_all(); + } +} + + + +static void __devinit phys_pud_init(pud_t *pud, unsigned long address, unsigned long end) { - long i, j; + long i = pud_index(address); - i = pud_index(address); pud = pud + i; + + if (after_bootmem && pud_val(*pud)) { + phys_pmd_update(pud, address, end); + return; + } + for (; i < PTRS_PER_PUD; pud++, i++) { int map; unsigned long paddr, pmd_phys; pmd_t *pmd; - paddr = address + i*PUD_SIZE; - if (paddr >= end) { - for (; i < PTRS_PER_PUD; i++, pud++) - set_pud(pud, __pud(0)); + paddr = (address & PGDIR_MASK) + i*PUD_SIZE; + if (paddr >= end) break; - } - if (!e820_mapped(paddr, paddr+PUD_SIZE, 0)) { + if (!after_bootmem && !e820_mapped(paddr, paddr+PUD_SIZE, 0)) { set_pud(pud, __pud(0)); continue; } pmd = alloc_low_page(&map, &pmd_phys); + if (after_bootmem) spin_lock(&init_mm.page_table_lock); set_pud(pud, __pud(pmd_phys | _KERNPG_TABLE)); - for (j = 0; j < PTRS_PER_PMD; pmd++, j++, paddr += PMD_SIZE) { - unsigned long pe; - - if (paddr >= end) { - for (; j < PTRS_PER_PMD; j++, pmd++) - set_pmd(pmd, __pmd(0)); - break; - } - pe = _PAGE_NX|_PAGE_PSE | _KERNPG_TABLE | _PAGE_GLOBAL | paddr; - pe &= __supported_pte_mask; - set_pmd(pmd, __pmd(pe)); - } + phys_pmd_init(pmd, paddr, end); + if (after_bootmem) spin_unlock(&init_mm.page_table_lock); unmap_low_page(map); } __flush_tlb(); @@ -267,12 +315,16 @@ table_start >>= PAGE_SHIFT; table_end = table_start; + + early_printk("kernel direct mapping tables upto %lx @ %lx-%lx\n", end, + table_start< end) next = end; phys_pud_init(pud, __pa(start), __pa(next)); - set_pgd(pgd_offset_k(start), mk_kernel_pgd(pud_phys)); + if (!after_bootmem) + set_pgd(pgd_offset_k(start), mk_kernel_pgd(pud_phys)); unmap_low_page(map); } - asm volatile("movq %%cr4,%0" : "=r" (mmu_cr4_features)); + if (!after_bootmem) + asm volatile("movq %%cr4,%0" : "=r" (mmu_cr4_features)); __flush_tlb_all(); - early_printk("kernel direct mapping tables upto %lx @ %lx-%lx\n", end, - table_start<node_zones + MAX_NR_ZONES - 2; + unsigned long start_pfn = start >> PAGE_SHIFT; + unsigned long nr_pages = size >> PAGE_SHIFT; + int ret; + + ret = __add_pages(zone, start_pfn, nr_pages, attr); + if (ret) + goto error; + + init_memory_mapping(start, (start + size - 1)); + + return ret; +error: + printk("%s: Problem encountered in __add_pages!\n", __func__); + return ret; +} +EXPORT_SYMBOL(add_memory); + +int remove_memory(u64 start, u64 size, unsigned long attr) +{ + struct zone *zone; + unsigned long start_pfn, end_pfn, nr_pages; + + printk("%s: start: 0x%llx size: 0x%llx attr: 0x%lx\n", + __func__, start, size, attr); + + start_pfn = start >> PAGE_SHIFT; + nr_pages = size >> PAGE_SHIFT; + /* end_pfn is the last *valid* pfn */ + end_pfn = start_pfn + nr_pages - 1; + + zone = page_zone(pfn_to_page(start_pfn)); + + printk("%s: memory will be removed from the %s zone\n", + __func__, zone->name); + printk("%s: start_pfn: 0x%lx nr_pages: 0x%lx end_pfn: 0x%lx\n", + __func__, start_pfn, nr_pages, end_pfn); + 
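+	/*
+	 * Refuse ranges whose first and last pfn land in different
+	 * zones; removing memory that spans a zone boundary is not
+	 * handled yet.
+	 */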
+ if (zone != page_zone(pfn_to_page(end_pfn))) + goto overlap; + + printk("%s: just before remove pages\n", __func__); + + return __remove_pages(zone, start_pfn, nr_pages, attr); +overlap: + printk("%s: memory range overlaps multiple zones?\n", __func__); + return -ENOSYS; +} +EXPORT_SYMBOL(remove_memory); + +#endif + extern int swiotlb_force; static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel, kcore_modules, --- linux.orig/arch/x86_64/mm/numa.c~B-sparse-182-x86_64-later-sparse-init 2005-07-28 13:50:12.000000000 -0700 +++ linux/arch/x86_64/mm/numa.c 2005-07-28 13:50:12.000000000 -0700 @@ -87,7 +87,6 @@ start_pfn = start >> PAGE_SHIFT; end_pfn = end >> PAGE_SHIFT; - memory_present(nodeid, start_pfn, end_pfn); nodedata_phys = find_e820_area(start, end, pgdat_size); if (nodedata_phys == -1L) panic("Cannot find memory pgdat in node %d\n", nodeid); @@ -271,9 +270,14 @@ void __init paging_init(void) { int i; - for_each_online_node(i) { + + for_each_online_node(i) + memory_present(node_start_pfn(i), node_end_pfn(i)); + + sparse_init(); + + for_each_online_node(i) setup_node_zones(i); - } } /* [numa=off] */ --- linux.orig/drivers/base/Makefile~D0-sysfs-memory-class 2005-07-28 13:50:22.000000000 -0700 +++ linux/drivers/base/Makefile 2005-07-28 13:50:22.000000000 -0700 @@ -7,6 +7,7 @@ obj-y += power/ obj-$(CONFIG_FW_LOADER) += firmware_class.o obj-$(CONFIG_NUMA) += node.o +obj-$(CONFIG_MEMORY_HOTPLUG) += memory.o ifeq ($(CONFIG_DEBUG_DRIVER),y) EXTRA_CFLAGS += -DDEBUG --- linux.orig/drivers/base/init.c~D0-sysfs-memory-class 2005-07-28 13:50:22.000000000 -0700 +++ linux/drivers/base/init.c 2005-07-28 13:50:22.000000000 -0700 @@ -9,6 +9,7 @@ #include #include +#include extern int devices_init(void); extern int buses_init(void); @@ -39,5 +40,6 @@ platform_bus_init(); system_bus_init(); cpu_dev_init(); + memory_dev_init(); attribute_container_init(); } --- /dev/null 2005-03-30 22:36:15.000000000 -0800 +++ linux/drivers/base/memory.c 2005-07-28 14:29:37.000000000 -0700 @@ -0,0 +1,465 @@ +/* + * drivers/base/memory.c - basic Memory class support + * + * Written by Matt Tolentino + * Dave Hansen + * + * This file provides the necessary infrastructure to represent + * a SPARSEMEM-memory-model system's physical memory in /sysfs. + * All arch-independent code that assumes MEMORY_HOTPLUG requires + * SPARSEMEM should be contained here, or in mm/memory_hotplug.c. 
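+ *
+ * Each present mem_section is represented by a sysdev in the
+ * "memory" class with phys_index, state and phys_device attributes;
+ * the class itself carries a block_size_bytes attribute and, where
+ * CONFIG_ARCH_MEMORY_PROBE is set, a probe file.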
+ */ + +#include +#include +#include +#include /* capable() */ +#include +#include +#include +#include +#include +#include +#include +#include + +#define MEMORY_CLASS_NAME "memory" + +struct sysdev_class memory_sysdev_class = { + set_kset_name(MEMORY_CLASS_NAME), +}; +EXPORT_SYMBOL(memory_sysdev_class); + +static char *memory_hotplug_name(struct kset *kset, struct kobject *kobj) +{ + return MEMORY_CLASS_NAME; +} + +static int memory_hotplug(struct kset *kset, struct kobject *kobj, char **envp, + int num_envp, char *buffer, int buffer_size) +{ + int retval = 0; + + return retval; +} + +static struct kset_hotplug_ops memory_hotplug_ops = { + .name = memory_hotplug_name, + .hotplug = memory_hotplug, +}; + +static struct notifier_block *memory_chain; + +int register_memory_notifier(struct notifier_block *nb) +{ + return notifier_chain_register(&memory_chain, nb); +} + +void unregister_memory_notifier(struct notifier_block *nb) +{ + notifier_chain_unregister(&memory_chain, nb); +} + +/* + * register_memory - Setup a sysfs device for a memory block + */ +int +register_memory(struct memory_block *memory, struct mem_section *section, + struct node *root) +{ + int error; + + memory->sysdev.cls = &memory_sysdev_class; + memory->sysdev.id = __section_nr(section); + + error = sysdev_register(&memory->sysdev); + + if (root && !error) + error = sysfs_create_link(&root->sysdev.kobj, + &memory->sysdev.kobj, + kobject_name(&memory->sysdev.kobj)); + + return error; +} + +void +unregister_memory(struct memory_block *memory, struct mem_section *section, + struct node *root) +{ + BUG_ON(memory->sysdev.cls != &memory_sysdev_class); + BUG_ON(memory->sysdev.id != __section_nr(section)); + + sysdev_unregister(&memory->sysdev); + if (root) + sysfs_remove_link(&root->sysdev.kobj, + kobject_name(&memory->sysdev.kobj)); +} + +/* + * use this as the physical section index that this memsection + * uses. + */ + +static ssize_t show_mem_phys_index(struct sys_device *dev, char *buf) +{ + struct memory_block *mem = + container_of(dev, struct memory_block, sysdev); + return sprintf(buf, "%08lx\n", mem->phys_index); +} + +/* + * online, offline, going offline, etc. + */ +static ssize_t show_mem_state(struct sys_device *dev, char *buf) +{ + struct memory_block *mem = + container_of(dev, struct memory_block, sysdev); + ssize_t len = 0; + + /* + * We can probably put these states in a nice little array + * so that they're not open-coded + */ + switch (mem->state) { + case MEM_ONLINE: + len = sprintf(buf, "online\n"); + break; + case MEM_OFFLINE: + len = sprintf(buf, "offline\n"); + break; + case MEM_GOING_OFFLINE: + len = sprintf(buf, "going-offline\n"); + break; + default: + len = sprintf(buf, "ERROR-UNKNOWN-%ld\n", + mem->state); + WARN_ON(1); + break; + } + + return len; +} + +static inline int memory_notify(unsigned long val, void *v) +{ + return notifier_call_chain(&memory_chain, val, v); +} + +/* + * MEMORY_HOTPLUG depends on SPARSEMEM in mm/Kconfig, so it is + * OK to have direct references to sparsemem variables in here. + */ +static int +memory_block_action(struct memory_block *mem, unsigned long action) +{ + int i; + unsigned long psection; + unsigned long start_pfn, start_paddr; + struct page *first_page; + int ret; + int old_state = mem->state; + + psection = mem->phys_index; + first_page = pfn_to_page(psection << PFN_SECTION_SHIFT); + + /* + * The probe routines leave the pages reserved, just + * as the bootmem code does. Make sure they're still + * that way. 
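+	 * A page that is no longer PageReserved has most likely been
+	 * onlined already, so such a request is rejected with -EBUSY.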
+ */ + if (action == MEM_ONLINE) { + for (i = 0; i < PAGES_PER_SECTION; i++) { + if (PageReserved(first_page+i)) + continue; + + printk(KERN_WARNING "section number %ld page number %d " + "not reserved, was it already online? \n", + psection, i); + return -EBUSY; + } + } + + switch (action) { + case MEM_ONLINE: + start_pfn = page_to_pfn(first_page); + ret = online_pages(start_pfn, PAGES_PER_SECTION); + break; + case MEM_OFFLINE: + mem->state = MEM_GOING_OFFLINE; + memory_notify(MEM_GOING_OFFLINE, NULL); + start_paddr = page_to_pfn(first_page) << PAGE_SHIFT; + ret = remove_memory(start_paddr, + PAGES_PER_SECTION << PAGE_SHIFT); + if (ret) { + mem->state = old_state; + break; + } + memory_notify(MEM_MAPPING_INVALID, NULL); + break; + default: + printk(KERN_WARNING "%s(%p, %ld) unknown action: %ld\n", + __FUNCTION__, mem, action, action); + WARN_ON(1); + ret = -EINVAL; + } + /* + * For now, only notify on successful memory operations + */ + if (!ret) + memory_notify(action, NULL); + + return ret; +} + +static int memory_block_change_state(struct memory_block *mem, + unsigned long to_state, unsigned long from_state_req) +{ + int ret = 0; + down(&mem->state_sem); + + if (mem->state != from_state_req) { + ret = -EINVAL; + goto out; + } + + ret = memory_block_action(mem, to_state); + if (!ret) + mem->state = to_state; + +out: + up(&mem->state_sem); + return ret; +} + +static ssize_t +store_mem_state(struct sys_device *dev, const char *buf, size_t count) +{ + struct memory_block *mem; + unsigned int phys_section_nr; + int ret = -EINVAL; + + mem = container_of(dev, struct memory_block, sysdev); + phys_section_nr = mem->phys_index; + + if (!valid_section_nr(phys_section_nr)) + goto out; + + if (!strncmp(buf, "online", min((int)count, 6))) + ret = memory_block_change_state(mem, MEM_ONLINE, MEM_OFFLINE); + else if(!strncmp(buf, "offline", min((int)count, 7))) + ret = memory_block_change_state(mem, MEM_OFFLINE, MEM_ONLINE); +out: + if (ret) + return ret; + return count; +} + +/* + * phys_device is a bad name for this. What I really want + * is a way to differentiate between memory ranges that + * are part of physical devices that constitute + * a complete removable unit or fru. + * i.e. do these ranges belong to the same physical device, + * s.t. if I offline all of these sections I can then + * remove the physical device? + */ +static ssize_t show_phys_device(struct sys_device *dev, char *buf) +{ + struct memory_block *mem = + container_of(dev, struct memory_block, sysdev); + return sprintf(buf, "%d\n", mem->phys_device); +} + +SYSDEV_ATTR(phys_index, 0444, show_mem_phys_index, NULL); +SYSDEV_ATTR(state, 0644, show_mem_state, store_mem_state); +SYSDEV_ATTR(phys_device, 0444, show_phys_device, NULL); + +#define mem_create_simple_file(mem, attr_name) \ + sysdev_create_file(&mem->sysdev, &attr_##attr_name) +#define mem_remove_simple_file(mem, attr_name) \ + sysdev_remove_file(&mem->sysdev, &attr_##attr_name) + +/* + * Block size attribute stuff + */ +static ssize_t +print_block_size(struct class *class, char *buf) +{ + return sprintf(buf, "%lx\n", (unsigned long)PAGES_PER_SECTION * PAGE_SIZE); +} + +static CLASS_ATTR(block_size_bytes, 0444, print_block_size, NULL); + +static int block_size_init(void) +{ + sysfs_create_file(&memory_sysdev_class.kset.kobj, + &class_attr_block_size_bytes.attr); + return 0; +} + +/* + * Some architectures will have custom drivers to do this, and + * will not need to do it from userspace. 
The fake hot-add code + * as well as ppc64 will do all of their discovery in userspace + * and will require this interface. + */ +extern int page_is_hotpluggable_ram(unsigned long pfn); +#ifdef CONFIG_ARCH_MEMORY_PROBE +static ssize_t +memory_probe_store(struct class *class, const char __user *buf, size_t count) +{ + u64 phys_addr; + unsigned long offset; + int ret; + + phys_addr = simple_strtoull(buf, NULL, 0); + + for (offset = 0; offset < PAGES_PER_SECTION; offset++) { + unsigned long page_nr = (phys_addr >> PAGE_SHIFT) + offset; + if (page_is_hotpluggable_ram(page_nr)) + break; + } + if (offset == PAGES_PER_SECTION) + return -EINVAL; + + ret = add_memory(phys_addr, PAGES_PER_SECTION << PAGE_SHIFT); + + if (ret) + count = ret; + + return count; +} +static CLASS_ATTR(probe, 0700, NULL, memory_probe_store); + +static int memory_probe_init(void) +{ + sysfs_create_file(&memory_sysdev_class.kset.kobj, + &class_attr_probe.attr); + return 0; +} +#else +#define memory_probe_init(...) do {} while (0) +#endif + +/* + * Note that phys_device is optional. It is here to allow for + * differentiation between which *physical* devices each + * section belongs to... + */ + +int add_memory_block(unsigned long node_id, struct mem_section *section, + unsigned long state, int phys_device) +{ + size_t size = sizeof(struct memory_block); + struct memory_block *mem = kmalloc(size, GFP_KERNEL); + int ret = 0; + + if (!mem) + return -ENOMEM; + + memset(mem, 0, size); + + mem->phys_index = __section_nr(section); + mem->state = state; + init_MUTEX(&mem->state_sem); + mem->phys_device = phys_device; + + ret = register_memory(mem, section, NULL); + if (!ret) + ret = mem_create_simple_file(mem, phys_index); + if (!ret) + ret = mem_create_simple_file(mem, state); + if (!ret) + ret = mem_create_simple_file(mem, phys_device); + + return ret; +} + +/* + * For now, we have a linear search to go find the appropriate + * memory_block corresponding to a particular phys_index. If + * this gets to be a real problem, we can always use a radix + * tree or something here. + * + * This could be made generic for all sysdev classes. + */ +struct memory_block *find_memory_block(struct mem_section *section) +{ + struct kobject *kobj; + struct sys_device *sysdev; + struct memory_block *mem; + char name[sizeof(MEMORY_CLASS_NAME) + 9 + 1]; + + /* + * This only works because we know that section == sysdev->id + * slightly redundant with sysdev_register() + */ + sprintf(&name[0], "%s%d", MEMORY_CLASS_NAME, __section_nr(section)); + + kobj = kset_find_obj(&memory_sysdev_class.kset, name); + if (!kobj) + return NULL; + + sysdev = container_of(kobj, struct sys_device, kobj); + mem = container_of(sysdev, struct memory_block, sysdev); + + return mem; +} + +int remove_memory_block(unsigned long node_id, struct mem_section *section, + int phys_device) +{ + struct memory_block *mem; + + mem = find_memory_block(section); + mem_remove_simple_file(mem, phys_index); + mem_remove_simple_file(mem, state); + mem_remove_simple_file(mem, phys_device); + unregister_memory(mem, section, NULL); + + return 0; +} + +/* + * need an interface for the VM to add new memory regions, + * but without onlining it. + */ +int register_new_memory(struct mem_section *section) +{ + return add_memory_block(0, section, MEM_OFFLINE, 0); +} + +int unregister_memory_section(struct mem_section *section) +{ + if (!valid_section(section)) + return -EINVAL; + + return remove_memory_block(0, section, 0); +} + +/* + * Initialize the sysfs support for memory devices... 
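+ *
+ * This registers the "memory" sysdev class, creates a memory_block
+ * (state MEM_ONLINE) for every section already present at boot, and
+ * then adds the probe and block_size_bytes control files.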
+ */ +int __init memory_dev_init(void) +{ + unsigned int i; + int ret; + + memory_sysdev_class.kset.hotplug_ops = &memory_hotplug_ops; + ret = sysdev_class_register(&memory_sysdev_class); + + /* + * Create entries for memory sections that were found + * during boot and have been initialized + */ + for (i = 0; i < NR_MEM_SECTIONS; i++) { + if (!valid_section_nr(i)) + continue; + add_memory_block(0, __nr_to_section(i), MEM_ONLINE, 0); + } + + memory_probe_init(); + block_size_init(); + + return ret; +} --- linux.orig/include/asm-i386/mmzone.h~C0-kill-local_mapnr 2005-07-28 13:50:17.000000000 -0700 +++ linux/include/asm-i386/mmzone.h 2005-07-28 13:50:17.000000000 -0700 @@ -88,12 +88,6 @@ __pgdat->node_start_pfn + __pgdat->node_spanned_pages; \ }) -#define local_mapnr(kvaddr) \ -({ \ - unsigned long __pfn = __pa(kvaddr) >> PAGE_SHIFT; \ - (__pfn - node_start_pfn(pfn_to_nid(__pfn))); \ -}) - /* XXX: FIXME -- wli */ #define kern_addr_valid(kaddr) (0) --- linux.orig/include/asm-i386/sparsemem.h~E3-for-debugging-more-FLAGS_RESERVED 2005-07-28 14:29:38.000000000 -0700 +++ linux/include/asm-i386/sparsemem.h 2005-07-28 14:29:38.000000000 -0700 @@ -15,7 +15,7 @@ * MAX_PHYSMEM_BITS 2^N: how much memory we can have in that space */ #ifdef CONFIG_X86_PAE -#define SECTION_SIZE_BITS 30 +#define SECTION_SIZE_BITS 28 #define MAX_PHYSADDR_BITS 36 #define MAX_PHYSMEM_BITS 36 #else --- linux.orig/include/asm-ia64/meminit.h~B-sparse-180-sparsemem-ia64 2005-07-28 13:50:11.000000000 -0700 +++ linux/include/asm-ia64/meminit.h 2005-07-28 13:50:11.000000000 -0700 @@ -41,7 +41,7 @@ #define GRANULEROUNDUP(n) (((n)+IA64_GRANULE_SIZE-1) & ~(IA64_GRANULE_SIZE-1)) #define ORDERROUNDDOWN(n) ((n) & ~((PAGE_SIZE< #include -#ifdef CONFIG_DISCONTIGMEM +#ifdef CONFIG_NUMA static inline int pfn_to_nid(unsigned long pfn) { @@ -39,8 +39,8 @@ # define NR_NODE_MEMBLKS (MAX_NUMNODES * 4) #endif -#else /* CONFIG_DISCONTIGMEM */ +#else /* CONFIG_NUMA */ # define NR_NODE_MEMBLKS (MAX_NUMNODES * 4) -#endif /* CONFIG_DISCONTIGMEM */ +#endif /* CONFIG_NUMA */ #endif /* _ASM_IA64_MMZONE_H */ --- linux.orig/include/asm-ia64/nodedata.h~B-sparse-180-sparsemem-ia64 2005-07-28 13:50:11.000000000 -0700 +++ linux/include/asm-ia64/nodedata.h 2005-07-28 13:50:11.000000000 -0700 @@ -17,7 +17,7 @@ #include #include -#ifdef CONFIG_DISCONTIGMEM +#ifdef CONFIG_NUMA /* * Node Data. One of these structures is located on each node of a NUMA system. 
@@ -47,6 +47,6 @@ */ #define NODE_DATA(nid) (local_node_data->pg_data_ptrs[nid]) -#endif /* CONFIG_DISCONTIGMEM */ +#endif /* CONFIG_NUMA */ #endif /* _ASM_IA64_NODEDATA_H */ --- linux.orig/include/asm-ia64/page.h~B-sparse-180-sparsemem-ia64 2005-07-28 13:50:11.000000000 -0700 +++ linux/include/asm-ia64/page.h 2005-07-28 13:50:11.000000000 -0700 @@ -88,17 +88,17 @@ #define virt_addr_valid(kaddr) pfn_valid(__pa(kaddr) >> PAGE_SHIFT) -#ifdef CONFIG_VIRTUAL_MEM_MAP +#ifdef CONFIG_VIRTUAL_MEM_MAP extern int ia64_pfn_valid (unsigned long pfn); -#else +#elif CONFIG_FLATMEM # define ia64_pfn_valid(pfn) 1 #endif -#ifndef CONFIG_DISCONTIGMEM +#ifdef CONFIG_FLATMEM # define pfn_valid(pfn) (((pfn) < max_mapnr) && ia64_pfn_valid(pfn)) # define page_to_pfn(page) ((unsigned long) (page - mem_map)) # define pfn_to_page(pfn) (mem_map + (pfn)) -#else +#elif CONFIG_DISCONTIGMEM extern struct page *vmem_map; extern unsigned long max_low_pfn; # define pfn_valid(pfn) (((pfn) < max_low_pfn) && ia64_pfn_valid(pfn)) @@ -106,6 +106,10 @@ # define pfn_to_page(pfn) (vmem_map + (pfn)) #endif +#if defined(CONFIG_NUMA) && defined(CONFIG_SPARSEMEM) +extern int early_pfn_to_nid(unsigned long pfn); +#endif + #define page_to_phys(page) (page_to_pfn(page) << PAGE_SHIFT) #define virt_to_page(kaddr) pfn_to_page(__pa(kaddr) >> PAGE_SHIFT) @@ -124,8 +128,11 @@ * expressed in this way to ensure they result in a single "dep" * instruction. */ -#define __pa(x) ({ia64_va _v; _v.l = (long) (x); _v.f.reg = 0; _v.l;}) -#define __va(x) ({ia64_va _v; _v.l = (long) (x); _v.f.reg = -1; _v.p;}) +#define __boot_pa(x) ({ia64_va _v; _v.l = (long) (x); _v.f.reg = 0; _v.l;}) +#define __boot_va(x) ({ia64_va _v; _v.l = (long) (x); _v.f.reg = -1; _v.p;}) +#define __pa(x) __boot_pa(x) +#define __va(x) __boot_va(x) +#define pfn_to_kaddr(pfn) __va((pfn) << PAGE_SHIFT) #define REGION_NUMBER(x) ({ia64_va _v; _v.l = (long) (x); _v.f.reg;}) #define REGION_OFFSET(x) ({ia64_va _v; _v.l = (long) (x); _v.f.off;}) --- /dev/null 2005-03-30 22:36:15.000000000 -0800 +++ linux/include/asm-ia64/sparsemem.h 2005-07-28 13:50:11.000000000 -0700 @@ -0,0 +1,32 @@ +#ifndef _ASM_IA64_SPARSEMEM_H +#define _ASM_IA64_SPARSEMEM_H + +#ifdef CONFIG_SPARSEMEM + /* + * SECTION_SIZE_BITS 2^N: how big each section will be + * MAX_PHYSADDR_BITS 2^N: how much physical address space we have + * MAX_PHYSMEM_BITS 2^N: how much memory we can have in that space + */ + +#define SECTION_SIZE_BITS CONFIG_SECTION_BITS + +/* + * If FORCE_MAX_ORDER is used, then check and possibly enforce the boundary + * condition on SECTION_SIZE_BITS's magnitude. 
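+ * A section must be able to hold a maximum-order allocation, so if
+ * CONFIG_FORCE_MAX_ZONEORDER + PAGE_SHIFT exceeds the configured
+ * value, SECTION_SIZE_BITS is raised to match.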
+ */ +#ifdef CONFIG_FORCE_MAX_ZONEORDER +#if ((CONFIG_FORCE_MAX_ZONEORDER+PAGE_SHIFT) > SECTION_SIZE_BITS) +#undef SECTION_SIZE_BITS +#define SECTION_SIZE_BITS (CONFIG_FORCE_MAX_ZONEORDER+PAGE_SHIFT) +#endif +#endif + +#define MAX_PHYSADDR_BITS CONFIG_PHYSICAL_MEMORY_BITS +#define MAX_PHYSMEM_BITS CONFIG_PHYSICAL_MEMORY_BITS + +/* until we think of something better */ +#define page_is_ram(pfn) 1 + +#endif /* CONFIG_SPARSEMEM */ + +#endif /* _ASM_IA64_SPARSEMEM_H */ --- linux.orig/include/asm-m32r/mmzone.h~C0-kill-local_mapnr 2005-07-28 13:50:17.000000000 -0700 +++ linux/include/asm-m32r/mmzone.h 2005-07-28 13:50:17.000000000 -0700 @@ -21,12 +21,6 @@ __pgdat->node_start_pfn + __pgdat->node_spanned_pages - 1; \ }) -#define local_mapnr(kvaddr) \ -({ \ - unsigned long __pfn = __pa(kvaddr) >> PAGE_SHIFT; \ - (__pfn - node_start_pfn(pfn_to_nid(__pfn))); \ -}) - #define pfn_to_page(pfn) \ ({ \ unsigned long __pfn = pfn; \ --- linux.orig/include/asm-parisc/mmzone.h~C0-kill-local_mapnr 2005-07-28 13:50:17.000000000 -0700 +++ linux/include/asm-parisc/mmzone.h 2005-07-28 13:50:17.000000000 -0700 @@ -27,12 +27,6 @@ }) #define node_localnr(pfn, nid) ((pfn) - node_start_pfn(nid)) -#define local_mapnr(kvaddr) \ -({ \ - unsigned long __pfn = __pa(kvaddr) >> PAGE_SHIFT; \ - (__pfn - node_start_pfn(pfn_to_nid(__pfn))); \ -}) - #define pfn_to_page(pfn) \ ({ \ unsigned long __pfn = (pfn); \ --- linux.orig/include/asm-ppc64/abs_addr.h~D2-ppc64-hotplug-functions 2005-07-28 14:29:36.000000000 -0700 +++ linux/include/asm-ppc64/abs_addr.h 2005-07-28 14:29:36.000000000 -0700 @@ -104,5 +104,7 @@ /* Convenience macros */ #define virt_to_abs(va) phys_to_abs(__pa(va)) #define abs_to_virt(aa) __va(abs_to_phys(aa)) +#define boot_virt_to_abs(va) phys_to_abs(__boot_pa(va)) +#define boot_abs_to_virt(aa) __boot_va(abs_to_phys(aa)) #endif /* _ABS_ADDR_H */ --- linux.orig/include/asm-ppc64/mmzone.h~C0-kill-local_mapnr 2005-07-28 13:50:17.000000000 -0700 +++ linux/include/asm-ppc64/mmzone.h 2005-07-28 13:50:17.000000000 -0700 @@ -67,9 +67,6 @@ #define node_start_pfn(nid) (NODE_DATA(nid)->node_start_pfn) #define node_end_pfn(nid) (NODE_DATA(nid)->node_end_pfn) -#define local_mapnr(kvaddr) \ - ( (__pa(kvaddr) >> PAGE_SHIFT) - node_start_pfn(kvaddr_to_nid(kvaddr)) - #ifdef CONFIG_DISCONTIGMEM /* --- linux.orig/include/asm-x86_64/mmzone.h~C0-kill-local_mapnr 2005-07-28 13:50:17.000000000 -0700 +++ linux/include/asm-x86_64/mmzone.h 2005-07-28 13:50:17.000000000 -0700 @@ -38,8 +38,6 @@ #ifdef CONFIG_DISCONTIGMEM -#define pfn_to_nid(pfn) phys_to_nid((unsigned long)(pfn) << PAGE_SHIFT) -#define kvaddr_to_nid(kaddr) phys_to_nid(__pa(kaddr)) /* AK: this currently doesn't deal with invalid addresses. We'll see if the 2.5 kernel doesn't pass them --- /dev/null 2005-03-30 22:36:15.000000000 -0800 +++ linux/include/linux/memory.h 2005-07-28 14:29:39.000000000 -0700 @@ -0,0 +1,95 @@ +/* + * include/linux/memory.h - generic memory definition + * + * This is mainly for topological representation. We define the + * basic "struct memory_block" here, which can be embedded in per-arch + * definitions or NUMA information. + * + * Basic handling of the devices is done in drivers/base/memory.c + * and system devices are handled in drivers/base/sys.c. + * + * Memory block are exported via sysfs in the class/memory/devices/ + * directory. 
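+ * At present one memory_block covers exactly one mem_section, i.e.
+ * CONFIG_MEM_BLOCK_SIZE bytes (PAGES_PER_SECTION << PAGE_SHIFT).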
+ * + */ +#ifndef _LINUX_MEMORY_H_ +#define _LINUX_MEMORY_H_ + +#include +#include +#include + +#include + +struct memory_block { + unsigned long phys_index; + unsigned long state; + /* + * This serializes all state change requests. It isn't + * held during creation because the control files are + * created long after the critical areas during + * initialization. + */ + struct semaphore state_sem; + int phys_device; /* to which fru does this belong? */ + void *hw; /* optional pointer to fw/hw data */ + int (*phys_callback)(struct memory_block *); + struct sys_device sysdev; +}; + +/* These states are exposed to userspace as text strings in sysfs */ +#define MEM_ONLINE (1<<0) /* exposed to userspace */ +#define MEM_GOING_OFFLINE (1<<1) /* exposed to userspace */ +#define MEM_OFFLINE (1<<2) /* exposed to userspace */ + +/* + * All of these states are currently kernel-internal for notifying + * kernel components and architectures. + * + * For MEM_MAPPING_INVALID, all notifier chains with priority >0 + * are called before pfn_to_page() becomes invalid. The priority=0 + * entry is reserved for the function that actually makes + * pfn_to_page() stop working. Any notifiers that want to be called + * after that should have priority <0. + */ +#define MEM_MAPPING_INVALID (1<<3) + +#ifndef CONFIG_MEMORY_HOTPLUG +static inline int memory_dev_init(void) +{ + return 0; +} +static inline int register_memory_notifier(struct notifier_block *nb) +{ + return 0; +} +static inline void unregister_memory_notifier(struct notifier_block *nb) +{ +} +#else +extern int register_memory(struct memory_block *, struct mem_section *section, struct node *); +extern int register_new_memory(struct mem_section *); +extern int unregister_memory_section(struct mem_section *); +extern int memory_dev_init(void); +extern int register_memory_notifier(struct notifier_block *nb); +extern void unregister_memory_notifier(struct notifier_block *nb); + +#define CONFIG_MEM_BLOCK_SIZE (PAGES_PER_SECTION< +#include +#include +#include + +#ifdef CONFIG_MEMORY_HOTPLUG +/* + * pgdat resizing functions + */ +static inline +void pgdat_resize_lock(struct pglist_data *pgdat, unsigned long *flags) +{ + spin_lock_irqsave(&pgdat->node_size_lock, *flags); +} +static inline +void pgdat_resize_unlock(struct pglist_data *pgdat, unsigned long *flags) +{ + spin_unlock_irqrestore(&pgdat->node_size_lock, *flags); +} +static inline +void pgdat_resize_init(struct pglist_data *pgdat) +{ + spin_lock_init(&pgdat->node_size_lock); +} +/* + * Zone resizing functions + */ +static inline unsigned zone_span_seqbegin(struct zone *zone) +{ + return read_seqbegin(&zone->span_seqlock); +} +static inline int zone_span_seqretry(struct zone *zone, unsigned iv) +{ + return read_seqretry(&zone->span_seqlock, iv); +} +static inline void zone_span_writelock(struct zone *zone) +{ + write_seqlock(&zone->span_seqlock); +} +static inline void zone_span_writeunlock(struct zone *zone) +{ + write_sequnlock(&zone->span_seqlock); +} +static inline void zone_seqlock_init(struct zone *zone) +{ + seqlock_init(&zone->span_seqlock); +} +extern int zone_grow_free_lists(struct zone *zone, unsigned long new_nr_pages); +extern int zone_grow_waitqueues(struct zone *zone, unsigned long nr_pages); +extern int add_one_highpage(struct page *page, int pfn, int bad_ppro); +/* need some defines for these for archs that don't support it */ +extern void online_page(struct page *page); +/* VM interface that may be used by firmware interface */ +extern int add_memory(u64 start, u64 size); +extern int 
remove_memory(u64 start, u64 size); +extern int online_pages(unsigned long, unsigned long); + +/* reasonably generic interface to expand the physical pages in a zone */ +extern int __add_pages(struct zone *zone, unsigned long start_pfn, + unsigned long nr_pages); +extern int __remove_pages(struct zone *zone, unsigned long start_pfn, + unsigned long nr_pages); +#else /* ! CONFIG_MEMORY_HOTPLUG */ +/* + * Stub functions for when hotplug is off + */ +static inline void pgdat_resize_lock(struct pglist_data *p, unsigned long *f) {} +static inline void pgdat_resize_unlock(struct pglist_data *p, unsigned long *f) {} +static inline void pgdat_resize_init(struct pglist_data *pgdat) {} + +static inline unsigned zone_span_seqbegin(struct zone *zone) +{ + return 0; +} +static inline int zone_span_seqretry(struct zone *zone, unsigned iv) +{ + return 0; +} +static inline void zone_span_writelock(struct zone *zone) {} +static inline void zone_span_writeunlock(struct zone *zone) {} +static inline void zone_seqlock_init(struct zone *zone) {} + +static inline int mhp_notimplemented(const char *func) +{ + printk(KERN_WARNING "%s() called, with CONFIG_MEMORY_HOTPLUG disabled\n", func); + dump_stack(); + return -ENOSYS; +} + +static inline int __add_pages(struct zone *zone, unsigned long start_pfn, + unsigned long nr_pages) +{ + return mhp_notimplemented(__FUNCTION__); +} +static inline int __remove_pages(struct zone *zone, unsigned long start_pfn, + unsigned long nr_pages) +{ + return mhp_notimplemented(__FUNCTION__); +} +#endif /* ! CONFIG_MEMORY_HOTPLUG */ +#endif /* __LINUX_MEMORY_HOTPLUG_H */ --- linux.orig/include/linux/mm.h~D0-sysfs-memory-class 2005-07-28 13:50:22.000000000 -0700 +++ linux/include/linux/mm.h 2005-07-28 14:28:26.000000000 -0700 @@ -779,6 +779,7 @@ unsigned long * zones_size, unsigned long zone_start_pfn, unsigned long *zholes_size); extern void memmap_init_zone(unsigned long, int, unsigned long, unsigned long); +extern void setup_per_zone_pages_min(void); extern void mem_init(void); extern void show_mem(void); extern void si_meminfo(struct sysinfo * val); --- linux.orig/include/linux/mm_inline.h~AA-PM-01-steal_page_from_lru 2005-07-28 14:29:40.000000000 -0700 +++ linux/include/linux/mm_inline.h 2005-07-28 14:29:40.000000000 -0700 @@ -38,3 +38,71 @@ zone->nr_inactive--; } } + +static inline int +isolate_lru_onepage(struct page *page, struct list_head *src, + struct list_head *dst) +{ + if (!TestClearPageLRU(page)) + BUG(); + list_del(&page->lru); + if (get_page_testone(page)) { + /* + * It is being freed elsewhere + */ + __put_page(page); + SetPageLRU(page); + list_add(&page->lru, src); + return 0; + } + list_add(&page->lru, dst); + return 1; +} + + +static inline int +__steal_page_from_lru(struct zone *zone, struct page *page, + struct list_head *dst) +{ + if (PageActive(page)) { + if (!isolate_lru_onepage(page, &zone->active_list, dst)) + return 0; + zone->nr_active--; + } else { + if (!isolate_lru_onepage(page, &zone->inactive_list, dst)) + return 0; + zone->nr_inactive--; + } + return 1; +} + +static inline int +steal_page_from_lru(struct zone *zone, struct page *page, + struct list_head *dst) +{ + int ret; + spin_lock_irq(&zone->lru_lock); + ret = __steal_page_from_lru(zone, page, dst); + spin_unlock_irq(&zone->lru_lock); + return ret; +} + +static inline void +__putback_page_to_lru(struct zone *zone, struct page *page) +{ + if (TestSetPageLRU(page)) + BUG(); + if (PageActive(page)) + add_page_to_active_list(zone, page); + else + add_page_to_inactive_list(zone, page); +} + +static 
inline void +putback_page_to_lru(struct zone *zone, struct page *page) +{ + spin_lock_irq(&zone->lru_lock); + __putback_page_to_lru(zone, page); + spin_unlock_irq(&zone->lru_lock); +} + --- linux.orig/include/linux/mmzone.h~A3-sparsemem-extreme 2005-07-28 13:50:13.000000000 -0700 +++ linux/include/linux/mmzone.h 2005-07-28 14:29:38.000000000 -0700 @@ -12,6 +12,7 @@ #include #include #include +#include #include /* Free memory management - zoned buddy allocator. */ @@ -137,6 +138,10 @@ * free areas of different sizes */ spinlock_t lock; +#ifdef CONFIG_MEMORY_HOTPLUG + /* see spanned/present_pages for more description */ + seqlock_t span_seqlock; +#endif struct free_area free_area[MAX_ORDER]; @@ -220,6 +225,16 @@ /* zone_start_pfn == zone_start_paddr >> PAGE_SHIFT */ unsigned long zone_start_pfn; + /* + * zone_start_pfn, spanned_pages and present_pages are all + * protected by span_seqlock. It is a seqlock because it has + * to be read outside of zone->lock, and it is done in the main + * allocator path. But, it is written quite infrequently. + * + * The lock is declared along with zone->lock because it is + * frequently read in proximity to zone->lock. It's good to + * give them a chance of being in the same cacheline. + */ unsigned long spanned_pages; /* total size, including holes */ unsigned long present_pages; /* amount of memory (excluding holes) */ @@ -273,6 +288,16 @@ struct page *node_mem_map; #endif struct bootmem_data *bdata; +#ifdef CONFIG_MEMORY_HOTPLUG + /* + * Must be held any time you expect node_start_pfn, node_present_pages + * or node_spanned_pages stay constant. Holding this will also + * guarantee that any pfn_valid() stays that way. + * + * Nests above zone->lock and zone->size_seqlock. + */ + spinlock_t node_size_lock; +#endif unsigned long node_start_pfn; unsigned long node_present_pages; /* total number of physical pages */ unsigned long node_spanned_pages; /* total size of physical page @@ -293,6 +318,8 @@ #endif #define nid_page_nr(nid, pagenr) pgdat_page_nr(NODE_DATA(nid),(pagenr)) +#include + extern struct pglist_data *pgdat_list; void __get_zone_counts(unsigned long *active, unsigned long *inactive, @@ -431,7 +458,7 @@ * with 32 bit page->flags field, we reserve 8 bits for node/zone info. * there are 3 zones (2 bits) and this leaves 8-2=6 bits for nodes. */ -#define FLAGS_RESERVED 8 +#define FLAGS_RESERVED 10 #elif BITS_PER_LONG == 64 /* @@ -487,11 +514,52 @@ unsigned long section_mem_map; }; -extern struct mem_section mem_section[NR_MEM_SECTIONS]; +#ifdef CONFIG_ARCH_SPARSEMEM_EXTREME +#define SECTIONS_PER_ROOT (PAGE_SIZE / sizeof (struct mem_section)) +#else +#define SECTIONS_PER_ROOT 1 +#endif + +#define SECTION_NR_TO_ROOT(sec) ((sec) / SECTIONS_PER_ROOT) +#define NR_SECTION_ROOTS (NR_MEM_SECTIONS / SECTIONS_PER_ROOT) +#define SECTION_ROOT_MASK (SECTIONS_PER_ROOT - 1) + +#ifdef CONFIG_ARCH_SPARSEMEM_EXTREME +extern struct mem_section *mem_section[NR_SECTION_ROOTS]; +#else +extern struct mem_section mem_section[NR_SECTION_ROOTS][SECTIONS_PER_ROOT]; +#endif static inline struct mem_section *__nr_to_section(unsigned long nr) { - return &mem_section[nr]; + if (!mem_section[SECTION_NR_TO_ROOT(nr)]) + return NULL; + return &mem_section[SECTION_NR_TO_ROOT(nr)][nr & SECTION_ROOT_MASK]; +} + +/* + * Although written for the SPARSEMEM_EXTREME case, this happens + * to also work for the flat array case becase + * NR_SECTION_ROOTS==NR_MEM_SECTIONS. 
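+ * The lookup walks the root table to find which root contains the
+ * mem_section pointer and then rebuilds the section number from the
+ * root index plus the offset within that root.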
+ */ +static inline int __section_nr(struct mem_section* ms) +{ + unsigned long root_nr; + struct mem_section* root; + + for (root_nr = 0; + root_nr < NR_MEM_SECTIONS; + root_nr += SECTIONS_PER_ROOT) { + root = __nr_to_section(root_nr); + + if (!root) + continue; + + if ((ms >= root) && (ms < (root + SECTIONS_PER_ROOT))) + break; + } + + return (root_nr * SECTIONS_PER_ROOT) + (ms - root); } /* @@ -513,12 +581,12 @@ static inline int valid_section(struct mem_section *section) { - return (section->section_mem_map & SECTION_MARKED_PRESENT); + return (section && (section->section_mem_map & SECTION_MARKED_PRESENT)); } static inline int section_has_mem_map(struct mem_section *section) { - return (section->section_mem_map & SECTION_HAS_MEM_MAP); + return (section && (section->section_mem_map & SECTION_HAS_MEM_MAP)); } static inline int valid_section_nr(unsigned long nr) @@ -572,6 +640,7 @@ void sparse_init(void); #else #define sparse_init() do {} while (0) +#define sparse_index_init(_sec, _nid) do {} while (0) #endif /* CONFIG_SPARSEMEM */ #ifdef CONFIG_NODES_SPAN_OTHER_NODES @@ -590,3 +659,4 @@ #endif /* !__ASSEMBLY__ */ #endif /* __KERNEL__ */ #endif /* _LINUX_MMZONE_H */ + --- linux.orig/mm/Kconfig~A3-sparsemem-extreme 2005-07-28 13:50:13.000000000 -0700 +++ linux/mm/Kconfig 2005-07-28 14:29:40.000000000 -0700 @@ -89,3 +89,33 @@ config HAVE_MEMORY_PRESENT def_bool y depends on ARCH_HAVE_MEMORY_PRESENT || SPARSEMEM + +# +# SPARSEMEM_EXTREME (which is the default) does some bootmem +# allocations when memory_present() is called. If this can not +# be done on your architecture, select this option. However, +# statically allocating the mem_section[] array can potentially +# consume vast quantities of .bss, so be careful. +# +# This option will also potentially produce smaller runtime code +# with gcc 3.4 and later. +# +config SPARSEMEM_STATIC + def_bool n + +# +# Architectecture platforms which require a two level mem_section in SPARSEMEM +# must select this option. This is usually for architecture platforms with +# an extremely sparse physical address space. 
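+# The two levels are NR_SECTION_ROOTS root pointers, each covering
+# SECTIONS_PER_ROOT sections; roots are only populated for ranges
+# that memory_present() reports (see include/linux/mmzone.h).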
+config SPARSEMEM_EXTREME
+	def_bool y
+	depends on SPARSEMEM && !SPARSEMEM_STATIC
+
+# eventually, we can have this option just 'select SPARSEMEM'
+config MEMORY_HOTPLUG
+	bool "Allow for memory hot-add"
+	depends on SPARSEMEM && HOTPLUG && !SOFTWARE_SUSPEND
+
+comment "Memory hotplug is currently incompatible with Software Suspend"
+	depends on SPARSEMEM && HOTPLUG && SOFTWARE_SUSPEND
--- linux.orig/mm/Makefile~D0-sysfs-memory-class	2005-07-28 13:50:22.000000000 -0700
+++ linux/mm/Makefile	2005-07-28 14:28:31.000000000 -0700
@@ -18,5 +18,5 @@
 obj-$(CONFIG_SPARSEMEM) += sparse.o
 obj-$(CONFIG_SHMEM) += shmem.o
 obj-$(CONFIG_TINY_SHMEM) += tiny-shmem.o
-
+obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o
 obj-$(CONFIG_FS_XIP) += filemap_xip.o
--- /dev/null	2005-03-30 22:36:15.000000000 -0800
+++ linux/mm/memory_hotplug.c	2005-07-28 14:29:40.000000000 -0700
@@ -0,0 +1,161 @@
+/*
+ *  linux/mm/memory_hotplug.c
+ *
+ *  Copyright (C)
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+
+extern void zonetable_add(struct zone *zone, int nid, int zid, unsigned long pfn,
+			  unsigned long size);
+static void __add_zone(struct zone *zone, unsigned long phys_start_pfn)
+{
+	struct pglist_data *pgdat = zone->zone_pgdat;
+	int nr_pages = PAGES_PER_SECTION;
+	int nid = pgdat->node_id;
+	int zone_type;
+
+	zone_type = zone - pgdat->node_zones;
+	memmap_init_zone(nr_pages, nid, zone_type, phys_start_pfn);
+	zonetable_add(zone, nid, zone_type, phys_start_pfn, nr_pages);
+}
+
+extern int sparse_add_one_section(struct zone *zone, unsigned long start_pfn,
+				  int nr_pages);
+int __add_section(struct zone *zone, unsigned long phys_start_pfn)
+{
+	int nr_pages = PAGES_PER_SECTION;
+	int ret;
+
+	ret = sparse_add_one_section(zone, phys_start_pfn, nr_pages);
+
+	hot_add_zone_init(zone, phys_start_pfn, PAGES_PER_SECTION);
+
+	if (ret < 0)
+		return ret;
+
+	__add_zone(zone, phys_start_pfn);
+	return register_new_memory(__pfn_to_section(phys_start_pfn));
+}
+
+/*
+ * Reasonably generic function for adding memory.  It is
+ * expected that archs that support memory hotplug will
+ * call this function after deciding the zone to which to
+ * add the new pages.
+ */
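
The comment above is the whole contract: the architecture decides which zone the new range belongs to and hands the pfn range to __add_pages(). A hedged sketch of what such a caller might look like; arch_add_memory() and the unconditional choice of node 0 / ZONE_HIGHMEM are assumptions for illustration, not part of this patch.

/* hypothetical arch-side caller, for illustration only */
int arch_add_memory(u64 start, u64 size)
{
	struct pglist_data *pgdat = NODE_DATA(0);		/* assumed: one node  */
	struct zone *zone = pgdat->node_zones + ZONE_HIGHMEM;	/* assumed zone pick  */
	unsigned long start_pfn = start >> PAGE_SHIFT;
	unsigned long nr_pages = size >> PAGE_SHIFT;

	/* __add_pages() below walks this range one section at a time */
	return __add_pages(zone, start_pfn, nr_pages);
}

Pages added this way are only spanned, not present; they become usable once online_pages() later frees them into the buddy allocator.
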
+int __add_pages(struct zone *zone, unsigned long phys_start_pfn,
+		unsigned long nr_pages)
+{
+	unsigned long i;
+	int err = 0;
+
+	printk(KERN_DEBUG "%s(%p, %08lx, %lu)\n", __func__,
+	       zone, phys_start_pfn, nr_pages);
+
+	for (i = 0; !err && (i < nr_pages); i += PAGES_PER_SECTION) {
+		printk(KERN_DEBUG "\tfor: i: %lu\n", i);
+		err = __add_section(zone, phys_start_pfn + i);
+	}
+
+	return err;
+}
+
+static void grow_zone_span(struct zone *zone,
+		unsigned long start_pfn, unsigned long end_pfn)
+{
+	unsigned long old_zone_end_pfn;
+
+	zone_span_writelock(zone);
+
+	old_zone_end_pfn = zone->zone_start_pfn + zone->spanned_pages;
+	if (start_pfn < zone->zone_start_pfn)
+		zone->zone_start_pfn = start_pfn;
+
+	if (end_pfn > old_zone_end_pfn)
+		zone->spanned_pages = end_pfn - zone->zone_start_pfn;
+
+	zone_span_writeunlock(zone);
+}
+
+static void grow_pgdat_span(struct pglist_data *pgdat,
+		unsigned long start_pfn, unsigned long end_pfn)
+{
+	unsigned long old_pgdat_end_pfn =
+		pgdat->node_start_pfn + pgdat->node_spanned_pages;
+
+	if (start_pfn < pgdat->node_start_pfn)
+		pgdat->node_start_pfn = start_pfn;
+
+	if (end_pfn > old_pgdat_end_pfn)
+		pgdat->node_spanned_pages = end_pfn - pgdat->node_start_pfn;
+}
+
+#ifdef CONFIG_SIMULATED_MEM_HOTPLUG
+int page_is_hotpluggable_ram(unsigned long pfn)
+{
+	extern struct e820map bios_e820;
+	extern int page_is_ram_e820(unsigned long, struct e820map*);
+
+	return page_is_ram_e820(pfn, &bios_e820);
+}
+#else
+int page_is_hotpluggable_ram(unsigned long pfn)
+{
+	return 1;
+}
+#endif
+
+int online_pages(unsigned long pfn, unsigned long nr_pages)
+{
+	unsigned long i;
+	unsigned long flags;
+	unsigned long onlined_pages = 0;
+	struct zone *zone;
+
+	/*
+	 * This doesn't need a lock to do pfn_to_page().
+	 * The section can't be removed here because of the
+	 * memory_block->state_sem.
+	 */
+	zone = page_zone(pfn_to_page(pfn));
+	pgdat_resize_lock(zone->zone_pgdat, &flags);
+	grow_zone_span(zone, pfn, pfn + nr_pages);
+	grow_pgdat_span(zone->zone_pgdat, pfn, pfn + nr_pages);
+	pgdat_resize_unlock(zone->zone_pgdat, &flags);
+
+	printk(KERN_DEBUG "%s: onlining 0x%lx pages starting from pfn: 0x%lx\n",
+	       __func__, nr_pages, pfn);
+
+	for (i = 0; i < nr_pages; i++) {
+		struct page *page = pfn_to_page(pfn + i);
+
+		if (page_is_hotpluggable_ram(pfn + i)) {
+			online_page(page);
+			onlined_pages++;
+		}
+	}
+	zone->present_pages += onlined_pages;
+
+	return 0;
+}
--- linux.orig/mm/page_alloc.c~C1-pcp_zone_init	2005-07-28 13:50:17.000000000 -0700
+++ linux/mm/page_alloc.c	2005-07-28 14:29:39.000000000 -0700
@@ -32,6 +32,7 @@
 #include
 #include
 #include
+#include <linux/memory_hotplug.h>
 #include
 #include
 
@@ -77,21 +78,44 @@
 unsigned long __initdata nr_kernel_pages;
 unsigned long __initdata nr_all_pages;
 
+static int page_outside_zone_boundaries(struct zone *zone, struct page *page)
+{
+	int ret = 0;
+	unsigned seq;
+	unsigned long pfn = page_to_pfn(page);
+
+	do {
+		seq = zone_span_seqbegin(zone);
+		if (pfn >= zone->zone_start_pfn + zone->spanned_pages)
+			ret = 1;
+		else if (pfn < zone->zone_start_pfn)
+			ret = 1;
+	} while (zone_span_seqretry(zone, seq));
+
+	return ret;
+}
+
+static int page_is_consistent(struct zone *zone, struct page *page)
+{
+#ifdef CONFIG_HOLES_IN_ZONE
+	if (!pfn_valid(page_to_pfn(page)))
+		return 0;
+#endif
+	if (zone != page_zone(page))
+		return 0;
+
+	return 1;
+}
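
page_outside_zone_boundaries() above is the read side of the span_seqlock added to struct zone earlier in this series. The zone_span_seqbegin()/zone_span_seqretry() wrappers are defined elsewhere in the series; assuming they map onto the ordinary seqlock primitives, the reader/writer discipline looks like the sketch below (the example_* names are illustrative only, not part of the patch).

static seqlock_t example_lock = SEQLOCK_UNLOCKED;
static unsigned long example_start, example_pages;

static unsigned long example_read_end(void)
{
	unsigned long end;
	unsigned seq;

	do {
		seq = read_seqbegin(&example_lock);
		end = example_start + example_pages;	/* may race with a resize */
	} while (read_seqretry(&example_lock, seq));	/* retry if a writer ran   */

	return end;
}

static void example_grow(unsigned long new_pages)
{
	/* writers are rare (hot-add) and serialize among themselves */
	write_seqlock(&example_lock);
	example_pages = new_pages;
	write_sequnlock(&example_lock);
}
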
 /*
  * Temporary debugging check for pages not lying within a given zone.
  */
 static int bad_range(struct zone *zone, struct page *page)
 {
-	if (page_to_pfn(page) >= zone->zone_start_pfn + zone->spanned_pages)
+	if (page_outside_zone_boundaries(zone, page))
		return 1;
-	if (page_to_pfn(page) < zone->zone_start_pfn)
-		return 1;
-#ifdef CONFIG_HOLES_IN_ZONE
-	if (!pfn_valid(page_to_pfn(page)))
-		return 1;
-#endif
-	if (zone != page_zone(page))
+	if (!page_is_consistent(zone, page))
		return 1;
+
	return 0;
 }
 
@@ -1382,7 +1406,7 @@
 /*
  * Builds allocation fallback zone lists.
  */
-static int __init build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist, int j, int k)
+int __devinit build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist, int j, int k)
 {
	switch (k) {
		struct zone *zone;
@@ -1390,7 +1414,12 @@
		BUG();
	case ZONE_HIGHMEM:
		zone = pgdat->node_zones + ZONE_HIGHMEM;
-		if (zone->present_pages) {
+		/*
+		 * with mem hotplug we don't increment present_pages
+		 * until the pages are actually freed into the zone,
+		 * but we increment spanned pages much earlier
+		 */
+		if (zone->spanned_pages) {
 #ifndef CONFIG_HIGHMEM
			BUG();
 #endif
@@ -1398,11 +1427,11 @@
		}
	case ZONE_NORMAL:
		zone = pgdat->node_zones + ZONE_NORMAL;
-		if (zone->present_pages)
+		if (zone->spanned_pages)
			zonelist->zones[j++] = zone;
	case ZONE_DMA:
		zone = pgdat->node_zones + ZONE_DMA;
-		if (zone->present_pages)
+		if (zone->spanned_pages)
			zonelist->zones[j++] = zone;
	}
 
@@ -1472,7 +1501,7 @@
	return best_node;
 }
 
-static void __init build_zonelists(pg_data_t *pgdat)
+void __devinit build_zonelists(pg_data_t *pgdat)
 {
	int i, j, k, node, local_node;
	int prev_node, load;
@@ -1519,7 +1548,7 @@
 
 #else	/* CONFIG_NUMA */
 
-static void __init build_zonelists(pg_data_t *pgdat)
+void __devinit build_zonelists(pg_data_t *pgdat)
 {
	int i, j, k, node, local_node;
 
@@ -1640,7 +1669,7 @@
  * up by free_all_bootmem() once the early boot process is
  * done. Non-atomic initialization, single-pass.
  */
-void __init memmap_init_zone(unsigned long size, int nid, unsigned long zone,
+void __devinit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
		unsigned long start_pfn)
 {
	struct page *page;
@@ -1851,6 +1880,68 @@
 
 #endif
 
+void zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
+{
+	int i;
+	struct pglist_data *pgdat = zone->zone_pgdat;
+	unsigned long size_bytes;
+
+	/*
+	 * The per-page waitqueue mechanism uses hashed waitqueues
+	 * per zone.
+	 */
+	zone->wait_table_size = wait_table_size(zone_size_pages);
+	zone->wait_table_bits = wait_table_bits(zone->wait_table_size);
+	size_bytes = zone->wait_table_size * sizeof(wait_queue_head_t);
+	if (system_state >= SYSTEM_RUNNING)
+		zone->wait_table = kmalloc(size_bytes, GFP_KERNEL);
+	else
+		zone->wait_table = alloc_bootmem_node(pgdat, size_bytes);
+
+	for(i = 0; i < zone->wait_table_size; ++i)
+		init_waitqueue_head(zone->wait_table + i);
+}
+
+void zone_pcp_init(struct zone *zone)
+{
+	int cpu;
+	unsigned long batch = zone_batchsize(zone);
+
+	for (cpu = 0; cpu < NR_CPUS; cpu++) {
+#ifdef CONFIG_NUMA
+		/* Early boot. Slab allocator not functional yet */
+		zone->pageset[cpu] = &boot_pageset[cpu];
+		setup_pageset(&boot_pageset[cpu],0);
+#else
+		setup_pageset(zone_pcp(zone,cpu), batch);
+#endif
+	}
+	printk(KERN_DEBUG "  %s zone: %lu pages, LIFO batch:%lu\n",
+	       zone->name, zone->present_pages, batch);
+}
+
+static void init_currently_empty_zone(struct zone *zone,
+		unsigned long zone_start_pfn, unsigned long size)
+{
+	const unsigned long zone_required_alignment = 1UL << (MAX_ORDER-1);
+	struct pglist_data *pgdat = zone->zone_pgdat;
+
+	zone_wait_table_init(zone, size);
+	pgdat->nr_zones = zone_idx(zone) + 1;
+
+	zone->zone_mem_map = pfn_to_page(zone_start_pfn);
+	zone->zone_start_pfn = zone_start_pfn;
+
+	if ((zone_start_pfn) & (zone_required_alignment-1))
+		printk(KERN_CRIT "BUG: wrong zone alignment, it will crash\n");
+
+	memmap_init(size, pgdat->node_id, zone_idx(zone), zone_start_pfn);
+
+	zone_init_free_lists(pgdat, zone, zone->spanned_pages);
+	zone->spanned_pages = size;
+}
+
+
 /*
  * Set up the zone data structures:
  *  - mark all pages reserved
@@ -1860,11 +1951,11 @@
 static void __init free_area_init_core(struct pglist_data *pgdat,
		unsigned long *zones_size, unsigned long *zholes_size)
 {
-	unsigned long i, j;
-	const unsigned long zone_required_alignment = 1UL << (MAX_ORDER-1);
-	int cpu, nid = pgdat->node_id;
+	unsigned long j;
+	int nid = pgdat->node_id;
	unsigned long zone_start_pfn = pgdat->node_start_pfn;
 
+	pgdat_resize_init(pgdat);
	pgdat->nr_zones = 0;
	init_waitqueue_head(&pgdat->kswapd_wait);
	pgdat->kswapd_max_order = 0;
@@ -1872,7 +1963,6 @@
	for (j = 0; j < MAX_NR_ZONES; j++) {
		struct zone *zone = pgdat->node_zones + j;
		unsigned long size, realsize;
-		unsigned long batch;
 
		realsize = size = zones_size[j];
		if (zholes_size)
@@ -1882,29 +1972,17 @@
		nr_kernel_pages += realsize;
		nr_all_pages += realsize;
 
-		zone->spanned_pages = size;
		zone->present_pages = realsize;
		zone->name = zone_names[j];
		spin_lock_init(&zone->lock);
		spin_lock_init(&zone->lru_lock);
+		zone_seqlock_init(zone);
		zone->zone_pgdat = pgdat;
		zone->free_pages = 0;
 
		zone->temp_priority = zone->prev_priority = DEF_PRIORITY;
 
-		batch = zone_batchsize(zone);
-
-		for (cpu = 0; cpu < NR_CPUS; cpu++) {
-#ifdef CONFIG_NUMA
-			/* Early boot. Slab allocator not functional yet */
-			zone->pageset[cpu] = &boot_pageset[cpu];
-			setup_pageset(&boot_pageset[cpu],0);
-#else
-			setup_pageset(zone_pcp(zone,cpu), batch);
-#endif
-		}
-		printk(KERN_DEBUG "  %s zone: %lu pages, LIFO batch:%lu\n",
-				zone_names[j], realsize, batch);
+		zone_pcp_init(zone);
		INIT_LIST_HEAD(&zone->active_list);
		INIT_LIST_HEAD(&zone->inactive_list);
		zone->nr_scan_active = 0;
@@ -1915,35 +1993,12 @@
		if (!size)
			continue;
 
-		/*
-		 * The per-page waitqueue mechanism uses hashed waitqueues
-		 * per zone.
-		 */
-		zone->wait_table_size = wait_table_size(size);
-		zone->wait_table_bits =
-			wait_table_bits(zone->wait_table_size);
-		zone->wait_table = (wait_queue_head_t *)
-			alloc_bootmem_node(pgdat, zone->wait_table_size
-						* sizeof(wait_queue_head_t));
-
-		for(i = 0; i < zone->wait_table_size; ++i)
-			init_waitqueue_head(zone->wait_table + i);
-
-		pgdat->nr_zones = j+1;
-
-		zone->zone_mem_map = pfn_to_page(zone_start_pfn);
-		zone->zone_start_pfn = zone_start_pfn;
-
-		if ((zone_start_pfn) & (zone_required_alignment-1))
-			printk(KERN_CRIT "BUG: wrong zone alignment, it will crash\n");
-
-		memmap_init(size, nid, j, zone_start_pfn);
		zonetable_add(zone, nid, j, zone_start_pfn, size);
-
+		init_currently_empty_zone(zone, zone_start_pfn, size);
+		//memmap_init(size, nid, zone_idx(zone), zone_start_pfn);
		zone_start_pfn += size;
-
-		zone_init_free_lists(pgdat, zone, zone->spanned_pages);
	}
 }
 
@@ -2343,7 +2398,7 @@
  * that the pages_{min,low,high} values for each zone are set correctly
  * with respect to min_free_kbytes.
  */
-static void setup_per_zone_pages_min(void)
+void setup_per_zone_pages_min(void)
 {
	unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10);
	unsigned long lowmem_pages = 0;
@@ -2552,3 +2607,32 @@
 
	return table;
 }
+
+static inline int zone_previously_initialized(struct zone *zone)
+{
+	if (zone->wait_table_size)
+		return 1;
+
+	return 0;
+}
+
+#ifdef CONFIG_MEMORY_HOTPLUG
+int hot_add_zone_init(struct zone *zone, unsigned long phys_start_pfn, unsigned long size_pages)
+{
+	if (zone_previously_initialized(zone))
+		return -EEXIST;
+
+	init_currently_empty_zone(zone, phys_start_pfn, PAGES_PER_SECTION);
+	zone_pcp_init(zone);
+
+	/*
+	 * FIXME: there is no locking at all for the zonelists.
+	 * Least impactful (codewise) way to do this is probably
+	 * to freeze all the CPUs for a sec while this is done.
+	 */
+	build_zonelists(zone->zone_pgdat);
+
+	return 0;
+}
+#endif
--- linux.orig/mm/sparse.c~A3-sparsemem-extreme	2005-07-28 13:50:13.000000000 -0700
+++ linux/mm/sparse.c	2005-07-28 14:29:35.000000000 -0700
@@ -5,7 +5,9 @@
 #include
 #include
 #include
+#include
 #include
+#include
 #include
 
 /*
@@ -13,9 +15,66 @@
  *
  * 1) mem_section	- memory sections, mem_map's for valid memory
  */
-struct mem_section mem_section[NR_MEM_SECTIONS];
+#ifdef CONFIG_ARCH_SPARSEMEM_EXTREME
+struct mem_section *mem_section[NR_SECTION_ROOTS]
+	____cacheline_maxaligned_in_smp;
+#else
+struct mem_section mem_section[NR_SECTION_ROOTS][SECTIONS_PER_ROOT]
+	____cacheline_maxaligned_in_smp;
+#endif
 EXPORT_SYMBOL(mem_section);
 
+#ifdef CONFIG_ARCH_SPARSEMEM_EXTREME
+static struct mem_section *sparse_index_alloc(int nid)
+{
+	struct mem_section *section = NULL;
+	unsigned long array_size = SECTIONS_PER_ROOT *
+				   sizeof(struct mem_section);
+
+	if (system_state < SYSTEM_RUNNING)
+		section = alloc_bootmem_node(NODE_DATA(nid), array_size);
+	else
+		section = kmalloc_node(array_size, GFP_KERNEL, nid);
+
+	if (section)
+		memset(section, 0, array_size);
+
+	return section;
+}
+
+static int sparse_index_init(unsigned long section_nr, int nid)
+{
+	static spinlock_t index_init_lock = SPIN_LOCK_UNLOCKED;
+	unsigned long root = SECTION_NR_TO_ROOT(section_nr);
+	struct mem_section *section;
+	int ret = 0;
+
+	section = sparse_index_alloc(nid);
+	/*
+	 * This lock keeps two different sections from
+	 * reallocating for the same index
+	 */
+	spin_lock(&index_init_lock);
+
+	if (mem_section[root]) {
+		if (system_state >= SYSTEM_RUNNING)
+			kfree(section);
+		ret = -EEXIST;
+		goto out;
+	}
+
+	mem_section[root] = section;
+out:
+	spin_unlock(&index_init_lock);
+	return ret;
+}
+#else /* !SPARSEMEM_EXTREME */
+static inline int sparse_index_init(unsigned long section_nr, int nid)
+{
+	return 0;
+}
+#endif
+
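
memory_present(), just below, is the hook an architecture uses to tell SPARSEMEM which pfn ranges exist; with SPARSEMEM_EXTREME it now also allocates the mem_section root for each range via sparse_index_init(). A hedged sketch of a caller follows; the node id and pfn ranges are invented for illustration.

/* hypothetical arch setup code, for illustration only */
void example_register_present_memory(void)
{
	/* one call per physically present pfn range; holes are simply skipped */
	memory_present(0, 0x00000, 0x38000);	/* 0    - 896 MB, say */
	memory_present(0, 0x40000, 0x80000);	/* 1 GB - 2 GB, say   */
}
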
 /*
  * Record a memory area against a node.
  */
 void memory_present(int nid, unsigned long start, unsigned long end)
 {
@@ -23,9 +82,14 @@
	start &= PAGE_SECTION_MASK;
	for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) {
-		unsigned long section = pfn_to_section_nr(pfn);
-		if (!mem_section[section].section_mem_map)
-			mem_section[section].section_mem_map = SECTION_MARKED_PRESENT;
+		unsigned long section_nr = pfn_to_section_nr(pfn);
+		struct mem_section *ms;
+
+		sparse_index_init(section_nr, nid);
+
+		ms = __nr_to_section(section_nr);
+		if (!ms->section_mem_map)
+			ms->section_mem_map = SECTION_MARKED_PRESENT;
	}
 }
 
@@ -85,6 +149,7 @@
 {
	struct page *map;
	int nid = early_pfn_to_nid(section_nr_to_pfn(pnum));
+	struct mem_section *ms = __nr_to_section(pnum);
 
	map = alloc_remap(nid, sizeof(struct page) * PAGES_PER_SECTION);
	if (map)
@@ -96,8 +161,47 @@
		return map;
 
	printk(KERN_WARNING "%s: allocation failed\n", __FUNCTION__);
-	mem_section[pnum].section_mem_map = 0;
+	ms->section_mem_map = 0;
+	return NULL;
+}
+
+static struct page *__kmalloc_section_memmap(unsigned long nr_pages)
+{
+	struct page *page, *ret;
+	unsigned long memmap_size = sizeof(struct page) * nr_pages;
+
+	page = alloc_pages(GFP_KERNEL, get_order(memmap_size));
+	if (page)
+		goto got_map_page;
+
+	ret = vmalloc(memmap_size);
+	if (ret)
+		goto got_map_ptr;
+
+	return NULL;
+got_map_page:
+	ret = (struct page *)pfn_to_kaddr(page_to_pfn(page));
+got_map_ptr:
+	memset(ret, 0, memmap_size);
+
+	return ret;
+}
+
+static int vaddr_in_vmalloc_area(void *addr)
+{
+	if (addr >= (void *)VMALLOC_START &&
+	    addr < (void *)VMALLOC_END)
+		return 1;
+	return 0;
+}
+
+static void __kfree_section_memmap(struct page *memmap, unsigned long nr_pages)
+{
+	if (vaddr_in_vmalloc_area(memmap))
+		vfree(memmap);
+	else
+		free_pages((unsigned long)memmap,
+			   get_order(sizeof(struct page) * nr_pages));
 }
 
 /*
@@ -114,8 +218,9 @@
			continue;
 
		map = sparse_early_mem_map_alloc(pnum);
-		if (map)
-			sparse_init_one_section(&mem_section[pnum], pnum, map);
+		if (!map)
+			continue;
+		sparse_init_one_section(__nr_to_section(pnum), pnum, map);
	}
 }
 
@@ -124,14 +229,37 @@
 * set.  If this is <=0, then that means that the passed-in
 * map was not consumed and must be freed.
 */
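
The rewritten sparse_add_one_section() below follows a common shape: do everything that may sleep or allocate (sparse_index_init(), __kmalloc_section_memmap()) before taking the pgdat resize lock, then hold the spinlock only to check for a racing insert and publish the new mem_map. A condensed sketch of that shape; the example_* names and helpers are assumptions, and unlike the function below the sketch defers freeing the loser's mem_map until after the lock is dropped.

/* illustrative only; not part of the patch */
static int example_install_section(struct pglist_data *pgdat,
				   unsigned long section_nr)
{
	struct page *memmap;
	unsigned long flags;
	int ret = 0;

	memmap = __kmalloc_section_memmap(PAGES_PER_SECTION);	/* may sleep */
	if (!memmap)
		return -ENOMEM;

	pgdat_resize_lock(pgdat, &flags);
	if (example_section_present(section_nr))		/* assumed helper */
		ret = -EEXIST;
	else
		example_publish_section(section_nr, memmap);	/* assumed helper */
	pgdat_resize_unlock(pgdat, &flags);

	if (ret)	/* lost the race: give back the unused mem_map */
		__kfree_section_memmap(memmap, PAGES_PER_SECTION);
	return ret;
}
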
-int sparse_add_one_section(unsigned long start_pfn, int nr_pages, struct page *map)
+int sparse_add_one_section(struct zone *zone, unsigned long start_pfn,
+			   int nr_pages)
 {
-	struct mem_section *ms = __pfn_to_section(start_pfn);
-
-	if (ms->section_mem_map & SECTION_MARKED_PRESENT)
-		return -EEXIST;
-
+	unsigned long section_nr = pfn_to_section_nr(start_pfn);
+	struct pglist_data *pgdat = zone->zone_pgdat;
+	struct mem_section *ms;
+	struct page *memmap;
+	unsigned long flags;
+	int ret;
+
+	/*
+	 * sparse_index_init() needs no locking from us: it takes its
+	 * own lock, and it may need to allocate, which must not be
+	 * done under the resize lock.
+	 */
+	sparse_index_init(section_nr, pgdat->node_id);
+	memmap = __kmalloc_section_memmap(nr_pages);
+
+	pgdat_resize_lock(pgdat, &flags);
+
+	ms = __pfn_to_section(start_pfn);
+	if (ms->section_mem_map & SECTION_MARKED_PRESENT) {
+		ret = -EEXIST;
+		goto out;
+	}
	ms->section_mem_map |= SECTION_MARKED_PRESENT;
 
-	return sparse_init_one_section(ms, pfn_to_section_nr(start_pfn), map);
+	ret = sparse_init_one_section(ms, section_nr, memmap);
+
+	if (ret <= 0)
+		__kfree_section_memmap(memmap, nr_pages);
+out:
+	pgdat_resize_unlock(pgdat, &flags);
+	return ret;
 }
--- linux.orig/mm/vmscan.c~AA-PM-01-steal_page_from_lru	2005-07-28 14:29:40.000000000 -0700
+++ linux/mm/vmscan.c	2005-07-28 14:29:40.000000000 -0700
@@ -582,22 +582,8 @@
	while (scan++ < nr_to_scan && !list_empty(src)) {
		page = lru_to_page(src);
		prefetchw_prev_lru_page(page, src, flags);
-
-		if (!TestClearPageLRU(page))
-			BUG();
-		list_del(&page->lru);
-		if (get_page_testone(page)) {
-			/*
-			 * It is being freed elsewhere
-			 */
-			__put_page(page);
-			SetPageLRU(page);
-			list_add(&page->lru, src);
-			continue;
-		} else {
-			list_add(&page->lru, dst);
+		if (isolate_lru_onepage(page, src, dst))
			nr_taken++;
-		}
	}
 
	*scanned = scan;
@@ -650,13 +636,10 @@
	 */
	while (!list_empty(&page_list)) {
		page = lru_to_page(&page_list);
-		if (TestSetPageLRU(page))
-			BUG();
		list_del(&page->lru);
-		if (PageActive(page))
-			add_page_to_active_list(zone, page);
-		else
-			add_page_to_inactive_list(zone, page);
+		if (PageActive(page) && page_under_capture(page))
+			ClearPageActive(page);
+		__putback_page_to_lru(zone, page);
		if (!pagevec_add(&pvec, page)) {
			spin_unlock_irq(&zone->lru_lock);
			__pagevec_release(&pvec);