--- linux.orig/arch/alpha/mm/numa.c~C5.2-pgdat_size_lock 2005-07-28 13:50:20.000000000 -0700 +++ linux/arch/alpha/mm/numa.c 2005-07-28 13:50:20.000000000 -0700 @@ -371,6 +371,8 @@ show_free_areas(); printk("Free swap: %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10)); for_each_online_node(nid) { + unsigned long flags; + pgdat_resize_lock(NODE_DATA(nid), &flags); i = node_spanned_pages(nid); while (i-- > 0) { struct page *page = nid_page_nr(nid, i); @@ -384,6 +386,7 @@ else shared += page_count(page) - 1; } + pgdat_resize_unlock(NODE_DATA(nid), &flags); } printk("%ld pages of RAM\n",total); printk("%ld free pages\n",free); --- linux.orig/arch/i386/Kconfig~A4-sparsemem-extreme-as-default 2005-07-28 13:50:14.000000000 -0700 +++ linux/arch/i386/Kconfig 2005-07-28 14:29:39.000000000 -0700 @@ -753,6 +753,7 @@ depends on SMP && HIGHMEM64G && (X86_NUMAQ || X86_GENERICARCH || (X86_SUMMIT && ACPI)) default n if X86_PC default y if (X86_NUMAQ || X86_SUMMIT) + select SPARSEMEM_STATIC # Need comments to help the hapless user trying to turn on NUMA support comment "NUMA (NUMA-Q) requires SMP, 64GB highmem support" @@ -785,13 +786,25 @@ def_bool y depends on NUMA -config ARCH_DISCONTIGMEM_DEFAULT +config SIMULATED_MEM_HOTPLUG + bool "Simulate memory hotplug on non-hotplug hardware" + depends on EXPERIMENTAL && HIGHMEM + +config X86_SPARSEMEM_DEBUG_NONUMA + bool "Enable SPARSEMEM on flat systems (debugging only)" + depends on !NUMA + +config ARCH_MEMORY_PROBE def_bool y - depends on NUMA + depends on X86_SPARSEMEM_DEBUG_NONUMA + +config ARCH_SPARSEMEM_DEFAULT + def_bool y + depends on X86_SPARSEMEM_DEBUG_NONUMA config ARCH_SPARSEMEM_ENABLE def_bool y - depends on NUMA + depends on NUMA || X86_SPARSEMEM_DEBUG_NONUMA config ARCH_SELECT_MEMORY_MODEL def_bool y @@ -908,7 +921,7 @@ config BOOT_IOREMAP bool depends on (((X86_SUMMIT || X86_GENERICARCH) && NUMA) || (X86 && EFI)) - default y + default y || SIMULATED_MEM_HOTPLUG config REGPARM bool "Use register arguments (EXPERIMENTAL)" --- linux.orig/arch/i386/kernel/setup.c~E0-for-debugging-page_is_ram_hotplug 2005-07-28 14:29:37.000000000 -0700 +++ linux/arch/i386/kernel/setup.c 2005-07-28 14:29:38.000000000 -0700 @@ -145,6 +145,7 @@ EXPORT_SYMBOL(ist_info); #endif struct e820map e820; +struct e820map bios_e820; extern void early_cpu_init(void); extern void dmi_scan_machine(void); @@ -1120,11 +1121,19 @@ printk(KERN_NOTICE "%ldMB HIGHMEM available.\n", pages_to_mb(highend_pfn - highstart_pfn)); #endif + /* + * This will only work for contiguous memory systems. 
+ * + * Leave the evil #ifdef as a big FIXME until you do + * this properly + */ +#ifdef CONFIG_SPARSEMEM + memory_present(/*node*/0, /*start_pfn*/0, max_pfn); +#endif printk(KERN_NOTICE "%ldMB LOWMEM available.\n", pages_to_mb(max_low_pfn)); setup_bootmem_allocator(); - return max_low_pfn; } @@ -1512,6 +1521,7 @@ else { printk(KERN_INFO "BIOS-provided physical RAM map:\n"); print_memory_map(machine_specific_memory_setup()); + bios_e820 = e820; } copy_edd(); --- linux.orig/arch/i386/kernel/sys_i386.c~AA-PM-22-vm_immovable 2005-07-28 14:30:01.000000000 -0700 +++ linux/arch/i386/kernel/sys_i386.c 2005-07-28 14:30:01.000000000 -0700 @@ -70,7 +70,7 @@ unsigned long prot, unsigned long flags, unsigned long fd, unsigned long pgoff) { - return do_mmap2(addr, len, prot, flags, fd, pgoff); + return do_mmap2(addr, len, prot, flags & ~MAP_IMMOVABLE, fd, pgoff); } /* @@ -101,7 +101,8 @@ if (a.offset & ~PAGE_MASK) goto out; - err = do_mmap2(a.addr, a.len, a.prot, a.flags, a.fd, a.offset >> PAGE_SHIFT); + err = do_mmap2(a.addr, a.len, a.prot, a.flags & ~MAP_IMMOVABLE, + a.fd, a.offset >> PAGE_SHIFT); out: return err; } --- linux.orig/arch/i386/mm/discontig.c~D1-i386-hotplug-functions 2005-07-28 14:29:36.000000000 -0700 +++ linux/arch/i386/mm/discontig.c 2005-07-28 14:29:36.000000000 -0700 @@ -98,7 +98,7 @@ extern unsigned long find_max_low_pfn(void); extern void find_max_pfn(void); -extern void one_highpage_init(struct page *, int, int); +extern void add_one_highpage_init(struct page *, int, int); extern struct e820map e820; extern unsigned long init_pg_tables_end; @@ -416,7 +416,7 @@ if (!pfn_valid(node_pfn)) continue; page = pfn_to_page(node_pfn); - one_highpage_init(page, node_pfn, bad_ppro); + add_one_highpage_init(page, node_pfn, bad_ppro); } } totalram_pages += totalhigh_pages; --- linux.orig/arch/i386/mm/init.c~D1-i386-hotplug-functions 2005-07-28 14:29:36.000000000 -0700 +++ linux/arch/i386/mm/init.c 2005-07-28 14:29:37.000000000 -0700 @@ -27,6 +27,7 @@ #include #include #include +#include #include #include @@ -191,38 +192,42 @@ extern int is_available_memory(efi_memory_desc_t *); -int page_is_ram(unsigned long pagenr) +static int page_is_ram_efi(unsigned long pagenr) { +#ifdef CONFIG_EFI int i; unsigned long addr, end; + efi_memory_desc_t *md; - if (efi_enabled) { - efi_memory_desc_t *md; - - for (i = 0; i < memmap.nr_map; i++) { - md = &memmap.map[i]; - if (!is_available_memory(md)) - continue; - addr = (md->phys_addr+PAGE_SIZE-1) >> PAGE_SHIFT; - end = (md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT)) >> PAGE_SHIFT; - - if ((pagenr >= addr) && (pagenr < end)) - return 1; - } - return 0; + for (i = 0; i < memmap.nr_map; i++) { + md = &memmap.map[i]; + if (!is_available_memory(md)) + continue; + addr = (md->phys_addr+PAGE_SIZE-1) >> PAGE_SHIFT; + end = (md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT)) >> PAGE_SHIFT; + if ((pagenr >= addr) && (pagenr < end)) + return 1; } +#endif /* CONFIG_EFI */ + return 0; +} - for (i = 0; i < e820.nr_map; i++) { +int page_is_ram_e820(unsigned long pagenr, struct e820map *local_e820) +{ + int i; + unsigned long addr, end; - if (e820.map[i].type != E820_RAM) /* not usable memory */ + for (i = 0; i < local_e820->nr_map; i++) { + + if (local_e820->map[i].type != E820_RAM) /* not usable memory */ continue; /* * !!!FIXME!!! Some BIOSen report areas as RAM that * are not. Notably the 640->1Mb area. We need a sanity * check here. 
*/ - addr = (e820.map[i].addr+PAGE_SIZE-1) >> PAGE_SHIFT; - end = (e820.map[i].addr+e820.map[i].size) >> PAGE_SHIFT; + addr = (local_e820->map[i].addr+PAGE_SIZE-1) >> PAGE_SHIFT; + end = (local_e820->map[i].addr+local_e820->map[i].size) >> PAGE_SHIFT; if ((pagenr >= addr) && (pagenr < end)) return 1; } @@ -265,17 +270,45 @@ pkmap_page_table = pte; } -void __init one_highpage_init(struct page *page, int pfn, int bad_ppro) +void __devinit free_new_highpage(struct page *page) +{ + set_page_count(page, 1); + __free_page(page); + totalhigh_pages++; +} + +void __init add_one_highpage_init(struct page *page, int pfn, int bad_ppro) { if (page_is_ram(pfn) && !(bad_ppro && page_kills_ppro(pfn))) { ClearPageReserved(page); - set_page_count(page, 1); - __free_page(page); - totalhigh_pages++; } else SetPageReserved(page); } +int add_one_highpage_hotplug(struct page *page, int pfn) +{ + free_new_highpage(page); + totalram_pages++; +#ifdef CONFIG_FLATMEM + max_mapnr++; +#endif + num_physpages++; + return 0; +} + +/* + * Not currently handling the NUMA case. + * Assuming single node and all memory that + * has been added dynamically that would be + * onlined here is in HIGHMEM + */ +void online_page(struct page *page) +{ + ClearPageReserved(page); + add_one_highpage_hotplug(page, page_to_pfn(page)); +} + + #ifdef CONFIG_NUMA extern void set_highmem_pages_init(int); #else @@ -283,7 +316,7 @@ { int pfn; for (pfn = highstart_pfn; pfn < highend_pfn; pfn++) - one_highpage_init(pfn_to_page(pfn), pfn, bad_ppro); + add_one_highpage_init(pfn_to_page(pfn), pfn, bad_ppro); totalram_pages += totalhigh_pages; } #endif /* CONFIG_FLATMEM */ @@ -614,6 +647,28 @@ #endif } +/* + * this is for the non-NUMA, single node SMP system case. + * Specifically, in the case of x86, we will always add + * memory to the highmem for now. + */ +#ifndef CONFIG_NEED_MULTIPLE_NODES +int add_memory(u64 start, u64 size) +{ + struct pglist_data *pgdata = &contig_page_data; + struct zone *zone = pgdata->node_zones + MAX_NR_ZONES-1; + unsigned long start_pfn = start >> PAGE_SHIFT; + unsigned long nr_pages = size >> PAGE_SHIFT; + + return __add_pages(zone, start_pfn, nr_pages); +} + +int remove_memory(u64 start, u64 size) +{ + return -EINVAL; +} +#endif + kmem_cache_t *pgd_cache; kmem_cache_t *pmd_cache; @@ -694,3 +749,10 @@ } } #endif + +int page_is_ram(unsigned long pagenr) +{ + if (efi_enabled) + return page_is_ram_efi(pagenr); + return page_is_ram_e820(pagenr, &e820); +} --- linux.orig/arch/i386/mm/pgtable.c~C5.2-pgdat_size_lock 2005-07-28 13:50:20.000000000 -0700 +++ linux/arch/i386/mm/pgtable.c 2005-07-28 13:50:20.000000000 -0700 @@ -31,11 +31,13 @@ pg_data_t *pgdat; unsigned long i; struct page_state ps; + unsigned long flags; printk(KERN_INFO "Mem-info:\n"); show_free_areas(); printk(KERN_INFO "Free swap: %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10)); for_each_pgdat(pgdat) { + pgdat_resize_lock(pgdat, &flags); for (i = 0; i < pgdat->node_spanned_pages; ++i) { page = pgdat_page_nr(pgdat, i); total++; @@ -48,6 +50,7 @@ else if (page_count(page)) shared += page_count(page) - 1; } + pgdat_resize_unlock(pgdat, &flags); } printk(KERN_INFO "%d pages of RAM\n", total); printk(KERN_INFO "%d pages of HIGHMEM\n", highmem); --- linux.orig/arch/ia64/Kconfig~B-sparse-180-sparsemem-ia64 2005-07-28 13:50:11.000000000 -0700 +++ linux/arch/ia64/Kconfig 2005-07-28 14:28:23.000000000 -0700 @@ -62,8 +62,6 @@ bool "generic" select NUMA select ACPI_NUMA - select VIRTUAL_MEM_MAP - select DISCONTIGMEM help This selects the system type of your hardware. 
A "generic" kernel will run on any supported IA-64 system. However, if you configure @@ -187,6 +185,7 @@ config VIRTUAL_MEM_MAP bool "Virtual mem map" + depends on !SPARSEMEM default y if !IA64_HP_SIM help Say Y to compile the kernel with support for a virtual mem map. @@ -199,16 +198,6 @@ bool default y if VIRTUAL_MEM_MAP -config ARCH_DISCONTIGMEM_ENABLE - bool "Discontiguous memory support" - depends on (IA64_DIG || IA64_SGI_SN2 || IA64_GENERIC || IA64_HP_ZX1 || IA64_HP_ZX1_SWIOTLB) && NUMA && VIRTUAL_MEM_MAP - default y if (IA64_SGI_SN2 || IA64_GENERIC) && NUMA - help - Say Y to support efficient handling of discontiguous physical memory, - for architectures which are either NUMA (Non-Uniform Memory Access) - or have huge holes in the physical address space for other reasons. - See for more. - config IA64_CYCLONE bool "Cyclone (EXA) Time Source support" help @@ -231,8 +220,10 @@ based on a network adapter and DMA messaging. config FORCE_MAX_ZONEORDER - int - default "18" + int "MAX_ORDER (11 - 20)" if !HUGETLB_PAGE + range 11 20 if !HUGETLB_PAGE + default "18" if HUGETLB_PAGE + default "11" config SMP bool "Symmetric multi-processing support" @@ -297,6 +288,35 @@ source "mm/Kconfig" +config SECTION_BITS + int + depends on SPARSEMEM + range 28 32 if !HUGETLB_PAGE + default "32" if HUGETLB_PAGE + default "28" + help + Size of memory section in bits. + +config PHYSICAL_MEMORY_BITS + int + depends on SPARSEMEM + range 44 50 + default 44 + help + Maximum physical memory address bits. + +config ARCH_SPARSEMEM_ENABLE + def_bool y + depends on NUMA + +config ARCH_DISCONTIGMEM_DEFAULT + def_bool y + depends on NUMA + +config ARCH_DISCONTIGMEM_ENABLE + def_bool y + depends NUMA + config HAVE_DEC_LOCK bool depends on (SMP || PREEMPT) --- linux.orig/arch/ia64/mm/Makefile~B-sparse-180-sparsemem-ia64 2005-07-28 13:50:11.000000000 -0700 +++ linux/arch/ia64/mm/Makefile 2005-07-28 13:50:11.000000000 -0700 @@ -7,6 +7,5 @@ obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o obj-$(CONFIG_NUMA) += numa.o obj-$(CONFIG_DISCONTIGMEM) += discontig.o -ifndef CONFIG_DISCONTIGMEM -obj-y += contig.o -endif +obj-$(CONFIG_SPARSEMEM) += discontig.o +obj-$(CONFIG_FLATMEM) += contig.o --- linux.orig/arch/ia64/mm/discontig.c~B-sparse-180-sparsemem-ia64 2005-07-28 13:50:11.000000000 -0700 +++ linux/arch/ia64/mm/discontig.c 2005-07-28 13:50:20.000000000 -0700 @@ -421,6 +421,21 @@ return; } +#ifdef CONFIG_SPARSEMEM +static int __init register_sparse_mem(unsigned long start, unsigned long end, + void *arg) +{ + int nid; + + start = __pa(start) >> PAGE_SHIFT; + end = __pa(end) >> PAGE_SHIFT; + nid = early_pfn_to_nid(start); + (void) memory_present(nid, start, end); + + return 0; +} +#endif + /** * find_memory - walk the EFI memory map and setup the bootmem allocator * @@ -443,6 +458,9 @@ max_low_pfn = 0; /* These actually end up getting called by call_pernode_memory() */ +#ifdef CONFIG_SPARSEMEM + efi_memmap_walk(register_sparse_mem, (void *) 0); +#endif efi_memmap_walk(filter_rsvd_memory, build_node_maps); efi_memmap_walk(filter_rsvd_memory, find_pernode_space); @@ -524,12 +542,19 @@ show_free_areas(); printk("Free swap: %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10)); for_each_pgdat(pgdat) { - unsigned long present = pgdat->node_present_pages; + unsigned long present; + unsigned long flags; int shared = 0, cached = 0, reserved = 0; + printk("Node ID: %d\n", pgdat->node_id); + pgdat_resize_lock(pgdat, &flags); + present = pgdat->node_present_pages; for(i = 0; i < pgdat->node_spanned_pages; i++) { - struct page *page = 
pgdat_page_nr(pgdat, i); - if (!ia64_pfn_valid(pgdat->node_start_pfn+i)) + unsigned long pfn = pgdat->node_start_pfn + i; + struct page *page; + if (pfn_valid(pfn)) + page = pfn_to_page(pfn); + else continue; if (PageReserved(page)) reserved++; @@ -538,6 +563,7 @@ else if (page_count(page)) shared += page_count(page)-1; } + pgdat_resize_unlock(pgdat, &flags); total_present += present; total_reserved += reserved; total_cached += cached; @@ -648,6 +674,8 @@ max_dma = virt_to_phys((void *) MAX_DMA_ADDRESS) >> PAGE_SHIFT; + sparse_init(); + efi_memmap_walk(filter_rsvd_memory, count_node_pages); vmalloc_end -= PAGE_ALIGN(max_low_pfn * sizeof(struct page)); @@ -687,10 +715,13 @@ (mem_data[node].num_physpages - mem_data[node].num_dma_physpages); } - pfn_offset = mem_data[node].min_pfn; +#ifndef CONFIG_SPARSEMEM NODE_DATA(node)->node_mem_map = vmem_map + pfn_offset; +#endif + + free_area_init_node(node, NODE_DATA(node), zones_size, pfn_offset, zholes_size); } --- linux.orig/arch/ia64/mm/init.c~B-sparse-180-sparsemem-ia64 2005-07-28 13:50:11.000000000 -0700 +++ linux/arch/ia64/mm/init.c 2005-07-28 14:29:37.000000000 -0700 @@ -584,7 +584,7 @@ platform_dma_init(); #endif -#ifndef CONFIG_DISCONTIGMEM +#if !defined(CONFIG_DISCONTIGMEM) && !defined(CONFIG_SPARSEMEM) if (!mem_map) BUG(); max_mapnr = max_low_pfn; @@ -631,3 +631,24 @@ ia32_mem_init(); #endif } + +#ifdef CONFIG_MEMORY_HOTPLUG +void online_page(struct page *page) +{ + ClearPageReserved(page); + set_page_count(page, 1); + __free_page(page); + totalram_pages++; + num_physpages++; +} + +int add_memory(u64 start, u64 size, unsigned long attr) +{ + return -ENOSYS; +} + +int remove_memory(u64 start, u64 size, unsigned long attr) +{ + return -ENOSYS; +} +#endif --- linux.orig/arch/ia64/mm/numa.c~B-sparse-180-sparsemem-ia64 2005-07-28 13:50:11.000000000 -0700 +++ linux/arch/ia64/mm/numa.c 2005-07-28 13:50:11.000000000 -0700 @@ -47,3 +47,26 @@ return (i < num_node_memblks) ? node_memblk[i].nid : (num_node_memblks ? -1 : 0); } + +#if defined(CONFIG_SPARSEMEM) && defined(CONFIG_NUMA) +/* + * Because of holes evaluate on section limits. 
+ */ +int early_pfn_to_nid(unsigned long pfn) +{ + int i, section = pfn >> PFN_SECTION_SHIFT, ssec, esec; + + for (i = 0; i < num_node_memblks; i++) { + ssec = node_memblk[i].start_paddr >> PA_SECTION_SHIFT; + esec = (node_memblk[i].start_paddr + node_memblk[i].size + + ((1L << PA_SECTION_SHIFT) - 1)) >> PA_SECTION_SHIFT; + if (section >= ssec && section < esec) + break; + } + + if (i == num_node_memblks) + return 0; + else + return node_memblk[i].nid; +} +#endif --- linux.orig/arch/m32r/mm/init.c~C5.2-pgdat_size_lock 2005-07-28 13:50:20.000000000 -0700 +++ linux/arch/m32r/mm/init.c 2005-07-28 13:50:20.000000000 -0700 @@ -48,6 +48,8 @@ show_free_areas(); printk("Free swap: %6ldkB\n",nr_swap_pages<<(PAGE_SHIFT-10)); for_each_pgdat(pgdat) { + unsigned long flags; + pgdat_resize_lock(pgdat, &flags); for (i = 0; i < pgdat->node_spanned_pages; ++i) { page = pgdat_page_nr(pgdat, i); total++; @@ -60,6 +62,7 @@ else if (page_count(page)) shared += page_count(page) - 1; } + pgdat_resize_unlock(pgdat, &flags); } printk("%d pages of RAM\n", total); printk("%d pages of HIGHMEM\n",highmem); @@ -150,10 +153,14 @@ int reservedpages, nid, i; reservedpages = 0; - for_each_online_node(nid) + for_each_online_node(nid) { + unsigned long flags; + pgdat_resize_lock(NODE_DATA(nid), &flags); for (i = 0 ; i < MAX_LOW_PFN(nid) - START_PFN(nid) ; i++) if (PageReserved(nid_page_nr(nid, i))) reservedpages++; + pgdat_resize_unlock(NODE_DATA(nid), &flags); + } return reservedpages; } --- linux.orig/arch/parisc/mm/init.c~C5.2-pgdat_size_lock 2005-07-28 13:50:20.000000000 -0700 +++ linux/arch/parisc/mm/init.c 2005-07-28 13:50:20.000000000 -0700 @@ -505,7 +505,9 @@ for (j = node_start_pfn(i); j < node_end_pfn(i); j++) { struct page *p; + unsigned long flags; + pgdat_resize_lock(NODE_DATA(i), &flags); p = nid_page_nr(i, j) - node_start_pfn(i); total++; @@ -517,6 +519,7 @@ free++; else shared += page_count(p) - 1; + pgdat_resize_unlock(NODE_DATA(i), &flags); } } #endif --- linux.orig/arch/ppc64/mm/init.c~C5.2-pgdat_size_lock 2005-07-28 13:50:20.000000000 -0700 +++ linux/arch/ppc64/mm/init.c 2005-07-28 14:29:36.000000000 -0700 @@ -97,6 +97,8 @@ show_free_areas(); printk("Free swap: %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10)); for_each_pgdat(pgdat) { + unsigned long flags; + pgdat_resize_lock(pgdat, &flags); for (i = 0; i < pgdat->node_spanned_pages; i++) { page = pgdat_page_nr(pgdat, i); total++; @@ -107,6 +109,7 @@ else if (page_count(page)) shared += page_count(page) - 1; } + pgdat_resize_unlock(pgdat, &flags); } printk("%ld pages of RAM\n", total); printk("%ld reserved pages\n", reserved); @@ -662,11 +665,14 @@ #endif for_each_pgdat(pgdat) { + unsigned long flags; + pgdat_resize_lock(pgdat, &flags); for (i = 0; i < pgdat->node_spanned_pages; i++) { page = pgdat_page_nr(pgdat, i); if (PageReserved(page)) reservedpages++; } + pgdat_resize_unlock(pgdat, &flags); } codesize = (unsigned long)&_etext - (unsigned long)&_stext; @@ -864,3 +870,80 @@ return vma_prot; } EXPORT_SYMBOL(phys_mem_access_prot); + +#ifdef CONFIG_MEMORY_HOTPLUG + +void online_page(struct page *page) +{ + ClearPageReserved(page); + free_cold_page(page); + totalram_pages++; + num_physpages++; +} + +/* + * This works only for the non-NUMA case. Later, we'll need a lookup + * to convert from real physical addresses to nid, that doesn't use + * pfn_to_nid(). 
+ */ +int __devinit add_memory(u64 start, u64 size) +{ + struct pglist_data *pgdata = NODE_DATA(0); + struct zone *zone; + unsigned long start_pfn = start >> PAGE_SHIFT; + unsigned long nr_pages = size >> PAGE_SHIFT; + + /* this should work for most non-highmem platforms */ + zone = pgdata->node_zones; + + return __add_pages(zone, start_pfn, nr_pages); + + return 0; +} + +/* + * First pass at this code will check to determine if the remove + * request is within the RMO. Do not allow removal within the RMO. + */ +int __devinit remove_memory(u64 start, u64 size) +{ + struct zone *zone; + unsigned long start_pfn, end_pfn, nr_pages; + + start_pfn = start >> PAGE_SHIFT; + nr_pages = size >> PAGE_SHIFT; + end_pfn = start_pfn + nr_pages; + + printk("%s(): Attempting to remove memoy in range " + "%lx to %lx\n", __func__, start, start+size); + /* + * check for range within RMO + */ + zone = page_zone(pfn_to_page(start_pfn)); + + printk("%s(): memory will be removed from " + "the %s zone\n", __func__, zone->name); + + /* + * not handling removing memory ranges that + * overlap multiple zones yet + */ + if (end_pfn > (zone->zone_start_pfn + zone->spanned_pages)) + goto overlap; + + /* make sure it is NOT in RMO */ + if ((start < lmb.rmo_size) || ((start+size) < lmb.rmo_size)) { + printk("%s(): range to be removed must NOT be in RMO!\n", + __func__); + goto in_rmo; + } + + return __remove_pages(zone, start_pfn, nr_pages); + +overlap: + printk("%s(): memory range to be removed overlaps " + "multiple zones!!!\n", __func__); +in_rmo: + return -1; +} +#endif /* CONFIG_MEMORY_HOTPLUG */ --- linux.orig/arch/x86_64/kernel/setup.c~B-sparse-182-x86_64-later-sparse-init 2005-07-28 13:50:12.000000000 -0700 +++ linux/arch/x86_64/kernel/setup.c 2005-07-28 13:50:12.000000000 -0700 @@ -646,8 +646,6 @@ } #endif - sparse_init(); - #ifdef CONFIG_KEXEC if (crashk_res.start != crashk_res.end) { reserve_bootmem(crashk_res.start, --- linux.orig/arch/x86_64/mm/init.c~D3-x86_64-hotplug-functions 2005-07-28 14:29:37.000000000 -0700 +++ linux/arch/x86_64/mm/init.c 2005-07-28 14:29:37.000000000 -0700 @@ -22,6 +22,8 @@ #include #include #include +#include +#include #include #include @@ -179,13 +181,19 @@ {} }; -static __init void *alloc_low_page(int *index, unsigned long *phys) +static __devinit void *alloc_low_page(int *index, unsigned long *phys) { struct temp_map *ti; int i; unsigned long pfn = table_end++, paddr; void *adr; + if (after_bootmem) { + adr = (void *)get_zeroed_page(GFP_ATOMIC); + *phys = __pa(adr); + return adr; + } + if (pfn >= end_pfn) panic("alloc_low_page: ran out of memory"); for (i = 0; temp_mappings[i].allocated; i++) { @@ -198,55 +206,95 @@ ti->allocated = 1; __flush_tlb(); adr = ti->address + ((pfn << PAGE_SHIFT) & ~PMD_MASK); + memset(adr, 0, PAGE_SIZE); *index = i; *phys = pfn * PAGE_SIZE; return adr; } -static __init void unmap_low_page(int i) +static __devinit void unmap_low_page(int i) { - struct temp_map *ti = &temp_mappings[i]; + struct temp_map *ti; + + if (after_bootmem) + return; + ti = &temp_mappings[i]; set_pmd(ti->pmd, __pmd(0)); ti->allocated = 0; } -static void __init phys_pud_init(pud_t *pud, unsigned long address, unsigned long end) + +static void __devinit +phys_pmd_init(pmd_t *pmd, unsigned long address, unsigned long end) +{ + int i; + + printk("%s: pmd: 0x%p, address: 0x%lx end: 0x%lx\n", + __func__, pmd, address, end); + + for (i = 0; i < PTRS_PER_PMD; pmd++, i++, address += PMD_SIZE) { + unsigned long entry; + + if (address > end) { + for (; i < PTRS_PER_PMD; i++, pmd++) + 
set_pmd(pmd, __pmd(0)); + break; + } + entry = _PAGE_NX|_PAGE_PSE | _KERNPG_TABLE | _PAGE_GLOBAL | address; + entry &= __supported_pte_mask; + set_pmd(pmd, __pmd(entry)); + } +} + + +static void __devinit +phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end) +{ + pmd_t *pmd = pmd_offset(pud, (unsigned long)__va(address)); + + printk("%s: addr: 0x%lx end: 0x%lx pmd: 0x%p\n", + __func__, address, end, pmd); + + if (pmd_none(*pmd)) { + spin_lock(&init_mm.page_table_lock); + phys_pmd_init(pmd, address, end); + spin_unlock(&init_mm.page_table_lock); + __flush_tlb_all(); + } +} + + + +static void __devinit phys_pud_init(pud_t *pud, unsigned long address, unsigned long end) { - long i, j; + long i = pud_index(address); - i = pud_index(address); pud = pud + i; + + if (after_bootmem && pud_val(*pud)) { + phys_pmd_update(pud, address, end); + return; + } + for (; i < PTRS_PER_PUD; pud++, i++) { int map; unsigned long paddr, pmd_phys; pmd_t *pmd; - paddr = address + i*PUD_SIZE; - if (paddr >= end) { - for (; i < PTRS_PER_PUD; i++, pud++) - set_pud(pud, __pud(0)); + paddr = (address & PGDIR_MASK) + i*PUD_SIZE; + if (paddr >= end) break; - } - if (!e820_mapped(paddr, paddr+PUD_SIZE, 0)) { + if (!after_bootmem && !e820_mapped(paddr, paddr+PUD_SIZE, 0)) { set_pud(pud, __pud(0)); continue; } pmd = alloc_low_page(&map, &pmd_phys); + if (after_bootmem) spin_lock(&init_mm.page_table_lock); set_pud(pud, __pud(pmd_phys | _KERNPG_TABLE)); - for (j = 0; j < PTRS_PER_PMD; pmd++, j++, paddr += PMD_SIZE) { - unsigned long pe; - - if (paddr >= end) { - for (; j < PTRS_PER_PMD; j++, pmd++) - set_pmd(pmd, __pmd(0)); - break; - } - pe = _PAGE_NX|_PAGE_PSE | _KERNPG_TABLE | _PAGE_GLOBAL | paddr; - pe &= __supported_pte_mask; - set_pmd(pmd, __pmd(pe)); - } + phys_pmd_init(pmd, paddr, end); + if (after_bootmem) spin_unlock(&init_mm.page_table_lock); unmap_low_page(map); } __flush_tlb(); @@ -267,12 +315,16 @@ table_start >>= PAGE_SHIFT; table_end = table_start; + + early_printk("kernel direct mapping tables upto %lx @ %lx-%lx\n", end, + table_start< end) next = end; phys_pud_init(pud, __pa(start), __pa(next)); - set_pgd(pgd_offset_k(start), mk_kernel_pgd(pud_phys)); + if (!after_bootmem) + set_pgd(pgd_offset_k(start), mk_kernel_pgd(pud_phys)); unmap_low_page(map); } - asm volatile("movq %%cr4,%0" : "=r" (mmu_cr4_features)); + if (!after_bootmem) + asm volatile("movq %%cr4,%0" : "=r" (mmu_cr4_features)); __flush_tlb_all(); - early_printk("kernel direct mapping tables upto %lx @ %lx-%lx\n", end, - table_start<node_zones + MAX_NR_ZONES - 2; + unsigned long start_pfn = start >> PAGE_SHIFT; + unsigned long nr_pages = size >> PAGE_SHIFT; + int ret; + + ret = __add_pages(zone, start_pfn, nr_pages, attr); + if (ret) + goto error; + + init_memory_mapping(start, (start + size - 1)); + + return ret; +error: + printk("%s: Problem encountered in __add_pages!\n", __func__); + return ret; +} +EXPORT_SYMBOL(add_memory); + +int remove_memory(u64 start, u64 size, unsigned long attr) +{ + struct zone *zone; + unsigned long start_pfn, end_pfn, nr_pages; + + printk("%s: start: 0x%llx size: 0x%llx attr: 0x%lx\n", + __func__, start, size, attr); + + start_pfn = start >> PAGE_SHIFT; + nr_pages = size >> PAGE_SHIFT; + /* end_pfn is the last *valid* pfn */ + end_pfn = start_pfn + nr_pages - 1; + + zone = page_zone(pfn_to_page(start_pfn)); + + printk("%s: memory will be removed from the %s zone\n", + __func__, zone->name); + printk("%s: start_pfn: 0x%lx nr_pages: 0x%lx end_pfn: 0x%lx\n", + __func__, start_pfn, nr_pages, end_pfn); + 
+ if (zone != page_zone(pfn_to_page(end_pfn))) + goto overlap; + + printk("%s: just before remove pages\n", __func__); + + return __remove_pages(zone, start_pfn, nr_pages, attr); +overlap: + printk("%s: memory range overlaps multiple zones?\n", __func__); + return -ENOSYS; +} +EXPORT_SYMBOL(remove_memory); + +#endif + extern int swiotlb_force; static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel, kcore_modules, --- linux.orig/arch/x86_64/mm/numa.c~B-sparse-182-x86_64-later-sparse-init 2005-07-28 13:50:12.000000000 -0700 +++ linux/arch/x86_64/mm/numa.c 2005-07-28 13:50:12.000000000 -0700 @@ -87,7 +87,6 @@ start_pfn = start >> PAGE_SHIFT; end_pfn = end >> PAGE_SHIFT; - memory_present(nodeid, start_pfn, end_pfn); nodedata_phys = find_e820_area(start, end, pgdat_size); if (nodedata_phys == -1L) panic("Cannot find memory pgdat in node %d\n", nodeid); @@ -271,9 +270,14 @@ void __init paging_init(void) { int i; - for_each_online_node(i) { + + for_each_online_node(i) + memory_present(node_start_pfn(i), node_end_pfn(i)); + + sparse_init(); + + for_each_online_node(i) setup_node_zones(i); - } } /* [numa=off] */ --- linux.orig/drivers/base/Makefile~D0-sysfs-memory-class 2005-07-28 13:50:22.000000000 -0700 +++ linux/drivers/base/Makefile 2005-07-28 13:50:22.000000000 -0700 @@ -7,6 +7,7 @@ obj-y += power/ obj-$(CONFIG_FW_LOADER) += firmware_class.o obj-$(CONFIG_NUMA) += node.o +obj-$(CONFIG_MEMORY_HOTPLUG) += memory.o ifeq ($(CONFIG_DEBUG_DRIVER),y) EXTRA_CFLAGS += -DDEBUG --- linux.orig/drivers/base/init.c~D0-sysfs-memory-class 2005-07-28 13:50:22.000000000 -0700 +++ linux/drivers/base/init.c 2005-07-28 13:50:22.000000000 -0700 @@ -9,6 +9,7 @@ #include #include +#include extern int devices_init(void); extern int buses_init(void); @@ -39,5 +40,6 @@ platform_bus_init(); system_bus_init(); cpu_dev_init(); + memory_dev_init(); attribute_container_init(); } --- /dev/null 2005-03-30 22:36:15.000000000 -0800 +++ linux/drivers/base/memory.c 2005-07-28 14:29:37.000000000 -0700 @@ -0,0 +1,465 @@ +/* + * drivers/base/memory.c - basic Memory class support + * + * Written by Matt Tolentino + * Dave Hansen + * + * This file provides the necessary infrastructure to represent + * a SPARSEMEM-memory-model system's physical memory in /sysfs. + * All arch-independent code that assumes MEMORY_HOTPLUG requires + * SPARSEMEM should be contained here, or in mm/memory_hotplug.c. 
+ */ + +#include +#include +#include +#include /* capable() */ +#include +#include +#include +#include +#include +#include +#include +#include + +#define MEMORY_CLASS_NAME "memory" + +struct sysdev_class memory_sysdev_class = { + set_kset_name(MEMORY_CLASS_NAME), +}; +EXPORT_SYMBOL(memory_sysdev_class); + +static char *memory_hotplug_name(struct kset *kset, struct kobject *kobj) +{ + return MEMORY_CLASS_NAME; +} + +static int memory_hotplug(struct kset *kset, struct kobject *kobj, char **envp, + int num_envp, char *buffer, int buffer_size) +{ + int retval = 0; + + return retval; +} + +static struct kset_hotplug_ops memory_hotplug_ops = { + .name = memory_hotplug_name, + .hotplug = memory_hotplug, +}; + +static struct notifier_block *memory_chain; + +int register_memory_notifier(struct notifier_block *nb) +{ + return notifier_chain_register(&memory_chain, nb); +} + +void unregister_memory_notifier(struct notifier_block *nb) +{ + notifier_chain_unregister(&memory_chain, nb); +} + +/* + * register_memory - Setup a sysfs device for a memory block + */ +int +register_memory(struct memory_block *memory, struct mem_section *section, + struct node *root) +{ + int error; + + memory->sysdev.cls = &memory_sysdev_class; + memory->sysdev.id = __section_nr(section); + + error = sysdev_register(&memory->sysdev); + + if (root && !error) + error = sysfs_create_link(&root->sysdev.kobj, + &memory->sysdev.kobj, + kobject_name(&memory->sysdev.kobj)); + + return error; +} + +void +unregister_memory(struct memory_block *memory, struct mem_section *section, + struct node *root) +{ + BUG_ON(memory->sysdev.cls != &memory_sysdev_class); + BUG_ON(memory->sysdev.id != __section_nr(section)); + + sysdev_unregister(&memory->sysdev); + if (root) + sysfs_remove_link(&root->sysdev.kobj, + kobject_name(&memory->sysdev.kobj)); +} + +/* + * use this as the physical section index that this memsection + * uses. + */ + +static ssize_t show_mem_phys_index(struct sys_device *dev, char *buf) +{ + struct memory_block *mem = + container_of(dev, struct memory_block, sysdev); + return sprintf(buf, "%08lx\n", mem->phys_index); +} + +/* + * online, offline, going offline, etc. + */ +static ssize_t show_mem_state(struct sys_device *dev, char *buf) +{ + struct memory_block *mem = + container_of(dev, struct memory_block, sysdev); + ssize_t len = 0; + + /* + * We can probably put these states in a nice little array + * so that they're not open-coded + */ + switch (mem->state) { + case MEM_ONLINE: + len = sprintf(buf, "online\n"); + break; + case MEM_OFFLINE: + len = sprintf(buf, "offline\n"); + break; + case MEM_GOING_OFFLINE: + len = sprintf(buf, "going-offline\n"); + break; + default: + len = sprintf(buf, "ERROR-UNKNOWN-%ld\n", + mem->state); + WARN_ON(1); + break; + } + + return len; +} + +static inline int memory_notify(unsigned long val, void *v) +{ + return notifier_call_chain(&memory_chain, val, v); +} + +/* + * MEMORY_HOTPLUG depends on SPARSEMEM in mm/Kconfig, so it is + * OK to have direct references to sparsemem variables in here. + */ +static int +memory_block_action(struct memory_block *mem, unsigned long action) +{ + int i; + unsigned long psection; + unsigned long start_pfn, start_paddr; + struct page *first_page; + int ret; + int old_state = mem->state; + + psection = mem->phys_index; + first_page = pfn_to_page(psection << PFN_SECTION_SHIFT); + + /* + * The probe routines leave the pages reserved, just + * as the bootmem code does. Make sure they're still + * that way. 
+ */ + if (action == MEM_ONLINE) { + for (i = 0; i < PAGES_PER_SECTION; i++) { + if (PageReserved(first_page+i)) + continue; + + printk(KERN_WARNING "section number %ld page number %d " + "not reserved, was it already online? \n", + psection, i); + return -EBUSY; + } + } + + switch (action) { + case MEM_ONLINE: + start_pfn = page_to_pfn(first_page); + ret = online_pages(start_pfn, PAGES_PER_SECTION); + break; + case MEM_OFFLINE: + mem->state = MEM_GOING_OFFLINE; + memory_notify(MEM_GOING_OFFLINE, NULL); + start_paddr = page_to_pfn(first_page) << PAGE_SHIFT; + ret = remove_memory(start_paddr, + PAGES_PER_SECTION << PAGE_SHIFT); + if (ret) { + mem->state = old_state; + break; + } + memory_notify(MEM_MAPPING_INVALID, NULL); + break; + default: + printk(KERN_WARNING "%s(%p, %ld) unknown action: %ld\n", + __FUNCTION__, mem, action, action); + WARN_ON(1); + ret = -EINVAL; + } + /* + * For now, only notify on successful memory operations + */ + if (!ret) + memory_notify(action, NULL); + + return ret; +} + +static int memory_block_change_state(struct memory_block *mem, + unsigned long to_state, unsigned long from_state_req) +{ + int ret = 0; + down(&mem->state_sem); + + if (mem->state != from_state_req) { + ret = -EINVAL; + goto out; + } + + ret = memory_block_action(mem, to_state); + if (!ret) + mem->state = to_state; + +out: + up(&mem->state_sem); + return ret; +} + +static ssize_t +store_mem_state(struct sys_device *dev, const char *buf, size_t count) +{ + struct memory_block *mem; + unsigned int phys_section_nr; + int ret = -EINVAL; + + mem = container_of(dev, struct memory_block, sysdev); + phys_section_nr = mem->phys_index; + + if (!valid_section_nr(phys_section_nr)) + goto out; + + if (!strncmp(buf, "online", min((int)count, 6))) + ret = memory_block_change_state(mem, MEM_ONLINE, MEM_OFFLINE); + else if(!strncmp(buf, "offline", min((int)count, 7))) + ret = memory_block_change_state(mem, MEM_OFFLINE, MEM_ONLINE); +out: + if (ret) + return ret; + return count; +} + +/* + * phys_device is a bad name for this. What I really want + * is a way to differentiate between memory ranges that + * are part of physical devices that constitute + * a complete removable unit or fru. + * i.e. do these ranges belong to the same physical device, + * s.t. if I offline all of these sections I can then + * remove the physical device? + */ +static ssize_t show_phys_device(struct sys_device *dev, char *buf) +{ + struct memory_block *mem = + container_of(dev, struct memory_block, sysdev); + return sprintf(buf, "%d\n", mem->phys_device); +} + +SYSDEV_ATTR(phys_index, 0444, show_mem_phys_index, NULL); +SYSDEV_ATTR(state, 0644, show_mem_state, store_mem_state); +SYSDEV_ATTR(phys_device, 0444, show_phys_device, NULL); + +#define mem_create_simple_file(mem, attr_name) \ + sysdev_create_file(&mem->sysdev, &attr_##attr_name) +#define mem_remove_simple_file(mem, attr_name) \ + sysdev_remove_file(&mem->sysdev, &attr_##attr_name) + +/* + * Block size attribute stuff + */ +static ssize_t +print_block_size(struct class *class, char *buf) +{ + return sprintf(buf, "%lx\n", (unsigned long)PAGES_PER_SECTION * PAGE_SIZE); +} + +static CLASS_ATTR(block_size_bytes, 0444, print_block_size, NULL); + +static int block_size_init(void) +{ + sysfs_create_file(&memory_sysdev_class.kset.kobj, + &class_attr_block_size_bytes.attr); + return 0; +} + +/* + * Some architectures will have custom drivers to do this, and + * will not need to do it from userspace. 
The fake hot-add code + * as well as ppc64 will do all of their discovery in userspace + * and will require this interface. + */ +extern int page_is_hotpluggable_ram(unsigned long pfn); +#ifdef CONFIG_ARCH_MEMORY_PROBE +static ssize_t +memory_probe_store(struct class *class, const char __user *buf, size_t count) +{ + u64 phys_addr; + unsigned long offset; + int ret; + + phys_addr = simple_strtoull(buf, NULL, 0); + + for (offset = 0; offset < PAGES_PER_SECTION; offset++) { + unsigned long page_nr = (phys_addr >> PAGE_SHIFT) + offset; + if (page_is_hotpluggable_ram(page_nr)) + break; + } + if (offset == PAGES_PER_SECTION) + return -EINVAL; + + ret = add_memory(phys_addr, PAGES_PER_SECTION << PAGE_SHIFT); + + if (ret) + count = ret; + + return count; +} +static CLASS_ATTR(probe, 0700, NULL, memory_probe_store); + +static int memory_probe_init(void) +{ + sysfs_create_file(&memory_sysdev_class.kset.kobj, + &class_attr_probe.attr); + return 0; +} +#else +#define memory_probe_init(...) do {} while (0) +#endif + +/* + * Note that phys_device is optional. It is here to allow for + * differentiation between which *physical* devices each + * section belongs to... + */ + +int add_memory_block(unsigned long node_id, struct mem_section *section, + unsigned long state, int phys_device) +{ + size_t size = sizeof(struct memory_block); + struct memory_block *mem = kmalloc(size, GFP_KERNEL); + int ret = 0; + + if (!mem) + return -ENOMEM; + + memset(mem, 0, size); + + mem->phys_index = __section_nr(section); + mem->state = state; + init_MUTEX(&mem->state_sem); + mem->phys_device = phys_device; + + ret = register_memory(mem, section, NULL); + if (!ret) + ret = mem_create_simple_file(mem, phys_index); + if (!ret) + ret = mem_create_simple_file(mem, state); + if (!ret) + ret = mem_create_simple_file(mem, phys_device); + + return ret; +} + +/* + * For now, we have a linear search to go find the appropriate + * memory_block corresponding to a particular phys_index. If + * this gets to be a real problem, we can always use a radix + * tree or something here. + * + * This could be made generic for all sysdev classes. + */ +struct memory_block *find_memory_block(struct mem_section *section) +{ + struct kobject *kobj; + struct sys_device *sysdev; + struct memory_block *mem; + char name[sizeof(MEMORY_CLASS_NAME) + 9 + 1]; + + /* + * This only works because we know that section == sysdev->id + * slightly redundant with sysdev_register() + */ + sprintf(&name[0], "%s%d", MEMORY_CLASS_NAME, __section_nr(section)); + + kobj = kset_find_obj(&memory_sysdev_class.kset, name); + if (!kobj) + return NULL; + + sysdev = container_of(kobj, struct sys_device, kobj); + mem = container_of(sysdev, struct memory_block, sysdev); + + return mem; +} + +int remove_memory_block(unsigned long node_id, struct mem_section *section, + int phys_device) +{ + struct memory_block *mem; + + mem = find_memory_block(section); + mem_remove_simple_file(mem, phys_index); + mem_remove_simple_file(mem, state); + mem_remove_simple_file(mem, phys_device); + unregister_memory(mem, section, NULL); + + return 0; +} + +/* + * need an interface for the VM to add new memory regions, + * but without onlining it. + */ +int register_new_memory(struct mem_section *section) +{ + return add_memory_block(0, section, MEM_OFFLINE, 0); +} + +int unregister_memory_section(struct mem_section *section) +{ + if (!valid_section(section)) + return -EINVAL; + + return remove_memory_block(0, section, 0); +} + +/* + * Initialize the sysfs support for memory devices... 
+ */ +int __init memory_dev_init(void) +{ + unsigned int i; + int ret; + + memory_sysdev_class.kset.hotplug_ops = &memory_hotplug_ops; + ret = sysdev_class_register(&memory_sysdev_class); + + /* + * Create entries for memory sections that were found + * during boot and have been initialized + */ + for (i = 0; i < NR_MEM_SECTIONS; i++) { + if (!valid_section_nr(i)) + continue; + add_memory_block(0, __nr_to_section(i), MEM_ONLINE, 0); + } + + memory_probe_init(); + block_size_init(); + + return ret; +} --- linux.orig/fs/aio.c~AA-PM-24-aio 2005-07-28 14:30:02.000000000 -0700 +++ linux/fs/aio.c 2005-07-28 14:30:02.000000000 -0700 @@ -131,7 +131,8 @@ dprintk("attempting mmap of %lu bytes\n", info->mmap_size); down_write(&ctx->mm->mmap_sem); info->mmap_base = do_mmap(NULL, 0, info->mmap_size, - PROT_READ|PROT_WRITE, MAP_ANON|MAP_PRIVATE, + PROT_READ|PROT_WRITE, + MAP_ANON|MAP_PRIVATE|MAP_IMMOVABLE, 0); if (IS_ERR((void *)info->mmap_base)) { up_write(&ctx->mm->mmap_sem); --- linux.orig/fs/buffer.c~AA-PM-20.0-nowriteback 2005-07-28 14:29:59.000000000 -0700 +++ linux/fs/buffer.c 2005-07-28 14:29:59.000000000 -0700 @@ -3009,6 +3009,50 @@ return 0; } +void +generic_move_buffer(struct page *page, struct page *newpage) +{ + struct buffer_head *bh, *head; + + spin_lock(&page->mapping->private_lock); + bh = head = page_buffers(page); + do { + get_bh(bh); + lock_buffer(bh); + } while ((bh = bh->b_this_page) != head); + + newpage->private = page->private; + page->private = 0; + page_cache_release(page); + page_cache_get(newpage); + + /* XXX */ + ClearPagePrivate(page); + SetPagePrivate(newpage); + + bh = head; + do { + BUG_ON(bh->b_page != page); + set_bh_page(bh, newpage, (unsigned long)bh->b_data & (PAGE_SIZE - 1)); + } while ((bh = bh->b_this_page) != head); + spin_unlock(&page->mapping->private_lock); + /* buffers are unlocked when remapping is complete */ +} + +void +unlock_page_buffer(struct page *page) +{ + struct buffer_head *bh, *head; + + spin_lock(&page->mapping->private_lock); + bh = head = page_buffers(page); + do { + put_bh(bh); + unlock_buffer(bh); + } while ((bh = bh->b_this_page) != head); + spin_unlock(&page->mapping->private_lock); +} + /* * Buffer-head allocation */ @@ -3133,6 +3177,7 @@ EXPORT_SYMBOL(generic_block_bmap); EXPORT_SYMBOL(generic_commit_write); EXPORT_SYMBOL(generic_cont_expand); +EXPORT_SYMBOL(generic_move_buffer); EXPORT_SYMBOL(init_buffer); EXPORT_SYMBOL(invalidate_bdev); EXPORT_SYMBOL(ll_rw_block); --- linux.orig/fs/ext2/inode.c~AA-PM-21-nowriteback-ext2 2005-07-28 14:30:00.000000000 -0700 +++ linux/fs/ext2/inode.c 2005-07-28 14:30:00.000000000 -0700 @@ -31,6 +31,7 @@ #include #include #include +#include #include "ext2.h" #include "acl.h" #include "xip.h" @@ -690,6 +691,12 @@ return mpage_writepages(mapping, wbc, ext2_get_block); } +static int +ext2_migrate_page(struct page *from, struct page *to) +{ + return generic_migrate_page(from, to, migrate_page_buffer); +} + struct address_space_operations ext2_aops = { .readpage = ext2_readpage, .readpages = ext2_readpages, @@ -700,6 +707,7 @@ .bmap = ext2_bmap, .direct_IO = ext2_direct_IO, .writepages = ext2_writepages, + .migrate_page = ext2_migrate_page, }; struct address_space_operations ext2_aops_xip = { @@ -717,6 +725,7 @@ .bmap = ext2_bmap, .direct_IO = ext2_direct_IO, .writepages = ext2_writepages, + .migrate_page = ext2_migrate_page, }; /* --- linux.orig/fs/ext3/inode.c~AA-PM-21-nowriteback-ext3 2005-07-28 14:30:00.000000000 -0700 +++ linux/fs/ext3/inode.c 2005-07-28 14:30:00.000000000 -0700 @@ -35,6 +35,7 @@ #include 
#include #include +#include #include #include "xattr.h" #include "acl.h" @@ -1537,6 +1538,12 @@ return __set_page_dirty_nobuffers(page); } +static int +ext3_migrate_page(struct page *from, struct page *to) +{ + return generic_migrate_page(from, to, migrate_page_buffer); +} + static struct address_space_operations ext3_ordered_aops = { .readpage = ext3_readpage, .readpages = ext3_readpages, @@ -1548,6 +1555,7 @@ .invalidatepage = ext3_invalidatepage, .releasepage = ext3_releasepage, .direct_IO = ext3_direct_IO, + .migrate_page = ext3_migrate_page, }; static struct address_space_operations ext3_writeback_aops = { @@ -1561,6 +1569,7 @@ .invalidatepage = ext3_invalidatepage, .releasepage = ext3_releasepage, .direct_IO = ext3_direct_IO, + .migrate_page = ext3_migrate_page, }; static struct address_space_operations ext3_journalled_aops = { @@ -1574,6 +1583,7 @@ .bmap = ext3_bmap, .invalidatepage = ext3_invalidatepage, .releasepage = ext3_releasepage, + .migrate_page = ext3_migrate_page, }; void ext3_set_aops(struct inode *inode) --- linux.orig/fs/namei.c~AA-PM-27-symlink 2005-07-28 14:30:03.000000000 -0700 +++ linux/fs/namei.c 2005-07-28 14:30:03.000000000 -0700 @@ -2419,10 +2419,19 @@ int page_symlink(struct inode *inode, const char *symname, int len) { struct address_space *mapping = inode->i_mapping; - struct page *page = grab_cache_page(mapping, 0); + struct page *page; int err = -ENOMEM; char *kaddr; + /* XXXX: + * This is temporary code. This code should be replaced with proper one + * After the scheme to specify hot removable memory region has defined. + * Or remove this code if pages for symlink files become hot-pluggable. + * 5/Oct/2004 -- taka + */ + mapping_set_gfp_mask(mapping, mapping_gfp_mask(mapping) & ~__GFP_HIGHMEM); + + page = grab_cache_page(mapping, 0); if (!page) goto fail; err = mapping->a_ops->prepare_write(NULL, page, 0, len-1); --- linux.orig/include/asm-alpha/mman.h~AA-PM-98-MAP_IMMOVABLE-lots-o-arches 2005-07-28 14:30:06.000000000 -0700 +++ linux/include/asm-alpha/mman.h 2005-07-28 14:30:06.000000000 -0700 @@ -28,6 +28,7 @@ #define MAP_NORESERVE 0x10000 /* don't check for reservations */ #define MAP_POPULATE 0x20000 /* populate (prefault) pagetables */ #define MAP_NONBLOCK 0x40000 /* do not block on IO */ +#define MAP_IMMOVABLE 0x80000 #define MS_ASYNC 1 /* sync memory asynchronously */ #define MS_SYNC 2 /* synchronous memory sync */ --- linux.orig/include/asm-arm/mman.h~AA-PM-98-MAP_IMMOVABLE-lots-o-arches 2005-07-28 14:30:06.000000000 -0700 +++ linux/include/asm-arm/mman.h 2005-07-28 14:30:06.000000000 -0700 @@ -22,6 +22,7 @@ #define MAP_NORESERVE 0x4000 /* don't check for reservations */ #define MAP_POPULATE 0x8000 /* populate (prefault) page tables */ #define MAP_NONBLOCK 0x10000 /* do not block on IO */ +#define MAP_IMMOVABLE 0x20000 #define MS_ASYNC 1 /* sync memory asynchronously */ #define MS_INVALIDATE 2 /* invalidate the caches */ --- linux.orig/include/asm-arm26/mman.h~AA-PM-98-MAP_IMMOVABLE-lots-o-arches 2005-07-28 14:30:06.000000000 -0700 +++ linux/include/asm-arm26/mman.h 2005-07-28 14:30:06.000000000 -0700 @@ -22,6 +22,7 @@ #define MAP_NORESERVE 0x4000 /* don't check for reservations */ #define MAP_POPULATE 0x8000 /* populate (prefault) page tables */ #define MAP_NONBLOCK 0x10000 /* do not block on IO */ +#define MAP_IMMOVABLE 0x20000 #define MS_ASYNC 1 /* sync memory asynchronously */ #define MS_INVALIDATE 2 /* invalidate the caches */ --- linux.orig/include/asm-i386/mman.h~AA-PM-22-vm_immovable 2005-07-28 14:30:01.000000000 -0700 +++ 
linux/include/asm-i386/mman.h 2005-07-28 14:30:01.000000000 -0700 @@ -22,6 +22,7 @@ #define MAP_NORESERVE 0x4000 /* don't check for reservations */ #define MAP_POPULATE 0x8000 /* populate (prefault) pagetables */ #define MAP_NONBLOCK 0x10000 /* do not block on IO */ +#define MAP_IMMOVABLE 0x20000 #define MS_ASYNC 1 /* sync memory asynchronously */ #define MS_INVALIDATE 2 /* invalidate the caches */ --- linux.orig/include/asm-i386/mmzone.h~C0-kill-local_mapnr 2005-07-28 13:50:17.000000000 -0700 +++ linux/include/asm-i386/mmzone.h 2005-07-28 13:50:17.000000000 -0700 @@ -88,12 +88,6 @@ __pgdat->node_start_pfn + __pgdat->node_spanned_pages; \ }) -#define local_mapnr(kvaddr) \ -({ \ - unsigned long __pfn = __pa(kvaddr) >> PAGE_SHIFT; \ - (__pfn - node_start_pfn(pfn_to_nid(__pfn))); \ -}) - /* XXX: FIXME -- wli */ #define kern_addr_valid(kaddr) (0) --- linux.orig/include/asm-i386/sparsemem.h~E3-for-debugging-more-FLAGS_RESERVED 2005-07-28 14:29:38.000000000 -0700 +++ linux/include/asm-i386/sparsemem.h 2005-07-28 14:29:38.000000000 -0700 @@ -15,7 +15,7 @@ * MAX_PHYSMEM_BITS 2^N: how much memory we can have in that space */ #ifdef CONFIG_X86_PAE -#define SECTION_SIZE_BITS 30 +#define SECTION_SIZE_BITS 28 #define MAX_PHYSADDR_BITS 36 #define MAX_PHYSMEM_BITS 36 #else --- linux.orig/include/asm-ia64/meminit.h~B-sparse-180-sparsemem-ia64 2005-07-28 13:50:11.000000000 -0700 +++ linux/include/asm-ia64/meminit.h 2005-07-28 13:50:11.000000000 -0700 @@ -41,7 +41,7 @@ #define GRANULEROUNDUP(n) (((n)+IA64_GRANULE_SIZE-1) & ~(IA64_GRANULE_SIZE-1)) #define ORDERROUNDDOWN(n) ((n) & ~((PAGE_SIZE< #include -#ifdef CONFIG_DISCONTIGMEM +#ifdef CONFIG_NUMA static inline int pfn_to_nid(unsigned long pfn) { @@ -39,8 +39,8 @@ # define NR_NODE_MEMBLKS (MAX_NUMNODES * 4) #endif -#else /* CONFIG_DISCONTIGMEM */ +#else /* CONFIG_NUMA */ # define NR_NODE_MEMBLKS (MAX_NUMNODES * 4) -#endif /* CONFIG_DISCONTIGMEM */ +#endif /* CONFIG_NUMA */ #endif /* _ASM_IA64_MMZONE_H */ --- linux.orig/include/asm-ia64/nodedata.h~B-sparse-180-sparsemem-ia64 2005-07-28 13:50:11.000000000 -0700 +++ linux/include/asm-ia64/nodedata.h 2005-07-28 13:50:11.000000000 -0700 @@ -17,7 +17,7 @@ #include #include -#ifdef CONFIG_DISCONTIGMEM +#ifdef CONFIG_NUMA /* * Node Data. One of these structures is located on each node of a NUMA system. 
@@ -47,6 +47,6 @@ */ #define NODE_DATA(nid) (local_node_data->pg_data_ptrs[nid]) -#endif /* CONFIG_DISCONTIGMEM */ +#endif /* CONFIG_NUMA */ #endif /* _ASM_IA64_NODEDATA_H */ --- linux.orig/include/asm-ia64/page.h~B-sparse-180-sparsemem-ia64 2005-07-28 13:50:11.000000000 -0700 +++ linux/include/asm-ia64/page.h 2005-07-28 13:50:11.000000000 -0700 @@ -88,17 +88,17 @@ #define virt_addr_valid(kaddr) pfn_valid(__pa(kaddr) >> PAGE_SHIFT) -#ifdef CONFIG_VIRTUAL_MEM_MAP +#ifdef CONFIG_VIRTUAL_MEM_MAP extern int ia64_pfn_valid (unsigned long pfn); -#else +#elif CONFIG_FLATMEM # define ia64_pfn_valid(pfn) 1 #endif -#ifndef CONFIG_DISCONTIGMEM +#ifdef CONFIG_FLATMEM # define pfn_valid(pfn) (((pfn) < max_mapnr) && ia64_pfn_valid(pfn)) # define page_to_pfn(page) ((unsigned long) (page - mem_map)) # define pfn_to_page(pfn) (mem_map + (pfn)) -#else +#elif CONFIG_DISCONTIGMEM extern struct page *vmem_map; extern unsigned long max_low_pfn; # define pfn_valid(pfn) (((pfn) < max_low_pfn) && ia64_pfn_valid(pfn)) @@ -106,6 +106,10 @@ # define pfn_to_page(pfn) (vmem_map + (pfn)) #endif +#if defined(CONFIG_NUMA) && defined(CONFIG_SPARSEMEM) +extern int early_pfn_to_nid(unsigned long pfn); +#endif + #define page_to_phys(page) (page_to_pfn(page) << PAGE_SHIFT) #define virt_to_page(kaddr) pfn_to_page(__pa(kaddr) >> PAGE_SHIFT) @@ -124,8 +128,11 @@ * expressed in this way to ensure they result in a single "dep" * instruction. */ -#define __pa(x) ({ia64_va _v; _v.l = (long) (x); _v.f.reg = 0; _v.l;}) -#define __va(x) ({ia64_va _v; _v.l = (long) (x); _v.f.reg = -1; _v.p;}) +#define __boot_pa(x) ({ia64_va _v; _v.l = (long) (x); _v.f.reg = 0; _v.l;}) +#define __boot_va(x) ({ia64_va _v; _v.l = (long) (x); _v.f.reg = -1; _v.p;}) +#define __pa(x) __boot_pa(x) +#define __va(x) __boot_va(x) +#define pfn_to_kaddr(pfn) __va((pfn) << PAGE_SHIFT) #define REGION_NUMBER(x) ({ia64_va _v; _v.l = (long) (x); _v.f.reg;}) #define REGION_OFFSET(x) ({ia64_va _v; _v.l = (long) (x); _v.f.off;}) --- /dev/null 2005-03-30 22:36:15.000000000 -0800 +++ linux/include/asm-ia64/sparsemem.h 2005-07-28 13:50:11.000000000 -0700 @@ -0,0 +1,32 @@ +#ifndef _ASM_IA64_SPARSEMEM_H +#define _ASM_IA64_SPARSEMEM_H + +#ifdef CONFIG_SPARSEMEM + /* + * SECTION_SIZE_BITS 2^N: how big each section will be + * MAX_PHYSADDR_BITS 2^N: how much physical address space we have + * MAX_PHYSMEM_BITS 2^N: how much memory we can have in that space + */ + +#define SECTION_SIZE_BITS CONFIG_SECTION_BITS + +/* + * If FORCE_MAX_ORDER is used, then check and possibly enforce the boundary + * condition on SECTION_SIZE_BITS's magnitude. 
+ */ +#ifdef CONFIG_FORCE_MAX_ZONEORDER +#if ((CONFIG_FORCE_MAX_ZONEORDER+PAGE_SHIFT) > SECTION_SIZE_BITS) +#undef SECTION_SIZE_BITS +#define SECTION_SIZE_BITS (CONFIG_FORCE_MAX_ZONEORDER+PAGE_SHIFT) +#endif +#endif + +#define MAX_PHYSADDR_BITS CONFIG_PHYSICAL_MEMORY_BITS +#define MAX_PHYSMEM_BITS CONFIG_PHYSICAL_MEMORY_BITS + +/* until we think of something better */ +#define page_is_ram(pfn) 1 + +#endif /* CONFIG_SPARSEMEM */ + +#endif /* _ASM_IA64_SPARSEMEM_H */ --- linux.orig/include/asm-m32r/mmzone.h~C0-kill-local_mapnr 2005-07-28 13:50:17.000000000 -0700 +++ linux/include/asm-m32r/mmzone.h 2005-07-28 13:50:17.000000000 -0700 @@ -21,12 +21,6 @@ __pgdat->node_start_pfn + __pgdat->node_spanned_pages - 1; \ }) -#define local_mapnr(kvaddr) \ -({ \ - unsigned long __pfn = __pa(kvaddr) >> PAGE_SHIFT; \ - (__pfn - node_start_pfn(pfn_to_nid(__pfn))); \ -}) - #define pfn_to_page(pfn) \ ({ \ unsigned long __pfn = pfn; \ --- linux.orig/include/asm-parisc/mmzone.h~C0-kill-local_mapnr 2005-07-28 13:50:17.000000000 -0700 +++ linux/include/asm-parisc/mmzone.h 2005-07-28 13:50:17.000000000 -0700 @@ -27,12 +27,6 @@ }) #define node_localnr(pfn, nid) ((pfn) - node_start_pfn(nid)) -#define local_mapnr(kvaddr) \ -({ \ - unsigned long __pfn = __pa(kvaddr) >> PAGE_SHIFT; \ - (__pfn - node_start_pfn(pfn_to_nid(__pfn))); \ -}) - #define pfn_to_page(pfn) \ ({ \ unsigned long __pfn = (pfn); \ --- linux.orig/include/asm-ppc64/abs_addr.h~D2-ppc64-hotplug-functions 2005-07-28 14:29:36.000000000 -0700 +++ linux/include/asm-ppc64/abs_addr.h 2005-07-28 14:29:36.000000000 -0700 @@ -104,5 +104,7 @@ /* Convenience macros */ #define virt_to_abs(va) phys_to_abs(__pa(va)) #define abs_to_virt(aa) __va(abs_to_phys(aa)) +#define boot_virt_to_abs(va) phys_to_abs(__boot_pa(va)) +#define boot_abs_to_virt(aa) __boot_va(abs_to_phys(aa)) #endif /* _ABS_ADDR_H */ --- linux.orig/include/asm-ppc64/mman.h~AA-PM-22-vm_immovable-ppc64 2005-07-28 14:30:01.000000000 -0700 +++ linux/include/asm-ppc64/mman.h 2005-07-28 14:30:01.000000000 -0700 @@ -38,6 +38,7 @@ #define MAP_POPULATE 0x8000 /* populate (prefault) pagetables */ #define MAP_NONBLOCK 0x10000 /* do not block on IO */ +#define MAP_IMMOVABLE 0x20000 #define MADV_NORMAL 0x0 /* default page-in behavior */ #define MADV_RANDOM 0x1 /* page-in minimum required */ --- linux.orig/include/asm-ppc64/mmzone.h~C0-kill-local_mapnr 2005-07-28 13:50:17.000000000 -0700 +++ linux/include/asm-ppc64/mmzone.h 2005-07-28 13:50:17.000000000 -0700 @@ -67,9 +67,6 @@ #define node_start_pfn(nid) (NODE_DATA(nid)->node_start_pfn) #define node_end_pfn(nid) (NODE_DATA(nid)->node_end_pfn) -#define local_mapnr(kvaddr) \ - ( (__pa(kvaddr) >> PAGE_SHIFT) - node_start_pfn(kvaddr_to_nid(kvaddr)) - #ifdef CONFIG_DISCONTIGMEM /* --- linux.orig/include/asm-x86_64/mman.h~D3-x86_64-hotplug-functions 2005-07-28 14:29:37.000000000 -0700 +++ linux/include/asm-x86_64/mman.h 2005-07-28 14:30:06.000000000 -0700 @@ -23,6 +23,7 @@ #define MAP_NORESERVE 0x4000 /* don't check for reservations */ #define MAP_POPULATE 0x8000 /* populate (prefault) pagetables */ #define MAP_NONBLOCK 0x10000 /* do not block on IO */ +#define MAP_IMMOVABLE 0x20000 #define MS_ASYNC 1 /* sync memory asynchronously */ #define MS_INVALIDATE 2 /* invalidate the caches */ --- linux.orig/include/asm-x86_64/mmzone.h~C0-kill-local_mapnr 2005-07-28 13:50:17.000000000 -0700 +++ linux/include/asm-x86_64/mmzone.h 2005-07-28 13:50:17.000000000 -0700 @@ -38,8 +38,6 @@ #ifdef CONFIG_DISCONTIGMEM -#define pfn_to_nid(pfn) phys_to_nid((unsigned long)(pfn) << PAGE_SHIFT) 
-#define kvaddr_to_nid(kaddr) phys_to_nid(__pa(kaddr)) /* AK: this currently doesn't deal with invalid addresses. We'll see if the 2.5 kernel doesn't pass them --- linux.orig/include/linux/buffer_head.h~AA-PM-20.0-nowriteback 2005-07-28 14:29:59.000000000 -0700 +++ linux/include/linux/buffer_head.h 2005-07-28 14:29:59.000000000 -0700 @@ -208,7 +208,8 @@ int nobh_truncate_page(struct address_space *, loff_t); int nobh_writepage(struct page *page, get_block_t *get_block, struct writeback_control *wbc); - +void generic_move_buffer(struct page *, struct page *); +void unlock_page_buffer(struct page *); /* * inline definitions --- linux.orig/include/linux/fs.h~AA-PM-13.1-migrate_page-operation 2005-07-28 14:29:54.000000000 -0700 +++ linux/include/linux/fs.h 2005-07-28 14:29:54.000000000 -0700 @@ -333,6 +333,7 @@ loff_t offset, unsigned long nr_segs); struct page* (*get_xip_page)(struct address_space *, sector_t, int); + int (*migrate_page)(struct page *, struct page *); }; struct backing_dev_info; --- /dev/null 2005-03-30 22:36:15.000000000 -0800 +++ linux/include/linux/memory.h 2005-07-28 14:29:39.000000000 -0700 @@ -0,0 +1,95 @@ +/* + * include/linux/memory.h - generic memory definition + * + * This is mainly for topological representation. We define the + * basic "struct memory_block" here, which can be embedded in per-arch + * definitions or NUMA information. + * + * Basic handling of the devices is done in drivers/base/memory.c + * and system devices are handled in drivers/base/sys.c. + * + * Memory block are exported via sysfs in the class/memory/devices/ + * directory. + * + */ +#ifndef _LINUX_MEMORY_H_ +#define _LINUX_MEMORY_H_ + +#include +#include +#include + +#include + +struct memory_block { + unsigned long phys_index; + unsigned long state; + /* + * This serializes all state change requests. It isn't + * held during creation because the control files are + * created long after the critical areas during + * initialization. + */ + struct semaphore state_sem; + int phys_device; /* to which fru does this belong? */ + void *hw; /* optional pointer to fw/hw data */ + int (*phys_callback)(struct memory_block *); + struct sys_device sysdev; +}; + +/* These states are exposed to userspace as text strings in sysfs */ +#define MEM_ONLINE (1<<0) /* exposed to userspace */ +#define MEM_GOING_OFFLINE (1<<1) /* exposed to userspace */ +#define MEM_OFFLINE (1<<2) /* exposed to userspace */ + +/* + * All of these states are currently kernel-internal for notifying + * kernel components and architectures. + * + * For MEM_MAPPING_INVALID, all notifier chains with priority >0 + * are called before pfn_to_page() becomes invalid. The priority=0 + * entry is reserved for the function that actually makes + * pfn_to_page() stop working. Any notifiers that want to be called + * after that should have priority <0. 
+ */ +#define MEM_MAPPING_INVALID (1<<3) + +#ifndef CONFIG_MEMORY_HOTPLUG +static inline int memory_dev_init(void) +{ + return 0; +} +static inline int register_memory_notifier(struct notifier_block *nb) +{ + return 0; +} +static inline void unregister_memory_notifier(struct notifier_block *nb) +{ +} +#else +extern int register_memory(struct memory_block *, struct mem_section *section, struct node *); +extern int register_new_memory(struct mem_section *); +extern int unregister_memory_section(struct mem_section *); +extern int memory_dev_init(void); +extern int register_memory_notifier(struct notifier_block *nb); +extern void unregister_memory_notifier(struct notifier_block *nb); + +#define CONFIG_MEM_BLOCK_SIZE (PAGES_PER_SECTION< +#include +#include +#include + +#ifdef CONFIG_MEMORY_HOTPLUG +/* + * pgdat resizing functions + */ +static inline +void pgdat_resize_lock(struct pglist_data *pgdat, unsigned long *flags) +{ + spin_lock_irqsave(&pgdat->node_size_lock, *flags); +} +static inline +void pgdat_resize_unlock(struct pglist_data *pgdat, unsigned long *flags) +{ + spin_unlock_irqrestore(&pgdat->node_size_lock, *flags); +} +static inline +void pgdat_resize_init(struct pglist_data *pgdat) +{ + spin_lock_init(&pgdat->node_size_lock); +} +/* + * Zone resizing functions + */ +static inline unsigned zone_span_seqbegin(struct zone *zone) +{ + return read_seqbegin(&zone->span_seqlock); +} +static inline int zone_span_seqretry(struct zone *zone, unsigned iv) +{ + return read_seqretry(&zone->span_seqlock, iv); +} +static inline void zone_span_writelock(struct zone *zone) +{ + write_seqlock(&zone->span_seqlock); +} +static inline void zone_span_writeunlock(struct zone *zone) +{ + write_sequnlock(&zone->span_seqlock); +} +static inline void zone_seqlock_init(struct zone *zone) +{ + seqlock_init(&zone->span_seqlock); +} +extern int zone_grow_free_lists(struct zone *zone, unsigned long new_nr_pages); +extern int zone_grow_waitqueues(struct zone *zone, unsigned long nr_pages); +extern int add_one_highpage(struct page *page, int pfn, int bad_ppro); +/* need some defines for these for archs that don't support it */ +extern void online_page(struct page *page); +/* VM interface that may be used by firmware interface */ +extern int add_memory(u64 start, u64 size); +extern int remove_memory(u64 start, u64 size); +extern int online_pages(unsigned long, unsigned long); + +/* reasonably generic interface to expand the physical pages in a zone */ +extern int __add_pages(struct zone *zone, unsigned long start_pfn, + unsigned long nr_pages); +extern int __remove_pages(struct zone *zone, unsigned long start_pfn, + unsigned long nr_pages); +#else /* ! 
CONFIG_MEMORY_HOTPLUG */ +/* + * Stub functions for when hotplug is off + */ +static inline void pgdat_resize_lock(struct pglist_data *p, unsigned long *f) {} +static inline void pgdat_resize_unlock(struct pglist_data *p, unsigned long *f) {} +static inline void pgdat_resize_init(struct pglist_data *pgdat) {} + +static inline unsigned zone_span_seqbegin(struct zone *zone) +{ + return 0; +} +static inline int zone_span_seqretry(struct zone *zone, unsigned iv) +{ + return 0; +} +static inline void zone_span_writelock(struct zone *zone) {} +static inline void zone_span_writeunlock(struct zone *zone) {} +static inline void zone_seqlock_init(struct zone *zone) {} + +static inline int mhp_notimplemented(const char *func) +{ + printk(KERN_WARNING "%s() called, with CONFIG_MEMORY_HOTPLUG disabled\n", func); + dump_stack(); + return -ENOSYS; +} + +static inline int __add_pages(struct zone *zone, unsigned long start_pfn, + unsigned long nr_pages) +{ + return mhp_notimplemented(__FUNCTION__); +} +static inline int __remove_pages(struct zone *zone, unsigned long start_pfn, + unsigned long nr_pages) +{ + return mhp_notimplemented(__FUNCTION__); +} +#endif /* ! CONFIG_MEMORY_HOTPLUG */ +#endif /* __LINUX_MEMORY_HOTPLUG_H */ --- linux.orig/include/linux/mm.h~D0-sysfs-memory-class 2005-07-28 13:50:22.000000000 -0700 +++ linux/include/linux/mm.h 2005-07-28 14:30:02.000000000 -0700 @@ -161,6 +161,7 @@ #define VM_HUGETLB 0x00400000 /* Huge TLB Page VM */ #define VM_NONLINEAR 0x00800000 /* Is non-linear (remap_file_pages) */ #define VM_MAPPED_COPY 0x01000000 /* T if mapped copy of data (nommu mmap) */ +#define VM_IMMOVABLE 0x02000000 /* Don't place in hot removable area */ #ifndef VM_STACK_DEFAULT_FLAGS /* arch can override this */ #define VM_STACK_DEFAULT_FLAGS VM_DATA_DEFAULT_FLAGS @@ -177,6 +178,11 @@ #define VM_NormalReadHint(v) (!((v)->vm_flags & VM_READHINTMASK)) #define VM_SequentialReadHint(v) ((v)->vm_flags & VM_SEQ_READ) #define VM_RandomReadHint(v) ((v)->vm_flags & VM_RAND_READ) +#ifdef CONFIG_MEMORY_HOTPLUG +#define VM_Immovable(v) ((v)->vm_flags & VM_IMMOVABLE) +#else +#define VM_Immovable(v) (0) +#endif /* * mapping from the currently active vm_flags protection bits (the @@ -779,6 +785,7 @@ unsigned long * zones_size, unsigned long zone_start_pfn, unsigned long *zholes_size); extern void memmap_init_zone(unsigned long, int, unsigned long, unsigned long); +extern void setup_per_zone_pages_min(void); extern void mem_init(void); extern void show_mem(void); extern void si_meminfo(struct sysinfo * val); --- linux.orig/include/linux/mm_inline.h~AA-PM-01-steal_page_from_lru 2005-07-28 14:29:40.000000000 -0700 +++ linux/include/linux/mm_inline.h 2005-07-28 14:29:40.000000000 -0700 @@ -38,3 +38,71 @@ zone->nr_inactive--; } } + +static inline int +isolate_lru_onepage(struct page *page, struct list_head *src, + struct list_head *dst) +{ + if (!TestClearPageLRU(page)) + BUG(); + list_del(&page->lru); + if (get_page_testone(page)) { + /* + * It is being freed elsewhere + */ + __put_page(page); + SetPageLRU(page); + list_add(&page->lru, src); + return 0; + } + list_add(&page->lru, dst); + return 1; +} + + +static inline int +__steal_page_from_lru(struct zone *zone, struct page *page, + struct list_head *dst) +{ + if (PageActive(page)) { + if (!isolate_lru_onepage(page, &zone->active_list, dst)) + return 0; + zone->nr_active--; + } else { + if (!isolate_lru_onepage(page, &zone->inactive_list, dst)) + return 0; + zone->nr_inactive--; + } + return 1; +} + +static inline int +steal_page_from_lru(struct zone 
*zone, struct page *page, + struct list_head *dst) +{ + int ret; + spin_lock_irq(&zone->lru_lock); + ret = __steal_page_from_lru(zone, page, dst); + spin_unlock_irq(&zone->lru_lock); + return ret; +} + +static inline void +__putback_page_to_lru(struct zone *zone, struct page *page) +{ + if (TestSetPageLRU(page)) + BUG(); + if (PageActive(page)) + add_page_to_active_list(zone, page); + else + add_page_to_inactive_list(zone, page); +} + +static inline void +putback_page_to_lru(struct zone *zone, struct page *page) +{ + spin_lock_irq(&zone->lru_lock); + __putback_page_to_lru(zone, page); + spin_unlock_irq(&zone->lru_lock); +} + --- linux.orig/include/linux/mman.h~AA-PM-22-vm_immovable 2005-07-28 14:30:01.000000000 -0700 +++ linux/include/linux/mman.h 2005-07-28 14:30:01.000000000 -0700 @@ -61,7 +61,8 @@ return _calc_vm_trans(flags, MAP_GROWSDOWN, VM_GROWSDOWN ) | _calc_vm_trans(flags, MAP_DENYWRITE, VM_DENYWRITE ) | _calc_vm_trans(flags, MAP_EXECUTABLE, VM_EXECUTABLE) | - _calc_vm_trans(flags, MAP_LOCKED, VM_LOCKED ); + _calc_vm_trans(flags, MAP_LOCKED, VM_LOCKED ) | + _calc_vm_trans(flags, MAP_IMMOVABLE, VM_IMMOVABLE ); } #endif /* _LINUX_MMAN_H */ --- /dev/null 2005-03-30 22:36:15.000000000 -0800 +++ linux/include/linux/mmigrate.h 2005-07-28 14:30:05.000000000 -0700 @@ -0,0 +1,39 @@ +#ifndef _LINUX_MEMHOTPLUG_H +#define _LINUX_MEMHOTPLUG_H + +#include +#include + +#define MIGRATE_NODE_ANY -1 + +#ifdef CONFIG_MEMORY_MIGRATE +extern int generic_migrate_page(struct page *, struct page *, + int (*)(struct page *, struct page *, struct list_head *)); +extern int migrate_page_common(struct page *, struct page *, + struct list_head *); +extern int migrate_page_buffer(struct page *, struct page *, + struct list_head *); +extern int page_migratable(struct page *, struct page *, int, + struct list_head *); +extern struct page * migrate_onepage(struct page *, int nodeid); +extern int try_to_migrate_pages(struct list_head *); + +#else +static inline int generic_migrate_page(struct page *page, struct page *newpage, + int (*fn)(struct page *, struct page *)) +{ + return -ENOSYS; +} +static inline int migrate_page_buffer(struct page* page, struct page* newpage) +{ + return -ENOSYS; +} +#endif + +#ifdef ARCH_HAS_PAGEMIGRATION +extern void arch_migrate_page(struct page *, struct page *); +#else +static inline void arch_migrate_page(struct page *page, struct page *newpage) {} +#endif + +#endif /* _LINUX_MEMHOTPLUG_H */ --- linux.orig/include/linux/mmzone.h~A3-sparsemem-extreme 2005-07-28 13:50:13.000000000 -0700 +++ linux/include/linux/mmzone.h 2005-07-28 14:29:38.000000000 -0700 @@ -12,6 +12,7 @@ #include #include #include +#include #include /* Free memory management - zoned buddy allocator. */ @@ -137,6 +138,10 @@ * free areas of different sizes */ spinlock_t lock; +#ifdef CONFIG_MEMORY_HOTPLUG + /* see spanned/present_pages for more description */ + seqlock_t span_seqlock; +#endif struct free_area free_area[MAX_ORDER]; @@ -220,6 +225,16 @@ /* zone_start_pfn == zone_start_paddr >> PAGE_SHIFT */ unsigned long zone_start_pfn; + /* + * zone_start_pfn, spanned_pages and present_pages are all + * protected by span_seqlock. It is a seqlock because it has + * to be read outside of zone->lock, and it is done in the main + * allocator path. But, it is written quite infrequently. + * + * The lock is declared along with zone->lock because it is + * frequently read in proximity to zone->lock. It's good to + * give them a chance of being in the same cacheline. 
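The read side of that seqlock pattern looks roughly like the sketch below; pfn_in_zone_span() is an illustrative name, and the same retry loop appears in page_outside_zone_boundaries() in the mm/page_alloc.c hunk further down:

static int pfn_in_zone_span(struct zone *zone, unsigned long pfn)
{
        unsigned seq;
        int in_span;

        do {
                seq = zone_span_seqbegin(zone);
                in_span = pfn >= zone->zone_start_pfn &&
                          pfn < zone->zone_start_pfn + zone->spanned_pages;
        } while (zone_span_seqretry(zone, seq));

        return in_span;
}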
+ */ unsigned long spanned_pages; /* total size, including holes */ unsigned long present_pages; /* amount of memory (excluding holes) */ @@ -273,6 +288,16 @@ struct page *node_mem_map; #endif struct bootmem_data *bdata; +#ifdef CONFIG_MEMORY_HOTPLUG + /* + * Must be held any time you expect node_start_pfn, node_present_pages + * or node_spanned_pages stay constant. Holding this will also + * guarantee that any pfn_valid() stays that way. + * + * Nests above zone->lock and zone->size_seqlock. + */ + spinlock_t node_size_lock; +#endif unsigned long node_start_pfn; unsigned long node_present_pages; /* total number of physical pages */ unsigned long node_spanned_pages; /* total size of physical page @@ -293,6 +318,8 @@ #endif #define nid_page_nr(nid, pagenr) pgdat_page_nr(NODE_DATA(nid),(pagenr)) +#include + extern struct pglist_data *pgdat_list; void __get_zone_counts(unsigned long *active, unsigned long *inactive, @@ -431,7 +458,7 @@ * with 32 bit page->flags field, we reserve 8 bits for node/zone info. * there are 3 zones (2 bits) and this leaves 8-2=6 bits for nodes. */ -#define FLAGS_RESERVED 8 +#define FLAGS_RESERVED 10 #elif BITS_PER_LONG == 64 /* @@ -487,11 +514,52 @@ unsigned long section_mem_map; }; -extern struct mem_section mem_section[NR_MEM_SECTIONS]; +#ifdef CONFIG_ARCH_SPARSEMEM_EXTREME +#define SECTIONS_PER_ROOT (PAGE_SIZE / sizeof (struct mem_section)) +#else +#define SECTIONS_PER_ROOT 1 +#endif + +#define SECTION_NR_TO_ROOT(sec) ((sec) / SECTIONS_PER_ROOT) +#define NR_SECTION_ROOTS (NR_MEM_SECTIONS / SECTIONS_PER_ROOT) +#define SECTION_ROOT_MASK (SECTIONS_PER_ROOT - 1) + +#ifdef CONFIG_ARCH_SPARSEMEM_EXTREME +extern struct mem_section *mem_section[NR_SECTION_ROOTS]; +#else +extern struct mem_section mem_section[NR_SECTION_ROOTS][SECTIONS_PER_ROOT]; +#endif static inline struct mem_section *__nr_to_section(unsigned long nr) { - return &mem_section[nr]; + if (!mem_section[SECTION_NR_TO_ROOT(nr)]) + return NULL; + return &mem_section[SECTION_NR_TO_ROOT(nr)][nr & SECTION_ROOT_MASK]; +} + +/* + * Although written for the SPARSEMEM_EXTREME case, this happens + * to also work for the flat array case becase + * NR_SECTION_ROOTS==NR_MEM_SECTIONS. 
+ */ +static inline int __section_nr(struct mem_section* ms) +{ + unsigned long root_nr; + struct mem_section* root; + + for (root_nr = 0; + root_nr < NR_MEM_SECTIONS; + root_nr += SECTIONS_PER_ROOT) { + root = __nr_to_section(root_nr); + + if (!root) + continue; + + if ((ms >= root) && (ms < (root + SECTIONS_PER_ROOT))) + break; + } + + return (root_nr * SECTIONS_PER_ROOT) + (ms - root); } /* @@ -513,12 +581,12 @@ static inline int valid_section(struct mem_section *section) { - return (section->section_mem_map & SECTION_MARKED_PRESENT); + return (section && (section->section_mem_map & SECTION_MARKED_PRESENT)); } static inline int section_has_mem_map(struct mem_section *section) { - return (section->section_mem_map & SECTION_HAS_MEM_MAP); + return (section && (section->section_mem_map & SECTION_HAS_MEM_MAP)); } static inline int valid_section_nr(unsigned long nr) @@ -572,6 +640,7 @@ void sparse_init(void); #else #define sparse_init() do {} while (0) +#define sparse_index_init(_sec, _nid) do {} while (0) #endif /* CONFIG_SPARSEMEM */ #ifdef CONFIG_NODES_SPAN_OTHER_NODES @@ -590,3 +659,4 @@ #endif /* !__ASSEMBLY__ */ #endif /* __KERNEL__ */ #endif /* _LINUX_MMZONE_H */ + --- linux.orig/include/linux/page-flags.h~AA-PM-04-config-noswap 2005-07-28 14:29:47.000000000 -0700 +++ linux/include/linux/page-flags.h 2005-07-28 14:29:47.000000000 -0700 @@ -299,6 +299,8 @@ #define ClearPageSwapCache(page) clear_bit(PG_swapcache, &(page)->flags) #else #define PageSwapCache(page) 0 +#define SetPageSwapCache(page) +#define ClearPageSwapCache(page) #endif #define PageUncached(page) test_bit(PG_uncached, &(page)->flags) --- linux.orig/include/linux/radix-tree.h~AA-PM-03-radix-tree-replace 2005-07-28 14:29:47.000000000 -0700 +++ linux/include/linux/radix-tree.h 2005-07-28 14:29:47.000000000 -0700 @@ -47,6 +47,7 @@ int radix_tree_insert(struct radix_tree_root *, unsigned long, void *); void *radix_tree_lookup(struct radix_tree_root *, unsigned long); void *radix_tree_delete(struct radix_tree_root *, unsigned long); +void *radix_tree_replace(struct radix_tree_root *, unsigned long, void *); unsigned int radix_tree_gang_lookup(struct radix_tree_root *root, void **results, unsigned long first_index, unsigned int max_items); --- linux.orig/include/linux/rmap.h~AA-PM-14-try_to_unmap_force 2005-07-28 14:29:56.000000000 -0700 +++ linux/include/linux/rmap.h 2005-07-28 14:29:56.000000000 -0700 @@ -90,7 +90,9 @@ * Called from mm/vmscan.c to handle paging out */ int page_referenced(struct page *, int is_locked, int ignore_token); -int try_to_unmap(struct page *); +int try_to_unmap(struct page *, struct list_head *); +int touch_unmapped_address(struct list_head *); + /* * Called from mm/filemap_xip.c to unmap empty zero page @@ -110,7 +112,7 @@ #define anon_vma_link(vma) do {} while (0) #define page_referenced(page,l,i) TestClearPageReferenced(page) -#define try_to_unmap(page) SWAP_FAIL +#define try_to_unmap(page, force) SWAP_FAIL #endif /* CONFIG_MMU */ --- linux.orig/include/linux/swap.h~AA-PM-02-export-pageout 2005-07-28 14:29:46.000000000 -0700 +++ linux/include/linux/swap.h 2005-07-28 14:29:58.000000000 -0700 @@ -177,6 +177,50 @@ extern int try_to_free_pages(struct zone **, unsigned int); extern int zone_reclaim(struct zone *, unsigned int, unsigned int); extern int shrink_all_memory(int); +typedef enum { + /* failed to write page out, page is locked */ + PAGE_KEEP, + /* move page to the active list, page is locked */ + PAGE_ACTIVATE, + /* page has been sent to the disk successfully, page is unlocked */ + 
PAGE_SUCCESS, + /* page is clean and locked */ + PAGE_CLEAN, +} pageout_t; +extern pageout_t pageout(struct page *, struct address_space *); +struct scan_control { + /* Ask refill_inactive_zone, or shrink_cache to scan this many pages */ + unsigned long nr_to_scan; + + /* Incremented by the number of inactive pages that were scanned */ + unsigned long nr_scanned; + + /* Incremented by the number of pages reclaimed */ + unsigned long nr_reclaimed; + + unsigned long nr_mapped; /* From page_state */ + + /* How many pages shrink_cache() should reclaim */ + int nr_to_reclaim; + + /* Ask shrink_caches, or shrink_zone to scan at this priority */ + unsigned int priority; + + /* This context's GFP mask */ + unsigned int gfp_mask; + + int may_writepage; + + /* This context's SWAP_CLUSTER_MAX. If freeing memory for + * suspend, we effectively ignore SWAP_CLUSTER_MAX. + * In this context, it doesn't matter that we scan the + * whole list at once. */ + int swap_cluster_max; + + /* Can pages be swapped as part of reclaim? */ + int may_swap; +}; +extern int shrink_list(struct list_head *, struct scan_control *); extern int vm_swappiness; #ifdef CONFIG_MMU @@ -196,7 +240,7 @@ extern struct address_space swapper_space; #define total_swapcache_pages swapper_space.nrpages extern void show_swap_cache_info(void); -extern int add_to_swap(struct page *); +extern int add_to_swap(struct page *, unsigned int); extern void __delete_from_swap_cache(struct page *); extern void delete_from_swap_cache(struct page *); extern int move_to_swap_cache(struct page *, swp_entry_t); @@ -220,7 +264,11 @@ extern sector_t map_swap_page(struct swap_info_struct *, pgoff_t); extern struct swap_info_struct *get_swap_info_struct(unsigned); extern int can_share_swap_page(struct page *); -extern int remove_exclusive_swap_page(struct page *); +extern int __remove_exclusive_swap_page(struct page *, int); +static inline int remove_exclusive_swap_page(struct page *p) +{ + return __remove_exclusive_swap_page(p, 0); +} struct backing_dev_info; extern struct swap_list_t swap_list; @@ -274,11 +322,16 @@ #define delete_from_swap_cache(p) /*NOTHING*/ #define swap_token_default_timeout 0 -static inline int remove_exclusive_swap_page(struct page *p) +static inline int __remove_exclusive_swap_page(struct page *p, int force) { return 0; } +static inline int remove_exclusive_swap_page(struct page *p) +{ + return __remove_exclusive_swap_page(p, 0); +} + static inline swp_entry_t get_swap_page(void) { swp_entry_t entry; --- linux.orig/init/Kconfig~AA-PM-07.2-memory_migration-depends-swap 2005-07-28 14:29:49.000000000 -0700 +++ linux/init/Kconfig 2005-07-28 14:29:49.000000000 -0700 @@ -87,6 +87,9 @@ used to provide more virtual memory than the actual RAM present in your computer. If unsure say Y. 
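For reference, the scan_control/shrink_list() interface exported in the mm/swap.h hunk above is driven as in the sketch below; free_inactive_pages() is an illustrative name, and the field values simply mirror how mm/mmigrate.c later in this series fills the structure:

static int free_inactive_pages(struct list_head *page_list)
{
        struct scan_control sc = {
                .nr_scanned     = 0,
                .nr_reclaimed   = 0,
                .priority       = 0,
                .gfp_mask       = GFP_ATOMIC,
                .may_writepage  = 0,
        };

        /* pages on page_list must already be isolated from the LRU */
        return shrink_list(page_list, &sc);
}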
+comment " Swap automatically enabled by selecting Memory Migration" + depends on MEMORY_MIGRATE + config SYSVIPC bool "System V IPC" depends on MMU --- linux.orig/kernel/fork.c~AA-PM-22-vm_immovable 2005-07-28 14:30:01.000000000 -0700 +++ linux/kernel/fork.c 2005-07-28 14:30:01.000000000 -0700 @@ -230,7 +230,7 @@ if (IS_ERR(pol)) goto fail_nomem_policy; vma_set_policy(tmp, pol); - tmp->vm_flags &= ~VM_LOCKED; + tmp->vm_flags &= ~(VM_LOCKED|VM_IMMOVABLE); tmp->vm_mm = mm; tmp->vm_next = NULL; anon_vma_link(tmp); --- linux.orig/lib/radix-tree.c~AA-PM-03-radix-tree-replace 2005-07-28 14:29:47.000000000 -0700 +++ linux/lib/radix-tree.c 2005-07-28 14:29:47.000000000 -0700 @@ -100,7 +100,13 @@ static inline void radix_tree_node_free(struct radix_tree_node *node) { - kmem_cache_free(radix_tree_node_cachep, node); + struct radix_tree_preload *rtp; + + rtp = &__get_cpu_var(radix_tree_preloads); + if (rtp->nr < ARRAY_SIZE(rtp->nodes)) + rtp->nodes[rtp->nr++] = node; + else + kmem_cache_free(radix_tree_node_cachep, node); } /* @@ -733,6 +739,53 @@ EXPORT_SYMBOL(radix_tree_delete); /** + * radix_tree_replace - replace items in a radix tree + * @root: radix tree root + * @index: index key + * @item: item to insert + * + * Replace the item at @index with @item. + * Returns the address of the deleted item, or NULL if it was not present. + */ +void *radix_tree_replace(struct radix_tree_root *root, + unsigned long index, void *item) +{ + struct radix_tree_path path[RADIX_TREE_MAX_PATH], *pathp = path; + unsigned int height, shift; + void *ret = NULL; + + height = root->height; + if (index > radix_tree_maxindex(height)) + goto out; + + shift = (height - 1) * RADIX_TREE_MAP_SHIFT; + pathp->node = NULL; + pathp->slot = &root->rnode; + + while (height > 0) { + int offset; + + if (*pathp->slot == NULL) + goto out; + + offset = (index >> shift) & RADIX_TREE_MAP_MASK; + pathp[1].offset = offset; + pathp[1].node = *pathp[0].slot; + pathp[1].slot = (struct radix_tree_node **) + (pathp[1].node->slots + offset); + pathp++; + shift -= RADIX_TREE_MAP_SHIFT; + height--; + } + + if ((ret = *pathp[0].slot)) + *pathp[0].slot = item; +out: + return ret; +} +EXPORT_SYMBOL(radix_tree_replace); + +/** * radix_tree_tagged - test whether any items in the tree are tagged * @root: radix tree root * @tag: tag to test --- linux.orig/mm/Kconfig~A3-sparsemem-extreme 2005-07-28 13:50:13.000000000 -0700 +++ linux/mm/Kconfig 2005-07-28 14:29:49.000000000 -0700 @@ -89,3 +89,44 @@ config HAVE_MEMORY_PRESENT def_bool y depends on ARCH_HAVE_MEMORY_PRESENT || SPARSEMEM + +# +# SPARSEMEM_EXTREME (which is the default) does some bootmem +# allocations when memory_present() is called. If this can not +# be done on your architecture, select this option. However, +# statically allocating the mem_section[] array can potentially +# consume vast quantities of .bss, so be careful. +# +# This option will also potentially produce smaller runtime code +# with gcc 3.4 and later. +# +config SPARSEMEM_STATIC + def_bool n + +# +# Architectecture platforms which require a two level mem_section in SPARSEMEM +# must select this option. This is usually for architecture platforms with +# an extremely sparse physical address space. 
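Looking back at radix_tree_replace() added in the lib/radix-tree.c hunk above: it only swaps an existing slot, so a NULL return means no entry was installed at that index and the new item was not inserted. A hedged calling sketch (mapping, index and new_item are illustrative, as is the error handling):

void *old;

old = radix_tree_replace(&mapping->page_tree, index, new_item);
if (old == NULL) {
        /* nothing was at that index (e.g. truncation raced with us);
         * new_item was NOT installed, so drop our reference to it */
}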
+# +config SPARSEMEM_EXTREME + def_bool y + depends on SPARSEMEM && !SPARSEMEM_STATIC + +# eventually, we can have this option just 'select SPARSEMEM' +config MEMORY_HOTPLUG + bool "Allow for memory hot-add" + depends on SPARSEMEM && HOTPLUG && !SOFTWARE_SUSPEND + +comment "Memory hotplug is currently incompatible with Software Suspend" + depends on SPARSEMEM && HOTPLUG && SOFTWARE_SUSPEND + +config MEMORY_REMOVE + bool "Allow for memory hot-remove" + depends on MEMORY_HOTPLUG && MEMORY_MIGRATE && (X86 && !X86_64) + default y if MEMORY_HOTPLUG + help + Enabling this option allows you to hot-remove highmem zones + on i386 systems. The i386 depenence is a hack for now. + +comment "Selecting Memory Migration automatically enables CONFIG_SWAP" + depends on !SWAP --- linux.orig/mm/Makefile~D0-sysfs-memory-class 2005-07-28 13:50:22.000000000 -0700 +++ linux/mm/Makefile 2005-07-28 14:29:48.000000000 -0700 @@ -18,5 +18,6 @@ obj-$(CONFIG_SPARSEMEM) += sparse.o obj-$(CONFIG_SHMEM) += shmem.o obj-$(CONFIG_TINY_SHMEM) += tiny-shmem.o - +obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o +obj-$(CONFIG_MEMORY_MIGRATE) += mmigrate.o obj-$(CONFIG_FS_XIP) += filemap_xip.o --- linux.orig/mm/memory.c~AA-PM-09-migrate-swapcache-validate 2005-07-28 14:29:50.000000000 -0700 +++ linux/mm/memory.c 2005-07-28 14:30:02.000000000 -0700 @@ -1265,12 +1265,22 @@ if (unlikely(anon_vma_prepare(vma))) goto no_new_page; + if (old_page == ZERO_PAGE(address)) { - new_page = alloc_zeroed_user_highpage(vma, address); + if (VM_Immovable(vma)) { + new_page = alloc_page_vma(GFP_USER, vma, address); + if (new_page) + clear_user_page(address, address, new_page); + } else + new_page = alloc_zeroed_user_highpage(vma, address); if (!new_page) goto no_new_page; } else { - new_page = alloc_page_vma(GFP_HIGHUSER, vma, address); + if (VM_Immovable(vma)) + new_page = alloc_page_vma(GFP_USER, vma, address); + else + new_page = alloc_page_vma(GFP_HIGHUSER, vma, address); + if (!new_page) goto no_new_page; copy_user_highpage(new_page, old_page, address); @@ -1640,6 +1650,7 @@ pte_unmap(page_table); spin_unlock(&mm->page_table_lock); +again: page = lookup_swap_cache(entry); if (!page) { swapin_readahead(entry, address, vma); @@ -1668,6 +1679,12 @@ mark_page_accessed(page); lock_page(page); + if (!PageSwapCache(page)) { + /* page-migration has occured */ + unlock_page(page); + page_cache_release(page); + goto again; + } /* * Back out if somebody else faulted in this pte while we @@ -1749,7 +1766,10 @@ if (unlikely(anon_vma_prepare(vma))) goto no_mem; - page = alloc_zeroed_user_highpage(vma, addr); + if (VM_Immovable(vma)) + page = alloc_page_vma(GFP_USER, vma, addr); + else + page = alloc_zeroed_user_highpage(vma, addr); if (!page) goto no_mem; --- /dev/null 2005-03-30 22:36:15.000000000 -0800 +++ linux/mm/memory_hotplug.c 2005-07-28 14:29:40.000000000 -0700 @@ -0,0 +1,161 @@ +/* + * linux/mm/memory_hotplug.c + * + * Copyright (C) + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +extern void zonetable_add(struct zone *zone, int nid, int zid, unsigned long pfn, + unsigned long size); +static void __add_zone(struct zone *zone, unsigned long phys_start_pfn) +{ + struct pglist_data *pgdat = zone->zone_pgdat; + int nr_pages = PAGES_PER_SECTION; + int nid = pgdat->node_id; + int zone_type; + + zone_type = zone - pgdat->node_zones; + memmap_init_zone(nr_pages, nid, zone_type, phys_start_pfn); + zonetable_add(zone, 
nid, zone_type, phys_start_pfn, nr_pages); +} + +extern int sparse_add_one_section(struct zone *zone, unsigned long start_pfn, + struct page *mem_map); +int __add_section(struct zone *zone, unsigned long phys_start_pfn) +{ + struct pglist_data *pgdat = zone->zone_pgdat; + int nr_pages = PAGES_PER_SECTION; + struct page *memmap; + int ret; + + ret = sparse_add_one_section(zone, phys_start_pfn, memmap); + + hot_add_zone_init(zone, phys_start_pfn, PAGES_PER_SECTION); + + if (ret < 0) + return ret; + + __add_zone(zone, phys_start_pfn); + return register_new_memory(__pfn_to_section(phys_start_pfn)); +} + +/* + * Reasonably generic function for adding memory. It is + * expected that archs that support memory hotplug will + * call this function after deciding the zone to which to + * add the new pages. + */ +int __add_pages(struct zone *zone, unsigned long phys_start_pfn, + unsigned long nr_pages) +{ + unsigned long i; + int err = 0; + + printk(KERN_DEBUG "%s(%p, %08lx, %ld)\n", __func__, + zone, phys_start_pfn, nr_pages); + + for (i = 0; !err && (i < nr_pages); i += PAGES_PER_SECTION) { + printk(KERN_DEBUG "\tfor: i: %ld\n", i); + err = __add_section(zone, phys_start_pfn + i); + } + + return err; +} + +static void grow_zone_span(struct zone *zone, + unsigned long start_pfn, unsigned long end_pfn) +{ + unsigned long old_zone_end_pfn; + + zone_span_writelock(zone); + + old_zone_end_pfn = zone->zone_start_pfn + zone->spanned_pages; + if (start_pfn < zone->zone_start_pfn) + zone->zone_start_pfn = start_pfn; + + if (end_pfn > old_zone_end_pfn) + zone->spanned_pages = end_pfn - zone->zone_start_pfn; + + zone_span_writeunlock(zone); +} + +static void grow_pgdat_span(struct pglist_data *pgdat, + unsigned long start_pfn, unsigned long end_pfn) +{ + unsigned long old_pgdat_end_pfn = + pgdat->node_start_pfn + pgdat->node_spanned_pages; + + if (start_pfn < pgdat->node_start_pfn) + pgdat->node_start_pfn = start_pfn; + + if (end_pfn > old_pgdat_end_pfn) + pgdat->node_spanned_pages = end_pfn - pgdat->node_spanned_pages; +} + +#ifdef CONFIG_SIMULATED_MEM_HOTPLUG +int page_is_hotpluggable_ram(unsigned long pfn) +{ + extern struct e820map bios_e820; + extern int page_is_ram_e820(unsigned long, struct e820map*); + + return page_is_ram_e820(pfn, &bios_e820); +} +#else +int page_is_hotpluggable_ram(unsigned long pfn) +{ + return 1; +} +#endif + +int online_pages(unsigned long pfn, unsigned long nr_pages) +{ + unsigned long i; + unsigned long flags; + unsigned long onlined_pages = 0; + struct zone *zone; + + /* + * This doesn't need a lock to do pfn_to_page(). + * The section can't be removed here because of the + * memory_block->state_sem. + */ + zone = page_zone(pfn_to_page(pfn)); + pgdat_resize_lock(zone->zone_pgdat, &flags); + grow_zone_span(zone, pfn, pfn + nr_pages); + grow_pgdat_span(zone->zone_pgdat, pfn, pfn + nr_pages); + pgdat_resize_unlock(zone->zone_pgdat, &flags); + + printk(KERN_DEBUG "%s: onlining 0x%lx pages starting from pfn: 0x%lx\n", + __func__, nr_pages, pfn); + + for (i = 0; i < nr_pages; i++) { + struct page *page = pfn_to_page(pfn + i); + + if (page_is_hotpluggable_ram(pfn + i)) { + online_page(page); + onlined_pages++; + } + } + zone->present_pages += onlined_pages; + + return 0; +} --- /dev/null 2005-03-30 22:36:15.000000000 -0800 +++ linux/mm/mmigrate.c 2005-07-28 14:30:05.000000000 -0700 @@ -0,0 +1,592 @@ +/* + * linux/mm/mmigrate.c + * + * Memory migration support. 
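As the comment above __add_pages() in mm/memory_hotplug.c notes, an arch's add_memory() is expected to pick the node and zone and then call it; below is a minimal sketch of such a caller, assuming node 0 and ZONE_HIGHMEM purely for illustration (this is not the series' actual i386 implementation):

int add_memory(u64 start, u64 size)
{
        struct pglist_data *pgdat = NODE_DATA(0);       /* assumption: node 0 */
        struct zone *zone = pgdat->node_zones + ZONE_HIGHMEM;
        unsigned long start_pfn = start >> PAGE_SHIFT;
        unsigned long nr_pages = size >> PAGE_SHIFT;

        return __add_pages(zone, start_pfn, nr_pages);
}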
+ * + * Authors: IWAMOTO Toshihiro + * Hirokazu Takahashi + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * The concept of memory migration is to replace a target page with + * a substitute page on a radix tree. New requests to access the target + * - including system calls and page faults - are redirected to the + * substitute that is locked and not up-to-date, so that all of these + * requests are blocked until the migration has done. Data of the target + * is copied into the substitute and then the requests are unblocked + * after all operations against the target have finished. + * + * By this approach, regular pages in the swapcache/pagecache and + * hugetlbpages can be handled in the same way. + */ + + +/* + * Try to writeback a dirty page to free its buffers. + */ +static int +writeback_and_free_buffers(struct page *page) +{ + struct address_space *mapping = page_mapping(page); + + BUG_ON(!PageLocked(page)); + wait_on_page_writeback(page); + if (!PagePrivate(page)) + return 0; + + if (PageDirty(page)) { + switch(pageout(page, mapping)) { + case PAGE_ACTIVATE: + return -1; + case PAGE_SUCCESS: + lock_page(page); + return 1; + case PAGE_KEEP: + case PAGE_CLEAN: + break; + } + } + if (try_to_release_page(page, GFP_KERNEL)) + return 0; + + return -1; +} + +/* + * Replace "page" with "newpage" on the radix tree, which the page belongs to. + */ +static int +replace_pages(struct page *page, struct page *newpage) +{ + struct address_space *mapping = page_mapping(page); + int ret = 0; + struct page *delpage; + + page_cache_get(newpage); + read_lock_irq(&mapping->tree_lock); + newpage->index = page->index; + if (PageSwapCache(page)) { + SetPageSwapCache(newpage); + newpage->private = page->private; + } else + newpage->mapping = page->mapping; + if (PageWriteback(page)) + SetPageWriteback(newpage); + + delpage = radix_tree_replace(&mapping->page_tree, page_index(page), newpage); + read_unlock_irq(&mapping->tree_lock); + if (delpage == NULL) { + /* + * Migration is unnecessary since truncating the page is + * in progress. Just release the newpage. + */ + page_cache_release(newpage); + ret = -ENOENT; + } + return ret; +} + +/* + * Check whether the page can be migrated or not. + */ +int +page_migratable(struct page *page, struct page *newpage, + int freeable_page_count, struct list_head *vlist) +{ + int truncated; + + if (page_mapped(page)) { + switch (try_to_unmap(page, vlist)) { + case SWAP_FAIL: + return -EBUSY; + case SWAP_AGAIN: + return -EAGAIN; + } + } + if (PageWriteback(page)) + return -EAGAIN; + /* The page might have been truncated */ + truncated = !PageSwapCache(newpage) && page_mapping(page) == NULL; + if (page_count(page) + truncated <= freeable_page_count) + return truncated ? -ENOENT : 0; + return -EAGAIN; +} + +/* + * Wait for the completion of all operations, which are going on + * against the page, and copy it. 
+ */ +int +migrate_page_common(struct page *page, struct page *newpage, + struct list_head *vlist) +{ + long timeout = 5000; /* XXXX */ + int ret; + + while (timeout > 0) { + BUG_ON(page_count(page) == 0); + ret = page_migratable(page, newpage, 2, vlist); + switch (ret) { + case 0: + case -ENOENT: + copy_highpage(newpage, page); + return ret; + case -EBUSY: + return ret; + case -EAGAIN: + writeback_and_free_buffers(page); + unlock_page(page); + msleep(10); + timeout -= 10; + lock_page(page); + continue; + } + } + return -EBUSY; +} + +/* + * Wait for the completion of all operations, which are going on + * against the page. After that, move the buffers the page owns + * to the newpage and copy the page. + */ +int +migrate_page_buffer(struct page *page, struct page *newpage, + struct list_head *vlist) +{ + long timeout = 5000; /* XXXX */ + int ret; + + while (timeout > 0) { + BUG_ON(page_count(page) == 0); + ret = page_migratable(page, newpage, + 2 + !!PagePrivate(page), vlist); + switch (ret) { + case 0: + if (PagePrivate(page)) + generic_move_buffer(page, newpage); + /* fall thru */ + case -ENOENT: /* truncated */ + copy_highpage(newpage, page); + return ret; + case -EBUSY: + return ret; + case -EAGAIN: + wait_on_page_writeback(page); + unlock_page(page); + msleep(10); + timeout -= 10; + lock_page(page); + continue; + } + } + return -EBUSY; +} + +/* + * In some cases, a page migration needs to be rolled back. + */ +static int +unwind_page(struct page *page, struct page *newpage) +{ + struct address_space *mapping = page_mapping(newpage); + int truncated = !PageSwapCache(newpage) && page_mapping(page) == NULL; + long retry = 1000; + + BUG_ON(mapping == NULL); + + /* + * Unwinding is not needed if the newpage has been already truncated. + */ + if (truncated) + goto out; + + /* + * Try to unwind by notifying waiters. If someone misbehaves, + * we die. + */ + read_lock_irq(&mapping->tree_lock); + page->index = newpage->index; + if (PageSwapCache(newpage)) { + SetPageSwapCache(page); + page->private = newpage->private; + } else + page->mapping = newpage->mapping; + if (radix_tree_replace(&mapping->page_tree, page_index(newpage), page) == NULL) { + printk(KERN_ERR "%s(): newpage:%p has gone. We can't roll back page:%p.\n", __FUNCTION__, newpage, page); + BUG(); + } + /* no page_cache_get(page); needed */ + read_unlock_irq(&mapping->tree_lock); +out: + newpage->mapping = NULL; + if (PageWriteback(newpage)) + end_page_writeback(newpage); /* XXX */ + newpage->private = 0; + ClearPageSwapCache(newpage); + /* XXX unmap needed? No, it shouldn't. Handled by fault handlers. */ + unlock_page(newpage); + unlock_page(page); + + /* + * Some requests may be blocked on the newpage. Wait until the + * requests have gone. + */ + while (page_count(newpage) > 2) { + msleep(10); + if (retry-- <= 0) { + retry = 1000; + printk(KERN_ERR "%s(): page:%p can't be rolled back, as there remain some references to newpage:%p yet.\n", __FUNCTION__, page, newpage); + printk(KERN_ERR "newpage %p flags %lx %d %d, page %p flags %lx %d\n", + newpage, newpage->flags, page_count(newpage), + page_mapcount(newpage), + page, page->flags, page_count(page)); + } + } + + BUG_ON(PageUptodate(newpage)); + BUG_ON(PageDirty(newpage)); + BUG_ON(PageActive(newpage)); + BUG_ON(PagePrivate(newpage)); + BUG_ON(page_count(newpage) != 2); + page_cache_release(newpage); + return 0; +} + +/* + * Try to migrate one page. Returns non-zero on failure. + * - Lock for the page must be held when invoked. + * - The page must be attached to an address_space. 
+ */ +int +generic_migrate_page(struct page *page, struct page *newpage, + int (*migrate_fn)(struct page *, struct page *, struct list_head *)) +{ + LIST_HEAD(vlist); + int ret; + + /* + * Make sure that the newpage must be locked and kept not up-to-date + * during the page migration, so that it's guaranteed that all + * accesses to the newpage will be blocked until everything has + * become ok. + */ + if (TestSetPageLocked(newpage)) + BUG(); + + if ((ret = replace_pages(page, newpage))) + goto out_removing; + + /* + * With cleared PTEs, any accesses via the PTEs to the page + * can be caught and blocked in a pagefault handler. + */ + if (page_mapped(page)) { + while ((ret = try_to_unmap(page, &vlist)) == SWAP_AGAIN) + msleep(1); + if (ret != SWAP_SUCCESS) { + ret = -EBUSY; + goto out_busy; + } + } + + wait_on_page_writeback(page); + if (PageSwapCache(page)) { + /* + * The page is not mapped from anywhere now. + * Detach it from the swapcache completely. + */ + ClearPageSwapCache(page); + page->private = 0; + page->mapping = NULL; + } + + /* Wait for all operations against the page to finish. */ + ret = migrate_fn(page, newpage, &vlist); + switch (ret) { + case -ENOENT: + /* The file the page belongs to has been truncated. */ + page_cache_get(page); + page_cache_release(newpage); + newpage->mapping = NULL; + break; + case 0: + break; + default: + /* The page is busy. Try it later. */ + goto out_busy; + } + + arch_migrate_page(page, newpage); + + if (PageError(page)) + SetPageError(newpage); + if (PageReferenced(page)) + SetPageReferenced(newpage); + if (PageActive(page)) { + SetPageActive(newpage); + ClearPageActive(page); + } + if (PageMappedToDisk(page)) + SetPageMappedToDisk(newpage); + if (PageChecked(page)) + SetPageChecked(newpage); + if (PageUptodate(page)) + SetPageUptodate(newpage); + if (PageDirty(page)) { + clear_page_dirty_for_io(page); + set_page_dirty(newpage); + } + if (PagePrivate(newpage)) { + BUG_ON(newpage->mapping == NULL); + unlock_page_buffer(newpage); + } + /* + * Finally, the newpage has become ready! Wake up all waiters, + * which have been waiting for the completion of the migration. + */ + if (PageWriteback(newpage)) + end_page_writeback(newpage); + unlock_page(newpage); + + /* map the newpage where the old page have been mapped. */ + touch_unmapped_address(&vlist); + if (PageSwapCache(newpage)) { + lock_page(newpage); + __remove_exclusive_swap_page(newpage, 1); + unlock_page(newpage); + } + + page->mapping = NULL; + unlock_page(page); + page_cache_release(page); + + return 0; + +out_busy: + /* Roll back all operations. */ + unwind_page(page, newpage); + touch_unmapped_address(&vlist); + if (PageSwapCache(page)) { + lock_page(page); + __remove_exclusive_swap_page(page, 1); + unlock_page(page); + } + + return ret; + +out_removing: + if (PagePrivate(newpage)) + BUG(); + unlock_page(page); + unlock_page(newpage); + return ret; +} + +/* + * migrate_onepage() can migrate regular pages assigned to pagecache, + * swapcache or anonymous memory. + */ +struct page * +migrate_onepage(struct page *page, int nodeid) +{ + struct page *newpage; + struct address_space *mapping; + int ret; + + lock_page(page); + + /* + * Put the page in a radix tree if it isn't in the tree yet. 
+ */ +#ifdef CONFIG_SWAP + if (PageAnon(page) && !PageSwapCache(page)) + if (!add_to_swap(page, GFP_KERNEL)) { + unlock_page(page); + return ERR_PTR(-ENOSPC); + } +#endif /* CONFIG_SWAP */ + if ((mapping = page_mapping(page)) == NULL) { + /* truncation is in progress */ + if (PagePrivate(page)) + try_to_release_page(page, GFP_KERNEL); + unlock_page(page); + return ERR_PTR(-ENOENT); + } + + /* + * Allocate a new page with the same gfp_mask + * as the target page has. + */ + if (nodeid == MIGRATE_NODE_ANY) + newpage = page_cache_alloc(mapping); + else + newpage = alloc_pages_node(nodeid, mapping->flags, 0); + if (newpage == NULL) { + unlock_page(page); + return ERR_PTR(-ENOMEM); + } + + if (mapping->a_ops && mapping->a_ops->migrate_page) + ret = mapping->a_ops->migrate_page(page, newpage); + else + ret = generic_migrate_page(page, newpage, migrate_page_common); + if (ret) { + BUG_ON(page_count(newpage) != 1); + page_cache_release(newpage); + return ERR_PTR(ret); + } + BUG_ON(page_count(page) != 1); + page_cache_release(page); + return newpage; +} + +static inline int +need_writeback(struct page *page) +{ + return PageDirty(page) && PagePrivate(page) && !PageWriteback(page); +} + +/* + * Start writeback I/O against a dirty page with filesystem + * specific private data to release them. + */ +static inline void page_start_writeback(struct page *page) +{ + struct address_space *mapping; + int ret; + + if (!need_writeback(page)) + return; + if (TestSetPageLocked(page)) + return; + + mapping = page_mapping(page); + + if (!mapping) + goto out_unlock; + /* + * Writeback is not needed if it has migrate_page method, + * because it can move all of them without writeback I/O. + */ + if (mapping->a_ops && mapping->a_ops->migrate_page) + goto out_unlock; + if (!need_writeback(page)) + goto out_unlock; + + ret = pageout(page, mapping); + + if (ret == PAGE_SUCCESS) + return; + +out_unlock: + unlock_page(page); +} + +/* + * This is the main entry point to migrate pages in a specific region. + * If a page is inactive, the page may be just released instead of + * migration. + */ +int try_to_migrate_pages(struct list_head *page_list) +{ + struct page *page, *page2, *newpage; + LIST_HEAD(pass1_list); + LIST_HEAD(pass2_list); + LIST_HEAD(discharge_list); + int nr_busy = 0; + int nr_noswap = 0; + struct scan_control sc = { + .nr_scanned = 0, + .nr_reclaimed = 0, + .priority = 0, + .gfp_mask = GFP_ATOMIC, + .may_writepage = 0, + }; + + + current->flags |= PF_KSWAPD; /* It's fake */ + list_for_each_entry_safe(page, page2, page_list, lru) { + page_start_writeback(page); + list_del(&page->lru); + if (PageActive(page)) + list_add(&page->lru, &pass1_list); + else + list_add(&page->lru, &discharge_list); + } + /* + * Try to free inactive pages only. + */ + shrink_list(&discharge_list, &sc); + list_splice(&discharge_list, &pass1_list); + + /* + * Try to migrate easily movable pages first. + */ + list_for_each_entry_safe(page, page2, &pass1_list, lru) { + list_del(&page->lru); + if (PageLocked(page) || PageWriteback(page) || + IS_ERR(newpage = migrate_onepage(page, MIGRATE_NODE_ANY))) { + if (page_count(page) == 1) { + /* the page is already unused */ + putback_page_to_lru(page_zone(page), page); + page_cache_release(page); + } else { + list_add(&page->lru, &pass2_list); + } + } else { + putback_page_to_lru(page_zone(newpage), newpage); + page_cache_release(newpage); + } + } + /* + * Try to migrate the rest of them. 
+ */ + list_for_each_entry_safe(page, page2, &pass2_list, lru) { + list_del(&page->lru); + if (IS_ERR(newpage = migrate_onepage(page, MIGRATE_NODE_ANY))) { + if (page_count(page) == 1) { + /* the page is already unused */ + putback_page_to_lru(page_zone(page), page); + page_cache_release(page); + } else { + /* truncation may be in progress now. */ + nr_busy++; + if (PTR_ERR(newpage) == -ENOSPC) + nr_noswap++; + list_add(&page->lru, page_list); + } + } else { + putback_page_to_lru(page_zone(newpage), newpage); + page_cache_release(newpage); + } + } + current->flags &= ~PF_KSWAPD; + if (nr_noswap) { + if (printk_ratelimit()) + printk(KERN_WARNING "memory migration failed: Any swap devices should be added.\n"); + return -ENOSPC; + } + return nr_busy; +} + +EXPORT_SYMBOL(generic_migrate_page); +EXPORT_SYMBOL(migrate_page_common); +EXPORT_SYMBOL(migrate_page_buffer); +EXPORT_SYMBOL(page_migratable); +EXPORT_SYMBOL(migrate_onepage); --- linux.orig/mm/page_alloc.c~C1-pcp_zone_init 2005-07-28 13:50:17.000000000 -0700 +++ linux/mm/page_alloc.c 2005-07-28 14:29:39.000000000 -0700 @@ -32,6 +32,7 @@ #include #include #include +#include #include #include @@ -77,21 +78,44 @@ unsigned long __initdata nr_kernel_pages; unsigned long __initdata nr_all_pages; +static int page_outside_zone_boundaries(struct zone *zone, struct page *page) +{ + int ret = 0; + unsigned seq; + unsigned long pfn = page_to_pfn(page); + + do { + seq = zone_span_seqbegin(zone); + if (pfn >= zone->zone_start_pfn + zone->spanned_pages) + ret = 1; + else if (pfn < zone->zone_start_pfn) + ret = 1; + } while (zone_span_seqretry(zone, seq)); + + return ret; +} + +static int page_is_consistent(struct zone *zone, struct page *page) +{ +#ifdef CONFIG_HOLES_IN_ZONE + if (!pfn_valid(page_to_pfn(page))) + return 0; +#endif + if (zone != page_zone(page)) + return 0; + + return 1; +} /* * Temporary debugging check for pages not lying within a given zone. */ static int bad_range(struct zone *zone, struct page *page) { - if (page_to_pfn(page) >= zone->zone_start_pfn + zone->spanned_pages) + if (page_outside_zone_boundaries(zone, page)) return 1; - if (page_to_pfn(page) < zone->zone_start_pfn) - return 1; -#ifdef CONFIG_HOLES_IN_ZONE - if (!pfn_valid(page_to_pfn(page))) - return 1; -#endif - if (zone != page_zone(page)) + if (!page_is_consistent(zone, page)) return 1; + return 0; } @@ -1382,7 +1406,7 @@ /* * Builds allocation fallback zone lists. 
*/ -static int __init build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist, int j, int k) +int __devinit build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist, int j, int k) { switch (k) { struct zone *zone; @@ -1390,7 +1414,12 @@ BUG(); case ZONE_HIGHMEM: zone = pgdat->node_zones + ZONE_HIGHMEM; - if (zone->present_pages) { + /* + * with mem hotplug we don't increment present_pages + * until the pages are actually freed into the zone, + * but we increment spanned pages much earlier + */ + if (zone->spanned_pages) { #ifndef CONFIG_HIGHMEM BUG(); #endif @@ -1398,11 +1427,11 @@ } case ZONE_NORMAL: zone = pgdat->node_zones + ZONE_NORMAL; - if (zone->present_pages) + if (zone->spanned_pages) zonelist->zones[j++] = zone; case ZONE_DMA: zone = pgdat->node_zones + ZONE_DMA; - if (zone->present_pages) + if (zone->spanned_pages) zonelist->zones[j++] = zone; } @@ -1472,7 +1501,7 @@ return best_node; } -static void __init build_zonelists(pg_data_t *pgdat) +void __devinit build_zonelists(pg_data_t *pgdat) { int i, j, k, node, local_node; int prev_node, load; @@ -1519,7 +1548,7 @@ #else /* CONFIG_NUMA */ -static void __init build_zonelists(pg_data_t *pgdat) +void __devinit build_zonelists(pg_data_t *pgdat) { int i, j, k, node, local_node; @@ -1640,7 +1669,7 @@ * up by free_all_bootmem() once the early boot process is * done. Non-atomic initialization, single-pass. */ -void __init memmap_init_zone(unsigned long size, int nid, unsigned long zone, +void __devinit memmap_init_zone(unsigned long size, int nid, unsigned long zone, unsigned long start_pfn) { struct page *page; @@ -1851,6 +1880,68 @@ #endif +void zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages) +{ + int i; + struct pglist_data *pgdat = zone->zone_pgdat; + unsigned long size_bytes; + + /* + * The per-page waitqueue mechanism uses hashed waitqueues + * per zone. + */ + zone->wait_table_size = wait_table_size(zone_size_pages); + zone->wait_table_bits = wait_table_bits(zone->wait_table_size); + size_bytes = zone->wait_table_size * sizeof(wait_queue_head_t); + if (system_state >= SYSTEM_RUNNING) + zone->wait_table = kmalloc(size_bytes, GFP_KERNEL); + else + zone->wait_table = alloc_bootmem_node(pgdat, size_bytes); + + for(i = 0; i < zone->wait_table_size; ++i) + init_waitqueue_head(zone->wait_table + i); +} + +void zone_pcp_init(struct zone *zone) +{ + int cpu; + unsigned long batch = zone_batchsize(zone); + + for (cpu = 0; cpu < NR_CPUS; cpu++) { +#ifdef CONFIG_NUMA + /* Early boot. 
Slab allocator not functional yet */ + zone->pageset[cpu] = &boot_pageset[cpu]; + setup_pageset(&boot_pageset[cpu],0); +#else + setup_pageset(zone_pcp(zone,cpu), batch); +#endif + } + printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%lu\n", + zone->name, zone->present_pages, batch); +} + +static void init_currently_empty_zone(struct zone *zone, + unsigned long zone_start_pfn, unsigned long size) +{ + const unsigned long zone_required_alignment = 1UL << (MAX_ORDER-1); + struct pglist_data *pgdat = zone->zone_pgdat; + + zone_wait_table_init(zone, size); + pgdat->nr_zones = zone_idx(zone) + 1; + + zone->zone_mem_map = pfn_to_page(zone_start_pfn); + zone->zone_start_pfn = zone_start_pfn; + + if ((zone_start_pfn) & (zone_required_alignment-1)) + printk(KERN_CRIT "BUG: wrong zone alignment, it will crash\n"); + + memmap_init(size, pgdat->node_id, zone_idx(zone), zone_start_pfn); + + zone_init_free_lists(pgdat, zone, zone->spanned_pages); + zone->spanned_pages = size; +} + + /* * Set up the zone data structures: * - mark all pages reserved @@ -1860,11 +1951,11 @@ static void __init free_area_init_core(struct pglist_data *pgdat, unsigned long *zones_size, unsigned long *zholes_size) { - unsigned long i, j; - const unsigned long zone_required_alignment = 1UL << (MAX_ORDER-1); - int cpu, nid = pgdat->node_id; + unsigned long j; + int nid = pgdat->node_id; unsigned long zone_start_pfn = pgdat->node_start_pfn; + pgdat_resize_init(pgdat); pgdat->nr_zones = 0; init_waitqueue_head(&pgdat->kswapd_wait); pgdat->kswapd_max_order = 0; @@ -1872,7 +1963,6 @@ for (j = 0; j < MAX_NR_ZONES; j++) { struct zone *zone = pgdat->node_zones + j; unsigned long size, realsize; - unsigned long batch; realsize = size = zones_size[j]; if (zholes_size) @@ -1882,29 +1972,17 @@ nr_kernel_pages += realsize; nr_all_pages += realsize; - zone->spanned_pages = size; zone->present_pages = realsize; zone->name = zone_names[j]; spin_lock_init(&zone->lock); spin_lock_init(&zone->lru_lock); + zone_seqlock_init(zone); zone->zone_pgdat = pgdat; zone->free_pages = 0; zone->temp_priority = zone->prev_priority = DEF_PRIORITY; - batch = zone_batchsize(zone); - - for (cpu = 0; cpu < NR_CPUS; cpu++) { -#ifdef CONFIG_NUMA - /* Early boot. Slab allocator not functional yet */ - zone->pageset[cpu] = &boot_pageset[cpu]; - setup_pageset(&boot_pageset[cpu],0); -#else - setup_pageset(zone_pcp(zone,cpu), batch); -#endif - } - printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%lu\n", - zone_names[j], realsize, batch); + zone_pcp_init(zone); INIT_LIST_HEAD(&zone->active_list); INIT_LIST_HEAD(&zone->inactive_list); zone->nr_scan_active = 0; @@ -1915,35 +1993,12 @@ if (!size) continue; - /* - * The per-page waitqueue mechanism uses hashed waitqueues - * per zone. 
- */ - zone->wait_table_size = wait_table_size(size); - zone->wait_table_bits = - wait_table_bits(zone->wait_table_size); - zone->wait_table = (wait_queue_head_t *) - alloc_bootmem_node(pgdat, zone->wait_table_size - * sizeof(wait_queue_head_t)); - - for(i = 0; i < zone->wait_table_size; ++i) - init_waitqueue_head(zone->wait_table + i); - - pgdat->nr_zones = j+1; - - zone->zone_mem_map = pfn_to_page(zone_start_pfn); - zone->zone_start_pfn = zone_start_pfn; - - if ((zone_start_pfn) & (zone_required_alignment-1)) - printk(KERN_CRIT "BUG: wrong zone alignment, it will crash\n"); - memmap_init(size, nid, j, zone_start_pfn); zonetable_add(zone, nid, j, zone_start_pfn, size); - + init_currently_empty_zone(zone, zone_start_pfn, size); + //memmap_init(size, nid, zone_idx(zone), zone_start_pfn); zone_start_pfn += size; - - zone_init_free_lists(pgdat, zone, zone->spanned_pages); } } @@ -2343,7 +2398,7 @@ * that the pages_{min,low,high} values for each zone are set correctly * with respect to min_free_kbytes. */ -static void setup_per_zone_pages_min(void) +void setup_per_zone_pages_min(void) { unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10); unsigned long lowmem_pages = 0; @@ -2552,3 +2607,32 @@ return table; } + +static inline int zone_previously_initialized(struct zone *zone) +{ + if (zone->wait_table_size) + return 1; + + return 0; +} + +#ifdef CONFIG_MEMORY_HOTPLUG +int hot_add_zone_init(struct zone *zone, unsigned long phys_start_pfn, unsigned long size_pages) +{ + if (zone_previously_initialized(zone)) + return -EEXIST; + + zone_wait_table_init(zone, PAGES_PER_SECTION); + init_currently_empty_zone(zone, phys_start_pfn, PAGES_PER_SECTION); + zone_pcp_init(zone); + + /* + * FIXME: there is no locking at all for the zonelists. + * Least impactful (codewise) way to do this is probably + * to freeze all the CPUs for a sec while this is done. + */ + build_zonelists(zone->zone_pgdat); + + return 0; +} +#endif --- linux.orig/mm/rmap.c~AA-PM-14-try_to_unmap_force 2005-07-28 14:29:56.000000000 -0700 +++ linux/mm/rmap.c 2005-07-28 14:29:57.000000000 -0700 @@ -46,6 +46,7 @@ */ #include +#include #include #include #include @@ -506,11 +507,81 @@ } } +struct page_va_list { + struct mm_struct *mm; + unsigned long addr; + struct list_head list; +}; + +/* + * This function is invoked to record an address space and a mapped address + * to which a target page belongs, when it is unmapped forcibly. + */ +static int +record_unmapped_address(struct list_head *force, struct mm_struct *mm, + unsigned long address) +{ + struct page_va_list *vlist; + + vlist = kmalloc(sizeof(struct page_va_list), GFP_KERNEL); + if (vlist == NULL) + return -ENOMEM; + spin_lock(&mmlist_lock); + if (!atomic_read(&mm->mm_users)) + vlist->mm = NULL; + else { + vlist->mm = mm; + atomic_inc(&mm->mm_users); + } + spin_unlock(&mmlist_lock); + + if (vlist->mm == NULL) + kfree(vlist); + else { + vlist->addr = address; + list_add(&vlist->list, force); + } + return 0; +} + +/* + * This function touches an address recorded in the vlist to map + * a page into an address space again. 
+ */ +int +touch_unmapped_address(struct list_head *vlist) +{ + struct page_va_list *v1, *v2; + struct vm_area_struct *vma; + int ret = 0; + int error; + + list_for_each_entry_safe(v1, v2, vlist, list) { + list_del(&v1->list); + down_read(&v1->mm->mmap_sem); + if (atomic_read(&v1->mm->mm_users) == 1) + goto out; + vma = find_vma(v1->mm, v1->addr); + if (vma == NULL) + goto out; + error = get_user_pages(current, v1->mm, v1->addr, 1, + 0, 0, NULL, NULL); + if (error < 0) + ret = error; + out: + up_read(&v1->mm->mmap_sem); + mmput(v1->mm); + kfree(v1); + } + return ret; +} + /* * Subfunctions of try_to_unmap: try_to_unmap_one called * repeatedly from either try_to_unmap_anon or try_to_unmap_file. */ -static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma) +static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, + struct list_head *force) { struct mm_struct *mm = vma->vm_mm; unsigned long address; @@ -528,15 +599,18 @@ if (IS_ERR(pte)) goto out; + if (force && record_unmapped_address(force, mm, address)) + goto out_unmap; + /* * If the page is mlock()d, we cannot swap it out. * If it's recently referenced (perhaps page_referenced * skipped over this mm) then we should reactivate it. */ - if ((vma->vm_flags & (VM_LOCKED|VM_RESERVED)) || - ptep_clear_flush_young(vma, address, pte)) { - ret = SWAP_FAIL; - goto out_unmap; + if (((vma->vm_flags & (VM_LOCKED|VM_RESERVED)) || + ptep_clear_flush_young(vma, address, pte)) && force == NULL) { + ret = SWAP_FAIL; + goto out_unmap; } /* Nuke the page table entry. */ @@ -678,7 +752,7 @@ spin_unlock(&mm->page_table_lock); } -static int try_to_unmap_anon(struct page *page) +static int try_to_unmap_anon(struct page *page, struct list_head *force) { struct anon_vma *anon_vma; struct vm_area_struct *vma; @@ -689,7 +763,7 @@ return ret; list_for_each_entry(vma, &anon_vma->head, anon_vma_node) { - ret = try_to_unmap_one(page, vma); + ret = try_to_unmap_one(page, vma, force); if (ret == SWAP_FAIL || !page_mapped(page)) break; } @@ -706,7 +780,7 @@ * * This function is only called from try_to_unmap for object-based pages. */ -static int try_to_unmap_file(struct page *page) +static int try_to_unmap_file(struct page *page, struct list_head *force) { struct address_space *mapping = page->mapping; pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); @@ -720,7 +794,7 @@ spin_lock(&mapping->i_mmap_lock); vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { - ret = try_to_unmap_one(page, vma); + ret = try_to_unmap_one(page, vma, force); if (ret == SWAP_FAIL || !page_mapped(page)) goto out; } @@ -809,7 +883,7 @@ * SWAP_AGAIN - we missed a mapping, try again later * SWAP_FAIL - the page is unswappable */ -int try_to_unmap(struct page *page) +int try_to_unmap(struct page *page, struct list_head *force) { int ret; @@ -817,9 +891,9 @@ BUG_ON(!PageLocked(page)); if (PageAnon(page)) - ret = try_to_unmap_anon(page); + ret = try_to_unmap_anon(page, force); else - ret = try_to_unmap_file(page); + ret = try_to_unmap_file(page, force); if (!page_mapped(page)) ret = SWAP_SUCCESS; --- linux.orig/mm/shmem.c~AA-PM-09-migrate-swapcache-validate 2005-07-28 14:29:50.000000000 -0700 +++ linux/mm/shmem.c 2005-07-28 14:30:03.000000000 -0700 @@ -93,7 +93,16 @@ * BLOCKS_PER_PAGE on indirect pages, assume PAGE_CACHE_SIZE: * might be reconsidered if it ever diverges from PAGE_SIZE. 
*/ +#ifdef CONFIG_MEMORY_HOTPLUG + /* + * XXXX: This is temprary code, which should be replaced with proper one + * after the scheme to specify hot removable region has defined. + * 25/Sep/2004 -- taka + */ + return alloc_pages(gfp_mask & ~__GFP_HIGHMEM, PAGE_CACHE_SHIFT-PAGE_SHIFT); +#else return alloc_pages(gfp_mask, PAGE_CACHE_SHIFT-PAGE_SHIFT); +#endif } static inline void shmem_dir_free(struct page *page) @@ -1017,6 +1026,14 @@ page_cache_release(swappage); goto repeat; } + if (!PageSwapCache(swappage)) { + /* page-migration has occured */ + shmem_swp_unmap(entry); + spin_unlock(&info->lock); + unlock_page(swappage); + page_cache_release(swappage); + goto repeat; + } if (PageWriteback(swappage)) { shmem_swp_unmap(entry); spin_unlock(&info->lock); --- linux.orig/mm/sparse.c~A3-sparsemem-extreme 2005-07-28 13:50:13.000000000 -0700 +++ linux/mm/sparse.c 2005-07-28 14:29:35.000000000 -0700 @@ -5,7 +5,9 @@ #include #include #include +#include #include +#include #include /* @@ -13,9 +15,66 @@ * * 1) mem_section - memory sections, mem_map's for valid memory */ -struct mem_section mem_section[NR_MEM_SECTIONS]; +#ifdef CONFIG_ARCH_SPARSEMEM_EXTREME +struct mem_section *mem_section[NR_SECTION_ROOTS] + ____cacheline_maxaligned_in_smp; +#else +struct mem_section mem_section[NR_SECTION_ROOTS][SECTIONS_PER_ROOT] + ____cacheline_maxaligned_in_smp; +#endif EXPORT_SYMBOL(mem_section); +#ifdef CONFIG_ARCH_SPARSEMEM_EXTREME +static struct mem_section *sparse_index_alloc(int nid) +{ + struct mem_section *section = NULL; + unsigned long array_size = SECTIONS_PER_ROOT * + sizeof(struct mem_section); + + if (system_state < SYSTEM_RUNNING) + section = alloc_bootmem_node(NODE_DATA(nid), array_size); + else + section = kmalloc_node(array_size, GFP_KERNEL, nid); + + if (section) + memset(section, 0, array_size); + + return section; +} + +static int sparse_index_init(unsigned long section_nr, int nid) +{ + static spinlock_t index_init_lock = SPIN_LOCK_UNLOCKED; + unsigned long root = SECTION_NR_TO_ROOT(section_nr); + struct mem_section *section; + int ret = 0; + + section = sparse_index_alloc(nid); + /* + * This lock keeps two different sections from + * reallocating for the same index + */ + spin_lock(&index_init_lock); + + if (mem_section[root]) { + if (system_state >= SYSTEM_RUNNING) + kfree(mem_section); + ret = -EEXIST; + goto out; + } + + mem_section[root] = section; +out: + spin_unlock(&index_init_lock); + return ret; +} +#else /* !SPARSEMEM_EXTREME */ +static inline int sparse_index_init(unsigned long section_nr, int nid) +{ + return 0; +} +#endif + /* Record a memory area against a node. 
*/ void memory_present(int nid, unsigned long start, unsigned long end) { @@ -23,9 +82,14 @@ start &= PAGE_SECTION_MASK; for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) { - unsigned long section = pfn_to_section_nr(pfn); - if (!mem_section[section].section_mem_map) - mem_section[section].section_mem_map = SECTION_MARKED_PRESENT; + unsigned long section_nr = pfn_to_section_nr(pfn); + struct mem_section *ms; + + sparse_index_init(section_nr, nid); + + ms = __nr_to_section(section_nr); + if (!ms->section_mem_map) + ms->section_mem_map = SECTION_MARKED_PRESENT; } } @@ -85,6 +149,7 @@ { struct page *map; int nid = early_pfn_to_nid(section_nr_to_pfn(pnum)); + struct mem_section *ms = __nr_to_section(pnum); map = alloc_remap(nid, sizeof(struct page) * PAGES_PER_SECTION); if (map) @@ -96,8 +161,47 @@ return map; printk(KERN_WARNING "%s: allocation failed\n", __FUNCTION__); - mem_section[pnum].section_mem_map = 0; + ms->section_mem_map = 0; + return NULL; +} + +static struct page *__kmalloc_section_memmap(unsigned long nr_pages) +{ + struct page *page, *ret; + unsigned long memmap_size = sizeof(struct page) * nr_pages; + + page = alloc_pages(GFP_KERNEL, get_order(memmap_size)); + if (page) + goto got_map_page; + + ret = vmalloc(memmap_size); + if (ret) + goto got_map_ptr; + return NULL; +got_map_page: + ret = (struct page *)pfn_to_kaddr(page_to_pfn(page)); +got_map_ptr: + memset(ret, 0, memmap_size); + + return ret; +} + +static int vaddr_in_vmalloc_area(void *addr) +{ + if (addr >= (void *)VMALLOC_START && + addr < (void *)VMALLOC_END) + return 1; + return 0; +} + +static void __kfree_section_memmap(struct page *memmap, unsigned long nr_pages) +{ + if (vaddr_in_vmalloc_area(memmap)) + vfree(memmap); + else + free_pages((unsigned long)memmap, + get_order(sizeof(struct page) * nr_pages)); } /* @@ -114,8 +218,9 @@ continue; map = sparse_early_mem_map_alloc(pnum); - if (map) - sparse_init_one_section(&mem_section[pnum], pnum, map); + if (!map) + continue; + sparse_init_one_section(__nr_to_section(pnum), pnum, map); } } @@ -124,14 +229,37 @@ * set. If this is <=0, then that means that the passed-in * map was not consumed and must be freed. 
 */
-int sparse_add_one_section(unsigned long start_pfn, int nr_pages, struct page *map)
+int sparse_add_one_section(struct zone *zone, unsigned long start_pfn,
+			   int nr_pages)
 {
-	struct mem_section *ms = __pfn_to_section(start_pfn);
-
-	if (ms->section_mem_map & SECTION_MARKED_PRESENT)
-		return -EEXIST;
-
+	unsigned long section_nr = pfn_to_section_nr(start_pfn);
+	struct pglist_data *pgdat = zone->zone_pgdat;
+	struct mem_section *ms;
+	struct page *memmap;
+	unsigned long flags;
+	int ret;
+
+	/*
+	 * No locking here: sparse_index_init() does its own
+	 * locking, and it may sleep (it does a kmalloc).
+	 */
+	sparse_index_init(section_nr, pgdat->node_id);
+	memmap = __kmalloc_section_memmap(nr_pages);
+
+	pgdat_resize_lock(pgdat, &flags);
+
+	ms = __pfn_to_section(start_pfn);
+	if (ms->section_mem_map & SECTION_MARKED_PRESENT) {
+		ret = -EEXIST;
+		goto out;
+	}
 	ms->section_mem_map |= SECTION_MARKED_PRESENT;
 
-	return sparse_init_one_section(ms, pfn_to_section_nr(start_pfn), map);
+	ret = sparse_init_one_section(ms, section_nr, memmap);
+
+	if (ret <= 0)
+		__kfree_section_memmap(memmap, nr_pages);
+out:
+	pgdat_resize_unlock(pgdat, &flags);
+	return ret;
 }
--- linux.orig/mm/swap_state.c~AA-PM-05-swapper_space-gfpmask	2005-07-28 14:29:47.000000000 -0700
+++ linux/mm/swap_state.c	2005-07-28 14:29:48.000000000 -0700
@@ -37,6 +37,7 @@
 	.page_tree	= RADIX_TREE_INIT(GFP_ATOMIC|__GFP_NOWARN),
 	.tree_lock	= RW_LOCK_UNLOCKED,
 	.a_ops		= &swap_aops,
+	.flags		= GFP_HIGHUSER,
 	.i_mmap_nonlinear = LIST_HEAD_INIT(swapper_space.i_mmap_nonlinear),
 	.backing_dev_info = &swap_backing_dev_info,
 };
@@ -140,7 +141,7 @@
  * Allocate swap space for the page and add the page to the
  * swap cache. Caller needs to hold the page lock.
  */
-int add_to_swap(struct page * page)
+int add_to_swap(struct page * page, unsigned int gfp_mask)
 {
 	swp_entry_t entry;
 	int err;
@@ -165,7 +166,7 @@
 	 * Add it to the swap cache and mark it dirty
 	 */
 	err = __add_to_swap_cache(page, entry,
-			GFP_ATOMIC|__GFP_NOMEMALLOC|__GFP_NOWARN);
+			gfp_mask|__GFP_NOMEMALLOC|__GFP_NOWARN);
 
 	switch (err) {
 	case 0:				/* Success */
--- linux.orig/mm/swapfile.c~AA-PM-09-migrate-swapcache-validate	2005-07-28 14:29:50.000000000 -0700
+++ linux/mm/swapfile.c	2005-07-28 14:29:57.000000000 -0700
@@ -313,11 +313,12 @@
 * Work out if there are any other processes sharing this
 * swap cache page. Free it if you can. Return success.
 */
-int remove_exclusive_swap_page(struct page *page)
+int __remove_exclusive_swap_page(struct page *page, int force)
 {
 	int retval;
 	struct swap_info_struct * p;
 	swp_entry_t entry;
+	int mapcount = force ? page_mapcount(page) : 0;
 
 	BUG_ON(PagePrivate(page));
 	BUG_ON(!PageLocked(page));
@@ -326,7 +327,7 @@
 		return 0;
 	if (PageWriteback(page))
 		return 0;
-	if (page_count(page) != 2) /* 2: us + cache */
+	if (page_count(page) - mapcount != 2) /* 2: us + cache */
 		return 0;
 
 	entry.val = page->private;
@@ -339,7 +340,8 @@
 	if (p->swap_map[swp_offset(entry)] == 1) {
 		/* Recheck the page count with the swapcache lock held.. */
 		write_lock_irq(&swapper_space.tree_lock);
-		if ((page_count(page) == 2) && !PageWriteback(page)) {
+		mapcount = force ? 
page_mapcount(page) : 0;
+		if ((page_count(page) - mapcount == 2) && !PageWriteback(page)) {
 			__delete_from_swap_cache(page);
 			SetPageDirty(page);
 			retval = 1;
@@ -618,6 +620,7 @@
 		 */
 		swap_map = &si->swap_map[i];
 		entry = swp_entry(type, i);
+again:
 		page = read_swap_cache_async(entry, NULL, 0);
 		if (!page) {
 			/*
@@ -652,6 +655,12 @@
 		wait_on_page_locked(page);
 		wait_on_page_writeback(page);
 		lock_page(page);
+		if (!PageSwapCache(page)) {
+			/* page migration has occurred */
+			unlock_page(page);
+			page_cache_release(page);
+			goto again;
+		}
 		wait_on_page_writeback(page);
 
 		/*
--- linux.orig/mm/thrash.c~AA-PM-15-swap_token-kthread	2005-07-28 14:29:57.000000000 -0700
+++ linux/mm/thrash.c	2005-07-28 14:29:57.000000000 -0700
@@ -54,6 +54,9 @@
 	struct mm_struct *mm;
 	int reason;
 
+	if (current->mm == NULL)
+		return;
+
 	/* We have the token. Let others know we still need it. */
 	if (has_swap_token(current->mm)) {
 		current->mm->recent_pagein = 1;
--- linux.orig/mm/truncate.c~AA-PM-11.0-migrate-truncate	2005-07-28 14:29:52.000000000 -0700
+++ linux/mm/truncate.c	2005-07-28 14:29:53.000000000 -0700
@@ -90,6 +90,34 @@
 	return 1;
 }
 
+static inline struct page *lock_replace_page(struct page **p, struct address_space *mapping)
+{
+	struct page *page = *p;
+	struct page *newpage;
+
+	lock_page(page);
+
+	if (page->mapping != NULL)
+		return page;
+
+	unlock_page(page);
+
+	newpage = find_lock_page(mapping, page->index);
+	if (!newpage) {
+		/*
+		 * put the page back the way it was and let
+		 * the normal truncate code handle it
+		 */
+		lock_page(page);
+		return page;
+	}
+
+	/* memory migration has been rolled back. */
+	page_cache_release(page);
+	*p = newpage;
+	return newpage;
+}
+
 /**
  * truncate_inode_pages - truncate *all* the pages from an offset
  * @mapping: mapping to truncate
@@ -140,6 +168,9 @@
 				unlock_page(page);
 				continue;
 			}
+			/* page->mapping check is done in
+			 * truncate_complete_page() when the page has been
+			 * migrated. */
 			truncate_complete_page(mapping, page);
 			unlock_page(page);
 		}
@@ -167,9 +198,9 @@
 			continue;
 		}
 		for (i = 0; i < pagevec_count(&pvec); i++) {
-			struct page *page = pvec.pages[i];
 
-			lock_page(page);
+			struct page *page;
+
+			page = lock_replace_page(&pvec.pages[i], mapping);
 			wait_on_page_writeback(page);
 			if (page->index > next)
 				next = page->index;
@@ -267,11 +298,11 @@
 		pagevec_lookup(&pvec, mapping, next,
 			min(end - next, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) {
 		for (i = 0; !ret && i < pagevec_count(&pvec); i++) {
-			struct page *page = pvec.pages[i];
+			struct page *page;
 			pgoff_t page_index;
 			int was_dirty;
 
-			lock_page(page);
+			page = lock_replace_page(&pvec.pages[i], mapping);
 			if (page->mapping != mapping) {
 				unlock_page(page);
 				continue;
--- linux.orig/mm/vmalloc.c~AA-PM-26-vmalloc	2005-07-28 14:30:03.000000000 -0700
+++ linux/mm/vmalloc.c	2005-07-28 14:30:03.000000000 -0700
@@ -477,7 +477,16 @@
  */
 void *vmalloc(unsigned long size)
 {
+#ifdef CONFIG_MEMORY_HOTPLUG
+	/*
+	 * XXXX: This is temporary code, which should be replaced with a proper one
+	 *       after the scheme to specify hot-removable regions has been defined. 
+ * 25/Sep/2004 -- taka + */ + return __vmalloc(size, GFP_KERNEL, PAGE_KERNEL); +#else return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL); +#endif } EXPORT_SYMBOL(vmalloc); --- linux.orig/mm/vmscan.c~AA-PM-01-steal_page_from_lru 2005-07-28 14:29:40.000000000 -0700 +++ linux/mm/vmscan.c 2005-07-28 14:30:05.000000000 -0700 @@ -39,51 +39,6 @@ #include -/* possible outcome of pageout() */ -typedef enum { - /* failed to write page out, page is locked */ - PAGE_KEEP, - /* move page to the active list, page is locked */ - PAGE_ACTIVATE, - /* page has been sent to the disk successfully, page is unlocked */ - PAGE_SUCCESS, - /* page is clean and locked */ - PAGE_CLEAN, -} pageout_t; - -struct scan_control { - /* Ask refill_inactive_zone, or shrink_cache to scan this many pages */ - unsigned long nr_to_scan; - - /* Incremented by the number of inactive pages that were scanned */ - unsigned long nr_scanned; - - /* Incremented by the number of pages reclaimed */ - unsigned long nr_reclaimed; - - unsigned long nr_mapped; /* From page_state */ - - /* How many pages shrink_cache() should reclaim */ - int nr_to_reclaim; - - /* Ask shrink_caches, or shrink_zone to scan at this priority */ - unsigned int priority; - - /* This context's GFP mask */ - unsigned int gfp_mask; - - int may_writepage; - - /* Can pages be swapped as part of reclaim? */ - int may_swap; - - /* This context's SWAP_CLUSTER_MAX. If freeing memory for - * suspend, we effectively ignore SWAP_CLUSTER_MAX. - * In this context, it doesn't matter that we scan the - * whole list at once. */ - int swap_cluster_max; -}; - /* * The list of shrinker callbacks used by to apply pressure to * ageable caches. @@ -302,7 +257,7 @@ /* * pageout is called by shrink_list() for each dirty page. Calls ->writepage(). */ -static pageout_t pageout(struct page *page, struct address_space *mapping) +pageout_t pageout(struct page *page, struct address_space *mapping) { /* * If the page is dirty, only perform writeback if that write @@ -373,7 +328,7 @@ /* * shrink_list adds the number of reclaimed pages to sc->nr_reclaimed */ -static int shrink_list(struct list_head *page_list, struct scan_control *sc) +int shrink_list(struct list_head *page_list, struct scan_control *sc) { LIST_HEAD(ret_pages); struct pagevec freed_pvec; @@ -418,7 +373,7 @@ * Try to allocate it some swap space here. */ if (PageAnon(page) && !PageSwapCache(page) && sc->may_swap) { - if (!add_to_swap(page)) + if (!add_to_swap(page, GFP_ATOMIC)) goto activate_locked; } #endif /* CONFIG_SWAP */ @@ -432,7 +387,7 @@ * processes. Try to unmap it here. 
*/ if (page_mapped(page) && mapping) { - switch (try_to_unmap(page)) { + switch (try_to_unmap(page, NULL)) { case SWAP_FAIL: goto activate_locked; case SWAP_AGAIN: @@ -582,22 +537,8 @@ while (scan++ < nr_to_scan && !list_empty(src)) { page = lru_to_page(src); prefetchw_prev_lru_page(page, src, flags); - - if (!TestClearPageLRU(page)) - BUG(); - list_del(&page->lru); - if (get_page_testone(page)) { - /* - * It is being freed elsewhere - */ - __put_page(page); - SetPageLRU(page); - list_add(&page->lru, src); - continue; - } else { - list_add(&page->lru, dst); + if (isolate_lru_onepage(page, src, dst)) nr_taken++; - } } *scanned = scan; @@ -650,13 +591,10 @@ */ while (!list_empty(&page_list)) { page = lru_to_page(&page_list); - if (TestSetPageLRU(page)) - BUG(); list_del(&page->lru); - if (PageActive(page)) - add_page_to_active_list(zone, page); - else - add_page_to_inactive_list(zone, page); + if (PageActive(page)) + ClearPageActive(page); + __putback_page_to_lru(zone, page); if (!pagevec_add(&pvec, page)) { spin_unlock_irq(&zone->lru_lock); __pagevec_release(&pvec);
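
For reference, the vmscan.c hunk above keeps reclaim's old behaviour by passing NULL as the new second argument of try_to_unmap(), while the page-migration / hot-remove path is the intended user of the 'force' list: during a forced unmap each torn-down virtual address is recorded via record_unmapped_address(), and touch_unmapped_address() (added in the rmap.c hunk earlier in this section) later faults those addresses back in. A minimal caller sketch, assuming only the interfaces shown above; forced_unmap_example() and the copy step are illustrative placeholders, not part of this patch set:

/*
 * Illustrative sketch only -- not part of this patch set.  It shows the
 * calling convention of the reworked try_to_unmap(page, force): reclaim
 * passes force == NULL, while a forced unmap for page migration passes
 * a list that collects the unmapped virtual addresses.
 */
static int forced_unmap_example(struct page *page)
{
	LIST_HEAD(force_list);	/* filled via record_unmapped_address() */
	int ret;

	BUG_ON(!PageLocked(page));	/* try_to_unmap() requires the page lock */

	ret = try_to_unmap(page, &force_list);

	/* ... the migration core would copy or relocate the page here ... */

	/*
	 * Fault the recorded addresses back in; touch_unmapped_address()
	 * also frees each page_va_list entry and drops its mm reference.
	 */
	touch_unmapped_address(&force_list);

	return (ret == SWAP_SUCCESS) ? 0 : -EAGAIN;
}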