--- linux.orig/arch/alpha/mm/numa.c~FROM-MM-memory-hotplug-locking-node_size_lock 2005-09-30 12:37:54.000000000 -0700 +++ linux/arch/alpha/mm/numa.c 2005-09-30 12:38:06.000000000 -0700 @@ -279,7 +279,7 @@ initrd_end, phys_to_virt(PFN_PHYS(max_low_pfn))); } else { - nid = kvaddr_to_nid(initrd_start); + nid = pa_to_nid(__pa(initrd_start)); reserve_bootmem_node(NODE_DATA(nid), virt_to_phys((void *)initrd_start), INITRD_SIZE); @@ -371,6 +371,8 @@ show_free_areas(); printk("Free swap: %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10)); for_each_online_node(nid) { + unsigned long flags; + pgdat_resize_lock(NODE_DATA(nid), &flags); i = node_spanned_pages(nid); while (i-- > 0) { struct page *page = nid_page_nr(nid, i); @@ -384,6 +386,7 @@ else shared += page_count(page) - 1; } + pgdat_resize_unlock(NODE_DATA(nid), &flags); } printk("%ld pages of RAM\n",total); printk("%ld free pages\n",free); --- linux.orig/arch/i386/Kconfig~C2-enable-i386-sparsemem-debug 2005-09-30 12:38:18.000000000 -0700 +++ linux/arch/i386/Kconfig 2005-09-30 12:38:22.000000000 -0700 @@ -776,6 +776,9 @@ depends on NUMA default y +config ARCH_HAS_BOOTPA + def_bool y + config ARCH_HAVE_MEMORY_PRESENT bool depends on DISCONTIGMEM @@ -799,9 +802,27 @@ def_bool y depends on NUMA +config X86_SPARSEMEM_DEBUG_NONUMA + bool "Enable SPARSEMEM on flat systems (debugging only)" + depends on !NUMA && EXPERIMENTAL + select SPARSEMEM_STATIC + select SPARSEMEM_MANUAL + +config ARCH_MEMORY_PROBE + def_bool y + depends on X86_SPARSEMEM_DEBUG_NONUMA + +config ARCH_SPARSEMEM_DEFAULT + def_bool y + depends on X86_SPARSEMEM_DEBUG_NONUMA + +config X86_SIMULATED_MEM_HOTPLUG + bool "Simulate memory hotplug on non-hotplug hardware" + depends on EXPERIMENTAL + config ARCH_SPARSEMEM_ENABLE def_bool y - depends on NUMA + depends on NUMA || X86_SPARSEMEM_DEBUG_NONUMA config ARCH_SELECT_MEMORY_MODEL def_bool y --- linux.orig/arch/i386/kernel/setup.c~B2.1-i386-discontig-consolidation 2005-09-30 12:38:12.000000000 -0700 +++ linux/arch/i386/kernel/setup.c 2005-09-30 12:38:20.000000000 -0700 @@ -51,6 +51,7 @@ #include #include #include +#include #include #include #include @@ -146,6 +147,7 @@ EXPORT_SYMBOL(ist_info); #endif struct e820map e820; +struct e820map bios_e820; extern void early_cpu_init(void); extern void dmi_scan_machine(void); @@ -365,6 +367,37 @@ } } +/* + * numa interface - we expect the numa architecture specfic code to have + * populated the following initialisation. + * + * 1) node_online_map - the map of all nodes configured (online) in the system + * 2) node_start_pfn - the starting page frame number for a node + * 3) node_end_pfn - the ending page fram number for a node + */ +unsigned long node_start_pfn[MAX_NUMNODES] __read_mostly; +unsigned long node_end_pfn[MAX_NUMNODES] __read_mostly; +bootmem_data_t node0_bdata; + +/* + * FLAT - support for basic PC memory model with discontig enabled, essentially + * a single node with all available processors in it with a flat + * memory map. + */ +int __init get_memcfg_numa_flat(void) +{ + printk("NUMA - single node, flat memory mode\n"); + + /* Run the memory configuration and find the top of memory. */ + node_start_pfn[0] = 0; + node_end_pfn[0] = max_pfn; + + /* Indicate there is one node available. 
*/ + nodes_clear(node_online_map); + node_set_online(0); + return 1; +} + static void __init limit_regions(unsigned long long size) { unsigned long long current_addr = 0; @@ -946,6 +979,12 @@ return 0; } +static int __init +efi_memory_present_wrapper(unsigned long start, unsigned long end, void *arg) +{ + memory_present(0, start, end); + return 0; +} /* * Find the highest page frame number we have available @@ -957,6 +996,7 @@ max_pfn = 0; if (efi_enabled) { efi_memmap_walk(efi_find_max_pfn, &max_pfn); + efi_memmap_walk(efi_memory_present_wrapper, NULL); return; } @@ -971,6 +1011,7 @@ continue; if (end > max_pfn) max_pfn = end; + memory_present(0, start, end); } } @@ -1106,59 +1147,112 @@ reserve_bootmem(addr, PAGE_SIZE); } -#ifndef CONFIG_NEED_MULTIPLE_NODES -void __init setup_bootmem_allocator(void); -static unsigned long __init setup_memory(void) +static void __init find_max_pfn_node(int nid) { + if (node_end_pfn[nid] > max_pfn) + node_end_pfn[nid] = max_pfn; /* - * partially used pages are not usable - thus - * we are rounding upwards: + * if a user has given mem=XXXX, then we need to make sure + * that the node _starts_ before that, too, not just ends */ - min_low_pfn = PFN_UP(init_pg_tables_end); + if (node_start_pfn[nid] > max_pfn) + node_start_pfn[nid] = max_pfn; + if (node_start_pfn[nid] > node_end_pfn[nid]) + BUG(); +} + +void __init setup_bootmem_allocator(void); +unsigned long __init setup_memory(void) +{ + int nid; + unsigned long reserve_pages; + /* + * When mapping a NUMA machine we allocate the node_mem_map arrays + * from node local memory. They are then mapped directly into KVA + * between zone normal and vmalloc space. Calculate the size of + * this space and use it to adjust the boundry between ZONE_NORMAL + * and ZONE_HIGHMEM. + */ find_max_pfn(); + get_memcfg_numa(); + for_each_online_node(nid) + num_physpages = max(num_physpages, node_end_pfn[nid]); - max_low_pfn = find_max_low_pfn(); + reserve_pages = calculate_numa_remap_pages(); + /* partially used pages are not usable - thus round upwards */ + min_low_pfn = PFN_UP(init_pg_tables_end); + max_low_pfn = find_max_low_pfn() - reserve_pages; + + if (reserve_pages) + printk("reserve_pages = %ld find_max_low_pfn() ~ %ld\n", + reserve_pages, max_low_pfn + reserve_pages); + printk(KERN_DEBUG "max_pfn = %ld\n", max_pfn); #ifdef CONFIG_HIGHMEM - highstart_pfn = highend_pfn = max_pfn; - if (max_pfn > max_low_pfn) { - highstart_pfn = max_low_pfn; - } printk(KERN_NOTICE "%ldMB HIGHMEM available.\n", - pages_to_mb(highend_pfn - highstart_pfn)); + pages_to_mb(max_pfn - max_low_pfn)); #endif printk(KERN_NOTICE "%ldMB LOWMEM available.\n", - pages_to_mb(max_low_pfn)); + pages_to_mb(max_low_pfn - min_low_pfn)); + printk(KERN_DEBUG "min_low_pfn = %ld, max_low_pfn = %ld\n", + min_low_pfn, max_low_pfn); + + printk(KERN_NOTICE "Low memory ends at vaddr %08lx\n", + (ulong) pfn_to_kaddr(max_low_pfn)); + setup_numa_kva_remap(); + printk("High memory starts at vaddr %08lx\n", + (ulong) pfn_to_kaddr(max_low_pfn)); + vmalloc_earlyreserve = reserve_pages * PAGE_SIZE; + for_each_online_node(nid) + find_max_pfn_node(nid); setup_bootmem_allocator(); - return max_low_pfn; } -void __init zone_sizes_init(void) +static inline unsigned long max_hardware_dma_pfn(void) +{ + return virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT; +} +static inline unsigned long nid_size_pages(int nid) +{ + return node_end_pfn[nid] - node_start_pfn[nid]; +} +static inline int nid_starts_in_highmem(int nid) +{ + return node_start_pfn[nid] >= max_low_pfn; +} + +void 
__init nid_zone_sizes_init(int nid) { unsigned long zones_size[MAX_NR_ZONES] = {0, 0, 0}; - unsigned int max_dma, low; + unsigned long max_dma; + unsigned long start = node_start_pfn[nid]; + unsigned long end = node_end_pfn[nid]; + + if (node_has_online_mem(nid)){ + if (nid_starts_in_highmem(nid)) { + zones_size[ZONE_HIGHMEM] = nid_size_pages(nid); + } else { + max_dma = min(max_hardware_dma_pfn(), max_low_pfn); + zones_size[ZONE_DMA] = max_dma; + zones_size[ZONE_NORMAL] = max_low_pfn - max_dma; + zones_size[ZONE_HIGHMEM] = end - max_low_pfn; + } + } - max_dma = virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT; - low = max_low_pfn; + free_area_init_node(nid, NODE_DATA(nid), zones_size, start, + get_zholes_size(nid)); +} - if (low < max_dma) - zones_size[ZONE_DMA] = low; - else { - zones_size[ZONE_DMA] = max_dma; - zones_size[ZONE_NORMAL] = low - max_dma; -#ifdef CONFIG_HIGHMEM - zones_size[ZONE_HIGHMEM] = highend_pfn - low; -#endif - } - free_area_init(zones_size); +void __init zone_sizes_init(void) +{ + int nid; + + for_each_online_node(nid) + nid_zone_sizes_init(nid); } -#else -extern unsigned long __init setup_memory(void); -extern void zone_sizes_init(void); -#endif /* !CONFIG_NEED_MULTIPLE_NODES */ void __init setup_bootmem_allocator(void) { @@ -1520,6 +1614,7 @@ else { printk(KERN_INFO "BIOS-provided physical RAM map:\n"); print_memory_map(machine_specific_memory_setup()); + bios_e820 = e820; } copy_edd(); --- linux.orig/arch/i386/kernel/sys_i386.c~AA-PM-22-vm_immovable 2005-09-30 12:40:01.000000000 -0700 +++ linux/arch/i386/kernel/sys_i386.c 2005-09-30 12:40:01.000000000 -0700 @@ -70,7 +70,7 @@ unsigned long prot, unsigned long flags, unsigned long fd, unsigned long pgoff) { - return do_mmap2(addr, len, prot, flags, fd, pgoff); + return do_mmap2(addr, len, prot, flags & ~MAP_IMMOVABLE, fd, pgoff); } /* @@ -101,7 +101,8 @@ if (a.offset & ~PAGE_MASK) goto out; - err = do_mmap2(a.addr, a.len, a.prot, a.flags, a.fd, a.offset >> PAGE_SHIFT); + err = do_mmap2(a.addr, a.len, a.prot, a.flags & ~MAP_IMMOVABLE, + a.fd, a.offset >> PAGE_SHIFT); out: return err; } --- linux.orig/arch/i386/mm/Makefile~B2.2-i386-create-numa.c 2005-09-30 12:38:13.000000000 -0700 +++ linux/arch/i386/mm/Makefile 2005-09-30 12:38:13.000000000 -0700 @@ -4,7 +4,8 @@ obj-y := init.o pgtable.o fault.o ioremap.o extable.o pageattr.o mmap.o -obj-$(CONFIG_NUMA) += discontig.o +obj-$(CONFIG_DISCONTIGMEM) += discontig.o +obj-$(CONFIG_NUMA) += numa.o obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o obj-$(CONFIG_HIGHMEM) += highmem.o obj-$(CONFIG_BOOT_IOREMAP) += boot_ioremap.o --- linux.orig/arch/i386/mm/discontig.c~FROM-MM-memory-hotplug-i386-addition-functions 2005-09-30 12:37:59.000000000 -0700 +++ linux/arch/i386/mm/discontig.c 2005-09-30 12:38:13.000000000 -0700 @@ -32,28 +32,10 @@ #include #include -#include #include #include #include -struct pglist_data *node_data[MAX_NUMNODES] __read_mostly; -EXPORT_SYMBOL(node_data); -bootmem_data_t node0_bdata; - -/* - * numa interface - we expect the numa architecture specfic code to have - * populated the following initialisation. 
- * - * 1) node_online_map - the map of all nodes configured (online) in the system - * 2) node_start_pfn - the starting page frame number for a node - * 3) node_end_pfn - the ending page fram number for a node - */ -unsigned long node_start_pfn[MAX_NUMNODES] __read_mostly; -unsigned long node_end_pfn[MAX_NUMNODES] __read_mostly; - - -#ifdef CONFIG_DISCONTIGMEM /* * 4) physnode_map - the mapping between a pfn and owning node * physnode_map keeps track of the physical memory layout of a generic @@ -94,342 +76,3 @@ return (nr_pages + 1) * sizeof(struct page); } -#endif - -extern unsigned long find_max_low_pfn(void); -extern void find_max_pfn(void); -extern void one_highpage_init(struct page *, int, int); - -extern struct e820map e820; -extern unsigned long init_pg_tables_end; -extern unsigned long highend_pfn, highstart_pfn; -extern unsigned long max_low_pfn; -extern unsigned long totalram_pages; -extern unsigned long totalhigh_pages; - -#define LARGE_PAGE_BYTES (PTRS_PER_PTE * PAGE_SIZE) - -unsigned long node_remap_start_pfn[MAX_NUMNODES]; -unsigned long node_remap_size[MAX_NUMNODES]; -unsigned long node_remap_offset[MAX_NUMNODES]; -void *node_remap_start_vaddr[MAX_NUMNODES]; -void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags); - -void *node_remap_end_vaddr[MAX_NUMNODES]; -void *node_remap_alloc_vaddr[MAX_NUMNODES]; - -/* - * FLAT - support for basic PC memory model with discontig enabled, essentially - * a single node with all available processors in it with a flat - * memory map. - */ -int __init get_memcfg_numa_flat(void) -{ - printk("NUMA - single node, flat memory mode\n"); - - /* Run the memory configuration and find the top of memory. */ - find_max_pfn(); - node_start_pfn[0] = 0; - node_end_pfn[0] = max_pfn; - memory_present(0, 0, max_pfn); - - /* Indicate there is one node available. */ - nodes_clear(node_online_map); - node_set_online(0); - return 1; -} - -/* - * Find the highest page frame number we have available for the node - */ -static void __init find_max_pfn_node(int nid) -{ - if (node_end_pfn[nid] > max_pfn) - node_end_pfn[nid] = max_pfn; - /* - * if a user has given mem=XXXX, then we need to make sure - * that the node _starts_ before that, too, not just ends - */ - if (node_start_pfn[nid] > max_pfn) - node_start_pfn[nid] = max_pfn; - if (node_start_pfn[nid] > node_end_pfn[nid]) - BUG(); -} - -/* Find the owning node for a pfn. */ -int early_pfn_to_nid(unsigned long pfn) -{ - int nid; - - for_each_node(nid) { - if (node_end_pfn[nid] == 0) - break; - if (node_start_pfn[nid] <= pfn && node_end_pfn[nid] >= pfn) - return nid; - } - - return 0; -} - -/* - * Allocate memory for the pg_data_t for this node via a crude pre-bootmem - * method. For node zero take this from the bottom of memory, for - * subsequent nodes place them at node_remap_start_vaddr which contains - * node local data in physically node local memory. See setup_memory() - * for details. 
- */ -static void __init allocate_pgdat(int nid) -{ - if (nid && node_has_online_mem(nid)) - NODE_DATA(nid) = (pg_data_t *)node_remap_start_vaddr[nid]; - else { - NODE_DATA(nid) = (pg_data_t *)(__va(min_low_pfn << PAGE_SHIFT)); - min_low_pfn += PFN_UP(sizeof(pg_data_t)); - } -} - -void *alloc_remap(int nid, unsigned long size) -{ - void *allocation = node_remap_alloc_vaddr[nid]; - - size = ALIGN(size, L1_CACHE_BYTES); - - if (!allocation || (allocation + size) >= node_remap_end_vaddr[nid]) - return 0; - - node_remap_alloc_vaddr[nid] += size; - memset(allocation, 0, size); - - return allocation; -} - -void __init remap_numa_kva(void) -{ - void *vaddr; - unsigned long pfn; - int node; - - for_each_online_node(node) { - for (pfn=0; pfn < node_remap_size[node]; pfn += PTRS_PER_PTE) { - vaddr = node_remap_start_vaddr[node]+(pfn< max_pfn) - continue; - if (node_end_pfn[nid] > max_pfn) - node_end_pfn[nid] = max_pfn; - - /* ensure the remap includes space for the pgdat. */ - size = node_remap_size[nid] + sizeof(pg_data_t); - - /* convert size to large (pmd size) pages, rounding up */ - size = (size + LARGE_PAGE_BYTES - 1) / LARGE_PAGE_BYTES; - /* now the roundup is correct, convert to PAGE_SIZE pages */ - size = size * PTRS_PER_PTE; - - /* - * Validate the region we are allocating only contains valid - * pages. - */ - for (pfn = node_end_pfn[nid] - size; - pfn < node_end_pfn[nid]; pfn++) - if (!page_is_ram(pfn)) - break; - - if (pfn != node_end_pfn[nid]) - size = 0; - - printk("Reserving %ld pages of KVA for lmem_map of node %d\n", - size, nid); - node_remap_size[nid] = size; - node_remap_offset[nid] = reserve_pages; - reserve_pages += size; - printk("Shrinking node %d from %ld pages to %ld pages\n", - nid, node_end_pfn[nid], node_end_pfn[nid] - size); - - if (node_end_pfn[nid] & (PTRS_PER_PTE-1)) { - /* - * Align node_end_pfn[] and node_remap_start_pfn[] to - * pmd boundary. remap_numa_kva will barf otherwise. - */ - printk("Shrinking node %d further by %ld pages for proper alignment\n", - nid, node_end_pfn[nid] & (PTRS_PER_PTE-1)); - size += node_end_pfn[nid] & (PTRS_PER_PTE-1); - } - - node_end_pfn[nid] -= size; - node_remap_start_pfn[nid] = node_end_pfn[nid]; - } - printk("Reserving total of %ld pages for numa KVA remap\n", - reserve_pages); - return reserve_pages; -} - -extern void setup_bootmem_allocator(void); -unsigned long __init setup_memory(void) -{ - int nid; - unsigned long system_start_pfn, system_max_low_pfn; - unsigned long reserve_pages; - - /* - * When mapping a NUMA machine we allocate the node_mem_map arrays - * from node local memory. They are then mapped directly into KVA - * between zone normal and vmalloc space. Calculate the size of - * this space and use it to adjust the boundry between ZONE_NORMAL - * and ZONE_HIGHMEM. 
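
The setup_memory() comment above describes carving KVA for each node's mem_map out of lowmem, which is why max_low_pfn ends up lowered by reserve_pages. Below is a minimal userspace sketch of the sizing arithmetic calculate_numa_remap_pages() performs; the 4kB page, 4MB large page and 32-byte struct page values are assumptions for illustration only, not taken from the patch.

#include <stdio.h>

#define PAGE_SIZE	4096UL
#define PTRS_PER_PTE	1024UL
#define PMD_SIZE	(PTRS_PER_PTE * PAGE_SIZE)	/* one large page: 4MB */
#define SIZEOF_PAGE	32UL	/* assumed sizeof(struct page) */
#define SIZEOF_PGDAT	4096UL	/* assumed sizeof(pg_data_t)  */

/* Per-node KVA reservation: mem_map plus the pgdat, rounded up to whole
 * PMD-sized large pages, expressed in PAGE_SIZE pages. */
static unsigned long remap_pages(unsigned long spanned_pages)
{
	unsigned long bytes = spanned_pages * SIZEOF_PAGE + SIZEOF_PGDAT;
	unsigned long large = (bytes + PMD_SIZE - 1) / PMD_SIZE;

	return large * PTRS_PER_PTE;
}

int main(void)
{
	unsigned long node_pages = 262144;	/* a 1GB node of 4kB pages */
	unsigned long reserve = remap_pages(node_pages);

	/* max_low_pfn is lowered by the sum of these reservations */
	printf("1GB node -> reserve %lu pages (%lu MB) of KVA\n",
	       reserve, reserve * PAGE_SIZE >> 20);
	return 0;
}
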
- */ - find_max_pfn(); - get_memcfg_numa(); - - reserve_pages = calculate_numa_remap_pages(); - - /* partially used pages are not usable - thus round upwards */ - system_start_pfn = min_low_pfn = PFN_UP(init_pg_tables_end); - - system_max_low_pfn = max_low_pfn = find_max_low_pfn() - reserve_pages; - printk("reserve_pages = %ld find_max_low_pfn() ~ %ld\n", - reserve_pages, max_low_pfn + reserve_pages); - printk("max_pfn = %ld\n", max_pfn); -#ifdef CONFIG_HIGHMEM - highstart_pfn = highend_pfn = max_pfn; - if (max_pfn > system_max_low_pfn) - highstart_pfn = system_max_low_pfn; - printk(KERN_NOTICE "%ldMB HIGHMEM available.\n", - pages_to_mb(highend_pfn - highstart_pfn)); -#endif - printk(KERN_NOTICE "%ldMB LOWMEM available.\n", - pages_to_mb(system_max_low_pfn)); - printk("min_low_pfn = %ld, max_low_pfn = %ld, highstart_pfn = %ld\n", - min_low_pfn, max_low_pfn, highstart_pfn); - - printk("Low memory ends at vaddr %08lx\n", - (ulong) pfn_to_kaddr(max_low_pfn)); - for_each_online_node(nid) { - node_remap_start_vaddr[nid] = pfn_to_kaddr( - highstart_pfn + node_remap_offset[nid]); - /* Init the node remap allocator */ - node_remap_end_vaddr[nid] = node_remap_start_vaddr[nid] + - (node_remap_size[nid] * PAGE_SIZE); - node_remap_alloc_vaddr[nid] = node_remap_start_vaddr[nid] + - ALIGN(sizeof(pg_data_t), PAGE_SIZE); - - allocate_pgdat(nid); - printk ("node %d will remap to vaddr %08lx - %08lx\n", nid, - (ulong) node_remap_start_vaddr[nid], - (ulong) pfn_to_kaddr(highstart_pfn - + node_remap_offset[nid] + node_remap_size[nid])); - } - printk("High memory starts at vaddr %08lx\n", - (ulong) pfn_to_kaddr(highstart_pfn)); - vmalloc_earlyreserve = reserve_pages * PAGE_SIZE; - for_each_online_node(nid) - find_max_pfn_node(nid); - - memset(NODE_DATA(0), 0, sizeof(struct pglist_data)); - NODE_DATA(0)->bdata = &node0_bdata; - setup_bootmem_allocator(); - return max_low_pfn; -} - -void __init zone_sizes_init(void) -{ - int nid; - - /* - * Insert nodes into pgdat_list backward so they appear in order. - * Clobber node 0's links and NULL out pgdat_list before starting. 
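
The zone_sizes_init() loop being removed above relied on the trick spelled out in its comment: walking node ids backward and inserting each pgdat at the head of pgdat_list leaves the list in ascending order. A standalone sketch of that idea, using a toy node type rather than the kernel's pg_data_t:

#include <stdio.h>

struct toy_pgdat {
	int nid;
	struct toy_pgdat *pgdat_next;
};

#define MAX_NODES 4
static struct toy_pgdat nodes[MAX_NODES];
static struct toy_pgdat *pgdat_list;	/* head of the list, starts NULL */

int main(void)
{
	int nid;

	/* Insert at the head in reverse nid order: the finished list then
	 * reads 0, 1, 2, ... front to back. */
	for (nid = MAX_NODES - 1; nid >= 0; nid--) {
		nodes[nid].nid = nid;
		nodes[nid].pgdat_next = pgdat_list;
		pgdat_list = &nodes[nid];
	}

	for (struct toy_pgdat *p = pgdat_list; p; p = p->pgdat_next)
		printf("node %d\n", p->nid);
	return 0;
}
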
- */ - pgdat_list = NULL; - for (nid = MAX_NUMNODES - 1; nid >= 0; nid--) { - if (!node_online(nid)) - continue; - NODE_DATA(nid)->pgdat_next = pgdat_list; - pgdat_list = NODE_DATA(nid); - } - - for_each_online_node(nid) { - unsigned long zones_size[MAX_NR_ZONES] = {0, 0, 0}; - unsigned long *zholes_size; - unsigned int max_dma; - - unsigned long low = max_low_pfn; - unsigned long start = node_start_pfn[nid]; - unsigned long high = node_end_pfn[nid]; - - max_dma = virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT; - - if (node_has_online_mem(nid)){ - if (start > low) { -#ifdef CONFIG_HIGHMEM - BUG_ON(start > high); - zones_size[ZONE_HIGHMEM] = high - start; -#endif - } else { - if (low < max_dma) - zones_size[ZONE_DMA] = low; - else { - BUG_ON(max_dma > low); - BUG_ON(low > high); - zones_size[ZONE_DMA] = max_dma; - zones_size[ZONE_NORMAL] = low - max_dma; -#ifdef CONFIG_HIGHMEM - zones_size[ZONE_HIGHMEM] = high - low; -#endif - } - } - } - - zholes_size = get_zholes_size(nid); - - free_area_init_node(nid, NODE_DATA(nid), zones_size, start, - zholes_size); - } - return; -} - -void __init set_highmem_pages_init(int bad_ppro) -{ -#ifdef CONFIG_HIGHMEM - struct zone *zone; - struct page *page; - - for_each_zone(zone) { - unsigned long node_pfn, zone_start_pfn, zone_end_pfn; - - if (!is_highmem(zone)) - continue; - - zone_start_pfn = zone->zone_start_pfn; - zone_end_pfn = zone_start_pfn + zone->spanned_pages; - - printk("Initializing %s for node %d (%08lx:%08lx)\n", - zone->name, zone->zone_pgdat->node_id, - zone_start_pfn, zone_end_pfn); - - for (node_pfn = zone_start_pfn; node_pfn < zone_end_pfn; node_pfn++) { - if (!pfn_valid(node_pfn)) - continue; - page = pfn_to_page(node_pfn); - one_highpage_init(page, node_pfn, bad_ppro); - } - } - totalram_pages += totalhigh_pages; -#endif -} --- linux.orig/arch/i386/mm/init.c~FROM-MM-memory-hotplug-i386-addition-functions 2005-09-30 12:37:59.000000000 -0700 +++ linux/arch/i386/mm/init.c 2005-09-30 12:39:28.000000000 -0700 @@ -27,6 +27,7 @@ #include #include #include +#include #include #include @@ -35,6 +36,7 @@ #include #include #include +#include #include #include #include @@ -43,7 +45,6 @@ unsigned int __VMALLOC_RESERVE = 128 << 20; DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); -unsigned long highstart_pfn, highend_pfn; static int noinline do_test_wp_bit(void); @@ -191,39 +192,43 @@ extern int is_available_memory(efi_memory_desc_t *); -int page_is_ram(unsigned long pagenr) +static int page_is_ram_efi(unsigned long pagenr) { +#ifdef CONFIG_EFI int i; unsigned long addr, end; - - if (efi_enabled) { - efi_memory_desc_t *md; + efi_memory_desc_t *md; void *p; - for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { - md = p; - if (!is_available_memory(md)) - continue; - addr = (md->phys_addr+PAGE_SIZE-1) >> PAGE_SHIFT; - end = (md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT)) >> PAGE_SHIFT; - - if ((pagenr >= addr) && (pagenr < end)) - return 1; - } - return 0; + for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { + md = p; + if (!is_available_memory(md)) + continue; + addr = (md->phys_addr+PAGE_SIZE-1) >> PAGE_SHIFT; + end = (md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT)) >> PAGE_SHIFT; + if ((pagenr >= addr) && (pagenr < end)) + return 1; } +#endif /* CONFIG_EFI */ + return 0; +} - for (i = 0; i < e820.nr_map; i++) { +int page_is_ram_e820(unsigned long pagenr, struct e820map *local_e820) +{ + int i; + unsigned long addr, end; - if (e820.map[i].type != E820_RAM) /* not usable memory */ + for (i = 0; i < 
local_e820->nr_map; i++) { + + if (local_e820->map[i].type != E820_RAM) /* not usable memory */ continue; /* * !!!FIXME!!! Some BIOSen report areas as RAM that * are not. Notably the 640->1Mb area. We need a sanity * check here. */ - addr = (e820.map[i].addr+PAGE_SIZE-1) >> PAGE_SHIFT; - end = (e820.map[i].addr+e820.map[i].size) >> PAGE_SHIFT; + addr = (local_e820->map[i].addr+PAGE_SIZE-1) >> PAGE_SHIFT; + end = (local_e820->map[i].addr+local_e820->map[i].size) >> PAGE_SHIFT; if ((pagenr >= addr) && (pagenr < end)) return 1; } @@ -266,28 +271,72 @@ pkmap_page_table = pte; } -void __init one_highpage_init(struct page *page, int pfn, int bad_ppro) +void __devinit free_new_highpage(struct page *page) +{ + set_page_count(page, 1); + __free_page(page); + totalhigh_pages++; +} + +void __init add_one_highpage_init(struct page *page, int pfn, int bad_ppro) { if (page_is_ram(pfn) && !(bad_ppro && page_kills_ppro(pfn))) { ClearPageReserved(page); - set_page_count(page, 1); - __free_page(page); - totalhigh_pages++; + free_new_highpage(page); } else SetPageReserved(page); } -#ifdef CONFIG_NUMA -extern void set_highmem_pages_init(int); -#else -static void __init set_highmem_pages_init(int bad_ppro) +static int add_one_highpage_hotplug(struct page *page, unsigned long pfn) { - int pfn; - for (pfn = highstart_pfn; pfn < highend_pfn; pfn++) - one_highpage_init(pfn_to_page(pfn), pfn, bad_ppro); + free_new_highpage(page); + totalram_pages++; +#ifdef CONFIG_FLATMEM + max_mapnr = max(pfn, max_mapnr); +#endif + num_physpages++; + return 0; +} + +/* + * Not currently handling the NUMA case. + * Assuming single node and all memory that + * has been added dynamically that would be + * onlined here is in HIGHMEM + */ +void online_page(struct page *page) +{ + ClearPageReserved(page); + add_one_highpage_hotplug(page, page_to_pfn(page)); +} + +void __init set_highmem_pages_init(int bad_ppro) +{ + struct zone *zone; + struct page *page; + + for_each_zone(zone) { + unsigned long node_pfn, zone_start_pfn, zone_end_pfn; + + if (!is_highmem(zone)) + continue; + + zone_start_pfn = zone->zone_start_pfn; + zone_end_pfn = zone_start_pfn + zone->spanned_pages; + + printk("Initializing %s for node %d (%08lx:%08lx)\n", + zone->name, zone->zone_pgdat->node_id, + zone_start_pfn, zone_end_pfn); + + for (node_pfn = zone_start_pfn; node_pfn < zone_end_pfn; node_pfn++) { + if (!pfn_valid(node_pfn)) + continue; + page = pfn_to_page(node_pfn); + add_one_highpage_init(page, node_pfn, bad_ppro); + } + } totalram_pages += totalhigh_pages; } -#endif /* CONFIG_FLATMEM */ #else #define kmap_init() do { } while (0) @@ -299,12 +348,6 @@ EXPORT_SYMBOL(__PAGE_KERNEL); unsigned long long __PAGE_KERNEL_EXEC = _PAGE_KERNEL_EXEC; -#ifdef CONFIG_NUMA -extern void __init remap_numa_kva(void); -#else -#define remap_numa_kva() do {} while (0) -#endif - static void __init pagetable_init (void) { unsigned long vaddr; @@ -522,11 +565,6 @@ static void __init set_max_mapnr_init(void) { -#ifdef CONFIG_HIGHMEM - num_physpages = highend_pfn; -#else - num_physpages = max_low_pfn; -#endif #ifdef CONFIG_FLATMEM max_mapnr = num_physpages; #endif @@ -560,11 +598,7 @@ set_max_mapnr_init(); -#ifdef CONFIG_HIGHMEM - high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1; -#else high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1; -#endif /* this will put all low memory onto the freelists */ totalram_pages += free_all_bootmem(); @@ -615,6 +649,28 @@ #endif } +/* + * this is for the non-NUMA, single node SMP system case. 
+ * Specifically, in the case of x86, we will always add + * memory to the highmem for now. + */ +#ifndef CONFIG_NEED_MULTIPLE_NODES +int add_memory(u64 start, u64 size) +{ + struct pglist_data *pgdata = &contig_page_data; + struct zone *zone = pgdata->node_zones + MAX_NR_ZONES-1; + unsigned long start_pfn = start >> PAGE_SHIFT; + unsigned long nr_pages = size >> PAGE_SHIFT; + + return __add_pages(zone, start_pfn, nr_pages); +} + +int remove_memory(u64 start, u64 size) +{ + return -EINVAL; +} +#endif + kmem_cache_t *pgd_cache; kmem_cache_t *pmd_cache; @@ -695,3 +751,10 @@ } } #endif + +int page_is_ram(unsigned long pagenr) +{ + if (efi_enabled) + return page_is_ram_efi(pagenr); + return page_is_ram_e820(pagenr, &e820); +} --- /dev/null 2005-03-30 22:36:15.000000000 -0800 +++ linux/arch/i386/mm/numa.c 2005-09-30 12:38:13.000000000 -0700 @@ -0,0 +1,167 @@ +#include +#include +#include + +#include +#include + +unsigned long node_remap_start_pfn[MAX_NUMNODES]; +unsigned long node_remap_size[MAX_NUMNODES]; +unsigned long node_remap_offset[MAX_NUMNODES]; +void *node_remap_start_vaddr[MAX_NUMNODES]; + +void *node_remap_end_vaddr[MAX_NUMNODES]; +void *node_remap_alloc_vaddr[MAX_NUMNODES]; + +struct pglist_data *node_data[MAX_NUMNODES] __read_mostly; +EXPORT_SYMBOL(node_data); + +/* + * Allocate memory for the pg_data_t for this node via a crude pre-bootmem + * method. For node zero take this from the bottom of memory, for + * subsequent nodes place them at node_remap_start_vaddr which contains + * node local data in physically node local memory. See setup_memory() + * for details. + */ +static bootmem_data_t node0_bdata; +static void __init allocate_pgdat(int nid) +{ + if (nid && node_has_online_mem(nid)) + NODE_DATA(nid) = (pg_data_t *)node_remap_start_vaddr[nid]; + else { + NODE_DATA(nid) = (pg_data_t *)(__va(min_low_pfn << PAGE_SHIFT)); + min_low_pfn += PFN_UP(sizeof(pg_data_t)); + memset(NODE_DATA(0), 0, sizeof(struct pglist_data)); + NODE_DATA(0)->bdata = &node0_bdata; + } +} + +void setup_numa_kva_remap(void) +{ + int nid; + for_each_online_node(nid) { + if (NODE_DATA(nid)) + continue; + node_remap_start_vaddr[nid] = pfn_to_kaddr( + max_low_pfn + node_remap_offset[nid]); + /* Init the node remap allocator */ + node_remap_end_vaddr[nid] = node_remap_start_vaddr[nid] + + (node_remap_size[nid] * PAGE_SIZE); + node_remap_alloc_vaddr[nid] = node_remap_start_vaddr[nid] + + ALIGN(sizeof(pg_data_t), PAGE_SIZE); + + allocate_pgdat(nid); + printk ("node %d will remap to vaddr %08lx - %08lx\n", nid, + (ulong) node_remap_start_vaddr[nid], + (ulong) pfn_to_kaddr(max_low_pfn + + node_remap_offset[nid] + node_remap_size[nid])); + } +} + +void *alloc_remap(int nid, unsigned long size) +{ + void *allocation = node_remap_alloc_vaddr[nid]; + + size = ALIGN(size, L1_CACHE_BYTES); + + if (!allocation || (allocation + size) >= node_remap_end_vaddr[nid]) + return 0; + + node_remap_alloc_vaddr[nid] += size; + memset(allocation, 0, size); + + return allocation; +} + +void __init remap_numa_kva(void) +{ + void *vaddr; + unsigned long pfn; + int node; + + for_each_online_node(node) { + for (pfn=0; pfn < node_remap_size[node]; pfn += PTRS_PER_PTE) { + vaddr = node_remap_start_vaddr[node]+(pfn< max_pfn) + continue; + if (node_end_pfn[nid] > max_pfn) + node_end_pfn[nid] = max_pfn; + + /* ensure the remap includes space for the pgdat. 
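
For the flat-memory hotplug path added here, add_memory() only has to turn a byte range into a pfn range before handing it to __add_pages(), which in this series populates the memory map one sparsemem section at a time. A rough userspace sketch of that arithmetic; the 64MB section size and the sample range are assumed values for the example, not taken from the patch.

#include <stdio.h>
#include <stdint.h>

#define PAGE_SHIFT		12
#define SECTION_SIZE_BITS	26	/* assumed: 64MB sections, illustrative */
#define PFN_SECTION_SHIFT	(SECTION_SIZE_BITS - PAGE_SHIFT)

int main(void)
{
	uint64_t start = 0x40000000ULL;	/* hot-added range: 1GB... */
	uint64_t size  = 0x10000000ULL;	/* ...of length 256MB */

	unsigned long start_pfn = start >> PAGE_SHIFT;
	unsigned long nr_pages  = size >> PAGE_SHIFT;

	/* how many sparsemem sections the request spans */
	unsigned long first = start_pfn >> PFN_SECTION_SHIFT;
	unsigned long last  = (start_pfn + nr_pages - 1) >> PFN_SECTION_SHIFT;

	printf("start_pfn=%lu nr_pages=%lu sections=%lu\n",
	       start_pfn, nr_pages, last - first + 1);
	return 0;
}
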
*/ + size = node_remap_size[nid] + sizeof(pg_data_t); + + /* convert size to large (pmd size) pages, rounding up */ + size = (size + PMD_SIZE - 1) / PMD_SIZE; + /* now the roundup is correct, convert to PAGE_SIZE pages */ + size = size * PTRS_PER_PTE; + + /* + * Validate the region we are allocating only contains valid + * pages. + */ + for (pfn = node_end_pfn[nid] - size; + pfn < node_end_pfn[nid]; pfn++) + if (!page_is_ram(pfn)) + break; + + if (pfn != node_end_pfn[nid]) + size = 0; + + printk("Reserving %ld pages of KVA for lmem_map of node %d\n", + size, nid); + node_remap_size[nid] = size; + node_remap_offset[nid] = reserve_pages; + reserve_pages += size; + printk("Shrinking node %d from %ld pages to %ld pages\n", + nid, node_end_pfn[nid], node_end_pfn[nid] - size); + + if (node_end_pfn[nid] & (PTRS_PER_PTE-1)) { + /* + * Align node_end_pfn[] and node_remap_start_pfn[] to + * pmd boundary. remap_numa_kva will barf otherwise. + */ + printk("Shrinking node %d further by %ld pages for proper alignment\n", + nid, node_end_pfn[nid] & (PTRS_PER_PTE-1)); + size += node_end_pfn[nid] & (PTRS_PER_PTE-1); + } + + node_end_pfn[nid] -= size; + node_remap_start_pfn[nid] = node_end_pfn[nid]; + } + printk("Reserving total of %ld pages for numa KVA remap\n", + reserve_pages); + return reserve_pages; +} + +/* Find the owning node for a pfn. */ +int early_pfn_to_nid(unsigned long pfn) +{ + int nid; + + for_each_node(nid) { + if (node_end_pfn[nid] == 0) + break; + if (node_start_pfn[nid] <= pfn && node_end_pfn[nid] >= pfn) + return nid; + } + + return 0; +} --- linux.orig/arch/i386/mm/pgtable.c~FROM-MM-memory-hotplug-locking-node_size_lock 2005-09-30 12:37:54.000000000 -0700 +++ linux/arch/i386/mm/pgtable.c 2005-09-30 12:37:54.000000000 -0700 @@ -31,11 +31,13 @@ pg_data_t *pgdat; unsigned long i; struct page_state ps; + unsigned long flags; printk(KERN_INFO "Mem-info:\n"); show_free_areas(); printk(KERN_INFO "Free swap: %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10)); for_each_pgdat(pgdat) { + pgdat_resize_lock(pgdat, &flags); for (i = 0; i < pgdat->node_spanned_pages; ++i) { page = pgdat_page_nr(pgdat, i); total++; @@ -48,6 +50,7 @@ else if (page_count(page)) shared += page_count(page) - 1; } + pgdat_resize_unlock(pgdat, &flags); } printk(KERN_INFO "%d pages of RAM\n", total); printk(KERN_INFO "%d pages of HIGHMEM\n", highmem); --- linux.orig/arch/ia64/Kconfig~F3-create-__boot-ia64 2005-09-30 12:38:24.000000000 -0700 +++ linux/arch/ia64/Kconfig 2005-09-30 12:38:24.000000000 -0700 @@ -298,6 +298,9 @@ source "mm/Kconfig" +config ARCH_HAS_BOOTPA + def_bool y + config IA32_SUPPORT bool "Support for Linux/x86 binaries" help --- linux.orig/arch/ia64/mm/discontig.c~FROM-MM-memory-hotplug-locking-node_size_lock 2005-09-30 12:37:54.000000000 -0700 +++ linux/arch/ia64/mm/discontig.c 2005-09-30 12:38:14.000000000 -0700 @@ -376,30 +376,6 @@ return ptr; } -/** - * pgdat_insert - insert the pgdat into global pgdat_list - * @pgdat: the pgdat for a node. 
- */ -static void __init pgdat_insert(pg_data_t *pgdat) -{ - pg_data_t *prev = NULL, *next; - - for_each_pgdat(next) - if (pgdat->node_id < next->node_id) - break; - else - prev = next; - - if (prev) { - prev->pgdat_next = pgdat; - pgdat->pgdat_next = next; - } else { - pgdat->pgdat_next = pgdat_list; - pgdat_list = pgdat; - } - - return; -} /** * memory_less_nodes - allocate and initialize CPU only nodes pernode @@ -524,9 +500,13 @@ show_free_areas(); printk("Free swap: %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10)); for_each_pgdat(pgdat) { - unsigned long present = pgdat->node_present_pages; + unsigned long present; + unsigned long flags; int shared = 0, cached = 0, reserved = 0; + printk("Node ID: %d\n", pgdat->node_id); + pgdat_resize_lock(pgdat, &flags); + present = pgdat->node_present_pages; for(i = 0; i < pgdat->node_spanned_pages; i++) { struct page *page = pgdat_page_nr(pgdat, i); if (!ia64_pfn_valid(pgdat->node_start_pfn+i)) @@ -538,6 +518,7 @@ else if (page_count(page)) shared += page_count(page)-1; } + pgdat_resize_unlock(pgdat, &flags); total_present += present; total_reserved += reserved; total_cached += cached; @@ -695,11 +676,5 @@ pfn_offset, zholes_size); } - /* - * Make memory less nodes become a member of the known nodes. - */ - for_each_node_mask(node, memory_less_mask) - pgdat_insert(mem_data[node].pgdat); - zero_page_memmap_ptr = virt_to_page(ia64_imva(empty_zero_page)); } --- linux.orig/arch/m32r/mm/discontig.c~B3.4-remove-pgdat_list-ver2-m32r 2005-09-30 12:38:16.000000000 -0700 +++ linux/arch/m32r/mm/discontig.c 2005-09-30 12:38:16.000000000 -0700 @@ -137,12 +137,6 @@ int nid, i; mem_prof_t *mp; - pgdat_list = NULL; - for (nid = num_online_nodes() - 1 ; nid >= 0 ; nid--) { - NODE_DATA(nid)->pgdat_next = pgdat_list; - pgdat_list = NODE_DATA(nid); - } - for_each_online_node(nid) { mp = &mem_prof[nid]; for (i = 0 ; i < MAX_NR_ZONES ; i++) { --- linux.orig/arch/m32r/mm/init.c~FROM-MM-memory-hotplug-locking-node_size_lock 2005-09-30 12:37:54.000000000 -0700 +++ linux/arch/m32r/mm/init.c 2005-09-30 12:37:54.000000000 -0700 @@ -48,6 +48,8 @@ show_free_areas(); printk("Free swap: %6ldkB\n",nr_swap_pages<<(PAGE_SHIFT-10)); for_each_pgdat(pgdat) { + unsigned long flags; + pgdat_resize_lock(pgdat, &flags); for (i = 0; i < pgdat->node_spanned_pages; ++i) { page = pgdat_page_nr(pgdat, i); total++; @@ -60,6 +62,7 @@ else if (page_count(page)) shared += page_count(page) - 1; } + pgdat_resize_unlock(pgdat, &flags); } printk("%d pages of RAM\n", total); printk("%d pages of HIGHMEM\n",highmem); @@ -150,10 +153,14 @@ int reservedpages, nid, i; reservedpages = 0; - for_each_online_node(nid) + for_each_online_node(nid) { + unsigned long flags; + pgdat_resize_lock(NODE_DATA(nid), &flags); for (i = 0 ; i < MAX_LOW_PFN(nid) - START_PFN(nid) ; i++) if (PageReserved(nid_page_nr(nid, i))) reservedpages++; + pgdat_resize_unlock(NODE_DATA(nid), &flags); + } return reservedpages; } --- linux.orig/arch/parisc/mm/init.c~FROM-MM-memory-hotplug-locking-node_size_lock 2005-09-30 12:37:54.000000000 -0700 +++ linux/arch/parisc/mm/init.c 2005-09-30 12:37:54.000000000 -0700 @@ -505,7 +505,9 @@ for (j = node_start_pfn(i); j < node_end_pfn(i); j++) { struct page *p; + unsigned long flags; + pgdat_resize_lock(NODE_DATA(i), &flags); p = nid_page_nr(i, j) - node_start_pfn(i); total++; @@ -517,6 +519,7 @@ free++; else shared += page_count(p) - 1; + pgdat_resize_unlock(NODE_DATA(i), &flags); } } #endif --- linux.orig/arch/ppc64/Kconfig~A4.3-antonb-ppc64-convert_to_sparsemem 2005-09-30 12:38:07.000000000 -0700 +++ 
linux/arch/ppc64/Kconfig 2005-09-30 12:38:22.000000000 -0700 @@ -227,6 +227,10 @@ depends on SMP default "32" +config ARCH_HAS_BOOTPA + bool + default y + config HMT bool "Hardware multithreading" depends on SMP && PPC_PSERIES && BROKEN @@ -238,23 +242,14 @@ def_bool y config ARCH_FLATMEM_ENABLE - def_bool y - depends on !NUMA - -config ARCH_DISCONTIGMEM_ENABLE - def_bool y - depends on SMP && PPC_PSERIES - -config ARCH_DISCONTIGMEM_DEFAULT def_bool y - depends on ARCH_DISCONTIGMEM_ENABLE -config ARCH_FLATMEM_ENABLE +config ARCH_SPARSEMEM_ENABLE def_bool y -config ARCH_SPARSEMEM_ENABLE +config ARCH_SPARSEMEM_DEFAULT def_bool y - depends on ARCH_DISCONTIGMEM_ENABLE + depends on PPC_PSERIES source "mm/Kconfig" @@ -276,7 +271,8 @@ config NUMA bool "NUMA support" - default y if DISCONTIGMEM || SPARSEMEM + depends on SPARSEMEM + default y if SPARSEMEM config SCHED_SMT bool "SMT (Hyperthreading) scheduler support" --- linux.orig/arch/ppc64/kernel/pSeries_setup.c~no-found-boot_cpuid 2005-09-30 12:37:50.000000000 -0700 +++ linux/arch/ppc64/kernel/pSeries_setup.c 2005-09-30 12:37:50.000000000 -0700 @@ -56,6 +56,7 @@ #include #include #include +#include #include #include #include --- linux.orig/arch/ppc64/kernel/prom_init.c~G0-ppc64-__boot-fixes 2005-09-30 12:38:26.000000000 -0700 +++ linux/arch/ppc64/kernel/prom_init.c 2005-09-30 12:38:27.000000000 -0700 @@ -1141,11 +1141,11 @@ extern unsigned long __secondary_hold_spinloop; extern unsigned long __secondary_hold_acknowledge; unsigned long *spinloop - = (void *)virt_to_abs(&__secondary_hold_spinloop); + = (void *)boot_virt_to_abs((unsigned long)&__secondary_hold_spinloop); unsigned long *acknowledge - = (void *)virt_to_abs(&__secondary_hold_acknowledge); + = (void *)boot_virt_to_abs((unsigned long)&__secondary_hold_acknowledge); unsigned long secondary_hold - = virt_to_abs(*PTRRELOC((unsigned long *)__secondary_hold)); + = boot_virt_to_abs(*PTRRELOC((unsigned long *)__secondary_hold)); struct prom_t *_prom = PTRRELOC(&prom); prom_debug("prom_hold_cpus: start...\n"); @@ -1871,7 +1871,7 @@ if ( r3 && r4 && r4 != 0xdeadbeef) { u64 val; - RELOC(prom_initrd_start) = (r3 >= KERNELBASE) ? __pa(r3) : r3; + RELOC(prom_initrd_start) = (r3 >= KERNELBASE) ? 
__boot_pa(r3) : r3; RELOC(prom_initrd_end) = RELOC(prom_initrd_start) + r4; val = (u64)RELOC(prom_initrd_start); --- linux.orig/arch/ppc64/kernel/rtas.c~G0-ppc64-__boot-fixes 2005-09-30 12:38:26.000000000 -0700 +++ linux/arch/ppc64/kernel/rtas.c 2005-09-30 12:38:26.000000000 -0700 @@ -36,6 +36,7 @@ struct rtas_t rtas = { .lock = SPIN_LOCK_UNLOCKED }; +static unsigned long rtas_args_paddr; EXPORT_SYMBOL(rtas); @@ -309,8 +310,7 @@ for (i = 0; i < nret; ++i) rtas_args->rets[i] = 0; - PPCDBG(PPCDBG_RTAS, "\tentering rtas with 0x%lx\n", - __pa(rtas_args)); + PPCDBG(PPCDBG_RTAS, "\tentering rtas with 0x%lx\n", rtas_args_paddr); enter_rtas(__pa(rtas_args)); PPCDBG(PPCDBG_RTAS, "\treturned from rtas ...\n"); @@ -758,6 +758,8 @@ #endif /* CONFIG_HOTPLUG_CPU */ } + /* Get and save off phys address of rtas structure argunemt field */ + rtas_args_paddr = __boot_pa(&rtas.args); } --- linux.orig/arch/ppc64/kernel/setup.c~G0-ppc64-__boot-fixes 2005-09-30 12:38:26.000000000 -0700 +++ linux/arch/ppc64/kernel/setup.c 2005-09-30 12:38:26.000000000 -0700 @@ -376,7 +376,7 @@ * tree, like retreiving the physical memory map or * calculating/retreiving the hash table size */ - early_init_devtree(__va(dt_ptr)); + early_init_devtree(__boot_va(dt_ptr)); /* * Iterate all ppc_md structures until we find the proper @@ -505,11 +505,11 @@ prop = (u64 *)get_property(of_chosen, "linux,initrd-start", NULL); if (prop != NULL) { - initrd_start = (unsigned long)__va(*prop); + initrd_start = (unsigned long)__boot_va(*prop); prop = (u64 *)get_property(of_chosen, "linux,initrd-end", NULL); if (prop != NULL) { - initrd_end = (unsigned long)__va(*prop); + initrd_end = (unsigned long)__boot_va(*prop); initrd_below_start_ok = 1; } else initrd_start = 0; @@ -940,9 +940,9 @@ * SLB misses on them. */ for_each_cpu(i) { - softirq_ctx[i] = (struct thread_info *)__va(lmb_alloc_base(THREAD_SIZE, + softirq_ctx[i] = (struct thread_info *)__boot_va(lmb_alloc_base(THREAD_SIZE, THREAD_SIZE, 0x10000000)); - hardirq_ctx[i] = (struct thread_info *)__va(lmb_alloc_base(THREAD_SIZE, + hardirq_ctx[i] = (struct thread_info *)__boot_va(lmb_alloc_base(THREAD_SIZE, THREAD_SIZE, 0x10000000)); } } @@ -971,7 +971,7 @@ limit = min(0x10000000UL, lmb.rmo_size); for_each_cpu(i) - paca[i].emergency_sp = __va(lmb_alloc_base(PAGE_SIZE, 128, + paca[i].emergency_sp = __boot_va(lmb_alloc_base(PAGE_SIZE, 128, limit)) + PAGE_SIZE; } --- linux.orig/arch/ppc64/kernel/time.c~no-found-boot_cpuid 2005-09-30 12:37:50.000000000 -0700 +++ linux/arch/ppc64/kernel/time.c 2005-09-30 12:37:50.000000000 -0700 @@ -65,6 +65,7 @@ #include #include #include +#include #include #include --- linux.orig/arch/ppc64/mm/hash_utils.c~G0-ppc64-__boot-fixes 2005-09-30 12:38:26.000000000 -0700 +++ linux/arch/ppc64/mm/hash_utils.c 2005-09-30 12:38:28.000000000 -0700 @@ -132,12 +132,12 @@ #ifdef CONFIG_PPC_PSERIES if (systemcfg->platform & PLATFORM_LPAR) ret = pSeries_lpar_hpte_insert(hpteg, va, - virt_to_abs(addr) >> PAGE_SHIFT, + boot_virt_to_abs(addr) >> PAGE_SHIFT, vflags, tmp_mode); else #endif /* CONFIG_PPC_PSERIES */ ret = native_hpte_insert(hpteg, va, - virt_to_abs(addr) >> PAGE_SHIFT, + boot_virt_to_abs(addr) >> PAGE_SHIFT, vflags, tmp_mode); if (ret == -1) { @@ -147,6 +147,13 @@ } } +void create_lmb_mapping(unsigned long start, unsigned long end) +{ + create_pte_mapping(start, end, + _PAGE_ACCESSED | _PAGE_COHERENT | PP_RWXX, + cur_cpu_spec->cpu_features & CPU_FTR_16M_PAGE ? 
1 : 0); +} + void __init htab_initialize(void) { unsigned long table, htab_size_bytes; --- linux.orig/arch/ppc64/mm/init.c~FROM-MM-memory-hotplug-locking-node_size_lock 2005-09-30 12:37:54.000000000 -0700 +++ linux/arch/ppc64/mm/init.c 2005-09-30 12:38:28.000000000 -0700 @@ -104,6 +104,8 @@ show_free_areas(); printk("Free swap: %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10)); for_each_pgdat(pgdat) { + unsigned long flags; + pgdat_resize_lock(pgdat, &flags); for (i = 0; i < pgdat->node_spanned_pages; i++) { page = pgdat_page_nr(pgdat, i); total++; @@ -114,6 +116,7 @@ else if (page_count(page)) shared += page_count(page) - 1; } + pgdat_resize_unlock(pgdat, &flags); } printk("%ld pages of RAM\n", total); printk("%ld reserved pages\n", reserved); @@ -633,7 +636,8 @@ unsigned long reservedpages = 0, codesize, initsize, datasize, bsssize; num_physpages = max_low_pfn; /* RAM is assumed contiguous */ - high_memory = (void *) __va(max_low_pfn * PAGE_SIZE); + /* The strange -1 +1 is to avoid calling __va on an invalid address */ + high_memory = (void *) (__va(max_low_pfn * PAGE_SIZE - 1) + 1); #ifdef CONFIG_NEED_MULTIPLE_NODES for_each_online_node(nid) { @@ -649,11 +653,14 @@ #endif for_each_pgdat(pgdat) { + unsigned long flags; + pgdat_resize_lock(pgdat, &flags); for (i = 0; i < pgdat->node_spanned_pages; i++) { page = pgdat_page_nr(pgdat, i); if (PageReserved(page)) reservedpages++; } + pgdat_resize_unlock(pgdat, &flags); } codesize = (unsigned long)&_etext - (unsigned long)&_stext; @@ -868,3 +875,83 @@ return vma_prot; } EXPORT_SYMBOL(phys_mem_access_prot); + +#ifdef CONFIG_MEMORY_HOTPLUG + +void online_page(struct page *page) +{ + ClearPageReserved(page); + free_cold_page(page); + totalram_pages++; + num_physpages++; +} + +/* + * This works only for the non-NUMA case. Later, we'll need a lookup + * to convert from real physical addresses to nid, that doesn't use + * pfn_to_nid(). + */ +int __devinit add_memory(u64 start, u64 size) +{ + struct pglist_data *pgdata = NODE_DATA(0); + struct zone *zone; + unsigned long start_pfn = start >> PAGE_SHIFT; + unsigned long nr_pages = size >> PAGE_SHIFT; + + start += KERNELBASE; + create_lmb_mapping(start, start + size); + + /* this should work for most non-highmem platforms */ + zone = pgdata->node_zones; + + return __add_pages(zone, start_pfn, nr_pages); + + return 0; +} + +/* + * First pass at this code will check to determine if the remove + * request is within the RMO. Do not allow removal within the RMO. 
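
The ppc64 remove_memory() below refuses any request that touches the RMO or that crosses out of a single zone. A simplified, compilable sketch of those two checks; RMO_SIZE and ZONE_END are made-up stand-ins for lmb.rmo_size and the span of the page's zone, and the patch's exact conditionals differ slightly.

#include <stdio.h>
#include <stdint.h>

#define RMO_SIZE	0x08000000ULL	/* assumed 128MB RMO */
#define ZONE_END	0x100000000ULL	/* assumed zone ends at 4GB */

static int removal_allowed(uint64_t start, uint64_t size)
{
	uint64_t end = start + size;

	if (start < RMO_SIZE)		/* any overlap with the RMO is refused */
		return 0;
	if (end > ZONE_END)		/* crossing out of the zone is refused */
		return 0;
	return 1;
}

int main(void)
{
	printf("remove 256MB at 64MB:  %s\n",
	       removal_allowed(0x04000000ULL, 0x10000000ULL) ? "ok" : "refused");
	printf("remove 256MB at 512MB: %s\n",
	       removal_allowed(0x20000000ULL, 0x10000000ULL) ? "ok" : "refused");
	return 0;
}
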
+ */ +int __devinit remove_memory(u64 start, u64 size) +{ + struct zone *zone; + unsigned long start_pfn, end_pfn, nr_pages; + + start_pfn = start >> PAGE_SHIFT; + nr_pages = size >> PAGE_SHIFT; + end_pfn = start_pfn + nr_pages; + + printk("%s(): Attempting to remove memoy in range " + "%lx to %lx\n", __func__, start, start+size); + /* + * check for range within RMO + */ + zone = page_zone(pfn_to_page(start_pfn)); + + printk("%s(): memory will be removed from " + "the %s zone\n", __func__, zone->name); + + /* + * not handling removing memory ranges that + * overlap multiple zones yet + */ + if (end_pfn > (zone->zone_start_pfn + zone->spanned_pages)) + goto overlap; + + /* make sure it is NOT in RMO */ + if ((start < lmb.rmo_size) || ((start+size) < lmb.rmo_size)) { + printk("%s(): range to be removed must NOT be in RMO!\n", + __func__); + goto in_rmo; + } + + return __remove_pages(zone, start_pfn, nr_pages); + +overlap: + printk("%s(): memory range to be removed overlaps " + "multiple zones!!!\n", __func__); +in_rmo: + return -1; +} +#endif /* CONFIG_MEMORY_HOTPLUG */ --- linux.orig/arch/ppc64/mm/numa.c~A4.2-antonb-ppc64-_use_generic_nr_cpus_node 2005-09-30 12:38:06.000000000 -0700 +++ linux/arch/ppc64/mm/numa.c 2005-09-30 12:38:07.000000000 -0700 @@ -17,54 +17,121 @@ #include #include #include +#include #include -#include -#include static int numa_enabled = 1; static int numa_debug; #define dbg(args...) if (numa_debug) { printk(KERN_INFO args); } -#ifdef DEBUG_NUMA -#define ARRAY_INITIALISER -1 -#else -#define ARRAY_INITIALISER 0 -#endif - -int numa_cpu_lookup_table[NR_CPUS] = { [ 0 ... (NR_CPUS - 1)] = - ARRAY_INITIALISER}; -char *numa_memory_lookup_table; +int numa_cpu_lookup_table[NR_CPUS]; cpumask_t numa_cpumask_lookup_table[MAX_NUMNODES]; -int nr_cpus_in_node[MAX_NUMNODES] = { [0 ... (MAX_NUMNODES -1)] = 0}; - struct pglist_data *node_data[MAX_NUMNODES]; -bootmem_data_t __initdata plat_node_bdata[MAX_NUMNODES]; + +EXPORT_SYMBOL(numa_cpu_lookup_table); +EXPORT_SYMBOL(numa_cpumask_lookup_table); +EXPORT_SYMBOL(node_data); + +static bootmem_data_t __initdata plat_node_bdata[MAX_NUMNODES]; static int min_common_depth; /* - * We need somewhere to store start/span for each node until we have + * We need somewhere to store start/end/node for each region until we have * allocated the real node_data structures. 
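
The init_node_data[] table introduced here replaces the old per-node start/end/present fields with a flat list of regions that get merged when they touch. A compilable userspace model of add_region() and its zero-end_pfn terminator; MAX_REGIONS is shrunk for the demo.

#include <stdio.h>

#define MAX_REGIONS 8

static struct {
	unsigned long start_pfn;
	unsigned long end_pfn;	/* 0 terminates the table */
	int nid;
} regions[MAX_REGIONS];

/* Extend an existing entry for the same node when the new range touches
 * it, otherwise append a new entry (keeping one NULL terminator). */
static void add_region(int nid, unsigned long start_pfn, unsigned long pages)
{
	int i;

	for (i = 0; regions[i].end_pfn; i++) {
		if (regions[i].nid != nid)
			continue;
		if (regions[i].end_pfn == start_pfn) {
			regions[i].end_pfn += pages;
			return;
		}
		if (regions[i].start_pfn == start_pfn + pages) {
			regions[i].start_pfn -= pages;
			return;
		}
	}
	if (i >= MAX_REGIONS - 1)
		return;
	regions[i].start_pfn = start_pfn;
	regions[i].end_pfn = start_pfn + pages;
	regions[i].nid = nid;
}

int main(void)
{
	add_region(0, 0x0000, 0x1000);
	add_region(0, 0x1000, 0x1000);	/* adjacent: merges with the first */
	add_region(1, 0x4000, 0x2000);	/* different node: new entry */

	for (int i = 0; regions[i].end_pfn; i++)
		printf("nid %d: 0x%lx-0x%lx\n", regions[i].nid,
		       regions[i].start_pfn, regions[i].end_pfn);
	return 0;
}
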
*/ +#define MAX_REGIONS (MAX_LMB_REGIONS*2) static struct { - unsigned long node_start_pfn; - unsigned long node_end_pfn; - unsigned long node_present_pages; -} init_node_data[MAX_NUMNODES] __initdata; + unsigned long start_pfn; + unsigned long end_pfn; + int nid; +} init_node_data[MAX_REGIONS] __initdata; -EXPORT_SYMBOL(node_data); -EXPORT_SYMBOL(numa_cpu_lookup_table); -EXPORT_SYMBOL(numa_memory_lookup_table); -EXPORT_SYMBOL(numa_cpumask_lookup_table); -EXPORT_SYMBOL(nr_cpus_in_node); +int __init early_pfn_to_nid(unsigned long pfn) +{ + unsigned int i; + + for (i = 0; init_node_data[i].end_pfn; i++) { + unsigned long start_pfn = init_node_data[i].start_pfn; + unsigned long end_pfn = init_node_data[i].end_pfn; + + if ((start_pfn <= pfn) && (pfn < end_pfn)) + return init_node_data[i].nid; + } + + return -1; +} + +void __init add_region(unsigned int nid, unsigned long start_pfn, + unsigned long pages) +{ + unsigned int i; + + dbg("add_region nid %d start_pfn 0x%lx pages 0x%lx\n", + nid, start_pfn, pages); + + for (i = 0; init_node_data[i].end_pfn; i++) { + if (init_node_data[i].nid != nid) + continue; + if (init_node_data[i].end_pfn == start_pfn) { + init_node_data[i].end_pfn += pages; + return; + } + if (init_node_data[i].start_pfn == (start_pfn + pages)) { + init_node_data[i].start_pfn -= pages; + return; + } + } + + /* + * Leave last entry NULL so we dont iterate off the end (we use + * entry.end_pfn to terminate the walk). + */ + if (i >= (MAX_REGIONS - 1)) { + printk(KERN_ERR "WARNING: too many memory regions in " + "numa code, truncating\n"); + return; + } + + init_node_data[i].start_pfn = start_pfn; + init_node_data[i].end_pfn = start_pfn + pages; + init_node_data[i].nid = nid; +} + +/* We assume init_node_data has no overlapping regions */ +void __init get_region(unsigned int nid, unsigned long *start_pfn, + unsigned long *end_pfn, unsigned long *pages_present) +{ + unsigned int i; + + *start_pfn = -1UL; + *end_pfn = *pages_present = 0; + + for (i = 0; init_node_data[i].end_pfn; i++) { + if (init_node_data[i].nid != nid) + continue; + + *pages_present += init_node_data[i].end_pfn - + init_node_data[i].start_pfn; + + if (init_node_data[i].start_pfn < *start_pfn) + *start_pfn = init_node_data[i].start_pfn; + + if (init_node_data[i].end_pfn > *end_pfn) + *end_pfn = init_node_data[i].end_pfn; + } + + /* We didnt find a matching region, return start/end as 0 */ + if (*start_pfn == -1UL) + start_pfn = 0; +} static inline void map_cpu_to_node(int cpu, int node) { numa_cpu_lookup_table[cpu] = node; - if (!(cpu_isset(cpu, numa_cpumask_lookup_table[node]))) { + + if (!(cpu_isset(cpu, numa_cpumask_lookup_table[node]))) cpu_set(cpu, numa_cpumask_lookup_table[node]); - nr_cpus_in_node[node]++; - } } #ifdef CONFIG_HOTPLUG_CPU @@ -76,7 +143,6 @@ if (cpu_isset(cpu, numa_cpumask_lookup_table[node])) { cpu_clear(cpu, numa_cpumask_lookup_table[node]); - nr_cpus_in_node[node]--; } else { printk(KERN_ERR "WARNING: cpu %lu not found in node %d\n", cpu, node); @@ -84,7 +150,7 @@ } #endif /* CONFIG_HOTPLUG_CPU */ -static struct device_node * __devinit find_cpu_node(unsigned int cpu) +static struct device_node *find_cpu_node(unsigned int cpu) { unsigned int hw_cpuid = get_hard_smp_processor_id(cpu); struct device_node *cpu_node = NULL; @@ -211,7 +277,7 @@ return rc; } -static unsigned long read_n_cells(int n, unsigned int **buf) +static unsigned long __init read_n_cells(int n, unsigned int **buf) { unsigned long result = 0; @@ -293,7 +359,8 @@ * or zero. 
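
read_n_cells(), used when parsing the memory nodes further down, folds #address-cells or #size-cells consecutive 32-bit cells of a device-tree "reg" property into a single value. A self-contained sketch with a made-up two-cell example; the sample property values are arbitrary.

#include <stdio.h>
#include <stdint.h>

static uint64_t read_n_cells(int n, const uint32_t **buf)
{
	uint64_t result = 0;

	while (n--) {
		result = (result << 32) | **buf;
		(*buf)++;
	}
	return result;
}

int main(void)
{
	/* #address-cells = 2, #size-cells = 2: one 1GB range at 4GB */
	const uint32_t reg[] = { 0x1, 0x00000000, 0x0, 0x40000000 };
	const uint32_t *p = reg;

	uint64_t start = read_n_cells(2, &p);
	uint64_t size  = read_n_cells(2, &p);

	printf("start=0x%llx size=0x%llx\n",
	       (unsigned long long)start, (unsigned long long)size);
	return 0;
}
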
If the returned value of size is 0 the region should be * discarded as it lies wholy above the memory limit. */ -static unsigned long __init numa_enforce_memory_limit(unsigned long start, unsigned long size) +static unsigned long __init numa_enforce_memory_limit(unsigned long start, + unsigned long size) { /* * We use lmb_end_of_DRAM() in here instead of memory_limit because @@ -319,8 +386,7 @@ struct device_node *cpu = NULL; struct device_node *memory = NULL; int addr_cells, size_cells; - int max_domain = 0; - long entries = lmb_end_of_DRAM() >> MEMORY_INCREMENT_SHIFT; + int max_domain; unsigned long i; if (numa_enabled == 0) { @@ -328,13 +394,6 @@ return -1; } - numa_memory_lookup_table = - (char *)abs_to_virt(lmb_alloc(entries * sizeof(char), 1)); - memset(numa_memory_lookup_table, 0, entries * sizeof(char)); - - for (i = 0; i < entries ; i++) - numa_memory_lookup_table[i] = ARRAY_INITIALISER; - min_common_depth = find_min_common_depth(); dbg("NUMA associativity depth for CPU/Memory: %d\n", min_common_depth); @@ -386,9 +445,6 @@ start = read_n_cells(addr_cells, &memcell_buf); size = read_n_cells(size_cells, &memcell_buf); - start = _ALIGN_DOWN(start, MEMORY_INCREMENT); - size = _ALIGN_UP(size, MEMORY_INCREMENT); - numa_domain = of_node_numa_domain(memory); if (numa_domain >= MAX_NUMNODES) { @@ -402,44 +458,15 @@ if (max_domain < numa_domain) max_domain = numa_domain; - if (! (size = numa_enforce_memory_limit(start, size))) { + if (!(size = numa_enforce_memory_limit(start, size))) { if (--ranges) goto new_range; else continue; } - /* - * Initialize new node struct, or add to an existing one. - */ - if (init_node_data[numa_domain].node_end_pfn) { - if ((start / PAGE_SIZE) < - init_node_data[numa_domain].node_start_pfn) - init_node_data[numa_domain].node_start_pfn = - start / PAGE_SIZE; - if (((start / PAGE_SIZE) + (size / PAGE_SIZE)) > - init_node_data[numa_domain].node_end_pfn) - init_node_data[numa_domain].node_end_pfn = - (start / PAGE_SIZE) + - (size / PAGE_SIZE); - - init_node_data[numa_domain].node_present_pages += - size / PAGE_SIZE; - } else { - node_set_online(numa_domain); - - init_node_data[numa_domain].node_start_pfn = - start / PAGE_SIZE; - init_node_data[numa_domain].node_end_pfn = - init_node_data[numa_domain].node_start_pfn + - size / PAGE_SIZE; - init_node_data[numa_domain].node_present_pages = - size / PAGE_SIZE; - } - - for (i = start ; i < (start+size); i += MEMORY_INCREMENT) - numa_memory_lookup_table[i >> MEMORY_INCREMENT_SHIFT] = - numa_domain; + add_region(numa_domain, start >> PAGE_SHIFT, + size >> PAGE_SHIFT); if (--ranges) goto new_range; @@ -455,32 +482,15 @@ { unsigned long top_of_ram = lmb_end_of_DRAM(); unsigned long total_ram = lmb_phys_mem_size(); - unsigned long i; printk(KERN_INFO "Top of RAM: 0x%lx, Total RAM: 0x%lx\n", top_of_ram, total_ram); printk(KERN_INFO "Memory hole size: %ldMB\n", (top_of_ram - total_ram) >> 20); - if (!numa_memory_lookup_table) { - long entries = top_of_ram >> MEMORY_INCREMENT_SHIFT; - numa_memory_lookup_table = - (char *)abs_to_virt(lmb_alloc(entries * sizeof(char), 1)); - memset(numa_memory_lookup_table, 0, entries * sizeof(char)); - for (i = 0; i < entries ; i++) - numa_memory_lookup_table[i] = ARRAY_INITIALISER; - } - map_cpu_to_node(boot_cpuid, 0); - + add_region(0, 0, lmb_end_of_DRAM() >> PAGE_SHIFT); node_set_online(0); - - init_node_data[0].node_start_pfn = 0; - init_node_data[0].node_end_pfn = lmb_end_of_DRAM() / PAGE_SIZE; - init_node_data[0].node_present_pages = total_ram / PAGE_SIZE; - - for (i = 0 ; i < top_of_ram; 
i += MEMORY_INCREMENT) - numa_memory_lookup_table[i >> MEMORY_INCREMENT_SHIFT] = 0; } static void __init dump_numa_topology(void) @@ -498,8 +508,9 @@ count = 0; - for (i = 0; i < lmb_end_of_DRAM(); i += MEMORY_INCREMENT) { - if (numa_memory_lookup_table[i >> MEMORY_INCREMENT_SHIFT] == node) { + for (i = 0; i < lmb_end_of_DRAM(); + i += (1 << SECTION_SIZE_BITS)) { + if (early_pfn_to_nid(i >> PAGE_SHIFT) == node) { if (count == 0) printk(" 0x%lx", i); ++count; @@ -524,10 +535,12 @@ * * Returns the physical address of the memory. */ -static unsigned long careful_allocation(int nid, unsigned long size, - unsigned long align, unsigned long end) +static void __init *careful_allocation(int nid, unsigned long size, + unsigned long align, + unsigned long end_pfn) { - unsigned long ret = lmb_alloc_base(size, align, end); + int new_nid; + unsigned long ret = lmb_alloc_base(size, align, end_pfn << PAGE_SHIFT); /* retry over all memory */ if (!ret) @@ -541,28 +554,27 @@ * If the memory came from a previously allocated node, we must * retry with the bootmem allocator. */ - if (pa_to_nid(ret) < nid) { - nid = pa_to_nid(ret); - ret = (unsigned long)__alloc_bootmem_node(NODE_DATA(nid), + new_nid = early_pfn_to_nid(ret >> PAGE_SHIFT); + if (new_nid < nid) { + ret = (unsigned long)__alloc_bootmem_node(NODE_DATA(new_nid), size, align, 0); if (!ret) panic("numa.c: cannot allocate %lu bytes on node %d", - size, nid); + size, new_nid); - ret = virt_to_abs(ret); + ret = __pa(ret); dbg("alloc_bootmem %lx %lx\n", ret, size); } - return ret; + return (void *)ret; } void __init do_init_bootmem(void) { int nid; - int addr_cells, size_cells; - struct device_node *memory = NULL; + unsigned int i; static struct notifier_block ppc64_numa_nb = { .notifier_call = cpu_numa_callback, .priority = 1 /* Must run before sched domains notifier. 
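
careful_allocation() above tries a node-local lmb allocation first, retries over all memory, and if the block landed on an already-initialised earlier node, redoes the allocation through that node's bootmem. A contrived userspace sketch of that fallback ladder; both allocator stubs and every address in them are invented for the demo.

#include <stdio.h>

static unsigned long lmb_alloc_base_stub(unsigned long size, unsigned long align,
					 unsigned long limit)
{
	(void)size; (void)align;
	return limit ? 0 : 0x2000000;	/* pretend the node-local try failed */
}

static int early_pfn_to_nid_stub(unsigned long pfn)
{
	return pfn < 0x10000 ? 0 : 1;	/* below 256MB is node 0 */
}

static unsigned long toy_careful_allocation(int nid, unsigned long size,
					    unsigned long end_pfn)
{
	unsigned long ret = lmb_alloc_base_stub(size, 64, end_pfn << 12);

	if (!ret)
		ret = lmb_alloc_base_stub(size, 64, 0);	/* retry over all memory */
	if (!ret) {
		printf("cannot allocate %lu bytes for node %d\n", size, nid);
		return 0;
	}
	if (early_pfn_to_nid_stub(ret >> 12) < nid)
		printf("fell on an earlier node: would redo via bootmem\n");
	return ret;
}

int main(void)
{
	printf("got 0x%lx\n", toy_careful_allocation(1, 4096, 0x40000));
	return 0;
}
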
*/ @@ -580,99 +592,66 @@ register_cpu_notifier(&ppc64_numa_nb); for_each_online_node(nid) { - unsigned long start_paddr, end_paddr; - int i; + unsigned long start_pfn, end_pfn, pages_present; unsigned long bootmem_paddr; unsigned long bootmap_pages; - start_paddr = init_node_data[nid].node_start_pfn * PAGE_SIZE; - end_paddr = init_node_data[nid].node_end_pfn * PAGE_SIZE; + get_region(nid, &start_pfn, &end_pfn, &pages_present); /* Allocate the node structure node local if possible */ - NODE_DATA(nid) = (struct pglist_data *)careful_allocation(nid, + NODE_DATA(nid) = careful_allocation(nid, sizeof(struct pglist_data), - SMP_CACHE_BYTES, end_paddr); - NODE_DATA(nid) = abs_to_virt(NODE_DATA(nid)); + SMP_CACHE_BYTES, end_pfn); + NODE_DATA(nid) = __va(NODE_DATA(nid)); memset(NODE_DATA(nid), 0, sizeof(struct pglist_data)); dbg("node %d\n", nid); dbg("NODE_DATA() = %p\n", NODE_DATA(nid)); NODE_DATA(nid)->bdata = &plat_node_bdata[nid]; - NODE_DATA(nid)->node_start_pfn = - init_node_data[nid].node_start_pfn; - NODE_DATA(nid)->node_spanned_pages = - end_paddr - start_paddr; + NODE_DATA(nid)->node_start_pfn = start_pfn; + NODE_DATA(nid)->node_spanned_pages = end_pfn - start_pfn; if (NODE_DATA(nid)->node_spanned_pages == 0) continue; - dbg("start_paddr = %lx\n", start_paddr); - dbg("end_paddr = %lx\n", end_paddr); + dbg("start_paddr = %lx\n", start_pfn << PAGE_SHIFT); + dbg("end_paddr = %lx\n", end_pfn << PAGE_SHIFT); - bootmap_pages = bootmem_bootmap_pages((end_paddr - start_paddr) >> PAGE_SHIFT); + bootmap_pages = bootmem_bootmap_pages(end_pfn - start_pfn); + bootmem_paddr = (unsigned long)careful_allocation(nid, + bootmap_pages << PAGE_SHIFT, + PAGE_SIZE, end_pfn); + memset(__va(bootmem_paddr), 0, bootmap_pages << PAGE_SHIFT); - bootmem_paddr = careful_allocation(nid, - bootmap_pages << PAGE_SHIFT, - PAGE_SIZE, end_paddr); - memset(abs_to_virt(bootmem_paddr), 0, - bootmap_pages << PAGE_SHIFT); dbg("bootmap_paddr = %lx\n", bootmem_paddr); init_bootmem_node(NODE_DATA(nid), bootmem_paddr >> PAGE_SHIFT, - start_paddr >> PAGE_SHIFT, - end_paddr >> PAGE_SHIFT); - - /* - * We need to do another scan of all memory sections to - * associate memory with the correct node. 
- */ - addr_cells = get_mem_addr_cells(); - size_cells = get_mem_size_cells(); - memory = NULL; - while ((memory = of_find_node_by_type(memory, "memory")) != NULL) { - unsigned long mem_start, mem_size; - int numa_domain, ranges; - unsigned int *memcell_buf; - unsigned int len; - - memcell_buf = (unsigned int *)get_property(memory, "reg", &len); - if (!memcell_buf || len <= 0) - continue; + start_pfn, end_pfn); - ranges = memory->n_addrs; /* ranges in cell */ -new_range: - mem_start = read_n_cells(addr_cells, &memcell_buf); - mem_size = read_n_cells(size_cells, &memcell_buf); - if (numa_enabled) { - numa_domain = of_node_numa_domain(memory); - if (numa_domain >= MAX_NUMNODES) - numa_domain = 0; - } else - numa_domain = 0; + /* Add free regions on this node */ + for (i = 0; init_node_data[i].end_pfn; i++) { + unsigned long start, end; - if (numa_domain != nid) + if (init_node_data[i].nid != nid) continue; - mem_size = numa_enforce_memory_limit(mem_start, mem_size); - if (mem_size) { - dbg("free_bootmem %lx %lx\n", mem_start, mem_size); - free_bootmem_node(NODE_DATA(nid), mem_start, mem_size); - } + start = init_node_data[i].start_pfn << PAGE_SHIFT; + end = init_node_data[i].end_pfn << PAGE_SHIFT; - if (--ranges) /* process all ranges in cell */ - goto new_range; + dbg("free_bootmem %lx %lx\n", start, end - start); + free_bootmem_node(NODE_DATA(nid), start, end - start); } - /* - * Mark reserved regions on this node - */ + /* Mark reserved regions on this node */ for (i = 0; i < lmb.reserved.cnt; i++) { unsigned long physbase = lmb.reserved.region[i].base; unsigned long size = lmb.reserved.region[i].size; + unsigned long start_paddr = start_pfn << PAGE_SHIFT; + unsigned long end_paddr = end_pfn << PAGE_SHIFT; - if (pa_to_nid(physbase) != nid && - pa_to_nid(physbase+size-1) != nid) + if (early_pfn_to_nid(physbase >> PAGE_SHIFT) != nid && + early_pfn_to_nid((physbase+size-1) >> PAGE_SHIFT) != nid) continue; if (physbase < end_paddr && @@ -692,46 +671,19 @@ size); } } - /* - * This loop may look famaliar, but we have to do it again - * after marking our reserved memory to mark memory present - * for sparsemem. 
- */ - addr_cells = get_mem_addr_cells(); - size_cells = get_mem_size_cells(); - memory = NULL; - while ((memory = of_find_node_by_type(memory, "memory")) != NULL) { - unsigned long mem_start, mem_size; - int numa_domain, ranges; - unsigned int *memcell_buf; - unsigned int len; - memcell_buf = (unsigned int *)get_property(memory, "reg", &len); - if (!memcell_buf || len <= 0) - continue; + /* Add regions into sparsemem */ + for (i = 0; init_node_data[i].end_pfn; i++) { + unsigned long start, end; - ranges = memory->n_addrs; /* ranges in cell */ -new_range2: - mem_start = read_n_cells(addr_cells, &memcell_buf); - mem_size = read_n_cells(size_cells, &memcell_buf); - if (numa_enabled) { - numa_domain = of_node_numa_domain(memory); - if (numa_domain >= MAX_NUMNODES) - numa_domain = 0; - } else - numa_domain = 0; - - if (numa_domain != nid) + if (init_node_data[i].nid != nid) continue; - mem_size = numa_enforce_memory_limit(mem_start, mem_size); - memory_present(numa_domain, mem_start >> PAGE_SHIFT, - (mem_start + mem_size) >> PAGE_SHIFT); + start = init_node_data[i].start_pfn; + end = init_node_data[i].end_pfn; - if (--ranges) /* process all ranges in cell */ - goto new_range2; + memory_present(nid, start, end); } - } } @@ -745,21 +697,18 @@ memset(zholes_size, 0, sizeof(zholes_size)); for_each_online_node(nid) { - unsigned long start_pfn; - unsigned long end_pfn; + unsigned long start_pfn, end_pfn, pages_present; - start_pfn = init_node_data[nid].node_start_pfn; - end_pfn = init_node_data[nid].node_end_pfn; + get_region(nid, &start_pfn, &end_pfn, &pages_present); zones_size[ZONE_DMA] = end_pfn - start_pfn; - zholes_size[ZONE_DMA] = zones_size[ZONE_DMA] - - init_node_data[nid].node_present_pages; + zholes_size[ZONE_DMA] = zones_size[ZONE_DMA] - pages_present; dbg("free_area_init node %d %lx %lx (hole: %lx)\n", nid, zones_size[ZONE_DMA], start_pfn, zholes_size[ZONE_DMA]); - free_area_init_node(nid, NODE_DATA(nid), zones_size, - start_pfn, zholes_size); + free_area_init_node(nid, NODE_DATA(nid), zones_size, start_pfn, + zholes_size); } } --- linux.orig/arch/x86_64/Kconfig~F2-create-__boot-x86_64 2005-09-30 12:38:23.000000000 -0700 +++ linux/arch/x86_64/Kconfig 2005-09-30 12:38:23.000000000 -0700 @@ -277,6 +277,9 @@ config HAVE_ARCH_EARLY_PFN_TO_NID def_bool y +config ARCH_HAS_BOOTPA + def_bool y + config NR_CPUS int "Maximum number of CPUs (2-256)" range 2 256 --- linux.orig/arch/x86_64/mm/init.c~A0.0-will-not-push-x86_64-hotplug-functions 2005-09-30 12:38:01.000000000 -0700 +++ linux/arch/x86_64/mm/init.c 2005-09-30 12:38:01.000000000 -0700 @@ -22,6 +22,8 @@ #include #include #include +#include +#include #include #include @@ -179,13 +181,19 @@ {} }; -static __init void *alloc_low_page(int *index, unsigned long *phys) +static __devinit void *alloc_low_page(int *index, unsigned long *phys) { struct temp_map *ti; int i; unsigned long pfn = table_end++, paddr; void *adr; + if (after_bootmem) { + adr = (void *)get_zeroed_page(GFP_ATOMIC); + *phys = __pa(adr); + return adr; + } + if (pfn >= end_pfn) panic("alloc_low_page: ran out of memory"); for (i = 0; temp_mappings[i].allocated; i++) { @@ -198,55 +206,95 @@ ti->allocated = 1; __flush_tlb(); adr = ti->address + ((pfn << PAGE_SHIFT) & ~PMD_MASK); + memset(adr, 0, PAGE_SIZE); *index = i; *phys = pfn * PAGE_SIZE; return adr; } -static __init void unmap_low_page(int i) +static __devinit void unmap_low_page(int i) { - struct temp_map *ti = &temp_mappings[i]; + struct temp_map *ti; + + if (after_bootmem) + return; + ti = &temp_mappings[i]; 
set_pmd(ti->pmd, __pmd(0)); ti->allocated = 0; } -static void __init phys_pud_init(pud_t *pud, unsigned long address, unsigned long end) + +static void __devinit +phys_pmd_init(pmd_t *pmd, unsigned long address, unsigned long end) +{ + int i; + + printk("%s: pmd: 0x%p, address: 0x%lx end: 0x%lx\n", + __func__, pmd, address, end); + + for (i = 0; i < PTRS_PER_PMD; pmd++, i++, address += PMD_SIZE) { + unsigned long entry; + + if (address > end) { + for (; i < PTRS_PER_PMD; i++, pmd++) + set_pmd(pmd, __pmd(0)); + break; + } + entry = _PAGE_NX|_PAGE_PSE | _KERNPG_TABLE | _PAGE_GLOBAL | address; + entry &= __supported_pte_mask; + set_pmd(pmd, __pmd(entry)); + } +} + + +static void __devinit +phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end) +{ + pmd_t *pmd = pmd_offset(pud, (unsigned long)__va(address)); + + printk("%s: addr: 0x%lx end: 0x%lx pmd: 0x%p\n", + __func__, address, end, pmd); + + if (pmd_none(*pmd)) { + spin_lock(&init_mm.page_table_lock); + phys_pmd_init(pmd, address, end); + spin_unlock(&init_mm.page_table_lock); + __flush_tlb_all(); + } +} + + + +static void __devinit phys_pud_init(pud_t *pud, unsigned long address, unsigned long end) { - long i, j; + long i = pud_index(address); - i = pud_index(address); pud = pud + i; + + if (after_bootmem && pud_val(*pud)) { + phys_pmd_update(pud, address, end); + return; + } + for (; i < PTRS_PER_PUD; pud++, i++) { int map; unsigned long paddr, pmd_phys; pmd_t *pmd; - paddr = address + i*PUD_SIZE; - if (paddr >= end) { - for (; i < PTRS_PER_PUD; i++, pud++) - set_pud(pud, __pud(0)); + paddr = (address & PGDIR_MASK) + i*PUD_SIZE; + if (paddr >= end) break; - } - if (!e820_mapped(paddr, paddr+PUD_SIZE, 0)) { + if (!after_bootmem && !e820_mapped(paddr, paddr+PUD_SIZE, 0)) { set_pud(pud, __pud(0)); continue; } pmd = alloc_low_page(&map, &pmd_phys); + if (after_bootmem) spin_lock(&init_mm.page_table_lock); set_pud(pud, __pud(pmd_phys | _KERNPG_TABLE)); - for (j = 0; j < PTRS_PER_PMD; pmd++, j++, paddr += PMD_SIZE) { - unsigned long pe; - - if (paddr >= end) { - for (; j < PTRS_PER_PMD; j++, pmd++) - set_pmd(pmd, __pmd(0)); - break; - } - pe = _PAGE_NX|_PAGE_PSE | _KERNPG_TABLE | _PAGE_GLOBAL | paddr; - pe &= __supported_pte_mask; - set_pmd(pmd, __pmd(pe)); - } + phys_pmd_init(pmd, paddr, end); + if (after_bootmem) spin_unlock(&init_mm.page_table_lock); unmap_low_page(map); } __flush_tlb(); @@ -267,12 +315,16 @@ table_start >>= PAGE_SHIFT; table_end = table_start; + + early_printk("kernel direct mapping tables upto %lx @ %lx-%lx\n", end, + table_start< end) next = end; phys_pud_init(pud, __pa(start), __pa(next)); - set_pgd(pgd_offset_k(start), mk_kernel_pgd(pud_phys)); + if (!after_bootmem) + set_pgd(pgd_offset_k(start), mk_kernel_pgd(pud_phys)); unmap_low_page(map); } - asm volatile("movq %%cr4,%0" : "=r" (mmu_cr4_features)); + if (!after_bootmem) + asm volatile("movq %%cr4,%0" : "=r" (mmu_cr4_features)); __flush_tlb_all(); - early_printk("kernel direct mapping tables upto %lx @ %lx-%lx\n", end, - table_start<node_zones + MAX_NR_ZONES - 2; + unsigned long start_pfn = start >> PAGE_SHIFT; + unsigned long nr_pages = size >> PAGE_SHIFT; + int ret; + + ret = __add_pages(zone, start_pfn, nr_pages, attr); + if (ret) + goto error; + + init_memory_mapping(start, (start + size - 1)); + + return ret; +error: + printk("%s: Problem encountered in __add_pages!\n", __func__); + return ret; +} +EXPORT_SYMBOL(add_memory); + +int remove_memory(u64 start, u64 size, unsigned long attr) +{ + struct zone *zone; + unsigned long start_pfn, end_pfn, 
nr_pages; + + printk("%s: start: 0x%llx size: 0x%llx attr: 0x%lx\n", + __func__, start, size, attr); + + start_pfn = start >> PAGE_SHIFT; + nr_pages = size >> PAGE_SHIFT; + /* end_pfn is the last *valid* pfn */ + end_pfn = start_pfn + nr_pages - 1; + + zone = page_zone(pfn_to_page(start_pfn)); + + printk("%s: memory will be removed from the %s zone\n", + __func__, zone->name); + printk("%s: start_pfn: 0x%lx nr_pages: 0x%lx end_pfn: 0x%lx\n", + __func__, start_pfn, nr_pages, end_pfn); + + if (zone != page_zone(pfn_to_page(end_pfn))) + goto overlap; + + printk("%s: just before remove pages\n", __func__); + + return __remove_pages(zone, start_pfn, nr_pages, attr); +overlap: + printk("%s: memory range overlaps multiple zones?\n", __func__); + return -ENOSYS; +} +EXPORT_SYMBOL(remove_memory); + +#endif static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel, kcore_modules, kcore_vsyscall; --- linux.orig/drivers/acpi/acpi_memhotplug.c~FROM-MM-memory-hotplug-move-section_mem_map-alloc-to-sparsec-fix 2005-09-30 12:37:57.000000000 -0700 +++ linux/drivers/acpi/acpi_memhotplug.c 2005-09-30 12:38:04.000000000 -0700 @@ -30,6 +30,7 @@ #include #include #include +#include #include #define ACPI_MEMORY_DEVICE_COMPONENT 0x08000000UL @@ -180,6 +181,19 @@ return_VALUE(0); } +static acpi_status acpi_memory_set_name(struct acpi_memory_device *mem_device) +{ + struct acpi_device *device = NULL; + acpi_status status; + int ret; + status = acpi_bus_get_device(mem_device->handle, &device); + if (ACPI_FAILURE(status)) + return status; + ret = attach_device_to_memsection(mem_device->start_addr, + mem_device->end_addr, &device->kobj); + return_VALUE(ret); +} + static int acpi_memory_enable_device(struct acpi_memory_device *mem_device) { int result; @@ -200,14 +214,14 @@ * Note: Assume that this function returns zero on success */ result = add_memory(mem_device->start_addr, - (mem_device->end_addr - mem_device->start_addr) + 1, - mem_device->read_write_attribute); + (mem_device->end_addr - mem_device->start_addr) + 1); if (result) { ACPI_DEBUG_PRINT((ACPI_DB_ERROR, "\nadd_memory failed\n")); mem_device->state = MEMORY_INVALID_STATE; return result; } - + /* link to /sys/devices/system/memory/memoryX */ + result = acpi_memory_set_name(mem_device); return result; } @@ -259,7 +273,7 @@ * Ask the VM to offline this memory range. 
* Note: Assume that this function returns zero on success */ - result = remove_memory(start, len, attr); + result = remove_memory(start, len); if (result) { ACPI_DEBUG_PRINT((ACPI_DB_ERROR, "Hot-Remove failed.\n")); return_VALUE(result); @@ -473,6 +487,21 @@ return_ACPI_STATUS(status); } +static acpi_status __init acpi_memory_set_name_cb(acpi_handle handle, u32 level, + void *ctxt, void **retv) +{ + acpi_status status; + struct acpi_memory_device *mem_device; + status = is_memory_device(handle); + if (ACPI_FAILURE(status)) + return_ACPI_STATUS(AE_OK); + if (acpi_memory_get_device(handle, &mem_device)) { + ACPI_DEBUG_PRINT((ACPI_DB_ERROR, + "Error in finding driver data\n")); + } + return acpi_memory_set_name(mem_device); +} + static int __init acpi_memory_device_init(void) { int result; @@ -496,6 +525,16 @@ return_VALUE(-ENODEV); } + status = acpi_walk_namespace(ACPI_TYPE_DEVICE, ACPI_ROOT_OBJECT, + ACPI_UINT32_MAX, + acpi_memory_set_name_cb, + NULL, NULL); + if (ACPI_FAILURE(status)) { + ACPI_DEBUG_PRINT((ACPI_DB_ERROR, "walk_namespace failed\n")); + acpi_bus_unregister_driver(&acpi_memory_device_driver); + return_VALUE(-ENODEV); + } + return_VALUE(0); } --- linux.orig/drivers/acpi/processor_idle.c~A9.2-acpi-warnings 2005-09-30 12:38:08.000000000 -0700 +++ linux/drivers/acpi/processor_idle.c 2005-09-30 12:38:08.000000000 -0700 @@ -910,7 +910,7 @@ if (!pr) goto end; - seq_printf(seq, "active state: C%zd\n" + seq_printf(seq, "active state: C%d\n" "max_cstate: C%d\n" "bus master activity: %08x\n", pr->power.state ? pr->power.state - pr->power.states : 0, @@ -944,14 +944,14 @@ } if (pr->power.states[i].promotion.state) - seq_printf(seq, "promotion[C%zd] ", + seq_printf(seq, "promotion[C%d] ", (pr->power.states[i].promotion.state - pr->power.states)); else seq_puts(seq, "promotion[--] "); if (pr->power.states[i].demotion.state) - seq_printf(seq, "demotion[C%zd] ", + seq_printf(seq, "demotion[C%d] ", (pr->power.states[i].demotion.state - pr->power.states)); else --- linux.orig/drivers/base/Makefile~FROM-MM-memory-hotplug-sysfs-and-add-remove-functions 2005-09-30 12:37:55.000000000 -0700 +++ linux/drivers/base/Makefile 2005-09-30 12:37:55.000000000 -0700 @@ -7,6 +7,7 @@ obj-y += power/ obj-$(CONFIG_FW_LOADER) += firmware_class.o obj-$(CONFIG_NUMA) += node.o +obj-$(CONFIG_MEMORY_HOTPLUG) += memory.o ifeq ($(CONFIG_DEBUG_DRIVER),y) EXTRA_CFLAGS += -DDEBUG --- linux.orig/drivers/base/init.c~FROM-MM-memory-hotplug-sysfs-and-add-remove-functions 2005-09-30 12:37:55.000000000 -0700 +++ linux/drivers/base/init.c 2005-09-30 12:37:55.000000000 -0700 @@ -9,6 +9,7 @@ #include #include +#include extern int devices_init(void); extern int buses_init(void); @@ -39,5 +40,6 @@ platform_bus_init(); system_bus_init(); cpu_dev_init(); + memory_dev_init(); attribute_container_init(); } --- /dev/null 2005-03-30 22:36:15.000000000 -0800 +++ linux/drivers/base/memory.c 2005-09-30 12:39:27.000000000 -0700 @@ -0,0 +1,498 @@ +/* + * drivers/base/memory.c - basic Memory class support + * + * Written by Matt Tolentino + * Dave Hansen + * + * This file provides the necessary infrastructure to represent + * a SPARSEMEM-memory-model system's physical memory in /sysfs. + * All arch-independent code that assumes MEMORY_HOTPLUG requires + * SPARSEMEM should be contained here, or in mm/memory_hotplug.c. 
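The register_memory_notifier()/unregister_memory_notifier() pair set up in this file is the hook other subsystems are expected to use. A minimal consumer might look like the following sketch; it is hypothetical client code, not part of this series, and assumes only the MEM_* constants from include/linux/memory.h plus the standard notifier_block API:

#include <linux/init.h>
#include <linux/notifier.h>
#include <linux/memory.h>

/* Hypothetical client: react to memory blocks coming and going. */
static int example_memory_event(struct notifier_block *nb,
				unsigned long action, void *arg)
{
	switch (action) {
	case MEM_GOING_OFFLINE:
		/* stop handing out new references into the block */
		break;
	case MEM_ONLINE:
		/* the block's pages are now available to the allocator */
		break;
	}
	return NOTIFY_OK;
}

static struct notifier_block example_memory_nb = {
	.notifier_call	= example_memory_event,
};

static int __init example_client_init(void)
{
	return register_memory_notifier(&example_memory_nb);
}

The chain is only called on successful transitions (see memory_block_action() below), so a callback does not need to distinguish attempted from completed state changes.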
+ */ + +#include +#include +#include +#include /* capable() */ +#include +#include +#include +#include +#include +#include +#include +#include + +#define MEMORY_CLASS_NAME "memory" + +static struct sysdev_class memory_sysdev_class = { + set_kset_name(MEMORY_CLASS_NAME), +}; +EXPORT_SYMBOL(memory_sysdev_class); + +static const char *memory_hotplug_name(struct kset *kset, struct kobject *kobj) +{ + return MEMORY_CLASS_NAME; +} + +static int memory_hotplug(struct kset *kset, struct kobject *kobj, char **envp, + int num_envp, char *buffer, int buffer_size) +{ + int retval = 0; + + return retval; +} + +static struct kset_hotplug_ops memory_hotplug_ops = { + .name = memory_hotplug_name, + .hotplug = memory_hotplug, +}; + +static struct notifier_block *memory_chain; + +int register_memory_notifier(struct notifier_block *nb) +{ + return notifier_chain_register(&memory_chain, nb); +} + +void unregister_memory_notifier(struct notifier_block *nb) +{ + notifier_chain_unregister(&memory_chain, nb); +} + +/* + * register_memory - Setup a sysfs device for a memory block + */ +int register_memory(struct memory_block *memory, struct mem_section *section, + struct node *root) +{ + int error; + + memory->sysdev.cls = &memory_sysdev_class; + memory->sysdev.id = __section_nr(section); + + error = sysdev_register(&memory->sysdev); + + if (root && !error) + error = sysfs_create_link(&root->sysdev.kobj, + &memory->sysdev.kobj, + kobject_name(&memory->sysdev.kobj)); + + return error; +} + +static void +unregister_memory(struct memory_block *memory, struct mem_section *section, + struct node *root) +{ + BUG_ON(memory->sysdev.cls != &memory_sysdev_class); + BUG_ON(memory->sysdev.id != __section_nr(section)); + + sysdev_unregister(&memory->sysdev); + if (root) + sysfs_remove_link(&root->sysdev.kobj, + kobject_name(&memory->sysdev.kobj)); +} + +/* + * use this as the physical section index that this memsection + * uses. + */ + +static ssize_t show_mem_phys_index(struct sys_device *dev, char *buf) +{ + struct memory_block *mem = + container_of(dev, struct memory_block, sysdev); + return sprintf(buf, "%08lx\n", mem->phys_index); +} + +/* + * online, offline, going offline, etc. + */ +static ssize_t show_mem_state(struct sys_device *dev, char *buf) +{ + struct memory_block *mem = + container_of(dev, struct memory_block, sysdev); + ssize_t len = 0; + + /* + * We can probably put these states in a nice little array + * so that they're not open-coded + */ + switch (mem->state) { + case MEM_ONLINE: + len = sprintf(buf, "online\n"); + break; + case MEM_OFFLINE: + len = sprintf(buf, "offline\n"); + break; + case MEM_GOING_OFFLINE: + len = sprintf(buf, "going-offline\n"); + break; + default: + len = sprintf(buf, "ERROR-UNKNOWN-%ld\n", + mem->state); + WARN_ON(1); + break; + } + + return len; +} + +static inline int memory_notify(unsigned long val, void *v) +{ + return notifier_call_chain(&memory_chain, val, v); +} + +/* + * MEMORY_HOTPLUG depends on SPARSEMEM in mm/Kconfig, so it is + * OK to have direct references to sparsemem variables in here. + */ +static int +memory_block_action(struct memory_block *mem, unsigned long action) +{ + int i; + unsigned long psection; + unsigned long start_pfn, start_paddr; + struct page *first_page; + int ret; + int old_state = mem->state; + + psection = mem->phys_index; + first_page = pfn_to_page(psection << PFN_SECTION_SHIFT); + + /* + * The probe routines leave the pages reserved, just + * as the bootmem code does. Make sure they're still + * that way. 
+ */ + if (action == MEM_ONLINE) { + for (i = 0; i < PAGES_PER_SECTION; i++) { + if (PageReserved(first_page+i)) + continue; + + printk(KERN_WARNING "section number %ld page number %d " + "not reserved, was it already online? \n", + psection, i); + return -EBUSY; + } + } + + switch (action) { + case MEM_ONLINE: + start_pfn = page_to_pfn(first_page); + ret = online_pages(start_pfn, PAGES_PER_SECTION); + break; + case MEM_OFFLINE: + mem->state = MEM_GOING_OFFLINE; + memory_notify(MEM_GOING_OFFLINE, NULL); + start_paddr = page_to_pfn(first_page) << PAGE_SHIFT; + ret = remove_memory(start_paddr, + PAGES_PER_SECTION << PAGE_SHIFT); + if (ret) { + mem->state = old_state; + break; + } + memory_notify(MEM_MAPPING_INVALID, NULL); + break; + default: + printk(KERN_WARNING "%s(%p, %ld) unknown action: %ld\n", + __FUNCTION__, mem, action, action); + WARN_ON(1); + ret = -EINVAL; + } + /* + * For now, only notify on successful memory operations + */ + if (!ret) + memory_notify(action, NULL); + + return ret; +} + +static int memory_block_change_state(struct memory_block *mem, + unsigned long to_state, unsigned long from_state_req) +{ + int ret = 0; + down(&mem->state_sem); + + if (mem->state != from_state_req) { + ret = -EINVAL; + goto out; + } + + ret = memory_block_action(mem, to_state); + if (!ret) + mem->state = to_state; + +out: + up(&mem->state_sem); + return ret; +} + +static ssize_t +store_mem_state(struct sys_device *dev, const char *buf, size_t count) +{ + struct memory_block *mem; + unsigned int phys_section_nr; + int ret = -EINVAL; + + mem = container_of(dev, struct memory_block, sysdev); + phys_section_nr = mem->phys_index; + + if (!valid_section_nr(phys_section_nr)) + goto out; + + if (!strncmp(buf, "online", min((int)count, 6))) + ret = memory_block_change_state(mem, MEM_ONLINE, MEM_OFFLINE); + else if(!strncmp(buf, "offline", min((int)count, 7))) + ret = memory_block_change_state(mem, MEM_OFFLINE, MEM_ONLINE); +out: + if (ret) + return ret; + return count; +} + + +static SYSDEV_ATTR(phys_index, 0444, show_mem_phys_index, NULL); +static SYSDEV_ATTR(state, 0644, show_mem_state, store_mem_state); + + +#define mem_create_simple_file(mem, attr_name) \ + sysdev_create_file(&mem->sysdev, &attr_##attr_name) +#define mem_remove_simple_file(mem, attr_name) \ + sysdev_remove_file(&mem->sysdev, &attr_##attr_name) + +/* + * Block size attribute stuff + */ +static ssize_t +print_block_size(struct class *class, char *buf) +{ + return sprintf(buf, "%lx\n", (unsigned long)PAGES_PER_SECTION * PAGE_SIZE); +} + +static CLASS_ATTR(block_size_bytes, 0444, print_block_size, NULL); + +static int block_size_init(void) +{ + sysfs_create_file(&memory_sysdev_class.kset.kobj, + &class_attr_block_size_bytes.attr); + return 0; +} + +/* + * Some architectures will have custom drivers to do this, and + * will not need to do it from userspace. The fake hot-add code + * as well as ppc64 will do all of their discovery in userspace + * and will require this interface. 
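With CONFIG_ARCH_MEMORY_PROBE set, that userspace discovery amounts to writing a physical address to the probe file and then onlining the resulting block. A hypothetical helper (the address and section number are the caller's to supply; the paths follow the class and attribute names used in this file) could be as small as:

#include <stdio.h>

/* Add the section starting at phys_addr, then online block memory%d.
 * section_nr is the number the kernel derives from the address, i.e.
 * the physical address shifted down by SECTION_SIZE_BITS. */
static int probe_and_online(unsigned long long phys_addr, int section_nr)
{
	char path[128];
	FILE *f;

	f = fopen("/sys/devices/system/memory/probe", "w");
	if (!f)
		return -1;
	fprintf(f, "0x%llx\n", phys_addr);
	fclose(f);

	snprintf(path, sizeof(path),
		 "/sys/devices/system/memory/memory%d/state", section_nr);
	f = fopen(path, "w");
	if (!f)
		return -1;
	fprintf(f, "online\n");
	fclose(f);
	return 0;
}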
+ */ +extern int page_is_hotpluggable_ram(unsigned long pfn); +#ifdef CONFIG_ARCH_MEMORY_PROBE +static ssize_t +memory_probe_store(struct class *class, const char __user *buf, size_t count) +{ + u64 phys_addr; + unsigned long offset; + int ret; + + phys_addr = simple_strtoull(buf, NULL, 0); + + for (offset = 0; offset < PAGES_PER_SECTION; offset++) { + unsigned long page_nr = (phys_addr >> PAGE_SHIFT) + offset; + if (page_is_hotpluggable_ram(page_nr)) + break; + } + if (offset == PAGES_PER_SECTION) + return -EINVAL; + + ret = add_memory(phys_addr, PAGES_PER_SECTION << PAGE_SHIFT); + + if (ret) + count = ret; + + return count; +} +static CLASS_ATTR(probe, 0700, NULL, memory_probe_store); + +static int memory_probe_init(void) +{ + sysfs_create_file(&memory_sysdev_class.kset.kobj, + &class_attr_probe.attr); + return 0; +} +#else +#define memory_probe_init(...) do {} while (0) +#endif + +static int attach_phys_device(struct memory_block *mem, struct kobject *kobj) +{ + char name[24]; + int ret; + sprintf(name, "phys_device%d",mem->phys_device); + ret = sysfs_create_link(&mem->sysdev.kobj, kobj, name); + if (ret) + return ret; + mem->phys_device++; + return 0; +} + +static void remove_all_phys_device(struct memory_block *mem) +{ + char name[24]; + int i; + for (i = 0; i < mem->phys_device; i++) { + sprintf(name, "phys_device%d",i); + sysfs_remove_link(&mem->sysdev.kobj, name); + } + mem->phys_device = 0; +} + +/* + * kobj is a kobject of physical memory device which includes the specified range + * It is here to allow for differentiation between which *physical* devices each + * section belongs to... + * If kobj != NULL, symbolic link to device from mem_section is created. + */ + +static int add_memory_block(unsigned long node_id, struct mem_section *section, + unsigned long state, struct kobject *kobj) +{ + struct memory_block *mem = kzalloc(sizeof(*mem), GFP_KERNEL); + int ret = 0; + + if (!mem) + return -ENOMEM; + + mem->phys_index = __section_nr(section); + mem->state = state; + init_MUTEX(&mem->state_sem); + + ret = register_memory(mem, section, NULL); + if (!ret) + ret = mem_create_simple_file(mem, phys_index); + if (!ret) + ret = mem_create_simple_file(mem, state); + if (!ret && kobj) + ret = attach_phys_device(mem, kobj); + + return ret; +} + +/* + * For now, we have a linear search to go find the appropriate + * memory_block corresponding to a particular phys_index. If + * this gets to be a real problem, we can always use a radix + * tree or something here. + * + * This could be made generic for all sysdev classes. 
+ */ +static struct memory_block *find_memory_block(struct mem_section *section) +{ + struct kobject *kobj; + struct sys_device *sysdev; + struct memory_block *mem; + char name[sizeof(MEMORY_CLASS_NAME) + 9 + 1]; + + /* + * This only works because we know that section == sysdev->id + * slightly redundant with sysdev_register() + */ + sprintf(&name[0], "%s%d", MEMORY_CLASS_NAME, __section_nr(section)); + + kobj = kset_find_obj(&memory_sysdev_class.kset, name); + if (!kobj) + return NULL; + + sysdev = container_of(kobj, struct sys_device, kobj); + mem = container_of(sysdev, struct memory_block, sysdev); + + return mem; +} + +static struct memory_block *pfn_to_memory_block(unsigned long pfn) +{ + struct mem_section *section; + section = __nr_to_section(pfn_to_section_nr(pfn)); + return find_memory_block(section); +} + +int remove_memory_block(unsigned long node_id, struct mem_section *section) +{ + struct memory_block *mem; + + mem = find_memory_block(section); + mem_remove_simple_file(mem, phys_index); + mem_remove_simple_file(mem, state); + if (mem->phys_device) + remove_all_phys_device(mem); + unregister_memory(mem, section, NULL); + + return 0; +} + +/* + * creating symbolic link from mem_section[] in specified address range + * to specified device. This device here is expected to be physical memory device. + * This symbolic link will be used to show relationship between mem_section and device. + */ +int attach_device_to_memsection(u64 start_addr, u64 end_addr, struct kobject *kobj) +{ + unsigned long pfn = start_addr >> PAGE_SHIFT; + unsigned long end_pfn = end_addr >> PAGE_SHIFT; + struct memory_block *mem; + int ret = 0; + for (; pfn < end_pfn; pfn += PAGES_PER_SECTION) { + mem = pfn_to_memory_block(pfn); + if (mem) { + ret = attach_phys_device(mem, kobj); + if (ret) + break; + } + } + return ret; +} + +/* + * need an interface for the VM to add new memory regions, + * but without onlining it. + */ +int register_new_memory(struct mem_section *section) +{ + return add_memory_block(0, section, MEM_OFFLINE, NULL); +} + +int unregister_memory_section(struct mem_section *section) +{ + if (!valid_section(section)) + return -EINVAL; + + return remove_memory_block(0, section); +} + +/* + * Initialize the sysfs support for memory devices... 
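register_new_memory() above is the entry point the arch-independent hot-add path is expected to call once the mem_section entries for a new range exist; a caller would look roughly like this sketch (the helper name and the assumption that the range is section-aligned are mine, not from this series):

/* Sketch: expose each new section of [start_pfn, start_pfn + nr_pages)
 * in sysfs without onlining it; userspace onlines via the state file. */
static int example_register_range(unsigned long start_pfn,
				  unsigned long nr_pages)
{
	unsigned long pfn;
	int ret;

	for (pfn = start_pfn; pfn < start_pfn + nr_pages;
	     pfn += PAGES_PER_SECTION) {
		ret = register_new_memory(__nr_to_section(pfn_to_section_nr(pfn)));
		if (ret)
			return ret;
	}
	return 0;
}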
+ */ +int __init memory_dev_init(void) +{ + unsigned int i; + int ret; + + memory_sysdev_class.kset.hotplug_ops = &memory_hotplug_ops; + ret = sysdev_class_register(&memory_sysdev_class); + + /* + * Create entries for memory sections that were found + * during boot and have been initialized + */ + for (i = 0; i < NR_MEM_SECTIONS; i++) { + if (!valid_section_nr(i)) + continue; + add_memory_block(0, __nr_to_section(i), MEM_ONLINE, 0); + } + + memory_probe_init(); + block_size_init(); + + return ret; +} --- linux.orig/drivers/scsi/scsi_scan.c~A9.3-uninit-scsi_report_lun_scan-result 2005-09-30 12:38:09.000000000 -0700 +++ linux/drivers/scsi/scsi_scan.c 2005-09-30 12:38:09.000000000 -0700 @@ -1063,7 +1063,7 @@ unsigned int lun; unsigned int num_luns; unsigned int retries; - int result; + int result = 0; struct scsi_lun *lunp, *lun_data; u8 *data; struct scsi_sense_hdr sshdr; --- linux.orig/fs/aio.c~AA-PM-24-aio 2005-09-30 12:40:02.000000000 -0700 +++ linux/fs/aio.c 2005-09-30 12:40:02.000000000 -0700 @@ -132,7 +132,8 @@ dprintk("attempting mmap of %lu bytes\n", info->mmap_size); down_write(&ctx->mm->mmap_sem); info->mmap_base = do_mmap(NULL, 0, info->mmap_size, - PROT_READ|PROT_WRITE, MAP_ANON|MAP_PRIVATE, + PROT_READ|PROT_WRITE, + MAP_ANON|MAP_PRIVATE|MAP_IMMOVABLE, 0); if (IS_ERR((void *)info->mmap_base)) { up_write(&ctx->mm->mmap_sem); --- linux.orig/fs/buffer.c~AA-PM-20.0-nowriteback 2005-09-30 12:39:59.000000000 -0700 +++ linux/fs/buffer.c 2005-09-30 12:39:59.000000000 -0700 @@ -3012,6 +3012,50 @@ return 0; } +void +generic_move_buffer(struct page *page, struct page *newpage) +{ + struct buffer_head *bh, *head; + + spin_lock(&page->mapping->private_lock); + bh = head = page_buffers(page); + do { + get_bh(bh); + lock_buffer(bh); + } while ((bh = bh->b_this_page) != head); + + newpage->private = page->private; + page->private = 0; + page_cache_release(page); + page_cache_get(newpage); + + /* XXX */ + ClearPagePrivate(page); + SetPagePrivate(newpage); + + bh = head; + do { + BUG_ON(bh->b_page != page); + set_bh_page(bh, newpage, (unsigned long)bh->b_data & (PAGE_SIZE - 1)); + } while ((bh = bh->b_this_page) != head); + spin_unlock(&page->mapping->private_lock); + /* buffers are unlocked when remapping is complete */ +} + +void +unlock_page_buffer(struct page *page) +{ + struct buffer_head *bh, *head; + + spin_lock(&page->mapping->private_lock); + bh = head = page_buffers(page); + do { + put_bh(bh); + unlock_buffer(bh); + } while ((bh = bh->b_this_page) != head); + spin_unlock(&page->mapping->private_lock); +} + /* * Buffer-head allocation */ @@ -3134,6 +3178,7 @@ EXPORT_SYMBOL(generic_block_bmap); EXPORT_SYMBOL(generic_commit_write); EXPORT_SYMBOL(generic_cont_expand); +EXPORT_SYMBOL(generic_move_buffer); EXPORT_SYMBOL(init_buffer); EXPORT_SYMBOL(invalidate_bdev); EXPORT_SYMBOL(ll_rw_block); --- linux.orig/fs/ext2/inode.c~AA-PM-21-nowriteback-ext2 2005-09-30 12:39:59.000000000 -0700 +++ linux/fs/ext2/inode.c 2005-09-30 12:39:59.000000000 -0700 @@ -31,6 +31,7 @@ #include #include #include +#include #include "ext2.h" #include "acl.h" #include "xip.h" @@ -692,6 +693,12 @@ return mpage_writepages(mapping, wbc, ext2_get_block); } +static int +ext2_migrate_page(struct page *from, struct page *to) +{ + return generic_migrate_page(from, to, migrate_page_buffer); +} + struct address_space_operations ext2_aops = { .readpage = ext2_readpage, .readpages = ext2_readpages, @@ -702,6 +709,7 @@ .bmap = ext2_bmap, .direct_IO = ext2_direct_IO, .writepages = ext2_writepages, + .migrate_page = ext2_migrate_page, 
}; struct address_space_operations ext2_aops_xip = { @@ -719,6 +727,7 @@ .bmap = ext2_bmap, .direct_IO = ext2_direct_IO, .writepages = ext2_writepages, + .migrate_page = ext2_migrate_page, }; /* --- linux.orig/fs/ext3/inode.c~AA-PM-21-nowriteback-ext3 2005-09-30 12:40:00.000000000 -0700 +++ linux/fs/ext3/inode.c 2005-09-30 12:40:00.000000000 -0700 @@ -35,6 +35,7 @@ #include #include #include +#include #include #include "xattr.h" #include "acl.h" @@ -1539,6 +1540,12 @@ return __set_page_dirty_nobuffers(page); } +static int +ext3_migrate_page(struct page *from, struct page *to) +{ + return generic_migrate_page(from, to, migrate_page_buffer); +} + static struct address_space_operations ext3_ordered_aops = { .readpage = ext3_readpage, .readpages = ext3_readpages, @@ -1550,6 +1557,7 @@ .invalidatepage = ext3_invalidatepage, .releasepage = ext3_releasepage, .direct_IO = ext3_direct_IO, + .migrate_page = ext3_migrate_page, }; static struct address_space_operations ext3_writeback_aops = { @@ -1563,6 +1571,7 @@ .invalidatepage = ext3_invalidatepage, .releasepage = ext3_releasepage, .direct_IO = ext3_direct_IO, + .migrate_page = ext3_migrate_page, }; static struct address_space_operations ext3_journalled_aops = { @@ -1576,6 +1585,7 @@ .bmap = ext3_bmap, .invalidatepage = ext3_invalidatepage, .releasepage = ext3_releasepage, + .migrate_page = ext3_migrate_page, }; void ext3_set_aops(struct inode *inode) --- linux.orig/fs/namei.c~AA-PM-27-symlink 2005-09-30 12:40:03.000000000 -0700 +++ linux/fs/namei.c 2005-09-30 12:40:03.000000000 -0700 @@ -2402,10 +2402,19 @@ int page_symlink(struct inode *inode, const char *symname, int len) { struct address_space *mapping = inode->i_mapping; - struct page *page = grab_cache_page(mapping, 0); + struct page *page; int err = -ENOMEM; char *kaddr; + /* XXXX: + * This is temporary code. This code should be replaced with proper one + * After the scheme to specify hot removable memory region has defined. + * Or remove this code if pages for symlink files become hot-pluggable. + * 5/Oct/2004 -- taka + */ + mapping_set_gfp_mask(mapping, mapping_gfp_mask(mapping) & ~__GFP_HIGHMEM); + + page = grab_cache_page(mapping, 0); if (!page) goto fail; err = mapping->a_ops->prepare_write(NULL, page, 0, len-1); --- linux.orig/include/asm-alpha/mmzone.h~A4.1-antonb-Remove_kvaddr_to_nid_and_local_mapnr 2005-09-30 12:38:06.000000000 -0700 +++ linux/include/asm-alpha/mmzone.h 2005-09-30 12:38:06.000000000 -0700 @@ -32,8 +32,6 @@ #define pa_to_nid(pa) alpha_pa_to_nid(pa) #define NODE_DATA(nid) (&node_data[(nid)]) -#define node_localnr(pfn, nid) ((pfn) - NODE_DATA(nid)->node_start_pfn) - #if 1 #define PLAT_NODE_DATA_LOCALNR(p, n) \ (((p) >> PAGE_SHIFT) - PLAT_NODE_DATA(n)->gendata.node_start_pfn) @@ -49,28 +47,14 @@ #ifdef CONFIG_DISCONTIGMEM +#define node_localnr(pfn, nid) ((pfn) - NODE_DATA(nid)->node_start_pfn) + /* * Following are macros that each numa implementation must define. */ -/* - * Given a kernel address, find the home node of the underlying memory. - */ -#define kvaddr_to_nid(kaddr) pa_to_nid(__pa(kaddr)) #define node_start_pfn(nid) (NODE_DATA(nid)->node_start_pfn) -#define local_mapnr(kvaddr) \ - ((__pa(kvaddr) >> PAGE_SHIFT) - node_start_pfn(kvaddr_to_nid(kvaddr))) - -/* - * Given a kaddr, LOCAL_BASE_ADDR finds the owning node of the memory - * and returns the kaddr corresponding to first physical page in the - * node's mem_map. 
- */ -#define LOCAL_BASE_ADDR(kaddr) \ - ((unsigned long)__va(NODE_DATA(kvaddr_to_nid(kaddr))->node_start_pfn \ - << PAGE_SHIFT)) - /* XXX: FIXME -- wli */ #define kern_addr_valid(kaddr) (0) @@ -106,8 +90,9 @@ #define pfn_to_page(pfn) \ ({ \ - unsigned long kaddr = (unsigned long)__va((pfn) << PAGE_SHIFT); \ - (NODE_DATA(kvaddr_to_nid(kaddr))->node_mem_map + local_mapnr(kaddr)); \ + unsigned long __tmp = pfn; \ + (NODE_DATA(pfn_to_nid(__tmp))->node_mem_map + \ + node_localnr(__tmp, pfn_to_nid(__tmp))); \ }) #define page_to_pfn(page) \ --- linux.orig/include/asm-i386/mman.h~AA-PM-22-vm_immovable 2005-09-30 12:40:01.000000000 -0700 +++ linux/include/asm-i386/mman.h 2005-09-30 12:40:01.000000000 -0700 @@ -22,6 +22,7 @@ #define MAP_NORESERVE 0x4000 /* don't check for reservations */ #define MAP_POPULATE 0x8000 /* populate (prefault) pagetables */ #define MAP_NONBLOCK 0x10000 /* do not block on IO */ +#define MAP_IMMOVABLE 0x20000 #define MS_ASYNC 1 /* sync memory asynchronously */ #define MS_INVALIDATE 2 /* invalidate the caches */ --- linux.orig/include/asm-i386/mmzone.h~FROM-MM-memory-hotplug-prep-kill-local_mapnr 2005-09-30 12:37:50.000000000 -0700 +++ linux/include/asm-i386/mmzone.h 2005-09-30 12:38:13.000000000 -0700 @@ -38,10 +38,15 @@ } extern int early_pfn_to_nid(unsigned long pfn); - +extern void __init remap_numa_kva(void); +extern unsigned long calculate_numa_remap_pages(void); +extern void setup_numa_kva_remap(void); #else /* !CONFIG_NUMA */ #define get_memcfg_numa get_memcfg_numa_flat #define get_zholes_size(n) (0) +#define remap_numa_kva() do {} while (0) +#define setup_numa_kva_remap() do {} while (0) +#define calculate_numa_remap_pages() (0) #endif /* CONFIG_NUMA */ #ifdef CONFIG_DISCONTIGMEM @@ -76,11 +81,6 @@ * Following are macros that each numa implmentation must define. */ -/* - * Given a kernel address, find the home node of the underlying memory. 
- */ -#define kvaddr_to_nid(kaddr) pfn_to_nid(__pa(kaddr) >> PAGE_SHIFT) - #define node_start_pfn(nid) (NODE_DATA(nid)->node_start_pfn) #define node_end_pfn(nid) \ ({ \ @@ -88,12 +88,6 @@ __pgdat->node_start_pfn + __pgdat->node_spanned_pages; \ }) -#define local_mapnr(kvaddr) \ -({ \ - unsigned long __pfn = __pa(kvaddr) >> PAGE_SHIFT; \ - (__pfn - node_start_pfn(pfn_to_nid(__pfn))); \ -}) - /* XXX: FIXME -- wli */ #define kern_addr_valid(kaddr) (0) --- linux.orig/include/asm-i386/page.h~F0-create-__boot-i386 2005-09-30 12:38:22.000000000 -0700 +++ linux/include/asm-i386/page.h 2005-09-30 12:38:22.000000000 -0700 @@ -122,8 +122,10 @@ #define PAGE_OFFSET ((unsigned long)__PAGE_OFFSET) #define VMALLOC_RESERVE ((unsigned long)__VMALLOC_RESERVE) #define MAXMEM (-__PAGE_OFFSET-__VMALLOC_RESERVE) -#define __pa(x) ((unsigned long)(x)-PAGE_OFFSET) -#define __va(x) ((void *)((unsigned long)(x)+PAGE_OFFSET)) +#define __boot_pa(x) ((unsigned long)(x)-PAGE_OFFSET) +#define __boot_va(x) ((void *)((unsigned long)(x)+PAGE_OFFSET)) +#define __pa(x) __boot_pa(x) +#define __va(x) __boot_va(x) #define pfn_to_kaddr(pfn) __va((pfn) << PAGE_SHIFT) #ifdef CONFIG_FLATMEM #define pfn_to_page(pfn) (mem_map + (pfn)) --- linux.orig/include/asm-i386/pgtable-3level.h~B2.2-i386-create-numa.c 2005-09-30 12:38:13.000000000 -0700 +++ linux/include/asm-i386/pgtable-3level.h 2005-09-30 12:38:13.000000000 -0700 @@ -65,6 +65,7 @@ set_64bit((unsigned long long *)(pmdptr),pmd_val(pmdval)) #define set_pud(pudptr,pudval) \ (*(pudptr) = (pudval)) +extern void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags); /* * Pentium-II erratum A13: in PAE mode we explicitly have to flush --- linux.orig/include/asm-i386/sparsemem.h~E6-for-debugging-more-FLAGS_RESERVED 2005-09-30 12:38:21.000000000 -0700 +++ linux/include/asm-i386/sparsemem.h 2005-09-30 12:38:21.000000000 -0700 @@ -15,7 +15,7 @@ * MAX_PHYSMEM_BITS 2^N: how much memory we can have in that space */ #ifdef CONFIG_X86_PAE -#define SECTION_SIZE_BITS 30 +#define SECTION_SIZE_BITS 28 #define MAX_PHYSADDR_BITS 36 #define MAX_PHYSMEM_BITS 36 #else --- linux.orig/include/asm-ia64/mman.h~AA-PM-22-vm_immovable 2005-09-30 12:40:01.000000000 -0700 +++ linux/include/asm-ia64/mman.h 2005-09-30 12:40:01.000000000 -0700 @@ -30,6 +30,7 @@ #define MAP_NORESERVE 0x04000 /* don't check for reservations */ #define MAP_POPULATE 0x08000 /* populate (prefault) pagetables */ #define MAP_NONBLOCK 0x10000 /* do not block on IO */ +#define MAP_IMMOVABLE 0x20000 #define MS_ASYNC 1 /* sync memory asynchronously */ #define MS_INVALIDATE 2 /* invalidate the caches */ --- linux.orig/include/asm-m32r/mmzone.h~FROM-MM-memory-hotplug-prep-kill-local_mapnr 2005-09-30 12:37:50.000000000 -0700 +++ linux/include/asm-m32r/mmzone.h 2005-09-30 12:37:50.000000000 -0700 @@ -21,12 +21,6 @@ __pgdat->node_start_pfn + __pgdat->node_spanned_pages - 1; \ }) -#define local_mapnr(kvaddr) \ -({ \ - unsigned long __pfn = __pa(kvaddr) >> PAGE_SHIFT; \ - (__pfn - node_start_pfn(pfn_to_nid(__pfn))); \ -}) - #define pfn_to_page(pfn) \ ({ \ unsigned long __pfn = pfn; \ --- linux.orig/include/asm-mips/mmzone.h~A4.1-antonb-Remove_kvaddr_to_nid_and_local_mapnr 2005-09-30 12:38:06.000000000 -0700 +++ linux/include/asm-mips/mmzone.h 2005-09-30 12:38:06.000000000 -0700 @@ -10,7 +10,6 @@ #ifdef CONFIG_DISCONTIGMEM -#define kvaddr_to_nid(kvaddr) pa_to_nid(__pa(kvaddr)) #define pfn_to_nid(pfn) pa_to_nid((pfn) << PAGE_SHIFT) #define pfn_valid(pfn) \ --- 
linux.orig/include/asm-parisc/mmzone.h~FROM-MM-memory-hotplug-prep-kill-local_mapnr 2005-09-30 12:37:50.000000000 -0700 +++ linux/include/asm-parisc/mmzone.h 2005-09-30 12:38:06.000000000 -0700 @@ -14,11 +14,6 @@ #define NODE_DATA(nid) (&node_data[nid].pg_data) -/* - * Given a kernel address, find the home node of the underlying memory. - */ -#define kvaddr_to_nid(kaddr) pfn_to_nid(__pa(kaddr) >> PAGE_SHIFT) - #define node_start_pfn(nid) (NODE_DATA(nid)->node_start_pfn) #define node_end_pfn(nid) \ ({ \ @@ -27,12 +22,6 @@ }) #define node_localnr(pfn, nid) ((pfn) - node_start_pfn(nid)) -#define local_mapnr(kvaddr) \ -({ \ - unsigned long __pfn = __pa(kvaddr) >> PAGE_SHIFT; \ - (__pfn - node_start_pfn(pfn_to_nid(__pfn))); \ -}) - #define pfn_to_page(pfn) \ ({ \ unsigned long __pfn = (pfn); \ --- linux.orig/include/asm-powerpc/mman.h~AA-PM-22-vm_immovable-ppc64 2005-09-30 12:40:00.000000000 -0700 +++ linux/include/asm-powerpc/mman.h 2005-09-30 12:40:00.000000000 -0700 @@ -38,6 +38,7 @@ #define MAP_POPULATE 0x8000 /* populate (prefault) pagetables */ #define MAP_NONBLOCK 0x10000 /* do not block on IO */ +#define MAP_IMMOVABLE 0x20000 #define MADV_NORMAL 0x0 /* default page-in behavior */ #define MADV_RANDOM 0x1 /* page-in minimum required */ --- linux.orig/include/asm-powerpc/topology.h~A4.2-antonb-ppc64-_use_generic_nr_cpus_node 2005-09-30 12:38:06.000000000 -0700 +++ linux/include/asm-powerpc/topology.h 2005-09-30 12:38:07.000000000 -0700 @@ -9,15 +9,7 @@ static inline int cpu_to_node(int cpu) { - int node; - - node = numa_cpu_lookup_table[cpu]; - -#ifdef DEBUG_NUMA - BUG_ON(node == -1); -#endif - - return node; + return numa_cpu_lookup_table[cpu]; } #define parent_node(node) (node) @@ -37,8 +29,6 @@ #define pcibus_to_node(node) (-1) #define pcibus_to_cpumask(bus) (cpu_online_map) -#define nr_cpus_node(node) (nr_cpus_in_node[node]) - /* sched_domains SD_NODE_INIT for PPC64 machines */ #define SD_NODE_INIT (struct sched_domain) { \ .span = CPU_MASK_NONE, \ --- linux.orig/include/asm-ppc64/abs_addr.h~G1-kravetz-ppc64-fixes-static_inlines 2005-09-30 12:38:27.000000000 -0700 +++ linux/include/asm-ppc64/abs_addr.h 2005-09-30 12:38:27.000000000 -0700 @@ -62,5 +62,14 @@ /* Convenience macros */ #define virt_to_abs(va) phys_to_abs(__pa(va)) #define abs_to_virt(aa) __va(aa) +static inline unsigned long boot_virt_to_abs(unsigned long va) +{ + return phys_to_abs(__boot_pa(va)); +} +static inline void *boot_abs_to_virt(unsigned long aa) +{ + return __boot_va(aa); +} + #endif /* _ABS_ADDR_H */ --- linux.orig/include/asm-ppc64/dma.h~G0-ppc64-__boot-fixes 2005-09-30 12:38:26.000000000 -0700 +++ linux/include/asm-ppc64/dma.h 2005-09-30 12:38:26.000000000 -0700 @@ -26,6 +26,8 @@ /* The maximum address that we can perform a DMA transfer to on this platform */ /* Doesn't really apply... 
*/ #define MAX_DMA_ADDRESS (~0UL) +#define MAX_DMA_PHYSADDR MAX_DMA_ADDRESS +#define MAX_DMA_PHYSADDR MAX_DMA_ADDRESS #if !defined(CONFIG_PPC_ISERIES) || defined(CONFIG_PCI) --- linux.orig/include/asm-ppc64/mmzone.h~FROM-MM-memory-hotplug-prep-kill-local_mapnr 2005-09-30 12:37:50.000000000 -0700 +++ linux/include/asm-ppc64/mmzone.h 2005-09-30 12:38:07.000000000 -0700 @@ -8,15 +8,14 @@ #define _ASM_MMZONE_H_ #include -#include -/* generic non-linear memory support: +/* + * generic non-linear memory support: * * 1) we will not split memory into more chunks than will fit into the * flags field of the struct page */ - #ifdef CONFIG_NEED_MULTIPLE_NODES extern struct pglist_data *node_data[]; @@ -30,35 +29,7 @@ */ extern int numa_cpu_lookup_table[]; -extern char *numa_memory_lookup_table; extern cpumask_t numa_cpumask_lookup_table[]; -extern int nr_cpus_in_node[]; - -/* 16MB regions */ -#define MEMORY_INCREMENT_SHIFT 24 -#define MEMORY_INCREMENT (1UL << MEMORY_INCREMENT_SHIFT) - -/* NUMA debugging, will not work on a DLPAR machine */ -#undef DEBUG_NUMA - -static inline int pa_to_nid(unsigned long pa) -{ - int nid; - - nid = numa_memory_lookup_table[pa >> MEMORY_INCREMENT_SHIFT]; - -#ifdef DEBUG_NUMA - /* the physical address passed in is not in the map for the system */ - if (nid == -1) { - printk("bad address: %lx\n", pa); - BUG(); - } -#endif - - return nid; -} - -#define node_localnr(pfn, nid) ((pfn) - NODE_DATA(nid)->node_start_pfn) /* * Following are macros that each numa implmentation must define. @@ -67,42 +38,10 @@ #define node_start_pfn(nid) (NODE_DATA(nid)->node_start_pfn) #define node_end_pfn(nid) (NODE_DATA(nid)->node_end_pfn) -#define local_mapnr(kvaddr) \ - ( (__pa(kvaddr) >> PAGE_SHIFT) - node_start_pfn(kvaddr_to_nid(kvaddr)) - -#ifdef CONFIG_DISCONTIGMEM - -/* - * Given a kernel address, find the home node of the underlying memory. 
- */ -#define kvaddr_to_nid(kaddr) pa_to_nid(__pa(kaddr)) - -#define pfn_to_nid(pfn) pa_to_nid((unsigned long)(pfn) << PAGE_SHIFT) - -/* Written this way to avoid evaluating arguments twice */ -#define discontigmem_pfn_to_page(pfn) \ -({ \ - unsigned long __tmp = pfn; \ - (NODE_DATA(pfn_to_nid(__tmp))->node_mem_map + \ - node_localnr(__tmp, pfn_to_nid(__tmp))); \ -}) - -#define discontigmem_page_to_pfn(p) \ -({ \ - struct page *__tmp = p; \ - (((__tmp) - page_zone(__tmp)->zone_mem_map) + \ - page_zone(__tmp)->zone_start_pfn); \ -}) - -/* XXX fix for discontiguous physical memory */ -#define discontigmem_pfn_valid(pfn) ((pfn) < num_physpages) - -#endif /* CONFIG_DISCONTIGMEM */ - #endif /* CONFIG_NEED_MULTIPLE_NODES */ #ifdef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID -#define early_pfn_to_nid(pfn) pa_to_nid(((unsigned long)pfn) << PAGE_SHIFT) +extern int __init early_pfn_to_nid(unsigned long pfn); #endif #endif /* _ASM_MMZONE_H_ */ --- linux.orig/include/asm-ppc64/page.h~A4.3-antonb-ppc64-convert_to_sparsemem 2005-09-30 12:38:07.000000000 -0700 +++ linux/include/asm-ppc64/page.h 2005-09-30 12:38:22.000000000 -0700 @@ -172,7 +172,10 @@ #endif -#define __pa(x) ((unsigned long)(x)-PAGE_OFFSET) +#define __boot_pa(x) ((unsigned long)(x)-PAGE_OFFSET) +#define __boot_va(x) ((void *)((unsigned long)(x) + KERNELBASE)) +#define __pa(x) __boot_pa(x) +#define __va(x) __boot_va(x) extern int page_is_ram(unsigned long pfn); @@ -206,13 +209,6 @@ #define USER_REGION_ID (0UL) #define REGION_ID(ea) (((unsigned long)(ea)) >> REGION_SHIFT) -#define __va(x) ((void *)((unsigned long)(x) + KERNELBASE)) - -#ifdef CONFIG_DISCONTIGMEM -#define page_to_pfn(page) discontigmem_page_to_pfn(page) -#define pfn_to_page(pfn) discontigmem_pfn_to_page(pfn) -#define pfn_valid(pfn) discontigmem_pfn_valid(pfn) -#endif #ifdef CONFIG_FLATMEM #define pfn_to_page(pfn) (mem_map + (pfn)) #define page_to_pfn(page) ((unsigned long)((page) - mem_map)) --- linux.orig/include/asm-x86_64/mman.h~AA-PM-99-x86_64-IMMOVABLE 2005-09-30 12:40:06.000000000 -0700 +++ linux/include/asm-x86_64/mman.h 2005-09-30 12:40:06.000000000 -0700 @@ -23,6 +23,7 @@ #define MAP_NORESERVE 0x4000 /* don't check for reservations */ #define MAP_POPULATE 0x8000 /* populate (prefault) pagetables */ #define MAP_NONBLOCK 0x10000 /* do not block on IO */ +#define MAP_IMMOVABLE 0x20000 #define MS_ASYNC 1 /* sync memory asynchronously */ #define MS_INVALIDATE 2 /* invalidate the caches */ --- linux.orig/include/asm-x86_64/mmzone.h~A4.1-antonb-Remove_kvaddr_to_nid_and_local_mapnr 2005-09-30 12:38:06.000000000 -0700 +++ linux/include/asm-x86_64/mmzone.h 2005-09-30 12:38:06.000000000 -0700 @@ -39,7 +39,6 @@ #ifdef CONFIG_DISCONTIGMEM #define pfn_to_nid(pfn) phys_to_nid((unsigned long)(pfn) << PAGE_SHIFT) -#define kvaddr_to_nid(kaddr) phys_to_nid(__pa(kaddr)) /* AK: this currently doesn't deal with invalid addresses. We'll see if the 2.5 kernel doesn't pass them @@ -57,7 +56,5 @@ nid__ != 0xff && (pfn) >= node_start_pfn(nid__) && (pfn) < node_end_pfn(nid__); })) #endif -#define local_mapnr(kvaddr) \ - ( (__pa(kvaddr) >> PAGE_SHIFT) - node_start_pfn(kvaddr_to_nid(kvaddr)) ) #endif #endif --- linux.orig/include/linux/bootmem.h~F4-use-__boot-generic 2005-09-30 12:38:24.000000000 -0700 +++ linux/include/linux/bootmem.h 2005-09-30 12:38:25.000000000 -0700 @@ -10,6 +10,11 @@ #include #include +#ifndef CONFIG_ARCH_HAS_BOOTPA +#define __boot_pa(pa) __pa(pa) +#define __boot_va(pa) __va(va) +#endif + /* * simple boot-time physical memory area allocator. 
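The CONFIG_ARCH_HAS_BOOTPA fallback above keeps __boot_pa()/__boot_va() identical to __pa()/__va() on architectures whose boot-time linear mapping matches the runtime one. An architecture where the two translations genuinely differ could instead select ARCH_HAS_BOOTPA and supply its own pair in asm/page.h; purely as an illustration (BOOT_OFFSET is invented here, not an existing symbol):

/* Hypothetical: early boot uses a linear map at BOOT_OFFSET, while the
 * normal kernel map sits at PAGE_OFFSET; only bootmem-era callers use
 * the __boot_*() flavour. */
#define __boot_pa(x)	((unsigned long)(x) - BOOT_OFFSET)
#define __boot_va(x)	((void *)((unsigned long)(x) + BOOT_OFFSET))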
*/ @@ -40,6 +45,10 @@ * up searching */ } bootmem_data_t; +#ifndef MAX_DMA_PHYSADDR +#define MAX_DMA_PHYSADDR (__boot_pa(MAX_DMA_ADDRESS)) +#endif + extern unsigned long __init bootmem_bootmap_pages (unsigned long); extern unsigned long __init init_bootmem (unsigned long addr, unsigned long memend); extern void __init free_bootmem (unsigned long addr, unsigned long size); @@ -47,11 +56,11 @@ #ifndef CONFIG_HAVE_ARCH_BOOTMEM_NODE extern void __init reserve_bootmem (unsigned long addr, unsigned long size); #define alloc_bootmem(x) \ - __alloc_bootmem((x), SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS)) + __alloc_bootmem((x), SMP_CACHE_BYTES, MAX_DMA_PHYSADDR) #define alloc_bootmem_low(x) \ __alloc_bootmem((x), SMP_CACHE_BYTES, 0) #define alloc_bootmem_pages(x) \ - __alloc_bootmem((x), PAGE_SIZE, __pa(MAX_DMA_ADDRESS)) + __alloc_bootmem((x), PAGE_SIZE, MAX_DMA_PHYSADDR) #define alloc_bootmem_low_pages(x) \ __alloc_bootmem((x), PAGE_SIZE, 0) #endif /* !CONFIG_HAVE_ARCH_BOOTMEM_NODE */ @@ -64,9 +73,9 @@ extern void * __init __alloc_bootmem_node (pg_data_t *pgdat, unsigned long size, unsigned long align, unsigned long goal); #ifndef CONFIG_HAVE_ARCH_BOOTMEM_NODE #define alloc_bootmem_node(pgdat, x) \ - __alloc_bootmem_node((pgdat), (x), SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS)) + __alloc_bootmem_node((pgdat), (x), SMP_CACHE_BYTES, MAX_DMA_PHYSADDR) #define alloc_bootmem_pages_node(pgdat, x) \ - __alloc_bootmem_node((pgdat), (x), PAGE_SIZE, __pa(MAX_DMA_ADDRESS)) + __alloc_bootmem_node((pgdat), (x), PAGE_SIZE, MAX_DMA_PHYSADDR) #define alloc_bootmem_low_pages_node(pgdat, x) \ __alloc_bootmem_node((pgdat), (x), PAGE_SIZE, 0) #endif /* !CONFIG_HAVE_ARCH_BOOTMEM_NODE */ --- linux.orig/include/linux/buffer_head.h~AA-PM-20.0-nowriteback 2005-09-30 12:39:59.000000000 -0700 +++ linux/include/linux/buffer_head.h 2005-09-30 12:39:59.000000000 -0700 @@ -208,7 +208,8 @@ int nobh_truncate_page(struct address_space *, loff_t); int nobh_writepage(struct page *page, get_block_t *get_block, struct writeback_control *wbc); - +void generic_move_buffer(struct page *, struct page *); +void unlock_page_buffer(struct page *); /* * inline definitions --- linux.orig/include/linux/fs.h~AA-PM-13.1-migrate_page-operation 2005-09-30 12:39:54.000000000 -0700 +++ linux/include/linux/fs.h 2005-09-30 12:39:54.000000000 -0700 @@ -325,6 +325,7 @@ loff_t offset, unsigned long nr_segs); struct page* (*get_xip_page)(struct address_space *, sector_t, int); + int (*migrate_page)(struct page *, struct page *); }; struct backing_dev_info; --- /dev/null 2005-03-30 22:36:15.000000000 -0800 +++ linux/include/linux/memory.h 2005-09-30 12:38:19.000000000 -0700 @@ -0,0 +1,97 @@ +/* + * include/linux/memory.h - generic memory definition + * + * This is mainly for topological representation. We define the + * basic "struct memory_block" here, which can be embedded in per-arch + * definitions or NUMA information. + * + * Basic handling of the devices is done in drivers/base/memory.c + * and system devices are handled in drivers/base/sys.c. + * + * Memory block are exported via sysfs in the class/memory/devices/ + * directory. + * + */ +#ifndef _LINUX_MEMORY_H_ +#define _LINUX_MEMORY_H_ + +#include +#include +#include + +#include + +struct memory_block { + unsigned long phys_index; + unsigned long state; + /* + * This serializes all state change requests. It isn't + * held during creation because the control files are + * created long after the critical areas during + * initialization. 
+ */ + struct semaphore state_sem; + int phys_device; /* num of attached phys_device */ + void *hw; /* optional pointer to fw/hw data */ + int (*phys_callback)(struct memory_block *); + struct sys_device sysdev; +}; + +/* These states are exposed to userspace as text strings in sysfs */ +#define MEM_ONLINE (1<<0) /* exposed to userspace */ +#define MEM_GOING_OFFLINE (1<<1) /* exposed to userspace */ +#define MEM_OFFLINE (1<<2) /* exposed to userspace */ + +/* + * All of these states are currently kernel-internal for notifying + * kernel components and architectures. + * + * For MEM_MAPPING_INVALID, all notifier chains with priority >0 + * are called before pfn_to_page() becomes invalid. The priority=0 + * entry is reserved for the function that actually makes + * pfn_to_page() stop working. Any notifiers that want to be called + * after that should have priority <0. + */ +#define MEM_MAPPING_INVALID (1<<3) + +#ifndef CONFIG_MEMORY_HOTPLUG +static inline int memory_dev_init(void) +{ + return 0; +} +static inline int register_memory_notifier(struct notifier_block *nb) +{ + return 0; +} +static inline void unregister_memory_notifier(struct notifier_block *nb) +{ +} +#else +extern int register_memory(struct memory_block *, struct mem_section *section, struct node *); +extern int register_new_memory(struct mem_section *); +extern int unregister_memory_section(struct mem_section *); +extern int memory_dev_init(void); +extern int register_memory_notifier(struct notifier_block *nb); +extern void unregister_memory_notifier(struct notifier_block *nb); +/* creating symbolic link between memory device and memory section */ +extern int attach_device_to_memsection(u64 start_addr, u64 end_addr, struct kobject *kobj); + +#define CONFIG_MEM_BLOCK_SIZE (PAGES_PER_SECTION< +#include +#include +#include + +#ifdef CONFIG_MEMORY_HOTPLUG +/* + * pgdat resizing functions + */ +static inline +void pgdat_resize_lock(struct pglist_data *pgdat, unsigned long *flags) +{ + spin_lock_irqsave(&pgdat->node_size_lock, *flags); +} +static inline +void pgdat_resize_unlock(struct pglist_data *pgdat, unsigned long *flags) +{ + spin_unlock_irqrestore(&pgdat->node_size_lock, *flags); +} +static inline +void pgdat_resize_init(struct pglist_data *pgdat) +{ + spin_lock_init(&pgdat->node_size_lock); +} +/* + * Zone resizing functions + */ +static inline unsigned zone_span_seqbegin(struct zone *zone) +{ + return read_seqbegin(&zone->span_seqlock); +} +static inline int zone_span_seqretry(struct zone *zone, unsigned iv) +{ + return read_seqretry(&zone->span_seqlock, iv); +} +static inline void zone_span_writelock(struct zone *zone) +{ + write_seqlock(&zone->span_seqlock); +} +static inline void zone_span_writeunlock(struct zone *zone) +{ + write_sequnlock(&zone->span_seqlock); +} +static inline void zone_seqlock_init(struct zone *zone) +{ + seqlock_init(&zone->span_seqlock); +} +extern int zone_grow_free_lists(struct zone *zone, unsigned long new_nr_pages); +extern int zone_grow_waitqueues(struct zone *zone, unsigned long nr_pages); +extern int add_one_highpage(struct page *page, int pfn, int bad_ppro); +/* need some defines for these for archs that don't support it */ +extern void online_page(struct page *page); +/* VM interface that may be used by firmware interface */ +extern int add_memory(u64 start, u64 size); +extern int remove_memory(u64 start, u64 size); +extern int online_pages(unsigned long, unsigned long); + +/* reasonably generic interface to expand the physical pages in a zone */ +extern int __add_pages(struct zone *zone, 
unsigned long start_pfn, + unsigned long nr_pages); +#else /* ! CONFIG_MEMORY_HOTPLUG */ +/* + * Stub functions for when hotplug is off + */ +static inline void pgdat_resize_lock(struct pglist_data *p, unsigned long *f) {} +static inline void pgdat_resize_unlock(struct pglist_data *p, unsigned long *f) {} +static inline void pgdat_resize_init(struct pglist_data *pgdat) {} + +static inline unsigned zone_span_seqbegin(struct zone *zone) +{ + return 0; +} +static inline int zone_span_seqretry(struct zone *zone, unsigned iv) +{ + return 0; +} +static inline void zone_span_writelock(struct zone *zone) {} +static inline void zone_span_writeunlock(struct zone *zone) {} +static inline void zone_seqlock_init(struct zone *zone) {} + +static inline int mhp_notimplemented(const char *func) +{ + printk(KERN_WARNING "%s() called, with CONFIG_MEMORY_HOTPLUG disabled\n", func); + dump_stack(); + return -ENOSYS; +} + +static inline int __add_pages(struct zone *zone, unsigned long start_pfn, + unsigned long nr_pages) +{ + return mhp_notimplemented(__FUNCTION__); +} +#endif /* ! CONFIG_MEMORY_HOTPLUG */ +static inline int __remove_pages(struct zone *zone, unsigned long start_pfn, + unsigned long nr_pages) +{ + printk(KERN_WARNING "%s() called, not yet supported\n", __FUNCTION__); + dump_stack(); + return -ENOSYS; +} +#endif /* __LINUX_MEMORY_HOTPLUG_H */ --- linux.orig/include/linux/mm.h~FROM-MM-memory-hotplug-sysfs-and-add-remove-functions 2005-09-30 12:37:55.000000000 -0700 +++ linux/include/linux/mm.h 2005-09-30 12:40:01.000000000 -0700 @@ -162,6 +162,7 @@ #define VM_HUGETLB 0x00400000 /* Huge TLB Page VM */ #define VM_NONLINEAR 0x00800000 /* Is non-linear (remap_file_pages) */ #define VM_MAPPED_COPY 0x01000000 /* T if mapped copy of data (nommu mmap) */ +#define VM_IMMOVABLE 0x02000000 /* Don't place in hot removable area */ #ifndef VM_STACK_DEFAULT_FLAGS /* arch can override this */ #define VM_STACK_DEFAULT_FLAGS VM_DATA_DEFAULT_FLAGS @@ -178,6 +179,11 @@ #define VM_NormalReadHint(v) (!((v)->vm_flags & VM_READHINTMASK)) #define VM_SequentialReadHint(v) ((v)->vm_flags & VM_SEQ_READ) #define VM_RandomReadHint(v) ((v)->vm_flags & VM_RAND_READ) +#ifdef CONFIG_MEMORY_HOTPLUG +#define VM_Immovable(v) ((v)->vm_flags & VM_IMMOVABLE) +#else +#define VM_Immovable(v) (0) +#endif /* * mapping from the currently active vm_flags protection bits (the @@ -791,6 +797,9 @@ unsigned long * zones_size, unsigned long zone_start_pfn, unsigned long *zholes_size); extern void memmap_init_zone(unsigned long, int, unsigned long, unsigned long); +extern void zonetable_add(struct zone *, int, int, + unsigned long, unsigned long); +extern void setup_per_zone_pages_min(void); extern void mem_init(void); extern void show_mem(void); extern void si_meminfo(struct sysinfo * val); --- linux.orig/include/linux/mm_inline.h~AA-PM-01-steal_page_from_lru 2005-09-30 12:38:29.000000000 -0700 +++ linux/include/linux/mm_inline.h 2005-09-30 12:38:29.000000000 -0700 @@ -38,3 +38,71 @@ zone->nr_inactive--; } } + +static inline int +isolate_lru_onepage(struct page *page, struct list_head *src, + struct list_head *dst) +{ + if (!TestClearPageLRU(page)) + BUG(); + list_del(&page->lru); + if (get_page_testone(page)) { + /* + * It is being freed elsewhere + */ + __put_page(page); + SetPageLRU(page); + list_add(&page->lru, src); + return 0; + } + list_add(&page->lru, dst); + return 1; +} + + +static inline int +__steal_page_from_lru(struct zone *zone, struct page *page, + struct list_head *dst) +{ + if (PageActive(page)) { + if (!isolate_lru_onepage(page, 
&zone->active_list, dst)) + return 0; + zone->nr_active--; + } else { + if (!isolate_lru_onepage(page, &zone->inactive_list, dst)) + return 0; + zone->nr_inactive--; + } + return 1; +} + +static inline int +steal_page_from_lru(struct zone *zone, struct page *page, + struct list_head *dst) +{ + int ret; + spin_lock_irq(&zone->lru_lock); + ret = __steal_page_from_lru(zone, page, dst); + spin_unlock_irq(&zone->lru_lock); + return ret; +} + +static inline void +__putback_page_to_lru(struct zone *zone, struct page *page) +{ + if (TestSetPageLRU(page)) + BUG(); + if (PageActive(page)) + add_page_to_active_list(zone, page); + else + add_page_to_inactive_list(zone, page); +} + +static inline void +putback_page_to_lru(struct zone *zone, struct page *page) +{ + spin_lock_irq(&zone->lru_lock); + __putback_page_to_lru(zone, page); + spin_unlock_irq(&zone->lru_lock); +} + --- linux.orig/include/linux/mman.h~AA-PM-22-vm_immovable 2005-09-30 12:40:01.000000000 -0700 +++ linux/include/linux/mman.h 2005-09-30 12:40:01.000000000 -0700 @@ -61,7 +61,8 @@ return _calc_vm_trans(flags, MAP_GROWSDOWN, VM_GROWSDOWN ) | _calc_vm_trans(flags, MAP_DENYWRITE, VM_DENYWRITE ) | _calc_vm_trans(flags, MAP_EXECUTABLE, VM_EXECUTABLE) | - _calc_vm_trans(flags, MAP_LOCKED, VM_LOCKED ); + _calc_vm_trans(flags, MAP_LOCKED, VM_LOCKED ) | + _calc_vm_trans(flags, MAP_IMMOVABLE, VM_IMMOVABLE ); } #endif /* _LINUX_MMAN_H */ --- /dev/null 2005-03-30 22:36:15.000000000 -0800 +++ linux/include/linux/mmigrate.h 2005-09-30 12:40:05.000000000 -0700 @@ -0,0 +1,39 @@ +#ifndef _LINUX_MEMHOTPLUG_H +#define _LINUX_MEMHOTPLUG_H + +#include +#include + +#define MIGRATE_NODE_ANY -1 + +#ifdef CONFIG_MEMORY_MIGRATE +extern int generic_migrate_page(struct page *, struct page *, + int (*)(struct page *, struct page *, struct list_head *)); +extern int migrate_page_common(struct page *, struct page *, + struct list_head *); +extern int migrate_page_buffer(struct page *, struct page *, + struct list_head *); +extern int page_migratable(struct page *, struct page *, int, + struct list_head *); +extern struct page * migrate_onepage(struct page *, int nodeid); +extern int try_to_migrate_pages(struct list_head *); + +#else +static inline int generic_migrate_page(struct page *page, struct page *newpage, + int (*fn)(struct page *, struct page *)) +{ + return -ENOSYS; +} +static inline int migrate_page_buffer(struct page* page, struct page* newpage) +{ + return -ENOSYS; +} +#endif + +#ifdef ARCH_HAS_PAGEMIGRATION +extern void arch_migrate_page(struct page *, struct page *); +#else +static inline void arch_migrate_page(struct page *page, struct page *newpage) {} +#endif + +#endif /* _LINUX_MEMHOTPLUG_H */ --- linux.orig/include/linux/mmzone.h~FROM-MM-memory-hotplug-prep-__section_nr-helper 2005-09-30 12:37:52.000000000 -0700 +++ linux/include/linux/mmzone.h 2005-09-30 12:38:21.000000000 -0700 @@ -11,8 +11,11 @@ #include #include #include +#include #include +#include #include +#include /* Free memory management - zoned buddy allocator. */ #ifndef CONFIG_FORCE_MAX_ZONEORDER @@ -137,6 +140,10 @@ * free areas of different sizes */ spinlock_t lock; +#ifdef CONFIG_MEMORY_HOTPLUG + /* see spanned/present_pages for more description */ + seqlock_t span_seqlock; +#endif struct free_area free_area[MAX_ORDER]; @@ -220,6 +227,16 @@ /* zone_start_pfn == zone_start_paddr >> PAGE_SHIFT */ unsigned long zone_start_pfn; + /* + * zone_start_pfn, spanned_pages and present_pages are all + * protected by span_seqlock. 
It is a seqlock because it has + * to be read outside of zone->lock, and it is done in the main + * allocator path. But, it is written quite infrequently. + * + * The lock is declared along with zone->lock because it is + * frequently read in proximity to zone->lock. It's good to + * give them a chance of being in the same cacheline. + */ unsigned long spanned_pages; /* total size, including holes */ unsigned long present_pages; /* amount of memory (excluding holes) */ @@ -227,6 +244,7 @@ * rarely used fields: */ char *name; + struct semaphore init_sem; } ____cacheline_maxaligned_in_smp; @@ -273,12 +291,21 @@ struct page *node_mem_map; #endif struct bootmem_data *bdata; +#ifdef CONFIG_MEMORY_HOTPLUG + /* + * Must be held any time you expect node_start_pfn, node_present_pages + * or node_spanned_pages stay constant. Holding this will also + * guarantee that any pfn_valid() stays that way. + * + * Nests above zone->lock and zone->size_seqlock. + */ + spinlock_t node_size_lock; +#endif unsigned long node_start_pfn; unsigned long node_present_pages; /* total number of physical pages */ unsigned long node_spanned_pages; /* total size of physical page range, including holes */ int node_id; - struct pglist_data *pgdat_next; wait_queue_head_t kswapd_wait; struct task_struct *kswapd; int kswapd_max_order; @@ -293,7 +320,7 @@ #endif #define nid_page_nr(nid, pagenr) pgdat_page_nr(NODE_DATA(nid),(pagenr)) -extern struct pglist_data *pgdat_list; +#include void __get_zone_counts(unsigned long *active, unsigned long *inactive, unsigned long *free, struct pglist_data *pgdat); @@ -314,62 +341,6 @@ unsigned long __init node_memmap_size_bytes(int, unsigned long, unsigned long); #endif -/* - * zone_idx() returns 0 for the ZONE_DMA zone, 1 for the ZONE_NORMAL zone, etc. - */ -#define zone_idx(zone) ((zone) - (zone)->zone_pgdat->node_zones) - -/** - * for_each_pgdat - helper macro to iterate over all nodes - * @pgdat - pointer to a pg_data_t variable - * - * Meant to help with common loops of the form - * pgdat = pgdat_list; - * while(pgdat) { - * ... - * pgdat = pgdat->pgdat_next; - * } - */ -#define for_each_pgdat(pgdat) \ - for (pgdat = pgdat_list; pgdat; pgdat = pgdat->pgdat_next) - -/* - * next_zone - helper magic for for_each_zone() - * Thanks to William Lee Irwin III for this piece of ingenuity. - */ -static inline struct zone *next_zone(struct zone *zone) -{ - pg_data_t *pgdat = zone->zone_pgdat; - - if (zone < pgdat->node_zones + MAX_NR_ZONES - 1) - zone++; - else if (pgdat->pgdat_next) { - pgdat = pgdat->pgdat_next; - zone = pgdat->node_zones; - } else - zone = NULL; - - return zone; -} - -/** - * for_each_zone - helper macro to iterate over all memory zones - * @zone - pointer to struct zone variable - * - * The user only needs to declare the zone variable, for_each_zone - * fills it in. This basically means for_each_zone() is an - * easier to read version of this piece of code: - * - * for (pgdat = pgdat_list; pgdat; pgdat = pgdat->node_next) - * for (i = 0; i < MAX_NR_ZONES; ++i) { - * struct zone * z = pgdat->node_zones + i; - * ... - * } - * } - */ -#define for_each_zone(zone) \ - for (zone = pgdat_list->node_zones; zone; zone = next_zone(zone)) - static inline int is_highmem_idx(int idx) { return (idx == ZONE_HIGHMEM); @@ -422,6 +393,73 @@ #endif /* !CONFIG_NEED_MULTIPLE_NODES */ +/* + * zone_idx() returns 0 for the ZONE_DMA zone, 1 for the ZONE_NORMAL zone, etc. 
+ */ +#define zone_idx(zone) ((zone) - (zone)->zone_pgdat->node_zones) + +#define first_online_pgdat() NODE_DATA(first_online_node()) +#define next_online_pgdat(pgdat) \ + ((next_online_node((pgdat)->node_id) != MAX_NUMNODES) ? \ + NODE_DATA(next_online_node((pgdat)->node_id)) : NULL) + +/** + * for_each_pgdat - helper macro to iterate over all online nodes + * @pgdat - pointer to a pg_data_t variable + * + * Meant to help with common loops of the form + * pgdat = NODE_DATA(first_online_node()) + * while(pgdat) { + * ... + * pgdat = (next node is online) ? NODE_DATA(next_node) : NULL ; + * } + */ +#define for_each_pgdat(pgdat) \ + for (pgdat = first_online_pgdat(); pgdat; \ + pgdat = next_online_pgdat(pgdat)) + +/* + * next_zone - helper magic for for_each_zone() + * Thanks to William Lee Irwin III for this piece of ingenuity. + */ +static inline struct zone *next_zone(struct zone *zone) +{ + pg_data_t *pgdat = zone->zone_pgdat; + + if (zone < pgdat->node_zones + MAX_NR_ZONES - 1) + zone++; + else { + pgdat = next_online_pgdat(pgdat); + + if (pgdat) + zone = pgdat->node_zones; + else + zone = NULL; + } + + return zone; +} + +/** + * for_each_zone - helper macro to iterate over all memory zones + * @zone - pointer to struct zone variable + * + * The user only needs to declare the zone variable, for_each_zone + * fills it in. This basically means for_each_zone() is an + * easier to read version of this piece of code: + * + * for (pgdat = first_online_node(); pgdat; pgdat = next_online_node(pgdat)) + * for (i = 0; i < MAX_NR_ZONES; ++i) { + * struct zone * z = pgdat->node_zones + i; + * ... + * } + * } + */ +#define for_each_zone(zone) \ + for (zone = first_online_pgdat()->node_zones; \ + zone; zone = next_zone(zone)) + + #ifdef CONFIG_SPARSEMEM #include #endif @@ -431,7 +469,7 @@ * with 32 bit page->flags field, we reserve 8 bits for node/zone info. * there are 3 zones (2 bits) and this leaves 8-2=6 bits for nodes. */ -#define FLAGS_RESERVED 8 +#define FLAGS_RESERVED 10 #elif BITS_PER_LONG == 64 /* @@ -509,6 +547,7 @@ return NULL; return &mem_section[SECTION_NR_TO_ROOT(nr)][nr & SECTION_ROOT_MASK]; } +extern int __section_nr(struct mem_section* ms); /* * We use the lower bits of the mem_map pointer to store @@ -542,11 +581,6 @@ return valid_section(__nr_to_section(nr)); } -/* - * Given a kernel address, find the home node of the underlying memory. 
- */ -#define kvaddr_to_nid(kaddr) pfn_to_nid(__pa(kaddr) >> PAGE_SHIFT) - static inline struct mem_section *__pfn_to_section(unsigned long pfn) { return __nr_to_section(pfn_to_section_nr(pfn)); @@ -586,6 +620,7 @@ #define early_pfn_valid(pfn) pfn_valid(pfn) void sparse_init(void); +extern int sparse_add_one_section(struct zone *, unsigned long, int); #else #define sparse_init() do {} while (0) #define sparse_index_init(_sec, _nid) do {} while (0) --- linux.orig/include/linux/nodemask.h~B3.0-remove-pgdat_list-ver2-base 2005-09-30 12:38:13.000000000 -0700 +++ linux/include/linux/nodemask.h 2005-09-30 12:38:13.000000000 -0700 @@ -232,6 +232,9 @@ return min_t(int,MAX_NUMNODES,find_next_bit(srcp->bits, MAX_NUMNODES, n+1)); } +#define first_online_node() first_node(node_online_map) +#define next_online_node(node) next_node((node), node_online_map) + #define nodemask_of_node(node) \ ({ \ typeof(_unused_nodemask_arg_) m; \ --- linux.orig/include/linux/page-flags.h~AA-PM-04-config-noswap 2005-09-30 12:39:46.000000000 -0700 +++ linux/include/linux/page-flags.h 2005-09-30 12:39:46.000000000 -0700 @@ -301,6 +301,8 @@ #define ClearPageSwapCache(page) clear_bit(PG_swapcache, &(page)->flags) #else #define PageSwapCache(page) 0 +#define SetPageSwapCache(page) +#define ClearPageSwapCache(page) #endif #define PageUncached(page) test_bit(PG_uncached, &(page)->flags) --- linux.orig/include/linux/radix-tree.h~AA-PM-03-radix-tree-replace 2005-09-30 12:39:46.000000000 -0700 +++ linux/include/linux/radix-tree.h 2005-09-30 12:39:46.000000000 -0700 @@ -47,6 +47,7 @@ int radix_tree_insert(struct radix_tree_root *, unsigned long, void *); void *radix_tree_lookup(struct radix_tree_root *, unsigned long); void *radix_tree_delete(struct radix_tree_root *, unsigned long); +void *radix_tree_replace(struct radix_tree_root *, unsigned long, void *); unsigned int radix_tree_gang_lookup(struct radix_tree_root *root, void **results, unsigned long first_index, unsigned int max_items); --- linux.orig/include/linux/rmap.h~AA-PM-14-try_to_unmap_force 2005-09-30 12:39:56.000000000 -0700 +++ linux/include/linux/rmap.h 2005-09-30 12:39:56.000000000 -0700 @@ -90,7 +90,9 @@ * Called from mm/vmscan.c to handle paging out */ int page_referenced(struct page *, int is_locked, int ignore_token); -int try_to_unmap(struct page *); +int try_to_unmap(struct page *, struct list_head *); +int touch_unmapped_address(struct list_head *); + /* * Called from mm/filemap_xip.c to unmap empty zero page @@ -110,7 +112,7 @@ #define anon_vma_link(vma) do {} while (0) #define page_referenced(page,l,i) TestClearPageReferenced(page) -#define try_to_unmap(page) SWAP_FAIL +#define try_to_unmap(page, force) SWAP_FAIL #endif /* CONFIG_MMU */ --- linux.orig/include/linux/swap.h~AA-PM-02-export-pageout 2005-09-30 12:39:45.000000000 -0700 +++ linux/include/linux/swap.h 2005-09-30 12:39:58.000000000 -0700 @@ -174,6 +174,50 @@ extern int try_to_free_pages(struct zone **, unsigned int); extern int zone_reclaim(struct zone *, unsigned int, unsigned int); extern int shrink_all_memory(int); +typedef enum { + /* failed to write page out, page is locked */ + PAGE_KEEP, + /* move page to the active list, page is locked */ + PAGE_ACTIVATE, + /* page has been sent to the disk successfully, page is unlocked */ + PAGE_SUCCESS, + /* page is clean and locked */ + PAGE_CLEAN, +} pageout_t; +extern pageout_t pageout(struct page *, struct address_space *); +struct scan_control { + /* Ask refill_inactive_zone, or shrink_cache to scan this many pages */ + unsigned long nr_to_scan; + + 
/* Incremented by the number of inactive pages that were scanned */ + unsigned long nr_scanned; + + /* Incremented by the number of pages reclaimed */ + unsigned long nr_reclaimed; + + unsigned long nr_mapped; /* From page_state */ + + /* How many pages shrink_cache() should reclaim */ + int nr_to_reclaim; + + /* Ask shrink_caches, or shrink_zone to scan at this priority */ + unsigned int priority; + + /* This context's GFP mask */ + unsigned int gfp_mask; + + int may_writepage; + + /* This context's SWAP_CLUSTER_MAX. If freeing memory for + * suspend, we effectively ignore SWAP_CLUSTER_MAX. + * In this context, it doesn't matter that we scan the + * whole list at once. */ + int swap_cluster_max; + + /* Can pages be swapped as part of reclaim? */ + int may_swap; +}; +extern int shrink_list(struct list_head *, struct scan_control *); extern int vm_swappiness; #ifdef CONFIG_MMU @@ -193,7 +237,7 @@ extern struct address_space swapper_space; #define total_swapcache_pages swapper_space.nrpages extern void show_swap_cache_info(void); -extern int add_to_swap(struct page *); +extern int add_to_swap(struct page *, unsigned int); extern void __delete_from_swap_cache(struct page *); extern void delete_from_swap_cache(struct page *); extern int move_to_swap_cache(struct page *, swp_entry_t); @@ -217,7 +261,11 @@ extern sector_t map_swap_page(struct swap_info_struct *, pgoff_t); extern struct swap_info_struct *get_swap_info_struct(unsigned); extern int can_share_swap_page(struct page *); -extern int remove_exclusive_swap_page(struct page *); +extern int __remove_exclusive_swap_page(struct page *, int); +static inline int remove_exclusive_swap_page(struct page *p) +{ + return __remove_exclusive_swap_page(p, 0); +} struct backing_dev_info; extern spinlock_t swap_lock; @@ -267,11 +315,16 @@ #define delete_from_swap_cache(p) /*NOTHING*/ #define swap_token_default_timeout 0 -static inline int remove_exclusive_swap_page(struct page *p) +static inline int __remove_exclusive_swap_page(struct page *p, int force) { return 0; } +static inline int remove_exclusive_swap_page(struct page *p) +{ + return __remove_exclusive_swap_page(p, 0); +} + static inline swp_entry_t get_swap_page(void) { swp_entry_t entry; --- linux.orig/init/Kconfig~AA-PM-07.2-memory_migration-depends-swap 2005-09-30 12:39:48.000000000 -0700 +++ linux/init/Kconfig 2005-09-30 12:39:48.000000000 -0700 @@ -103,6 +103,9 @@ used to provide more virtual memory than the actual RAM present in your computer. If unsure say Y. 
+comment " Swap automatically enabled by selecting Memory Migration" + depends on MEMORY_MIGRATE + config SYSVIPC bool "System V IPC" depends on MMU --- linux.orig/kernel/fork.c~AA-PM-22-vm_immovable 2005-09-30 12:40:01.000000000 -0700 +++ linux/kernel/fork.c 2005-09-30 12:40:01.000000000 -0700 @@ -232,7 +232,7 @@ if (IS_ERR(pol)) goto fail_nomem_policy; vma_set_policy(tmp, pol); - tmp->vm_flags &= ~VM_LOCKED; + tmp->vm_flags &= ~(VM_LOCKED|VM_IMMOVABLE); tmp->vm_mm = mm; tmp->vm_next = NULL; anon_vma_link(tmp); --- linux.orig/lib/radix-tree.c~AA-PM-03-radix-tree-replace 2005-09-30 12:39:46.000000000 -0700 +++ linux/lib/radix-tree.c 2005-09-30 12:39:46.000000000 -0700 @@ -101,7 +101,13 @@ static inline void radix_tree_node_free(struct radix_tree_node *node) { - kmem_cache_free(radix_tree_node_cachep, node); + struct radix_tree_preload *rtp; + + rtp = &__get_cpu_var(radix_tree_preloads); + if (rtp->nr < ARRAY_SIZE(rtp->nodes)) + rtp->nodes[rtp->nr++] = node; + else + kmem_cache_free(radix_tree_node_cachep, node); } /* @@ -739,6 +745,52 @@ EXPORT_SYMBOL(radix_tree_delete); /** + * radix_tree_replace - replace items in a radix tree + * @root: radix tree root + * @index: index key + * @item: item to insert + * + * Replace the item at @index with @item. + * Returns the address of the deleted item, or NULL if it was not present. + */ +void *radix_tree_replace(struct radix_tree_root *root, + unsigned long index, void *item) +{ + struct radix_tree_path path[RADIX_TREE_MAX_PATH], *pathp = path; + struct radix_tree_node *slot; + unsigned int height, shift; + void *ret = NULL; + + height = root->height; + if (index > radix_tree_maxindex(height)) + goto out; + + shift = (height - 1) * RADIX_TREE_MAP_SHIFT; + pathp->node = NULL; + slot = root->rnode; + + for ( ; height > 0; height--) { + int offset; + + if (slot == NULL) + goto out; + + offset = (index >> shift) & RADIX_TREE_MAP_MASK; + pathp[1].offset = offset; + pathp[1].node = slot; + slot = slot->slots[offset]; + pathp++; + shift -= RADIX_TREE_MAP_SHIFT; + } + + if ((ret = slot)) + slot = item; +out: + return ret; +} +EXPORT_SYMBOL(radix_tree_replace); + +/** * radix_tree_tagged - test whether any items in the tree are tagged * @root: radix tree root * @tag: tag to test --- linux.orig/mm/Kconfig~FROM-MM-memory-hotplug-sysfs-and-add-remove-functions 2005-09-30 12:37:55.000000000 -0700 +++ linux/mm/Kconfig 2005-09-30 12:39:48.000000000 -0700 @@ -111,3 +111,22 @@ config SPARSEMEM_EXTREME def_bool y depends on SPARSEMEM && !SPARSEMEM_STATIC + +# eventually, we can have this option just 'select SPARSEMEM' +config MEMORY_HOTPLUG + bool "Allow for memory hot-add" + depends on SPARSEMEM && HOTPLUG && !SOFTWARE_SUSPEND + +comment "Memory hotplug is currently incompatible with Software Suspend" + depends on SPARSEMEM && HOTPLUG && SOFTWARE_SUSPEND + +config MEMORY_REMOVE + bool "Allow for memory hot-remove" + depends on MEMORY_HOTPLUG && MEMORY_MIGRATE && (X86 && !X86_64) + default y if MEMORY_HOTPLUG + help + Enabling this option allows you to hot-remove highmem zones + on i386 systems. The i386 depenence is a hack for now. 
+ +comment "Selecting Memory Migration automatically enables CONFIG_SWAP" + depends on !SWAP --- linux.orig/mm/Makefile~FROM-MM-memory-hotplug-sysfs-and-add-remove-functions 2005-09-30 12:37:55.000000000 -0700 +++ linux/mm/Makefile 2005-09-30 12:39:47.000000000 -0700 @@ -18,5 +18,6 @@ obj-$(CONFIG_SPARSEMEM) += sparse.o obj-$(CONFIG_SHMEM) += shmem.o obj-$(CONFIG_TINY_SHMEM) += tiny-shmem.o - +obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o +obj-$(CONFIG_MEMORY_MIGRATE) += mmigrate.o obj-$(CONFIG_FS_XIP) += filemap_xip.o --- linux.orig/mm/bootmem.c~B3.0-remove-pgdat_list-ver2-base 2005-09-30 12:38:13.000000000 -0700 +++ linux/mm/bootmem.c 2005-09-30 12:38:13.000000000 -0700 @@ -61,17 +61,6 @@ { bootmem_data_t *bdata = pgdat->bdata; unsigned long mapsize = ((end - start)+7)/8; - static struct pglist_data *pgdat_last; - - pgdat->pgdat_next = NULL; - /* Add new nodes last so that bootmem always starts - searching in the first nodes, not the last ones */ - if (pgdat_last) - pgdat_last->pgdat_next = pgdat; - else { - pgdat_list = pgdat; - pgdat_last = pgdat; - } mapsize = ALIGN(mapsize, sizeof(long)); bdata->node_bootmem_map = phys_to_virt(mapstart << PAGE_SHIFT); @@ -392,7 +381,7 @@ void * __init __alloc_bootmem (unsigned long size, unsigned long align, unsigned long goal) { - pg_data_t *pgdat = pgdat_list; + pg_data_t *pgdat; void *ptr; for_each_pgdat(pgdat) --- linux.orig/mm/memory.c~AA-PM-09-migrate-swapcache-validate 2005-09-30 12:39:49.000000000 -0700 +++ linux/mm/memory.c 2005-09-30 12:40:02.000000000 -0700 @@ -1291,12 +1291,22 @@ if (unlikely(anon_vma_prepare(vma))) goto no_new_page; + if (old_page == ZERO_PAGE(address)) { - new_page = alloc_zeroed_user_highpage(vma, address); + if (VM_Immovable(vma)) { + new_page = alloc_page_vma(GFP_USER, vma, address); + if (new_page) + clear_user_page((void *)address, address, new_page); + } else + new_page = alloc_zeroed_user_highpage(vma, address); if (!new_page) goto no_new_page; } else { - new_page = alloc_page_vma(GFP_HIGHUSER, vma, address); + if (VM_Immovable(vma)) + new_page = alloc_page_vma(GFP_USER, vma, address); + else + new_page = alloc_page_vma(GFP_HIGHUSER, vma, address); + if (!new_page) goto no_new_page; copy_user_highpage(new_page, old_page, address); @@ -1668,6 +1678,7 @@ pte_unmap(page_table); spin_unlock(&mm->page_table_lock); +again: page = lookup_swap_cache(entry); if (!page) { swapin_readahead(entry, address, vma); @@ -1696,6 +1707,12 @@ mark_page_accessed(page); lock_page(page); + if (!PageSwapCache(page)) { + /* page-migration has occured */ + unlock_page(page); + page_cache_release(page); + goto again; + } /* * Back out if somebody else faulted in this pte while we @@ -1777,7 +1794,10 @@ if (unlikely(anon_vma_prepare(vma))) goto no_mem; - page = alloc_zeroed_user_highpage(vma, addr); + if (VM_Immovable(vma)) + page = alloc_page_vma(GFP_USER, vma, addr); + else + page = alloc_zeroed_user_highpage(vma, addr); if (!page) goto no_mem; --- /dev/null 2005-03-30 22:36:15.000000000 -0800 +++ linux/mm/memory_hotplug.c 2005-09-30 12:39:28.000000000 -0700 @@ -0,0 +1,158 @@ +/* + * linux/mm/memory_hotplug.c + * + * Copyright (C) + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +static void __add_zone(struct zone *zone, unsigned long phys_start_pfn) +{ + struct pglist_data *pgdat = zone->zone_pgdat; + int nr_pages = PAGES_PER_SECTION; + int nid = pgdat->node_id; + int zone_type; + + 
zone_type = zone - pgdat->node_zones; + memmap_init_zone(nr_pages, nid, zone_type, phys_start_pfn); + zonetable_add(zone, nid, zone_type, phys_start_pfn, nr_pages); +} + +static int __add_section(struct zone *zone, unsigned long phys_start_pfn) +{ + int nr_pages = PAGES_PER_SECTION; + int ret; + + ret = sparse_add_one_section(zone, phys_start_pfn, nr_pages); + + hot_add_zone_init(zone, phys_start_pfn, PAGES_PER_SECTION); + + if (ret < 0) + return ret; + + __add_zone(zone, phys_start_pfn); + return register_new_memory(__pfn_to_section(phys_start_pfn)); +} + +/* + * Reasonably generic function for adding memory. It is + * expected that archs that support memory hotplug will + * call this function after deciding the zone to which to + * add the new pages. + */ +int __add_pages(struct zone *zone, unsigned long phys_start_pfn, + unsigned long nr_pages) +{ + unsigned long i; + int err = 0; + + printk(KERN_DEBUG "%s(%p, %08lx, %ld)\n", __func__, + zone, phys_start_pfn, nr_pages); + + for (i = 0; !err && (i < nr_pages); i += PAGES_PER_SECTION) { + printk(KERN_DEBUG "\tfor: i: %ld\n", i); + err = __add_section(zone, phys_start_pfn + i); + } + + return err; +} + +static void grow_zone_span(struct zone *zone, + unsigned long start_pfn, unsigned long end_pfn) +{ + unsigned long old_zone_end_pfn; + + zone_span_writelock(zone); + + old_zone_end_pfn = zone->zone_start_pfn + zone->spanned_pages; + if (start_pfn < zone->zone_start_pfn) + zone->zone_start_pfn = start_pfn; + + if (end_pfn > old_zone_end_pfn) + zone->spanned_pages = end_pfn - zone->zone_start_pfn; + + zone_span_writeunlock(zone); +} + +static void grow_pgdat_span(struct pglist_data *pgdat, + unsigned long start_pfn, unsigned long end_pfn) +{ + unsigned long old_pgdat_end_pfn = + pgdat->node_start_pfn + pgdat->node_spanned_pages; + + if (start_pfn < pgdat->node_start_pfn) + pgdat->node_start_pfn = start_pfn; + + if (end_pfn > old_pgdat_end_pfn) + pgdat->node_spanned_pages = end_pfn - pgdat->node_start_pfn; +} + +#ifdef CONFIG_X86_SIMULATED_MEM_HOTPLUG +int page_is_hotpluggable_ram(unsigned long pfn) +{ + extern struct e820map bios_e820; + extern int page_is_ram_e820(unsigned long, struct e820map*); + + return page_is_ram_e820(pfn, &bios_e820); +} +#else +int page_is_hotpluggable_ram(unsigned long pfn) +{ + return 1; +} +#endif + +int online_pages(unsigned long pfn, unsigned long nr_pages) +{ + unsigned long i; + unsigned long flags; + unsigned long onlined_pages = 0; + struct zone *zone; + + /* + * This doesn't need a lock to do pfn_to_page(). + * The section can't be removed here because of the + * memory_block->state_sem. + */ + zone = page_zone(pfn_to_page(pfn)); + pgdat_resize_lock(zone->zone_pgdat, &flags); + grow_zone_span(zone, pfn, pfn + nr_pages); + grow_pgdat_span(zone->zone_pgdat, pfn, pfn + nr_pages); + pgdat_resize_unlock(zone->zone_pgdat, &flags); + + printk(KERN_DEBUG "%s: onlining 0x%lx pages starting from pfn: 0x%lx\n", + __func__, nr_pages, pfn); + + for (i = 0; i < nr_pages; i++) { + struct page *page = pfn_to_page(pfn + i); + + if (page_is_hotpluggable_ram(pfn + i)) { + online_page(page); + onlined_pages++; + } + } + zone->present_pages += onlined_pages; + + setup_per_zone_pages_min(); + + return 0; +} --- /dev/null 2005-03-30 22:36:15.000000000 -0800 +++ linux/mm/mmigrate.c 2005-09-30 12:40:05.000000000 -0700 @@ -0,0 +1,592 @@ +/* + * linux/mm/mmigrate.c + * + * Memory migration support. 
+ * + * Authors: IWAMOTO Toshihiro + * Hirokazu Takahashi + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * The concept of memory migration is to replace a target page with + * a substitute page on a radix tree. New requests to access the target + * - including system calls and page faults - are redirected to the + * substitute that is locked and not up-to-date, so that all of these + * requests are blocked until the migration has done. Data of the target + * is copied into the substitute and then the requests are unblocked + * after all operations against the target have finished. + * + * By this approach, regular pages in the swapcache/pagecache and + * hugetlbpages can be handled in the same way. + */ + + +/* + * Try to writeback a dirty page to free its buffers. + */ +static int +writeback_and_free_buffers(struct page *page) +{ + struct address_space *mapping = page_mapping(page); + + BUG_ON(!PageLocked(page)); + wait_on_page_writeback(page); + if (!PagePrivate(page)) + return 0; + + if (PageDirty(page)) { + switch(pageout(page, mapping)) { + case PAGE_ACTIVATE: + return -1; + case PAGE_SUCCESS: + lock_page(page); + return 1; + case PAGE_KEEP: + case PAGE_CLEAN: + break; + } + } + if (try_to_release_page(page, GFP_KERNEL)) + return 0; + + return -1; +} + +/* + * Replace "page" with "newpage" on the radix tree, which the page belongs to. + */ +static int +replace_pages(struct page *page, struct page *newpage) +{ + struct address_space *mapping = page_mapping(page); + int ret = 0; + struct page *delpage; + + page_cache_get(newpage); + read_lock_irq(&mapping->tree_lock); + newpage->index = page->index; + if (PageSwapCache(page)) { + SetPageSwapCache(newpage); + newpage->private = page->private; + } else + newpage->mapping = page->mapping; + if (PageWriteback(page)) + SetPageWriteback(newpage); + + delpage = radix_tree_replace(&mapping->page_tree, page_index(page), newpage); + read_unlock_irq(&mapping->tree_lock); + if (delpage == NULL) { + /* + * Migration is unnecessary since truncating the page is + * in progress. Just release the newpage. + */ + page_cache_release(newpage); + ret = -ENOENT; + } + return ret; +} + +/* + * Check whether the page can be migrated or not. + */ +int +page_migratable(struct page *page, struct page *newpage, + int freeable_page_count, struct list_head *vlist) +{ + int truncated; + + if (page_mapped(page)) { + switch (try_to_unmap(page, vlist)) { + case SWAP_FAIL: + return -EBUSY; + case SWAP_AGAIN: + return -EAGAIN; + } + } + if (PageWriteback(page)) + return -EAGAIN; + /* The page might have been truncated */ + truncated = !PageSwapCache(newpage) && page_mapping(page) == NULL; + if (page_count(page) + truncated <= freeable_page_count) + return truncated ? -ENOENT : 0; + return -EAGAIN; +} + +/* + * Wait for the completion of all operations, which are going on + * against the page, and copy it. 
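+ * + * This is the default callback handed to generic_migrate_page() when the + * page's mapping has no a_ops->migrate_page method; see migrate_onepage() + * below.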
+ */ +int +migrate_page_common(struct page *page, struct page *newpage, + struct list_head *vlist) +{ + long timeout = 5000; /* XXXX */ + int ret; + + while (timeout > 0) { + BUG_ON(page_count(page) == 0); + ret = page_migratable(page, newpage, 2, vlist); + switch (ret) { + case 0: + case -ENOENT: + copy_highpage(newpage, page); + return ret; + case -EBUSY: + return ret; + case -EAGAIN: + writeback_and_free_buffers(page); + unlock_page(page); + msleep(10); + timeout -= 10; + lock_page(page); + continue; + } + } + return -EBUSY; +} + +/* + * Wait for the completion of all operations, which are going on + * against the page. After that, move the buffers the page owns + * to the newpage and copy the page. + */ +int +migrate_page_buffer(struct page *page, struct page *newpage, + struct list_head *vlist) +{ + long timeout = 5000; /* XXXX */ + int ret; + + while (timeout > 0) { + BUG_ON(page_count(page) == 0); + ret = page_migratable(page, newpage, + 2 + !!PagePrivate(page), vlist); + switch (ret) { + case 0: + if (PagePrivate(page)) + generic_move_buffer(page, newpage); + /* fall thru */ + case -ENOENT: /* truncated */ + copy_highpage(newpage, page); + return ret; + case -EBUSY: + return ret; + case -EAGAIN: + wait_on_page_writeback(page); + unlock_page(page); + msleep(10); + timeout -= 10; + lock_page(page); + continue; + } + } + return -EBUSY; +} + +/* + * In some cases, a page migration needs to be rolled back. + */ +static int +unwind_page(struct page *page, struct page *newpage) +{ + struct address_space *mapping = page_mapping(newpage); + int truncated = !PageSwapCache(newpage) && page_mapping(page) == NULL; + long retry = 1000; + + BUG_ON(mapping == NULL); + + /* + * Unwinding is not needed if the newpage has been already truncated. + */ + if (truncated) + goto out; + + /* + * Try to unwind by notifying waiters. If someone misbehaves, + * we die. + */ + read_lock_irq(&mapping->tree_lock); + page->index = newpage->index; + if (PageSwapCache(newpage)) { + SetPageSwapCache(page); + page->private = newpage->private; + } else + page->mapping = newpage->mapping; + if (radix_tree_replace(&mapping->page_tree, page_index(newpage), page) == NULL) { + printk(KERN_ERR "%s(): newpage:%p has gone. We can't roll back page:%p.\n", __FUNCTION__, newpage, page); + BUG(); + } + /* no page_cache_get(page); needed */ + read_unlock_irq(&mapping->tree_lock); +out: + newpage->mapping = NULL; + if (PageWriteback(newpage)) + end_page_writeback(newpage); /* XXX */ + newpage->private = 0; + ClearPageSwapCache(newpage); + /* XXX unmap needed? No, it shouldn't. Handled by fault handlers. */ + unlock_page(newpage); + unlock_page(page); + + /* + * Some requests may be blocked on the newpage. Wait until the + * requests have gone. + */ + while (page_count(newpage) > 2) { + msleep(10); + if (retry-- <= 0) { + retry = 1000; + printk(KERN_ERR "%s(): page:%p can't be rolled back, as there remain some references to newpage:%p yet.\n", __FUNCTION__, page, newpage); + printk(KERN_ERR "newpage %p flags %lx %d %d, page %p flags %lx %d\n", + newpage, newpage->flags, page_count(newpage), + page_mapcount(newpage), + page, page->flags, page_count(page)); + } + } + + BUG_ON(PageUptodate(newpage)); + BUG_ON(PageDirty(newpage)); + BUG_ON(PageActive(newpage)); + BUG_ON(PagePrivate(newpage)); + BUG_ON(page_count(newpage) != 2); + page_cache_release(newpage); + return 0; +} + +/* + * Try to migrate one page. Returns non-zero on failure. + * - Lock for the page must be held when invoked. + * - The page must be attached to an address_space. 
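+ * + * A rough usage sketch, following migrate_onepage() below (error handling + * and reference counting omitted): + * + *	lock_page(page); + *	newpage = page_cache_alloc(page_mapping(page)); + *	ret = generic_migrate_page(page, newpage, migrate_page_common); + * + * On success the contents and state of the old page have been transferred + * to newpage and both pages are returned unlocked.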
+ */ +int +generic_migrate_page(struct page *page, struct page *newpage, + int (*migrate_fn)(struct page *, struct page *, struct list_head *)) +{ + LIST_HEAD(vlist); + int ret; + + /* + * Make sure that the newpage must be locked and kept not up-to-date + * during the page migration, so that it's guaranteed that all + * accesses to the newpage will be blocked until everything has + * become ok. + */ + if (TestSetPageLocked(newpage)) + BUG(); + + if ((ret = replace_pages(page, newpage))) + goto out_removing; + + /* + * With cleared PTEs, any accesses via the PTEs to the page + * can be caught and blocked in a pagefault handler. + */ + if (page_mapped(page)) { + while ((ret = try_to_unmap(page, &vlist)) == SWAP_AGAIN) + msleep(1); + if (ret != SWAP_SUCCESS) { + ret = -EBUSY; + goto out_busy; + } + } + + wait_on_page_writeback(page); + if (PageSwapCache(page)) { + /* + * The page is not mapped from anywhere now. + * Detach it from the swapcache completely. + */ + ClearPageSwapCache(page); + page->private = 0; + page->mapping = NULL; + } + + /* Wait for all operations against the page to finish. */ + ret = migrate_fn(page, newpage, &vlist); + switch (ret) { + case -ENOENT: + /* The file the page belongs to has been truncated. */ + page_cache_get(page); + page_cache_release(newpage); + newpage->mapping = NULL; + break; + case 0: + break; + default: + /* The page is busy. Try it later. */ + goto out_busy; + } + + arch_migrate_page(page, newpage); + + if (PageError(page)) + SetPageError(newpage); + if (PageReferenced(page)) + SetPageReferenced(newpage); + if (PageActive(page)) { + SetPageActive(newpage); + ClearPageActive(page); + } + if (PageMappedToDisk(page)) + SetPageMappedToDisk(newpage); + if (PageChecked(page)) + SetPageChecked(newpage); + if (PageUptodate(page)) + SetPageUptodate(newpage); + if (PageDirty(page)) { + clear_page_dirty_for_io(page); + set_page_dirty(newpage); + } + if (PagePrivate(newpage)) { + BUG_ON(newpage->mapping == NULL); + unlock_page_buffer(newpage); + } + /* + * Finally, the newpage has become ready! Wake up all waiters, + * which have been waiting for the completion of the migration. + */ + if (PageWriteback(newpage)) + end_page_writeback(newpage); + unlock_page(newpage); + + /* map the newpage where the old page have been mapped. */ + touch_unmapped_address(&vlist); + if (PageSwapCache(newpage)) { + lock_page(newpage); + __remove_exclusive_swap_page(newpage, 1); + unlock_page(newpage); + } + + page->mapping = NULL; + unlock_page(page); + page_cache_release(page); + + return 0; + +out_busy: + /* Roll back all operations. */ + unwind_page(page, newpage); + touch_unmapped_address(&vlist); + if (PageSwapCache(page)) { + lock_page(page); + __remove_exclusive_swap_page(page, 1); + unlock_page(page); + } + + return ret; + +out_removing: + if (PagePrivate(newpage)) + BUG(); + unlock_page(page); + unlock_page(newpage); + return ret; +} + +/* + * migrate_onepage() can migrate regular pages assigned to pagecache, + * swapcache or anonymous memory. + */ +struct page * +migrate_onepage(struct page *page, int nodeid) +{ + struct page *newpage; + struct address_space *mapping; + int ret; + + lock_page(page); + + /* + * Put the page in a radix tree if it isn't in the tree yet. 
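+ * Anonymous pages that are not in the swapcache yet are given swap space + * here, so that they too have an address_space for the replacement to be + * done under.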
+ */ +#ifdef CONFIG_SWAP + if (PageAnon(page) && !PageSwapCache(page)) + if (!add_to_swap(page, GFP_KERNEL)) { + unlock_page(page); + return ERR_PTR(-ENOSPC); + } +#endif /* CONFIG_SWAP */ + if ((mapping = page_mapping(page)) == NULL) { + /* truncation is in progress */ + if (PagePrivate(page)) + try_to_release_page(page, GFP_KERNEL); + unlock_page(page); + return ERR_PTR(-ENOENT); + } + + /* + * Allocate a new page with the same gfp_mask + * as the target page has. + */ + if (nodeid == MIGRATE_NODE_ANY) + newpage = page_cache_alloc(mapping); + else + newpage = alloc_pages_node(nodeid, mapping->flags, 0); + if (newpage == NULL) { + unlock_page(page); + return ERR_PTR(-ENOMEM); + } + + if (mapping->a_ops && mapping->a_ops->migrate_page) + ret = mapping->a_ops->migrate_page(page, newpage); + else + ret = generic_migrate_page(page, newpage, migrate_page_common); + if (ret) { + BUG_ON(page_count(newpage) != 1); + page_cache_release(newpage); + return ERR_PTR(ret); + } + BUG_ON(page_count(page) != 1); + page_cache_release(page); + return newpage; +} + +static inline int +need_writeback(struct page *page) +{ + return PageDirty(page) && PagePrivate(page) && !PageWriteback(page); +} + +/* + * Start writeback I/O against a dirty page with filesystem + * specific private data to release them. + */ +static inline void page_start_writeback(struct page *page) +{ + struct address_space *mapping; + int ret; + + if (!need_writeback(page)) + return; + if (TestSetPageLocked(page)) + return; + + mapping = page_mapping(page); + + if (!mapping) + goto out_unlock; + /* + * Writeback is not needed if it has migrate_page method, + * because it can move all of them without writeback I/O. + */ + if (mapping->a_ops && mapping->a_ops->migrate_page) + goto out_unlock; + if (!need_writeback(page)) + goto out_unlock; + + ret = pageout(page, mapping); + + if (ret == PAGE_SUCCESS) + return; + +out_unlock: + unlock_page(page); +} + +/* + * This is the main entry point to migrate pages in a specific region. + * If a page is inactive, the page may be just released instead of + * migration. + */ +int try_to_migrate_pages(struct list_head *page_list) +{ + struct page *page, *page2, *newpage; + LIST_HEAD(pass1_list); + LIST_HEAD(pass2_list); + LIST_HEAD(discharge_list); + int nr_busy = 0; + int nr_noswap = 0; + struct scan_control sc = { + .nr_scanned = 0, + .nr_reclaimed = 0, + .priority = 0, + .gfp_mask = GFP_ATOMIC, + .may_writepage = 0, + }; + + + current->flags |= PF_KSWAPD; /* It's fake */ + list_for_each_entry_safe(page, page2, page_list, lru) { + page_start_writeback(page); + list_del(&page->lru); + if (PageActive(page)) + list_add(&page->lru, &pass1_list); + else + list_add(&page->lru, &discharge_list); + } + /* + * Try to free inactive pages only. + */ + shrink_list(&discharge_list, &sc); + list_splice(&discharge_list, &pass1_list); + + /* + * Try to migrate easily movable pages first. + */ + list_for_each_entry_safe(page, page2, &pass1_list, lru) { + list_del(&page->lru); + if (PageLocked(page) || PageWriteback(page) || + IS_ERR(newpage = migrate_onepage(page, MIGRATE_NODE_ANY))) { + if (page_count(page) == 1) { + /* the page is already unused */ + putback_page_to_lru(page_zone(page), page); + page_cache_release(page); + } else { + list_add(&page->lru, &pass2_list); + } + } else { + putback_page_to_lru(page_zone(newpage), newpage); + page_cache_release(newpage); + } + } + /* + * Try to migrate the rest of them. 
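+ * These are the pages that were locked, under writeback or otherwise busy + * during the first pass; failures here are reported to the caller through + * nr_busy and nr_noswap.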
+ */ + list_for_each_entry_safe(page, page2, &pass2_list, lru) { + list_del(&page->lru); + if (IS_ERR(newpage = migrate_onepage(page, MIGRATE_NODE_ANY))) { + if (page_count(page) == 1) { + /* the page is already unused */ + putback_page_to_lru(page_zone(page), page); + page_cache_release(page); + } else { + /* truncation may be in progress now. */ + nr_busy++; + if (PTR_ERR(newpage) == -ENOSPC) + nr_noswap++; + list_add(&page->lru, page_list); + } + } else { + putback_page_to_lru(page_zone(newpage), newpage); + page_cache_release(newpage); + } + } + current->flags &= ~PF_KSWAPD; + if (nr_noswap) { + if (printk_ratelimit()) + printk(KERN_WARNING "memory migration failed: Any swap devices should be added.\n"); + return -ENOSPC; + } + return nr_busy; +} + +EXPORT_SYMBOL(generic_migrate_page); +EXPORT_SYMBOL(migrate_page_common); +EXPORT_SYMBOL(migrate_page_buffer); +EXPORT_SYMBOL(page_migratable); +EXPORT_SYMBOL(migrate_onepage); --- linux.orig/mm/page_alloc.c~FROM-MM-memory-hotplug-prep-break-out-zone-initialization 2005-09-30 12:37:51.000000000 -0700 +++ linux/mm/page_alloc.c 2005-09-30 12:39:30.000000000 -0700 @@ -33,7 +33,9 @@ #include #include #include +#include #include +#include #include #include @@ -47,7 +49,6 @@ EXPORT_SYMBOL(node_online_map); nodemask_t node_possible_map __read_mostly = NODE_MASK_ALL; EXPORT_SYMBOL(node_possible_map); -struct pglist_data *pgdat_list __read_mostly; unsigned long totalram_pages __read_mostly; unsigned long totalhigh_pages __read_mostly; long nr_swap_pages; @@ -78,21 +79,44 @@ unsigned long __initdata nr_kernel_pages; unsigned long __initdata nr_all_pages; +static int page_outside_zone_boundaries(struct zone *zone, struct page *page) +{ + int ret = 0; + unsigned seq; + unsigned long pfn = page_to_pfn(page); + + do { + seq = zone_span_seqbegin(zone); + if (pfn >= zone->zone_start_pfn + zone->spanned_pages) + ret = 1; + else if (pfn < zone->zone_start_pfn) + ret = 1; + } while (zone_span_seqretry(zone, seq)); + + return ret; +} + +static int page_is_consistent(struct zone *zone, struct page *page) +{ +#ifdef CONFIG_HOLES_IN_ZONE + if (!pfn_valid(page_to_pfn(page))) + return 0; +#endif + if (zone != page_zone(page)) + return 0; + + return 1; +} /* * Temporary debugging check for pages not lying within a given zone. */ static int bad_range(struct zone *zone, struct page *page) { - if (page_to_pfn(page) >= zone->zone_start_pfn + zone->spanned_pages) - return 1; - if (page_to_pfn(page) < zone->zone_start_pfn) - return 1; -#ifdef CONFIG_HOLES_IN_ZONE - if (!pfn_valid(page_to_pfn(page))) + if (page_outside_zone_boundaries(zone, page)) return 1; -#endif - if (zone != page_zone(page)) + if (!page_is_consistent(zone, page)) return 1; + return 0; } @@ -1401,7 +1425,7 @@ /* * Builds allocation fallback zone lists. 
*/ -static int __init build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist, int j, int k) +int __devinit build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist, int j, int k) { switch (k) { struct zone *zone; @@ -1409,7 +1433,12 @@ BUG(); case ZONE_HIGHMEM: zone = pgdat->node_zones + ZONE_HIGHMEM; - if (zone->present_pages) { + /* + * with mem hotplug we don't increment present_pages + * until the pages are actually freed into the zone, + * but we increment spanned pages much earlier + */ + if (zone->spanned_pages) { #ifndef CONFIG_HIGHMEM BUG(); #endif @@ -1417,20 +1446,47 @@ } case ZONE_NORMAL: zone = pgdat->node_zones + ZONE_NORMAL; - if (zone->present_pages) + if (zone->spanned_pages) zonelist->zones[j++] = zone; case ZONE_DMA: zone = pgdat->node_zones + ZONE_DMA; - if (zone->present_pages) + if (zone->spanned_pages) zonelist->zones[j++] = zone; } return j; } -#ifdef CONFIG_NUMA +static inline int zone_index_to_type(int index) +{ + int type = ZONE_NORMAL; + + if (index & __GFP_HIGHMEM) + type = ZONE_HIGHMEM; + if (index & __GFP_DMA) + type = ZONE_DMA; + return type; +} + #define MAX_NODE_LOAD (num_online_nodes()) -static int __initdata node_load[MAX_NUMNODES]; + +#ifdef CONFIG_NUMA +static int __devinitdata node_load[MAX_NUMNODES]; +static int __devinit get_node_load(int node) +{ + return node_load[node]; +} +static void __devinit increment_node_load(int node, int load) +{ + node_load[node] += load; +} +#else +static inline int get_node_load(int node) +{ + return 0; +} +static inline void increment_node_load(int node, int load) {} +#endif /** * find_next_best_node - find the next node that should appear in a given node's fallback list * @node: node whose fallback list we're appending @@ -1445,7 +1501,7 @@ * on them otherwise. * It returns -1 if no node is found. 
*/ -static int __init find_next_best_node(int node, nodemask_t *used_node_mask) +static int __devinit find_next_best_node(int node, nodemask_t *used_node_mask) { int i, n, val; int min_val = INT_MAX; @@ -1477,7 +1533,7 @@ /* Slight preference for less loaded node */ val *= (MAX_NODE_LOAD*MAX_NUMNODES); - val += node_load[n]; + val += get_node_load(n); if (val < min_val) { min_val = val; @@ -1491,19 +1547,13 @@ return best_node; } -static void __init build_zonelists(pg_data_t *pgdat) +void __devinit build_zonelists(pg_data_t *pgdat) { int i, j, k, node, local_node; int prev_node, load; struct zonelist *zonelist; nodemask_t used_mask; - /* initialize zonelists */ - for (i = 0; i < GFP_ZONETYPES; i++) { - zonelist = pgdat->node_zonelists + i; - zonelist->zones[0] = NULL; - } - /* NUMA-aware ordering of nodes */ local_node = pgdat->node_id; load = num_online_nodes(); @@ -1517,18 +1567,14 @@ */ if (node_distance(local_node, node) != node_distance(local_node, prev_node)) - node_load[node] += load; + increment_node_load(node, load); prev_node = node; load--; for (i = 0; i < GFP_ZONETYPES; i++) { zonelist = pgdat->node_zonelists + i; for (j = 0; zonelist->zones[j] != NULL; j++); - k = ZONE_NORMAL; - if (i & __GFP_HIGHMEM) - k = ZONE_HIGHMEM; - if (i & __GFP_DMA) - k = ZONE_DMA; + k = zone_index_to_type(i); j = build_zonelists_node(NODE_DATA(node), zonelist, j, k); zonelist->zones[j] = NULL; @@ -1536,51 +1582,6 @@ } } -#else /* CONFIG_NUMA */ - -static void __init build_zonelists(pg_data_t *pgdat) -{ - int i, j, k, node, local_node; - - local_node = pgdat->node_id; - for (i = 0; i < GFP_ZONETYPES; i++) { - struct zonelist *zonelist; - - zonelist = pgdat->node_zonelists + i; - - j = 0; - k = ZONE_NORMAL; - if (i & __GFP_HIGHMEM) - k = ZONE_HIGHMEM; - if (i & __GFP_DMA) - k = ZONE_DMA; - - j = build_zonelists_node(pgdat, zonelist, j, k); - /* - * Now we build the zonelist so that it contains the zones - * of all the other nodes. - * We don't want to pressure a particular node, so when - * building the zones for node N, we make sure that the - * zones coming right after the local ones are those from - * node N+1 (modulo N) - */ - for (node = local_node + 1; node < MAX_NUMNODES; node++) { - if (!node_online(node)) - continue; - j = build_zonelists_node(NODE_DATA(node), zonelist, j, k); - } - for (node = 0; node < local_node; node++) { - if (!node_online(node)) - continue; - j = build_zonelists_node(NODE_DATA(node), zonelist, j, k); - } - - zonelist->zones[j] = NULL; - } -} - -#endif /* CONFIG_NUMA */ - void __init build_all_zonelists(void) { int i; @@ -1659,7 +1660,7 @@ * up by free_all_bootmem() once the early boot process is * done. Non-atomic initialization, single-pass. */ -void __init memmap_init_zone(unsigned long size, int nid, unsigned long zone, +void __devinit memmap_init_zone(unsigned long size, int nid, unsigned long zone, unsigned long start_pfn) { struct page *page; @@ -1870,6 +1871,63 @@ #endif +void zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages) +{ + int i; + unsigned long size_bytes; + + /* + * The per-page waitqueue mechanism uses hashed waitqueues + * per zone. 
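+ * The table comes from bootmem during boot; for zones initialised later + * (memory hot-add) it is kmalloc()ed instead, hence the system_state check + * below.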
+ */ + zone->wait_table_size = wait_table_size(zone_size_pages); + zone->wait_table_bits = wait_table_bits(zone->wait_table_size); + size_bytes = zone->wait_table_size * sizeof(wait_queue_head_t); + if (system_state >= SYSTEM_RUNNING) + zone->wait_table = kmalloc(size_bytes, GFP_KERNEL); + else + zone->wait_table = alloc_bootmem_node(zone->zone_pgdat, + size_bytes); + + for(i = 0; i < zone->wait_table_size; ++i) + init_waitqueue_head(zone->wait_table + i); +} + +void zone_pcp_init(struct zone *zone) +{ + int cpu; + unsigned long batch = zone_batchsize(zone); + + for (cpu = 0; cpu < NR_CPUS; cpu++) { +#ifdef CONFIG_NUMA + /* Early boot. Slab allocator not functional yet */ + zone->pageset[cpu] = &boot_pageset[cpu]; + setup_pageset(&boot_pageset[cpu],0); +#else + setup_pageset(zone_pcp(zone,cpu), batch); +#endif + } + printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%lu\n", + zone->name, zone->present_pages, batch); +} + +static __devinit void init_currently_empty_zone(struct zone *zone, + unsigned long zone_start_pfn, unsigned long size) +{ + struct pglist_data *pgdat = zone->zone_pgdat; + + zone_wait_table_init(zone, size); + pgdat->nr_zones = zone_idx(zone) + 1; + + zone->zone_mem_map = pfn_to_page(zone_start_pfn); + zone->zone_start_pfn = zone_start_pfn; + + memmap_init(size, pgdat->node_id, zone_idx(zone), zone_start_pfn); + + zone_init_free_lists(pgdat, zone, zone->spanned_pages); + zone->spanned_pages = size; +} + /* * Set up the zone data structures: * - mark all pages reserved @@ -1879,10 +1937,11 @@ static void __init free_area_init_core(struct pglist_data *pgdat, unsigned long *zones_size, unsigned long *zholes_size) { - unsigned long i, j; - int cpu, nid = pgdat->node_id; + unsigned long j; + int nid = pgdat->node_id; unsigned long zone_start_pfn = pgdat->node_start_pfn; + pgdat_resize_init(pgdat); pgdat->nr_zones = 0; init_waitqueue_head(&pgdat->kswapd_wait); pgdat->kswapd_max_order = 0; @@ -1890,7 +1949,6 @@ for (j = 0; j < MAX_NR_ZONES; j++) { struct zone *zone = pgdat->node_zones + j; unsigned long size, realsize; - unsigned long batch; realsize = size = zones_size[j]; if (zholes_size) @@ -1900,29 +1958,18 @@ nr_kernel_pages += realsize; nr_all_pages += realsize; - zone->spanned_pages = size; zone->present_pages = realsize; zone->name = zone_names[j]; spin_lock_init(&zone->lock); spin_lock_init(&zone->lru_lock); + zone_seqlock_init(zone); + init_MUTEX(&zone->init_sem); zone->zone_pgdat = pgdat; zone->free_pages = 0; zone->temp_priority = zone->prev_priority = DEF_PRIORITY; - batch = zone_batchsize(zone); - - for (cpu = 0; cpu < NR_CPUS; cpu++) { -#ifdef CONFIG_NUMA - /* Early boot. Slab allocator not functional yet */ - zone->pageset[cpu] = &boot_pageset[cpu]; - setup_pageset(&boot_pageset[cpu],0); -#else - setup_pageset(zone_pcp(zone,cpu), batch); -#endif - } - printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%lu\n", - zone_names[j], realsize, batch); + zone_pcp_init(zone); INIT_LIST_HEAD(&zone->active_list); INIT_LIST_HEAD(&zone->inactive_list); zone->nr_scan_active = 0; @@ -1933,32 +1980,9 @@ if (!size) continue; - /* - * The per-page waitqueue mechanism uses hashed waitqueues - * per zone. 
- */ - zone->wait_table_size = wait_table_size(size); - zone->wait_table_bits = - wait_table_bits(zone->wait_table_size); - zone->wait_table = (wait_queue_head_t *) - alloc_bootmem_node(pgdat, zone->wait_table_size - * sizeof(wait_queue_head_t)); - - for(i = 0; i < zone->wait_table_size; ++i) - init_waitqueue_head(zone->wait_table + i); - - pgdat->nr_zones = j+1; - - zone->zone_mem_map = pfn_to_page(zone_start_pfn); - zone->zone_start_pfn = zone_start_pfn; - - memmap_init(size, nid, j, zone_start_pfn); - zonetable_add(zone, nid, j, zone_start_pfn, size); - + init_currently_empty_zone(zone, zone_start_pfn, size); zone_start_pfn += size; - - zone_init_free_lists(pgdat, zone, zone->spanned_pages); } } @@ -2025,8 +2049,9 @@ pg_data_t *pgdat; loff_t node = *pos; - for (pgdat = pgdat_list; pgdat && node; pgdat = pgdat->pgdat_next) - --node; + for_each_pgdat(pgdat) + if (!node--) + break; return pgdat; } @@ -2036,7 +2061,7 @@ pg_data_t *pgdat = (pg_data_t *)arg; (*pos)++; - return pgdat->pgdat_next; + return next_online_pgdat(pgdat); } static void frag_stop(struct seq_file *m, void *arg) @@ -2358,7 +2383,7 @@ * that the pages_{min,low,high} values for each zone are set correctly * with respect to min_free_kbytes. */ -static void setup_per_zone_pages_min(void) +void setup_per_zone_pages_min(void) { unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10); unsigned long lowmem_pages = 0; @@ -2567,3 +2592,48 @@ return table; } + +static inline int zone_previously_initialized(struct zone *zone) +{ + if (zone->wait_table_size) + return 1; + + return 0; +} + +#ifdef CONFIG_MEMORY_HOTPLUG +static int __build_zonelists(void *__pgdat) +{ + pg_data_t *pgdat = __pgdat; + build_zonelists(pgdat); + return 0; +} + +int hot_add_zone_init(struct zone *zone, unsigned long phys_start_pfn, unsigned long size_pages) +{ + int ret = 0; + + down(&zone->init_sem); + if (zone_previously_initialized(zone)) { + ret = -EEXIST; + goto out; + } + + zone_wait_table_init(zone, size_pages); + init_currently_empty_zone(zone, phys_start_pfn, size_pages); + zone_pcp_init(zone); + + /* + * This is an awfully blunt way to do this. But, the + * zonelists are accessed many times over large areas + * of performance-critical code in the allocator. + * That makes it very hard to get a conventional lock + * to work. This of this as a rw lock with a huge + * write cost. + */ + stop_machine_run(__build_zonelists, zone->zone_pgdat, NR_CPUS); +out: + up(&zone->init_sem); + return ret; +} +#endif --- linux.orig/mm/rmap.c~AA-PM-14-try_to_unmap_force 2005-09-30 12:39:56.000000000 -0700 +++ linux/mm/rmap.c 2005-09-30 12:39:56.000000000 -0700 @@ -45,6 +45,7 @@ */ #include +#include #include #include #include @@ -500,11 +501,81 @@ } } +struct page_va_list { + struct mm_struct *mm; + unsigned long addr; + struct list_head list; +}; + +/* + * This function is invoked to record an address space and a mapped address + * to which a target page belongs, when it is unmapped forcibly. 
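+ * + * The recorded list is consumed by touch_unmapped_address() below. A caller + * such as generic_migrate_page() roughly does: + * + *	LIST_HEAD(vlist); + *	try_to_unmap(page, &vlist); + *	... migrate the page ... + *	touch_unmapped_address(&vlist);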
+ */ +static int +record_unmapped_address(struct list_head *force, struct mm_struct *mm, + unsigned long address) +{ + struct page_va_list *vlist; + + vlist = kmalloc(sizeof(struct page_va_list), GFP_KERNEL); + if (vlist == NULL) + return -ENOMEM; + spin_lock(&mmlist_lock); + if (!atomic_read(&mm->mm_users)) + vlist->mm = NULL; + else { + vlist->mm = mm; + atomic_inc(&mm->mm_users); + } + spin_unlock(&mmlist_lock); + + if (vlist->mm == NULL) + kfree(vlist); + else { + vlist->addr = address; + list_add(&vlist->list, force); + } + return 0; +} + +/* + * This function touches an address recorded in the vlist to map + * a page into an address space again. + */ +int +touch_unmapped_address(struct list_head *vlist) +{ + struct page_va_list *v1, *v2; + struct vm_area_struct *vma; + int ret = 0; + int error; + + list_for_each_entry_safe(v1, v2, vlist, list) { + list_del(&v1->list); + down_read(&v1->mm->mmap_sem); + if (atomic_read(&v1->mm->mm_users) == 1) + goto out; + vma = find_vma(v1->mm, v1->addr); + if (vma == NULL) + goto out; + error = get_user_pages(current, v1->mm, v1->addr, 1, + 0, 0, NULL, NULL); + if (error < 0) + ret = error; + out: + up_read(&v1->mm->mmap_sem); + mmput(v1->mm); + kfree(v1); + } + return ret; +} + /* * Subfunctions of try_to_unmap: try_to_unmap_one called * repeatedly from either try_to_unmap_anon or try_to_unmap_file. */ -static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma) +static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, + struct list_head *force) { struct mm_struct *mm = vma->vm_mm; unsigned long address; @@ -520,6 +591,9 @@ if (IS_ERR(pte)) goto out; + if (force && record_unmapped_address(force, mm, address)) + goto out_unmap; + /* * If the page is mlock()d, we cannot swap it out. * If it's recently referenced (perhaps page_referenced @@ -527,8 +601,9 @@ * * Pages belonging to VM_RESERVED regions should not happen here. */ - if ((vma->vm_flags & (VM_LOCKED|VM_RESERVED)) || - ptep_clear_flush_young(vma, address, pte)) { + if (((vma->vm_flags & (VM_LOCKED|VM_RESERVED)) || + ptep_clear_flush_young(vma, address, pte)) && + force == NULL) { ret = SWAP_FAIL; goto out_unmap; } @@ -672,7 +747,7 @@ spin_unlock(&mm->page_table_lock); } -static int try_to_unmap_anon(struct page *page) +static int try_to_unmap_anon(struct page *page, struct list_head *force) { struct anon_vma *anon_vma; struct vm_area_struct *vma; @@ -683,7 +758,7 @@ return ret; list_for_each_entry(vma, &anon_vma->head, anon_vma_node) { - ret = try_to_unmap_one(page, vma); + ret = try_to_unmap_one(page, vma, force); if (ret == SWAP_FAIL || !page_mapped(page)) break; } @@ -700,7 +775,7 @@ * * This function is only called from try_to_unmap for object-based pages. 
*/ -static int try_to_unmap_file(struct page *page) +static int try_to_unmap_file(struct page *page, struct list_head *force) { struct address_space *mapping = page->mapping; pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); @@ -714,7 +789,7 @@ spin_lock(&mapping->i_mmap_lock); vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { - ret = try_to_unmap_one(page, vma); + ret = try_to_unmap_one(page, vma, force); if (ret == SWAP_FAIL || !page_mapped(page)) goto out; } @@ -802,7 +877,7 @@ * SWAP_AGAIN - we missed a mapping, try again later * SWAP_FAIL - the page is unswappable */ -int try_to_unmap(struct page *page) +int try_to_unmap(struct page *page, struct list_head *force) { int ret; @@ -810,9 +885,9 @@ BUG_ON(!PageLocked(page)); if (PageAnon(page)) - ret = try_to_unmap_anon(page); + ret = try_to_unmap_anon(page, force); else - ret = try_to_unmap_file(page); + ret = try_to_unmap_file(page, force); if (!page_mapped(page)) ret = SWAP_SUCCESS; --- linux.orig/mm/shmem.c~AA-PM-09-migrate-swapcache-validate 2005-09-30 12:39:49.000000000 -0700 +++ linux/mm/shmem.c 2005-09-30 12:40:03.000000000 -0700 @@ -92,7 +92,16 @@ * BLOCKS_PER_PAGE on indirect pages, assume PAGE_CACHE_SIZE: * might be reconsidered if it ever diverges from PAGE_SIZE. */ +#ifdef CONFIG_MEMORY_HOTPLUG + /* + * XXXX: This is temprary code, which should be replaced with proper one + * after the scheme to specify hot removable region has defined. + * 25/Sep/2004 -- taka + */ + return alloc_pages(gfp_mask & ~__GFP_HIGHMEM, PAGE_CACHE_SHIFT-PAGE_SHIFT); +#else return alloc_pages(gfp_mask, PAGE_CACHE_SHIFT-PAGE_SHIFT); +#endif } static inline void shmem_dir_free(struct page *page) @@ -1016,6 +1025,14 @@ page_cache_release(swappage); goto repeat; } + if (!PageSwapCache(swappage)) { + /* page-migration has occured */ + shmem_swp_unmap(entry); + spin_unlock(&info->lock); + unlock_page(swappage); + page_cache_release(swappage); + goto repeat; + } if (PageWriteback(swappage)) { shmem_swp_unmap(entry); spin_unlock(&info->lock); --- linux.orig/mm/sparse.c~FROM-MM-memory-hotplug-prep-__section_nr-helper-fix 2005-09-30 12:37:53.000000000 -0700 +++ linux/mm/sparse.c 2005-09-30 12:39:27.000000000 -0700 @@ -5,8 +5,10 @@ #include #include #include +#include #include #include +#include #include /* @@ -72,6 +74,31 @@ } #endif +/* + * Although written for the SPARSEMEM_EXTREME case, this happens + * to also work for the flat array case becase + * NR_SECTION_ROOTS==NR_MEM_SECTIONS. + */ +int __section_nr(struct mem_section* ms) +{ + unsigned long root_nr; + struct mem_section* root; + + for (root_nr = 0; + root_nr < NR_MEM_SECTIONS; + root_nr += SECTIONS_PER_ROOT) { + root = __nr_to_section(root_nr); + + if (!root) + continue; + + if ((ms >= root) && (ms < (root + SECTIONS_PER_ROOT))) + break; + } + + return (root_nr * SECTIONS_PER_ROOT) + (ms - root); +} + /* Record a memory area against a node. 
*/ void memory_present(int nid, unsigned long start, unsigned long end) { @@ -162,6 +189,45 @@ return NULL; } +static struct page *__kmalloc_section_memmap(unsigned long nr_pages) +{ + struct page *page, *ret; + unsigned long memmap_size = sizeof(struct page) * nr_pages; + + page = alloc_pages(GFP_KERNEL, get_order(memmap_size)); + if (page) + goto got_map_page; + + ret = vmalloc(memmap_size); + if (ret) + goto got_map_ptr; + + return NULL; +got_map_page: + ret = (struct page *)pfn_to_kaddr(page_to_pfn(page)); +got_map_ptr: + memset(ret, 0, memmap_size); + + return ret; +} + +static int vaddr_in_vmalloc_area(void *addr) +{ + if (addr >= (void *)VMALLOC_START && + addr < (void *)VMALLOC_END) + return 1; + return 0; +} + +static void __kfree_section_memmap(struct page *memmap, unsigned long nr_pages) +{ + if (vaddr_in_vmalloc_area(memmap)) + vfree(memmap); + else + free_pages((unsigned long)memmap, + get_order(sizeof(struct page) * nr_pages)); +} + /* * Allocate the accumulated non-linear sections, allocate a mem_map * for each and record the physical to section mapping. @@ -187,14 +253,37 @@ * set. If this is <=0, then that means that the passed-in * map was not consumed and must be freed. */ -int sparse_add_one_section(unsigned long start_pfn, int nr_pages, struct page *map) +int sparse_add_one_section(struct zone *zone, unsigned long start_pfn, + int nr_pages) { - struct mem_section *ms = __pfn_to_section(start_pfn); + unsigned long section_nr = pfn_to_section_nr(start_pfn); + struct pglist_data *pgdat = zone->zone_pgdat; + struct mem_section *ms; + struct page *memmap; + unsigned long flags; + int ret; - if (ms->section_mem_map & SECTION_MARKED_PRESENT) - return -EEXIST; + /* + * no locking for this, because it does its own + * plus, it does a kmalloc + */ + sparse_index_init(section_nr, pgdat->node_id); + memmap = __kmalloc_section_memmap(nr_pages); + + pgdat_resize_lock(pgdat, &flags); + ms = __pfn_to_section(start_pfn); + if (ms->section_mem_map & SECTION_MARKED_PRESENT) { + ret = -EEXIST; + goto out; + } ms->section_mem_map |= SECTION_MARKED_PRESENT; - return sparse_init_one_section(ms, pfn_to_section_nr(start_pfn), map); + ret = sparse_init_one_section(ms, section_nr, memmap); + + if (ret <= 0) + __kfree_section_memmap(memmap, nr_pages); +out: + pgdat_resize_unlock(pgdat, &flags); + return ret; } --- linux.orig/mm/swap_state.c~AA-PM-05-swapper_space-gfpmask 2005-09-30 12:39:47.000000000 -0700 +++ linux/mm/swap_state.c 2005-09-30 12:39:47.000000000 -0700 @@ -37,6 +37,7 @@ .page_tree = RADIX_TREE_INIT(GFP_ATOMIC|__GFP_NOWARN), .tree_lock = RW_LOCK_UNLOCKED, .a_ops = &swap_aops, + .flags = GFP_HIGHUSER, .i_mmap_nonlinear = LIST_HEAD_INIT(swapper_space.i_mmap_nonlinear), .backing_dev_info = &swap_backing_dev_info, }; @@ -141,7 +142,7 @@ * Allocate swap space for the page and add the page to the * swap cache. Caller needs to hold the page lock. */ -int add_to_swap(struct page * page) +int add_to_swap(struct page * page, unsigned int gfp_mask) { swp_entry_t entry; int err; @@ -166,7 +167,7 @@ * Add it to the swap cache and mark it dirty */ err = __add_to_swap_cache(page, entry, - GFP_ATOMIC|__GFP_NOMEMALLOC|__GFP_NOWARN); + gfp_mask|__GFP_NOMEMALLOC|__GFP_NOWARN); switch (err) { case 0: /* Success */ --- linux.orig/mm/swapfile.c~AA-PM-09-migrate-swapcache-validate 2005-09-30 12:39:49.000000000 -0700 +++ linux/mm/swapfile.c 2005-09-30 12:39:57.000000000 -0700 @@ -323,11 +323,12 @@ * Work out if there are any other processes sharing this * swap cache page. Free it if you can. 
--- linux.orig/mm/swapfile.c~AA-PM-09-migrate-swapcache-validate	2005-09-30 12:39:49.000000000 -0700
+++ linux/mm/swapfile.c	2005-09-30 12:39:57.000000000 -0700
@@ -323,11 +323,12 @@
  * Work out if there are any other processes sharing this
  * swap cache page. Free it if you can. Return success.
  */
-int remove_exclusive_swap_page(struct page *page)
+int __remove_exclusive_swap_page(struct page *page, int force)
 {
 	int retval;
 	struct swap_info_struct * p;
 	swp_entry_t entry;
+	int mapcount = force ? page_mapcount(page) : 0;

 	BUG_ON(PagePrivate(page));
 	BUG_ON(!PageLocked(page));
@@ -336,7 +337,7 @@
 		return 0;
 	if (PageWriteback(page))
 		return 0;
-	if (page_count(page) != 2) /* 2: us + cache */
+	if (page_count(page) - mapcount != 2) /* 2: us + cache */
 		return 0;

 	entry.val = page->private;
@@ -349,7 +350,8 @@
 	if (p->swap_map[swp_offset(entry)] == 1) {
 		/* Recheck the page count with the swapcache lock held.. */
 		write_lock_irq(&swapper_space.tree_lock);
-		if ((page_count(page) == 2) && !PageWriteback(page)) {
+		mapcount = force ? page_mapcount(page) : 0;
+		if ((page_count(page) - mapcount == 2) && !PageWriteback(page)) {
 			__delete_from_swap_cache(page);
 			SetPageDirty(page);
 			retval = 1;
@@ -629,6 +631,7 @@
 		 */
 		swap_map = &si->swap_map[i];
 		entry = swp_entry(type, i);
+again:
 		page = read_swap_cache_async(entry, NULL, 0);
 		if (!page) {
 			/*
@@ -663,6 +666,12 @@
 		wait_on_page_locked(page);
 		wait_on_page_writeback(page);
 		lock_page(page);
+		if (!PageSwapCache(page)) {
+			/* page migration has occurred */
+			unlock_page(page);
+			page_cache_release(page);
+			goto again;
+		}
 		wait_on_page_writeback(page);

 		/*
--- linux.orig/mm/thrash.c~AA-PM-15-swap_token-kthread	2005-09-30 12:39:56.000000000 -0700
+++ linux/mm/thrash.c	2005-09-30 12:39:56.000000000 -0700
@@ -54,6 +54,9 @@
 	struct mm_struct *mm;
 	int reason;

+	if (current->mm == NULL)
+		return;
+
 	/* We have the token.  Let others know we still need it. */
 	if (has_swap_token(current->mm)) {
 		current->mm->recent_pagein = 1;
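Two notes on the mm/swapfile.c change above. First, the counting behind the new test: a page that sits in the swap cache and is referenced by the caller contributes two references, and each mapped PTE contributes one more, so subtracting page_mapcount() turns the old "page_count(page) == 2" check into one that tolerates still-mapped pages when force is set. Second, callers that want the old behaviour simply pass force == 0; a wrapper of the following shape would preserve the original interface. Whether the series adds exactly this (for example in include/linux/swap.h) is not visible in this excerpt, so treat the sketch as an assumption:

/*
 * Assumed compatibility wrapper -- not shown in this excerpt.  With
 * force == 0 the mapcount subtraction is skipped, so the behaviour is
 * identical to the old remove_exclusive_swap_page().
 */
static inline int remove_exclusive_swap_page(struct page *page)
{
	return __remove_exclusive_swap_page(page, 0);
}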
--- linux.orig/mm/truncate.c~AA-PM-11.0-migrate-truncate	2005-09-30 12:39:51.000000000 -0700
+++ linux/mm/truncate.c	2005-09-30 12:39:52.000000000 -0700
@@ -90,6 +90,34 @@
 	return 1;
 }

+static inline struct page *lock_replace_page(struct page **p, struct address_space *mapping)
+{
+	struct page *page = *p;
+	struct page *newpage;
+
+	lock_page(page);
+
+	if (page->mapping != NULL)
+		return page;
+
+	unlock_page(page);
+
+	newpage = find_lock_page(mapping, page->index);
+	if (!newpage) {
+		/*
+		 * put the page back the way it was and let
+		 * the normal truncate code handle it
+		 */
+		lock_page(page);
+		return page;
+	}
+
+	/* memory migration has been rolled back. */
+	page_cache_release(page);
+	*p = newpage;
+	return newpage;
+}
+
 /**
  * truncate_inode_pages - truncate *all* the pages from an offset
  * @mapping: mapping to truncate
@@ -140,6 +168,9 @@
 				unlock_page(page);
 				continue;
 			}
+			/* page->mapping check is done in
+			 * truncate_complete_page() when the page has been
+			 * migrated. */
 			truncate_complete_page(mapping, page);
 			unlock_page(page);
 		}
@@ -167,9 +198,9 @@
 			continue;
 		}
 		for (i = 0; i < pagevec_count(&pvec); i++) {
-			struct page *page = pvec.pages[i];
+			struct page *page;

-			lock_page(page);
+			page = lock_replace_page(&pvec.pages[i], mapping);
 			wait_on_page_writeback(page);
 			if (page->index > next)
 				next = page->index;
@@ -267,11 +298,11 @@
 		pagevec_lookup(&pvec, mapping, next,
 			min(end - next, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) {
 		for (i = 0; !ret && i < pagevec_count(&pvec); i++) {
-			struct page *page = pvec.pages[i];
+			struct page *page;
 			pgoff_t page_index;
 			int was_dirty;

-			lock_page(page);
+			page = lock_replace_page(&pvec.pages[i], mapping);
 			if (page->mapping != mapping) {
 				unlock_page(page);
 				continue;
--- linux.orig/mm/vmalloc.c~AA-PM-26-vmalloc	2005-09-30 12:40:03.000000000 -0700
+++ linux/mm/vmalloc.c	2005-09-30 12:40:03.000000000 -0700
@@ -476,7 +476,16 @@
  */
 void *vmalloc(unsigned long size)
 {
+#ifdef CONFIG_MEMORY_HOTPLUG
+	/*
+	 * XXXX: This is temporary code, which should be replaced with a
+	 * proper implementation once the scheme for specifying
+	 * hot-removable regions has been defined.  25/Sep/2004 -- taka
+	 */
+	return __vmalloc(size, GFP_KERNEL, PAGE_KERNEL);
+#else
 	return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL);
+#endif
 }

 EXPORT_SYMBOL(vmalloc);
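The mm/vmalloc.c hunk above is the second copy of the same interim workaround that appeared in mm/shmem.c earlier: while there is no way yet to declare a region hot-removable, allocations that would otherwise come from highmem are forced into lowmem by masking off __GFP_HIGHMEM. If more call sites grow the same #ifdef, a small helper of the shape below (hypothetical, not part of the series) would keep the policy in one place until the real interface exists:

#include <linux/gfp.h>

/*
 * Hypothetical helper, not part of the series: strip __GFP_HIGHMEM
 * while memory hotplug has no way to mark hot-removable regions.
 */
static inline unsigned int hotremove_gfp_mask(unsigned int gfp_mask)
{
#ifdef CONFIG_MEMORY_HOTPLUG
	return gfp_mask & ~__GFP_HIGHMEM;
#else
	return gfp_mask;
#endif
}

shmem_dir_alloc() and vmalloc() could then pass hotremove_gfp_mask(gfp_mask) and hotremove_gfp_mask(GFP_KERNEL | __GFP_HIGHMEM) respectively, which preserves the behaviour of both hunks as written.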
--- linux.orig/mm/vmscan.c~AA-PM-01-steal_page_from_lru	2005-09-30 12:38:29.000000000 -0700
+++ linux/mm/vmscan.c	2005-09-30 12:40:05.000000000 -0700
@@ -39,51 +39,6 @@
 #include

-/* possible outcome of pageout() */
-typedef enum {
-	/* failed to write page out, page is locked */
-	PAGE_KEEP,
-	/* move page to the active list, page is locked */
-	PAGE_ACTIVATE,
-	/* page has been sent to the disk successfully, page is unlocked */
-	PAGE_SUCCESS,
-	/* page is clean and locked */
-	PAGE_CLEAN,
-} pageout_t;
-
-struct scan_control {
-	/* Ask refill_inactive_zone, or shrink_cache to scan this many pages */
-	unsigned long nr_to_scan;
-
-	/* Incremented by the number of inactive pages that were scanned */
-	unsigned long nr_scanned;
-
-	/* Incremented by the number of pages reclaimed */
-	unsigned long nr_reclaimed;
-
-	unsigned long nr_mapped;	/* From page_state */
-
-	/* How many pages shrink_cache() should reclaim */
-	int nr_to_reclaim;
-
-	/* Ask shrink_caches, or shrink_zone to scan at this priority */
-	unsigned int priority;
-
-	/* This context's GFP mask */
-	unsigned int gfp_mask;
-
-	int may_writepage;
-
-	/* Can pages be swapped as part of reclaim? */
-	int may_swap;
-
-	/* This context's SWAP_CLUSTER_MAX. If freeing memory for
-	 * suspend, we effectively ignore SWAP_CLUSTER_MAX.
-	 * In this context, it doesn't matter that we scan the
-	 * whole list at once. */
-	int swap_cluster_max;
-};
-
 /*
  * The list of shrinker callbacks used by to apply pressure to
  * ageable caches.
@@ -302,7 +257,7 @@
 /*
  * pageout is called by shrink_list() for each dirty page. Calls ->writepage().
  */
-static pageout_t pageout(struct page *page, struct address_space *mapping)
+pageout_t pageout(struct page *page, struct address_space *mapping)
 {
 	/*
 	 * If the page is dirty, only perform writeback if that write
@@ -373,7 +328,7 @@
 /*
  * shrink_list adds the number of reclaimed pages to sc->nr_reclaimed
 */
-static int shrink_list(struct list_head *page_list, struct scan_control *sc)
+int shrink_list(struct list_head *page_list, struct scan_control *sc)
 {
 	LIST_HEAD(ret_pages);
 	struct pagevec freed_pvec;
@@ -418,7 +373,7 @@
 		 * Try to allocate it some swap space here.
 		 */
 		if (PageAnon(page) && !PageSwapCache(page) && sc->may_swap) {
-			if (!add_to_swap(page))
+			if (!add_to_swap(page, GFP_ATOMIC))
 				goto activate_locked;
 		}
 #endif /* CONFIG_SWAP */
@@ -432,7 +387,7 @@
 		 * processes. Try to unmap it here.
 		 */
 		if (page_mapped(page) && mapping) {
-			switch (try_to_unmap(page)) {
+			switch (try_to_unmap(page, NULL)) {
 			case SWAP_FAIL:
 				goto activate_locked;
 			case SWAP_AGAIN:
@@ -582,22 +537,8 @@
 	while (scan++ < nr_to_scan && !list_empty(src)) {
 		page = lru_to_page(src);
 		prefetchw_prev_lru_page(page, src, flags);
-
-		if (!TestClearPageLRU(page))
-			BUG();
-		list_del(&page->lru);
-		if (get_page_testone(page)) {
-			/*
-			 * It is being freed elsewhere
-			 */
-			__put_page(page);
-			SetPageLRU(page);
-			list_add(&page->lru, src);
-			continue;
-		} else {
-			list_add(&page->lru, dst);
+		if (isolate_lru_onepage(page, src, dst))
 			nr_taken++;
-		}
 	}

 	*scanned = scan;
@@ -650,13 +591,10 @@
 	 */
 	while (!list_empty(&page_list)) {
 		page = lru_to_page(&page_list);
-		if (TestSetPageLRU(page))
-			BUG();
 		list_del(&page->lru);
-		if (PageActive(page))
-			add_page_to_active_list(zone, page);
-		else
-			add_page_to_inactive_list(zone, page);
+		if (PageActive(page))
+			ClearPageActive(page);
+		__putback_page_to_lru(zone, page);
 		if (!pagevec_add(&pvec, page)) {
 			spin_unlock_irq(&zone->lru_lock);
 			__pagevec_release(&pvec);
--- linux.orig/net/sunrpc/xprt.c~A9.1-xprt-warnings	2005-09-30 12:38:08.000000000 -0700
+++ linux/net/sunrpc/xprt.c	2005-09-30 12:38:08.000000000 -0700
@@ -825,13 +825,13 @@
 	if (len > desc->count)
 		len = desc->count;
 	if (skb_copy_bits(desc->skb, desc->offset, p, len)) {
-		dprintk("RPC: failed to copy %zu bytes from skb. %zu bytes remain\n",
+		dprintk("RPC: failed to copy %u bytes from skb. %u bytes remain\n",
			len, desc->count);
		return 0;
	}
	desc->offset += len;
	desc->count -= len;
-	dprintk("RPC: copied %zu bytes from skb. %zu bytes remain\n",
+	dprintk("RPC: copied %u bytes from skb. %u bytes remain\n",
		len, desc->count);
	return len;
 }
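isolate_lru_onepage() and __putback_page_to_lru(), which the mm/vmscan.c hunks above start using, are introduced elsewhere in this series and are not visible in this excerpt. Based on the code removed from isolate_lru_pages(), the isolation helper presumably looks roughly like the sketch below; treat it as a reconstruction, not the series' actual implementation.

#include <linux/mm.h>
#include <linux/list.h>

/*
 * Sketch reconstructed from the code deleted above: claim one page
 * from the LRU list @src.  Returns 1 and moves the page to @dst on
 * success; returns 0 and puts the page back on @src if the page is
 * already being freed elsewhere.
 */
static int isolate_lru_onepage(struct page *page, struct list_head *src,
			       struct list_head *dst)
{
	if (!TestClearPageLRU(page))
		BUG();
	list_del(&page->lru);
	if (get_page_testone(page)) {
		/* it is being freed elsewhere */
		__put_page(page);
		SetPageLRU(page);
		list_add(&page->lru, src);
		return 0;
	}
	list_add(&page->lru, dst);
	return 1;
}

The caller in isolate_lru_pages() then only counts successful isolations; the putback side defers to __putback_page_to_lru(), which is likewise added elsewhere in the series and is not reconstructed here.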