--- linux.orig/arch/alpha/mm/numa.c~FROM-MM-memory-hotplug-locking-node_size_lock 2005-09-30 12:37:54.000000000 -0700 +++ linux/arch/alpha/mm/numa.c 2005-09-30 12:38:06.000000000 -0700 @@ -279,7 +279,7 @@ initrd_end, phys_to_virt(PFN_PHYS(max_low_pfn))); } else { - nid = kvaddr_to_nid(initrd_start); + nid = pa_to_nid(__pa(initrd_start)); reserve_bootmem_node(NODE_DATA(nid), virt_to_phys((void *)initrd_start), INITRD_SIZE); @@ -371,6 +371,8 @@ show_free_areas(); printk("Free swap: %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10)); for_each_online_node(nid) { + unsigned long flags; + pgdat_resize_lock(NODE_DATA(nid), &flags); i = node_spanned_pages(nid); while (i-- > 0) { struct page *page = nid_page_nr(nid, i); @@ -384,6 +386,7 @@ else shared += page_count(page) - 1; } + pgdat_resize_unlock(NODE_DATA(nid), &flags); } printk("%ld pages of RAM\n",total); printk("%ld free pages\n",free); --- linux.orig/arch/i386/Kconfig~C2-enable-i386-sparsemem-debug 2005-09-30 12:38:18.000000000 -0700 +++ linux/arch/i386/Kconfig 2005-09-30 12:38:22.000000000 -0700 @@ -776,6 +776,9 @@ depends on NUMA default y +config ARCH_HAS_BOOTPA + def_bool y + config ARCH_HAVE_MEMORY_PRESENT bool depends on DISCONTIGMEM @@ -799,9 +802,27 @@ def_bool y depends on NUMA +config X86_SPARSEMEM_DEBUG_NONUMA + bool "Enable SPARSEMEM on flat systems (debugging only)" + depends on !NUMA && EXPERIMENTAL + select SPARSEMEM_STATIC + select SPARSEMEM_MANUAL + +config ARCH_MEMORY_PROBE + def_bool y + depends on X86_SPARSEMEM_DEBUG_NONUMA + +config ARCH_SPARSEMEM_DEFAULT + def_bool y + depends on X86_SPARSEMEM_DEBUG_NONUMA + +config X86_SIMULATED_MEM_HOTPLUG + bool "Simulate memory hotplug on non-hotplug hardware" + depends on EXPERIMENTAL + config ARCH_SPARSEMEM_ENABLE def_bool y - depends on NUMA + depends on NUMA || X86_SPARSEMEM_DEBUG_NONUMA config ARCH_SELECT_MEMORY_MODEL def_bool y --- linux.orig/arch/i386/kernel/setup.c~B2.1-i386-discontig-consolidation 2005-09-30 12:38:12.000000000 -0700 +++ linux/arch/i386/kernel/setup.c 2005-09-30 12:38:20.000000000 -0700 @@ -51,6 +51,7 @@ #include #include #include +#include #include #include #include @@ -146,6 +147,7 @@ EXPORT_SYMBOL(ist_info); #endif struct e820map e820; +struct e820map bios_e820; extern void early_cpu_init(void); extern void dmi_scan_machine(void); @@ -365,6 +367,37 @@ } } +/* + * numa interface - we expect the numa architecture specific code to have + * populated the following initialisation. + * + * 1) node_online_map - the map of all nodes configured (online) in the system + * 2) node_start_pfn - the starting page frame number for a node + * 3) node_end_pfn - the ending page frame number for a node + */ +unsigned long node_start_pfn[MAX_NUMNODES] __read_mostly; +unsigned long node_end_pfn[MAX_NUMNODES] __read_mostly; +bootmem_data_t node0_bdata; + +/* + * FLAT - support for basic PC memory model with discontig enabled, essentially + * a single node with all available processors in it with a flat + * memory map. + */ +int __init get_memcfg_numa_flat(void) +{ + printk("NUMA - single node, flat memory mode\n"); + + /* Run the memory configuration and find the top of memory. */ + node_start_pfn[0] = 0; + node_end_pfn[0] = max_pfn; + + /* Indicate there is one node available. 
*/ + nodes_clear(node_online_map); + node_set_online(0); + return 1; +} + static void __init limit_regions(unsigned long long size) { unsigned long long current_addr = 0; @@ -946,6 +979,12 @@ return 0; } +static int __init +efi_memory_present_wrapper(unsigned long start, unsigned long end, void *arg) +{ + memory_present(0, start, end); + return 0; +} /* * Find the highest page frame number we have available @@ -957,6 +996,7 @@ max_pfn = 0; if (efi_enabled) { efi_memmap_walk(efi_find_max_pfn, &max_pfn); + efi_memmap_walk(efi_memory_present_wrapper, NULL); return; } @@ -971,6 +1011,7 @@ continue; if (end > max_pfn) max_pfn = end; + memory_present(0, start, end); } } @@ -1106,59 +1147,112 @@ reserve_bootmem(addr, PAGE_SIZE); } -#ifndef CONFIG_NEED_MULTIPLE_NODES -void __init setup_bootmem_allocator(void); -static unsigned long __init setup_memory(void) +static void __init find_max_pfn_node(int nid) { + if (node_end_pfn[nid] > max_pfn) + node_end_pfn[nid] = max_pfn; /* - * partially used pages are not usable - thus - * we are rounding upwards: + * if a user has given mem=XXXX, then we need to make sure + * that the node _starts_ before that, too, not just ends */ - min_low_pfn = PFN_UP(init_pg_tables_end); + if (node_start_pfn[nid] > max_pfn) + node_start_pfn[nid] = max_pfn; + if (node_start_pfn[nid] > node_end_pfn[nid]) + BUG(); +} + +void __init setup_bootmem_allocator(void); +unsigned long __init setup_memory(void) +{ + int nid; + unsigned long reserve_pages; + /* + * When mapping a NUMA machine we allocate the node_mem_map arrays + * from node local memory. They are then mapped directly into KVA + * between zone normal and vmalloc space. Calculate the size of + * this space and use it to adjust the boundry between ZONE_NORMAL + * and ZONE_HIGHMEM. + */ find_max_pfn(); + get_memcfg_numa(); + for_each_online_node(nid) + num_physpages = max(num_physpages, node_end_pfn[nid]); - max_low_pfn = find_max_low_pfn(); + reserve_pages = calculate_numa_remap_pages(); + /* partially used pages are not usable - thus round upwards */ + min_low_pfn = PFN_UP(init_pg_tables_end); + max_low_pfn = find_max_low_pfn() - reserve_pages; + + if (reserve_pages) + printk("reserve_pages = %ld find_max_low_pfn() ~ %ld\n", + reserve_pages, max_low_pfn + reserve_pages); + printk(KERN_DEBUG "max_pfn = %ld\n", max_pfn); #ifdef CONFIG_HIGHMEM - highstart_pfn = highend_pfn = max_pfn; - if (max_pfn > max_low_pfn) { - highstart_pfn = max_low_pfn; - } printk(KERN_NOTICE "%ldMB HIGHMEM available.\n", - pages_to_mb(highend_pfn - highstart_pfn)); + pages_to_mb(max_pfn - max_low_pfn)); #endif printk(KERN_NOTICE "%ldMB LOWMEM available.\n", - pages_to_mb(max_low_pfn)); + pages_to_mb(max_low_pfn - min_low_pfn)); + printk(KERN_DEBUG "min_low_pfn = %ld, max_low_pfn = %ld\n", + min_low_pfn, max_low_pfn); + + printk(KERN_NOTICE "Low memory ends at vaddr %08lx\n", + (ulong) pfn_to_kaddr(max_low_pfn)); + setup_numa_kva_remap(); + printk("High memory starts at vaddr %08lx\n", + (ulong) pfn_to_kaddr(max_low_pfn)); + vmalloc_earlyreserve = reserve_pages * PAGE_SIZE; + for_each_online_node(nid) + find_max_pfn_node(nid); setup_bootmem_allocator(); - return max_low_pfn; } -void __init zone_sizes_init(void) +static inline unsigned long max_hardware_dma_pfn(void) +{ + return virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT; +} +static inline unsigned long nid_size_pages(int nid) +{ + return node_end_pfn[nid] - node_start_pfn[nid]; +} +static inline int nid_starts_in_highmem(int nid) +{ + return node_start_pfn[nid] >= max_low_pfn; +} + +void 
__init nid_zone_sizes_init(int nid) { unsigned long zones_size[MAX_NR_ZONES] = {0, 0, 0}; - unsigned int max_dma, low; + unsigned long max_dma; + unsigned long start = node_start_pfn[nid]; + unsigned long end = node_end_pfn[nid]; + + if (node_has_online_mem(nid)){ + if (nid_starts_in_highmem(nid)) { + zones_size[ZONE_HIGHMEM] = nid_size_pages(nid); + } else { + max_dma = min(max_hardware_dma_pfn(), max_low_pfn); + zones_size[ZONE_DMA] = max_dma; + zones_size[ZONE_NORMAL] = max_low_pfn - max_dma; + zones_size[ZONE_HIGHMEM] = end - max_low_pfn; + } + } - max_dma = virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT; - low = max_low_pfn; + free_area_init_node(nid, NODE_DATA(nid), zones_size, start, + get_zholes_size(nid)); +} - if (low < max_dma) - zones_size[ZONE_DMA] = low; - else { - zones_size[ZONE_DMA] = max_dma; - zones_size[ZONE_NORMAL] = low - max_dma; -#ifdef CONFIG_HIGHMEM - zones_size[ZONE_HIGHMEM] = highend_pfn - low; -#endif - } - free_area_init(zones_size); +void __init zone_sizes_init(void) +{ + int nid; + + for_each_online_node(nid) + nid_zone_sizes_init(nid); } -#else -extern unsigned long __init setup_memory(void); -extern void zone_sizes_init(void); -#endif /* !CONFIG_NEED_MULTIPLE_NODES */ void __init setup_bootmem_allocator(void) { @@ -1520,6 +1614,7 @@ else { printk(KERN_INFO "BIOS-provided physical RAM map:\n"); print_memory_map(machine_specific_memory_setup()); + bios_e820 = e820; } copy_edd(); --- linux.orig/arch/i386/mm/Makefile~B2.2-i386-create-numa.c 2005-09-30 12:38:13.000000000 -0700 +++ linux/arch/i386/mm/Makefile 2005-09-30 12:38:13.000000000 -0700 @@ -4,7 +4,8 @@ obj-y := init.o pgtable.o fault.o ioremap.o extable.o pageattr.o mmap.o -obj-$(CONFIG_NUMA) += discontig.o +obj-$(CONFIG_DISCONTIGMEM) += discontig.o +obj-$(CONFIG_NUMA) += numa.o obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o obj-$(CONFIG_HIGHMEM) += highmem.o obj-$(CONFIG_BOOT_IOREMAP) += boot_ioremap.o --- linux.orig/arch/i386/mm/discontig.c~FROM-MM-memory-hotplug-i386-addition-functions 2005-09-30 12:37:59.000000000 -0700 +++ linux/arch/i386/mm/discontig.c 2005-09-30 12:38:13.000000000 -0700 @@ -32,28 +32,10 @@ #include #include -#include #include #include #include -struct pglist_data *node_data[MAX_NUMNODES] __read_mostly; -EXPORT_SYMBOL(node_data); -bootmem_data_t node0_bdata; - -/* - * numa interface - we expect the numa architecture specfic code to have - * populated the following initialisation. 
- * - * 1) node_online_map - the map of all nodes configured (online) in the system - * 2) node_start_pfn - the starting page frame number for a node - * 3) node_end_pfn - the ending page fram number for a node - */ -unsigned long node_start_pfn[MAX_NUMNODES] __read_mostly; -unsigned long node_end_pfn[MAX_NUMNODES] __read_mostly; - - -#ifdef CONFIG_DISCONTIGMEM /* * 4) physnode_map - the mapping between a pfn and owning node * physnode_map keeps track of the physical memory layout of a generic @@ -94,342 +76,3 @@ return (nr_pages + 1) * sizeof(struct page); } -#endif - -extern unsigned long find_max_low_pfn(void); -extern void find_max_pfn(void); -extern void one_highpage_init(struct page *, int, int); - -extern struct e820map e820; -extern unsigned long init_pg_tables_end; -extern unsigned long highend_pfn, highstart_pfn; -extern unsigned long max_low_pfn; -extern unsigned long totalram_pages; -extern unsigned long totalhigh_pages; - -#define LARGE_PAGE_BYTES (PTRS_PER_PTE * PAGE_SIZE) - -unsigned long node_remap_start_pfn[MAX_NUMNODES]; -unsigned long node_remap_size[MAX_NUMNODES]; -unsigned long node_remap_offset[MAX_NUMNODES]; -void *node_remap_start_vaddr[MAX_NUMNODES]; -void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags); - -void *node_remap_end_vaddr[MAX_NUMNODES]; -void *node_remap_alloc_vaddr[MAX_NUMNODES]; - -/* - * FLAT - support for basic PC memory model with discontig enabled, essentially - * a single node with all available processors in it with a flat - * memory map. - */ -int __init get_memcfg_numa_flat(void) -{ - printk("NUMA - single node, flat memory mode\n"); - - /* Run the memory configuration and find the top of memory. */ - find_max_pfn(); - node_start_pfn[0] = 0; - node_end_pfn[0] = max_pfn; - memory_present(0, 0, max_pfn); - - /* Indicate there is one node available. */ - nodes_clear(node_online_map); - node_set_online(0); - return 1; -} - -/* - * Find the highest page frame number we have available for the node - */ -static void __init find_max_pfn_node(int nid) -{ - if (node_end_pfn[nid] > max_pfn) - node_end_pfn[nid] = max_pfn; - /* - * if a user has given mem=XXXX, then we need to make sure - * that the node _starts_ before that, too, not just ends - */ - if (node_start_pfn[nid] > max_pfn) - node_start_pfn[nid] = max_pfn; - if (node_start_pfn[nid] > node_end_pfn[nid]) - BUG(); -} - -/* Find the owning node for a pfn. */ -int early_pfn_to_nid(unsigned long pfn) -{ - int nid; - - for_each_node(nid) { - if (node_end_pfn[nid] == 0) - break; - if (node_start_pfn[nid] <= pfn && node_end_pfn[nid] >= pfn) - return nid; - } - - return 0; -} - -/* - * Allocate memory for the pg_data_t for this node via a crude pre-bootmem - * method. For node zero take this from the bottom of memory, for - * subsequent nodes place them at node_remap_start_vaddr which contains - * node local data in physically node local memory. See setup_memory() - * for details. 
- */ -static void __init allocate_pgdat(int nid) -{ - if (nid && node_has_online_mem(nid)) - NODE_DATA(nid) = (pg_data_t *)node_remap_start_vaddr[nid]; - else { - NODE_DATA(nid) = (pg_data_t *)(__va(min_low_pfn << PAGE_SHIFT)); - min_low_pfn += PFN_UP(sizeof(pg_data_t)); - } -} - -void *alloc_remap(int nid, unsigned long size) -{ - void *allocation = node_remap_alloc_vaddr[nid]; - - size = ALIGN(size, L1_CACHE_BYTES); - - if (!allocation || (allocation + size) >= node_remap_end_vaddr[nid]) - return 0; - - node_remap_alloc_vaddr[nid] += size; - memset(allocation, 0, size); - - return allocation; -} - -void __init remap_numa_kva(void) -{ - void *vaddr; - unsigned long pfn; - int node; - - for_each_online_node(node) { - for (pfn=0; pfn < node_remap_size[node]; pfn += PTRS_PER_PTE) { - vaddr = node_remap_start_vaddr[node]+(pfn< max_pfn) - continue; - if (node_end_pfn[nid] > max_pfn) - node_end_pfn[nid] = max_pfn; - - /* ensure the remap includes space for the pgdat. */ - size = node_remap_size[nid] + sizeof(pg_data_t); - - /* convert size to large (pmd size) pages, rounding up */ - size = (size + LARGE_PAGE_BYTES - 1) / LARGE_PAGE_BYTES; - /* now the roundup is correct, convert to PAGE_SIZE pages */ - size = size * PTRS_PER_PTE; - - /* - * Validate the region we are allocating only contains valid - * pages. - */ - for (pfn = node_end_pfn[nid] - size; - pfn < node_end_pfn[nid]; pfn++) - if (!page_is_ram(pfn)) - break; - - if (pfn != node_end_pfn[nid]) - size = 0; - - printk("Reserving %ld pages of KVA for lmem_map of node %d\n", - size, nid); - node_remap_size[nid] = size; - node_remap_offset[nid] = reserve_pages; - reserve_pages += size; - printk("Shrinking node %d from %ld pages to %ld pages\n", - nid, node_end_pfn[nid], node_end_pfn[nid] - size); - - if (node_end_pfn[nid] & (PTRS_PER_PTE-1)) { - /* - * Align node_end_pfn[] and node_remap_start_pfn[] to - * pmd boundary. remap_numa_kva will barf otherwise. - */ - printk("Shrinking node %d further by %ld pages for proper alignment\n", - nid, node_end_pfn[nid] & (PTRS_PER_PTE-1)); - size += node_end_pfn[nid] & (PTRS_PER_PTE-1); - } - - node_end_pfn[nid] -= size; - node_remap_start_pfn[nid] = node_end_pfn[nid]; - } - printk("Reserving total of %ld pages for numa KVA remap\n", - reserve_pages); - return reserve_pages; -} - -extern void setup_bootmem_allocator(void); -unsigned long __init setup_memory(void) -{ - int nid; - unsigned long system_start_pfn, system_max_low_pfn; - unsigned long reserve_pages; - - /* - * When mapping a NUMA machine we allocate the node_mem_map arrays - * from node local memory. They are then mapped directly into KVA - * between zone normal and vmalloc space. Calculate the size of - * this space and use it to adjust the boundry between ZONE_NORMAL - * and ZONE_HIGHMEM. 
- */ - find_max_pfn(); - get_memcfg_numa(); - - reserve_pages = calculate_numa_remap_pages(); - - /* partially used pages are not usable - thus round upwards */ - system_start_pfn = min_low_pfn = PFN_UP(init_pg_tables_end); - - system_max_low_pfn = max_low_pfn = find_max_low_pfn() - reserve_pages; - printk("reserve_pages = %ld find_max_low_pfn() ~ %ld\n", - reserve_pages, max_low_pfn + reserve_pages); - printk("max_pfn = %ld\n", max_pfn); -#ifdef CONFIG_HIGHMEM - highstart_pfn = highend_pfn = max_pfn; - if (max_pfn > system_max_low_pfn) - highstart_pfn = system_max_low_pfn; - printk(KERN_NOTICE "%ldMB HIGHMEM available.\n", - pages_to_mb(highend_pfn - highstart_pfn)); -#endif - printk(KERN_NOTICE "%ldMB LOWMEM available.\n", - pages_to_mb(system_max_low_pfn)); - printk("min_low_pfn = %ld, max_low_pfn = %ld, highstart_pfn = %ld\n", - min_low_pfn, max_low_pfn, highstart_pfn); - - printk("Low memory ends at vaddr %08lx\n", - (ulong) pfn_to_kaddr(max_low_pfn)); - for_each_online_node(nid) { - node_remap_start_vaddr[nid] = pfn_to_kaddr( - highstart_pfn + node_remap_offset[nid]); - /* Init the node remap allocator */ - node_remap_end_vaddr[nid] = node_remap_start_vaddr[nid] + - (node_remap_size[nid] * PAGE_SIZE); - node_remap_alloc_vaddr[nid] = node_remap_start_vaddr[nid] + - ALIGN(sizeof(pg_data_t), PAGE_SIZE); - - allocate_pgdat(nid); - printk ("node %d will remap to vaddr %08lx - %08lx\n", nid, - (ulong) node_remap_start_vaddr[nid], - (ulong) pfn_to_kaddr(highstart_pfn - + node_remap_offset[nid] + node_remap_size[nid])); - } - printk("High memory starts at vaddr %08lx\n", - (ulong) pfn_to_kaddr(highstart_pfn)); - vmalloc_earlyreserve = reserve_pages * PAGE_SIZE; - for_each_online_node(nid) - find_max_pfn_node(nid); - - memset(NODE_DATA(0), 0, sizeof(struct pglist_data)); - NODE_DATA(0)->bdata = &node0_bdata; - setup_bootmem_allocator(); - return max_low_pfn; -} - -void __init zone_sizes_init(void) -{ - int nid; - - /* - * Insert nodes into pgdat_list backward so they appear in order. - * Clobber node 0's links and NULL out pgdat_list before starting. 
- */ - pgdat_list = NULL; - for (nid = MAX_NUMNODES - 1; nid >= 0; nid--) { - if (!node_online(nid)) - continue; - NODE_DATA(nid)->pgdat_next = pgdat_list; - pgdat_list = NODE_DATA(nid); - } - - for_each_online_node(nid) { - unsigned long zones_size[MAX_NR_ZONES] = {0, 0, 0}; - unsigned long *zholes_size; - unsigned int max_dma; - - unsigned long low = max_low_pfn; - unsigned long start = node_start_pfn[nid]; - unsigned long high = node_end_pfn[nid]; - - max_dma = virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT; - - if (node_has_online_mem(nid)){ - if (start > low) { -#ifdef CONFIG_HIGHMEM - BUG_ON(start > high); - zones_size[ZONE_HIGHMEM] = high - start; -#endif - } else { - if (low < max_dma) - zones_size[ZONE_DMA] = low; - else { - BUG_ON(max_dma > low); - BUG_ON(low > high); - zones_size[ZONE_DMA] = max_dma; - zones_size[ZONE_NORMAL] = low - max_dma; -#ifdef CONFIG_HIGHMEM - zones_size[ZONE_HIGHMEM] = high - low; -#endif - } - } - } - - zholes_size = get_zholes_size(nid); - - free_area_init_node(nid, NODE_DATA(nid), zones_size, start, - zholes_size); - } - return; -} - -void __init set_highmem_pages_init(int bad_ppro) -{ -#ifdef CONFIG_HIGHMEM - struct zone *zone; - struct page *page; - - for_each_zone(zone) { - unsigned long node_pfn, zone_start_pfn, zone_end_pfn; - - if (!is_highmem(zone)) - continue; - - zone_start_pfn = zone->zone_start_pfn; - zone_end_pfn = zone_start_pfn + zone->spanned_pages; - - printk("Initializing %s for node %d (%08lx:%08lx)\n", - zone->name, zone->zone_pgdat->node_id, - zone_start_pfn, zone_end_pfn); - - for (node_pfn = zone_start_pfn; node_pfn < zone_end_pfn; node_pfn++) { - if (!pfn_valid(node_pfn)) - continue; - page = pfn_to_page(node_pfn); - one_highpage_init(page, node_pfn, bad_ppro); - } - } - totalram_pages += totalhigh_pages; -#endif -} --- linux.orig/arch/i386/mm/init.c~FROM-MM-memory-hotplug-i386-addition-functions 2005-09-30 12:37:59.000000000 -0700 +++ linux/arch/i386/mm/init.c 2005-09-30 12:39:28.000000000 -0700 @@ -27,6 +27,7 @@ #include #include #include +#include #include #include @@ -35,6 +36,7 @@ #include #include #include +#include #include #include #include @@ -43,7 +45,6 @@ unsigned int __VMALLOC_RESERVE = 128 << 20; DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); -unsigned long highstart_pfn, highend_pfn; static int noinline do_test_wp_bit(void); @@ -191,39 +192,43 @@ extern int is_available_memory(efi_memory_desc_t *); -int page_is_ram(unsigned long pagenr) +static int page_is_ram_efi(unsigned long pagenr) { +#ifdef CONFIG_EFI int i; unsigned long addr, end; - - if (efi_enabled) { - efi_memory_desc_t *md; + efi_memory_desc_t *md; void *p; - for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { - md = p; - if (!is_available_memory(md)) - continue; - addr = (md->phys_addr+PAGE_SIZE-1) >> PAGE_SHIFT; - end = (md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT)) >> PAGE_SHIFT; - - if ((pagenr >= addr) && (pagenr < end)) - return 1; - } - return 0; + for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { + md = p; + if (!is_available_memory(md)) + continue; + addr = (md->phys_addr+PAGE_SIZE-1) >> PAGE_SHIFT; + end = (md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT)) >> PAGE_SHIFT; + if ((pagenr >= addr) && (pagenr < end)) + return 1; } +#endif /* CONFIG_EFI */ + return 0; +} - for (i = 0; i < e820.nr_map; i++) { +int page_is_ram_e820(unsigned long pagenr, struct e820map *local_e820) +{ + int i; + unsigned long addr, end; - if (e820.map[i].type != E820_RAM) /* not usable memory */ + for (i = 0; i < 
local_e820->nr_map; i++) { + + if (local_e820->map[i].type != E820_RAM) /* not usable memory */ continue; /* * !!!FIXME!!! Some BIOSen report areas as RAM that * are not. Notably the 640->1Mb area. We need a sanity * check here. */ - addr = (e820.map[i].addr+PAGE_SIZE-1) >> PAGE_SHIFT; - end = (e820.map[i].addr+e820.map[i].size) >> PAGE_SHIFT; + addr = (local_e820->map[i].addr+PAGE_SIZE-1) >> PAGE_SHIFT; + end = (local_e820->map[i].addr+local_e820->map[i].size) >> PAGE_SHIFT; if ((pagenr >= addr) && (pagenr < end)) return 1; } @@ -266,28 +271,72 @@ pkmap_page_table = pte; } -void __init one_highpage_init(struct page *page, int pfn, int bad_ppro) +void __devinit free_new_highpage(struct page *page) +{ + set_page_count(page, 1); + __free_page(page); + totalhigh_pages++; +} + +void __init add_one_highpage_init(struct page *page, int pfn, int bad_ppro) { if (page_is_ram(pfn) && !(bad_ppro && page_kills_ppro(pfn))) { ClearPageReserved(page); - set_page_count(page, 1); - __free_page(page); - totalhigh_pages++; + free_new_highpage(page); } else SetPageReserved(page); } -#ifdef CONFIG_NUMA -extern void set_highmem_pages_init(int); -#else -static void __init set_highmem_pages_init(int bad_ppro) +static int add_one_highpage_hotplug(struct page *page, unsigned long pfn) { - int pfn; - for (pfn = highstart_pfn; pfn < highend_pfn; pfn++) - one_highpage_init(pfn_to_page(pfn), pfn, bad_ppro); + free_new_highpage(page); + totalram_pages++; +#ifdef CONFIG_FLATMEM + max_mapnr = max(pfn, max_mapnr); +#endif + num_physpages++; + return 0; +} + +/* + * Not currently handling the NUMA case. + * Assuming single node and all memory that + * has been added dynamically that would be + * onlined here is in HIGHMEM + */ +void online_page(struct page *page) +{ + ClearPageReserved(page); + add_one_highpage_hotplug(page, page_to_pfn(page)); +} + +void __init set_highmem_pages_init(int bad_ppro) +{ + struct zone *zone; + struct page *page; + + for_each_zone(zone) { + unsigned long node_pfn, zone_start_pfn, zone_end_pfn; + + if (!is_highmem(zone)) + continue; + + zone_start_pfn = zone->zone_start_pfn; + zone_end_pfn = zone_start_pfn + zone->spanned_pages; + + printk("Initializing %s for node %d (%08lx:%08lx)\n", + zone->name, zone->zone_pgdat->node_id, + zone_start_pfn, zone_end_pfn); + + for (node_pfn = zone_start_pfn; node_pfn < zone_end_pfn; node_pfn++) { + if (!pfn_valid(node_pfn)) + continue; + page = pfn_to_page(node_pfn); + add_one_highpage_init(page, node_pfn, bad_ppro); + } + } totalram_pages += totalhigh_pages; } -#endif /* CONFIG_FLATMEM */ #else #define kmap_init() do { } while (0) @@ -299,12 +348,6 @@ EXPORT_SYMBOL(__PAGE_KERNEL); unsigned long long __PAGE_KERNEL_EXEC = _PAGE_KERNEL_EXEC; -#ifdef CONFIG_NUMA -extern void __init remap_numa_kva(void); -#else -#define remap_numa_kva() do {} while (0) -#endif - static void __init pagetable_init (void) { unsigned long vaddr; @@ -522,11 +565,6 @@ static void __init set_max_mapnr_init(void) { -#ifdef CONFIG_HIGHMEM - num_physpages = highend_pfn; -#else - num_physpages = max_low_pfn; -#endif #ifdef CONFIG_FLATMEM max_mapnr = num_physpages; #endif @@ -560,11 +598,7 @@ set_max_mapnr_init(); -#ifdef CONFIG_HIGHMEM - high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1; -#else high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1; -#endif /* this will put all low memory onto the freelists */ totalram_pages += free_all_bootmem(); @@ -615,6 +649,28 @@ #endif } +/* + * this is for the non-NUMA, single node SMP system case. 
+ * Specifically, in the case of x86, we will always add + * memory to the highmem for now. + */ +#ifndef CONFIG_NEED_MULTIPLE_NODES +int add_memory(u64 start, u64 size) +{ + struct pglist_data *pgdata = &contig_page_data; + struct zone *zone = pgdata->node_zones + MAX_NR_ZONES-1; + unsigned long start_pfn = start >> PAGE_SHIFT; + unsigned long nr_pages = size >> PAGE_SHIFT; + + return __add_pages(zone, start_pfn, nr_pages); +} + +int remove_memory(u64 start, u64 size) +{ + return -EINVAL; +} +#endif + kmem_cache_t *pgd_cache; kmem_cache_t *pmd_cache; @@ -695,3 +751,10 @@ } } #endif + +int page_is_ram(unsigned long pagenr) +{ + if (efi_enabled) + return page_is_ram_efi(pagenr); + return page_is_ram_e820(pagenr, &e820); +} --- /dev/null 2005-03-30 22:36:15.000000000 -0800 +++ linux/arch/i386/mm/numa.c 2005-09-30 12:38:13.000000000 -0700 @@ -0,0 +1,167 @@ +#include +#include +#include + +#include +#include + +unsigned long node_remap_start_pfn[MAX_NUMNODES]; +unsigned long node_remap_size[MAX_NUMNODES]; +unsigned long node_remap_offset[MAX_NUMNODES]; +void *node_remap_start_vaddr[MAX_NUMNODES]; + +void *node_remap_end_vaddr[MAX_NUMNODES]; +void *node_remap_alloc_vaddr[MAX_NUMNODES]; + +struct pglist_data *node_data[MAX_NUMNODES] __read_mostly; +EXPORT_SYMBOL(node_data); + +/* + * Allocate memory for the pg_data_t for this node via a crude pre-bootmem + * method. For node zero take this from the bottom of memory, for + * subsequent nodes place them at node_remap_start_vaddr which contains + * node local data in physically node local memory. See setup_memory() + * for details. + */ +static bootmem_data_t node0_bdata; +static void __init allocate_pgdat(int nid) +{ + if (nid && node_has_online_mem(nid)) + NODE_DATA(nid) = (pg_data_t *)node_remap_start_vaddr[nid]; + else { + NODE_DATA(nid) = (pg_data_t *)(__va(min_low_pfn << PAGE_SHIFT)); + min_low_pfn += PFN_UP(sizeof(pg_data_t)); + memset(NODE_DATA(0), 0, sizeof(struct pglist_data)); + NODE_DATA(0)->bdata = &node0_bdata; + } +} + +void setup_numa_kva_remap(void) +{ + int nid; + for_each_online_node(nid) { + if (NODE_DATA(nid)) + continue; + node_remap_start_vaddr[nid] = pfn_to_kaddr( + max_low_pfn + node_remap_offset[nid]); + /* Init the node remap allocator */ + node_remap_end_vaddr[nid] = node_remap_start_vaddr[nid] + + (node_remap_size[nid] * PAGE_SIZE); + node_remap_alloc_vaddr[nid] = node_remap_start_vaddr[nid] + + ALIGN(sizeof(pg_data_t), PAGE_SIZE); + + allocate_pgdat(nid); + printk ("node %d will remap to vaddr %08lx - %08lx\n", nid, + (ulong) node_remap_start_vaddr[nid], + (ulong) pfn_to_kaddr(max_low_pfn + + node_remap_offset[nid] + node_remap_size[nid])); + } +} + +void *alloc_remap(int nid, unsigned long size) +{ + void *allocation = node_remap_alloc_vaddr[nid]; + + size = ALIGN(size, L1_CACHE_BYTES); + + if (!allocation || (allocation + size) >= node_remap_end_vaddr[nid]) + return 0; + + node_remap_alloc_vaddr[nid] += size; + memset(allocation, 0, size); + + return allocation; +} + +void __init remap_numa_kva(void) +{ + void *vaddr; + unsigned long pfn; + int node; + + for_each_online_node(node) { + for (pfn=0; pfn < node_remap_size[node]; pfn += PTRS_PER_PTE) { + vaddr = node_remap_start_vaddr[node]+(pfn< max_pfn) + continue; + if (node_end_pfn[nid] > max_pfn) + node_end_pfn[nid] = max_pfn; + + /* ensure the remap includes space for the pgdat. 
*/ + size = node_remap_size[nid] + sizeof(pg_data_t); + + /* convert size to large (pmd size) pages, rounding up */ + size = (size + PMD_SIZE - 1) / PMD_SIZE; + /* now the roundup is correct, convert to PAGE_SIZE pages */ + size = size * PTRS_PER_PTE; + + /* + * Validate the region we are allocating only contains valid + * pages. + */ + for (pfn = node_end_pfn[nid] - size; + pfn < node_end_pfn[nid]; pfn++) + if (!page_is_ram(pfn)) + break; + + if (pfn != node_end_pfn[nid]) + size = 0; + + printk("Reserving %ld pages of KVA for lmem_map of node %d\n", + size, nid); + node_remap_size[nid] = size; + node_remap_offset[nid] = reserve_pages; + reserve_pages += size; + printk("Shrinking node %d from %ld pages to %ld pages\n", + nid, node_end_pfn[nid], node_end_pfn[nid] - size); + + if (node_end_pfn[nid] & (PTRS_PER_PTE-1)) { + /* + * Align node_end_pfn[] and node_remap_start_pfn[] to + * pmd boundary. remap_numa_kva will barf otherwise. + */ + printk("Shrinking node %d further by %ld pages for proper alignment\n", + nid, node_end_pfn[nid] & (PTRS_PER_PTE-1)); + size += node_end_pfn[nid] & (PTRS_PER_PTE-1); + } + + node_end_pfn[nid] -= size; + node_remap_start_pfn[nid] = node_end_pfn[nid]; + } + printk("Reserving total of %ld pages for numa KVA remap\n", + reserve_pages); + return reserve_pages; +} + +/* Find the owning node for a pfn. */ +int early_pfn_to_nid(unsigned long pfn) +{ + int nid; + + for_each_node(nid) { + if (node_end_pfn[nid] == 0) + break; + if (node_start_pfn[nid] <= pfn && node_end_pfn[nid] >= pfn) + return nid; + } + + return 0; +} --- linux.orig/arch/i386/mm/pgtable.c~FROM-MM-memory-hotplug-locking-node_size_lock 2005-09-30 12:37:54.000000000 -0700 +++ linux/arch/i386/mm/pgtable.c 2005-09-30 12:37:54.000000000 -0700 @@ -31,11 +31,13 @@ pg_data_t *pgdat; unsigned long i; struct page_state ps; + unsigned long flags; printk(KERN_INFO "Mem-info:\n"); show_free_areas(); printk(KERN_INFO "Free swap: %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10)); for_each_pgdat(pgdat) { + pgdat_resize_lock(pgdat, &flags); for (i = 0; i < pgdat->node_spanned_pages; ++i) { page = pgdat_page_nr(pgdat, i); total++; @@ -48,6 +50,7 @@ else if (page_count(page)) shared += page_count(page) - 1; } + pgdat_resize_unlock(pgdat, &flags); } printk(KERN_INFO "%d pages of RAM\n", total); printk(KERN_INFO "%d pages of HIGHMEM\n", highmem); --- linux.orig/arch/ia64/Kconfig~F3-create-__boot-ia64 2005-09-30 12:38:24.000000000 -0700 +++ linux/arch/ia64/Kconfig 2005-09-30 12:38:24.000000000 -0700 @@ -298,6 +298,9 @@ source "mm/Kconfig" +config ARCH_HAS_BOOTPA + def_bool y + config IA32_SUPPORT bool "Support for Linux/x86 binaries" help --- linux.orig/arch/ia64/mm/discontig.c~FROM-MM-memory-hotplug-locking-node_size_lock 2005-09-30 12:37:54.000000000 -0700 +++ linux/arch/ia64/mm/discontig.c 2005-09-30 12:38:14.000000000 -0700 @@ -376,30 +376,6 @@ return ptr; } -/** - * pgdat_insert - insert the pgdat into global pgdat_list - * @pgdat: the pgdat for a node. 
- */ -static void __init pgdat_insert(pg_data_t *pgdat) -{ - pg_data_t *prev = NULL, *next; - - for_each_pgdat(next) - if (pgdat->node_id < next->node_id) - break; - else - prev = next; - - if (prev) { - prev->pgdat_next = pgdat; - pgdat->pgdat_next = next; - } else { - pgdat->pgdat_next = pgdat_list; - pgdat_list = pgdat; - } - - return; -} /** * memory_less_nodes - allocate and initialize CPU only nodes pernode @@ -524,9 +500,13 @@ show_free_areas(); printk("Free swap: %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10)); for_each_pgdat(pgdat) { - unsigned long present = pgdat->node_present_pages; + unsigned long present; + unsigned long flags; int shared = 0, cached = 0, reserved = 0; + printk("Node ID: %d\n", pgdat->node_id); + pgdat_resize_lock(pgdat, &flags); + present = pgdat->node_present_pages; for(i = 0; i < pgdat->node_spanned_pages; i++) { struct page *page = pgdat_page_nr(pgdat, i); if (!ia64_pfn_valid(pgdat->node_start_pfn+i)) @@ -538,6 +518,7 @@ else if (page_count(page)) shared += page_count(page)-1; } + pgdat_resize_unlock(pgdat, &flags); total_present += present; total_reserved += reserved; total_cached += cached; @@ -695,11 +676,5 @@ pfn_offset, zholes_size); } - /* - * Make memory less nodes become a member of the known nodes. - */ - for_each_node_mask(node, memory_less_mask) - pgdat_insert(mem_data[node].pgdat); - zero_page_memmap_ptr = virt_to_page(ia64_imva(empty_zero_page)); } --- linux.orig/arch/m32r/mm/discontig.c~B3.4-remove-pgdat_list-ver2-m32r 2005-09-30 12:38:16.000000000 -0700 +++ linux/arch/m32r/mm/discontig.c 2005-09-30 12:38:16.000000000 -0700 @@ -137,12 +137,6 @@ int nid, i; mem_prof_t *mp; - pgdat_list = NULL; - for (nid = num_online_nodes() - 1 ; nid >= 0 ; nid--) { - NODE_DATA(nid)->pgdat_next = pgdat_list; - pgdat_list = NODE_DATA(nid); - } - for_each_online_node(nid) { mp = &mem_prof[nid]; for (i = 0 ; i < MAX_NR_ZONES ; i++) { --- linux.orig/arch/m32r/mm/init.c~FROM-MM-memory-hotplug-locking-node_size_lock 2005-09-30 12:37:54.000000000 -0700 +++ linux/arch/m32r/mm/init.c 2005-09-30 12:37:54.000000000 -0700 @@ -48,6 +48,8 @@ show_free_areas(); printk("Free swap: %6ldkB\n",nr_swap_pages<<(PAGE_SHIFT-10)); for_each_pgdat(pgdat) { + unsigned long flags; + pgdat_resize_lock(pgdat, &flags); for (i = 0; i < pgdat->node_spanned_pages; ++i) { page = pgdat_page_nr(pgdat, i); total++; @@ -60,6 +62,7 @@ else if (page_count(page)) shared += page_count(page) - 1; } + pgdat_resize_unlock(pgdat, &flags); } printk("%d pages of RAM\n", total); printk("%d pages of HIGHMEM\n",highmem); @@ -150,10 +153,14 @@ int reservedpages, nid, i; reservedpages = 0; - for_each_online_node(nid) + for_each_online_node(nid) { + unsigned long flags; + pgdat_resize_lock(NODE_DATA(nid), &flags); for (i = 0 ; i < MAX_LOW_PFN(nid) - START_PFN(nid) ; i++) if (PageReserved(nid_page_nr(nid, i))) reservedpages++; + pgdat_resize_unlock(NODE_DATA(nid), &flags); + } return reservedpages; } --- linux.orig/arch/parisc/mm/init.c~FROM-MM-memory-hotplug-locking-node_size_lock 2005-09-30 12:37:54.000000000 -0700 +++ linux/arch/parisc/mm/init.c 2005-09-30 12:37:54.000000000 -0700 @@ -505,7 +505,9 @@ for (j = node_start_pfn(i); j < node_end_pfn(i); j++) { struct page *p; + unsigned long flags; + pgdat_resize_lock(NODE_DATA(i), &flags); p = nid_page_nr(i, j) - node_start_pfn(i); total++; @@ -517,6 +519,7 @@ free++; else shared += page_count(p) - 1; + pgdat_resize_unlock(NODE_DATA(i), &flags); } } #endif --- linux.orig/arch/ppc64/Kconfig~A4.3-antonb-ppc64-convert_to_sparsemem 2005-09-30 12:38:07.000000000 -0700 +++ 
linux/arch/ppc64/Kconfig 2005-09-30 12:38:22.000000000 -0700 @@ -227,6 +227,10 @@ depends on SMP default "32" +config ARCH_HAS_BOOTPA + bool + default y + config HMT bool "Hardware multithreading" depends on SMP && PPC_PSERIES && BROKEN @@ -238,23 +242,14 @@ def_bool y config ARCH_FLATMEM_ENABLE - def_bool y - depends on !NUMA - -config ARCH_DISCONTIGMEM_ENABLE - def_bool y - depends on SMP && PPC_PSERIES - -config ARCH_DISCONTIGMEM_DEFAULT def_bool y - depends on ARCH_DISCONTIGMEM_ENABLE -config ARCH_FLATMEM_ENABLE +config ARCH_SPARSEMEM_ENABLE def_bool y -config ARCH_SPARSEMEM_ENABLE +config ARCH_SPARSEMEM_DEFAULT def_bool y - depends on ARCH_DISCONTIGMEM_ENABLE + depends on PPC_PSERIES source "mm/Kconfig" @@ -276,7 +271,8 @@ config NUMA bool "NUMA support" - default y if DISCONTIGMEM || SPARSEMEM + depends on SPARSEMEM + default y if SPARSEMEM config SCHED_SMT bool "SMT (Hyperthreading) scheduler support" --- linux.orig/arch/ppc64/kernel/pSeries_setup.c~no-found-boot_cpuid 2005-09-30 12:37:50.000000000 -0700 +++ linux/arch/ppc64/kernel/pSeries_setup.c 2005-09-30 12:37:50.000000000 -0700 @@ -56,6 +56,7 @@ #include #include #include +#include #include #include #include --- linux.orig/arch/ppc64/kernel/prom_init.c~G0-ppc64-__boot-fixes 2005-09-30 12:38:26.000000000 -0700 +++ linux/arch/ppc64/kernel/prom_init.c 2005-09-30 12:38:27.000000000 -0700 @@ -1141,11 +1141,11 @@ extern unsigned long __secondary_hold_spinloop; extern unsigned long __secondary_hold_acknowledge; unsigned long *spinloop - = (void *)virt_to_abs(&__secondary_hold_spinloop); + = (void *)boot_virt_to_abs((unsigned long)&__secondary_hold_spinloop); unsigned long *acknowledge - = (void *)virt_to_abs(&__secondary_hold_acknowledge); + = (void *)boot_virt_to_abs((unsigned long)&__secondary_hold_acknowledge); unsigned long secondary_hold - = virt_to_abs(*PTRRELOC((unsigned long *)__secondary_hold)); + = boot_virt_to_abs(*PTRRELOC((unsigned long *)__secondary_hold)); struct prom_t *_prom = PTRRELOC(&prom); prom_debug("prom_hold_cpus: start...\n"); @@ -1871,7 +1871,7 @@ if ( r3 && r4 && r4 != 0xdeadbeef) { u64 val; - RELOC(prom_initrd_start) = (r3 >= KERNELBASE) ? __pa(r3) : r3; + RELOC(prom_initrd_start) = (r3 >= KERNELBASE) ? 
__boot_pa(r3) : r3; RELOC(prom_initrd_end) = RELOC(prom_initrd_start) + r4; val = (u64)RELOC(prom_initrd_start); --- linux.orig/arch/ppc64/kernel/rtas.c~G0-ppc64-__boot-fixes 2005-09-30 12:38:26.000000000 -0700 +++ linux/arch/ppc64/kernel/rtas.c 2005-09-30 12:38:26.000000000 -0700 @@ -36,6 +36,7 @@ struct rtas_t rtas = { .lock = SPIN_LOCK_UNLOCKED }; +static unsigned long rtas_args_paddr; EXPORT_SYMBOL(rtas); @@ -309,8 +310,7 @@ for (i = 0; i < nret; ++i) rtas_args->rets[i] = 0; - PPCDBG(PPCDBG_RTAS, "\tentering rtas with 0x%lx\n", - __pa(rtas_args)); + PPCDBG(PPCDBG_RTAS, "\tentering rtas with 0x%lx\n", rtas_args_paddr); enter_rtas(__pa(rtas_args)); PPCDBG(PPCDBG_RTAS, "\treturned from rtas ...\n"); @@ -758,6 +758,8 @@ #endif /* CONFIG_HOTPLUG_CPU */ } + /* Get and save off phys address of rtas structure argunemt field */ + rtas_args_paddr = __boot_pa(&rtas.args); } --- linux.orig/arch/ppc64/kernel/setup.c~G0-ppc64-__boot-fixes 2005-09-30 12:38:26.000000000 -0700 +++ linux/arch/ppc64/kernel/setup.c 2005-09-30 12:38:26.000000000 -0700 @@ -376,7 +376,7 @@ * tree, like retreiving the physical memory map or * calculating/retreiving the hash table size */ - early_init_devtree(__va(dt_ptr)); + early_init_devtree(__boot_va(dt_ptr)); /* * Iterate all ppc_md structures until we find the proper @@ -505,11 +505,11 @@ prop = (u64 *)get_property(of_chosen, "linux,initrd-start", NULL); if (prop != NULL) { - initrd_start = (unsigned long)__va(*prop); + initrd_start = (unsigned long)__boot_va(*prop); prop = (u64 *)get_property(of_chosen, "linux,initrd-end", NULL); if (prop != NULL) { - initrd_end = (unsigned long)__va(*prop); + initrd_end = (unsigned long)__boot_va(*prop); initrd_below_start_ok = 1; } else initrd_start = 0; @@ -940,9 +940,9 @@ * SLB misses on them. */ for_each_cpu(i) { - softirq_ctx[i] = (struct thread_info *)__va(lmb_alloc_base(THREAD_SIZE, + softirq_ctx[i] = (struct thread_info *)__boot_va(lmb_alloc_base(THREAD_SIZE, THREAD_SIZE, 0x10000000)); - hardirq_ctx[i] = (struct thread_info *)__va(lmb_alloc_base(THREAD_SIZE, + hardirq_ctx[i] = (struct thread_info *)__boot_va(lmb_alloc_base(THREAD_SIZE, THREAD_SIZE, 0x10000000)); } } @@ -971,7 +971,7 @@ limit = min(0x10000000UL, lmb.rmo_size); for_each_cpu(i) - paca[i].emergency_sp = __va(lmb_alloc_base(PAGE_SIZE, 128, + paca[i].emergency_sp = __boot_va(lmb_alloc_base(PAGE_SIZE, 128, limit)) + PAGE_SIZE; } --- linux.orig/arch/ppc64/kernel/time.c~no-found-boot_cpuid 2005-09-30 12:37:50.000000000 -0700 +++ linux/arch/ppc64/kernel/time.c 2005-09-30 12:37:50.000000000 -0700 @@ -65,6 +65,7 @@ #include #include #include +#include #include #include --- linux.orig/arch/ppc64/mm/hash_utils.c~G0-ppc64-__boot-fixes 2005-09-30 12:38:26.000000000 -0700 +++ linux/arch/ppc64/mm/hash_utils.c 2005-09-30 12:38:28.000000000 -0700 @@ -132,12 +132,12 @@ #ifdef CONFIG_PPC_PSERIES if (systemcfg->platform & PLATFORM_LPAR) ret = pSeries_lpar_hpte_insert(hpteg, va, - virt_to_abs(addr) >> PAGE_SHIFT, + boot_virt_to_abs(addr) >> PAGE_SHIFT, vflags, tmp_mode); else #endif /* CONFIG_PPC_PSERIES */ ret = native_hpte_insert(hpteg, va, - virt_to_abs(addr) >> PAGE_SHIFT, + boot_virt_to_abs(addr) >> PAGE_SHIFT, vflags, tmp_mode); if (ret == -1) { @@ -147,6 +147,13 @@ } } +void create_lmb_mapping(unsigned long start, unsigned long end) +{ + create_pte_mapping(start, end, + _PAGE_ACCESSED | _PAGE_COHERENT | PP_RWXX, + cur_cpu_spec->cpu_features & CPU_FTR_16M_PAGE ? 
1 : 0); +} + void __init htab_initialize(void) { unsigned long table, htab_size_bytes; --- linux.orig/arch/ppc64/mm/init.c~FROM-MM-memory-hotplug-locking-node_size_lock 2005-09-30 12:37:54.000000000 -0700 +++ linux/arch/ppc64/mm/init.c 2005-09-30 12:38:28.000000000 -0700 @@ -104,6 +104,8 @@ show_free_areas(); printk("Free swap: %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10)); for_each_pgdat(pgdat) { + unsigned long flags; + pgdat_resize_lock(pgdat, &flags); for (i = 0; i < pgdat->node_spanned_pages; i++) { page = pgdat_page_nr(pgdat, i); total++; @@ -114,6 +116,7 @@ else if (page_count(page)) shared += page_count(page) - 1; } + pgdat_resize_unlock(pgdat, &flags); } printk("%ld pages of RAM\n", total); printk("%ld reserved pages\n", reserved); @@ -633,7 +636,8 @@ unsigned long reservedpages = 0, codesize, initsize, datasize, bsssize; num_physpages = max_low_pfn; /* RAM is assumed contiguous */ - high_memory = (void *) __va(max_low_pfn * PAGE_SIZE); + /* The strange -1 +1 is to avoid calling __va on an invalid address */ + high_memory = (void *) (__va(max_low_pfn * PAGE_SIZE - 1) + 1); #ifdef CONFIG_NEED_MULTIPLE_NODES for_each_online_node(nid) { @@ -649,11 +653,14 @@ #endif for_each_pgdat(pgdat) { + unsigned long flags; + pgdat_resize_lock(pgdat, &flags); for (i = 0; i < pgdat->node_spanned_pages; i++) { page = pgdat_page_nr(pgdat, i); if (PageReserved(page)) reservedpages++; } + pgdat_resize_unlock(pgdat, &flags); } codesize = (unsigned long)&_etext - (unsigned long)&_stext; @@ -868,3 +875,83 @@ return vma_prot; } EXPORT_SYMBOL(phys_mem_access_prot); + +#ifdef CONFIG_MEMORY_HOTPLUG + +void online_page(struct page *page) +{ + ClearPageReserved(page); + free_cold_page(page); + totalram_pages++; + num_physpages++; +} + +/* + * This works only for the non-NUMA case. Later, we'll need a lookup + * to convert from real physical addresses to nid, that doesn't use + * pfn_to_nid(). + */ +int __devinit add_memory(u64 start, u64 size) +{ + struct pglist_data *pgdata = NODE_DATA(0); + struct zone *zone; + unsigned long start_pfn = start >> PAGE_SHIFT; + unsigned long nr_pages = size >> PAGE_SHIFT; + + start += KERNELBASE; + create_lmb_mapping(start, start + size); + + /* this should work for most non-highmem platforms */ + zone = pgdata->node_zones; + + return __add_pages(zone, start_pfn, nr_pages); + + return 0; +} + +/* + * First pass at this code will check to determine if the remove + * request is within the RMO. Do not allow removal within the RMO. 
+ */ +int __devinit remove_memory(u64 start, u64 size) +{ + struct zone *zone; + unsigned long start_pfn, end_pfn, nr_pages; + + start_pfn = start >> PAGE_SHIFT; + nr_pages = size >> PAGE_SHIFT; + end_pfn = start_pfn + nr_pages; + + printk("%s(): Attempting to remove memoy in range " + "%lx to %lx\n", __func__, start, start+size); + /* + * check for range within RMO + */ + zone = page_zone(pfn_to_page(start_pfn)); + + printk("%s(): memory will be removed from " + "the %s zone\n", __func__, zone->name); + + /* + * not handling removing memory ranges that + * overlap multiple zones yet + */ + if (end_pfn > (zone->zone_start_pfn + zone->spanned_pages)) + goto overlap; + + /* make sure it is NOT in RMO */ + if ((start < lmb.rmo_size) || ((start+size) < lmb.rmo_size)) { + printk("%s(): range to be removed must NOT be in RMO!\n", + __func__); + goto in_rmo; + } + + return __remove_pages(zone, start_pfn, nr_pages); + +overlap: + printk("%s(): memory range to be removed overlaps " + "multiple zones!!!\n", __func__); +in_rmo: + return -1; +} +#endif /* CONFIG_MEMORY_HOTPLUG */ --- linux.orig/arch/ppc64/mm/numa.c~A4.2-antonb-ppc64-_use_generic_nr_cpus_node 2005-09-30 12:38:06.000000000 -0700 +++ linux/arch/ppc64/mm/numa.c 2005-09-30 12:38:07.000000000 -0700 @@ -17,54 +17,121 @@ #include #include #include +#include #include -#include -#include static int numa_enabled = 1; static int numa_debug; #define dbg(args...) if (numa_debug) { printk(KERN_INFO args); } -#ifdef DEBUG_NUMA -#define ARRAY_INITIALISER -1 -#else -#define ARRAY_INITIALISER 0 -#endif - -int numa_cpu_lookup_table[NR_CPUS] = { [ 0 ... (NR_CPUS - 1)] = - ARRAY_INITIALISER}; -char *numa_memory_lookup_table; +int numa_cpu_lookup_table[NR_CPUS]; cpumask_t numa_cpumask_lookup_table[MAX_NUMNODES]; -int nr_cpus_in_node[MAX_NUMNODES] = { [0 ... (MAX_NUMNODES -1)] = 0}; - struct pglist_data *node_data[MAX_NUMNODES]; -bootmem_data_t __initdata plat_node_bdata[MAX_NUMNODES]; + +EXPORT_SYMBOL(numa_cpu_lookup_table); +EXPORT_SYMBOL(numa_cpumask_lookup_table); +EXPORT_SYMBOL(node_data); + +static bootmem_data_t __initdata plat_node_bdata[MAX_NUMNODES]; static int min_common_depth; /* - * We need somewhere to store start/span for each node until we have + * We need somewhere to store start/end/node for each region until we have * allocated the real node_data structures. 
*/ +#define MAX_REGIONS (MAX_LMB_REGIONS*2) +static struct { - unsigned long node_start_pfn; - unsigned long node_end_pfn; - unsigned long node_present_pages; -} init_node_data[MAX_NUMNODES] __initdata; + unsigned long start_pfn; + unsigned long end_pfn; + int nid; +} init_node_data[MAX_REGIONS] __initdata; -EXPORT_SYMBOL(node_data); -EXPORT_SYMBOL(numa_cpu_lookup_table); -EXPORT_SYMBOL(numa_memory_lookup_table); -EXPORT_SYMBOL(numa_cpumask_lookup_table); -EXPORT_SYMBOL(nr_cpus_in_node); +int __init early_pfn_to_nid(unsigned long pfn) +{ + unsigned int i; + + for (i = 0; init_node_data[i].end_pfn; i++) { + unsigned long start_pfn = init_node_data[i].start_pfn; + unsigned long end_pfn = init_node_data[i].end_pfn; + + if ((start_pfn <= pfn) && (pfn < end_pfn)) + return init_node_data[i].nid; + } + + return -1; +} + +void __init add_region(unsigned int nid, unsigned long start_pfn, + unsigned long pages) +{ + unsigned int i; + + dbg("add_region nid %d start_pfn 0x%lx pages 0x%lx\n", + nid, start_pfn, pages); + + for (i = 0; init_node_data[i].end_pfn; i++) { + if (init_node_data[i].nid != nid) + continue; + if (init_node_data[i].end_pfn == start_pfn) { + init_node_data[i].end_pfn += pages; + return; + } + if (init_node_data[i].start_pfn == (start_pfn + pages)) { + init_node_data[i].start_pfn -= pages; + return; + } + } + + /* + * Leave last entry NULL so we don't iterate off the end (we use + * entry.end_pfn to terminate the walk). + */ + if (i >= (MAX_REGIONS - 1)) { + printk(KERN_ERR "WARNING: too many memory regions in " + "numa code, truncating\n"); + return; + } + + init_node_data[i].start_pfn = start_pfn; + init_node_data[i].end_pfn = start_pfn + pages; + init_node_data[i].nid = nid; +} + +/* We assume init_node_data has no overlapping regions */ +void __init get_region(unsigned int nid, unsigned long *start_pfn, + unsigned long *end_pfn, unsigned long *pages_present) +{ + unsigned int i; + + *start_pfn = -1UL; + *end_pfn = *pages_present = 0; + + for (i = 0; init_node_data[i].end_pfn; i++) { + if (init_node_data[i].nid != nid) + continue; + + *pages_present += init_node_data[i].end_pfn - + init_node_data[i].start_pfn; + + if (init_node_data[i].start_pfn < *start_pfn) + *start_pfn = init_node_data[i].start_pfn; + + if (init_node_data[i].end_pfn > *end_pfn) + *end_pfn = init_node_data[i].end_pfn; + } + + /* We didn't find a matching region, return start/end as 0 */ + if (*start_pfn == -1UL) + *start_pfn = 0; +} static inline void map_cpu_to_node(int cpu, int node) { numa_cpu_lookup_table[cpu] = node; - if (!(cpu_isset(cpu, numa_cpumask_lookup_table[node]))) { + + if (!(cpu_isset(cpu, numa_cpumask_lookup_table[node]))) cpu_set(cpu, numa_cpumask_lookup_table[node]); - nr_cpus_in_node[node]++; - } } #ifdef CONFIG_HOTPLUG_CPU @@ -76,7 +143,6 @@ if (cpu_isset(cpu, numa_cpumask_lookup_table[node])) { cpu_clear(cpu, numa_cpumask_lookup_table[node]); - nr_cpus_in_node[node]--; } else { printk(KERN_ERR "WARNING: cpu %lu not found in node %d\n", cpu, node); @@ -84,7 +150,7 @@ } #endif /* CONFIG_HOTPLUG_CPU */ -static struct device_node * __devinit find_cpu_node(unsigned int cpu) +static struct device_node *find_cpu_node(unsigned int cpu) { unsigned int hw_cpuid = get_hard_smp_processor_id(cpu); struct device_node *cpu_node = NULL; @@ -211,7 +277,7 @@ return rc; } -static unsigned long read_n_cells(int n, unsigned int **buf) +static unsigned long __init read_n_cells(int n, unsigned int **buf) { unsigned long result = 0; @@ -293,7 +359,8 @@ * or zero. 
If the returned value of size is 0 the region should be * discarded as it lies wholy above the memory limit. */ -static unsigned long __init numa_enforce_memory_limit(unsigned long start, unsigned long size) +static unsigned long __init numa_enforce_memory_limit(unsigned long start, + unsigned long size) { /* * We use lmb_end_of_DRAM() in here instead of memory_limit because @@ -319,8 +386,7 @@ struct device_node *cpu = NULL; struct device_node *memory = NULL; int addr_cells, size_cells; - int max_domain = 0; - long entries = lmb_end_of_DRAM() >> MEMORY_INCREMENT_SHIFT; + int max_domain; unsigned long i; if (numa_enabled == 0) { @@ -328,13 +394,6 @@ return -1; } - numa_memory_lookup_table = - (char *)abs_to_virt(lmb_alloc(entries * sizeof(char), 1)); - memset(numa_memory_lookup_table, 0, entries * sizeof(char)); - - for (i = 0; i < entries ; i++) - numa_memory_lookup_table[i] = ARRAY_INITIALISER; - min_common_depth = find_min_common_depth(); dbg("NUMA associativity depth for CPU/Memory: %d\n", min_common_depth); @@ -386,9 +445,6 @@ start = read_n_cells(addr_cells, &memcell_buf); size = read_n_cells(size_cells, &memcell_buf); - start = _ALIGN_DOWN(start, MEMORY_INCREMENT); - size = _ALIGN_UP(size, MEMORY_INCREMENT); - numa_domain = of_node_numa_domain(memory); if (numa_domain >= MAX_NUMNODES) { @@ -402,44 +458,15 @@ if (max_domain < numa_domain) max_domain = numa_domain; - if (! (size = numa_enforce_memory_limit(start, size))) { + if (!(size = numa_enforce_memory_limit(start, size))) { if (--ranges) goto new_range; else continue; } - /* - * Initialize new node struct, or add to an existing one. - */ - if (init_node_data[numa_domain].node_end_pfn) { - if ((start / PAGE_SIZE) < - init_node_data[numa_domain].node_start_pfn) - init_node_data[numa_domain].node_start_pfn = - start / PAGE_SIZE; - if (((start / PAGE_SIZE) + (size / PAGE_SIZE)) > - init_node_data[numa_domain].node_end_pfn) - init_node_data[numa_domain].node_end_pfn = - (start / PAGE_SIZE) + - (size / PAGE_SIZE); - - init_node_data[numa_domain].node_present_pages += - size / PAGE_SIZE; - } else { - node_set_online(numa_domain); - - init_node_data[numa_domain].node_start_pfn = - start / PAGE_SIZE; - init_node_data[numa_domain].node_end_pfn = - init_node_data[numa_domain].node_start_pfn + - size / PAGE_SIZE; - init_node_data[numa_domain].node_present_pages = - size / PAGE_SIZE; - } - - for (i = start ; i < (start+size); i += MEMORY_INCREMENT) - numa_memory_lookup_table[i >> MEMORY_INCREMENT_SHIFT] = - numa_domain; + add_region(numa_domain, start >> PAGE_SHIFT, + size >> PAGE_SHIFT); if (--ranges) goto new_range; @@ -455,32 +482,15 @@ { unsigned long top_of_ram = lmb_end_of_DRAM(); unsigned long total_ram = lmb_phys_mem_size(); - unsigned long i; printk(KERN_INFO "Top of RAM: 0x%lx, Total RAM: 0x%lx\n", top_of_ram, total_ram); printk(KERN_INFO "Memory hole size: %ldMB\n", (top_of_ram - total_ram) >> 20); - if (!numa_memory_lookup_table) { - long entries = top_of_ram >> MEMORY_INCREMENT_SHIFT; - numa_memory_lookup_table = - (char *)abs_to_virt(lmb_alloc(entries * sizeof(char), 1)); - memset(numa_memory_lookup_table, 0, entries * sizeof(char)); - for (i = 0; i < entries ; i++) - numa_memory_lookup_table[i] = ARRAY_INITIALISER; - } - map_cpu_to_node(boot_cpuid, 0); - + add_region(0, 0, lmb_end_of_DRAM() >> PAGE_SHIFT); node_set_online(0); - - init_node_data[0].node_start_pfn = 0; - init_node_data[0].node_end_pfn = lmb_end_of_DRAM() / PAGE_SIZE; - init_node_data[0].node_present_pages = total_ram / PAGE_SIZE; - - for (i = 0 ; i < top_of_ram; 
i += MEMORY_INCREMENT) - numa_memory_lookup_table[i >> MEMORY_INCREMENT_SHIFT] = 0; } static void __init dump_numa_topology(void) @@ -498,8 +508,9 @@ count = 0; - for (i = 0; i < lmb_end_of_DRAM(); i += MEMORY_INCREMENT) { - if (numa_memory_lookup_table[i >> MEMORY_INCREMENT_SHIFT] == node) { + for (i = 0; i < lmb_end_of_DRAM(); + i += (1 << SECTION_SIZE_BITS)) { + if (early_pfn_to_nid(i >> PAGE_SHIFT) == node) { if (count == 0) printk(" 0x%lx", i); ++count; @@ -524,10 +535,12 @@ * * Returns the physical address of the memory. */ -static unsigned long careful_allocation(int nid, unsigned long size, - unsigned long align, unsigned long end) +static void __init *careful_allocation(int nid, unsigned long size, + unsigned long align, + unsigned long end_pfn) { - unsigned long ret = lmb_alloc_base(size, align, end); + int new_nid; + unsigned long ret = lmb_alloc_base(size, align, end_pfn << PAGE_SHIFT); /* retry over all memory */ if (!ret) @@ -541,28 +554,27 @@ * If the memory came from a previously allocated node, we must * retry with the bootmem allocator. */ - if (pa_to_nid(ret) < nid) { - nid = pa_to_nid(ret); - ret = (unsigned long)__alloc_bootmem_node(NODE_DATA(nid), + new_nid = early_pfn_to_nid(ret >> PAGE_SHIFT); + if (new_nid < nid) { + ret = (unsigned long)__alloc_bootmem_node(NODE_DATA(new_nid), size, align, 0); if (!ret) panic("numa.c: cannot allocate %lu bytes on node %d", - size, nid); + size, new_nid); - ret = virt_to_abs(ret); + ret = __pa(ret); dbg("alloc_bootmem %lx %lx\n", ret, size); } - return ret; + return (void *)ret; } void __init do_init_bootmem(void) { int nid; - int addr_cells, size_cells; - struct device_node *memory = NULL; + unsigned int i; static struct notifier_block ppc64_numa_nb = { .notifier_call = cpu_numa_callback, .priority = 1 /* Must run before sched domains notifier. 
*/ @@ -580,99 +592,66 @@ register_cpu_notifier(&ppc64_numa_nb); for_each_online_node(nid) { - unsigned long start_paddr, end_paddr; - int i; + unsigned long start_pfn, end_pfn, pages_present; unsigned long bootmem_paddr; unsigned long bootmap_pages; - start_paddr = init_node_data[nid].node_start_pfn * PAGE_SIZE; - end_paddr = init_node_data[nid].node_end_pfn * PAGE_SIZE; + get_region(nid, &start_pfn, &end_pfn, &pages_present); /* Allocate the node structure node local if possible */ - NODE_DATA(nid) = (struct pglist_data *)careful_allocation(nid, + NODE_DATA(nid) = careful_allocation(nid, sizeof(struct pglist_data), - SMP_CACHE_BYTES, end_paddr); - NODE_DATA(nid) = abs_to_virt(NODE_DATA(nid)); + SMP_CACHE_BYTES, end_pfn); + NODE_DATA(nid) = __va(NODE_DATA(nid)); memset(NODE_DATA(nid), 0, sizeof(struct pglist_data)); dbg("node %d\n", nid); dbg("NODE_DATA() = %p\n", NODE_DATA(nid)); NODE_DATA(nid)->bdata = &plat_node_bdata[nid]; - NODE_DATA(nid)->node_start_pfn = - init_node_data[nid].node_start_pfn; - NODE_DATA(nid)->node_spanned_pages = - end_paddr - start_paddr; + NODE_DATA(nid)->node_start_pfn = start_pfn; + NODE_DATA(nid)->node_spanned_pages = end_pfn - start_pfn; if (NODE_DATA(nid)->node_spanned_pages == 0) continue; - dbg("start_paddr = %lx\n", start_paddr); - dbg("end_paddr = %lx\n", end_paddr); + dbg("start_paddr = %lx\n", start_pfn << PAGE_SHIFT); + dbg("end_paddr = %lx\n", end_pfn << PAGE_SHIFT); - bootmap_pages = bootmem_bootmap_pages((end_paddr - start_paddr) >> PAGE_SHIFT); + bootmap_pages = bootmem_bootmap_pages(end_pfn - start_pfn); + bootmem_paddr = (unsigned long)careful_allocation(nid, + bootmap_pages << PAGE_SHIFT, + PAGE_SIZE, end_pfn); + memset(__va(bootmem_paddr), 0, bootmap_pages << PAGE_SHIFT); - bootmem_paddr = careful_allocation(nid, - bootmap_pages << PAGE_SHIFT, - PAGE_SIZE, end_paddr); - memset(abs_to_virt(bootmem_paddr), 0, - bootmap_pages << PAGE_SHIFT); dbg("bootmap_paddr = %lx\n", bootmem_paddr); init_bootmem_node(NODE_DATA(nid), bootmem_paddr >> PAGE_SHIFT, - start_paddr >> PAGE_SHIFT, - end_paddr >> PAGE_SHIFT); - - /* - * We need to do another scan of all memory sections to - * associate memory with the correct node. 
- */ - addr_cells = get_mem_addr_cells(); - size_cells = get_mem_size_cells(); - memory = NULL; - while ((memory = of_find_node_by_type(memory, "memory")) != NULL) { - unsigned long mem_start, mem_size; - int numa_domain, ranges; - unsigned int *memcell_buf; - unsigned int len; - - memcell_buf = (unsigned int *)get_property(memory, "reg", &len); - if (!memcell_buf || len <= 0) - continue; + start_pfn, end_pfn); - ranges = memory->n_addrs; /* ranges in cell */ -new_range: - mem_start = read_n_cells(addr_cells, &memcell_buf); - mem_size = read_n_cells(size_cells, &memcell_buf); - if (numa_enabled) { - numa_domain = of_node_numa_domain(memory); - if (numa_domain >= MAX_NUMNODES) - numa_domain = 0; - } else - numa_domain = 0; + /* Add free regions on this node */ + for (i = 0; init_node_data[i].end_pfn; i++) { + unsigned long start, end; - if (numa_domain != nid) + if (init_node_data[i].nid != nid) continue; - mem_size = numa_enforce_memory_limit(mem_start, mem_size); - if (mem_size) { - dbg("free_bootmem %lx %lx\n", mem_start, mem_size); - free_bootmem_node(NODE_DATA(nid), mem_start, mem_size); - } + start = init_node_data[i].start_pfn << PAGE_SHIFT; + end = init_node_data[i].end_pfn << PAGE_SHIFT; - if (--ranges) /* process all ranges in cell */ - goto new_range; + dbg("free_bootmem %lx %lx\n", start, end - start); + free_bootmem_node(NODE_DATA(nid), start, end - start); } - /* - * Mark reserved regions on this node - */ + /* Mark reserved regions on this node */ for (i = 0; i < lmb.reserved.cnt; i++) { unsigned long physbase = lmb.reserved.region[i].base; unsigned long size = lmb.reserved.region[i].size; + unsigned long start_paddr = start_pfn << PAGE_SHIFT; + unsigned long end_paddr = end_pfn << PAGE_SHIFT; - if (pa_to_nid(physbase) != nid && - pa_to_nid(physbase+size-1) != nid) + if (early_pfn_to_nid(physbase >> PAGE_SHIFT) != nid && + early_pfn_to_nid((physbase+size-1) >> PAGE_SHIFT) != nid) continue; if (physbase < end_paddr && @@ -692,46 +671,19 @@ size); } } - /* - * This loop may look famaliar, but we have to do it again - * after marking our reserved memory to mark memory present - * for sparsemem. 
- */ - addr_cells = get_mem_addr_cells(); - size_cells = get_mem_size_cells(); - memory = NULL; - while ((memory = of_find_node_by_type(memory, "memory")) != NULL) { - unsigned long mem_start, mem_size; - int numa_domain, ranges; - unsigned int *memcell_buf; - unsigned int len; - memcell_buf = (unsigned int *)get_property(memory, "reg", &len); - if (!memcell_buf || len <= 0) - continue; + /* Add regions into sparsemem */ + for (i = 0; init_node_data[i].end_pfn; i++) { + unsigned long start, end; - ranges = memory->n_addrs; /* ranges in cell */ -new_range2: - mem_start = read_n_cells(addr_cells, &memcell_buf); - mem_size = read_n_cells(size_cells, &memcell_buf); - if (numa_enabled) { - numa_domain = of_node_numa_domain(memory); - if (numa_domain >= MAX_NUMNODES) - numa_domain = 0; - } else - numa_domain = 0; - - if (numa_domain != nid) + if (init_node_data[i].nid != nid) continue; - mem_size = numa_enforce_memory_limit(mem_start, mem_size); - memory_present(numa_domain, mem_start >> PAGE_SHIFT, - (mem_start + mem_size) >> PAGE_SHIFT); + start = init_node_data[i].start_pfn; + end = init_node_data[i].end_pfn; - if (--ranges) /* process all ranges in cell */ - goto new_range2; + memory_present(nid, start, end); } - } } @@ -745,21 +697,18 @@ memset(zholes_size, 0, sizeof(zholes_size)); for_each_online_node(nid) { - unsigned long start_pfn; - unsigned long end_pfn; + unsigned long start_pfn, end_pfn, pages_present; - start_pfn = init_node_data[nid].node_start_pfn; - end_pfn = init_node_data[nid].node_end_pfn; + get_region(nid, &start_pfn, &end_pfn, &pages_present); zones_size[ZONE_DMA] = end_pfn - start_pfn; - zholes_size[ZONE_DMA] = zones_size[ZONE_DMA] - - init_node_data[nid].node_present_pages; + zholes_size[ZONE_DMA] = zones_size[ZONE_DMA] - pages_present; dbg("free_area_init node %d %lx %lx (hole: %lx)\n", nid, zones_size[ZONE_DMA], start_pfn, zholes_size[ZONE_DMA]); - free_area_init_node(nid, NODE_DATA(nid), zones_size, - start_pfn, zholes_size); + free_area_init_node(nid, NODE_DATA(nid), zones_size, start_pfn, + zholes_size); } } --- linux.orig/arch/x86_64/Kconfig~F2-create-__boot-x86_64 2005-09-30 12:38:23.000000000 -0700 +++ linux/arch/x86_64/Kconfig 2005-09-30 12:38:23.000000000 -0700 @@ -277,6 +277,9 @@ config HAVE_ARCH_EARLY_PFN_TO_NID def_bool y +config ARCH_HAS_BOOTPA + def_bool y + config NR_CPUS int "Maximum number of CPUs (2-256)" range 2 256 --- linux.orig/arch/x86_64/mm/init.c~A0.0-will-not-push-x86_64-hotplug-functions 2005-09-30 12:38:01.000000000 -0700 +++ linux/arch/x86_64/mm/init.c 2005-09-30 12:38:01.000000000 -0700 @@ -22,6 +22,8 @@ #include #include #include +#include +#include #include #include @@ -179,13 +181,19 @@ {} }; -static __init void *alloc_low_page(int *index, unsigned long *phys) +static __devinit void *alloc_low_page(int *index, unsigned long *phys) { struct temp_map *ti; int i; unsigned long pfn = table_end++, paddr; void *adr; + if (after_bootmem) { + adr = (void *)get_zeroed_page(GFP_ATOMIC); + *phys = __pa(adr); + return adr; + } + if (pfn >= end_pfn) panic("alloc_low_page: ran out of memory"); for (i = 0; temp_mappings[i].allocated; i++) { @@ -198,55 +206,95 @@ ti->allocated = 1; __flush_tlb(); adr = ti->address + ((pfn << PAGE_SHIFT) & ~PMD_MASK); + memset(adr, 0, PAGE_SIZE); *index = i; *phys = pfn * PAGE_SIZE; return adr; } -static __init void unmap_low_page(int i) +static __devinit void unmap_low_page(int i) { - struct temp_map *ti = &temp_mappings[i]; + struct temp_map *ti; + + if (after_bootmem) + return; + ti = &temp_mappings[i]; 
set_pmd(ti->pmd, __pmd(0)); ti->allocated = 0; } -static void __init phys_pud_init(pud_t *pud, unsigned long address, unsigned long end) + +static void __devinit +phys_pmd_init(pmd_t *pmd, unsigned long address, unsigned long end) +{ + int i; + + printk("%s: pmd: 0x%p, address: 0x%lx end: 0x%lx\n", + __func__, pmd, address, end); + + for (i = 0; i < PTRS_PER_PMD; pmd++, i++, address += PMD_SIZE) { + unsigned long entry; + + if (address > end) { + for (; i < PTRS_PER_PMD; i++, pmd++) + set_pmd(pmd, __pmd(0)); + break; + } + entry = _PAGE_NX|_PAGE_PSE | _KERNPG_TABLE | _PAGE_GLOBAL | address; + entry &= __supported_pte_mask; + set_pmd(pmd, __pmd(entry)); + } +} + + +static void __devinit +phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end) +{ + pmd_t *pmd = pmd_offset(pud, (unsigned long)__va(address)); + + printk("%s: addr: 0x%lx end: 0x%lx pmd: 0x%p\n", + __func__, address, end, pmd); + + if (pmd_none(*pmd)) { + spin_lock(&init_mm.page_table_lock); + phys_pmd_init(pmd, address, end); + spin_unlock(&init_mm.page_table_lock); + __flush_tlb_all(); + } +} + + + +static void __devinit phys_pud_init(pud_t *pud, unsigned long address, unsigned long end) { - long i, j; + long i = pud_index(address); - i = pud_index(address); pud = pud + i; + + if (after_bootmem && pud_val(*pud)) { + phys_pmd_update(pud, address, end); + return; + } + for (; i < PTRS_PER_PUD; pud++, i++) { int map; unsigned long paddr, pmd_phys; pmd_t *pmd; - paddr = address + i*PUD_SIZE; - if (paddr >= end) { - for (; i < PTRS_PER_PUD; i++, pud++) - set_pud(pud, __pud(0)); + paddr = (address & PGDIR_MASK) + i*PUD_SIZE; + if (paddr >= end) break; - } - if (!e820_mapped(paddr, paddr+PUD_SIZE, 0)) { + if (!after_bootmem && !e820_mapped(paddr, paddr+PUD_SIZE, 0)) { set_pud(pud, __pud(0)); continue; } pmd = alloc_low_page(&map, &pmd_phys); + if (after_bootmem) spin_lock(&init_mm.page_table_lock); set_pud(pud, __pud(pmd_phys | _KERNPG_TABLE)); - for (j = 0; j < PTRS_PER_PMD; pmd++, j++, paddr += PMD_SIZE) { - unsigned long pe; - - if (paddr >= end) { - for (; j < PTRS_PER_PMD; j++, pmd++) - set_pmd(pmd, __pmd(0)); - break; - } - pe = _PAGE_NX|_PAGE_PSE | _KERNPG_TABLE | _PAGE_GLOBAL | paddr; - pe &= __supported_pte_mask; - set_pmd(pmd, __pmd(pe)); - } + phys_pmd_init(pmd, paddr, end); + if (after_bootmem) spin_unlock(&init_mm.page_table_lock); unmap_low_page(map); } __flush_tlb(); @@ -267,12 +315,16 @@ table_start >>= PAGE_SHIFT; table_end = table_start; + + early_printk("kernel direct mapping tables upto %lx @ %lx-%lx\n", end, + table_start< end) next = end; phys_pud_init(pud, __pa(start), __pa(next)); - set_pgd(pgd_offset_k(start), mk_kernel_pgd(pud_phys)); + if (!after_bootmem) + set_pgd(pgd_offset_k(start), mk_kernel_pgd(pud_phys)); unmap_low_page(map); } - asm volatile("movq %%cr4,%0" : "=r" (mmu_cr4_features)); + if (!after_bootmem) + asm volatile("movq %%cr4,%0" : "=r" (mmu_cr4_features)); __flush_tlb_all(); - early_printk("kernel direct mapping tables upto %lx @ %lx-%lx\n", end, - table_start<node_zones + MAX_NR_ZONES - 2; + unsigned long start_pfn = start >> PAGE_SHIFT; + unsigned long nr_pages = size >> PAGE_SHIFT; + int ret; + + ret = __add_pages(zone, start_pfn, nr_pages, attr); + if (ret) + goto error; + + init_memory_mapping(start, (start + size - 1)); + + return ret; +error: + printk("%s: Problem encountered in __add_pages!\n", __func__); + return ret; +} +EXPORT_SYMBOL(add_memory); + +int remove_memory(u64 start, u64 size, unsigned long attr) +{ + struct zone *zone; + unsigned long start_pfn, end_pfn, 
nr_pages; + + printk("%s: start: 0x%llx size: 0x%llx attr: 0x%lx\n", + __func__, start, size, attr); + + start_pfn = start >> PAGE_SHIFT; + nr_pages = size >> PAGE_SHIFT; + /* end_pfn is the last *valid* pfn */ + end_pfn = start_pfn + nr_pages - 1; + + zone = page_zone(pfn_to_page(start_pfn)); + + printk("%s: memory will be removed from the %s zone\n", + __func__, zone->name); + printk("%s: start_pfn: 0x%lx nr_pages: 0x%lx end_pfn: 0x%lx\n", + __func__, start_pfn, nr_pages, end_pfn); + + if (zone != page_zone(pfn_to_page(end_pfn))) + goto overlap; + + printk("%s: just before remove pages\n", __func__); + + return __remove_pages(zone, start_pfn, nr_pages, attr); +overlap: + printk("%s: memory range overlaps multiple zones?\n", __func__); + return -ENOSYS; +} +EXPORT_SYMBOL(remove_memory); + +#endif static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel, kcore_modules, kcore_vsyscall; --- linux.orig/drivers/acpi/acpi_memhotplug.c~FROM-MM-memory-hotplug-move-section_mem_map-alloc-to-sparsec-fix 2005-09-30 12:37:57.000000000 -0700 +++ linux/drivers/acpi/acpi_memhotplug.c 2005-09-30 12:38:04.000000000 -0700 @@ -30,6 +30,7 @@ #include #include #include +#include #include #define ACPI_MEMORY_DEVICE_COMPONENT 0x08000000UL @@ -180,6 +181,19 @@ return_VALUE(0); } +static acpi_status acpi_memory_set_name(struct acpi_memory_device *mem_device) +{ + struct acpi_device *device = NULL; + acpi_status status; + int ret; + status = acpi_bus_get_device(mem_device->handle, &device); + if (ACPI_FAILURE(status)) + return status; + ret = attach_device_to_memsection(mem_device->start_addr, + mem_device->end_addr, &device->kobj); + return_VALUE(ret); +} + static int acpi_memory_enable_device(struct acpi_memory_device *mem_device) { int result; @@ -200,14 +214,14 @@ * Note: Assume that this function returns zero on success */ result = add_memory(mem_device->start_addr, - (mem_device->end_addr - mem_device->start_addr) + 1, - mem_device->read_write_attribute); + (mem_device->end_addr - mem_device->start_addr) + 1); if (result) { ACPI_DEBUG_PRINT((ACPI_DB_ERROR, "\nadd_memory failed\n")); mem_device->state = MEMORY_INVALID_STATE; return result; } - + /* link to /sys/devices/system/memory/memoryX */ + result = acpi_memory_set_name(mem_device); return result; } @@ -259,7 +273,7 @@ * Ask the VM to offline this memory range. 
* Note: Assume that this function returns zero on success */ - result = remove_memory(start, len, attr); + result = remove_memory(start, len); if (result) { ACPI_DEBUG_PRINT((ACPI_DB_ERROR, "Hot-Remove failed.\n")); return_VALUE(result); @@ -473,6 +487,21 @@ return_ACPI_STATUS(status); } +static acpi_status __init acpi_memory_set_name_cb(acpi_handle handle, u32 level, + void *ctxt, void **retv) +{ + acpi_status status; + struct acpi_memory_device *mem_device; + status = is_memory_device(handle); + if (ACPI_FAILURE(status)) + return_ACPI_STATUS(AE_OK); + if (acpi_memory_get_device(handle, &mem_device)) { + ACPI_DEBUG_PRINT((ACPI_DB_ERROR, + "Error in finding driver data\n")); + } + return acpi_memory_set_name(mem_device); +} + static int __init acpi_memory_device_init(void) { int result; @@ -496,6 +525,16 @@ return_VALUE(-ENODEV); } + status = acpi_walk_namespace(ACPI_TYPE_DEVICE, ACPI_ROOT_OBJECT, + ACPI_UINT32_MAX, + acpi_memory_set_name_cb, + NULL, NULL); + if (ACPI_FAILURE(status)) { + ACPI_DEBUG_PRINT((ACPI_DB_ERROR, "walk_namespace failed\n")); + acpi_bus_unregister_driver(&acpi_memory_device_driver); + return_VALUE(-ENODEV); + } + return_VALUE(0); } --- linux.orig/drivers/acpi/processor_idle.c~A9.2-acpi-warnings 2005-09-30 12:38:08.000000000 -0700 +++ linux/drivers/acpi/processor_idle.c 2005-09-30 12:38:08.000000000 -0700 @@ -910,7 +910,7 @@ if (!pr) goto end; - seq_printf(seq, "active state: C%zd\n" + seq_printf(seq, "active state: C%d\n" "max_cstate: C%d\n" "bus master activity: %08x\n", pr->power.state ? pr->power.state - pr->power.states : 0, @@ -944,14 +944,14 @@ } if (pr->power.states[i].promotion.state) - seq_printf(seq, "promotion[C%zd] ", + seq_printf(seq, "promotion[C%d] ", (pr->power.states[i].promotion.state - pr->power.states)); else seq_puts(seq, "promotion[--] "); if (pr->power.states[i].demotion.state) - seq_printf(seq, "demotion[C%zd] ", + seq_printf(seq, "demotion[C%d] ", (pr->power.states[i].demotion.state - pr->power.states)); else --- linux.orig/drivers/base/Makefile~FROM-MM-memory-hotplug-sysfs-and-add-remove-functions 2005-09-30 12:37:55.000000000 -0700 +++ linux/drivers/base/Makefile 2005-09-30 12:37:55.000000000 -0700 @@ -7,6 +7,7 @@ obj-y += power/ obj-$(CONFIG_FW_LOADER) += firmware_class.o obj-$(CONFIG_NUMA) += node.o +obj-$(CONFIG_MEMORY_HOTPLUG) += memory.o ifeq ($(CONFIG_DEBUG_DRIVER),y) EXTRA_CFLAGS += -DDEBUG --- linux.orig/drivers/base/init.c~FROM-MM-memory-hotplug-sysfs-and-add-remove-functions 2005-09-30 12:37:55.000000000 -0700 +++ linux/drivers/base/init.c 2005-09-30 12:37:55.000000000 -0700 @@ -9,6 +9,7 @@ #include #include +#include extern int devices_init(void); extern int buses_init(void); @@ -39,5 +40,6 @@ platform_bus_init(); system_bus_init(); cpu_dev_init(); + memory_dev_init(); attribute_container_init(); } --- /dev/null 2005-03-30 22:36:15.000000000 -0800 +++ linux/drivers/base/memory.c 2005-09-30 12:39:27.000000000 -0700 @@ -0,0 +1,498 @@ +/* + * drivers/base/memory.c - basic Memory class support + * + * Written by Matt Tolentino + * Dave Hansen + * + * This file provides the necessary infrastructure to represent + * a SPARSEMEM-memory-model system's physical memory in /sysfs. + * All arch-independent code that assumes MEMORY_HOTPLUG requires + * SPARSEMEM should be contained here, or in mm/memory_hotplug.c. 
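+ *
+ * (Editor's sketch, illustrative only and not part of the patch: the class
+ *  registered below shows up in sysfs roughly as
+ *
+ *	/sys/devices/system/memory/block_size_bytes
+ *	/sys/devices/system/memory/probe           (ARCH_MEMORY_PROBE only)
+ *	/sys/devices/system/memory/memoryN/phys_index
+ *	/sys/devices/system/memory/memoryN/state
+ *
+ *  with one memoryN directory per mem_section, where N == __section_nr().)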
+ */ + +#include +#include +#include +#include /* capable() */ +#include +#include +#include +#include +#include +#include +#include +#include + +#define MEMORY_CLASS_NAME "memory" + +static struct sysdev_class memory_sysdev_class = { + set_kset_name(MEMORY_CLASS_NAME), +}; +EXPORT_SYMBOL(memory_sysdev_class); + +static const char *memory_hotplug_name(struct kset *kset, struct kobject *kobj) +{ + return MEMORY_CLASS_NAME; +} + +static int memory_hotplug(struct kset *kset, struct kobject *kobj, char **envp, + int num_envp, char *buffer, int buffer_size) +{ + int retval = 0; + + return retval; +} + +static struct kset_hotplug_ops memory_hotplug_ops = { + .name = memory_hotplug_name, + .hotplug = memory_hotplug, +}; + +static struct notifier_block *memory_chain; + +int register_memory_notifier(struct notifier_block *nb) +{ + return notifier_chain_register(&memory_chain, nb); +} + +void unregister_memory_notifier(struct notifier_block *nb) +{ + notifier_chain_unregister(&memory_chain, nb); +} + +/* + * register_memory - Setup a sysfs device for a memory block + */ +int register_memory(struct memory_block *memory, struct mem_section *section, + struct node *root) +{ + int error; + + memory->sysdev.cls = &memory_sysdev_class; + memory->sysdev.id = __section_nr(section); + + error = sysdev_register(&memory->sysdev); + + if (root && !error) + error = sysfs_create_link(&root->sysdev.kobj, + &memory->sysdev.kobj, + kobject_name(&memory->sysdev.kobj)); + + return error; +} + +static void +unregister_memory(struct memory_block *memory, struct mem_section *section, + struct node *root) +{ + BUG_ON(memory->sysdev.cls != &memory_sysdev_class); + BUG_ON(memory->sysdev.id != __section_nr(section)); + + sysdev_unregister(&memory->sysdev); + if (root) + sysfs_remove_link(&root->sysdev.kobj, + kobject_name(&memory->sysdev.kobj)); +} + +/* + * use this as the physical section index that this memsection + * uses. + */ + +static ssize_t show_mem_phys_index(struct sys_device *dev, char *buf) +{ + struct memory_block *mem = + container_of(dev, struct memory_block, sysdev); + return sprintf(buf, "%08lx\n", mem->phys_index); +} + +/* + * online, offline, going offline, etc. + */ +static ssize_t show_mem_state(struct sys_device *dev, char *buf) +{ + struct memory_block *mem = + container_of(dev, struct memory_block, sysdev); + ssize_t len = 0; + + /* + * We can probably put these states in a nice little array + * so that they're not open-coded + */ + switch (mem->state) { + case MEM_ONLINE: + len = sprintf(buf, "online\n"); + break; + case MEM_OFFLINE: + len = sprintf(buf, "offline\n"); + break; + case MEM_GOING_OFFLINE: + len = sprintf(buf, "going-offline\n"); + break; + default: + len = sprintf(buf, "ERROR-UNKNOWN-%ld\n", + mem->state); + WARN_ON(1); + break; + } + + return len; +} + +static inline int memory_notify(unsigned long val, void *v) +{ + return notifier_call_chain(&memory_chain, val, v); +} + +/* + * MEMORY_HOTPLUG depends on SPARSEMEM in mm/Kconfig, so it is + * OK to have direct references to sparsemem variables in here. + */ +static int +memory_block_action(struct memory_block *mem, unsigned long action) +{ + int i; + unsigned long psection; + unsigned long start_pfn, start_paddr; + struct page *first_page; + int ret; + int old_state = mem->state; + + psection = mem->phys_index; + first_page = pfn_to_page(psection << PFN_SECTION_SHIFT); + + /* + * The probe routines leave the pages reserved, just + * as the bootmem code does. Make sure they're still + * that way. 
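+ *
+ * (Editor's note, illustrative only: this function is normally reached
+ *  from store_mem_state() below, i.e. when userspace writes the per-block
+ *  state file, e.g.
+ *	echo online  > /sys/devices/system/memory/memoryN/state
+ *	echo offline > /sys/devices/system/memory/memoryN/state
+ *  or from a kernel caller via memory_block_change_state().)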
+ */ + if (action == MEM_ONLINE) { + for (i = 0; i < PAGES_PER_SECTION; i++) { + if (PageReserved(first_page+i)) + continue; + + printk(KERN_WARNING "section number %ld page number %d " + "not reserved, was it already online? \n", + psection, i); + return -EBUSY; + } + } + + switch (action) { + case MEM_ONLINE: + start_pfn = page_to_pfn(first_page); + ret = online_pages(start_pfn, PAGES_PER_SECTION); + break; + case MEM_OFFLINE: + mem->state = MEM_GOING_OFFLINE; + memory_notify(MEM_GOING_OFFLINE, NULL); + start_paddr = page_to_pfn(first_page) << PAGE_SHIFT; + ret = remove_memory(start_paddr, + PAGES_PER_SECTION << PAGE_SHIFT); + if (ret) { + mem->state = old_state; + break; + } + memory_notify(MEM_MAPPING_INVALID, NULL); + break; + default: + printk(KERN_WARNING "%s(%p, %ld) unknown action: %ld\n", + __FUNCTION__, mem, action, action); + WARN_ON(1); + ret = -EINVAL; + } + /* + * For now, only notify on successful memory operations + */ + if (!ret) + memory_notify(action, NULL); + + return ret; +} + +static int memory_block_change_state(struct memory_block *mem, + unsigned long to_state, unsigned long from_state_req) +{ + int ret = 0; + down(&mem->state_sem); + + if (mem->state != from_state_req) { + ret = -EINVAL; + goto out; + } + + ret = memory_block_action(mem, to_state); + if (!ret) + mem->state = to_state; + +out: + up(&mem->state_sem); + return ret; +} + +static ssize_t +store_mem_state(struct sys_device *dev, const char *buf, size_t count) +{ + struct memory_block *mem; + unsigned int phys_section_nr; + int ret = -EINVAL; + + mem = container_of(dev, struct memory_block, sysdev); + phys_section_nr = mem->phys_index; + + if (!valid_section_nr(phys_section_nr)) + goto out; + + if (!strncmp(buf, "online", min((int)count, 6))) + ret = memory_block_change_state(mem, MEM_ONLINE, MEM_OFFLINE); + else if(!strncmp(buf, "offline", min((int)count, 7))) + ret = memory_block_change_state(mem, MEM_OFFLINE, MEM_ONLINE); +out: + if (ret) + return ret; + return count; +} + + +static SYSDEV_ATTR(phys_index, 0444, show_mem_phys_index, NULL); +static SYSDEV_ATTR(state, 0644, show_mem_state, store_mem_state); + + +#define mem_create_simple_file(mem, attr_name) \ + sysdev_create_file(&mem->sysdev, &attr_##attr_name) +#define mem_remove_simple_file(mem, attr_name) \ + sysdev_remove_file(&mem->sysdev, &attr_##attr_name) + +/* + * Block size attribute stuff + */ +static ssize_t +print_block_size(struct class *class, char *buf) +{ + return sprintf(buf, "%lx\n", (unsigned long)PAGES_PER_SECTION * PAGE_SIZE); +} + +static CLASS_ATTR(block_size_bytes, 0444, print_block_size, NULL); + +static int block_size_init(void) +{ + sysfs_create_file(&memory_sysdev_class.kset.kobj, + &class_attr_block_size_bytes.attr); + return 0; +} + +/* + * Some architectures will have custom drivers to do this, and + * will not need to do it from userspace. The fake hot-add code + * as well as ppc64 will do all of their discovery in userspace + * and will require this interface. 
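+ *
+ * (Editor's illustration, not part of the patch: with ARCH_MEMORY_PROBE
+ *  set, userspace hot-adds one section's worth of memory by writing its
+ *  physical start address, e.g.
+ *	echo 0x40000000 > /sys/devices/system/memory/probe
+ *  memory_probe_store() below then calls
+ *	add_memory(phys_addr, PAGES_PER_SECTION << PAGE_SHIFT);
+ *  the address 0x40000000 is only an example value.)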
+ */ +extern int page_is_hotpluggable_ram(unsigned long pfn); +#ifdef CONFIG_ARCH_MEMORY_PROBE +static ssize_t +memory_probe_store(struct class *class, const char __user *buf, size_t count) +{ + u64 phys_addr; + unsigned long offset; + int ret; + + phys_addr = simple_strtoull(buf, NULL, 0); + + for (offset = 0; offset < PAGES_PER_SECTION; offset++) { + unsigned long page_nr = (phys_addr >> PAGE_SHIFT) + offset; + if (page_is_hotpluggable_ram(page_nr)) + break; + } + if (offset == PAGES_PER_SECTION) + return -EINVAL; + + ret = add_memory(phys_addr, PAGES_PER_SECTION << PAGE_SHIFT); + + if (ret) + count = ret; + + return count; +} +static CLASS_ATTR(probe, 0700, NULL, memory_probe_store); + +static int memory_probe_init(void) +{ + sysfs_create_file(&memory_sysdev_class.kset.kobj, + &class_attr_probe.attr); + return 0; +} +#else +#define memory_probe_init(...) do {} while (0) +#endif + +static int attach_phys_device(struct memory_block *mem, struct kobject *kobj) +{ + char name[24]; + int ret; + sprintf(name, "phys_device%d",mem->phys_device); + ret = sysfs_create_link(&mem->sysdev.kobj, kobj, name); + if (ret) + return ret; + mem->phys_device++; + return 0; +} + +static void remove_all_phys_device(struct memory_block *mem) +{ + char name[24]; + int i; + for (i = 0; i < mem->phys_device; i++) { + sprintf(name, "phys_device%d",i); + sysfs_remove_link(&mem->sysdev.kobj, name); + } + mem->phys_device = 0; +} + +/* + * kobj is a kobject of physical memory device which includes the specified range + * It is here to allow for differentiation between which *physical* devices each + * section belongs to... + * If kobj != NULL, symbolic link to device from mem_section is created. + */ + +static int add_memory_block(unsigned long node_id, struct mem_section *section, + unsigned long state, struct kobject *kobj) +{ + struct memory_block *mem = kzalloc(sizeof(*mem), GFP_KERNEL); + int ret = 0; + + if (!mem) + return -ENOMEM; + + mem->phys_index = __section_nr(section); + mem->state = state; + init_MUTEX(&mem->state_sem); + + ret = register_memory(mem, section, NULL); + if (!ret) + ret = mem_create_simple_file(mem, phys_index); + if (!ret) + ret = mem_create_simple_file(mem, state); + if (!ret && kobj) + ret = attach_phys_device(mem, kobj); + + return ret; +} + +/* + * For now, we have a linear search to go find the appropriate + * memory_block corresponding to a particular phys_index. If + * this gets to be a real problem, we can always use a radix + * tree or something here. + * + * This could be made generic for all sysdev classes. 
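+ *
+ * (Editor's note, illustrative: the search key is simply the kobject name,
+ *  so for, say, section number 42 the code below builds "memory42" and
+ *  hands it to kset_find_obj(); 42 is an arbitrary example.)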
+ */ +static struct memory_block *find_memory_block(struct mem_section *section) +{ + struct kobject *kobj; + struct sys_device *sysdev; + struct memory_block *mem; + char name[sizeof(MEMORY_CLASS_NAME) + 9 + 1]; + + /* + * This only works because we know that section == sysdev->id + * slightly redundant with sysdev_register() + */ + sprintf(&name[0], "%s%d", MEMORY_CLASS_NAME, __section_nr(section)); + + kobj = kset_find_obj(&memory_sysdev_class.kset, name); + if (!kobj) + return NULL; + + sysdev = container_of(kobj, struct sys_device, kobj); + mem = container_of(sysdev, struct memory_block, sysdev); + + return mem; +} + +static struct memory_block *pfn_to_memory_block(unsigned long pfn) +{ + struct mem_section *section; + section = __nr_to_section(pfn_to_section_nr(pfn)); + return find_memory_block(section); +} + +int remove_memory_block(unsigned long node_id, struct mem_section *section) +{ + struct memory_block *mem; + + mem = find_memory_block(section); + mem_remove_simple_file(mem, phys_index); + mem_remove_simple_file(mem, state); + if (mem->phys_device) + remove_all_phys_device(mem); + unregister_memory(mem, section, NULL); + + return 0; +} + +/* + * creating symbolic link from mem_section[] in specified address range + * to specified device. This device here is expected to be physical memory device. + * This symbolic link will be used to show relationship between mem_section and device. + */ +int attach_device_to_memsection(u64 start_addr, u64 end_addr, struct kobject *kobj) +{ + unsigned long pfn = start_addr >> PAGE_SHIFT; + unsigned long end_pfn = end_addr >> PAGE_SHIFT; + struct memory_block *mem; + int ret = 0; + for (; pfn < end_pfn; pfn += PAGES_PER_SECTION) { + mem = pfn_to_memory_block(pfn); + if (mem) { + ret = attach_phys_device(mem, kobj); + if (ret) + break; + } + } + return ret; +} + +/* + * need an interface for the VM to add new memory regions, + * but without onlining it. + */ +int register_new_memory(struct mem_section *section) +{ + return add_memory_block(0, section, MEM_OFFLINE, NULL); +} + +int unregister_memory_section(struct mem_section *section) +{ + if (!valid_section(section)) + return -EINVAL; + + return remove_memory_block(0, section); +} + +/* + * Initialize the sysfs support for memory devices... 
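+ *
+ * (Editor's note: this is wired into the driver core init path -- the
+ *  drivers/base/init.c change earlier in this series adds a
+ *  memory_dev_init() call right after cpu_dev_init() -- so every section
+ *  that is valid at boot gets registered below in the MEM_ONLINE state.)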
+ */ +int __init memory_dev_init(void) +{ + unsigned int i; + int ret; + + memory_sysdev_class.kset.hotplug_ops = &memory_hotplug_ops; + ret = sysdev_class_register(&memory_sysdev_class); + + /* + * Create entries for memory sections that were found + * during boot and have been initialized + */ + for (i = 0; i < NR_MEM_SECTIONS; i++) { + if (!valid_section_nr(i)) + continue; + add_memory_block(0, __nr_to_section(i), MEM_ONLINE, 0); + } + + memory_probe_init(); + block_size_init(); + + return ret; +} --- linux.orig/drivers/scsi/scsi_scan.c~A9.3-uninit-scsi_report_lun_scan-result 2005-09-30 12:38:09.000000000 -0700 +++ linux/drivers/scsi/scsi_scan.c 2005-09-30 12:38:09.000000000 -0700 @@ -1063,7 +1063,7 @@ unsigned int lun; unsigned int num_luns; unsigned int retries; - int result; + int result = 0; struct scsi_lun *lunp, *lun_data; u8 *data; struct scsi_sense_hdr sshdr; --- linux.orig/include/asm-alpha/mmzone.h~A4.1-antonb-Remove_kvaddr_to_nid_and_local_mapnr 2005-09-30 12:38:06.000000000 -0700 +++ linux/include/asm-alpha/mmzone.h 2005-09-30 12:38:06.000000000 -0700 @@ -32,8 +32,6 @@ #define pa_to_nid(pa) alpha_pa_to_nid(pa) #define NODE_DATA(nid) (&node_data[(nid)]) -#define node_localnr(pfn, nid) ((pfn) - NODE_DATA(nid)->node_start_pfn) - #if 1 #define PLAT_NODE_DATA_LOCALNR(p, n) \ (((p) >> PAGE_SHIFT) - PLAT_NODE_DATA(n)->gendata.node_start_pfn) @@ -49,28 +47,14 @@ #ifdef CONFIG_DISCONTIGMEM +#define node_localnr(pfn, nid) ((pfn) - NODE_DATA(nid)->node_start_pfn) + /* * Following are macros that each numa implementation must define. */ -/* - * Given a kernel address, find the home node of the underlying memory. - */ -#define kvaddr_to_nid(kaddr) pa_to_nid(__pa(kaddr)) #define node_start_pfn(nid) (NODE_DATA(nid)->node_start_pfn) -#define local_mapnr(kvaddr) \ - ((__pa(kvaddr) >> PAGE_SHIFT) - node_start_pfn(kvaddr_to_nid(kvaddr))) - -/* - * Given a kaddr, LOCAL_BASE_ADDR finds the owning node of the memory - * and returns the kaddr corresponding to first physical page in the - * node's mem_map. - */ -#define LOCAL_BASE_ADDR(kaddr) \ - ((unsigned long)__va(NODE_DATA(kvaddr_to_nid(kaddr))->node_start_pfn \ - << PAGE_SHIFT)) - /* XXX: FIXME -- wli */ #define kern_addr_valid(kaddr) (0) @@ -106,8 +90,9 @@ #define pfn_to_page(pfn) \ ({ \ - unsigned long kaddr = (unsigned long)__va((pfn) << PAGE_SHIFT); \ - (NODE_DATA(kvaddr_to_nid(kaddr))->node_mem_map + local_mapnr(kaddr)); \ + unsigned long __tmp = pfn; \ + (NODE_DATA(pfn_to_nid(__tmp))->node_mem_map + \ + node_localnr(__tmp, pfn_to_nid(__tmp))); \ }) #define page_to_pfn(page) \ --- linux.orig/include/asm-i386/mmzone.h~FROM-MM-memory-hotplug-prep-kill-local_mapnr 2005-09-30 12:37:50.000000000 -0700 +++ linux/include/asm-i386/mmzone.h 2005-09-30 12:38:13.000000000 -0700 @@ -38,10 +38,15 @@ } extern int early_pfn_to_nid(unsigned long pfn); - +extern void __init remap_numa_kva(void); +extern unsigned long calculate_numa_remap_pages(void); +extern void setup_numa_kva_remap(void); #else /* !CONFIG_NUMA */ #define get_memcfg_numa get_memcfg_numa_flat #define get_zholes_size(n) (0) +#define remap_numa_kva() do {} while (0) +#define setup_numa_kva_remap() do {} while (0) +#define calculate_numa_remap_pages() (0) #endif /* CONFIG_NUMA */ #ifdef CONFIG_DISCONTIGMEM @@ -76,11 +81,6 @@ * Following are macros that each numa implmentation must define. */ -/* - * Given a kernel address, find the home node of the underlying memory. 
- */ -#define kvaddr_to_nid(kaddr) pfn_to_nid(__pa(kaddr) >> PAGE_SHIFT) - #define node_start_pfn(nid) (NODE_DATA(nid)->node_start_pfn) #define node_end_pfn(nid) \ ({ \ @@ -88,12 +88,6 @@ __pgdat->node_start_pfn + __pgdat->node_spanned_pages; \ }) -#define local_mapnr(kvaddr) \ -({ \ - unsigned long __pfn = __pa(kvaddr) >> PAGE_SHIFT; \ - (__pfn - node_start_pfn(pfn_to_nid(__pfn))); \ -}) - /* XXX: FIXME -- wli */ #define kern_addr_valid(kaddr) (0) --- linux.orig/include/asm-i386/page.h~F0-create-__boot-i386 2005-09-30 12:38:22.000000000 -0700 +++ linux/include/asm-i386/page.h 2005-09-30 12:38:22.000000000 -0700 @@ -122,8 +122,10 @@ #define PAGE_OFFSET ((unsigned long)__PAGE_OFFSET) #define VMALLOC_RESERVE ((unsigned long)__VMALLOC_RESERVE) #define MAXMEM (-__PAGE_OFFSET-__VMALLOC_RESERVE) -#define __pa(x) ((unsigned long)(x)-PAGE_OFFSET) -#define __va(x) ((void *)((unsigned long)(x)+PAGE_OFFSET)) +#define __boot_pa(x) ((unsigned long)(x)-PAGE_OFFSET) +#define __boot_va(x) ((void *)((unsigned long)(x)+PAGE_OFFSET)) +#define __pa(x) __boot_pa(x) +#define __va(x) __boot_va(x) #define pfn_to_kaddr(pfn) __va((pfn) << PAGE_SHIFT) #ifdef CONFIG_FLATMEM #define pfn_to_page(pfn) (mem_map + (pfn)) --- linux.orig/include/asm-i386/pgtable-3level.h~B2.2-i386-create-numa.c 2005-09-30 12:38:13.000000000 -0700 +++ linux/include/asm-i386/pgtable-3level.h 2005-09-30 12:38:13.000000000 -0700 @@ -65,6 +65,7 @@ set_64bit((unsigned long long *)(pmdptr),pmd_val(pmdval)) #define set_pud(pudptr,pudval) \ (*(pudptr) = (pudval)) +extern void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags); /* * Pentium-II erratum A13: in PAE mode we explicitly have to flush --- linux.orig/include/asm-i386/sparsemem.h~E6-for-debugging-more-FLAGS_RESERVED 2005-09-30 12:38:21.000000000 -0700 +++ linux/include/asm-i386/sparsemem.h 2005-09-30 12:38:21.000000000 -0700 @@ -15,7 +15,7 @@ * MAX_PHYSMEM_BITS 2^N: how much memory we can have in that space */ #ifdef CONFIG_X86_PAE -#define SECTION_SIZE_BITS 30 +#define SECTION_SIZE_BITS 28 #define MAX_PHYSADDR_BITS 36 #define MAX_PHYSMEM_BITS 36 #else --- linux.orig/include/asm-m32r/mmzone.h~FROM-MM-memory-hotplug-prep-kill-local_mapnr 2005-09-30 12:37:50.000000000 -0700 +++ linux/include/asm-m32r/mmzone.h 2005-09-30 12:37:50.000000000 -0700 @@ -21,12 +21,6 @@ __pgdat->node_start_pfn + __pgdat->node_spanned_pages - 1; \ }) -#define local_mapnr(kvaddr) \ -({ \ - unsigned long __pfn = __pa(kvaddr) >> PAGE_SHIFT; \ - (__pfn - node_start_pfn(pfn_to_nid(__pfn))); \ -}) - #define pfn_to_page(pfn) \ ({ \ unsigned long __pfn = pfn; \ --- linux.orig/include/asm-mips/mmzone.h~A4.1-antonb-Remove_kvaddr_to_nid_and_local_mapnr 2005-09-30 12:38:06.000000000 -0700 +++ linux/include/asm-mips/mmzone.h 2005-09-30 12:38:06.000000000 -0700 @@ -10,7 +10,6 @@ #ifdef CONFIG_DISCONTIGMEM -#define kvaddr_to_nid(kvaddr) pa_to_nid(__pa(kvaddr)) #define pfn_to_nid(pfn) pa_to_nid((pfn) << PAGE_SHIFT) #define pfn_valid(pfn) \ --- linux.orig/include/asm-parisc/mmzone.h~FROM-MM-memory-hotplug-prep-kill-local_mapnr 2005-09-30 12:37:50.000000000 -0700 +++ linux/include/asm-parisc/mmzone.h 2005-09-30 12:38:06.000000000 -0700 @@ -14,11 +14,6 @@ #define NODE_DATA(nid) (&node_data[nid].pg_data) -/* - * Given a kernel address, find the home node of the underlying memory. 
- */ -#define kvaddr_to_nid(kaddr) pfn_to_nid(__pa(kaddr) >> PAGE_SHIFT) - #define node_start_pfn(nid) (NODE_DATA(nid)->node_start_pfn) #define node_end_pfn(nid) \ ({ \ @@ -27,12 +22,6 @@ }) #define node_localnr(pfn, nid) ((pfn) - node_start_pfn(nid)) -#define local_mapnr(kvaddr) \ -({ \ - unsigned long __pfn = __pa(kvaddr) >> PAGE_SHIFT; \ - (__pfn - node_start_pfn(pfn_to_nid(__pfn))); \ -}) - #define pfn_to_page(pfn) \ ({ \ unsigned long __pfn = (pfn); \ --- linux.orig/include/asm-powerpc/topology.h~A4.2-antonb-ppc64-_use_generic_nr_cpus_node 2005-09-30 12:38:06.000000000 -0700 +++ linux/include/asm-powerpc/topology.h 2005-09-30 12:38:07.000000000 -0700 @@ -9,15 +9,7 @@ static inline int cpu_to_node(int cpu) { - int node; - - node = numa_cpu_lookup_table[cpu]; - -#ifdef DEBUG_NUMA - BUG_ON(node == -1); -#endif - - return node; + return numa_cpu_lookup_table[cpu]; } #define parent_node(node) (node) @@ -37,8 +29,6 @@ #define pcibus_to_node(node) (-1) #define pcibus_to_cpumask(bus) (cpu_online_map) -#define nr_cpus_node(node) (nr_cpus_in_node[node]) - /* sched_domains SD_NODE_INIT for PPC64 machines */ #define SD_NODE_INIT (struct sched_domain) { \ .span = CPU_MASK_NONE, \ --- linux.orig/include/asm-ppc64/abs_addr.h~G1-kravetz-ppc64-fixes-static_inlines 2005-09-30 12:38:27.000000000 -0700 +++ linux/include/asm-ppc64/abs_addr.h 2005-09-30 12:38:27.000000000 -0700 @@ -62,5 +62,14 @@ /* Convenience macros */ #define virt_to_abs(va) phys_to_abs(__pa(va)) #define abs_to_virt(aa) __va(aa) +static inline unsigned long boot_virt_to_abs(unsigned long va) +{ + return phys_to_abs(__boot_pa(va)); +} +static inline void *boot_abs_to_virt(unsigned long aa) +{ + return __boot_va(aa); +} + #endif /* _ABS_ADDR_H */ --- linux.orig/include/asm-ppc64/dma.h~G0-ppc64-__boot-fixes 2005-09-30 12:38:26.000000000 -0700 +++ linux/include/asm-ppc64/dma.h 2005-09-30 12:38:26.000000000 -0700 @@ -26,6 +26,8 @@ /* The maximum address that we can perform a DMA transfer to on this platform */ /* Doesn't really apply... */ #define MAX_DMA_ADDRESS (~0UL) +#define MAX_DMA_PHYSADDR MAX_DMA_ADDRESS +#define MAX_DMA_PHYSADDR MAX_DMA_ADDRESS #if !defined(CONFIG_PPC_ISERIES) || defined(CONFIG_PCI) --- linux.orig/include/asm-ppc64/mmzone.h~FROM-MM-memory-hotplug-prep-kill-local_mapnr 2005-09-30 12:37:50.000000000 -0700 +++ linux/include/asm-ppc64/mmzone.h 2005-09-30 12:38:07.000000000 -0700 @@ -8,15 +8,14 @@ #define _ASM_MMZONE_H_ #include -#include -/* generic non-linear memory support: +/* + * generic non-linear memory support: * * 1) we will not split memory into more chunks than will fit into the * flags field of the struct page */ - #ifdef CONFIG_NEED_MULTIPLE_NODES extern struct pglist_data *node_data[]; @@ -30,35 +29,7 @@ */ extern int numa_cpu_lookup_table[]; -extern char *numa_memory_lookup_table; extern cpumask_t numa_cpumask_lookup_table[]; -extern int nr_cpus_in_node[]; - -/* 16MB regions */ -#define MEMORY_INCREMENT_SHIFT 24 -#define MEMORY_INCREMENT (1UL << MEMORY_INCREMENT_SHIFT) - -/* NUMA debugging, will not work on a DLPAR machine */ -#undef DEBUG_NUMA - -static inline int pa_to_nid(unsigned long pa) -{ - int nid; - - nid = numa_memory_lookup_table[pa >> MEMORY_INCREMENT_SHIFT]; - -#ifdef DEBUG_NUMA - /* the physical address passed in is not in the map for the system */ - if (nid == -1) { - printk("bad address: %lx\n", pa); - BUG(); - } -#endif - - return nid; -} - -#define node_localnr(pfn, nid) ((pfn) - NODE_DATA(nid)->node_start_pfn) /* * Following are macros that each numa implmentation must define. 
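(Editor's aside, illustrative only: with pa_to_nid() and the lookup table gone
on ppc64, early boot code that needs the node for a physical range goes
through the early_pfn_to_nid() function declared below instead, as the
reworked ppc64 bootmem reserved-region scan earlier in this series does:

	int nid = early_pfn_to_nid(physbase >> PAGE_SHIFT);

"physbase" here stands for whatever physical address the caller has in hand.)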
@@ -67,42 +38,10 @@ #define node_start_pfn(nid) (NODE_DATA(nid)->node_start_pfn) #define node_end_pfn(nid) (NODE_DATA(nid)->node_end_pfn) -#define local_mapnr(kvaddr) \ - ( (__pa(kvaddr) >> PAGE_SHIFT) - node_start_pfn(kvaddr_to_nid(kvaddr)) - -#ifdef CONFIG_DISCONTIGMEM - -/* - * Given a kernel address, find the home node of the underlying memory. - */ -#define kvaddr_to_nid(kaddr) pa_to_nid(__pa(kaddr)) - -#define pfn_to_nid(pfn) pa_to_nid((unsigned long)(pfn) << PAGE_SHIFT) - -/* Written this way to avoid evaluating arguments twice */ -#define discontigmem_pfn_to_page(pfn) \ -({ \ - unsigned long __tmp = pfn; \ - (NODE_DATA(pfn_to_nid(__tmp))->node_mem_map + \ - node_localnr(__tmp, pfn_to_nid(__tmp))); \ -}) - -#define discontigmem_page_to_pfn(p) \ -({ \ - struct page *__tmp = p; \ - (((__tmp) - page_zone(__tmp)->zone_mem_map) + \ - page_zone(__tmp)->zone_start_pfn); \ -}) - -/* XXX fix for discontiguous physical memory */ -#define discontigmem_pfn_valid(pfn) ((pfn) < num_physpages) - -#endif /* CONFIG_DISCONTIGMEM */ - #endif /* CONFIG_NEED_MULTIPLE_NODES */ #ifdef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID -#define early_pfn_to_nid(pfn) pa_to_nid(((unsigned long)pfn) << PAGE_SHIFT) +extern int __init early_pfn_to_nid(unsigned long pfn); #endif #endif /* _ASM_MMZONE_H_ */ --- linux.orig/include/asm-ppc64/page.h~A4.3-antonb-ppc64-convert_to_sparsemem 2005-09-30 12:38:07.000000000 -0700 +++ linux/include/asm-ppc64/page.h 2005-09-30 12:38:22.000000000 -0700 @@ -172,7 +172,10 @@ #endif -#define __pa(x) ((unsigned long)(x)-PAGE_OFFSET) +#define __boot_pa(x) ((unsigned long)(x)-PAGE_OFFSET) +#define __boot_va(x) ((void *)((unsigned long)(x) + KERNELBASE)) +#define __pa(x) __boot_pa(x) +#define __va(x) __boot_va(x) extern int page_is_ram(unsigned long pfn); @@ -206,13 +209,6 @@ #define USER_REGION_ID (0UL) #define REGION_ID(ea) (((unsigned long)(ea)) >> REGION_SHIFT) -#define __va(x) ((void *)((unsigned long)(x) + KERNELBASE)) - -#ifdef CONFIG_DISCONTIGMEM -#define page_to_pfn(page) discontigmem_page_to_pfn(page) -#define pfn_to_page(pfn) discontigmem_pfn_to_page(pfn) -#define pfn_valid(pfn) discontigmem_pfn_valid(pfn) -#endif #ifdef CONFIG_FLATMEM #define pfn_to_page(pfn) (mem_map + (pfn)) #define page_to_pfn(page) ((unsigned long)((page) - mem_map)) --- linux.orig/include/asm-x86_64/mmzone.h~A4.1-antonb-Remove_kvaddr_to_nid_and_local_mapnr 2005-09-30 12:38:06.000000000 -0700 +++ linux/include/asm-x86_64/mmzone.h 2005-09-30 12:38:06.000000000 -0700 @@ -39,7 +39,6 @@ #ifdef CONFIG_DISCONTIGMEM #define pfn_to_nid(pfn) phys_to_nid((unsigned long)(pfn) << PAGE_SHIFT) -#define kvaddr_to_nid(kaddr) phys_to_nid(__pa(kaddr)) /* AK: this currently doesn't deal with invalid addresses. We'll see if the 2.5 kernel doesn't pass them @@ -57,7 +56,5 @@ nid__ != 0xff && (pfn) >= node_start_pfn(nid__) && (pfn) < node_end_pfn(nid__); })) #endif -#define local_mapnr(kvaddr) \ - ( (__pa(kvaddr) >> PAGE_SHIFT) - node_start_pfn(kvaddr_to_nid(kvaddr)) ) #endif #endif --- linux.orig/include/linux/bootmem.h~F4-use-__boot-generic 2005-09-30 12:38:24.000000000 -0700 +++ linux/include/linux/bootmem.h 2005-09-30 12:38:25.000000000 -0700 @@ -10,6 +10,11 @@ #include #include +#ifndef CONFIG_ARCH_HAS_BOOTPA +#define __boot_pa(pa) __pa(pa) +#define __boot_va(pa) __va(va) +#endif + /* * simple boot-time physical memory area allocator. 
*/ @@ -40,6 +45,10 @@ * up searching */ } bootmem_data_t; +#ifndef MAX_DMA_PHYSADDR +#define MAX_DMA_PHYSADDR (__boot_pa(MAX_DMA_ADDRESS)) +#endif + extern unsigned long __init bootmem_bootmap_pages (unsigned long); extern unsigned long __init init_bootmem (unsigned long addr, unsigned long memend); extern void __init free_bootmem (unsigned long addr, unsigned long size); @@ -47,11 +56,11 @@ #ifndef CONFIG_HAVE_ARCH_BOOTMEM_NODE extern void __init reserve_bootmem (unsigned long addr, unsigned long size); #define alloc_bootmem(x) \ - __alloc_bootmem((x), SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS)) + __alloc_bootmem((x), SMP_CACHE_BYTES, MAX_DMA_PHYSADDR) #define alloc_bootmem_low(x) \ __alloc_bootmem((x), SMP_CACHE_BYTES, 0) #define alloc_bootmem_pages(x) \ - __alloc_bootmem((x), PAGE_SIZE, __pa(MAX_DMA_ADDRESS)) + __alloc_bootmem((x), PAGE_SIZE, MAX_DMA_PHYSADDR) #define alloc_bootmem_low_pages(x) \ __alloc_bootmem((x), PAGE_SIZE, 0) #endif /* !CONFIG_HAVE_ARCH_BOOTMEM_NODE */ @@ -64,9 +73,9 @@ extern void * __init __alloc_bootmem_node (pg_data_t *pgdat, unsigned long size, unsigned long align, unsigned long goal); #ifndef CONFIG_HAVE_ARCH_BOOTMEM_NODE #define alloc_bootmem_node(pgdat, x) \ - __alloc_bootmem_node((pgdat), (x), SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS)) + __alloc_bootmem_node((pgdat), (x), SMP_CACHE_BYTES, MAX_DMA_PHYSADDR) #define alloc_bootmem_pages_node(pgdat, x) \ - __alloc_bootmem_node((pgdat), (x), PAGE_SIZE, __pa(MAX_DMA_ADDRESS)) + __alloc_bootmem_node((pgdat), (x), PAGE_SIZE, MAX_DMA_PHYSADDR) #define alloc_bootmem_low_pages_node(pgdat, x) \ __alloc_bootmem_node((pgdat), (x), PAGE_SIZE, 0) #endif /* !CONFIG_HAVE_ARCH_BOOTMEM_NODE */ --- /dev/null 2005-03-30 22:36:15.000000000 -0800 +++ linux/include/linux/memory.h 2005-09-30 12:38:19.000000000 -0700 @@ -0,0 +1,97 @@ +/* + * include/linux/memory.h - generic memory definition + * + * This is mainly for topological representation. We define the + * basic "struct memory_block" here, which can be embedded in per-arch + * definitions or NUMA information. + * + * Basic handling of the devices is done in drivers/base/memory.c + * and system devices are handled in drivers/base/sys.c. + * + * Memory block are exported via sysfs in the class/memory/devices/ + * directory. + * + */ +#ifndef _LINUX_MEMORY_H_ +#define _LINUX_MEMORY_H_ + +#include +#include +#include + +#include + +struct memory_block { + unsigned long phys_index; + unsigned long state; + /* + * This serializes all state change requests. It isn't + * held during creation because the control files are + * created long after the critical areas during + * initialization. + */ + struct semaphore state_sem; + int phys_device; /* num of attached phys_device */ + void *hw; /* optional pointer to fw/hw data */ + int (*phys_callback)(struct memory_block *); + struct sys_device sysdev; +}; + +/* These states are exposed to userspace as text strings in sysfs */ +#define MEM_ONLINE (1<<0) /* exposed to userspace */ +#define MEM_GOING_OFFLINE (1<<1) /* exposed to userspace */ +#define MEM_OFFLINE (1<<2) /* exposed to userspace */ + +/* + * All of these states are currently kernel-internal for notifying + * kernel components and architectures. + * + * For MEM_MAPPING_INVALID, all notifier chains with priority >0 + * are called before pfn_to_page() becomes invalid. The priority=0 + * entry is reserved for the function that actually makes + * pfn_to_page() stop working. Any notifiers that want to be called + * after that should have priority <0. 
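+ *
+ * (Editor's illustration with hypothetical names, not part of the patch:
+ *  a component that must run while pfn_to_page() still works registers
+ *  with a positive priority, e.g.
+ *
+ *	static struct notifier_block early_nb = {
+ *		.notifier_call	= my_mem_event,
+ *		.priority	= 1,
+ *	};
+ *	register_memory_notifier(&early_nb);
+ *
+ *  whereas clean-up that may legitimately run after the mapping is gone
+ *  would use a negative priority.)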
+ */ +#define MEM_MAPPING_INVALID (1<<3) + +#ifndef CONFIG_MEMORY_HOTPLUG +static inline int memory_dev_init(void) +{ + return 0; +} +static inline int register_memory_notifier(struct notifier_block *nb) +{ + return 0; +} +static inline void unregister_memory_notifier(struct notifier_block *nb) +{ +} +#else +extern int register_memory(struct memory_block *, struct mem_section *section, struct node *); +extern int register_new_memory(struct mem_section *); +extern int unregister_memory_section(struct mem_section *); +extern int memory_dev_init(void); +extern int register_memory_notifier(struct notifier_block *nb); +extern void unregister_memory_notifier(struct notifier_block *nb); +/* creating symbolic link between memory device and memory section */ +extern int attach_device_to_memsection(u64 start_addr, u64 end_addr, struct kobject *kobj); + +#define CONFIG_MEM_BLOCK_SIZE (PAGES_PER_SECTION< +#include +#include +#include + +#ifdef CONFIG_MEMORY_HOTPLUG +/* + * pgdat resizing functions + */ +static inline +void pgdat_resize_lock(struct pglist_data *pgdat, unsigned long *flags) +{ + spin_lock_irqsave(&pgdat->node_size_lock, *flags); +} +static inline +void pgdat_resize_unlock(struct pglist_data *pgdat, unsigned long *flags) +{ + spin_unlock_irqrestore(&pgdat->node_size_lock, *flags); +} +static inline +void pgdat_resize_init(struct pglist_data *pgdat) +{ + spin_lock_init(&pgdat->node_size_lock); +} +/* + * Zone resizing functions + */ +static inline unsigned zone_span_seqbegin(struct zone *zone) +{ + return read_seqbegin(&zone->span_seqlock); +} +static inline int zone_span_seqretry(struct zone *zone, unsigned iv) +{ + return read_seqretry(&zone->span_seqlock, iv); +} +static inline void zone_span_writelock(struct zone *zone) +{ + write_seqlock(&zone->span_seqlock); +} +static inline void zone_span_writeunlock(struct zone *zone) +{ + write_sequnlock(&zone->span_seqlock); +} +static inline void zone_seqlock_init(struct zone *zone) +{ + seqlock_init(&zone->span_seqlock); +} +extern int zone_grow_free_lists(struct zone *zone, unsigned long new_nr_pages); +extern int zone_grow_waitqueues(struct zone *zone, unsigned long nr_pages); +extern int add_one_highpage(struct page *page, int pfn, int bad_ppro); +/* need some defines for these for archs that don't support it */ +extern void online_page(struct page *page); +/* VM interface that may be used by firmware interface */ +extern int add_memory(u64 start, u64 size); +extern int remove_memory(u64 start, u64 size); +extern int online_pages(unsigned long, unsigned long); + +/* reasonably generic interface to expand the physical pages in a zone */ +extern int __add_pages(struct zone *zone, unsigned long start_pfn, + unsigned long nr_pages); +#else /* ! 
CONFIG_MEMORY_HOTPLUG */ +/* + * Stub functions for when hotplug is off + */ +static inline void pgdat_resize_lock(struct pglist_data *p, unsigned long *f) {} +static inline void pgdat_resize_unlock(struct pglist_data *p, unsigned long *f) {} +static inline void pgdat_resize_init(struct pglist_data *pgdat) {} + +static inline unsigned zone_span_seqbegin(struct zone *zone) +{ + return 0; +} +static inline int zone_span_seqretry(struct zone *zone, unsigned iv) +{ + return 0; +} +static inline void zone_span_writelock(struct zone *zone) {} +static inline void zone_span_writeunlock(struct zone *zone) {} +static inline void zone_seqlock_init(struct zone *zone) {} + +static inline int mhp_notimplemented(const char *func) +{ + printk(KERN_WARNING "%s() called, with CONFIG_MEMORY_HOTPLUG disabled\n", func); + dump_stack(); + return -ENOSYS; +} + +static inline int __add_pages(struct zone *zone, unsigned long start_pfn, + unsigned long nr_pages) +{ + return mhp_notimplemented(__FUNCTION__); +} +#endif /* ! CONFIG_MEMORY_HOTPLUG */ +static inline int __remove_pages(struct zone *zone, unsigned long start_pfn, + unsigned long nr_pages) +{ + printk(KERN_WARNING "%s() called, not yet supported\n", __FUNCTION__); + dump_stack(); + return -ENOSYS; +} +#endif /* __LINUX_MEMORY_HOTPLUG_H */ --- linux.orig/include/linux/mm.h~FROM-MM-memory-hotplug-sysfs-and-add-remove-functions 2005-09-30 12:37:55.000000000 -0700 +++ linux/include/linux/mm.h 2005-09-30 12:39:32.000000000 -0700 @@ -791,6 +791,9 @@ unsigned long * zones_size, unsigned long zone_start_pfn, unsigned long *zholes_size); extern void memmap_init_zone(unsigned long, int, unsigned long, unsigned long); +extern void zonetable_add(struct zone *, int, int, + unsigned long, unsigned long); +extern void setup_per_zone_pages_min(void); extern void mem_init(void); extern void show_mem(void); extern void si_meminfo(struct sysinfo * val); --- linux.orig/include/linux/mm_inline.h~AA-PM-01-steal_page_from_lru 2005-09-30 12:38:29.000000000 -0700 +++ linux/include/linux/mm_inline.h 2005-09-30 12:38:29.000000000 -0700 @@ -38,3 +38,71 @@ zone->nr_inactive--; } } + +static inline int +isolate_lru_onepage(struct page *page, struct list_head *src, + struct list_head *dst) +{ + if (!TestClearPageLRU(page)) + BUG(); + list_del(&page->lru); + if (get_page_testone(page)) { + /* + * It is being freed elsewhere + */ + __put_page(page); + SetPageLRU(page); + list_add(&page->lru, src); + return 0; + } + list_add(&page->lru, dst); + return 1; +} + + +static inline int +__steal_page_from_lru(struct zone *zone, struct page *page, + struct list_head *dst) +{ + if (PageActive(page)) { + if (!isolate_lru_onepage(page, &zone->active_list, dst)) + return 0; + zone->nr_active--; + } else { + if (!isolate_lru_onepage(page, &zone->inactive_list, dst)) + return 0; + zone->nr_inactive--; + } + return 1; +} + +static inline int +steal_page_from_lru(struct zone *zone, struct page *page, + struct list_head *dst) +{ + int ret; + spin_lock_irq(&zone->lru_lock); + ret = __steal_page_from_lru(zone, page, dst); + spin_unlock_irq(&zone->lru_lock); + return ret; +} + +static inline void +__putback_page_to_lru(struct zone *zone, struct page *page) +{ + if (TestSetPageLRU(page)) + BUG(); + if (PageActive(page)) + add_page_to_active_list(zone, page); + else + add_page_to_inactive_list(zone, page); +} + +static inline void +putback_page_to_lru(struct zone *zone, struct page *page) +{ + spin_lock_irq(&zone->lru_lock); + __putback_page_to_lru(zone, page); + spin_unlock_irq(&zone->lru_lock); +} + --- 
linux.orig/include/linux/mmzone.h~FROM-MM-memory-hotplug-prep-__section_nr-helper 2005-09-30 12:37:52.000000000 -0700 +++ linux/include/linux/mmzone.h 2005-09-30 12:38:21.000000000 -0700 @@ -11,8 +11,11 @@ #include #include #include +#include #include +#include #include +#include /* Free memory management - zoned buddy allocator. */ #ifndef CONFIG_FORCE_MAX_ZONEORDER @@ -137,6 +140,10 @@ * free areas of different sizes */ spinlock_t lock; +#ifdef CONFIG_MEMORY_HOTPLUG + /* see spanned/present_pages for more description */ + seqlock_t span_seqlock; +#endif struct free_area free_area[MAX_ORDER]; @@ -220,6 +227,16 @@ /* zone_start_pfn == zone_start_paddr >> PAGE_SHIFT */ unsigned long zone_start_pfn; + /* + * zone_start_pfn, spanned_pages and present_pages are all + * protected by span_seqlock. It is a seqlock because it has + * to be read outside of zone->lock, and it is done in the main + * allocator path. But, it is written quite infrequently. + * + * The lock is declared along with zone->lock because it is + * frequently read in proximity to zone->lock. It's good to + * give them a chance of being in the same cacheline. + */ unsigned long spanned_pages; /* total size, including holes */ unsigned long present_pages; /* amount of memory (excluding holes) */ @@ -227,6 +244,7 @@ * rarely used fields: */ char *name; + struct semaphore init_sem; } ____cacheline_maxaligned_in_smp; @@ -273,12 +291,21 @@ struct page *node_mem_map; #endif struct bootmem_data *bdata; +#ifdef CONFIG_MEMORY_HOTPLUG + /* + * Must be held any time you expect node_start_pfn, node_present_pages + * or node_spanned_pages stay constant. Holding this will also + * guarantee that any pfn_valid() stays that way. + * + * Nests above zone->lock and zone->size_seqlock. + */ + spinlock_t node_size_lock; +#endif unsigned long node_start_pfn; unsigned long node_present_pages; /* total number of physical pages */ unsigned long node_spanned_pages; /* total size of physical page range, including holes */ int node_id; - struct pglist_data *pgdat_next; wait_queue_head_t kswapd_wait; struct task_struct *kswapd; int kswapd_max_order; @@ -293,7 +320,7 @@ #endif #define nid_page_nr(nid, pagenr) pgdat_page_nr(NODE_DATA(nid),(pagenr)) -extern struct pglist_data *pgdat_list; +#include void __get_zone_counts(unsigned long *active, unsigned long *inactive, unsigned long *free, struct pglist_data *pgdat); @@ -314,62 +341,6 @@ unsigned long __init node_memmap_size_bytes(int, unsigned long, unsigned long); #endif -/* - * zone_idx() returns 0 for the ZONE_DMA zone, 1 for the ZONE_NORMAL zone, etc. - */ -#define zone_idx(zone) ((zone) - (zone)->zone_pgdat->node_zones) - -/** - * for_each_pgdat - helper macro to iterate over all nodes - * @pgdat - pointer to a pg_data_t variable - * - * Meant to help with common loops of the form - * pgdat = pgdat_list; - * while(pgdat) { - * ... - * pgdat = pgdat->pgdat_next; - * } - */ -#define for_each_pgdat(pgdat) \ - for (pgdat = pgdat_list; pgdat; pgdat = pgdat->pgdat_next) - -/* - * next_zone - helper magic for for_each_zone() - * Thanks to William Lee Irwin III for this piece of ingenuity. 
- */ -static inline struct zone *next_zone(struct zone *zone) -{ - pg_data_t *pgdat = zone->zone_pgdat; - - if (zone < pgdat->node_zones + MAX_NR_ZONES - 1) - zone++; - else if (pgdat->pgdat_next) { - pgdat = pgdat->pgdat_next; - zone = pgdat->node_zones; - } else - zone = NULL; - - return zone; -} - -/** - * for_each_zone - helper macro to iterate over all memory zones - * @zone - pointer to struct zone variable - * - * The user only needs to declare the zone variable, for_each_zone - * fills it in. This basically means for_each_zone() is an - * easier to read version of this piece of code: - * - * for (pgdat = pgdat_list; pgdat; pgdat = pgdat->node_next) - * for (i = 0; i < MAX_NR_ZONES; ++i) { - * struct zone * z = pgdat->node_zones + i; - * ... - * } - * } - */ -#define for_each_zone(zone) \ - for (zone = pgdat_list->node_zones; zone; zone = next_zone(zone)) - static inline int is_highmem_idx(int idx) { return (idx == ZONE_HIGHMEM); @@ -422,6 +393,73 @@ #endif /* !CONFIG_NEED_MULTIPLE_NODES */ +/* + * zone_idx() returns 0 for the ZONE_DMA zone, 1 for the ZONE_NORMAL zone, etc. + */ +#define zone_idx(zone) ((zone) - (zone)->zone_pgdat->node_zones) + +#define first_online_pgdat() NODE_DATA(first_online_node()) +#define next_online_pgdat(pgdat) \ + ((next_online_node((pgdat)->node_id) != MAX_NUMNODES) ? \ + NODE_DATA(next_online_node((pgdat)->node_id)) : NULL) + +/** + * for_each_pgdat - helper macro to iterate over all online nodes + * @pgdat - pointer to a pg_data_t variable + * + * Meant to help with common loops of the form + * pgdat = NODE_DATA(first_online_node()) + * while(pgdat) { + * ... + * pgdat = (next node is online) ? NODE_DATA(next_node) : NULL ; + * } + */ +#define for_each_pgdat(pgdat) \ + for (pgdat = first_online_pgdat(); pgdat; \ + pgdat = next_online_pgdat(pgdat)) + +/* + * next_zone - helper magic for for_each_zone() + * Thanks to William Lee Irwin III for this piece of ingenuity. + */ +static inline struct zone *next_zone(struct zone *zone) +{ + pg_data_t *pgdat = zone->zone_pgdat; + + if (zone < pgdat->node_zones + MAX_NR_ZONES - 1) + zone++; + else { + pgdat = next_online_pgdat(pgdat); + + if (pgdat) + zone = pgdat->node_zones; + else + zone = NULL; + } + + return zone; +} + +/** + * for_each_zone - helper macro to iterate over all memory zones + * @zone - pointer to struct zone variable + * + * The user only needs to declare the zone variable, for_each_zone + * fills it in. This basically means for_each_zone() is an + * easier to read version of this piece of code: + * + * for (pgdat = first_online_node(); pgdat; pgdat = next_online_node(pgdat)) + * for (i = 0; i < MAX_NR_ZONES; ++i) { + * struct zone * z = pgdat->node_zones + i; + * ... + * } + * } + */ +#define for_each_zone(zone) \ + for (zone = first_online_pgdat()->node_zones; \ + zone; zone = next_zone(zone)) + + #ifdef CONFIG_SPARSEMEM #include #endif @@ -431,7 +469,7 @@ * with 32 bit page->flags field, we reserve 8 bits for node/zone info. * there are 3 zones (2 bits) and this leaves 8-2=6 bits for nodes. */ -#define FLAGS_RESERVED 8 +#define FLAGS_RESERVED 10 #elif BITS_PER_LONG == 64 /* @@ -509,6 +547,7 @@ return NULL; return &mem_section[SECTION_NR_TO_ROOT(nr)][nr & SECTION_ROOT_MASK]; } +extern int __section_nr(struct mem_section* ms); /* * We use the lower bits of the mem_map pointer to store @@ -542,11 +581,6 @@ return valid_section(__nr_to_section(nr)); } -/* - * Given a kernel address, find the home node of the underlying memory. 
- */ -#define kvaddr_to_nid(kaddr) pfn_to_nid(__pa(kaddr) >> PAGE_SHIFT) - static inline struct mem_section *__pfn_to_section(unsigned long pfn) { return __nr_to_section(pfn_to_section_nr(pfn)); @@ -586,6 +620,7 @@ #define early_pfn_valid(pfn) pfn_valid(pfn) void sparse_init(void); +extern int sparse_add_one_section(struct zone *, unsigned long, int); #else #define sparse_init() do {} while (0) #define sparse_index_init(_sec, _nid) do {} while (0) --- linux.orig/include/linux/nodemask.h~B3.0-remove-pgdat_list-ver2-base 2005-09-30 12:38:13.000000000 -0700 +++ linux/include/linux/nodemask.h 2005-09-30 12:38:13.000000000 -0700 @@ -232,6 +232,9 @@ return min_t(int,MAX_NUMNODES,find_next_bit(srcp->bits, MAX_NUMNODES, n+1)); } +#define first_online_node() first_node(node_online_map) +#define next_online_node(node) next_node((node), node_online_map) + #define nodemask_of_node(node) \ ({ \ typeof(_unused_nodemask_arg_) m; \ --- linux.orig/mm/Kconfig~FROM-MM-memory-hotplug-sysfs-and-add-remove-functions 2005-09-30 12:37:55.000000000 -0700 +++ linux/mm/Kconfig 2005-09-30 12:39:37.000000000 -0700 @@ -111,3 +111,11 @@ config SPARSEMEM_EXTREME def_bool y depends on SPARSEMEM && !SPARSEMEM_STATIC + +# eventually, we can have this option just 'select SPARSEMEM' +config MEMORY_HOTPLUG + bool "Allow for memory hot-add" + depends on SPARSEMEM && HOTPLUG && !SOFTWARE_SUSPEND + +comment "Memory hotplug is currently incompatible with Software Suspend" + depends on SPARSEMEM && HOTPLUG && SOFTWARE_SUSPEND --- linux.orig/mm/Makefile~FROM-MM-memory-hotplug-sysfs-and-add-remove-functions 2005-09-30 12:37:55.000000000 -0700 +++ linux/mm/Makefile 2005-09-30 12:39:37.000000000 -0700 @@ -18,5 +18,5 @@ obj-$(CONFIG_SPARSEMEM) += sparse.o obj-$(CONFIG_SHMEM) += shmem.o obj-$(CONFIG_TINY_SHMEM) += tiny-shmem.o - +obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o obj-$(CONFIG_FS_XIP) += filemap_xip.o --- linux.orig/mm/bootmem.c~B3.0-remove-pgdat_list-ver2-base 2005-09-30 12:38:13.000000000 -0700 +++ linux/mm/bootmem.c 2005-09-30 12:38:13.000000000 -0700 @@ -61,17 +61,6 @@ { bootmem_data_t *bdata = pgdat->bdata; unsigned long mapsize = ((end - start)+7)/8; - static struct pglist_data *pgdat_last; - - pgdat->pgdat_next = NULL; - /* Add new nodes last so that bootmem always starts - searching in the first nodes, not the last ones */ - if (pgdat_last) - pgdat_last->pgdat_next = pgdat; - else { - pgdat_list = pgdat; - pgdat_last = pgdat; - } mapsize = ALIGN(mapsize, sizeof(long)); bdata->node_bootmem_map = phys_to_virt(mapstart << PAGE_SHIFT); @@ -392,7 +381,7 @@ void * __init __alloc_bootmem (unsigned long size, unsigned long align, unsigned long goal) { - pg_data_t *pgdat = pgdat_list; + pg_data_t *pgdat; void *ptr; for_each_pgdat(pgdat) --- /dev/null 2005-03-30 22:36:15.000000000 -0800 +++ linux/mm/memory_hotplug.c 2005-09-30 12:39:28.000000000 -0700 @@ -0,0 +1,158 @@ +/* + * linux/mm/memory_hotplug.c + * + * Copyright (C) + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +static void __add_zone(struct zone *zone, unsigned long phys_start_pfn) +{ + struct pglist_data *pgdat = zone->zone_pgdat; + int nr_pages = PAGES_PER_SECTION; + int nid = pgdat->node_id; + int zone_type; + + zone_type = zone - pgdat->node_zones; + memmap_init_zone(nr_pages, nid, zone_type, phys_start_pfn); + zonetable_add(zone, nid, zone_type, phys_start_pfn, nr_pages); +} + +static int __add_section(struct 
zone *zone, unsigned long phys_start_pfn) +{ + int nr_pages = PAGES_PER_SECTION; + int ret; + + ret = sparse_add_one_section(zone, phys_start_pfn, nr_pages); + + hot_add_zone_init(zone, phys_start_pfn, PAGES_PER_SECTION); + + if (ret < 0) + return ret; + + __add_zone(zone, phys_start_pfn); + return register_new_memory(__pfn_to_section(phys_start_pfn)); +} + +/* + * Reasonably generic function for adding memory. It is + * expected that archs that support memory hotplug will + * call this function after deciding the zone to which to + * add the new pages. + */ +int __add_pages(struct zone *zone, unsigned long phys_start_pfn, + unsigned long nr_pages) +{ + unsigned long i; + int err = 0; + + printk(KERN_DEBUG "%s(%p, %08lx, %ld)\n", __func__, + zone, phys_start_pfn, nr_pages); + + for (i = 0; !err && (i < nr_pages); i += PAGES_PER_SECTION) { + printk(KERN_DEBUG "\tfor: i: %ld\n", i); + err = __add_section(zone, phys_start_pfn + i); + } + + return err; +} + +static void grow_zone_span(struct zone *zone, + unsigned long start_pfn, unsigned long end_pfn) +{ + unsigned long old_zone_end_pfn; + + zone_span_writelock(zone); + + old_zone_end_pfn = zone->zone_start_pfn + zone->spanned_pages; + if (start_pfn < zone->zone_start_pfn) + zone->zone_start_pfn = start_pfn; + + if (end_pfn > old_zone_end_pfn) + zone->spanned_pages = end_pfn - zone->zone_start_pfn; + + zone_span_writeunlock(zone); +} + +static void grow_pgdat_span(struct pglist_data *pgdat, + unsigned long start_pfn, unsigned long end_pfn) +{ + unsigned long old_pgdat_end_pfn = + pgdat->node_start_pfn + pgdat->node_spanned_pages; + + if (start_pfn < pgdat->node_start_pfn) + pgdat->node_start_pfn = start_pfn; + + if (end_pfn > old_pgdat_end_pfn) + pgdat->node_spanned_pages = end_pfn - pgdat->node_start_pfn; +} + +#ifdef CONFIG_X86_SIMULATED_MEM_HOTPLUG +int page_is_hotpluggable_ram(unsigned long pfn) +{ + extern struct e820map bios_e820; + extern int page_is_ram_e820(unsigned long, struct e820map*); + + return page_is_ram_e820(pfn, &bios_e820); +} +#else +int page_is_hotpluggable_ram(unsigned long pfn) +{ + return 1; +} +#endif + +int online_pages(unsigned long pfn, unsigned long nr_pages) +{ + unsigned long i; + unsigned long flags; + unsigned long onlined_pages = 0; + struct zone *zone; + + /* + * This doesn't need a lock to do pfn_to_page(). + * The section can't be removed here because of the + * memory_block->state_sem. 
+ */ + zone = page_zone(pfn_to_page(pfn)); + pgdat_resize_lock(zone->zone_pgdat, &flags); + grow_zone_span(zone, pfn, pfn + nr_pages); + grow_pgdat_span(zone->zone_pgdat, pfn, pfn + nr_pages); + pgdat_resize_unlock(zone->zone_pgdat, &flags); + + printk(KERN_DEBUG "%s: onlining 0x%lx pages starting from pfn: 0x%lx\n", + __func__, nr_pages, pfn); + + for (i = 0; i < nr_pages; i++) { + struct page *page = pfn_to_page(pfn + i); + + if (page_is_hotpluggable_ram(pfn + i)) { + online_page(page); + onlined_pages++; + } + } + zone->present_pages += onlined_pages; + + setup_per_zone_pages_min(); + + return 0; +} --- linux.orig/mm/page_alloc.c~FROM-MM-memory-hotplug-prep-break-out-zone-initialization 2005-09-30 12:37:51.000000000 -0700 +++ linux/mm/page_alloc.c 2005-09-30 12:39:30.000000000 -0700 @@ -33,7 +33,9 @@ #include #include #include +#include #include +#include #include #include @@ -47,7 +49,6 @@ EXPORT_SYMBOL(node_online_map); nodemask_t node_possible_map __read_mostly = NODE_MASK_ALL; EXPORT_SYMBOL(node_possible_map); -struct pglist_data *pgdat_list __read_mostly; unsigned long totalram_pages __read_mostly; unsigned long totalhigh_pages __read_mostly; long nr_swap_pages; @@ -78,21 +79,44 @@ unsigned long __initdata nr_kernel_pages; unsigned long __initdata nr_all_pages; +static int page_outside_zone_boundaries(struct zone *zone, struct page *page) +{ + int ret = 0; + unsigned seq; + unsigned long pfn = page_to_pfn(page); + + do { + seq = zone_span_seqbegin(zone); + if (pfn >= zone->zone_start_pfn + zone->spanned_pages) + ret = 1; + else if (pfn < zone->zone_start_pfn) + ret = 1; + } while (zone_span_seqretry(zone, seq)); + + return ret; +} + +static int page_is_consistent(struct zone *zone, struct page *page) +{ +#ifdef CONFIG_HOLES_IN_ZONE + if (!pfn_valid(page_to_pfn(page))) + return 0; +#endif + if (zone != page_zone(page)) + return 0; + + return 1; +} /* * Temporary debugging check for pages not lying within a given zone. */ static int bad_range(struct zone *zone, struct page *page) { - if (page_to_pfn(page) >= zone->zone_start_pfn + zone->spanned_pages) - return 1; - if (page_to_pfn(page) < zone->zone_start_pfn) - return 1; -#ifdef CONFIG_HOLES_IN_ZONE - if (!pfn_valid(page_to_pfn(page))) + if (page_outside_zone_boundaries(zone, page)) return 1; -#endif - if (zone != page_zone(page)) + if (!page_is_consistent(zone, page)) return 1; + return 0; } @@ -1401,7 +1425,7 @@ /* * Builds allocation fallback zone lists. 
*/ -static int __init build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist, int j, int k) +int __devinit build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist, int j, int k) { switch (k) { struct zone *zone; @@ -1409,7 +1433,12 @@ BUG(); case ZONE_HIGHMEM: zone = pgdat->node_zones + ZONE_HIGHMEM; - if (zone->present_pages) { + /* + * with mem hotplug we don't increment present_pages + * until the pages are actually freed into the zone, + * but we increment spanned pages much earlier + */ + if (zone->spanned_pages) { #ifndef CONFIG_HIGHMEM BUG(); #endif @@ -1417,20 +1446,47 @@ } case ZONE_NORMAL: zone = pgdat->node_zones + ZONE_NORMAL; - if (zone->present_pages) + if (zone->spanned_pages) zonelist->zones[j++] = zone; case ZONE_DMA: zone = pgdat->node_zones + ZONE_DMA; - if (zone->present_pages) + if (zone->spanned_pages) zonelist->zones[j++] = zone; } return j; } -#ifdef CONFIG_NUMA +static inline int zone_index_to_type(int index) +{ + int type = ZONE_NORMAL; + + if (index & __GFP_HIGHMEM) + type = ZONE_HIGHMEM; + if (index & __GFP_DMA) + type = ZONE_DMA; + return type; +} + #define MAX_NODE_LOAD (num_online_nodes()) -static int __initdata node_load[MAX_NUMNODES]; + +#ifdef CONFIG_NUMA +static int __devinitdata node_load[MAX_NUMNODES]; +static int __devinit get_node_load(int node) +{ + return node_load[node]; +} +static void __devinit increment_node_load(int node, int load) +{ + node_load[node] += load; +} +#else +static inline int get_node_load(int node) +{ + return 0; +} +static inline void increment_node_load(int node, int load) {} +#endif /** * find_next_best_node - find the next node that should appear in a given node's fallback list * @node: node whose fallback list we're appending @@ -1445,7 +1501,7 @@ * on them otherwise. * It returns -1 if no node is found. 
*/ -static int __init find_next_best_node(int node, nodemask_t *used_node_mask) +static int __devinit find_next_best_node(int node, nodemask_t *used_node_mask) { int i, n, val; int min_val = INT_MAX; @@ -1477,7 +1533,7 @@ /* Slight preference for less loaded node */ val *= (MAX_NODE_LOAD*MAX_NUMNODES); - val += node_load[n]; + val += get_node_load(n); if (val < min_val) { min_val = val; @@ -1491,19 +1547,13 @@ return best_node; } -static void __init build_zonelists(pg_data_t *pgdat) +void __devinit build_zonelists(pg_data_t *pgdat) { int i, j, k, node, local_node; int prev_node, load; struct zonelist *zonelist; nodemask_t used_mask; - /* initialize zonelists */ - for (i = 0; i < GFP_ZONETYPES; i++) { - zonelist = pgdat->node_zonelists + i; - zonelist->zones[0] = NULL; - } - /* NUMA-aware ordering of nodes */ local_node = pgdat->node_id; load = num_online_nodes(); @@ -1517,18 +1567,14 @@ */ if (node_distance(local_node, node) != node_distance(local_node, prev_node)) - node_load[node] += load; + increment_node_load(node, load); prev_node = node; load--; for (i = 0; i < GFP_ZONETYPES; i++) { zonelist = pgdat->node_zonelists + i; for (j = 0; zonelist->zones[j] != NULL; j++); - k = ZONE_NORMAL; - if (i & __GFP_HIGHMEM) - k = ZONE_HIGHMEM; - if (i & __GFP_DMA) - k = ZONE_DMA; + k = zone_index_to_type(i); j = build_zonelists_node(NODE_DATA(node), zonelist, j, k); zonelist->zones[j] = NULL; @@ -1536,51 +1582,6 @@ } } -#else /* CONFIG_NUMA */ - -static void __init build_zonelists(pg_data_t *pgdat) -{ - int i, j, k, node, local_node; - - local_node = pgdat->node_id; - for (i = 0; i < GFP_ZONETYPES; i++) { - struct zonelist *zonelist; - - zonelist = pgdat->node_zonelists + i; - - j = 0; - k = ZONE_NORMAL; - if (i & __GFP_HIGHMEM) - k = ZONE_HIGHMEM; - if (i & __GFP_DMA) - k = ZONE_DMA; - - j = build_zonelists_node(pgdat, zonelist, j, k); - /* - * Now we build the zonelist so that it contains the zones - * of all the other nodes. - * We don't want to pressure a particular node, so when - * building the zones for node N, we make sure that the - * zones coming right after the local ones are those from - * node N+1 (modulo N) - */ - for (node = local_node + 1; node < MAX_NUMNODES; node++) { - if (!node_online(node)) - continue; - j = build_zonelists_node(NODE_DATA(node), zonelist, j, k); - } - for (node = 0; node < local_node; node++) { - if (!node_online(node)) - continue; - j = build_zonelists_node(NODE_DATA(node), zonelist, j, k); - } - - zonelist->zones[j] = NULL; - } -} - -#endif /* CONFIG_NUMA */ - void __init build_all_zonelists(void) { int i; @@ -1659,7 +1660,7 @@ * up by free_all_bootmem() once the early boot process is * done. Non-atomic initialization, single-pass. */ -void __init memmap_init_zone(unsigned long size, int nid, unsigned long zone, +void __devinit memmap_init_zone(unsigned long size, int nid, unsigned long zone, unsigned long start_pfn) { struct page *page; @@ -1870,6 +1871,63 @@ #endif +void zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages) +{ + int i; + unsigned long size_bytes; + + /* + * The per-page waitqueue mechanism uses hashed waitqueues + * per zone. 
+ */ + zone->wait_table_size = wait_table_size(zone_size_pages); + zone->wait_table_bits = wait_table_bits(zone->wait_table_size); + size_bytes = zone->wait_table_size * sizeof(wait_queue_head_t); + if (system_state >= SYSTEM_RUNNING) + zone->wait_table = kmalloc(size_bytes, GFP_KERNEL); + else + zone->wait_table = alloc_bootmem_node(zone->zone_pgdat, + size_bytes); + + for(i = 0; i < zone->wait_table_size; ++i) + init_waitqueue_head(zone->wait_table + i); +} + +void zone_pcp_init(struct zone *zone) +{ + int cpu; + unsigned long batch = zone_batchsize(zone); + + for (cpu = 0; cpu < NR_CPUS; cpu++) { +#ifdef CONFIG_NUMA + /* Early boot. Slab allocator not functional yet */ + zone->pageset[cpu] = &boot_pageset[cpu]; + setup_pageset(&boot_pageset[cpu],0); +#else + setup_pageset(zone_pcp(zone,cpu), batch); +#endif + } + printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%lu\n", + zone->name, zone->present_pages, batch); +} + +static __devinit void init_currently_empty_zone(struct zone *zone, + unsigned long zone_start_pfn, unsigned long size) +{ + struct pglist_data *pgdat = zone->zone_pgdat; + + zone_wait_table_init(zone, size); + pgdat->nr_zones = zone_idx(zone) + 1; + + zone->zone_mem_map = pfn_to_page(zone_start_pfn); + zone->zone_start_pfn = zone_start_pfn; + + memmap_init(size, pgdat->node_id, zone_idx(zone), zone_start_pfn); + + zone_init_free_lists(pgdat, zone, zone->spanned_pages); + zone->spanned_pages = size; +} + /* * Set up the zone data structures: * - mark all pages reserved @@ -1879,10 +1937,11 @@ static void __init free_area_init_core(struct pglist_data *pgdat, unsigned long *zones_size, unsigned long *zholes_size) { - unsigned long i, j; - int cpu, nid = pgdat->node_id; + unsigned long j; + int nid = pgdat->node_id; unsigned long zone_start_pfn = pgdat->node_start_pfn; + pgdat_resize_init(pgdat); pgdat->nr_zones = 0; init_waitqueue_head(&pgdat->kswapd_wait); pgdat->kswapd_max_order = 0; @@ -1890,7 +1949,6 @@ for (j = 0; j < MAX_NR_ZONES; j++) { struct zone *zone = pgdat->node_zones + j; unsigned long size, realsize; - unsigned long batch; realsize = size = zones_size[j]; if (zholes_size) @@ -1900,29 +1958,18 @@ nr_kernel_pages += realsize; nr_all_pages += realsize; - zone->spanned_pages = size; zone->present_pages = realsize; zone->name = zone_names[j]; spin_lock_init(&zone->lock); spin_lock_init(&zone->lru_lock); + zone_seqlock_init(zone); + init_MUTEX(&zone->init_sem); zone->zone_pgdat = pgdat; zone->free_pages = 0; zone->temp_priority = zone->prev_priority = DEF_PRIORITY; - batch = zone_batchsize(zone); - - for (cpu = 0; cpu < NR_CPUS; cpu++) { -#ifdef CONFIG_NUMA - /* Early boot. Slab allocator not functional yet */ - zone->pageset[cpu] = &boot_pageset[cpu]; - setup_pageset(&boot_pageset[cpu],0); -#else - setup_pageset(zone_pcp(zone,cpu), batch); -#endif - } - printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%lu\n", - zone_names[j], realsize, batch); + zone_pcp_init(zone); INIT_LIST_HEAD(&zone->active_list); INIT_LIST_HEAD(&zone->inactive_list); zone->nr_scan_active = 0; @@ -1933,32 +1980,9 @@ if (!size) continue; - /* - * The per-page waitqueue mechanism uses hashed waitqueues - * per zone. 
- */ - zone->wait_table_size = wait_table_size(size); - zone->wait_table_bits = - wait_table_bits(zone->wait_table_size); - zone->wait_table = (wait_queue_head_t *) - alloc_bootmem_node(pgdat, zone->wait_table_size - * sizeof(wait_queue_head_t)); - - for(i = 0; i < zone->wait_table_size; ++i) - init_waitqueue_head(zone->wait_table + i); - - pgdat->nr_zones = j+1; - - zone->zone_mem_map = pfn_to_page(zone_start_pfn); - zone->zone_start_pfn = zone_start_pfn; - - memmap_init(size, nid, j, zone_start_pfn); - zonetable_add(zone, nid, j, zone_start_pfn, size); - + init_currently_empty_zone(zone, zone_start_pfn, size); zone_start_pfn += size; - - zone_init_free_lists(pgdat, zone, zone->spanned_pages); } } @@ -2025,8 +2049,9 @@ pg_data_t *pgdat; loff_t node = *pos; - for (pgdat = pgdat_list; pgdat && node; pgdat = pgdat->pgdat_next) - --node; + for_each_pgdat(pgdat) + if (!node--) + break; return pgdat; } @@ -2036,7 +2061,7 @@ pg_data_t *pgdat = (pg_data_t *)arg; (*pos)++; - return pgdat->pgdat_next; + return next_online_pgdat(pgdat); } static void frag_stop(struct seq_file *m, void *arg) @@ -2358,7 +2383,7 @@ * that the pages_{min,low,high} values for each zone are set correctly * with respect to min_free_kbytes. */ -static void setup_per_zone_pages_min(void) +void setup_per_zone_pages_min(void) { unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10); unsigned long lowmem_pages = 0; @@ -2567,3 +2592,48 @@ return table; } + +static inline int zone_previously_initialized(struct zone *zone) +{ + if (zone->wait_table_size) + return 1; + + return 0; +} + +#ifdef CONFIG_MEMORY_HOTPLUG +static int __build_zonelists(void *__pgdat) +{ + pg_data_t *pgdat = __pgdat; + build_zonelists(pgdat); + return 0; +} + +int hot_add_zone_init(struct zone *zone, unsigned long phys_start_pfn, unsigned long size_pages) +{ + int ret = 0; + + down(&zone->init_sem); + if (zone_previously_initialized(zone)) { + ret = -EEXIST; + goto out; + } + + zone_wait_table_init(zone, size_pages); + init_currently_empty_zone(zone, phys_start_pfn, size_pages); + zone_pcp_init(zone); + + /* + * This is an awfully blunt way to do this. But, the + * zonelists are accessed many times over large areas + * of performance-critical code in the allocator. + * That makes it very hard to get a conventional lock + * to work. Think of this as a rw lock with a huge + * write cost. + */ + stop_machine_run(__build_zonelists, zone->zone_pgdat, NR_CPUS); +out: + up(&zone->init_sem); + return ret; +} +#endif --- linux.orig/mm/sparse.c~FROM-MM-memory-hotplug-prep-__section_nr-helper-fix 2005-09-30 12:37:53.000000000 -0700 +++ linux/mm/sparse.c 2005-09-30 12:39:27.000000000 -0700 @@ -5,8 +5,10 @@ #include #include #include +#include #include #include +#include #include /* @@ -72,6 +74,31 @@ } #endif +/* + * Although written for the SPARSEMEM_EXTREME case, this happens + * to also work for the flat array case because + * NR_SECTION_ROOTS==NR_MEM_SECTIONS. + */ +int __section_nr(struct mem_section* ms) +{ + unsigned long root_nr; + struct mem_section* root; + + for (root_nr = 0; + root_nr < NR_MEM_SECTIONS; + root_nr += SECTIONS_PER_ROOT) { + root = __nr_to_section(root_nr); + + if (!root) + continue; + + if ((ms >= root) && (ms < (root + SECTIONS_PER_ROOT))) + break; + } + + return root_nr + (ms - root); +} + /* Record a memory area against a node. 
*/ void memory_present(int nid, unsigned long start, unsigned long end) { @@ -162,6 +189,45 @@ return NULL; } +static struct page *__kmalloc_section_memmap(unsigned long nr_pages) +{ + struct page *page, *ret; + unsigned long memmap_size = sizeof(struct page) * nr_pages; + + page = alloc_pages(GFP_KERNEL, get_order(memmap_size)); + if (page) + goto got_map_page; + + ret = vmalloc(memmap_size); + if (ret) + goto got_map_ptr; + + return NULL; +got_map_page: + ret = (struct page *)pfn_to_kaddr(page_to_pfn(page)); +got_map_ptr: + memset(ret, 0, memmap_size); + + return ret; +} + +static int vaddr_in_vmalloc_area(void *addr) +{ + if (addr >= (void *)VMALLOC_START && + addr < (void *)VMALLOC_END) + return 1; + return 0; +} + +static void __kfree_section_memmap(struct page *memmap, unsigned long nr_pages) +{ + if (vaddr_in_vmalloc_area(memmap)) + vfree(memmap); + else + free_pages((unsigned long)memmap, + get_order(sizeof(struct page) * nr_pages)); +} + /* * Allocate the accumulated non-linear sections, allocate a mem_map * for each and record the physical to section mapping. @@ -187,14 +253,37 @@ * set. If this is <=0, then that means that the passed-in * map was not consumed and must be freed. */ -int sparse_add_one_section(unsigned long start_pfn, int nr_pages, struct page *map) +int sparse_add_one_section(struct zone *zone, unsigned long start_pfn, + int nr_pages) { - struct mem_section *ms = __pfn_to_section(start_pfn); + unsigned long section_nr = pfn_to_section_nr(start_pfn); + struct pglist_data *pgdat = zone->zone_pgdat; + struct mem_section *ms; + struct page *memmap; + unsigned long flags; + int ret; - if (ms->section_mem_map & SECTION_MARKED_PRESENT) - return -EEXIST; + /* + * no locking for this, because it does its own + * plus, it does a kmalloc + */ + sparse_index_init(section_nr, pgdat->node_id); + memmap = __kmalloc_section_memmap(nr_pages); + + pgdat_resize_lock(pgdat, &flags); + ms = __pfn_to_section(start_pfn); + if (ms->section_mem_map & SECTION_MARKED_PRESENT) { + ret = -EEXIST; + goto out; + } ms->section_mem_map |= SECTION_MARKED_PRESENT; - return sparse_init_one_section(ms, pfn_to_section_nr(start_pfn), map); + ret = sparse_init_one_section(ms, section_nr, memmap); + + if (ret <= 0) + __kfree_section_memmap(memmap, nr_pages); +out: + pgdat_resize_unlock(pgdat, &flags); + return ret; } --- linux.orig/mm/vmscan.c~AA-PM-01-steal_page_from_lru 2005-09-30 12:38:29.000000000 -0700 +++ linux/mm/vmscan.c 2005-09-30 12:39:38.000000000 -0700 @@ -582,22 +582,8 @@ while (scan++ < nr_to_scan && !list_empty(src)) { page = lru_to_page(src); prefetchw_prev_lru_page(page, src, flags); - - if (!TestClearPageLRU(page)) - BUG(); - list_del(&page->lru); - if (get_page_testone(page)) { - /* - * It is being freed elsewhere - */ - __put_page(page); - SetPageLRU(page); - list_add(&page->lru, src); - continue; - } else { - list_add(&page->lru, dst); + if (isolate_lru_onepage(page, src, dst)) nr_taken++; - } } *scanned = scan; @@ -650,13 +636,10 @@ */ while (!list_empty(&page_list)) { page = lru_to_page(&page_list); - if (TestSetPageLRU(page)) - BUG(); list_del(&page->lru); - if (PageActive(page)) - add_page_to_active_list(zone, page); - else - add_page_to_inactive_list(zone, page); + if (PageActive(page) && page_under_capture(page)) + ClearPageActive(page); + __putback_page_to_lru(zone, page); if (!pagevec_add(&pvec, page)) { spin_unlock_irq(&zone->lru_lock); __pagevec_release(&pvec); --- linux.orig/net/sunrpc/xprt.c~A9.1-xprt-warnings 2005-09-30 12:38:08.000000000 -0700 +++ 
linux/net/sunrpc/xprt.c 2005-09-30 12:38:08.000000000 -0700 @@ -825,13 +825,13 @@ if (len > desc->count) len = desc->count; if (skb_copy_bits(desc->skb, desc->offset, p, len)) { - dprintk("RPC: failed to copy %zu bytes from skb. %zu bytes remain\n", + dprintk("RPC: failed to copy %u bytes from skb. %u bytes remain\n", len, desc->count); return 0; } desc->offset += len; desc->count -= len; - dprintk("RPC: copied %zu bytes from skb. %zu bytes remain\n", + dprintk("RPC: copied %u bytes from skb. %u bytes remain\n", len, desc->count); return len; }
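
A note on the lock-free zone-span check introduced in page_alloc.c above: page_outside_zone_boundaries() re-reads zone_start_pfn/spanned_pages until the zone_span_seqbegin()/zone_span_seqretry() pair reports a stable snapshot, so the hot-add writer (grow_zone_span() under zone_span_writelock()) never blocks readers. A minimal sketch of the same read-retry idiom using the generic seqlock API; struct span, span_init() and pfn_outside_span() are illustrative names, not part of the patch:

	#include <linux/seqlock.h>

	/* Illustrative stand-in for the zone span fields; not from the patch. */
	struct span {
		seqlock_t	lock;
		unsigned long	start_pfn;
		unsigned long	nr_pages;
	};

	static void span_init(struct span *s, unsigned long start, unsigned long pages)
	{
		seqlock_init(&s->lock);
		s->start_pfn = start;
		s->nr_pages = pages;
	}

	/* Same shape as page_outside_zone_boundaries(): retry until a stable read. */
	static int pfn_outside_span(struct span *s, unsigned long pfn)
	{
		unsigned seq;
		int outside;

		do {
			seq = read_seqbegin(&s->lock);
			outside = pfn < s->start_pfn ||
				  pfn >= s->start_pfn + s->nr_pages;
		} while (read_seqretry(&s->lock, seq));

		return outside;
	}

The writer side pays for this: it must bump the sequence count around every span update, which is why grow_zone_span() brackets its changes with zone_span_writelock()/zone_span_writeunlock().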
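__add_pages() in mm/memory_hotplug.c is the arch-facing entry point: the architecture decides which zone the new range belongs to, and __add_pages() then walks it in PAGES_PER_SECTION steps, setting up the section memmap via sparse_add_one_section() and initializing the zone on first use through hot_add_zone_init(). A hypothetical caller, for illustration only; example_add_memory(), its include list and the ZONE_HIGHMEM choice are assumptions, not something this patch defines:

	#include <linux/mm.h>
	#include <linux/mmzone.h>

	/* Prototype as defined in mm/memory_hotplug.c above. */
	extern int __add_pages(struct zone *zone, unsigned long phys_start_pfn,
			       unsigned long nr_pages);

	/*
	 * Hypothetical arch-side helper: a newly discovered physical range is
	 * handed to the generic code one section at a time.
	 */
	static int example_add_memory(int nid, unsigned long start_pfn,
				      unsigned long nr_pages)
	{
		struct pglist_data *pgdat = NODE_DATA(nid);
		struct zone *zone = pgdat->node_zones + ZONE_HIGHMEM;

		/* __add_pages() loops in PAGES_PER_SECTION chunks internally. */
		return __add_pages(zone, start_pfn, nr_pages);
	}

After the sections are added, online_pages() is what actually makes the pages usable, growing the zone and pgdat spans and freeing each page into the allocator.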
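hot_add_zone_init() rebuilds the zonelists under stop_machine_run() because, as its comment says, zonelists are read all over the allocator with no lock; stopping every other CPU for the duration of the rebuild behaves like a reader/writer lock with an extremely expensive write side. A stripped-down sketch of that idiom, assuming an update that must never be observed half-done; the example_* names and the data being updated are made up for illustration:

	#include <linux/stop_machine.h>
	#include <linux/threads.h>

	static unsigned long example_lo, example_hi;

	static int example_rebuild(void *data)
	{
		unsigned long *range = data;

		/* Every other CPU is held by stop_machine; no reader runs here. */
		example_lo = range[0];
		example_hi = range[1];
		return 0;
	}

	static int example_update(unsigned long lo, unsigned long hi)
	{
		unsigned long range[2] = { lo, hi };

		/* NR_CPUS means no preference for which CPU runs the callback. */
		return stop_machine_run(example_rebuild, range, NR_CPUS);
	}

Since hot-add is rare and slow anyway, the huge write cost is acceptable, and the fast-path readers in the page allocator stay completely lock-free.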