From: Mike Kravetz . When I booted my new 720 on a kernel configured for NUMA, I received the following during bootup: WARNING: Unexpected node layout: region start 44000000 length 2000000 NUMA is disabled This is due to memory 'holes' within nodes. If such holes are encountered, then NUMA is disabled. The following patch adds support for such configurations. This patch gets the cell sizes before extracting the cells. I have made this change to existing code in the file, as well as the code I added. Signed-off-by: Mike Kravetz Signed-off-by: Paul Mackerras Signed-off-by: Andrew Morton --- 25-akpm/arch/ppc64/mm/numa.c | 160 ++++++++++++++++++++++++------------------- 1 files changed, 92 insertions(+), 68 deletions(-) diff -puN arch/ppc64/mm/numa.c~ppc64-numa-memory-fixup arch/ppc64/mm/numa.c --- 25/arch/ppc64/mm/numa.c~ppc64-numa-memory-fixup Fri Mar 11 14:56:10 2005 +++ 25-akpm/arch/ppc64/mm/numa.c Fri Mar 11 14:56:10 2005 @@ -40,7 +40,6 @@ int nr_cpus_in_node[MAX_NUMNODES] = { [0 struct pglist_data *node_data[MAX_NUMNODES]; bootmem_data_t __initdata plat_node_bdata[MAX_NUMNODES]; -static unsigned long node0_io_hole_size; static int min_common_depth; /* @@ -49,7 +48,8 @@ static int min_common_depth; */ static struct { unsigned long node_start_pfn; - unsigned long node_spanned_pages; + unsigned long node_end_pfn; + unsigned long node_present_pages; } init_node_data[MAX_NUMNODES] __initdata; EXPORT_SYMBOL(node_data); @@ -186,14 +186,31 @@ static int __init find_min_common_depth( return depth; } -static unsigned long read_cell_ul(struct device_node *device, unsigned int **buf) +static int __init get_mem_addr_cells(void) +{ + struct device_node *memory = NULL; + + memory = of_find_node_by_type(memory, "memory"); + if (!memory) + return 0; /* it won't matter */ + return(prom_n_addr_cells(memory)); +} + +static int __init get_mem_size_cells(void) +{ + struct device_node *memory = NULL; + + memory = of_find_node_by_type(memory, "memory"); + if (!memory) + return 0; /* it won't matter */ + return(prom_n_size_cells(memory)); +} + +static unsigned long read_n_cells(int n, unsigned int **buf) { - int i; unsigned long result = 0; - i = prom_n_size_cells(device); - /* bug on i>2 ?? */ - while (i--) { + while (n--) { result = (result << 32) | **buf; (*buf)++; } @@ -267,6 +284,7 @@ static int __init parse_numa_properties( { struct device_node *cpu = NULL; struct device_node *memory = NULL; + int addr_cells, size_cells; int max_domain = 0; long entries = lmb_end_of_DRAM() >> MEMORY_INCREMENT_SHIFT; unsigned long i; @@ -313,6 +331,8 @@ static int __init parse_numa_properties( } } + addr_cells = get_mem_addr_cells(); + size_cells = get_mem_size_cells(); memory = NULL; while ((memory = of_find_node_by_type(memory, "memory")) != NULL) { unsigned long start; @@ -329,8 +349,8 @@ static int __init parse_numa_properties( ranges = memory->n_addrs; new_range: /* these are order-sensitive, and modify the buffer pointer */ - start = read_cell_ul(memory, &memcell_buf); - size = read_cell_ul(memory, &memcell_buf); + start = read_n_cells(addr_cells, &memcell_buf); + size = read_n_cells(size_cells, &memcell_buf); start = _ALIGN_DOWN(start, MEMORY_INCREMENT); size = _ALIGN_UP(size, MEMORY_INCREMENT); @@ -348,33 +368,28 @@ new_range: if (max_domain < numa_domain) max_domain = numa_domain; - /* - * For backwards compatibility, OF splits the first node - * into two regions (the first being 0-4GB). Check for - * this simple case and complain if there is a gap in - * memory + /* + * Initialize new node struct, or add to an existing one. */ - if (init_node_data[numa_domain].node_spanned_pages) { - unsigned long shouldstart = - init_node_data[numa_domain].node_start_pfn + - init_node_data[numa_domain].node_spanned_pages; - if (shouldstart != (start / PAGE_SIZE)) { - /* Revert to non-numa for now */ - printk(KERN_ERR - "WARNING: Unexpected node layout: " - "region start %lx length %lx\n", - start, size); - printk(KERN_ERR "NUMA is disabled\n"); - goto err; - } - init_node_data[numa_domain].node_spanned_pages += + if (init_node_data[numa_domain].node_end_pfn) { + if ((start / PAGE_SIZE) < + init_node_data[numa_domain].node_start_pfn) + init_node_data[numa_domain].node_start_pfn = + start / PAGE_SIZE; + else + init_node_data[numa_domain].node_end_pfn = + (start / PAGE_SIZE) + + (size / PAGE_SIZE); + + init_node_data[numa_domain].node_present_pages += size / PAGE_SIZE; } else { node_set_online(numa_domain); init_node_data[numa_domain].node_start_pfn = start / PAGE_SIZE; - init_node_data[numa_domain].node_spanned_pages = + init_node_data[numa_domain].node_end_pfn = + init_node_data[numa_domain].node_start_pfn + size / PAGE_SIZE; } @@ -391,14 +406,6 @@ new_range: node_set_online(i); return 0; -err: - /* Something has gone wrong; revert any setup we've done */ - for_each_node(i) { - node_set_offline(i); - init_node_data[i].node_start_pfn = 0; - init_node_data[i].node_spanned_pages = 0; - } - return -1; } static void __init setup_nonnuma(void) @@ -426,12 +433,11 @@ static void __init setup_nonnuma(void) node_set_online(0); init_node_data[0].node_start_pfn = 0; - init_node_data[0].node_spanned_pages = lmb_end_of_DRAM() / PAGE_SIZE; + init_node_data[0].node_end_pfn = lmb_end_of_DRAM() / PAGE_SIZE; + init_node_data[0].node_present_pages = total_ram / PAGE_SIZE; for (i = 0 ; i < top_of_ram; i += MEMORY_INCREMENT) numa_memory_lookup_table[i >> MEMORY_INCREMENT_SHIFT] = 0; - - node0_io_hole_size = top_of_ram - total_ram; } static void __init dump_numa_topology(void) @@ -512,6 +518,8 @@ static unsigned long careful_allocation( void __init do_init_bootmem(void) { int nid; + int addr_cells, size_cells; + struct device_node *memory = NULL; static struct notifier_block ppc64_numa_nb = { .notifier_call = cpu_numa_callback, .priority = 1 /* Must run before sched domains notifier. */ @@ -535,7 +543,7 @@ void __init do_init_bootmem(void) unsigned long bootmap_pages; start_paddr = init_node_data[nid].node_start_pfn * PAGE_SIZE; - end_paddr = start_paddr + (init_node_data[nid].node_spanned_pages * PAGE_SIZE); + end_paddr = init_node_data[nid].node_end_pfn * PAGE_SIZE; /* Allocate the node structure node local if possible */ NODE_DATA(nid) = (struct pglist_data *)careful_allocation(nid, @@ -551,9 +559,9 @@ void __init do_init_bootmem(void) NODE_DATA(nid)->node_start_pfn = init_node_data[nid].node_start_pfn; NODE_DATA(nid)->node_spanned_pages = - init_node_data[nid].node_spanned_pages; + end_paddr - start_paddr; - if (init_node_data[nid].node_spanned_pages == 0) + if (NODE_DATA(nid)->node_spanned_pages == 0) continue; dbg("start_paddr = %lx\n", start_paddr); @@ -572,33 +580,50 @@ void __init do_init_bootmem(void) start_paddr >> PAGE_SHIFT, end_paddr >> PAGE_SHIFT); - for (i = 0; i < lmb.memory.cnt; i++) { - unsigned long physbase, size; - - physbase = lmb.memory.region[i].physbase; - size = lmb.memory.region[i].size; - - if (physbase < end_paddr && - (physbase+size) > start_paddr) { - /* overlaps */ - if (physbase < start_paddr) { - size -= start_paddr - physbase; - physbase = start_paddr; - } - - if (size > end_paddr - physbase) - size = end_paddr - physbase; - - dbg("free_bootmem %lx %lx\n", physbase, size); - free_bootmem_node(NODE_DATA(nid), physbase, - size); + /* + * We need to do another scan of all memory sections to + * associate memory with the correct node. + */ + addr_cells = get_mem_addr_cells(); + size_cells = get_mem_size_cells(); + memory = NULL; + while ((memory = of_find_node_by_type(memory, "memory")) != NULL) { + unsigned long mem_start, mem_size; + int numa_domain; + unsigned int *memcell_buf; + unsigned int len; + + memcell_buf = (unsigned int *)get_property(memory, "reg", &len); + if (!memcell_buf || len <= 0) + continue; + + mem_start = read_n_cells(addr_cells, &memcell_buf); + mem_size = read_n_cells(size_cells, &memcell_buf); + numa_domain = of_node_numa_domain(memory); + + if (numa_domain != nid) + continue; + + if (mem_start < end_paddr && + (mem_start+mem_size) > start_paddr) { + /* should be no overlaps ! */ + dbg("free_bootmem %lx %lx\n", mem_start, mem_size); + free_bootmem_node(NODE_DATA(nid), mem_start, + mem_size); } } + /* + * Mark reserved regions on this node + */ for (i = 0; i < lmb.reserved.cnt; i++) { unsigned long physbase = lmb.reserved.region[i].physbase; unsigned long size = lmb.reserved.region[i].size; + if (pa_to_nid(physbase) != nid && + pa_to_nid(physbase+size-1) != nid) + continue; + if (physbase < end_paddr && (physbase+size) > start_paddr) { /* overlaps */ @@ -632,13 +657,12 @@ void __init paging_init(void) unsigned long start_pfn; unsigned long end_pfn; - start_pfn = plat_node_bdata[nid].node_boot_start >> PAGE_SHIFT; - end_pfn = plat_node_bdata[nid].node_low_pfn; + start_pfn = init_node_data[nid].node_start_pfn; + end_pfn = init_node_data[nid].node_end_pfn; zones_size[ZONE_DMA] = end_pfn - start_pfn; - zholes_size[ZONE_DMA] = 0; - if (nid == 0) - zholes_size[ZONE_DMA] = node0_io_hole_size >> PAGE_SHIFT; + zholes_size[ZONE_DMA] = zones_size[ZONE_DMA] - + init_node_data[nid].node_present_pages; dbg("free_area_init node %d %lx %lx (hole: %lx)\n", nid, zones_size[ZONE_DMA], start_pfn, zholes_size[ZONE_DMA]); _