The kernel presently tends to reclaim from highmem much more than from
lowmem.  A 1.5G machine running 2.6.3 will reclaim 700 highmem pagecache
pages for each lowmem page.  With Nick's patches this comes down to 10:1.

This patch fixes it up by introducing a pre-calculated zone->reclaim_batch,
which tells the VM how many pages to reclaim from this zone.  It is scaled
according to the number of pages in the zone (see the worked example after
the diff).  With this in place, the reclaim rate for each zone is indeed
proportional to the size of the zone.

Interaction with the `incremental min' logic in the page allocator is
important.  We teach kswapd to perform reclaim against the normal zone even
if the normal zone has free_pages > pages_high.  This way, more pages get
freed from the normal zone, bringing its free_pages up to the level at
which the normal zone is again eligible for __GFP_HIGHMEM allocations.
This keeps the inter-zone allocation rates balanced.

---

 include/linux/mmzone.h |    5 ++++-
 mm/page_alloc.c        |   13 ++++++++++++-
 mm/vmscan.c            |   22 +++++++++-------------
 3 files changed, 25 insertions(+), 15 deletions(-)

diff -puN mm/page_alloc.c~zone-balancing-batching mm/page_alloc.c
--- 25/mm/page_alloc.c~zone-balancing-batching	2004-02-22 15:15:52.000000000 -0800
+++ 25-akpm/mm/page_alloc.c	2004-02-22 15:15:52.000000000 -0800
@@ -1019,6 +1019,7 @@ void show_free_areas(void)
 			" min:%lukB"
 			" low:%lukB"
 			" high:%lukB"
+			" batch:%lukB"
 			" active:%lukB"
 			" inactive:%lukB"
 			"\n",
@@ -1027,6 +1028,7 @@ void show_free_areas(void)
 			K(zone->pages_min),
 			K(zone->pages_low),
 			K(zone->pages_high),
+			K(zone->reclaim_batch),
 			K(zone->nr_active),
 			K(zone->nr_inactive)
 			);
@@ -1618,6 +1620,8 @@ static void setup_per_zone_pages_min(voi
 			lowmem_pages += zone->present_pages;
 
 	for_each_zone(zone) {
+		unsigned long long reclaim_batch;
+
 		spin_lock_irqsave(&zone->lru_lock, flags);
 		if (is_highmem(zone)) {
 			/*
@@ -1642,8 +1646,15 @@ static void setup_per_zone_pages_min(voi
 					lowmem_pages;
 		}
 
-		zone-> pages_low = zone->pages_min * 2;
+		zone->pages_low = zone->pages_min * 2;
 		zone->pages_high = zone->pages_min * 3;
+
+		reclaim_batch = zone->present_pages * SWAP_CLUSTER_MAX;
+		do_div(reclaim_batch, lowmem_pages);
+		zone->reclaim_batch = reclaim_batch;
+		if (zone->reclaim_batch < 4)
+			zone->reclaim_batch = 4;
+
 		spin_unlock_irqrestore(&zone->lru_lock, flags);
 	}
 }

diff -puN mm/vmscan.c~zone-balancing-batching mm/vmscan.c
--- 25/mm/vmscan.c~zone-balancing-batching	2004-02-22 15:15:52.000000000 -0800
+++ 25-akpm/mm/vmscan.c	2004-02-22 15:15:52.000000000 -0800
@@ -859,13 +859,12 @@ shrink_zone(struct zone *zone, unsigned
  */
static int
 shrink_caches(struct zone **zones, int priority, int *total_scanned,
-		int gfp_mask, int nr_pages, struct page_state *ps)
+		int gfp_mask, struct page_state *ps)
 {
 	int ret = 0;
 	int i;
 
 	for (i = 0; zones[i] != NULL; i++) {
-		int to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX);
 		struct zone *zone = zones[i];
 		int nr_scanned;
 
@@ -875,8 +874,8 @@ shrink_caches(struct zone **zones, int p
 		if (zone->all_unreclaimable && priority != DEF_PRIORITY)
 			continue;	/* Let kswapd poll it */
 
-		ret += shrink_zone(zone, gfp_mask,
-				to_reclaim, &nr_scanned, ps, priority);
+		ret += shrink_zone(zone, gfp_mask, zone->reclaim_batch,
+				&nr_scanned, ps, priority);
 		*total_scanned += nr_scanned;
 	}
 	return ret;
@@ -904,7 +903,6 @@ int try_to_free_pages(struct zone **zone
 {
 	int priority;
 	int ret = 0;
-	const int nr_pages = SWAP_CLUSTER_MAX;
 	int nr_reclaimed = 0;
 	struct reclaim_state *reclaim_state = current->reclaim_state;
 	int i;
@@ -920,7 +918,7 @@ int try_to_free_pages(struct zone **zone
 		get_page_state(&ps);
 
 		nr_reclaimed += shrink_caches(zones, priority, &total_scanned,
-						gfp_mask, nr_pages, &ps);
+						gfp_mask, &ps);
 		shrink_slab(total_scanned, gfp_mask);
 		if (reclaim_state) {
@@ -928,7 +926,7 @@ int try_to_free_pages(struct zone **zone
 			reclaim_state->reclaimed_slab = 0;
 		}
 
-		if (nr_reclaimed >= nr_pages) {
+		if (nr_reclaimed >= SWAP_CLUSTER_MAX) {
 			ret = 1;
 			if (gfp_mask & __GFP_FS)
 				wakeup_bdflush(total_scanned);
@@ -1008,13 +1006,11 @@ static int balance_pgdat(pg_data_t *pgda
 			if (zone->all_unreclaimable && priority != DEF_PRIORITY)
 				continue;
 
-			if (nr_pages) {			/* Software suspend */
+			if (nr_pages)			/* Software suspend */
 				to_reclaim = min(to_free, SWAP_CLUSTER_MAX*8);
-			} else {			/* Zone balancing */
-				to_reclaim = zone->pages_high-zone->free_pages;
-				if (to_reclaim <= 0)
-					continue;
-			}
+			else				/* Zone balancing */
+				to_reclaim = zone->reclaim_batch;
+
 			all_zones_ok = 0;
 			zone->temp_priority = priority;
 			reclaimed = shrink_zone(zone, GFP_KERNEL,

diff -puN include/linux/mmzone.h~zone-balancing-batching include/linux/mmzone.h
--- 25/include/linux/mmzone.h~zone-balancing-batching	2004-02-22 15:15:52.000000000 -0800
+++ 25-akpm/include/linux/mmzone.h	2004-02-22 15:15:52.000000000 -0800
@@ -69,7 +69,10 @@ struct zone {
 	 */
 	spinlock_t		lock;
 	unsigned long		free_pages;
-	unsigned long		pages_min, pages_low, pages_high;
+	unsigned long		pages_min;
+	unsigned long		pages_low;
+	unsigned long		pages_high;
+	unsigned long		reclaim_batch;
 
 	ZONE_PADDING(_pad1_)
_
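
For illustration, here is a minimal userspace sketch of the batch
calculation above.  The zone sizes are hypothetical numbers for a 1.5G
machine, and it assumes the divisor is the !highmem total which
setup_per_zone_pages_min computes earlier (that loop is outside the hunks
shown):

	#include <stdio.h>

	#define SWAP_CLUSTER_MAX 32	/* value from include/linux/swap.h */

	int main(void)
	{
		/* Hypothetical present_pages, 4K pages (illustrative) */
		unsigned long present[] = { 4096, 225280, 163840 };
		const char *name[] = { "DMA", "Normal", "HighMem" };
		/* Assumed: the sum of the !highmem zones only */
		unsigned long lowmem_pages = 4096 + 225280;
		int i;

		for (i = 0; i < 3; i++) {
			unsigned long batch;

			batch = present[i] * SWAP_CLUSTER_MAX / lowmem_pages;
			if (batch < 4)		/* same floor as the patch */
				batch = 4;
			printf("%-8s reclaim_batch = %lu\n", name[i], batch);
		}
		return 0;
	}

On these numbers it prints batches of 4, 31 and 22 pages, so the
highmem:normal batch ratio (22:31) tracks the 640M:880M zone size ratio,
which is the proportional per-zone reclaim behaviour described above.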