From: "Chen, Kenneth W" The issue of exceedingly large hash tables has been discussed on the mailing list a while back, but seems to slip through the cracks. What we found is it's not a problem for x86 (and most other architectures) because __get_free_pages won't be able to get anything beyond order MAX_ORDER-1 (10) which means at most those hash tables are 4MB each (assume 4K page size). However, on ia64, in order to support larger hugeTLB page size, the MAX_ORDER is bumped up to 18, which now means a 2GB upper limit enforced by the page allocator (assume 16K page size). PPC64 is another example that bumps up MAX_ORDER. Last time I checked, the tcp ehash table is taking a whopping (insane!) 2GB on one of our large machines. The dentry and inode hash tables also take a considerable amount of memory. Setting the size of these tables is difficult: they need to be constrained on many-zone ia64 machines, but this could cause significant performance problems when there are (for example) 100 million dentries in cache. Large-memory machines which do not slice that memory up into huge numbers of zones do not need to run the risk of this slowdown. So the sizing algorithms remain essentially unchanged, and boot-time options are provided which permit the tables to be scaled down. 
--- 25-akpm/fs/dcache.c | 24 +++++++++++++++++++----- 25-akpm/fs/inode.c | 22 +++++++++++++++++++--- 25-akpm/net/ipv4/route.c | 15 ++++++++++++++- 25-akpm/net/ipv4/tcp.c | 14 ++++++++++++++ 4 files changed, 66 insertions(+), 9 deletions(-) diff -puN fs/dcache.c~limit-hash-table-sizes-boot-options fs/dcache.c --- 25/fs/dcache.c~limit-hash-table-sizes-boot-options Tue Feb 17 15:02:57 2004 +++ 25-akpm/fs/dcache.c Tue Feb 17 15:02:57 2004 @@ -49,6 +49,7 @@ static kmem_cache_t *dentry_cache; */ #define D_HASHBITS d_hash_shift #define D_HASHMASK d_hash_mask +#define D_HASHMAX (2*1024*1024UL) /* max number of entries */ static unsigned int d_hash_mask; static unsigned int d_hash_shift; @@ -1531,6 +1532,16 @@ out: return ino; } +static __initdata unsigned long dhash_entries; +static int __init set_dhash_entries(char *str) +{ + if (!str) + return 0; + dhash_entries = simple_strtoul(str, &str, 0); + return 1; +} +__setup("dhash_entries=", set_dhash_entries); + static void __init dcache_init(unsigned long mempages) { struct hlist_head *d; @@ -1556,11 +1567,14 @@ static void __init dcache_init(unsigned set_shrinker(DEFAULT_SEEKS, shrink_dcache_memory); -#if PAGE_SHIFT < 13 - mempages >>= (13 - PAGE_SHIFT); -#endif - mempages *= sizeof(struct hlist_head); - for (order = 0; ((1UL << order) << PAGE_SHIFT) < mempages; order++) + if (!dhash_entries) { + dhash_entries = PAGE_SHIFT < 13 ? 
+ mempages >> (13 - PAGE_SHIFT) : + mempages << (PAGE_SHIFT - 13); + dhash_entries = min(D_HASHMAX, dhash_entries); + } + dhash_entries *= sizeof(struct hlist_head); + for (order = 0; ((1UL << order) << PAGE_SHIFT) < dhash_entries; order++) ; do { diff -puN fs/inode.c~limit-hash-table-sizes-boot-options fs/inode.c --- 25/fs/inode.c~limit-hash-table-sizes-boot-options Tue Feb 17 15:02:57 2004 +++ 25-akpm/fs/inode.c Tue Feb 17 15:02:57 2004 @@ -53,6 +53,7 @@ */ #define I_HASHBITS i_hash_shift #define I_HASHMASK i_hash_mask +#define I_HASHMAX (2*1024*1024UL) /* max number of entries */ static unsigned int i_hash_mask; static unsigned int i_hash_shift; @@ -1312,6 +1313,16 @@ void wake_up_inode(struct inode *inode) wake_up_all(wq); } +static __initdata unsigned long ihash_entries; +static int __init set_ihash_entries(char *str) +{ + if (!str) + return 0; + ihash_entries = simple_strtoul(str, &str, 0); + return 1; +} +__setup("ihash_entries=", set_ihash_entries); + /* * Initialize the waitqueues and inode hash table. */ @@ -1325,9 +1336,14 @@ void __init inode_init(unsigned long mem for (i = 0; i < ARRAY_SIZE(i_wait_queue_heads); i++) init_waitqueue_head(&i_wait_queue_heads[i].wqh); - mempages >>= (14 - PAGE_SHIFT); - mempages *= sizeof(struct hlist_head); - for (order = 0; ((1UL << order) << PAGE_SHIFT) < mempages; order++) + if (!ihash_entries) { + ihash_entries = PAGE_SHIFT < 14 ? 
+ mempages >> (14 - PAGE_SHIFT) : + mempages << (PAGE_SHIFT - 14); + ihash_entries = min(I_HASHMAX, ihash_entries); + } + ihash_entries *= sizeof(struct hlist_head); + for (order = 0; ((1UL << order) << PAGE_SHIFT) < ihash_entries; order++) ; do { diff -puN net/ipv4/route.c~limit-hash-table-sizes-boot-options net/ipv4/route.c --- 25/net/ipv4/route.c~limit-hash-table-sizes-boot-options Tue Feb 17 15:02:57 2004 +++ 25-akpm/net/ipv4/route.c Tue Feb 17 15:02:57 2004 @@ -2717,6 +2717,16 @@ static int ip_rt_acct_read(char *buffer, #endif /* CONFIG_PROC_FS */ #endif /* CONFIG_NET_CLS_ROUTE */ +static __initdata unsigned long rhash_entries; +static int __init set_rhash_entries(char *str) +{ + if (!str) + return 0; + rhash_entries = simple_strtoul(str, &str, 0); + return 1; +} +__setup("rhash_entries=", set_rhash_entries); + int __init ip_rt_init(void) { int i, order, goal, rc = 0; @@ -2743,7 +2753,10 @@ int __init ip_rt_init(void) panic("IP: failed to allocate ip_dst_cache\n"); goal = num_physpages >> (26 - PAGE_SHIFT); - + if (!rhash_entries) + goal = min(10, goal); + else + goal = (rhash_entries * sizeof(struct rt_hash_bucket)) >> PAGE_SHIFT; for (order = 0; (1UL << order) < goal; order++) /* NOTHING */; diff -puN net/ipv4/tcp.c~limit-hash-table-sizes-boot-options net/ipv4/tcp.c --- 25/net/ipv4/tcp.c~limit-hash-table-sizes-boot-options Tue Feb 17 15:02:57 2004 +++ 25-akpm/net/ipv4/tcp.c Tue Feb 17 15:02:57 2004 @@ -2570,6 +2570,16 @@ int tcp_getsockopt(struct sock *sk, int extern void __skb_cb_too_small_for_tcp(int, int); extern void tcpdiag_init(void); +static __initdata unsigned long thash_entries; +static int __init set_thash_entries(char *str) +{ + if (!str) + return 0; + thash_entries = simple_strtoul(str, &str, 0); + return 1; +} +__setup("thash_entries=", set_thash_entries); + void __init tcp_init(void) { struct sk_buff *skb = NULL; @@ -2611,6 +2621,10 @@ void __init tcp_init(void) else goal = num_physpages >> (23 - PAGE_SHIFT); + if (!thash_entries) + goal = 
min(10, goal); + else + goal = (thash_entries * sizeof(struct tcp_ehash_bucket)) >> PAGE_SHIFT; for (order = 0; (1UL << order) < goal; order++) ; do { _