[PATCH 1/4] mm/hotplug: correctly setup fallback zonelists when creating new pgdat

July 05th, 2012 - 06:00 am ET by Jiang Liu | Report spam
When hotadd_new_pgdat() is called to create a new pgdat for a new node,
a fallback zonelist should be created for the new node. There is code
in hotadd_new_pgdat() that tries to achieve that, as below:
/*
* The node we allocated has no zone fallback lists. For avoiding
* to access not-initialized zonelist, build here.
*/
mutex_lock(&zonelists_mutex);
build_all_zonelists(pgdat, NULL);
mutex_unlock(&zonelists_mutex);

But it doesn't work as expected. When hotadd_new_pgdat() is called, the
new node is still in the offline state because node_set_online(nid) hasn't
been called yet, and build_all_zonelists() only builds zonelists for
online nodes:
for_each_online_node(nid) {
pg_data_t *pgdat = NODE_DATA(nid);

build_zonelists(pgdat);
build_zonelist_cache(pgdat);
}

Although we hope to create a zonelist for the new pgdat, none is created.
So add a new parameter "pgdat" to build_all_zonelists() so that it builds
the zonelists for the new pgdat too.

Signed-off-by: Jiang Liu <liuj97@gmail.com>
Signed-off-by: Xishi Qiu <qiuxishi@huawei.com>

include/linux/mmzone.h | 2 +-
init/main.c | 2 +-
kernel/cpu.c | 2 +-
mm/memory_hotplug.c | 4 ++--
mm/page_alloc.c | 17 ++++++++++++--
5 files changed, 17 insertions(+), 10 deletions(-)

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 2427706..8ddbfb4 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -718,7 +718,7 @@ typedef struct pglist_data {
#include <linux/memory_hotplug.h>

extern struct mutex zonelists_mutex;
-void build_all_zonelists(void *data);
+void build_all_zonelists(pg_data_t *pgdat, struct zone *zone);
void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx);
bool zone_watermark_ok(struct zone *z, int order, unsigned long mark,
int classzone_idx, int alloc_flags);
diff --git a/init/main.c b/init/main.c
index b5cc0a7..622364d 100644
--- a/init/main.c
+++ b/init/main.c
@@ -501,7 +501,7 @@ asmlinkage void __init start_kernel(void)
setup_per_cpu_areas();
smp_prepare_boot_cpu(); /* arch-specific boot-cpu hooks */

- build_all_zonelists(NULL);
+ build_all_zonelists(NULL, NULL);
page_alloc_init();

printk(KERN_NOTICE "Kernel command line: %s", boot_command_line);
diff --git a/kernel/cpu.c b/kernel/cpu.c
index a4eb522..14d3258 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -416,7 +416,7 @@ int __cpuinit cpu_up(unsigned int cpu)

if (pgdat->node_zonelists->_zonerefs->zone == NULL) {
mutex_lock(&zonelists_mutex);
- build_all_zonelists(NULL);
+ build_all_zonelists(NULL, NULL);
mutex_unlock(&zonelists_mutex);
}
#endif
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 0d7e3ec..f93c5b5 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -513,7 +513,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages)
zone->present_pages += onlined_pages;
zone->zone_pgdat->node_present_pages += onlined_pages;
if (need_zonelists_rebuild)
- build_all_zonelists(zone);
+ build_all_zonelists(NULL, zone);
else
zone_pcp_update(zone);

@@ -562,7 +562,7 @@ static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start)
* to access not-initialized zonelist, build here.
*/
mutex_lock(&zonelists_mutex);
- build_all_zonelists(NULL);
+ build_all_zonelists(pgdat, NULL);
mutex_unlock(&zonelists_mutex);

return pgdat;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 4403009..ebf319d 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3030,7 +3030,7 @@ int numa_zonelist_order_handler(ctl_table *table, int write,
user_zonelist_order = oldval;
} else if (oldval != user_zonelist_order) {
mutex_lock(&zonelists_mutex);
- build_all_zonelists(NULL);
+ build_all_zonelists(NULL, NULL);
mutex_unlock(&zonelists_mutex);
}
}
@@ -3413,10 +3413,17 @@ static __init_refok int __build_all_zonelists(void *data)
{
int nid;
int cpu;
+ pg_data_t *self = data;

#ifdef CONFIG_NUMA
memset(node_load, 0, sizeof(node_load));
#endif
+
+ if (self && !node_online(self->node_id)) {
+ build_zonelists(self);
+ build_zonelist_cache(self);
+ }
+
for_each_online_node(nid) {
pg_data_t *pgdat = NODE_DATA(nid);

@@ -3461,7 +3468,7 @@ static __init_refok int __build_all_zonelists(void *data)
* Called with zonelists_mutex held always
* unless system_state == SYSTEM_BOOTING.
*/
-void __ref build_all_zonelists(void *data)
+void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone)
{
set_zonelist_order();

@@ -3473,10 +3480,10 @@ void __ref build_all_zonelists(void *data)
/* we have to stop all cpus to guarantee there is no user
of zonelist */
#ifdef CONFIG_MEMORY_HOTPLUG
- if (data)
- setup_zone_pageset((struct zone *)data);
+ if (zone)
+ setup_zone_pageset(zone);
#endif
- stop_machine(__build_all_zonelists, NULL, NULL);
+ stop_machine(__build_all_zonelists, pgdat, NULL);
/* cpuset refresh routine should be here */
}
vm_total_pages = nr_free_pagecache_pages();
1.7.1


To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/
email Follow the discussionReplies 3 repliesReplies Make a reply

Replies

#1 Bob Liu
July 19th, 2012 - 04:00 am ET | Report spam
On Thu, Jul 5, 2012 at 5:45 PM, Jiang Liu wrote:
When a zone becomes empty after memory offlining, free zone->pageset.
Otherwise it will cause a memory leak when adding memory to the empty
zone again, because build_all_zonelists() will allocate zone->pageset
for an empty zone.




What about the other areas allocated to the zone, e.g. wait_table?

Signed-off-by: Jiang Liu
Signed-off-by: Wei Wang

include/linux/mm.h | 1 +
mm/memory_hotplug.c | 3 +++
mm/page_alloc.c | 13 +++++++++++++
3 files changed, 17 insertions(+), 0 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index b36d08c..f8b62f2 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1331,6 +1331,7 @@ void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...);
extern void setup_per_cpu_pageset(void);

extern void zone_pcp_update(struct zone *zone);
+extern void zone_pcp_reset(struct zone *zone);

/* nommu.c */
extern atomic_long_t mmap_pages_allocated;
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index bce80c7..998b792 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -966,6 +966,9 @@ repeat:

init_per_zone_wmark_min();

+ if (!populated_zone(zone))
+ zone_pcp_reset(zone);
+
if (!node_present_pages(node)) {
node_clear_state(node, N_HIGH_MEMORY);
kswapd_stop(node);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index ebf319d..5964b7a 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -5872,6 +5872,19 @@ void free_contig_range(unsigned long pfn, unsigned nr_pages)
#endif

#ifdef CONFIG_MEMORY_HOTREMOVE
+void zone_pcp_reset(struct zone *zone)
+{
+ unsigned long flags;
+
+ /* avoid races with drain_pages() */
+ local_irq_save(flags);
+ if (zone->pageset != &boot_pageset) {
+ free_percpu(zone->pageset);
+ zone->pageset = &boot_pageset;
+ }
+ local_irq_restore(flags);
+}
+
/*
* All pages in the range must be isolated before calling this.
*/
1.7.1


To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:""> </a>





Regards,
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/

Similar topics