// SPDX-License-Identifier: GPL-2.0-only
/*
 *  linux/mm/memory_hotplug.c
 *
 *  Copyright (C)
 */

#include <linux/stddef.h>
#include <linux/mm.h>
#include <linux/sched/signal.h>
#include <linux/swap.h>
#include <linux/interrupt.h>
#include <linux/pagemap.h>
#include <linux/compiler.h>
#include <linux/export.h>
#include <linux/pagevec.h>
#include <linux/writeback.h>
#include <linux/slab.h>
#include <linux/sysctl.h>
#include <linux/cpu.h>
#include <linux/memory.h>
#include <linux/memremap.h>
#include <linux/memory_hotplug.h>
#include <linux/highmem.h>
#include <linux/vmalloc.h>
#include <linux/ioport.h>
#include <linux/delay.h>
#include <linux/migrate.h>
#include <linux/page-isolation.h>
#include <linux/pfn.h>
#include <linux/suspend.h>
#include <linux/mm_inline.h>
#include <linux/firmware-map.h>
#include <linux/stop_machine.h>
#include <linux/hugetlb.h>
#include <linux/memblock.h>
#include <linux/compaction.h>
#include <linux/rmap.h>

#include <asm/tlbflush.h>

#include "internal.h"
#include "shuffle.h"

/*
 * online_page_callback contains a pointer to the current page onlining
 * function. Initially it is generic_online_page(). If required, it can be
 * changed by calling set_online_page_callback() to register a callback and
 * restore_online_page_callback() to restore the generic callback.
 */

static online_page_callback_t online_page_callback = generic_online_page;
static DEFINE_MUTEX(online_page_callback_lock);

DEFINE_STATIC_PERCPU_RWSEM(mem_hotplug_lock);

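/*
 * get_online_mems()/put_online_mems() take and release mem_hotplug_lock
 * for reading, preventing memory from being onlined or offlined while
 * the caller inspects it.
 */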
void get_online_mems(void)
{
	percpu_down_read(&mem_hotplug_lock);
}

void put_online_mems(void)
{
	percpu_up_read(&mem_hotplug_lock);
}

bool movable_node_enabled = false;

#ifndef CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE
int memhp_default_online_type = MMOP_OFFLINE;
#else
int memhp_default_online_type = MMOP_ONLINE;
#endif

static int __init setup_memhp_default_state(char *str)
{
	const int online_type = memhp_online_type_from_str(str);

	if (online_type >= 0)
		memhp_default_online_type = online_type;

	return 1;
}
__setup("memhp_default_state=", setup_memhp_default_state);

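/*
 * mem_hotplug_begin()/mem_hotplug_done() bracket a hotplug operation:
 * they hold the CPU hotplug lock and take mem_hotplug_lock for writing,
 * excluding all readers of the memory state.
 */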
void mem_hotplug_begin(void)
{
	cpus_read_lock();
	percpu_down_write(&mem_hotplug_lock);
}

void mem_hotplug_done(void)
{
	percpu_up_write(&mem_hotplug_lock);
	cpus_read_unlock();
}

u64 max_mem_size = U64_MAX;

/* add this memory to iomem resource */
static struct resource *register_memory_resource(u64 start, u64 size)
{
	struct resource *res;
	unsigned long flags =  IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
	char *resource_name = "System RAM";

	/*
	 * Make sure value parsed from 'mem=' only restricts memory adding
	 * while booting, so that memory hotplug won't be impacted. Please
	 * refer to document of 'mem=' in kernel-parameters.txt for more
	 * details.
	 */
	if (start + size > max_mem_size && system_state < SYSTEM_RUNNING)
		return ERR_PTR(-E2BIG);

	/*
	 * Request ownership of the new memory range.  This might be
	 * a child of an existing resource that was present but
	 * not marked as busy.
	 */
	res = __request_region(&iomem_resource, start, size,
			       resource_name, flags);

	if (!res) {
		pr_debug("Unable to reserve System RAM region: %016llx->%016llx\n",
				start, start + size);
		return ERR_PTR(-EEXIST);
	}
	return res;
}

static void release_memory_resource(struct resource *res)
{
	if (!res)
		return;
	release_resource(res);
	kfree(res);
}

#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
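/*
 * Tag a bootmem page as carrying memory hotplug metadata of @type, stash
 * @info in page->private and take an extra reference; the page is released
 * again via put_page_bootmem().
 */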
void get_page_bootmem(unsigned long info,  struct page *page,
		      unsigned long type)
{
	page->freelist = (void *)type;
	SetPagePrivate(page);
	set_page_private(page, info);
	page_ref_inc(page);
}

void put_page_bootmem(struct page *page)
{
	unsigned long type;

	type = (unsigned long) page->freelist;
	BUG_ON(type < MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE ||
	       type > MEMORY_HOTPLUG_MAX_BOOTMEM_TYPE);

	if (page_ref_dec_return(page) == 1) {
		page->freelist = NULL;
		ClearPagePrivate(page);
		set_page_private(page, 0);
		INIT_LIST_HEAD(&page->lru);
		free_reserved_page(page);
	}
}

#ifdef CONFIG_HAVE_BOOTMEM_INFO_NODE
#ifndef CONFIG_SPARSEMEM_VMEMMAP
static void register_page_bootmem_info_section(unsigned long start_pfn)
{
	unsigned long mapsize, section_nr, i;
	struct mem_section *ms;
	struct page *page, *memmap;
	struct mem_section_usage *usage;

	section_nr = pfn_to_section_nr(start_pfn);
	ms = __nr_to_section(section_nr);

	/* Get section's memmap address */
	memmap = sparse_decode_mem_map(ms->section_mem_map, section_nr);

	/*
	 * Get page for the memmap's phys address
	 * XXX: need more consideration for sparse_vmemmap...
	 */
	page = virt_to_page(memmap);
	mapsize = sizeof(struct page) * PAGES_PER_SECTION;
	mapsize = PAGE_ALIGN(mapsize) >> PAGE_SHIFT;

	/* remember memmap's page */
	for (i = 0; i < mapsize; i++, page++)
		get_page_bootmem(section_nr, page, SECTION_INFO);

	usage = ms->usage;
	page = virt_to_page(usage);

	mapsize = PAGE_ALIGN(mem_section_usage_size()) >> PAGE_SHIFT;

	for (i = 0; i < mapsize; i++, page++)
		get_page_bootmem(section_nr, page, MIX_SECTION_INFO);

}
#else /* CONFIG_SPARSEMEM_VMEMMAP */
static void register_page_bootmem_info_section(unsigned long start_pfn)
{
	unsigned long mapsize, section_nr, i;
	struct mem_section *ms;
	struct page *page, *memmap;
	struct mem_section_usage *usage;

	section_nr = pfn_to_section_nr(start_pfn);
	ms = __nr_to_section(section_nr);

	memmap = sparse_decode_mem_map(ms->section_mem_map, section_nr);

	register_page_bootmem_memmap(section_nr, memmap, PAGES_PER_SECTION);

	usage = ms->usage;
	page = virt_to_page(usage);

	mapsize = PAGE_ALIGN(mem_section_usage_size()) >> PAGE_SHIFT;

	for (i = 0; i < mapsize; i++, page++)
		get_page_bootmem(section_nr, page, MIX_SECTION_INFO);
}
#endif /* !CONFIG_SPARSEMEM_VMEMMAP */

void __init register_page_bootmem_info_node(struct pglist_data *pgdat)
{
	unsigned long i, pfn, end_pfn, nr_pages;
	int node = pgdat->node_id;
	struct page *page;

	nr_pages = PAGE_ALIGN(sizeof(struct pglist_data)) >> PAGE_SHIFT;
	page = virt_to_page(pgdat);

	for (i = 0; i < nr_pages; i++, page++)
		get_page_bootmem(node, page, NODE_INFO);

	pfn = pgdat->node_start_pfn;
	end_pfn = pgdat_end_pfn(pgdat);

	/* register section info */
	for (; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
		/*
		 * Some platforms can assign the same pfn to multiple nodes - on
		 * node0 as well as nodeN.  To avoid registering a pfn against
		 * multiple nodes we check that this pfn does not already
		 * reside on some other node.
		 */
		if (pfn_valid(pfn) && (early_pfn_to_nid(pfn) == node))
			register_page_bootmem_info_section(pfn);
	}
}
#endif /* CONFIG_HAVE_BOOTMEM_INFO_NODE */

static int check_pfn_span(unsigned long pfn, unsigned long nr_pages,
		const char *reason)
{
	/*
	 * Disallow all operations smaller than a sub-section and only
	 * allow operations smaller than a section for
	 * SPARSEMEM_VMEMMAP. Note that check_hotplug_memory_range()
	 * enforces a larger memory_block_size_bytes() granularity for
	 * memory that will be marked online, so this check should only
	 * fire for direct arch_{add,remove}_memory() users outside of
	 * add_memory_resource().
	 */
	unsigned long min_align;

	if (IS_ENABLED(CONFIG_SPARSEMEM_VMEMMAP))
		min_align = PAGES_PER_SUBSECTION;
	else
		min_align = PAGES_PER_SECTION;
	if (!IS_ALIGNED(pfn, min_align)
			|| !IS_ALIGNED(nr_pages, min_align)) {
		WARN(1, "Misaligned __%s_pages start: %#lx end: %#lx\n",
				reason, pfn, pfn + nr_pages - 1);
		return -EINVAL;
	}
	return 0;
}

static int check_hotplug_memory_addressable(unsigned long pfn,
					    unsigned long nr_pages)
{
	const u64 max_addr = PFN_PHYS(pfn + nr_pages) - 1;

	if (max_addr >> MAX_PHYSMEM_BITS) {
		const u64 max_allowed = (1ull << (MAX_PHYSMEM_BITS + 1)) - 1;
		WARN(1,
		     "Hotplugged memory exceeds maximum addressable address, range=%#llx-%#llx, maximum=%#llx\n",
		     (u64)PFN_PHYS(pfn), max_addr, max_allowed);
		return -E2BIG;
	}

	return 0;
}

/*
 * Reasonably generic function for adding memory.  It is
 * expected that archs that support memory hotplug will
 * call this function after deciding the zone to which to
 * add the new pages.
 */
int __ref __add_pages(int nid, unsigned long pfn, unsigned long nr_pages,
		struct mhp_params *params)
{
	const unsigned long end_pfn = pfn + nr_pages;
	unsigned long cur_nr_pages;
	int err;
	struct vmem_altmap *altmap = params->altmap;

	if (WARN_ON_ONCE(!params->pgprot.pgprot))
		return -EINVAL;

	err = check_hotplug_memory_addressable(pfn, nr_pages);
	if (err)
		return err;

	if (altmap) {
		/*
		 * Validate altmap is within bounds of the total request
		 */
		if (altmap->base_pfn != pfn
				|| vmem_altmap_offset(altmap) > nr_pages) {
			pr_warn_once("memory add fail, invalid altmap\n");
			return -EINVAL;
		}
		altmap->alloc = 0;
	}

	err = check_pfn_span(pfn, nr_pages, "add");
	if (err)
		return err;

	for (; pfn < end_pfn; pfn += cur_nr_pages) {
		/* Select all remaining pages up to the next section boundary */
		cur_nr_pages = min(end_pfn - pfn,
				   SECTION_ALIGN_UP(pfn + 1) - pfn);
		err = sparse_add_section(nid, pfn, cur_nr_pages, altmap);
		if (err)
			break;
		cond_resched();
	}
	vmemmap_populate_print_last();
	return err;
}

/* find the smallest valid pfn in the range [start_pfn, end_pfn) */
static unsigned long find_smallest_section_pfn(int nid, struct zone *zone,
				     unsigned long start_pfn,
				     unsigned long end_pfn)
{
	for (; start_pfn < end_pfn; start_pfn += PAGES_PER_SUBSECTION) {
		if (unlikely(!pfn_to_online_page(start_pfn)))
			continue;

		if (unlikely(pfn_to_nid(start_pfn) != nid))
			continue;

		if (zone != page_zone(pfn_to_page(start_pfn)))
			continue;

		return start_pfn;
	}

	return 0;
}

/* find the biggest valid pfn in the range [start_pfn, end_pfn). */
static unsigned long find_biggest_section_pfn(int nid, struct zone *zone,
				    unsigned long start_pfn,
				    unsigned long end_pfn)
{
	unsigned long pfn;

	/* pfn is the end pfn of a memory section. */
	pfn = end_pfn - 1;
	for (; pfn >= start_pfn; pfn -= PAGES_PER_SUBSECTION) {
		if (unlikely(!pfn_to_online_page(pfn)))
			continue;

		if (unlikely(pfn_to_nid(pfn) != nid))
			continue;

		if (zone != page_zone(pfn_to_page(pfn)))
			continue;

		return pfn;
	}

	return 0;
}

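/*
 * Shrink the span of @zone after the range [@start_pfn, @end_pfn) has been
 * removed from it: if the range sat at either end of the zone, the new
 * first/last pfn still backed by this zone is looked up and the span is
 * resized accordingly (or cleared if nothing remains).
 */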
static void shrink_zone_span(struct zone *zone, unsigned long start_pfn,
			     unsigned long end_pfn)
{
	unsigned long pfn;
	int nid = zone_to_nid(zone);

	zone_span_writelock(zone);
	if (zone->zone_start_pfn == start_pfn) {
		/*
		 * If the section is the smallest section in the zone, it needs
		 * to shrink zone->zone_start_pfn and zone->spanned_pages.
		 * In this case, we find the second smallest valid mem_section
		 * for shrinking the zone.
		 */
		pfn = find_smallest_section_pfn(nid, zone, end_pfn,
						zone_end_pfn(zone));
		if (pfn) {
			zone->spanned_pages = zone_end_pfn(zone) - pfn;
			zone->zone_start_pfn = pfn;
		} else {
			zone->zone_start_pfn = 0;
			zone->spanned_pages = 0;
		}
	} else if (zone_end_pfn(zone) == end_pfn) {
		/*
		 * If the section is the biggest section in the zone, it needs
		 * to shrink zone->spanned_pages.
		 * In this case, we find the second biggest valid mem_section
		 * for shrinking the zone.
		 */
		pfn = find_biggest_section_pfn(nid, zone, zone->zone_start_pfn,
					       start_pfn);
		if (pfn)
			zone->spanned_pages = pfn - zone->zone_start_pfn + 1;
		else {
			zone->zone_start_pfn = 0;
			zone->spanned_pages = 0;
		}
	}
	zone_span_writeunlock(zone);
}

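/* Recompute the node span from the spans of its remaining zones. */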
static void update_pgdat_span(struct pglist_data *pgdat)
{
	unsigned long node_start_pfn = 0, node_end_pfn = 0;
	struct zone *zone;

	for (zone = pgdat->node_zones;
	     zone < pgdat->node_zones + MAX_NR_ZONES; zone++) {
		unsigned long zone_end_pfn = zone->zone_start_pfn +
					     zone->spanned_pages;

		/* No need to lock the zones, they can't change. */
		if (!zone->spanned_pages)
			continue;
		if (!node_end_pfn) {
			node_start_pfn = zone->zone_start_pfn;
			node_end_pfn = zone_end_pfn;
			continue;
		}

		if (zone_end_pfn > node_end_pfn)
			node_end_pfn = zone_end_pfn;
		if (zone->zone_start_pfn < node_start_pfn)
			node_start_pfn = zone->zone_start_pfn;
	}

	pgdat->node_start_pfn = node_start_pfn;
	pgdat->node_spanned_pages = node_end_pfn - node_start_pfn;
}

void __ref remove_pfn_range_from_zone(struct zone *zone,
				      unsigned long start_pfn,
				      unsigned long nr_pages)
{
	const unsigned long end_pfn = start_pfn + nr_pages;
	struct pglist_data *pgdat = zone->zone_pgdat;
	unsigned long pfn, cur_nr_pages, flags;

	/* Poison struct pages because they are now uninitialized again. */
	for (pfn = start_pfn; pfn < end_pfn; pfn += cur_nr_pages) {
		cond_resched();

		/* Select all remaining pages up to the next section boundary */
		cur_nr_pages =
			min(end_pfn - pfn, SECTION_ALIGN_UP(pfn + 1) - pfn);
		page_init_poison(pfn_to_page(pfn),
				 sizeof(struct page) * cur_nr_pages);
	}

#ifdef CONFIG_ZONE_DEVICE
	/*
	 * Zone shrinking code cannot properly deal with ZONE_DEVICE. So
	 * we will not try to shrink the zones - which is okay as
	 * set_zone_contiguous() cannot deal with ZONE_DEVICE either way.
	 */
	if (zone_idx(zone) == ZONE_DEVICE)
		return;
#endif

	clear_zone_contiguous(zone);

	pgdat_resize_lock(zone->zone_pgdat, &flags);
	shrink_zone_span(zone, start_pfn, start_pfn + nr_pages);
	update_pgdat_span(pgdat);
	pgdat_resize_unlock(zone->zone_pgdat, &flags);

	set_zone_contiguous(zone);
}

static void __remove_section(unsigned long pfn, unsigned long nr_pages,
			     unsigned long map_offset,
			     struct vmem_altmap *altmap)
{
	struct mem_section *ms = __pfn_to_section(pfn);

	if (WARN_ON_ONCE(!valid_section(ms)))
		return;

	sparse_remove_section(ms, pfn, nr_pages, map_offset, altmap);
}

/**
 * __remove_pages() - remove sections of pages
 * @pfn: starting pageframe (must be aligned to start of a section)
 * @nr_pages: number of pages to remove (must be multiple of section size)
 * @altmap: alternative device page map or %NULL if default memmap is used
 *
 * Generic helper function to remove section mappings and sysfs entries
 * for the section of the memory we are removing. Caller needs to make
 * sure that pages are marked reserved and zones are adjusted properly by
 * calling offline_pages().
 */
void __remove_pages(unsigned long pfn, unsigned long nr_pages,
		    struct vmem_altmap *altmap)
{
	const unsigned long end_pfn = pfn + nr_pages;
	unsigned long cur_nr_pages;
	unsigned long map_offset = 0;

	map_offset = vmem_altmap_offset(altmap);

	if (check_pfn_span(pfn, nr_pages, "remove"))
		return;

	for (; pfn < end_pfn; pfn += cur_nr_pages) {
		cond_resched();
		/* Select all remaining pages up to the next section boundary */
		cur_nr_pages = min(end_pfn - pfn,
				   SECTION_ALIGN_UP(pfn + 1) - pfn);
		__remove_section(pfn, cur_nr_pages, map_offset, altmap);
		map_offset = 0;
	}
}

int set_online_page_callback(online_page_callback_t callback)
{
	int rc = -EINVAL;

	get_online_mems();
	mutex_lock(&online_page_callback_lock);

	if (online_page_callback == generic_online_page) {
		online_page_callback = callback;
		rc = 0;
	}

	mutex_unlock(&online_page_callback_lock);
	put_online_mems();

	return rc;
}
EXPORT_SYMBOL_GPL(set_online_page_callback);

int restore_online_page_callback(online_page_callback_t callback)
{
	int rc = -EINVAL;

	get_online_mems();
	mutex_lock(&online_page_callback_lock);

	if (online_page_callback == callback) {
		online_page_callback = generic_online_page;
		rc = 0;
	}

	mutex_unlock(&online_page_callback_lock);
	put_online_mems();

	return rc;
}
EXPORT_SYMBOL_GPL(restore_online_page_callback);
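
/*
 * Illustrative only (not part of this file): a driver that wants to
 * intercept pages as they come online - e.g. a memory ballooning driver -
 * could register a callback roughly like this:
 *
 *	static void my_online_page(struct page *page, unsigned int order)
 *	{
 *		// keep the page for the driver or hand it to the buddy:
 *		generic_online_page(page, order);
 *	}
 *
 *	set_online_page_callback(&my_online_page);
 *	...
 *	restore_online_page_callback(&my_online_page);
 *
 * Only one callback can be installed at a time; set_online_page_callback()
 * returns -EINVAL if generic_online_page() has already been replaced.
 */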

void generic_online_page(struct page *page, unsigned int order)
{
	/*
	 * Freeing the page with debug_pagealloc enabled will try to unmap it,
	 * so we should map it first. This is better than introducing a special
	 * case in page freeing fast path.
	 */
	if (debug_pagealloc_enabled_static())
		kernel_map_pages(page, 1 << order, 1);
	__free_pages_core(page, order);
	totalram_pages_add(1UL << order);
#ifdef CONFIG_HIGHMEM
	if (PageHighMem(page))
		totalhigh_pages_add(1UL << order);
#endif
}
EXPORT_SYMBOL_GPL(generic_online_page);

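/*
 * Hand all pages in [start_pfn, start_pfn + nr_pages) to the current
 * online_page_callback in the largest aligned chunks possible and mark the
 * covered memory sections online; *(unsigned long *)arg accumulates the
 * number of pages processed.
 */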
static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages,
			void *arg)
{
	const unsigned long end_pfn = start_pfn + nr_pages;
	unsigned long pfn;
	int order;

	/*
	 * Online the pages. The callback might decide to keep some pages
	 * PG_reserved (to add them to the buddy later), but we still account
	 * them as being online/belonging to this zone ("present").
	 */
	for (pfn = start_pfn; pfn < end_pfn; pfn += 1ul << order) {
		order = min(MAX_ORDER - 1, get_order(PFN_PHYS(end_pfn - pfn)));
		/* __free_pages_core() wants pfns to be aligned to the order */
		if (WARN_ON_ONCE(!IS_ALIGNED(pfn, 1ul << order)))
			order = 0;
		(*online_page_callback)(pfn_to_page(pfn), order);
	}

	/* mark all involved sections as online */
	online_mem_sections(start_pfn, end_pfn);

	*(unsigned long *)arg += nr_pages;
	return 0;
}

/* check which state of node_states will be changed when online memory */
static void node_states_check_changes_online(unsigned long nr_pages,
	struct zone *zone, struct memory_notify *arg)
{
	int nid = zone_to_nid(zone);

	arg->status_change_nid = NUMA_NO_NODE;
	arg->status_change_nid_normal = NUMA_NO_NODE;
	arg->status_change_nid_high = NUMA_NO_NODE;

	if (!node_state(nid, N_MEMORY))
		arg->status_change_nid = nid;
	if (zone_idx(zone) <= ZONE_NORMAL && !node_state(nid, N_NORMAL_MEMORY))
		arg->status_change_nid_normal = nid;
#ifdef CONFIG_HIGHMEM
	if (zone_idx(zone) <= ZONE_HIGHMEM && !node_state(nid, N_HIGH_MEMORY))
		arg->status_change_nid_high = nid;
#endif
}

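/* Apply the node_states transitions computed by node_states_check_changes_online(). */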
static void node_states_set_node(int node, struct memory_notify *arg)
{
	if (arg->status_change_nid_normal >= 0)
		node_set_state(node, N_NORMAL_MEMORY);

	if (arg->status_change_nid_high >= 0)
		node_set_state(node, N_HIGH_MEMORY);

	if (arg->status_change_nid >= 0)
		node_set_state(node, N_MEMORY);
}

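/* Grow @zone so that it spans at least [@start_pfn, @start_pfn + @nr_pages). */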
static void __meminit resize_zone_range(struct zone *zone, unsigned long start_pfn,
		unsigned long nr_pages)
{
	unsigned long old_end_pfn = zone_end_pfn(zone);

	if (zone_is_empty(zone) || start_pfn < zone->zone_start_pfn)
		zone->zone_start_pfn = start_pfn;

	zone->spanned_pages = max(start_pfn + nr_pages, old_end_pfn) - zone->zone_start_pfn;
}

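/* Grow the node span so that it covers at least [@start_pfn, @start_pfn + @nr_pages). */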
static void __meminit resize_pgdat_range(struct pglist_data *pgdat, unsigned long start_pfn,
                                     unsigned long nr_pages)
{
	unsigned long old_end_pfn = pgdat_end_pfn(pgdat);

	if (!pgdat->node_spanned_pages || start_pfn < pgdat->node_start_pfn)
		pgdat->node_start_pfn = start_pfn;

	pgdat->node_spanned_pages = max(start_pfn + nr_pages, old_end_pfn) - pgdat->node_start_pfn;

}
/*
 * Associate the pfn range with the given zone, initializing the memmaps
 * and resizing the pgdat/zone data to span the added pages. After this
 * call, all affected pages are PG_reserved.
 */
void __ref move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn,
		unsigned long nr_pages, struct vmem_altmap *altmap)
{
	struct pglist_data *pgdat = zone->zone_pgdat;
	int nid = pgdat->node_id;
	unsigned long flags;

	clear_zone_contiguous(zone);

	/* TODO Huh pgdat is irqsave while zone is not. It used to be like that before */
	pgdat_resize_lock(pgdat, &flags);
	zone_span_writelock(zone);
	if (zone_is_empty(zone))
		init_currently_empty_zone(zone, start_pfn, nr_pages);
	resize_zone_range(zone, start_pfn, nr_pages);
	zone_span_writeunlock(zone);
	resize_pgdat_range(pgdat, start_pfn, nr_pages);
	pgdat_resize_unlock(pgdat, &flags);

	/*
	 * TODO now we have a visible range of pages which are not associated
	 * with their zone properly. Not nice but set_pfnblock_flags_mask
	 * expects the zone spans the pfn range. All the pages in the range
	 * are reserved so nobody should be touching them so we should be safe
	 */
	memmap_init_zone(nr_pages, nid, zone_idx(zone), start_pfn,
			MEMMAP_HOTPLUG, altmap);

	set_zone_contiguous(zone);
}

/*
 * Returns a default kernel memory zone for the given pfn range.
 * If no kernel zone covers this pfn range it will automatically go
 * to the ZONE_NORMAL.
 */
static struct zone *default_kernel_zone_for_pfn(int nid, unsigned long start_pfn,
		unsigned long nr_pages)
{
	struct pglist_data *pgdat = NODE_DATA(nid);
	int zid;

	for (zid = 0; zid <= ZONE_NORMAL; zid++) {
		struct zone *zone = &pgdat->node_zones[zid];

		if (zone_intersects(zone, start_pfn, nr_pages))
			return zone;
	}

	return &pgdat->node_zones[ZONE_NORMAL];
}

static inline struct zone *default_zone_for_pfn(int nid, unsigned long start_pfn,
		unsigned long nr_pages)
{
	struct zone *kernel_zone = default_kernel_zone_for_pfn(nid, start_pfn,
			nr_pages);
	struct zone *movable_zone = &NODE_DATA(nid)->node_zones[ZONE_MOVABLE];
	bool in_kernel = zone_intersects(kernel_zone, start_pfn, nr_pages);
	bool in_movable = zone_intersects(movable_zone, start_pfn, nr_pages);

	/*
	 * We inherit the existing zone in a simple case where zones do not
	 * overlap in the given range
	 */
	if (in_kernel ^ in_movable)
		return (in_kernel) ? kernel_zone : movable_zone;

	/*
	 * If the range doesn't belong to any zone or two zones overlap in the
	 * given range then we use movable zone only if movable_node is
	 * enabled because we always online to a kernel zone by default.
	 */
	return movable_node_enabled ? movable_zone : kernel_zone;
}

struct zone * zone_for_pfn_range(int online_type, int nid, unsigned start_pfn,
		unsigned long nr_pages)
{
	if (online_type == MMOP_ONLINE_KERNEL)
		return default_kernel_zone_for_pfn(nid, start_pfn, nr_pages);

	if (online_type == MMOP_ONLINE_MOVABLE)
		return &NODE_DATA(nid)->node_zones[ZONE_MOVABLE];

	return default_zone_for_pfn(nid, start_pfn, nr_pages);
}

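/*
 * Online @nr_pages pages starting at @pfn on node @nid: associate the range
 * with a zone chosen from @online_type, hand the pages to the page allocator
 * and update the zone/node accounting. Returns 0 on success or a negative
 * error code.
 */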
int __ref online_pages(unsigned long pfn, unsigned long nr_pages,
		       int online_type, int nid)
{
	unsigned long flags;
	unsigned long onlined_pages = 0;
	struct zone *zone;
	int need_zonelists_rebuild = 0;
	int ret;
	struct memory_notify arg;

	mem_hotplug_begin();

	/* associate pfn range with the zone */
	zone = zone_for_pfn_range(online_type, nid, pfn, nr_pages);
	move_pfn_range_to_zone(zone, pfn, nr_pages, NULL);

	arg.start_pfn = pfn;
	arg.nr_pages = nr_pages;
	node_states_check_changes_online(nr_pages, zone, &arg);

	ret = memory_notify(MEM_GOING_ONLINE, &arg);
	ret = notifier_to_errno(ret);
	if (ret)
		goto failed_addition;

	/*
	 * If this zone is not populated, then it is not in zonelist.
	 * This means the page allocator ignores this zone.
	 * So, zonelist must be updated after online.
	 */
	if (!populated_zone(zone)) {
		need_zonelists_rebuild = 1;
		setup_zone_pageset(zone);
	}

	ret = walk_system_ram_range(pfn, nr_pages, &onlined_pages,
		online_pages_range);
	if (ret) {
		/* not a single memory resource was applicable */
		if (need_zonelists_rebuild)
			zone_pcp_reset(zone);
		goto failed_addition;
	}

	zone->present_pages += onlined_pages;

	pgdat_resize_lock(zone->zone_pgdat, &flags);
	zone->zone_pgdat->node_present_pages += onlined_pages;
	pgdat_resize_unlock(zone->zone_pgdat, &flags);

	shuffle_zone(zone);

	node_states_set_node(nid, &arg);
	if (need_zonelists_rebuild)
		build_all_zonelists(NULL);
	else
		zone_pcp_update(zone);

	init_per_zone_wmark_min();

	kswapd_run(nid);
	kcompactd_run(nid);

	vm_total_pages = nr_free_pagecache_pages();

	writeback_set_ratelimit();

	memory_notify(MEM_ONLINE, &arg);
	mem_hotplug_done();
	return 0;

failed_addition:
	pr_debug("online_pages [mem %#010llx-%#010llx] failed\n",
		 (unsigned long long) pfn << PAGE_SHIFT,
		 (((unsigned long long) pfn + nr_pages) << PAGE_SHIFT) - 1);
	memory_notify(MEM_CANCEL_ONLINE, &arg);
	remove_pfn_range_from_zone(zone, pfn, nr_pages);
	mem_hotplug_done();
	return ret;
}
#endif /* CONFIG_MEMORY_HOTPLUG_SPARSE */

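/* Zero the present-page counters of every zone on the node and of the node itself. */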
static void reset_node_present_pages(pg_data_t *pgdat)
{
	struct zone *z;

	for (z = pgdat->node_zones; z < pgdat->node_zones + MAX_NR_ZONES; z++)
		z->present_pages = 0;

	pgdat->node_present_pages = 0;
}

/* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */
static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start)
{
	struct pglist_data *pgdat;
	unsigned long start_pfn = PFN_DOWN(start);

	pgdat = NODE_DATA(nid);
	if (!pgdat) {
		pgdat = arch_alloc_nodedata(nid);
		if (!pgdat)
			return NULL;

		pgdat->per_cpu_nodestats =
			alloc_percpu(struct per_cpu_nodestat);
		arch_refresh_nodedata(nid, pgdat);
	} else {
		int cpu;
		/*
		 * Reset the nr_zones, order and classzone_idx before reuse.
		 * Note that kswapd will init kswapd_classzone_idx properly
		 * when it starts in the near future.
		 */
		pgdat->nr_zones = 0;
		pgdat->kswapd_order = 0;
		pgdat->kswapd_classzone_idx = 0;
		for_each_online_cpu(cpu) {
			struct per_cpu_nodestat *p;

			p = per_cpu_ptr(pgdat->per_cpu_nodestats, cpu);
			memset(p, 0, sizeof(*p));
		}
	}

	/* we can use NODE_DATA(nid) from here */

	pgdat->node_id = nid;
	pgdat->node_start_pfn = start_pfn;

	/* init node's zones as empty zones, we don't have any present pages.*/
	free_area_init_core_hotplug(nid);

	/*
	 * The node we allocated has no zone fallback lists. To avoid
	 * accessing a not-yet-initialized zonelist, build it here.
	 */
	build_all_zonelists(pgdat);

	/*
	 * When memory is hot-added, all the memory is in offline state. So
	 * clear all zones' present_pages because they will be updated in
	 * online_pages() and offline_pages().
	 */
	reset_node_managed_pages(pgdat);
	reset_node_present_pages(pgdat);

	return pgdat;
}

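/* Undo hotadd_new_pgdat(): unhook the node data, free its per-cpu stats and the pgdat. */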
static void rollback_node_hotadd(int nid)
{
	pg_data_t *pgdat = NODE_DATA(nid);

	arch_refresh_nodedata(nid, NULL);
	free_percpu(pgdat->per_cpu_nodestats);
	arch_free_nodedata(pgdat);
}


/**
 * try_online_node - online a node if offlined
 * @nid: the node ID
 * @start: start addr of the node
 * @set_node_online: Whether we want to online the node
 * called by cpu_up() to online a node without onlined memory.
 *
 * Returns:
 * 1 -> a new node has been allocated
 * 0 -> the node is already online
 * -ENOMEM -> the node could not be allocated
 */
static int __try_online_node(int nid, u64 start, bool set_node_online)
{
	pg_data_t *pgdat;
	int ret = 1;

	if (node_online(nid))
		return 0;

	pgdat = hotadd_new_pgdat(nid, start);
	if (!pgdat) {
		pr_err("Cannot online node %d due to NULL pgdat\n", nid);
		ret = -ENOMEM;
		goto out;
	}

	if (set_node_online) {
		node_set_online(nid);
		ret = register_one_node(nid);
		BUG_ON(ret);
	}
out:
	return ret;
}

/*
 * Users of this function always want to online/register the node
 */
int try_online_node(int nid)
{
	int ret;

	mem_hotplug_begin();
	ret =  __try_online_node(nid, 0, true);
	mem_hotplug_done();
	return ret;
}

static int check_hotplug_memory_range(u64 start, u64 size)
{
	/* memory range must be block size aligned */
	if (!size || !IS_ALIGNED(start, memory_block_size_bytes()) ||
	    !IS_ALIGNED(size, memory_block_size_bytes())) {
		pr_err("Block size [%#lx] unaligned hotplug range: start %#llx, size %#llx",
		       memory_block_size_bytes(), start, size);
		return -EINVAL;
	}

	return 0;
}

static int online_memory_block(struct memory_block *mem, void *arg)
{
	mem->online_type = memhp_default_online_type;
	return device_online(&mem->dev);
}

/*
 * NOTE: The caller must call lock_device_hotplug() to serialize hotplug
 * and online/offline operations (triggered e.g. by sysfs).
 *
 * we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG
 */
int __ref add_memory_resource(int nid, struct resource *res)
{
	struct mhp_params params = { .pgprot = PAGE_KERNEL };
	u64 start, size;
	bool new_node = false;
	int ret;

	start = res->start;
	size = resource_size(res);

	ret = check_hotplug_memory_range(start, size);
	if (ret)
		return ret;

	mem_hotplug_begin();

	/*
	 * Add new range to memblock so that when hotadd_new_pgdat() is called
	 * to allocate new pgdat, get_pfn_range_for_nid() will be able to find
	 * this new range and calculate total pages correctly.  The range will
	 * be removed at hot-remove time.
	 */
	memblock_add_node(start, size, nid);

	ret = __try_online_node(nid, start, false);
	if (ret < 0)
		goto error;
	new_node = ret;

	/* call arch's memory hotadd */
	ret = arch_add_memory(nid, start, size, &params);
	if (ret < 0)
		goto error;

	/* create memory block devices after memory was added */
	ret = create_memory_block_devices(start, size);
	if (ret) {
		arch_remove_memory(nid, start, size, NULL);
		goto error;
	}

	if (new_node) {
		/* If the sysfs file of the new node can't be created, CPUs
		 * on the node can't be hot-added. There is no rollback way
		 * now, so check it with BUG_ON() to catch it reluctantly.
		 * We online the node here and can't roll back from here.
		 */
		node_set_online(nid);
		ret = __register_one_node(nid);
		BUG_ON(ret);
	}

	/* link memory sections under this node.*/
	ret = link_mem_sections(nid, PFN_DOWN(start), PFN_UP(start + size - 1));
	BUG_ON(ret);

	/* create new memmap entry */
	firmware_map_add_hotplug(start, start + size, "System RAM");

	/* device_online() will take the lock when calling online_pages() */
	mem_hotplug_done();

	/* online pages if requested */
	if (memhp_default_online_type != MMOP_OFFLINE)
		walk_memory_blocks(start, size, NULL, online_memory_block);

	return ret;
error:
	/* rollback pgdat allocation and others */
	if (new_node)
		rollback_node_hotadd(nid);
	memblock_remove(start, size);
	mem_hotplug_done();
	return ret;
}

/* requires device_hotplug_lock, see add_memory_resource() */
int __ref __add_memory(int nid, u64 start, u64 size)
{
	struct resource *res;
	int ret;

	res = register_memory_resource(start, size);
	if (IS_ERR(res))
		return PTR_ERR(res);

	ret = add_memory_resource(nid, res);
	if (ret < 0)
		release_memory_resource(res);
	return ret;
}

int add_memory(int nid, u64 start, u64 size)
{
	int rc;

	lock_device_hotplug();
	rc = __add_memory(nid, start, size);
	unlock_device_hotplug();

	return rc;
}
EXPORT_SYMBOL_GPL(add_memory);
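
/*
 * Illustrative only (not part of this file): a driver that has discovered
 * RAM at [start, start + size) belonging to node @nid would plug it in with
 *
 *	rc = add_memory(nid, start, size);
 *
 * where start and size must be aligned to memory_block_size_bytes(). The new
 * blocks are onlined automatically only when memhp_default_online_type is
 * not MMOP_OFFLINE (see online_memory_block() above).
 */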

#ifdef CONFIG_MEMORY_HOTREMOVE
/*
 * A free page on the buddy free lists (not the per-cpu lists) has PageBuddy
 * set and the size of the free page is given by page_order(). Using this,
 * the function determines if the pageblock contains only free pages.
 * Due to buddy constraints, a free page at least the size of a pageblock will
 * be located at the start of the pageblock.
 */
static inline int pageblock_free(struct page *page)
{
	return PageBuddy(page) && page_order(page) >= pageblock_order;
}

/* Return the pfn of the start of the next active pageblock after a given pfn */
static unsigned long next_active_pageblock(unsigned long pfn)
{
	struct page *page = pfn_to_page(pfn);

	/* Ensure the starting page is pageblock-aligned */
	BUG_ON(pfn & (pageblock_nr_pages - 1));

	/* If the entire pageblock is free, move to the end of free page */
	if (pageblock_free(page)) {
		int order;
		/* be careful. we don't have locks, page_order can be changed.*/
		order = page_order(page);
		if ((order < MAX_ORDER) && (order >= pageblock_order))
			return pfn + (1 << order);
	}

	return pfn + pageblock_nr_pages;
}

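/*
 * Lockless check whether a pageblock contains only movable or free pages,
 * i.e. whether it could plausibly be offlined.
 */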
static bool is_pageblock_removable_nolock(unsigned long pfn)
{
	struct page *page = pfn_to_page(pfn);
	struct zone *zone;

	/*
	 * We have to be careful here because we are iterating over memory
	 * sections which are not zone aware so we might end up outside of
	 * the zone but still within the section.
	 * We have to take care about the node as well. If the node is offline
	 * its NODE_DATA will be NULL - see page_zone.
	 */
	if (!node_online(page_to_nid(page)))
		return false;

	zone = page_zone(page);
	pfn = page_to_pfn(page);
	if (!zone_spans_pfn(zone, pfn))
		return false;

	return !has_unmovable_pages(zone, page, MIGRATE_MOVABLE,
				    MEMORY_OFFLINE);
}

/* Checks if this range of memory is likely to be hot-removable. */
bool is_mem_section_removable(unsigned long start_pfn, unsigned long nr_pages)
{
	unsigned long end_pfn, pfn;

	end_pfn = min(start_pfn + nr_pages,
			zone_end_pfn(page_zone(pfn_to_page(start_pfn))));

	/* Check the starting page of each pageblock within the range */
	for (pfn = start_pfn; pfn < end_pfn; pfn = next_active_pageblock(pfn)) {
		if (!is_pageblock_removable_nolock(pfn))
			return false;
		cond_resched();
	}

	/* All pageblocks in the memory block are likely to be hot-removable */
	return true;
}

/*
 * Confirm all pages in a range [start, end) belong to the same zone (skipping
 * memory holes). When true, return the zone.
 */
struct zone *test_pages_in_a_zone(unsigned long start_pfn,
				  unsigned long end_pfn)
{
	unsigned long pfn, sec_end_pfn;
	struct zone *zone = NULL;
	struct page *page;
	int i;
	for (pfn = start_pfn, sec_end_pfn = SECTION_ALIGN_UP(start_pfn + 1);
	     pfn < end_pfn;
	     pfn = sec_end_pfn, sec_end_pfn += PAGES_PER_SECTION) {
		/* Make sure the memory section is present first */
		if (!present_section_nr(pfn_to_section_nr(pfn)))
			continue;
		for (; pfn < sec_end_pfn && pfn < end_pfn;
		     pfn += MAX_ORDER_NR_PAGES) {
			i = 0;
			/* This is just a CONFIG_HOLES_IN_ZONE check.*/
			while ((i < MAX_ORDER_NR_PAGES) &&
				!pfn_valid_within(pfn + i))
				i++;
			if (i == MAX_ORDER_NR_PAGES || pfn + i >= end_pfn)
				continue;
			/* Check if we got outside of the zone */
			if (zone && !zone_spans_pfn(zone, pfn + i))
				return NULL;
			page = pfn_to_page(pfn + i);
			if (zone && page_zone(page) != zone)
				return NULL;
			zone = page_zone(page);
		}
	}

	return zone;
}

/*
 * Scan pfn range [start,end) to find movable/migratable pages (LRU pages,
 * non-lru movable pages and hugepages). We scan by pfn because it's much
 * easier than walking a linked list. This function returns the pfn of the
 * first found movable page if one is found, otherwise 0.
 */
static unsigned long scan_movable_pages(unsigned long start, unsigned long end)
{
	unsigned long pfn;

	for (pfn = start; pfn < end; pfn++) {
		struct page *page, *head;
		unsigned long skip;

		if (!pfn_valid(pfn))
			continue;
		page = pfn_to_page(pfn);
		if (PageLRU(page))
			return pfn;
		if (__PageMovable(page))
			return pfn;

		if (!PageHuge(page))
			continue;
		head = compound_head(page);
		if (page_huge_active(head))
			return pfn;
		skip = compound_nr(head) - (page - head);
		pfn += skip - 1;
	}
	return 0;
}

static struct page *new_node_page(struct page *page, unsigned long private)
{
	int nid = page_to_nid(page);
	nodemask_t nmask = node_states[N_MEMORY];

	/*
	 * try to allocate from a different node but reuse this node if there
	 * are no other online nodes to be used (e.g. we are offlining a part
	 * of the only existing node)
	 */
	node_clear(nid, nmask);
	if (nodes_empty(nmask))
		node_set(nid, nmask);

	return new_page_nodemask(page, nid, &nmask);
}

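/*
 * Isolate every migratable page (LRU, non-lru movable and hugetlb pages) in
 * [start_pfn, end_pfn) and migrate it off this range, preferring a different
 * node as the target.
 */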
static int
do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
{
	unsigned long pfn;
	struct page *page;
	int ret = 0;
	LIST_HEAD(source);

	for (pfn = start_pfn; pfn < end_pfn; pfn++) {
		if (!pfn_valid(pfn))
			continue;
		page = pfn_to_page(pfn);

		if (PageHuge(page)) {
			struct page *head = compound_head(page);
			pfn = page_to_pfn(head) + compound_nr(head) - 1;
			isolate_huge_page(head, &source);
			continue;
		} else if (PageTransHuge(page))
			pfn = page_to_pfn(compound_head(page))
				+ hpage_nr_pages(page) - 1;

		/*
		 * HWPoison pages have elevated reference counts so the migration would
		 * fail on them. It also doesn't make any sense to migrate them in the
		 * first place. Still try to unmap such a page in case it is still mapped
		 * (e.g. current hwpoison implementation doesn't unmap KSM pages but keep
		 * the unmap as the catch all safety net).
		 */
		if (PageHWPoison(page)) {
			if (WARN_ON(PageLRU(page)))
				isolate_lru_page(page);
			if (page_mapped(page))
				try_to_unmap(page, TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS);
			continue;
		}

		if (!get_page_unless_zero(page))
			continue;
		/*
		 * We can skip free pages. And we can deal with pages on the
		 * LRU and with non-lru movable pages.
		 */
		if (PageLRU(page))
			ret = isolate_lru_page(page);
		else
			ret = isolate_movable_page(page, ISOLATE_UNEVICTABLE);
		if (!ret) { /* Success */
			list_add_tail(&page->lru, &source);
			if (!__PageMovable(page))
				inc_node_page_state(page, NR_ISOLATED_ANON +
						    page_is_file_lru(page));

		} else {
			pr_warn("failed to isolate pfn %lx\n", pfn);
			dump_page(page, "isolation failed");
		}
		put_page(page);
	}
	if (!list_empty(&source)) {
		/* Allocate a new page from the nearest neighbor node */
		ret = migrate_pages(&source, new_node_page, NULL, 0,
					MIGRATE_SYNC, MR_MEMORY_HOTPLUG);