mm, sparse: pass nid instead of pgdat to sparse_add_one_section()
mm/sparse.c
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * sparse memory mappings.
4  */
5 #include <linux/mm.h>
6 #include <linux/slab.h>
7 #include <linux/mmzone.h>
8 #include <linux/memblock.h>
9 #include <linux/compiler.h>
10 #include <linux/highmem.h>
11 #include <linux/export.h>
12 #include <linux/spinlock.h>
13 #include <linux/vmalloc.h>
14
15 #include "internal.h"
16 #include <asm/dma.h>
17 #include <asm/pgalloc.h>
18 #include <asm/pgtable.h>
19
20 /*
21  * Permanent SPARSEMEM data:
22  *
23  * 1) mem_section       - memory sections, mem_maps for valid memory
24  */
25 #ifdef CONFIG_SPARSEMEM_EXTREME
26 struct mem_section **mem_section;
27 #else
28 struct mem_section mem_section[NR_SECTION_ROOTS][SECTIONS_PER_ROOT]
29         ____cacheline_internodealigned_in_smp;
30 #endif
31 EXPORT_SYMBOL(mem_section);
32
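/*
 * For illustration only: with SPARSEMEM_EXTREME the table above is two
 * levels deep, and a section number is split into a root index and an
 * offset within that root, roughly as mmzone.h's __nr_to_section() does
 * (a sketch, not a redefinition; lookup() is just a placeholder name):
 *
 *	struct mem_section *lookup(unsigned long nr)
 *	{
 *		unsigned long root = SECTION_NR_TO_ROOT(nr);
 *
 *		if (!mem_section[root])
 *			return NULL;
 *		return &mem_section[root][nr & SECTION_ROOT_MASK];
 *	}
 *
 * Only roots that cover present memory get a SECTIONS_PER_ROOT-sized
 * array, allocated lazily by sparse_index_init() below.
 */
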
33 #ifdef NODE_NOT_IN_PAGE_FLAGS
34 /*
35  * If we did not store the node number in the page then we have to
36  * do a lookup in the section_to_node_table in order to find which
37  * node the page belongs to.
38  */
39 #if MAX_NUMNODES <= 256
40 static u8 section_to_node_table[NR_MEM_SECTIONS] __cacheline_aligned;
41 #else
42 static u16 section_to_node_table[NR_MEM_SECTIONS] __cacheline_aligned;
43 #endif
44
45 int page_to_nid(const struct page *page)
46 {
47         return section_to_node_table[page_to_section(page)];
48 }
49 EXPORT_SYMBOL(page_to_nid);
50
51 static void set_section_nid(unsigned long section_nr, int nid)
52 {
53         section_to_node_table[section_nr] = nid;
54 }
55 #else /* !NODE_NOT_IN_PAGE_FLAGS */
56 static inline void set_section_nid(unsigned long section_nr, int nid)
57 {
58 }
59 #endif
60
61 #ifdef CONFIG_SPARSEMEM_EXTREME
62 static noinline struct mem_section __ref *sparse_index_alloc(int nid)
63 {
64         struct mem_section *section = NULL;
65         unsigned long array_size = SECTIONS_PER_ROOT *
66                                    sizeof(struct mem_section);
67
68         if (slab_is_available())
69                 section = kzalloc_node(array_size, GFP_KERNEL, nid);
70         else
71                 section = memblock_alloc_node(array_size, SMP_CACHE_BYTES,
72                                               nid);
73
74         return section;
75 }
76
77 static int __meminit sparse_index_init(unsigned long section_nr, int nid)
78 {
79         unsigned long root = SECTION_NR_TO_ROOT(section_nr);
80         struct mem_section *section;
81
82         if (mem_section[root])
83                 return -EEXIST;
84
85         section = sparse_index_alloc(nid);
86         if (!section)
87                 return -ENOMEM;
88
89         mem_section[root] = section;
90
91         return 0;
92 }
93 #else /* !SPARSEMEM_EXTREME */
94 static inline int sparse_index_init(unsigned long section_nr, int nid)
95 {
96         return 0;
97 }
98 #endif
99
100 #ifdef CONFIG_SPARSEMEM_EXTREME
101 int __section_nr(struct mem_section* ms)
102 {
103         unsigned long root_nr;
104         struct mem_section *root = NULL;
105
106         for (root_nr = 0; root_nr < NR_SECTION_ROOTS; root_nr++) {
107                 root = __nr_to_section(root_nr * SECTIONS_PER_ROOT);
108                 if (!root)
109                         continue;
110
111                 if ((ms >= root) && (ms < (root + SECTIONS_PER_ROOT)))
112                      break;
113         }
114
115         VM_BUG_ON(!root);
116
117         return (root_nr * SECTIONS_PER_ROOT) + (ms - root);
118 }
119 #else
120 int __section_nr(struct mem_section* ms)
121 {
122         return (int)(ms - mem_section[0]);
123 }
124 #endif
125
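/*
 * __section_nr() is the inverse of __nr_to_section(): for any present
 * section number the round trip holds, e.g. (sketch):
 *
 *	unsigned long nr = pfn_to_section_nr(pfn);
 *	struct mem_section *ms = __nr_to_section(nr);
 *
 *	WARN_ON(__section_nr(ms) != nr);
 *
 * In the SPARSEMEM_EXTREME case this costs a linear scan over the roots,
 * which is why callers prefer to carry the section number with them
 * where they can.
 */
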
126 /*
127  * During early boot, before section_mem_map is used for an actual
128  * mem_map, we use section_mem_map to store the section's NUMA
129  * node.  This keeps us from having to use another data structure.  The
130  * node information is cleared just before we store the real mem_map.
131  */
132 static inline unsigned long sparse_encode_early_nid(int nid)
133 {
134         return (nid << SECTION_NID_SHIFT);
135 }
136
137 static inline int sparse_early_nid(struct mem_section *section)
138 {
139         return (section->section_mem_map >> SECTION_NID_SHIFT);
140 }
141
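/*
 * A minimal sketch of the round trip, for a section that does not yet
 * have a real mem_map:
 *
 *	ms->section_mem_map = sparse_encode_early_nid(nid) |
 *			      SECTION_IS_ONLINE;
 *	...
 *	WARN_ON(sparse_early_nid(ms) != nid);
 *
 * SECTION_NID_SHIFT sits above the low flag bits, so the encoded node id
 * and the SECTION_* flags do not collide.
 */
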
142 /* Validate the physical addressing limitations of the model */
143 void __meminit mminit_validate_memmodel_limits(unsigned long *start_pfn,
144                                                 unsigned long *end_pfn)
145 {
146         unsigned long max_sparsemem_pfn = 1UL << (MAX_PHYSMEM_BITS-PAGE_SHIFT);
147
148         /*
149          * Sanity checks - do not allow an architecture to pass
150          * in larger pfns than the maximum scope of sparsemem:
151          */
152         if (*start_pfn > max_sparsemem_pfn) {
153                 mminit_dprintk(MMINIT_WARNING, "pfnvalidation",
154                         "Start of range %lu -> %lu exceeds SPARSEMEM max %lu\n",
155                         *start_pfn, *end_pfn, max_sparsemem_pfn);
156                 WARN_ON_ONCE(1);
157                 *start_pfn = max_sparsemem_pfn;
158                 *end_pfn = max_sparsemem_pfn;
159         } else if (*end_pfn > max_sparsemem_pfn) {
160                 mminit_dprintk(MMINIT_WARNING, "pfnvalidation",
161                         "End of range %lu -> %lu exceeds SPARSEMEM max %lu\n",
162                         *start_pfn, *end_pfn, max_sparsemem_pfn);
163                 WARN_ON_ONCE(1);
164                 *end_pfn = max_sparsemem_pfn;
165         }
166 }
167
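/*
 * As a worked example (illustrative values; MAX_PHYSMEM_BITS and
 * PAGE_SHIFT are architecture-specific): with MAX_PHYSMEM_BITS == 46 and
 * PAGE_SHIFT == 12, max_sparsemem_pfn == 1UL << 34, so
 *
 *	*start_pfn == 0x300000000, *end_pfn == 0x500000000
 *
 * takes the second branch above and *end_pfn is clamped to 0x400000000.
 */
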
168 /*
169  * There are a number of times that we loop over NR_MEM_SECTIONS,
170  * looking for section_present() on each.  But, when we have very
171  * large physical address spaces, NR_MEM_SECTIONS can also be
172  * very large which makes the loops quite long.
173  *
174  * Keeping track of the highest present section number gives us an easy
175  * way to break out of those loops early.
176  */
177 int __highest_present_section_nr;
178 static void section_mark_present(struct mem_section *ms)
179 {
180         int section_nr = __section_nr(ms);
181
182         if (section_nr > __highest_present_section_nr)
183                 __highest_present_section_nr = section_nr;
184
185         ms->section_mem_map |= SECTION_MARKED_PRESENT;
186 }
187
188 static inline int next_present_section_nr(int section_nr)
189 {
190         do {
191                 section_nr++;
192                 if (present_section_nr(section_nr))
193                         return section_nr;
194         } while ((section_nr <= __highest_present_section_nr));
195
196         return -1;
197 }
198 #define for_each_present_section_nr(start, section_nr)          \
199         for (section_nr = next_present_section_nr(start-1);     \
200              ((section_nr >= 0) &&                              \
201               (section_nr <= __highest_present_section_nr));    \
202              section_nr = next_present_section_nr(section_nr))
203
204 static inline unsigned long first_present_section_nr(void)
205 {
206         return next_present_section_nr(-1);
207 }
208
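/*
 * Typical use of the iterator above (sketch; do_something() stands in
 * for the caller's work):
 *
 *	unsigned long pnum;
 *
 *	for_each_present_section_nr(0, pnum)
 *		do_something(__nr_to_section(pnum));
 *
 * The walk stops at __highest_present_section_nr rather than visiting
 * all NR_MEM_SECTIONS entries.
 */
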
209 /* Record a memory area against a node. */
210 void __init memory_present(int nid, unsigned long start, unsigned long end)
211 {
212         unsigned long pfn;
213
214 #ifdef CONFIG_SPARSEMEM_EXTREME
215         if (unlikely(!mem_section)) {
216                 unsigned long size, align;
217
218                 size = sizeof(struct mem_section*) * NR_SECTION_ROOTS;
219                 align = 1 << (INTERNODE_CACHE_SHIFT);
220                 mem_section = memblock_alloc(size, align);
221         }
222 #endif
223
224         start &= PAGE_SECTION_MASK;
225         mminit_validate_memmodel_limits(&start, &end);
226         for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) {
227                 unsigned long section = pfn_to_section_nr(pfn);
228                 struct mem_section *ms;
229
230                 sparse_index_init(section, nid);
231                 set_section_nid(section, nid);
232
233                 ms = __nr_to_section(section);
234                 if (!ms->section_mem_map) {
235                         ms->section_mem_map = sparse_encode_early_nid(nid) |
236                                                         SECTION_IS_ONLINE;
237                         section_mark_present(ms);
238                 }
239         }
240 }
241
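/*
 * As an illustration of the pfn -> section mapping used above (values
 * for a configuration with 4K pages and 128MB sections, i.e.
 * PAGES_PER_SECTION == 1 << 15; other configurations differ):
 *
 *	pfn_to_section_nr(0x8000)  == 1
 *	pfn_to_section_nr(0x17fff) == 2
 *
 * so memory_present(nid, 0x8000, 0x18000) marks sections 1 and 2 as
 * present on @nid.
 */
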
242 /*
243  * Mark all memblocks as present using memory_present(). This is a
244  * convenience function that is useful for a number of architectures
245  * to mark all of the system's memory as present during initialization.
246  */
247 void __init memblocks_present(void)
248 {
249         struct memblock_region *reg;
250
251         for_each_memblock(memory, reg) {
252                 memory_present(memblock_get_region_node(reg),
253                                memblock_region_memory_base_pfn(reg),
254                                memblock_region_memory_end_pfn(reg));
255         }
256 }
257
258 /*
259  * Subtle: we encode the section's starting pfn into the stored mem_map
260  * pointer, so that the identity page - section_mem_map yields a page's
261  * actual physical page frame number.
262  */
263 static unsigned long sparse_encode_mem_map(struct page *mem_map, unsigned long pnum)
264 {
265         unsigned long coded_mem_map =
266                 (unsigned long)(mem_map - (section_nr_to_pfn(pnum)));
267         BUILD_BUG_ON(SECTION_MAP_LAST_BIT > (1UL<<PFN_SECTION_SHIFT));
268         BUG_ON(coded_mem_map & ~SECTION_MAP_MASK);
269         return coded_mem_map;
270 }
271
272 /*
273  * Decode mem_map from the coded memmap
274  */
275 struct page *sparse_decode_mem_map(unsigned long coded_mem_map, unsigned long pnum)
276 {
277         /* mask off the extra low bits of information */
278         coded_mem_map &= SECTION_MAP_MASK;
279         return ((struct page *)coded_mem_map) + section_nr_to_pfn(pnum);
280 }
281
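/*
 * Sketch of the identity the encoding provides. If section @pnum starts
 * at pfn S and its mem_map lives at @map, then:
 *
 *	coded = sparse_encode_mem_map(map, pnum);	  == map - S
 *	sparse_decode_mem_map(coded, pnum)		  == map
 *	(struct page *)(coded & SECTION_MAP_MASK) + pfn	  == &map[pfn - S]
 *
 * for any pfn inside the section, so pfn_to_page() reduces to a single
 * addition and page_to_pfn() to a single subtraction once the section
 * has been looked up.
 */
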
282 static void __meminit sparse_init_one_section(struct mem_section *ms,
283                 unsigned long pnum, struct page *mem_map,
284                 unsigned long *pageblock_bitmap)
285 {
286         ms->section_mem_map &= ~SECTION_MAP_MASK;
287         ms->section_mem_map |= sparse_encode_mem_map(mem_map, pnum) |
288                                                         SECTION_HAS_MEM_MAP;
289         ms->pageblock_flags = pageblock_bitmap;
290 }
291
292 unsigned long usemap_size(void)
293 {
294         return BITS_TO_LONGS(SECTION_BLOCKFLAGS_BITS) * sizeof(unsigned long);
295 }
296
297 #ifdef CONFIG_MEMORY_HOTPLUG
298 static unsigned long *__kmalloc_section_usemap(void)
299 {
300         return kmalloc(usemap_size(), GFP_KERNEL);
301 }
302 #endif /* CONFIG_MEMORY_HOTPLUG */
303
304 #ifdef CONFIG_MEMORY_HOTREMOVE
305 static unsigned long * __init
306 sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat,
307                                          unsigned long size)
308 {
309         unsigned long goal, limit;
310         unsigned long *p;
311         int nid;
312         /*
313          * A page may contain usemaps for other sections preventing the
314          * page being freed and making a section unremovable while
315          * other sections referencing the usemap remain active. Similarly,
316          * a pgdat can prevent a section being removed. If section A
317          * contains a pgdat and section B contains the usemap, both
318          * sections become inter-dependent. This allocates usemaps
319          * from the same section as the pgdat where possible to avoid
320          * this problem.
321          */
322         goal = __pa(pgdat) & (PAGE_SECTION_MASK << PAGE_SHIFT);
323         limit = goal + (1UL << PA_SECTION_SHIFT);
324         nid = early_pfn_to_nid(goal >> PAGE_SHIFT);
325 again:
326         p = memblock_alloc_try_nid_nopanic(size,
327                                                 SMP_CACHE_BYTES, goal, limit,
328                                                 nid);
329         if (!p && limit) {
330                 limit = 0;
331                 goto again;
332         }
333         return p;
334 }
335
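/*
 * For example (illustrative addresses, 128MB sections): a pgdat at
 * physical address 0x12345678 gives
 *
 *	goal  == 0x10000000	(start of the pgdat's section)
 *	limit == 0x18000000	(end of that section)
 *
 * so the first attempt is confined to the pgdat's own section; only if
 * that fails is @limit dropped and the allocation retried without the
 * upper bound.
 */
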
336 static void __init check_usemap_section_nr(int nid, unsigned long *usemap)
337 {
338         unsigned long usemap_snr, pgdat_snr;
339         static unsigned long old_usemap_snr;
340         static unsigned long old_pgdat_snr;
341         struct pglist_data *pgdat = NODE_DATA(nid);
342         int usemap_nid;
343
344         /* First call */
345         if (!old_usemap_snr) {
346                 old_usemap_snr = NR_MEM_SECTIONS;
347                 old_pgdat_snr = NR_MEM_SECTIONS;
348         }
349
350         usemap_snr = pfn_to_section_nr(__pa(usemap) >> PAGE_SHIFT);
351         pgdat_snr = pfn_to_section_nr(__pa(pgdat) >> PAGE_SHIFT);
352         if (usemap_snr == pgdat_snr)
353                 return;
354
355         if (old_usemap_snr == usemap_snr && old_pgdat_snr == pgdat_snr)
356                 /* skip redundant message */
357                 return;
358
359         old_usemap_snr = usemap_snr;
360         old_pgdat_snr = pgdat_snr;
361
362         usemap_nid = sparse_early_nid(__nr_to_section(usemap_snr));
363         if (usemap_nid != nid) {
364                 pr_info("node %d must be removed before remove section %ld\n",
365                         nid, usemap_snr);
366                 return;
367         }
368         /*
369          * There is a circular dependency.
370  * Some platforms allow un-removable sections because they will just
371  * gather other removable sections for dynamic partitioning.
372  * Just report the un-removable section's number here.
373          */
374         pr_info("Section %ld and %ld (node %d) have a circular dependency on usemap and pgdat allocations\n",
375                 usemap_snr, pgdat_snr, nid);
376 }
377 #else
378 static unsigned long * __init
379 sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat,
380                                          unsigned long size)
381 {
382         return memblock_alloc_node_nopanic(size, pgdat->node_id);
383 }
384
385 static void __init check_usemap_section_nr(int nid, unsigned long *usemap)
386 {
387 }
388 #endif /* CONFIG_MEMORY_HOTREMOVE */
389
390 #ifdef CONFIG_SPARSEMEM_VMEMMAP
391 static unsigned long __init section_map_size(void)
392 {
393         return ALIGN(sizeof(struct page) * PAGES_PER_SECTION, PMD_SIZE);
394 }
395
396 #else
397 static unsigned long __init section_map_size(void)
398 {
399         return PAGE_ALIGN(sizeof(struct page) * PAGES_PER_SECTION);
400 }
401
402 struct page __init *sparse_mem_map_populate(unsigned long pnum, int nid,
403                 struct vmem_altmap *altmap)
404 {
405         unsigned long size = section_map_size();
406         struct page *map = sparse_buffer_alloc(size);
407
408         if (map)
409                 return map;
410
411         map = memblock_alloc_try_nid(size,
412                                           PAGE_SIZE, __pa(MAX_DMA_ADDRESS),
413                                           MEMBLOCK_ALLOC_ACCESSIBLE, nid);
414         return map;
415 }
416 #endif /* !CONFIG_SPARSEMEM_VMEMMAP */
417
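/*
 * To put numbers on section_map_size() (illustrative; both factors are
 * configuration-dependent): with sizeof(struct page) == 64 and
 * PAGES_PER_SECTION == 32768, a section's mem_map is 64 * 32768 bytes ==
 * 2MB, i.e. exactly one PMD-sized mapping in the vmemmap case and 512
 * order-0 pages otherwise.
 */
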
418 static void *sparsemap_buf __meminitdata;
419 static void *sparsemap_buf_end __meminitdata;
420
421 static void __init sparse_buffer_init(unsigned long size, int nid)
422 {
423         WARN_ON(sparsemap_buf); /* forgot to call sparse_buffer_fini()? */
424         sparsemap_buf =
425                 memblock_alloc_try_nid_raw(size, PAGE_SIZE,
426                                                 __pa(MAX_DMA_ADDRESS),
427                                                 MEMBLOCK_ALLOC_ACCESSIBLE, nid);
428         sparsemap_buf_end = sparsemap_buf + size;
429 }
430
431 static void __init sparse_buffer_fini(void)
432 {
433         unsigned long size = sparsemap_buf_end - sparsemap_buf;
434
435         if (sparsemap_buf && size > 0)
436                 memblock_free_early(__pa(sparsemap_buf), size);
437         sparsemap_buf = NULL;
438 }
439
440 void * __meminit sparse_buffer_alloc(unsigned long size)
441 {
442         void *ptr = NULL;
443
444         if (sparsemap_buf) {
445                 ptr = PTR_ALIGN(sparsemap_buf, size);
446                 if (ptr + size > sparsemap_buf_end)
447                         ptr = NULL;
448                 else
449                         sparsemap_buf = ptr + size;
450         }
451         return ptr;
452 }
453
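/*
 * Callers bracket a batch of allocations with the helpers above, e.g.
 * (sketch of what sparse_init_nid() below does):
 *
 *	sparse_buffer_init(map_count * section_map_size(), nid);
 *	for each present section on the node:
 *		map = sparse_buffer_alloc(section_map_size());
 *	sparse_buffer_fini();
 *
 * Because sparse_buffer_alloc() aligns each chunk to its own size, a
 * PMD-sized mem_map comes back PMD-aligned, and whatever is left of the
 * buffer is handed back to memblock by sparse_buffer_fini().
 */
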
454 void __weak __meminit vmemmap_populate_print_last(void)
455 {
456 }
457
458 /*
459  * Initialize sparse on a specific node. The node spans [pnum_begin, pnum_end),
460  * and the number of present sections in this node is map_count.
461  */
462 static void __init sparse_init_nid(int nid, unsigned long pnum_begin,
463                                    unsigned long pnum_end,
464                                    unsigned long map_count)
465 {
466         unsigned long pnum, usemap_longs, *usemap;
467         struct page *map;
468
469         usemap_longs = BITS_TO_LONGS(SECTION_BLOCKFLAGS_BITS);
470         usemap = sparse_early_usemaps_alloc_pgdat_section(NODE_DATA(nid),
471                                                           usemap_size() *
472                                                           map_count);
473         if (!usemap) {
474                 pr_err("%s: node[%d] usemap allocation failed", __func__, nid);
475                 goto failed;
476         }
477         sparse_buffer_init(map_count * section_map_size(), nid);
478         for_each_present_section_nr(pnum_begin, pnum) {
479                 if (pnum >= pnum_end)
480                         break;
481
482                 map = sparse_mem_map_populate(pnum, nid, NULL);
483                 if (!map) {
484                         pr_err("%s: node[%d] memory map backing failed. Some memory will not be available.",
485                                __func__, nid);
486                         pnum_begin = pnum;
487                         goto failed;
488                 }
489                 check_usemap_section_nr(nid, usemap);
490                 sparse_init_one_section(__nr_to_section(pnum), pnum, map, usemap);
491                 usemap += usemap_longs;
492         }
493         sparse_buffer_fini();
494         return;
495 failed:
496         /* We failed to allocate; mark all the following pnums as not present */
497         for_each_present_section_nr(pnum_begin, pnum) {
498                 struct mem_section *ms;
499
500                 if (pnum >= pnum_end)
501                         break;
502                 ms = __nr_to_section(pnum);
503                 ms->section_mem_map = 0;
504         }
505 }
506
507 /*
508  * Allocate the accumulated non-linear sections, allocate a mem_map
509  * for each and record the physical to section mapping.
510  */
511 void __init sparse_init(void)
512 {
513         unsigned long pnum_begin = first_present_section_nr();
514         int nid_begin = sparse_early_nid(__nr_to_section(pnum_begin));
515         unsigned long pnum_end, map_count = 1;
516
517         /* Setup pageblock_order for HUGETLB_PAGE_SIZE_VARIABLE */
518         set_pageblock_order();
519
520         for_each_present_section_nr(pnum_begin + 1, pnum_end) {
521                 int nid = sparse_early_nid(__nr_to_section(pnum_end));
522
523                 if (nid == nid_begin) {
524                         map_count++;
525                         continue;
526                 }
527                 /* Init node with sections in range [pnum_begin, pnum_end) */
528                 sparse_init_nid(nid_begin, pnum_begin, pnum_end, map_count);
529                 nid_begin = nid;
530                 pnum_begin = pnum_end;
531                 map_count = 1;
532         }
533         /* cover the last node */
534         sparse_init_nid(nid_begin, pnum_begin, pnum_end, map_count);
535         vmemmap_populate_print_last();
536 }
537
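/*
 * For example (made-up layout): with present sections 0-3 on node 0 and
 * sections 4-7 on node 1, the loop above issues
 *
 *	sparse_init_nid(0, 0, 4, 4);
 *
 * when it first sees a section on node 1, and the "cover the last node"
 * call then initializes the remaining four sections of node 1, so each
 * node's usemaps and mem_maps are allocated in a single batch on that
 * node.
 */
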
538 #ifdef CONFIG_MEMORY_HOTPLUG
539
540 /* Mark all memory sections within the pfn range as online */
541 void online_mem_sections(unsigned long start_pfn, unsigned long end_pfn)
542 {
543         unsigned long pfn;
544
545         for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
546                 unsigned long section_nr = pfn_to_section_nr(pfn);
547                 struct mem_section *ms;
548
549                 /* onlining code should never touch invalid ranges */
550                 if (WARN_ON(!valid_section_nr(section_nr)))
551                         continue;
552
553                 ms = __nr_to_section(section_nr);
554                 ms->section_mem_map |= SECTION_IS_ONLINE;
555         }
556 }
557
558 #ifdef CONFIG_MEMORY_HOTREMOVE
559 /* Mark all memory sections within the pfn range as offline */
560 void offline_mem_sections(unsigned long start_pfn, unsigned long end_pfn)
561 {
562         unsigned long pfn;
563
564         for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
565                 unsigned long section_nr = pfn_to_section_nr(pfn);
566                 struct mem_section *ms;
567
568                 /*
569                  * TODO this needs some double checking. Offlining code makes
570                  * sure to check pfn_valid but those checks might be just bogus
571                  */
572                 if (WARN_ON(!valid_section_nr(section_nr)))
573                         continue;
574
575                 ms = __nr_to_section(section_nr);
576                 ms->section_mem_map &= ~SECTION_IS_ONLINE;
577         }
578 }
579 #endif
580
581 #ifdef CONFIG_SPARSEMEM_VMEMMAP
582 static inline struct page *kmalloc_section_memmap(unsigned long pnum, int nid,
583                 struct vmem_altmap *altmap)
584 {
585         /* This will make the necessary allocations eventually. */
586         return sparse_mem_map_populate(pnum, nid, altmap);
587 }
588 static void __kfree_section_memmap(struct page *memmap,
589                 struct vmem_altmap *altmap)
590 {
591         unsigned long start = (unsigned long)memmap;
592         unsigned long end = (unsigned long)(memmap + PAGES_PER_SECTION);
593
594         vmemmap_free(start, end, altmap);
595 }
596 #ifdef CONFIG_MEMORY_HOTREMOVE
597 static void free_map_bootmem(struct page *memmap)
598 {
599         unsigned long start = (unsigned long)memmap;
600         unsigned long end = (unsigned long)(memmap + PAGES_PER_SECTION);
601
602         vmemmap_free(start, end, NULL);
603 }
604 #endif /* CONFIG_MEMORY_HOTREMOVE */
605 #else
606 static struct page *__kmalloc_section_memmap(void)
607 {
608         struct page *page, *ret;
609         unsigned long memmap_size = sizeof(struct page) * PAGES_PER_SECTION;
610
611         page = alloc_pages(GFP_KERNEL|__GFP_NOWARN, get_order(memmap_size));
612         if (page)
613                 goto got_map_page;
614
615         ret = vmalloc(memmap_size);
616         if (ret)
617                 goto got_map_ptr;
618
619         return NULL;
620 got_map_page:
621         ret = (struct page *)pfn_to_kaddr(page_to_pfn(page));
622 got_map_ptr:
623
624         return ret;
625 }
626
627 static inline struct page *kmalloc_section_memmap(unsigned long pnum, int nid,
628                 struct vmem_altmap *altmap)
629 {
630         return __kmalloc_section_memmap();
631 }
632
633 static void __kfree_section_memmap(struct page *memmap,
634                 struct vmem_altmap *altmap)
635 {
636         if (is_vmalloc_addr(memmap))
637                 vfree(memmap);
638         else
639                 free_pages((unsigned long)memmap,
640                            get_order(sizeof(struct page) * PAGES_PER_SECTION));
641 }
642
643 #ifdef CONFIG_MEMORY_HOTREMOVE
644 static void free_map_bootmem(struct page *memmap)
645 {
646         unsigned long maps_section_nr, removing_section_nr, i;
647         unsigned long magic, nr_pages;
648         struct page *page = virt_to_page(memmap);
649
650         nr_pages = PAGE_ALIGN(PAGES_PER_SECTION * sizeof(struct page))
651                 >> PAGE_SHIFT;
652
653         for (i = 0; i < nr_pages; i++, page++) {
654                 magic = (unsigned long) page->freelist;
655
656                 BUG_ON(magic == NODE_INFO);
657
658                 maps_section_nr = pfn_to_section_nr(page_to_pfn(page));
659                 removing_section_nr = page_private(page);
660
661                 /*
662                  * When this function is called, the section being removed is in a
663                  * logically offlined state, i.e. all of its pages are isolated from
664                  * the page allocator. If the memmap of the section being removed
665                  * is placed on that same section, it must not be freed: if it were,
666                  * the page allocator could hand it out even though the memory is
667                  * about to be removed physically.
668                  */
669                 if (maps_section_nr != removing_section_nr)
670                         put_page_bootmem(page);
671         }
672 }
673 #endif /* CONFIG_MEMORY_HOTREMOVE */
674 #endif /* CONFIG_SPARSEMEM_VMEMMAP */
675
676 /*
677  * Returns 0 on success, -EEXIST if the section is already present, or
678  * -ENOMEM if an allocation fails. On failure everything allocated here
679  * is freed again, so the caller has nothing to clean up.
680  */
681 int __meminit sparse_add_one_section(int nid, unsigned long start_pfn,
682                                      struct vmem_altmap *altmap)
683 {
684         unsigned long section_nr = pfn_to_section_nr(start_pfn);
685         struct mem_section *ms;
686         struct page *memmap;
687         unsigned long *usemap;
688         int ret;
689
690         /*
691          * No locking is needed for this: sparse_index_init() does its own,
692          * and it may kmalloc().
693          */
694         ret = sparse_index_init(section_nr, nid);
695         if (ret < 0 && ret != -EEXIST)
696                 return ret;
697         ret = 0;
698         memmap = kmalloc_section_memmap(section_nr, nid, altmap);
699         if (!memmap)
700                 return -ENOMEM;
701         usemap = __kmalloc_section_usemap();
702         if (!usemap) {
703                 __kfree_section_memmap(memmap, altmap);
704                 return -ENOMEM;
705         }
706
707         ms = __pfn_to_section(start_pfn);
708         if (ms->section_mem_map & SECTION_MARKED_PRESENT) {
709                 ret = -EEXIST;
710                 goto out;
711         }
712
713         /*
714          * Poison uninitialized struct pages in order to catch invalid flags
715          * combinations.
716          */
717         page_init_poison(memmap, sizeof(struct page) * PAGES_PER_SECTION);
718
719         section_mark_present(ms);
720         sparse_init_one_section(ms, section_nr, memmap, usemap);
721
722 out:
723         if (ret < 0) {
724                 kfree(usemap);
725                 __kfree_section_memmap(memmap, altmap);
726         }
727         return ret;
728 }
729
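/*
 * A caller only needs the node id now, not the pgdat, e.g. (simplified
 * sketch of the hotplug path that the commit named at the top of this
 * file converts):
 *
 *	ret = sparse_add_one_section(nid, start_pfn, altmap);
 *	if (ret < 0)
 *		return ret;
 *
 * so sparse_add_one_section() no longer has to dig the node id out of a
 * pgdat.
 */
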
730 #ifdef CONFIG_MEMORY_HOTREMOVE
731 #ifdef CONFIG_MEMORY_FAILURE
732 static void clear_hwpoisoned_pages(struct page *memmap, int nr_pages)
733 {
734         int i;
735
736         if (!memmap)
737                 return;
738
739         /*
740          * A further optimization is to have per section refcounted
741          * num_poisoned_pages.  But that would need more space per memmap, so
742          * for now just do a quick global check to speed up this routine in the
743          * absence of bad pages.
744          */
745         if (atomic_long_read(&num_poisoned_pages) == 0)
746                 return;
747
748         for (i = 0; i < nr_pages; i++) {
749                 if (PageHWPoison(&memmap[i])) {
750                         atomic_long_sub(1, &num_poisoned_pages);
751                         ClearPageHWPoison(&memmap[i]);
752                 }
753         }
754 }
755 #else
756 static inline void clear_hwpoisoned_pages(struct page *memmap, int nr_pages)
757 {
758 }
759 #endif
760
761 static void free_section_usemap(struct page *memmap, unsigned long *usemap,
762                 struct vmem_altmap *altmap)
763 {
764         struct page *usemap_page;
765
766         if (!usemap)
767                 return;
768
769         usemap_page = virt_to_page(usemap);
770         /*
771          * Check to see if allocation came from hot-plug-add
772          */
773         if (PageSlab(usemap_page) || PageCompound(usemap_page)) {
774                 kfree(usemap);
775                 if (memmap)
776                         __kfree_section_memmap(memmap, altmap);
777                 return;
778         }
779
780         /*
781          * The usemap came from bootmem. This is packed with other usemaps
782          * on the section that holds the pgdat at boot time. Just keep it as is for now.
783          */
784
785         if (memmap)
786                 free_map_bootmem(memmap);
787 }
788
789 void sparse_remove_one_section(struct zone *zone, struct mem_section *ms,
790                 unsigned long map_offset, struct vmem_altmap *altmap)
791 {
792         struct page *memmap = NULL;
793         unsigned long *usemap = NULL;
794
795         if (ms->section_mem_map) {
796                 usemap = ms->pageblock_flags;
797                 memmap = sparse_decode_mem_map(ms->section_mem_map,
798                                                 __section_nr(ms));
799                 ms->section_mem_map = 0;
800                 ms->pageblock_flags = NULL;
801         }
802
803         clear_hwpoisoned_pages(memmap + map_offset,
804                         PAGES_PER_SECTION - map_offset);
805         free_section_usemap(memmap, usemap, altmap);
806 }
807 #endif /* CONFIG_MEMORY_HOTREMOVE */
808 #endif /* CONFIG_MEMORY_HOTPLUG */