whiterose

linux unikernel

percpu.c (87759B)


      1 /*
      2  * mm/percpu.c - percpu memory allocator
      3  *
      4  * Copyright (C) 2009		SUSE Linux Products GmbH
      5  * Copyright (C) 2009		Tejun Heo <tj@kernel.org>
      6  *
      7  * Copyright (C) 2017		Facebook Inc.
      8  * Copyright (C) 2017		Dennis Zhou <dennisszhou@gmail.com>
      9  *
     10  * This file is released under the GPLv2 license.
     11  *
     12  * The percpu allocator handles both static and dynamic areas.  Percpu
     13  * areas are allocated in chunks which are divided into units.  There is
     14  * a 1-to-1 mapping for units to possible cpus.  These units are grouped
     15  * based on NUMA properties of the machine.
     16  *
     17  *  c0                           c1                         c2
     18  *  -------------------          -------------------        ------------
     19  * | u0 | u1 | u2 | u3 |        | u0 | u1 | u2 | u3 |      | u0 | u1 | u
     20  *  -------------------  ......  -------------------  ....  ------------
     21  *
     22  * Allocation is done by offsets into a unit's address space.  I.e., an
     23  * area of 512 bytes at 6k in c1 occupies 512 bytes at 6k in c1:u0,
     24  * c1:u1, c1:u2, etc.  On NUMA machines, the mapping may be non-linear
     25  * and even sparse.  Access is handled by configuring percpu base
     26  * registers according to the cpu to unit mappings and offsetting the
     27  * base address using pcpu_unit_size.
     28  *
     29  * There is special consideration for the first chunk which must handle
     30  * the static percpu variables in the kernel image as allocation services
     31  * are not online yet.  In short, the first chunk is structured like so:
     32  *
     33  *                  <Static | [Reserved] | Dynamic>
     34  *
     35  * The static data is copied from the original section managed by the
     36  * linker.  The reserved section, if non-zero, primarily manages static
     37  * percpu variables from kernel modules.  Finally, the dynamic section
     38  * takes care of normal allocations.
     39  *
     40  * The allocator organizes chunks into lists according to free size and
     41  * tries to allocate from the fullest chunk first.  Each chunk is managed
     42  * by a bitmap with metadata blocks.  The allocation map is updated on
     43  * every allocation and free to reflect the current state while the boundary
     44  * map is only updated on allocation.  Each metadata block contains
     45  * information to help mitigate the need to iterate over large portions
     46  * of the bitmap.  The reverse mapping from page to chunk is stored in
     47  * the page's index.  Lastly, units are lazily backed and grow in unison.
     48  *
     49  * There is a unique conversion that goes on here between bytes and bits.
     50  * Each bit represents a fragment of size PCPU_MIN_ALLOC_SIZE.  The chunk
     51  * tracks the number of pages it is responsible for in nr_pages.  Helper
     52  * functions are used to convert between bytes, bits, and blocks.
     53  * All hints are managed in bits unless explicitly stated.
     54  *
     55  * To use this allocator, arch code should do the following:
     56  *
     57  * - define __addr_to_pcpu_ptr() and __pcpu_ptr_to_addr() to translate
     58  *   regular address to percpu pointer and back if they need to be
     59  *   different from the default
     60  *
     61  * - use pcpu_setup_first_chunk() during percpu area initialization to
     62  *   setup the first chunk containing the kernel static percpu area
     63  */
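        /*
         * Illustrative sizing, assuming the default PCPU_MIN_ALLOC_SHIFT of 2
         * (4-byte allocation units) and a 4 KiB PAGE_SIZE: a 512-byte
         * allocation is tracked as 512 / 4 = 128 bits in the allocation map,
         * one page corresponds to 4096 / 4 = 1024 bits, and with the default
         * PCPU_BITMAP_BLOCK_SIZE == PAGE_SIZE each metadata block covers
         * exactly one page worth of bits.
         */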
     64 
     65 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
     66 
     67 #include <linux/bitmap.h>
     68 #include <linux/memblock.h>
     69 #include <linux/err.h>
     70 #include <linux/lcm.h>
     71 #include <linux/list.h>
     72 #include <linux/log2.h>
     73 #include <linux/mm.h>
     74 #include <linux/module.h>
     75 #include <linux/mutex.h>
     76 #include <linux/percpu.h>
     77 #include <linux/pfn.h>
     78 #include <linux/slab.h>
     79 #include <linux/spinlock.h>
     80 #include <linux/vmalloc.h>
     81 #include <linux/workqueue.h>
     82 #include <linux/kmemleak.h>
     83 #include <linux/sched.h>
     84 
     85 #include <asm/cacheflush.h>
     86 #include <asm/sections.h>
     87 #include <asm/tlbflush.h>
     88 #include <asm/io.h>
     89 
     90 #define CREATE_TRACE_POINTS
     91 #include <trace/events/percpu.h>
     92 
     93 #include "percpu-internal.h"
     94 
     95 /* the slots are sorted by free bytes left, 1-31 bytes share the same slot */
     96 #define PCPU_SLOT_BASE_SHIFT		5
     97 
     98 #define PCPU_EMPTY_POP_PAGES_LOW	2
     99 #define PCPU_EMPTY_POP_PAGES_HIGH	4
    100 
    101 #ifdef CONFIG_SMP
    102 /* default addr <-> pcpu_ptr mapping, override in asm/percpu.h if necessary */
    103 #ifndef __addr_to_pcpu_ptr
    104 #define __addr_to_pcpu_ptr(addr)					\
    105 	(void __percpu *)((unsigned long)(addr) -			\
    106 			  (unsigned long)pcpu_base_addr	+		\
    107 			  (unsigned long)__per_cpu_start)
    108 #endif
    109 #ifndef __pcpu_ptr_to_addr
    110 #define __pcpu_ptr_to_addr(ptr)						\
    111 	(void __force *)((unsigned long)(ptr) +				\
    112 			 (unsigned long)pcpu_base_addr -		\
    113 			 (unsigned long)__per_cpu_start)
    114 #endif
    115 #else	/* CONFIG_SMP */
    116 /* on UP, it's always identity mapped */
    117 #define __addr_to_pcpu_ptr(addr)	(void __percpu *)(addr)
    118 #define __pcpu_ptr_to_addr(ptr)		(void __force *)(ptr)
    119 #endif	/* CONFIG_SMP */
    120 
    121 static int pcpu_unit_pages __ro_after_init;
    122 static int pcpu_unit_size __ro_after_init;
    123 static int pcpu_nr_units __ro_after_init;
    124 static int pcpu_atom_size __ro_after_init;
    125 int pcpu_nr_slots __ro_after_init;
    126 static size_t pcpu_chunk_struct_size __ro_after_init;
    127 
    128 /* cpus with the lowest and highest unit addresses */
    129 static unsigned int pcpu_low_unit_cpu __ro_after_init;
    130 static unsigned int pcpu_high_unit_cpu __ro_after_init;
    131 
    132 /* the address of the first chunk which starts with the kernel static area */
    133 void *pcpu_base_addr __ro_after_init;
    134 EXPORT_SYMBOL_GPL(pcpu_base_addr);
    135 
    136 static const int *pcpu_unit_map __ro_after_init;		/* cpu -> unit */
    137 const unsigned long *pcpu_unit_offsets __ro_after_init;	/* cpu -> unit offset */
    138 
    139 /* group information, used for vm allocation */
    140 static int pcpu_nr_groups __ro_after_init;
    141 static const unsigned long *pcpu_group_offsets __ro_after_init;
    142 static const size_t *pcpu_group_sizes __ro_after_init;
    143 
    144 /*
    145  * The first chunk which always exists.  Note that unlike other
    146  * chunks, this one can be allocated and mapped in several different
    147  * ways and thus often doesn't live in the vmalloc area.
    148  */
    149 struct pcpu_chunk *pcpu_first_chunk __ro_after_init;
    150 
    151 /*
    152  * Optional reserved chunk.  This chunk reserves part of the first
    153  * chunk and serves it for reserved allocations.  When the reserved
    154  * region doesn't exist, the following variable is NULL.
    155  */
    156 struct pcpu_chunk *pcpu_reserved_chunk __ro_after_init;
    157 
    158 DEFINE_SPINLOCK(pcpu_lock);	/* all internal data structures */
    159 static DEFINE_MUTEX(pcpu_alloc_mutex);	/* chunk create/destroy, [de]pop, map ext */
    160 
    161 struct list_head *pcpu_slot __ro_after_init; /* chunk list slots */
    162 
    163 /* chunks which need their map areas extended, protected by pcpu_lock */
    164 static LIST_HEAD(pcpu_map_extend_chunks);
    165 
    166 /*
    167  * The number of empty populated pages, protected by pcpu_lock.  The
    168  * reserved chunk doesn't contribute to the count.
    169  */
    170 int pcpu_nr_empty_pop_pages;
    171 
    172 /*
    173  * The number of populated pages in use by the allocator, protected by
    174  * pcpu_lock.  This number is kept per unit per chunk (i.e. when a page gets
    175  * allocated/deallocated, it is allocated/deallocated in all units of a chunk
    176  * and this count is incremented/decremented by 1).
    177  */
    178 static unsigned long pcpu_nr_populated;
    179 
    180 /*
    181  * Balance work is used to populate or destroy chunks asynchronously.  We
    182  * try to keep the number of populated free pages between
    183  * PCPU_EMPTY_POP_PAGES_LOW and HIGH for atomic allocations and at most one
    184  * empty chunk.
    185  */
    186 static void pcpu_balance_workfn(struct work_struct *work);
    187 static DECLARE_WORK(pcpu_balance_work, pcpu_balance_workfn);
    188 static bool pcpu_async_enabled __read_mostly;
    189 static bool pcpu_atomic_alloc_failed;
    190 
    191 static void pcpu_schedule_balance_work(void)
    192 {
    193 	if (pcpu_async_enabled)
    194 		schedule_work(&pcpu_balance_work);
    195 }
    196 
    197 /**
    198  * pcpu_addr_in_chunk - check if the address is served from this chunk
    199  * @chunk: chunk of interest
    200  * @addr: percpu address
    201  *
    202  * RETURNS:
    203  * True if the address is served from this chunk.
    204  */
    205 static bool pcpu_addr_in_chunk(struct pcpu_chunk *chunk, void *addr)
    206 {
    207 	void *start_addr, *end_addr;
    208 
    209 	if (!chunk)
    210 		return false;
    211 
    212 	start_addr = chunk->base_addr + chunk->start_offset;
    213 	end_addr = chunk->base_addr + chunk->nr_pages * PAGE_SIZE -
    214 		   chunk->end_offset;
    215 
    216 	return addr >= start_addr && addr < end_addr;
    217 }
    218 
    219 static int __pcpu_size_to_slot(int size)
    220 {
    221 	int highbit = fls(size);	/* size is in bytes */
    222 	return max(highbit - PCPU_SLOT_BASE_SHIFT + 2, 1);
    223 }
    224 
    225 static int pcpu_size_to_slot(int size)
    226 {
    227 	if (size == pcpu_unit_size)
    228 		return pcpu_nr_slots - 1;
    229 	return __pcpu_size_to_slot(size);
    230 }
    231 
    232 static int pcpu_chunk_slot(const struct pcpu_chunk *chunk)
    233 {
    234 	if (chunk->free_bytes < PCPU_MIN_ALLOC_SIZE || chunk->contig_bits == 0)
    235 		return 0;
    236 
    237 	return pcpu_size_to_slot(chunk->free_bytes);
    238 }
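        /*
         * Worked example of the slot math above (illustrative only): a chunk
         * with 1024 bytes left free has fls(1024) == 11 and therefore lands in
         * slot 11 - PCPU_SLOT_BASE_SHIFT + 2 == 8.  A fully free chunk
         * (free_bytes == pcpu_unit_size) is special-cased into the last slot,
         * and a chunk that cannot hold even PCPU_MIN_ALLOC_SIZE (or has no
         * contiguous space) falls back to slot 0.
         */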
    239 
    240 /* set the pointer to a chunk in a page struct */
    241 static void pcpu_set_page_chunk(struct page *page, struct pcpu_chunk *pcpu)
    242 {
    243 	page->index = (unsigned long)pcpu;
    244 }
    245 
    246 /* obtain pointer to a chunk from a page struct */
    247 static struct pcpu_chunk *pcpu_get_page_chunk(struct page *page)
    248 {
    249 	return (struct pcpu_chunk *)page->index;
    250 }
    251 
    252 static int __maybe_unused pcpu_page_idx(unsigned int cpu, int page_idx)
    253 {
    254 	return pcpu_unit_map[cpu] * pcpu_unit_pages + page_idx;
    255 }
    256 
    257 static unsigned long pcpu_unit_page_offset(unsigned int cpu, int page_idx)
    258 {
    259 	return pcpu_unit_offsets[cpu] + (page_idx << PAGE_SHIFT);
    260 }
    261 
    262 static unsigned long pcpu_chunk_addr(struct pcpu_chunk *chunk,
    263 				     unsigned int cpu, int page_idx)
    264 {
    265 	return (unsigned long)chunk->base_addr +
    266 	       pcpu_unit_page_offset(cpu, page_idx);
    267 }
    268 
    269 static void pcpu_next_unpop(unsigned long *bitmap, int *rs, int *re, int end)
    270 {
    271 	*rs = find_next_zero_bit(bitmap, end, *rs);
    272 	*re = find_next_bit(bitmap, end, *rs + 1);
    273 }
    274 
    275 static void pcpu_next_pop(unsigned long *bitmap, int *rs, int *re, int end)
    276 {
    277 	*rs = find_next_bit(bitmap, end, *rs);
    278 	*re = find_next_zero_bit(bitmap, end, *rs + 1);
    279 }
    280 
    281 /*
    282  * Bitmap region iterators.  Iterates over @bitmap between
    283  * [@start, @end).  @rs and @re should be integer variables and will be
    284  * set to the start and end index of the current clear (unpop) or set (pop) region.
    285  */
    286 #define pcpu_for_each_unpop_region(bitmap, rs, re, start, end)		     \
    287 	for ((rs) = (start), pcpu_next_unpop((bitmap), &(rs), &(re), (end)); \
    288 	     (rs) < (re);						     \
    289 	     (rs) = (re) + 1, pcpu_next_unpop((bitmap), &(rs), &(re), (end)))
    290 
    291 #define pcpu_for_each_pop_region(bitmap, rs, re, start, end)		     \
    292 	for ((rs) = (start), pcpu_next_pop((bitmap), &(rs), &(re), (end));   \
    293 	     (rs) < (re);						     \
    294 	     (rs) = (re) + 1, pcpu_next_pop((bitmap), &(rs), &(re), (end)))
    295 
    296 /*
    297  * The following are helper functions to help access bitmaps and convert
    298  * between bitmap offsets to address offsets.
    299  */
    300 static unsigned long *pcpu_index_alloc_map(struct pcpu_chunk *chunk, int index)
    301 {
    302 	return chunk->alloc_map +
    303 	       (index * PCPU_BITMAP_BLOCK_BITS / BITS_PER_LONG);
    304 }
    305 
    306 static unsigned long pcpu_off_to_block_index(int off)
    307 {
    308 	return off / PCPU_BITMAP_BLOCK_BITS;
    309 }
    310 
    311 static unsigned long pcpu_off_to_block_off(int off)
    312 {
    313 	return off & (PCPU_BITMAP_BLOCK_BITS - 1);
    314 }
    315 
    316 static unsigned long pcpu_block_off_to_off(int index, int off)
    317 {
    318 	return index * PCPU_BITMAP_BLOCK_BITS + off;
    319 }
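        /*
         * Worked example for the conversions above, assuming 4 KiB pages and
         * 4-byte units so PCPU_BITMAP_BLOCK_BITS == 1024: chunk bit offset
         * 2600 maps to block index 2600 / 1024 == 2 with in-block offset
         * 2600 & 1023 == 552, and pcpu_block_off_to_off(2, 552) recovers
         * 2 * 1024 + 552 == 2600.
         */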
    320 
    321 /**
    322  * pcpu_next_md_free_region - finds the next hint free area
    323  * @chunk: chunk of interest
    324  * @bit_off: chunk offset
    325  * @bits: size of free area
    326  *
    327  * Helper function for pcpu_for_each_md_free_region.  It checks
    328  * block->contig_hint and performs aggregation across blocks to find the
    329  * next hint.  It modifies bit_off and bits in-place to be consumed in the
    330  * loop.
    331  */
    332 static void pcpu_next_md_free_region(struct pcpu_chunk *chunk, int *bit_off,
    333 				     int *bits)
    334 {
    335 	int i = pcpu_off_to_block_index(*bit_off);
    336 	int block_off = pcpu_off_to_block_off(*bit_off);
    337 	struct pcpu_block_md *block;
    338 
    339 	*bits = 0;
    340 	for (block = chunk->md_blocks + i; i < pcpu_chunk_nr_blocks(chunk);
    341 	     block++, i++) {
    342 		/* handles contig area across blocks */
    343 		if (*bits) {
    344 			*bits += block->left_free;
    345 			if (block->left_free == PCPU_BITMAP_BLOCK_BITS)
    346 				continue;
    347 			return;
    348 		}
    349 
    350 		/*
    351 		 * This checks three things.  First, is there a contig_hint to
    352 		 * check?  Second, have we already checked this hint (compared
    353 		 * against block_off)?  Third, is this hint the same as the
    354 		 * right contig hint?  In the last case, it spills over into
    355 		 * the next block and should be handled by the contig area
    356 		 * across blocks code.
    357 		 */
    358 		*bits = block->contig_hint;
    359 		if (*bits && block->contig_hint_start >= block_off &&
    360 		    *bits + block->contig_hint_start < PCPU_BITMAP_BLOCK_BITS) {
    361 			*bit_off = pcpu_block_off_to_off(i,
    362 					block->contig_hint_start);
    363 			return;
    364 		}
    365 		/* reset to satisfy the second predicate above */
    366 		block_off = 0;
    367 
    368 		*bits = block->right_free;
    369 		*bit_off = (i + 1) * PCPU_BITMAP_BLOCK_BITS - block->right_free;
    370 	}
    371 }
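        /*
         * Example of the cross-block aggregation above (hypothetical blocks,
         * 1024-bit blocks assumed): if block i ends with right_free == 100,
         * block i + 1 is completely free (left_free == PCPU_BITMAP_BLOCK_BITS)
         * and block i + 2 begins with left_free == 50, the iterator reports a
         * single free region of 100 + 1024 + 50 bits starting at the
         * right_free offset of block i.
         */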
    372 
    373 /**
    374  * pcpu_next_fit_region - finds fit areas for a given allocation request
    375  * @chunk: chunk of interest
    376  * @alloc_bits: size of allocation
    377  * @align: alignment of area (max PAGE_SIZE)
    378  * @bit_off: chunk offset
    379  * @bits: size of free area
    380  *
    381  * Finds the next free region that is viable for use with a given size and
    382  * alignment.  This only returns if there is a valid area to be used for this
    383  * allocation.  block->first_free is returned if the allocation request fits
    384  * within the block to see if the request can be fulfilled prior to the contig
    385  * hint.
    386  */
    387 static void pcpu_next_fit_region(struct pcpu_chunk *chunk, int alloc_bits,
    388 				 int align, int *bit_off, int *bits)
    389 {
    390 	int i = pcpu_off_to_block_index(*bit_off);
    391 	int block_off = pcpu_off_to_block_off(*bit_off);
    392 	struct pcpu_block_md *block;
    393 
    394 	*bits = 0;
    395 	for (block = chunk->md_blocks + i; i < pcpu_chunk_nr_blocks(chunk);
    396 	     block++, i++) {
    397 		/* handles contig area across blocks */
    398 		if (*bits) {
    399 			*bits += block->left_free;
    400 			if (*bits >= alloc_bits)
    401 				return;
    402 			if (block->left_free == PCPU_BITMAP_BLOCK_BITS)
    403 				continue;
    404 		}
    405 
    406 		/* check block->contig_hint */
    407 		*bits = ALIGN(block->contig_hint_start, align) -
    408 			block->contig_hint_start;
    409 		/*
    410 		 * This uses the block offset to determine if this has been
    411 		 * checked in the prior iteration.
    412 		 */
    413 		if (block->contig_hint &&
    414 		    block->contig_hint_start >= block_off &&
    415 		    block->contig_hint >= *bits + alloc_bits) {
    416 			*bits += alloc_bits + block->contig_hint_start -
    417 				 block->first_free;
    418 			*bit_off = pcpu_block_off_to_off(i, block->first_free);
    419 			return;
    420 		}
    421 		/* reset to satisfy the second predicate above */
    422 		block_off = 0;
    423 
    424 		*bit_off = ALIGN(PCPU_BITMAP_BLOCK_BITS - block->right_free,
    425 				 align);
    426 		*bits = PCPU_BITMAP_BLOCK_BITS - *bit_off;
    427 		*bit_off = pcpu_block_off_to_off(i, *bit_off);
    428 		if (*bits >= alloc_bits)
    429 			return;
    430 	}
    431 
    432 	/* no valid offsets were found - fail condition */
    433 	*bit_off = pcpu_chunk_map_bits(chunk);
    434 }
    435 
    436 /*
    437  * Metadata free area iterators.  These perform aggregation of free areas
    438  * based on the metadata blocks and return the offset @bit_off and size in
    439  * bits of the free area @bits.  pcpu_for_each_fit_region only returns when
    440  * a fit is found for the allocation request.
    441  */
    442 #define pcpu_for_each_md_free_region(chunk, bit_off, bits)		\
    443 	for (pcpu_next_md_free_region((chunk), &(bit_off), &(bits));	\
    444 	     (bit_off) < pcpu_chunk_map_bits((chunk));			\
    445 	     (bit_off) += (bits) + 1,					\
    446 	     pcpu_next_md_free_region((chunk), &(bit_off), &(bits)))
    447 
    448 #define pcpu_for_each_fit_region(chunk, alloc_bits, align, bit_off, bits)     \
    449 	for (pcpu_next_fit_region((chunk), (alloc_bits), (align), &(bit_off), \
    450 				  &(bits));				      \
    451 	     (bit_off) < pcpu_chunk_map_bits((chunk));			      \
    452 	     (bit_off) += (bits),					      \
    453 	     pcpu_next_fit_region((chunk), (alloc_bits), (align), &(bit_off), \
    454 				  &(bits)))
    455 
    456 /**
    457  * pcpu_mem_zalloc - allocate memory
    458  * @size: bytes to allocate
    459  * @gfp: allocation flags
    460  *
    461  * Allocate @size bytes.  If @size is no larger than PAGE_SIZE,
    462  * kzalloc() is used; otherwise, the equivalent of vzalloc() is used.
    463  * This is to facilitate passing through whitelisted flags.  The
    464  * returned memory is always zeroed.
    465  *
    466  * RETURNS:
    467  * Pointer to the allocated area on success, NULL on failure.
    468  */
    469 static void *pcpu_mem_zalloc(size_t size, gfp_t gfp)
    470 {
    471 	if (WARN_ON_ONCE(!slab_is_available()))
    472 		return NULL;
    473 
    474 	if (size <= PAGE_SIZE)
    475 		return kzalloc(size, gfp);
    476 	else
    477 		return __vmalloc(size, gfp | __GFP_ZERO, PAGE_KERNEL);
    478 }
    479 
    480 /**
    481  * pcpu_mem_free - free memory
    482  * @ptr: memory to free
    483  *
    484  * Free @ptr.  @ptr should have been allocated using pcpu_mem_zalloc().
    485  */
    486 static void pcpu_mem_free(void *ptr)
    487 {
    488 	kvfree(ptr);
    489 }
    490 
    491 /**
    492  * pcpu_chunk_relocate - put chunk in the appropriate chunk slot
    493  * @chunk: chunk of interest
    494  * @oslot: the previous slot it was on
    495  *
    496  * This function is called after an allocation or free changed @chunk.
    497  * New slot according to the changed state is determined and @chunk is
    498  * moved to the slot.  Note that the reserved chunk is never put on
    499  * chunk slots.
    500  *
    501  * CONTEXT:
    502  * pcpu_lock.
    503  */
    504 static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot)
    505 {
    506 	int nslot = pcpu_chunk_slot(chunk);
    507 
    508 	if (chunk != pcpu_reserved_chunk && oslot != nslot) {
    509 		if (oslot < nslot)
    510 			list_move(&chunk->list, &pcpu_slot[nslot]);
    511 		else
    512 			list_move_tail(&chunk->list, &pcpu_slot[nslot]);
    513 	}
    514 }
    515 
    516 /**
    517  * pcpu_cnt_pop_pages - counts populated backing pages in range
    518  * @chunk: chunk of interest
    519  * @bit_off: start offset
    520  * @bits: size of area to check
    521  *
    522  * Calculates the number of populated pages in the region
    523  * [page_start, page_end).  The result is used to track how many empty
    524  * populated pages are available and to decide if async work should be scheduled.
    525  *
    526  * RETURNS:
    527  * The nr of populated pages.
    528  */
    529 static inline int pcpu_cnt_pop_pages(struct pcpu_chunk *chunk, int bit_off,
    530 				     int bits)
    531 {
    532 	int page_start = PFN_UP(bit_off * PCPU_MIN_ALLOC_SIZE);
    533 	int page_end = PFN_DOWN((bit_off + bits) * PCPU_MIN_ALLOC_SIZE);
    534 
    535 	if (page_start >= page_end)
    536 		return 0;
    537 
    538 	/*
    539 	 * bitmap_weight counts the number of bits set in a bitmap up to
    540 	 * the specified number of bits.  This is counting the populated
    541 	 * pages up to page_end and then subtracting the populated pages
    542 	 * up to page_start to count the populated pages in
    543 	 * [page_start, page_end).
    544 	 */
    545 	return bitmap_weight(chunk->populated, page_end) -
    546 	       bitmap_weight(chunk->populated, page_start);
    547 }
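        /*
         * Example of the weight-difference trick above (hypothetical state):
         * if pages 0, 1, 2 and 5 of a chunk are populated and the free area
         * spans pages [1, 6), then bitmap_weight(populated, 6) == 4 and
         * bitmap_weight(populated, 1) == 1, so 3 populated pages are counted.
         * Because of the PFN_UP/PFN_DOWN rounding above, only pages fully
         * covered by the free area contribute.
         */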
    548 
    549 /**
    550  * pcpu_chunk_update - updates the chunk metadata given a free area
    551  * @chunk: chunk of interest
    552  * @bit_off: chunk offset
    553  * @bits: size of free area
    554  *
    555  * This updates the chunk's contig hint and starting offset given a free area.
    556  * Choose the best starting offset if the contig hint is equal.
    557  */
    558 static void pcpu_chunk_update(struct pcpu_chunk *chunk, int bit_off, int bits)
    559 {
    560 	if (bits > chunk->contig_bits) {
    561 		chunk->contig_bits_start = bit_off;
    562 		chunk->contig_bits = bits;
    563 	} else if (bits == chunk->contig_bits && chunk->contig_bits_start &&
    564 		   (!bit_off ||
    565 		    __ffs(bit_off) > __ffs(chunk->contig_bits_start))) {
    566 		/* use the start with the best alignment */
    567 		chunk->contig_bits_start = bit_off;
    568 	}
    569 }
    570 
    571 /**
    572  * pcpu_chunk_refresh_hint - updates metadata about a chunk
    573  * @chunk: chunk of interest
    574  *
    575  * Iterates over the metadata blocks to find the largest contig area.
    576  * It also counts the populated pages and uses the delta to update the
    577  * global count.
    578  *
    579  * Updates:
    580  *      chunk->contig_bits
    581  *      chunk->contig_bits_start
    582  *      nr_empty_pop_pages (chunk and global)
    583  */
    584 static void pcpu_chunk_refresh_hint(struct pcpu_chunk *chunk)
    585 {
    586 	int bit_off, bits, nr_empty_pop_pages;
    587 
    588 	/* clear metadata */
    589 	chunk->contig_bits = 0;
    590 
    591 	bit_off = chunk->first_bit;
    592 	bits = nr_empty_pop_pages = 0;
    593 	pcpu_for_each_md_free_region(chunk, bit_off, bits) {
    594 		pcpu_chunk_update(chunk, bit_off, bits);
    595 
    596 		nr_empty_pop_pages += pcpu_cnt_pop_pages(chunk, bit_off, bits);
    597 	}
    598 
    599 	/*
    600 	 * Keep track of nr_empty_pop_pages.
    601 	 *
    602 	 * The chunk maintains the previous number of free pages it held,
    603 	 * so the delta is used to update the global counter.  The reserved
    604 	 * chunk is not part of the free page count as they are populated
    605 	 * at init and are special to serving reserved allocations.
    606 	 */
    607 	if (chunk != pcpu_reserved_chunk)
    608 		pcpu_nr_empty_pop_pages +=
    609 			(nr_empty_pop_pages - chunk->nr_empty_pop_pages);
    610 
    611 	chunk->nr_empty_pop_pages = nr_empty_pop_pages;
    612 }
    613 
    614 /**
    615  * pcpu_block_update - updates a block given a free area
    616  * @block: block of interest
    617  * @start: start offset in block
    618  * @end: end offset in block
    619  *
    620  * Updates a block given a known free area.  The region [start, end) is
    621  * expected to be the entirety of the free area within a block.  Chooses
    622  * the best starting offset if the contig hints are equal.
    623  */
    624 static void pcpu_block_update(struct pcpu_block_md *block, int start, int end)
    625 {
    626 	int contig = end - start;
    627 
    628 	block->first_free = min(block->first_free, start);
    629 	if (start == 0)
    630 		block->left_free = contig;
    631 
    632 	if (end == PCPU_BITMAP_BLOCK_BITS)
    633 		block->right_free = contig;
    634 
    635 	if (contig > block->contig_hint) {
    636 		block->contig_hint_start = start;
    637 		block->contig_hint = contig;
    638 	} else if (block->contig_hint_start && contig == block->contig_hint &&
    639 		   (!start || __ffs(start) > __ffs(block->contig_hint_start))) {
    640 		/* use the start with the best alignment */
    641 		block->contig_hint_start = start;
    642 	}
    643 }
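        /*
         * Example of the alignment tie-break above (hypothetical offsets):
         * with two equal contig hints starting at offsets 320 and 768,
         * __ffs(320) == 6 and __ffs(768) == 8, so the start at 768 is kept
         * because it is aligned to a larger power of two (256 units vs 64),
         * preserving better-aligned space for later requests.
         */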
    644 
    645 /**
    646  * pcpu_block_refresh_hint - rescan a block and refresh its free-area hints
    647  * @chunk: chunk of interest
    648  * @index: index of the metadata block
    649  *
    650  * Scans over the block beginning at first_free and updates the block
    651  * metadata accordingly.
    652  */
    653 static void pcpu_block_refresh_hint(struct pcpu_chunk *chunk, int index)
    654 {
    655 	struct pcpu_block_md *block = chunk->md_blocks + index;
    656 	unsigned long *alloc_map = pcpu_index_alloc_map(chunk, index);
    657 	int rs, re;	/* region start, region end */
    658 
    659 	/* clear hints */
    660 	block->contig_hint = 0;
    661 	block->left_free = block->right_free = 0;
    662 
    663 	/* iterate over free areas and update the contig hints */
    664 	pcpu_for_each_unpop_region(alloc_map, rs, re, block->first_free,
    665 				   PCPU_BITMAP_BLOCK_BITS) {
    666 		pcpu_block_update(block, rs, re);
    667 	}
    668 }
    669 
    670 /**
    671  * pcpu_block_update_hint_alloc - update hint on allocation path
    672  * @chunk: chunk of interest
    673  * @bit_off: chunk offset
    674  * @bits: size of request
    675  *
    676  * Updates metadata for the allocation path.  The metadata only has to be
    677  * refreshed by a full scan iff the chunk's contig hint is broken.  Block level
    678  * scans are required if the block's contig hint is broken.
    679  */
    680 static void pcpu_block_update_hint_alloc(struct pcpu_chunk *chunk, int bit_off,
    681 					 int bits)
    682 {
    683 	struct pcpu_block_md *s_block, *e_block, *block;
    684 	int s_index, e_index;	/* block indexes of the allocation */
    685 	int s_off, e_off;	/* block offsets of the allocation */
    686 
    687 	/*
    688 	 * Calculate per block offsets.
    689 	 * The calculation uses an inclusive range, but the resulting offsets
    690 	 * are [start, end).  e_index always points to the last block in the
    691 	 * range.
    692 	 */
    693 	s_index = pcpu_off_to_block_index(bit_off);
    694 	e_index = pcpu_off_to_block_index(bit_off + bits - 1);
    695 	s_off = pcpu_off_to_block_off(bit_off);
    696 	e_off = pcpu_off_to_block_off(bit_off + bits - 1) + 1;
    697 
    698 	s_block = chunk->md_blocks + s_index;
    699 	e_block = chunk->md_blocks + e_index;
    700 
    701 	/*
    702 	 * Update s_block.
    703 	 * block->first_free must be updated if the allocation takes its place.
    704 	 * If the allocation breaks the contig_hint, a scan is required to
    705 	 * restore this hint.
    706 	 */
    707 	if (s_off == s_block->first_free)
    708 		s_block->first_free = find_next_zero_bit(
    709 					pcpu_index_alloc_map(chunk, s_index),
    710 					PCPU_BITMAP_BLOCK_BITS,
    711 					s_off + bits);
    712 
    713 	if (s_off >= s_block->contig_hint_start &&
    714 	    s_off < s_block->contig_hint_start + s_block->contig_hint) {
    715 		/* block contig hint is broken - scan to fix it */
    716 		pcpu_block_refresh_hint(chunk, s_index);
    717 	} else {
    718 		/* update left and right contig manually */
    719 		s_block->left_free = min(s_block->left_free, s_off);
    720 		if (s_index == e_index)
    721 			s_block->right_free = min_t(int, s_block->right_free,
    722 					PCPU_BITMAP_BLOCK_BITS - e_off);
    723 		else
    724 			s_block->right_free = 0;
    725 	}
    726 
    727 	/*
    728 	 * Update e_block.
    729 	 */
    730 	if (s_index != e_index) {
    731 		/*
    732 		 * When the allocation is across blocks, the end is along
    733 		 * the left part of the e_block.
    734 		 */
    735 		e_block->first_free = find_next_zero_bit(
    736 				pcpu_index_alloc_map(chunk, e_index),
    737 				PCPU_BITMAP_BLOCK_BITS, e_off);
    738 
    739 		if (e_off == PCPU_BITMAP_BLOCK_BITS) {
    740 			/* reset the block */
    741 			e_block++;
    742 		} else {
    743 			if (e_off > e_block->contig_hint_start) {
    744 				/* contig hint is broken - scan to fix it */
    745 				pcpu_block_refresh_hint(chunk, e_index);
    746 			} else {
    747 				e_block->left_free = 0;
    748 				e_block->right_free =
    749 					min_t(int, e_block->right_free,
    750 					      PCPU_BITMAP_BLOCK_BITS - e_off);
    751 			}
    752 		}
    753 
    754 		/* update in-between md_blocks */
    755 		for (block = s_block + 1; block < e_block; block++) {
    756 			block->contig_hint = 0;
    757 			block->left_free = 0;
    758 			block->right_free = 0;
    759 		}
    760 	}
    761 
    762 	/*
    763 	 * The only time a full chunk scan is required is if the chunk
    764 	 * contig hint is broken.  Otherwise, it means a smaller space
    765 	 * was used and therefore the chunk contig hint is still correct.
    766 	 */
    767 	if (bit_off >= chunk->contig_bits_start  &&
    768 	    bit_off < chunk->contig_bits_start + chunk->contig_bits)
    769 		pcpu_chunk_refresh_hint(chunk);
    770 }
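        /*
         * Example of the block index math used above, assuming
         * PCPU_BITMAP_BLOCK_BITS == 1024: an allocation of bits == 8 at
         * bit_off == 1020 has s_index == 0, s_off == 1020,
         * e_index == 1027 / 1024 == 1 and e_off == (1027 & 1023) + 1 == 4,
         * i.e. it consumes bits [1020, 1024) of block 0 and bits [0, 4) of
         * block 1, so both blocks (and, if overlapped, the chunk hint) are
         * updated.
         */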
    771 
    772 /**
    773  * pcpu_block_update_hint_free - updates the block hints on the free path
    774  * @chunk: chunk of interest
    775  * @bit_off: chunk offset
    776  * @bits: size of request
    777  *
    778  * Updates metadata for the free path.  This avoids a blind block
    779  * refresh by making use of the block contig hints.  If this fails, it scans
    780  * forward and backward to determine the extent of the free area.  This is
    781  * capped at the boundary of blocks.
    782  *
    783  * A chunk update is triggered if a page becomes free, a block becomes free,
    784  * or the free spans across blocks.  This tradeoff is to minimize iterating
    785  * over the block metadata to update chunk->contig_bits.  chunk->contig_bits
    786  * may be off by up to a page, but it will never be more than the available
    787  * space.  If the contig hint is contained in one block, it will be accurate.
    788  */
    789 static void pcpu_block_update_hint_free(struct pcpu_chunk *chunk, int bit_off,
    790 					int bits)
    791 {
    792 	struct pcpu_block_md *s_block, *e_block, *block;
    793 	int s_index, e_index;	/* block indexes of the freed allocation */
    794 	int s_off, e_off;	/* block offsets of the freed allocation */
    795 	int start, end;		/* start and end of the whole free area */
    796 
    797 	/*
    798 	 * Calculate per block offsets.
    799 	 * The calculation uses an inclusive range, but the resulting offsets
    800 	 * are [start, end).  e_index always points to the last block in the
    801 	 * range.
    802 	 */
    803 	s_index = pcpu_off_to_block_index(bit_off);
    804 	e_index = pcpu_off_to_block_index(bit_off + bits - 1);
    805 	s_off = pcpu_off_to_block_off(bit_off);
    806 	e_off = pcpu_off_to_block_off(bit_off + bits - 1) + 1;
    807 
    808 	s_block = chunk->md_blocks + s_index;
    809 	e_block = chunk->md_blocks + e_index;
    810 
    811 	/*
    812 	 * Check if the freed area aligns with the block->contig_hint.
    813 	 * If it does, then the scan to find the beginning/end of the
    814 	 * larger free area can be avoided.
    815 	 *
    816 	 * start and end refer to beginning and end of the free area
    817 	 * within each their respective blocks.  This is not necessarily
    818 	 * the entire free area as it may span blocks past the beginning
    819 	 * or end of the block.
    820 	 */
    821 	start = s_off;
    822 	if (s_off == s_block->contig_hint + s_block->contig_hint_start) {
    823 		start = s_block->contig_hint_start;
    824 	} else {
    825 		/*
    826 		 * Scan backwards to find the extent of the free area.
    827 		 * find_last_bit returns the starting bit, so if the start bit
    828 		 * is returned, that means there was no last bit and the
    829 		 * remainder of the chunk is free.
    830 		 */
    831 		int l_bit = find_last_bit(pcpu_index_alloc_map(chunk, s_index),
    832 					  start);
    833 		start = (start == l_bit) ? 0 : l_bit + 1;
    834 	}
    835 
    836 	end = e_off;
    837 	if (e_off == e_block->contig_hint_start)
    838 		end = e_block->contig_hint_start + e_block->contig_hint;
    839 	else
    840 		end = find_next_bit(pcpu_index_alloc_map(chunk, e_index),
    841 				    PCPU_BITMAP_BLOCK_BITS, end);
    842 
    843 	/* update s_block */
    844 	e_off = (s_index == e_index) ? end : PCPU_BITMAP_BLOCK_BITS;
    845 	pcpu_block_update(s_block, start, e_off);
    846 
    847 	/* freeing spans across blocks */
    848 	if (s_index != e_index) {
    849 		/* update e_block */
    850 		pcpu_block_update(e_block, 0, end);
    851 
    852 		/* reset md_blocks in the middle */
    853 		for (block = s_block + 1; block < e_block; block++) {
    854 			block->first_free = 0;
    855 			block->contig_hint_start = 0;
    856 			block->contig_hint = PCPU_BITMAP_BLOCK_BITS;
    857 			block->left_free = PCPU_BITMAP_BLOCK_BITS;
    858 			block->right_free = PCPU_BITMAP_BLOCK_BITS;
    859 		}
    860 	}
    861 
    862 	/*
    863 	 * Refresh chunk metadata when the free makes a page free, a block
    864 	 * free, or spans across blocks.  The contig hint may be off by up to
    865 	 * a page, but if the hint is contained in a block, it will be accurate
    866 	 * with the else condition below.
    867 	 */
    868 	if ((ALIGN_DOWN(end, min(PCPU_BITS_PER_PAGE, PCPU_BITMAP_BLOCK_BITS)) >
    869 	     ALIGN(start, min(PCPU_BITS_PER_PAGE, PCPU_BITMAP_BLOCK_BITS))) ||
    870 	    s_index != e_index)
    871 		pcpu_chunk_refresh_hint(chunk);
    872 	else
    873 		pcpu_chunk_update(chunk, pcpu_block_off_to_off(s_index, start),
    874 				  s_block->contig_hint);
    875 }
    876 
    877 /**
    878  * pcpu_is_populated - determines if the region is populated
    879  * @chunk: chunk of interest
    880  * @bit_off: chunk offset
    881  * @bits: size of area
    882  * @next_off: return value for the next offset to start searching
    883  *
    884  * For atomic allocations, check if the backing pages are populated.
    885  *
    886  * RETURNS:
    887  * True if the backing pages are populated.
    888  * @next_off is set to skip over unpopulated blocks in pcpu_find_block_fit.
    889  */
    890 static bool pcpu_is_populated(struct pcpu_chunk *chunk, int bit_off, int bits,
    891 			      int *next_off)
    892 {
    893 	int page_start, page_end, rs, re;
    894 
    895 	page_start = PFN_DOWN(bit_off * PCPU_MIN_ALLOC_SIZE);
    896 	page_end = PFN_UP((bit_off + bits) * PCPU_MIN_ALLOC_SIZE);
    897 
    898 	rs = page_start;
    899 	pcpu_next_unpop(chunk->populated, &rs, &re, page_end);
    900 	if (rs >= page_end)
    901 		return true;
    902 
    903 	*next_off = re * PAGE_SIZE / PCPU_MIN_ALLOC_SIZE;
    904 	return false;
    905 }
    906 
    907 /**
    908  * pcpu_find_block_fit - finds the block index to start searching
    909  * @chunk: chunk of interest
    910  * @alloc_bits: size of request in allocation units
    911  * @align: alignment of area (max PAGE_SIZE bytes)
    912  * @pop_only: use populated regions only
    913  *
    914  * Given a chunk and an allocation spec, find the offset to begin searching
    915  * for a free region.  This iterates over the bitmap metadata blocks to
    916  * find an offset that will be guaranteed to fit the requirements.  It is
    917  * not quite first fit: if the allocation does not fit in the contig hint
    918  * of a block or chunk, it is skipped.  This errs on the side of caution
    919  * to prevent excess iteration.  Poor alignment can cause the allocator to
    920  * skip over blocks and chunks that have valid free areas.
    921  *
    922  * RETURNS:
    923  * The offset in the bitmap to begin searching.
    924  * -1 if no offset is found.
    925  */
    926 static int pcpu_find_block_fit(struct pcpu_chunk *chunk, int alloc_bits,
    927 			       size_t align, bool pop_only)
    928 {
    929 	int bit_off, bits, next_off;
    930 
    931 	/*
    932 	 * Check to see if the allocation can fit in the chunk's contig hint.
    933 	 * This is an optimization that avoids scanning: if the request
    934 	 * cannot fit in the global hint, there is memory pressure and a
    935 	 * new chunk would be created soon anyway.
    936 	 */
    937 	bit_off = ALIGN(chunk->contig_bits_start, align) -
    938 		  chunk->contig_bits_start;
    939 	if (bit_off + alloc_bits > chunk->contig_bits)
    940 		return -1;
    941 
    942 	bit_off = chunk->first_bit;
    943 	bits = 0;
    944 	pcpu_for_each_fit_region(chunk, alloc_bits, align, bit_off, bits) {
    945 		if (!pop_only || pcpu_is_populated(chunk, bit_off, bits,
    946 						   &next_off))
    947 			break;
    948 
    949 		bit_off = next_off;
    950 		bits = 0;
    951 	}
    952 
    953 	if (bit_off == pcpu_chunk_map_bits(chunk))
    954 		return -1;
    955 
    956 	return bit_off;
    957 }
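        /*
         * Example of the contig hint pre-check above (hypothetical hint): if
         * chunk->contig_bits_start == 600 and chunk->contig_bits == 300, a
         * request for alloc_bits == 150 at a 256-bit alignment needs
         * ALIGN(600, 256) - 600 == 168 bits of padding, and 168 + 150 > 300,
         * so the chunk is skipped even though 150 free bits exist.  This is
         * the "poor alignment" case mentioned above.
         */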
    958 
    959 /**
    960  * pcpu_alloc_area - allocates an area from a pcpu_chunk
    961  * @chunk: chunk of interest
    962  * @alloc_bits: size of request in allocation units
    963  * @align: alignment of area (max PAGE_SIZE)
    964  * @start: bit_off to start searching
    965  *
    966  * This function takes in a @start offset to begin searching to fit an
    967  * allocation of @alloc_bits with alignment @align.  It needs to scan
    968  * the allocation map because if it fits within the block's contig hint,
    969  * @start will be block->first_free. This is an attempt to fill the
    970  * allocation prior to breaking the contig hint.  The allocation and
    971  * boundary maps are updated accordingly if it confirms a valid
    972  * free area.
    973  *
    974  * RETURNS:
    975  * Allocated addr offset in @chunk on success.
    976  * -1 if no matching area is found.
    977  */
    978 static int pcpu_alloc_area(struct pcpu_chunk *chunk, int alloc_bits,
    979 			   size_t align, int start)
    980 {
    981 	size_t align_mask = (align) ? (align - 1) : 0;
    982 	int bit_off, end, oslot;
    983 
    984 	lockdep_assert_held(&pcpu_lock);
    985 
    986 	oslot = pcpu_chunk_slot(chunk);
    987 
    988 	/*
    989 	 * Search to find a fit.
    990 	 */
    991 	end = start + alloc_bits + PCPU_BITMAP_BLOCK_BITS;
    992 	bit_off = bitmap_find_next_zero_area(chunk->alloc_map, end, start,
    993 					     alloc_bits, align_mask);
    994 	if (bit_off >= end)
    995 		return -1;
    996 
    997 	/* update alloc map */
    998 	bitmap_set(chunk->alloc_map, bit_off, alloc_bits);
    999 
   1000 	/* update boundary map */
   1001 	set_bit(bit_off, chunk->bound_map);
   1002 	bitmap_clear(chunk->bound_map, bit_off + 1, alloc_bits - 1);
   1003 	set_bit(bit_off + alloc_bits, chunk->bound_map);
   1004 
   1005 	chunk->free_bytes -= alloc_bits * PCPU_MIN_ALLOC_SIZE;
   1006 
   1007 	/* update first free bit */
   1008 	if (bit_off == chunk->first_bit)
   1009 		chunk->first_bit = find_next_zero_bit(
   1010 					chunk->alloc_map,
   1011 					pcpu_chunk_map_bits(chunk),
   1012 					bit_off + alloc_bits);
   1013 
   1014 	pcpu_block_update_hint_alloc(chunk, bit_off, alloc_bits);
   1015 
   1016 	pcpu_chunk_relocate(chunk, oslot);
   1017 
   1018 	return bit_off * PCPU_MIN_ALLOC_SIZE;
   1019 }
   1020 
   1021 /**
   1022  * pcpu_free_area - frees the corresponding offset
   1023  * @chunk: chunk of interest
   1024  * @off: addr offset into chunk
   1025  *
   1026  * This function determines the size of an allocation to free using
   1027  * the boundary bitmap and clears the allocation map.
   1028  */
   1029 static void pcpu_free_area(struct pcpu_chunk *chunk, int off)
   1030 {
   1031 	int bit_off, bits, end, oslot;
   1032 
   1033 	lockdep_assert_held(&pcpu_lock);
   1034 	pcpu_stats_area_dealloc(chunk);
   1035 
   1036 	oslot = pcpu_chunk_slot(chunk);
   1037 
   1038 	bit_off = off / PCPU_MIN_ALLOC_SIZE;
   1039 
   1040 	/* find end index */
   1041 	end = find_next_bit(chunk->bound_map, pcpu_chunk_map_bits(chunk),
   1042 			    bit_off + 1);
   1043 	bits = end - bit_off;
   1044 	bitmap_clear(chunk->alloc_map, bit_off, bits);
   1045 
   1046 	/* update metadata */
   1047 	chunk->free_bytes += bits * PCPU_MIN_ALLOC_SIZE;
   1048 
   1049 	/* update first free bit */
   1050 	chunk->first_bit = min(chunk->first_bit, bit_off);
   1051 
   1052 	pcpu_block_update_hint_free(chunk, bit_off, bits);
   1053 
   1054 	pcpu_chunk_relocate(chunk, oslot);
   1055 }
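        /*
         * Example of how the boundary map ties the two functions above
         * together (hypothetical offsets, 4-byte units): allocating 3 bits at
         * bit offset 10 sets alloc_map bits 10-12, sets bound_map bits 10 and
         * 13 and clears bound_map bits 11-12.  A later
         * pcpu_free_area(chunk, 40) maps byte offset 40 back to bit 10, finds
         * the next bound_map bit at 13 and therefore knows exactly 3 bits
         * must be cleared.
         */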
   1056 
   1057 static void pcpu_init_md_blocks(struct pcpu_chunk *chunk)
   1058 {
   1059 	struct pcpu_block_md *md_block;
   1060 
   1061 	for (md_block = chunk->md_blocks;
   1062 	     md_block != chunk->md_blocks + pcpu_chunk_nr_blocks(chunk);
   1063 	     md_block++) {
   1064 		md_block->contig_hint = PCPU_BITMAP_BLOCK_BITS;
   1065 		md_block->left_free = PCPU_BITMAP_BLOCK_BITS;
   1066 		md_block->right_free = PCPU_BITMAP_BLOCK_BITS;
   1067 	}
   1068 }
   1069 
   1070 /**
   1071  * pcpu_alloc_first_chunk - creates chunks that serve the first chunk
   1072  * @tmp_addr: the start of the region served
   1073  * @map_size: size of the region served
   1074  *
   1075  * This is responsible for creating the chunks that serve the first chunk.  The
   1076  * base_addr is @tmp_addr rounded down to a page boundary while the region end
   1077  * is rounded up.  The start and end offsets are tracked to determine the region
   1078  * served.  All this is done so the bitmap allocator never has to handle partial blocks.
   1079  *
   1080  * RETURNS:
   1081  * Chunk serving the region at @tmp_addr of @map_size.
   1082  */
   1083 static struct pcpu_chunk * __init pcpu_alloc_first_chunk(unsigned long tmp_addr,
   1084 							 int map_size)
   1085 {
   1086 	struct pcpu_chunk *chunk;
   1087 	unsigned long aligned_addr, lcm_align;
   1088 	int start_offset, offset_bits, region_size, region_bits;
   1089 	size_t alloc_size;
   1090 
   1091 	/* region calculations */
   1092 	aligned_addr = tmp_addr & PAGE_MASK;
   1093 
   1094 	start_offset = tmp_addr - aligned_addr;
   1095 
   1096 	/*
   1097 	 * Align the end of the region with the LCM of PAGE_SIZE and
   1098 	 * PCPU_BITMAP_BLOCK_SIZE.  One of these constants is a multiple of
   1099 	 * the other.
   1100 	 */
   1101 	lcm_align = lcm(PAGE_SIZE, PCPU_BITMAP_BLOCK_SIZE);
   1102 	region_size = ALIGN(start_offset + map_size, lcm_align);
   1103 
   1104 	/* allocate chunk */
   1105 	alloc_size = sizeof(struct pcpu_chunk) +
   1106 		BITS_TO_LONGS(region_size >> PAGE_SHIFT) * sizeof(unsigned long);
   1107 	chunk = memblock_alloc(alloc_size, SMP_CACHE_BYTES);
   1108 	if (!chunk)
   1109 		panic("%s: Failed to allocate %zu bytes\n", __func__,
   1110 		      alloc_size);
   1111 
   1112 	INIT_LIST_HEAD(&chunk->list);
   1113 
   1114 	chunk->base_addr = (void *)aligned_addr;
   1115 	chunk->start_offset = start_offset;
   1116 	chunk->end_offset = region_size - chunk->start_offset - map_size;
   1117 
   1118 	chunk->nr_pages = region_size >> PAGE_SHIFT;
   1119 	region_bits = pcpu_chunk_map_bits(chunk);
   1120 
   1121 	alloc_size = BITS_TO_LONGS(region_bits) * sizeof(chunk->alloc_map[0]);
   1122 	chunk->alloc_map = memblock_alloc(alloc_size, SMP_CACHE_BYTES);
   1123 	if (!chunk->alloc_map)
   1124 		panic("%s: Failed to allocate %zu bytes\n", __func__,
   1125 		      alloc_size);
   1126 
   1127 	alloc_size =
   1128 		BITS_TO_LONGS(region_bits + 1) * sizeof(chunk->bound_map[0]);
   1129 	chunk->bound_map = memblock_alloc(alloc_size, SMP_CACHE_BYTES);
   1130 	if (!chunk->bound_map)
   1131 		panic("%s: Failed to allocate %zu bytes\n", __func__,
   1132 		      alloc_size);
   1133 
   1134 	alloc_size = pcpu_chunk_nr_blocks(chunk) * sizeof(chunk->md_blocks[0]);
   1135 	chunk->md_blocks = memblock_alloc(alloc_size, SMP_CACHE_BYTES);
   1136 	if (!chunk->md_blocks)
   1137 		panic("%s: Failed to allocate %zu bytes\n", __func__,
   1138 		      alloc_size);
   1139 
   1140 	pcpu_init_md_blocks(chunk);
   1141 
   1142 	/* manage populated page bitmap */
   1143 	chunk->immutable = true;
   1144 	bitmap_fill(chunk->populated, chunk->nr_pages);
   1145 	chunk->nr_populated = chunk->nr_pages;
   1146 	chunk->nr_empty_pop_pages =
   1147 		pcpu_cnt_pop_pages(chunk, start_offset / PCPU_MIN_ALLOC_SIZE,
   1148 				   map_size / PCPU_MIN_ALLOC_SIZE);
   1149 
   1150 	chunk->contig_bits = map_size / PCPU_MIN_ALLOC_SIZE;
   1151 	chunk->free_bytes = map_size;
   1152 
   1153 	if (chunk->start_offset) {
   1154 		/* hide the beginning of the bitmap */
   1155 		offset_bits = chunk->start_offset / PCPU_MIN_ALLOC_SIZE;
   1156 		bitmap_set(chunk->alloc_map, 0, offset_bits);
   1157 		set_bit(0, chunk->bound_map);
   1158 		set_bit(offset_bits, chunk->bound_map);
   1159 
   1160 		chunk->first_bit = offset_bits;
   1161 
   1162 		pcpu_block_update_hint_alloc(chunk, 0, offset_bits);
   1163 	}
   1164 
   1165 	if (chunk->end_offset) {
   1166 		/* hide the end of the bitmap */
   1167 		offset_bits = chunk->end_offset / PCPU_MIN_ALLOC_SIZE;
   1168 		bitmap_set(chunk->alloc_map,
   1169 			   pcpu_chunk_map_bits(chunk) - offset_bits,
   1170 			   offset_bits);
   1171 		set_bit((start_offset + map_size) / PCPU_MIN_ALLOC_SIZE,
   1172 			chunk->bound_map);
   1173 		set_bit(region_bits, chunk->bound_map);
   1174 
   1175 		pcpu_block_update_hint_alloc(chunk, pcpu_chunk_map_bits(chunk)
   1176 					     - offset_bits, offset_bits);
   1177 	}
   1178 
   1179 	return chunk;
   1180 }
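        /*
         * Worked example of the offset handling above (made-up addresses,
         * 4 KiB pages and the default PCPU_BITMAP_BLOCK_SIZE == PAGE_SIZE):
         * for tmp_addr == base + 0x2f00 and map_size == 0x5000,
         * aligned_addr == base + 0x2000, start_offset == 0xf00,
         * region_size == ALIGN(0xf00 + 0x5000, 0x1000) == 0x6000 and
         * end_offset == 0x100.  The first 0xf00 and last 0x100 bytes are then
         * hidden from the bitmap so only the region actually served can be
         * allocated from.
         */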
   1181 
   1182 static struct pcpu_chunk *pcpu_alloc_chunk(gfp_t gfp)
   1183 {
   1184 	struct pcpu_chunk *chunk;
   1185 	int region_bits;
   1186 
   1187 	chunk = pcpu_mem_zalloc(pcpu_chunk_struct_size, gfp);
   1188 	if (!chunk)
   1189 		return NULL;
   1190 
   1191 	INIT_LIST_HEAD(&chunk->list);
   1192 	chunk->nr_pages = pcpu_unit_pages;
   1193 	region_bits = pcpu_chunk_map_bits(chunk);
   1194 
   1195 	chunk->alloc_map = pcpu_mem_zalloc(BITS_TO_LONGS(region_bits) *
   1196 					   sizeof(chunk->alloc_map[0]), gfp);
   1197 	if (!chunk->alloc_map)
   1198 		goto alloc_map_fail;
   1199 
   1200 	chunk->bound_map = pcpu_mem_zalloc(BITS_TO_LONGS(region_bits + 1) *
   1201 					   sizeof(chunk->bound_map[0]), gfp);
   1202 	if (!chunk->bound_map)
   1203 		goto bound_map_fail;
   1204 
   1205 	chunk->md_blocks = pcpu_mem_zalloc(pcpu_chunk_nr_blocks(chunk) *
   1206 					   sizeof(chunk->md_blocks[0]), gfp);
   1207 	if (!chunk->md_blocks)
   1208 		goto md_blocks_fail;
   1209 
   1210 	pcpu_init_md_blocks(chunk);
   1211 
   1212 	/* init metadata */
   1213 	chunk->contig_bits = region_bits;
   1214 	chunk->free_bytes = chunk->nr_pages * PAGE_SIZE;
   1215 
   1216 	return chunk;
   1217 
   1218 md_blocks_fail:
   1219 	pcpu_mem_free(chunk->bound_map);
   1220 bound_map_fail:
   1221 	pcpu_mem_free(chunk->alloc_map);
   1222 alloc_map_fail:
   1223 	pcpu_mem_free(chunk);
   1224 
   1225 	return NULL;
   1226 }
   1227 
   1228 static void pcpu_free_chunk(struct pcpu_chunk *chunk)
   1229 {
   1230 	if (!chunk)
   1231 		return;
   1232 	pcpu_mem_free(chunk->md_blocks);
   1233 	pcpu_mem_free(chunk->bound_map);
   1234 	pcpu_mem_free(chunk->alloc_map);
   1235 	pcpu_mem_free(chunk);
   1236 }
   1237 
   1238 /**
   1239  * pcpu_chunk_populated - post-population bookkeeping
   1240  * @chunk: pcpu_chunk which got populated
   1241  * @page_start: the start page
   1242  * @page_end: the end page
   1243  * @for_alloc: if this is to populate for allocation
   1244  *
   1245  * Pages in [@page_start,@page_end) have been populated to @chunk.  Update
   1246  * the bookkeeping information accordingly.  Must be called after each
   1247  * successful population.
   1248  *
   1249  * If this is @for_alloc, do not increment pcpu_nr_empty_pop_pages because it
   1250  * is to serve an allocation in that area.
   1251  */
   1252 static void pcpu_chunk_populated(struct pcpu_chunk *chunk, int page_start,
   1253 				 int page_end, bool for_alloc)
   1254 {
   1255 	int nr = page_end - page_start;
   1256 
   1257 	lockdep_assert_held(&pcpu_lock);
   1258 
   1259 	bitmap_set(chunk->populated, page_start, nr);
   1260 	chunk->nr_populated += nr;
   1261 	pcpu_nr_populated += nr;
   1262 
   1263 	if (!for_alloc) {
   1264 		chunk->nr_empty_pop_pages += nr;
   1265 		pcpu_nr_empty_pop_pages += nr;
   1266 	}
   1267 }
   1268 
   1269 /**
   1270  * pcpu_chunk_depopulated - post-depopulation bookkeeping
   1271  * @chunk: pcpu_chunk which got depopulated
   1272  * @page_start: the start page
   1273  * @page_end: the end page
   1274  *
   1275  * Pages in [@page_start,@page_end) have been depopulated from @chunk.
   1276  * Update the bookkeeping information accordingly.  Must be called after
   1277  * each successful depopulation.
   1278  */
   1279 static void pcpu_chunk_depopulated(struct pcpu_chunk *chunk,
   1280 				   int page_start, int page_end)
   1281 {
   1282 	int nr = page_end - page_start;
   1283 
   1284 	lockdep_assert_held(&pcpu_lock);
   1285 
   1286 	bitmap_clear(chunk->populated, page_start, nr);
   1287 	chunk->nr_populated -= nr;
   1288 	chunk->nr_empty_pop_pages -= nr;
   1289 	pcpu_nr_empty_pop_pages -= nr;
   1290 	pcpu_nr_populated -= nr;
   1291 }
   1292 
   1293 /*
   1294  * Chunk management implementation.
   1295  *
   1296  * To allow different implementations, chunk alloc/free and
   1297  * [de]population are implemented in a separate file which is pulled
   1298  * into this file and compiled together.  The following functions
   1299  * should be implemented.
   1300  *
   1301  * pcpu_populate_chunk		- populate the specified range of a chunk
   1302  * pcpu_depopulate_chunk	- depopulate the specified range of a chunk
   1303  * pcpu_create_chunk		- create a new chunk
   1304  * pcpu_destroy_chunk		- destroy a chunk, always preceded by full depop
   1305  * pcpu_addr_to_page		- translate address to the corresponding struct page
   1306  * pcpu_verify_alloc_info	- check alloc_info is acceptable during init
   1307  */
   1308 static int pcpu_populate_chunk(struct pcpu_chunk *chunk,
   1309 			       int page_start, int page_end, gfp_t gfp);
   1310 static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk,
   1311 				  int page_start, int page_end);
   1312 static struct pcpu_chunk *pcpu_create_chunk(gfp_t gfp);
   1313 static void pcpu_destroy_chunk(struct pcpu_chunk *chunk);
   1314 static struct page *pcpu_addr_to_page(void *addr);
   1315 static int __init pcpu_verify_alloc_info(const struct pcpu_alloc_info *ai);
   1316 
   1317 #ifdef CONFIG_NEED_PER_CPU_KM
   1318 #include "percpu-km.c"
   1319 #else
   1320 #include "percpu-vm.c"
   1321 #endif
   1322 
   1323 /**
   1324  * pcpu_chunk_addr_search - determine chunk containing specified address
   1325  * @addr: address for which the chunk needs to be determined.
   1326  *
   1327  * This is an internal function that handles all but static allocations.
   1328  * Static percpu address values should never be passed into the allocator.
   1329  *
   1330  * RETURNS:
   1331  * The address of the found chunk.
   1332  */
   1333 static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr)
   1334 {
   1335 	/* is it in the dynamic region (first chunk)? */
   1336 	if (pcpu_addr_in_chunk(pcpu_first_chunk, addr))
   1337 		return pcpu_first_chunk;
   1338 
   1339 	/* is it in the reserved region? */
   1340 	if (pcpu_addr_in_chunk(pcpu_reserved_chunk, addr))
   1341 		return pcpu_reserved_chunk;
   1342 
   1343 	/*
   1344 	 * The address is relative to unit0 which might be unused and
   1345 	 * thus unmapped.  Offset the address to the unit space of the
   1346 	 * current processor before looking it up in the vmalloc
   1347 	 * space.  Note that any possible cpu id can be used here, so
   1348 	 * there's no need to worry about preemption or cpu hotplug.
   1349 	 */
   1350 	addr += pcpu_unit_offsets[raw_smp_processor_id()];
   1351 	return pcpu_get_page_chunk(pcpu_addr_to_page(addr));
   1352 }
   1353 
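        /*
         * Illustrative usage sketch (not part of this file): callers normally
         * reach pcpu_alloc() below through the wrappers in
         * include/linux/percpu.h.  The struct name foo_stats is made up for
         * the example; alloc_percpu(), this_cpu_inc(), per_cpu_ptr() and
         * free_percpu() are the regular percpu API.
         *
         *	struct foo_stats { u64 packets; };
         *	struct foo_stats __percpu *stats;
         *	int cpu;
         *	u64 total = 0;
         *
         *	stats = alloc_percpu(struct foo_stats);	// ends up in pcpu_alloc()
         *	if (stats) {
         *		this_cpu_inc(stats->packets);	// fast, local-cpu update
         *		for_each_possible_cpu(cpu)	// slow-path aggregation
         *			total += per_cpu_ptr(stats, cpu)->packets;
         *		free_percpu(stats);		// returns the area to a chunk
         *	}
         */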
   1354 /**
   1355  * pcpu_alloc - the percpu allocator
   1356  * @size: size of area to allocate in bytes
   1357  * @align: alignment of area (max PAGE_SIZE)
   1358  * @reserved: allocate from the reserved chunk if available
   1359  * @gfp: allocation flags
   1360  *
   1361  * Allocate percpu area of @size bytes aligned at @align.  If @gfp doesn't
   1362  * contain %GFP_KERNEL, the allocation is atomic. If @gfp has __GFP_NOWARN
   1363  * then no warning will be triggered on invalid or failed allocation
   1364  * requests.
   1365  *
   1366  * RETURNS:
   1367  * Percpu pointer to the allocated area on success, NULL on failure.
   1368  */
   1369 static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved,
   1370 				 gfp_t gfp)
   1371 {
   1372 	/* whitelisted flags that can be passed to the backing allocators */
   1373 	gfp_t pcpu_gfp = gfp & (GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN);
   1374 	bool is_atomic = (gfp & GFP_KERNEL) != GFP_KERNEL;
   1375 	bool do_warn = !(gfp & __GFP_NOWARN);
   1376 	static int warn_limit = 10;
   1377 	struct pcpu_chunk *chunk;
   1378 	const char *err;
   1379 	int slot, off, cpu, ret;
   1380 	unsigned long flags;
   1381 	void __percpu *ptr;
   1382 	size_t bits, bit_align;
   1383 
   1384 	/*
   1385 	 * There is now a minimum allocation size of PCPU_MIN_ALLOC_SIZE,
   1386 	 * therefore alignment must be a minimum of that many bytes.
   1387 	 * An allocation may have internal fragmentation from rounding up
   1388 	 * of up to PCPU_MIN_ALLOC_SIZE - 1 bytes.
   1389 	 */
   1390 	if (unlikely(align < PCPU_MIN_ALLOC_SIZE))
   1391 		align = PCPU_MIN_ALLOC_SIZE;
   1392 
   1393 	size = ALIGN(size, PCPU_MIN_ALLOC_SIZE);
   1394 	bits = size >> PCPU_MIN_ALLOC_SHIFT;
   1395 	bit_align = align >> PCPU_MIN_ALLOC_SHIFT;
   1396 
   1397 	if (unlikely(!size || size > PCPU_MIN_UNIT_SIZE || align > PAGE_SIZE ||
   1398 		     !is_power_of_2(align))) {
   1399 		WARN(do_warn, "illegal size (%zu) or align (%zu) for percpu allocation\n",
   1400 		     size, align);
   1401 		return NULL;
   1402 	}
   1403 
   1404 	if (!is_atomic) {
   1405 		/*
   1406 		 * pcpu_balance_workfn() allocates memory under this mutex,
   1407 		 * and it may wait for memory reclaim. Allow current task
   1408 		 * to become OOM victim, in case of memory pressure.
   1409 		 */
   1410 		if (gfp & __GFP_NOFAIL)
   1411 			mutex_lock(&pcpu_alloc_mutex);
   1412 		else if (mutex_lock_killable(&pcpu_alloc_mutex))
   1413 			return NULL;
   1414 	}
   1415 
   1416 	spin_lock_irqsave(&pcpu_lock, flags);
   1417 
   1418 	/* serve reserved allocations from the reserved chunk if available */
   1419 	if (reserved && pcpu_reserved_chunk) {
   1420 		chunk = pcpu_reserved_chunk;
   1421 
   1422 		off = pcpu_find_block_fit(chunk, bits, bit_align, is_atomic);
   1423 		if (off < 0) {
   1424 			err = "alloc from reserved chunk failed";
   1425 			goto fail_unlock;
   1426 		}
   1427 
   1428 		off = pcpu_alloc_area(chunk, bits, bit_align, off);
   1429 		if (off >= 0)
   1430 			goto area_found;
   1431 
   1432 		err = "alloc from reserved chunk failed";
   1433 		goto fail_unlock;
   1434 	}
   1435 
   1436 restart:
   1437 	/* search through normal chunks */
   1438 	for (slot = pcpu_size_to_slot(size); slot < pcpu_nr_slots; slot++) {
   1439 		list_for_each_entry(chunk, &pcpu_slot[slot], list) {
   1440 			off = pcpu_find_block_fit(chunk, bits, bit_align,
   1441 						  is_atomic);
   1442 			if (off < 0)
   1443 				continue;
   1444 
   1445 			off = pcpu_alloc_area(chunk, bits, bit_align, off);
   1446 			if (off >= 0)
   1447 				goto area_found;
   1448 
   1449 		}
   1450 	}
   1451 
   1452 	spin_unlock_irqrestore(&pcpu_lock, flags);
   1453 
   1454 	/*
   1455 	 * No space left.  Create a new chunk.  We don't want multiple
   1456 	 * tasks to create chunks simultaneously.  Serialize and create iff
   1457 	 * there's still no empty chunk after grabbing the mutex.
   1458 	 */
   1459 	if (is_atomic) {
   1460 		err = "atomic alloc failed, no space left";
   1461 		goto fail;
   1462 	}
   1463 
   1464 	if (list_empty(&pcpu_slot[pcpu_nr_slots - 1])) {
   1465 		chunk = pcpu_create_chunk(pcpu_gfp);
   1466 		if (!chunk) {
   1467 			err = "failed to allocate new chunk";
   1468 			goto fail;
   1469 		}
   1470 
   1471 		spin_lock_irqsave(&pcpu_lock, flags);
   1472 		pcpu_chunk_relocate(chunk, -1);
   1473 	} else {
   1474 		spin_lock_irqsave(&pcpu_lock, flags);
   1475 	}
   1476 
   1477 	goto restart;
   1478 
   1479 area_found:
   1480 	pcpu_stats_area_alloc(chunk, size);
   1481 	spin_unlock_irqrestore(&pcpu_lock, flags);
   1482 
   1483 	/* populate if not all pages are already there */
   1484 	if (!is_atomic) {
   1485 		int page_start, page_end, rs, re;
   1486 
   1487 		page_start = PFN_DOWN(off);
   1488 		page_end = PFN_UP(off + size);
   1489 
   1490 		pcpu_for_each_unpop_region(chunk->populated, rs, re,
   1491 					   page_start, page_end) {
   1492 			WARN_ON(chunk->immutable);
   1493 
   1494 			ret = pcpu_populate_chunk(chunk, rs, re, pcpu_gfp);
   1495 
   1496 			spin_lock_irqsave(&pcpu_lock, flags);
   1497 			if (ret) {
   1498 				pcpu_free_area(chunk, off);
   1499 				err = "failed to populate";
   1500 				goto fail_unlock;
   1501 			}
   1502 			pcpu_chunk_populated(chunk, rs, re, true);
   1503 			spin_unlock_irqrestore(&pcpu_lock, flags);
   1504 		}
   1505 
   1506 		mutex_unlock(&pcpu_alloc_mutex);
   1507 	}
   1508 
   1509 	if (pcpu_nr_empty_pop_pages < PCPU_EMPTY_POP_PAGES_LOW)
   1510 		pcpu_schedule_balance_work();
   1511 
   1512 	/* clear the areas and return address relative to base address */
   1513 	for_each_possible_cpu(cpu)
   1514 		memset((void *)pcpu_chunk_addr(chunk, cpu, 0) + off, 0, size);
   1515 
   1516 	ptr = __addr_to_pcpu_ptr(chunk->base_addr + off);
   1517 	kmemleak_alloc_percpu(ptr, size, gfp);
   1518 
   1519 	trace_percpu_alloc_percpu(reserved, is_atomic, size, align,
   1520 			chunk->base_addr, off, ptr);
   1521 
   1522 	return ptr;
   1523 
   1524 fail_unlock:
   1525 	spin_unlock_irqrestore(&pcpu_lock, flags);
   1526 fail:
   1527 	trace_percpu_alloc_percpu_fail(reserved, is_atomic, size, align);
   1528 
   1529 	if (!is_atomic && do_warn && warn_limit) {
   1530 		pr_warn("allocation failed, size=%zu align=%zu atomic=%d, %s\n",
   1531 			size, align, is_atomic, err);
   1532 		dump_stack();
   1533 		if (!--warn_limit)
   1534 			pr_info("limit reached, disable warning\n");
   1535 	}
   1536 	if (is_atomic) {
    1537 		/* see the flag handling in pcpu_balance_workfn() */
   1538 		pcpu_atomic_alloc_failed = true;
   1539 		pcpu_schedule_balance_work();
   1540 	} else {
   1541 		mutex_unlock(&pcpu_alloc_mutex);
   1542 	}
   1543 	return NULL;
   1544 }
   1545 
   1546 /**
   1547  * __alloc_percpu_gfp - allocate dynamic percpu area
   1548  * @size: size of area to allocate in bytes
   1549  * @align: alignment of area (max PAGE_SIZE)
   1550  * @gfp: allocation flags
   1551  *
   1552  * Allocate zero-filled percpu area of @size bytes aligned at @align.  If
   1553  * @gfp doesn't contain %GFP_KERNEL, the allocation doesn't block and can
   1554  * be called from any context but is a lot more likely to fail. If @gfp
   1555  * has __GFP_NOWARN then no warning will be triggered on invalid or failed
   1556  * allocation requests.
   1557  *
   1558  * RETURNS:
   1559  * Percpu pointer to the allocated area on success, NULL on failure.
   1560  */
   1561 void __percpu *__alloc_percpu_gfp(size_t size, size_t align, gfp_t gfp)
   1562 {
   1563 	return pcpu_alloc(size, align, false, gfp);
   1564 }
   1565 EXPORT_SYMBOL_GPL(__alloc_percpu_gfp);
   1566 
   1567 /**
   1568  * __alloc_percpu - allocate dynamic percpu area
   1569  * @size: size of area to allocate in bytes
   1570  * @align: alignment of area (max PAGE_SIZE)
   1571  *
   1572  * Equivalent to __alloc_percpu_gfp(size, align, %GFP_KERNEL).
   1573  */
   1574 void __percpu *__alloc_percpu(size_t size, size_t align)
   1575 {
   1576 	return pcpu_alloc(size, align, false, GFP_KERNEL);
   1577 }
   1578 EXPORT_SYMBOL_GPL(__alloc_percpu);
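/*
 * Example (illustrative sketch, not part of this file): a typical caller
 * goes through the alloc_percpu() wrapper and the this_cpu accessors
 * rather than calling __alloc_percpu() directly.  "ev_count" and the
 * functions below are hypothetical names.
 *
 *	static int __percpu *ev_count;
 *
 *	static int ev_init(void)
 *	{
 *		ev_count = alloc_percpu(int);
 *		if (!ev_count)
 *			return -ENOMEM;
 *		return 0;
 *	}
 *
 *	static void ev_hit(void)
 *	{
 *		this_cpu_inc(*ev_count);
 *	}
 */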
   1579 
   1580 /**
   1581  * __alloc_reserved_percpu - allocate reserved percpu area
   1582  * @size: size of area to allocate in bytes
   1583  * @align: alignment of area (max PAGE_SIZE)
   1584  *
   1585  * Allocate zero-filled percpu area of @size bytes aligned at @align
   1586  * from reserved percpu area if arch has set it up; otherwise,
   1587  * allocation is served from the same dynamic area.  Might sleep.
   1588  * Might trigger writeouts.
   1589  *
   1590  * CONTEXT:
   1591  * Does GFP_KERNEL allocation.
   1592  *
   1593  * RETURNS:
   1594  * Percpu pointer to the allocated area on success, NULL on failure.
   1595  */
   1596 void __percpu *__alloc_reserved_percpu(size_t size, size_t align)
   1597 {
   1598 	return pcpu_alloc(size, align, true, GFP_KERNEL);
   1599 }
   1600 
   1601 /**
   1602  * pcpu_balance_workfn - manage the amount of free chunks and populated pages
   1603  * @work: unused
   1604  *
   1605  * Reclaim all fully free chunks except for the first one.  This is also
   1606  * responsible for maintaining the pool of empty populated pages.  However,
    1607  * it is possible that this is called when physical memory is scarce, causing
    1608  * the OOM killer to be triggered.  We should avoid doing so until an actual
   1609  * allocation causes the failure as it is possible that requests can be
   1610  * serviced from already backed regions.
   1611  */
   1612 static void pcpu_balance_workfn(struct work_struct *work)
   1613 {
   1614 	/* gfp flags passed to underlying allocators */
   1615 	const gfp_t gfp = GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN;
   1616 	LIST_HEAD(to_free);
   1617 	struct list_head *free_head = &pcpu_slot[pcpu_nr_slots - 1];
   1618 	struct pcpu_chunk *chunk, *next;
   1619 	int slot, nr_to_pop, ret;
   1620 
   1621 	/*
   1622 	 * There's no reason to keep around multiple unused chunks and VM
   1623 	 * areas can be scarce.  Destroy all free chunks except for one.
   1624 	 */
   1625 	mutex_lock(&pcpu_alloc_mutex);
   1626 	spin_lock_irq(&pcpu_lock);
   1627 
   1628 	list_for_each_entry_safe(chunk, next, free_head, list) {
   1629 		WARN_ON(chunk->immutable);
   1630 
   1631 		/* spare the first one */
   1632 		if (chunk == list_first_entry(free_head, struct pcpu_chunk, list))
   1633 			continue;
   1634 
   1635 		list_move(&chunk->list, &to_free);
   1636 	}
   1637 
   1638 	spin_unlock_irq(&pcpu_lock);
   1639 
   1640 	list_for_each_entry_safe(chunk, next, &to_free, list) {
   1641 		int rs, re;
   1642 
   1643 		pcpu_for_each_pop_region(chunk->populated, rs, re, 0,
   1644 					 chunk->nr_pages) {
   1645 			pcpu_depopulate_chunk(chunk, rs, re);
   1646 			spin_lock_irq(&pcpu_lock);
   1647 			pcpu_chunk_depopulated(chunk, rs, re);
   1648 			spin_unlock_irq(&pcpu_lock);
   1649 		}
   1650 		pcpu_destroy_chunk(chunk);
   1651 		cond_resched();
   1652 	}
   1653 
   1654 	/*
    1655 	 * Ensure there is a certain number of free populated pages for
   1656 	 * atomic allocs.  Fill up from the most packed so that atomic
   1657 	 * allocs don't increase fragmentation.  If atomic allocation
   1658 	 * failed previously, always populate the maximum amount.  This
   1659 	 * should prevent atomic allocs larger than PAGE_SIZE from keeping
   1660 	 * failing indefinitely; however, large atomic allocs are not
   1661 	 * something we support properly and can be highly unreliable and
   1662 	 * inefficient.
   1663 	 */
   1664 retry_pop:
   1665 	if (pcpu_atomic_alloc_failed) {
   1666 		nr_to_pop = PCPU_EMPTY_POP_PAGES_HIGH;
   1667 		/* best effort anyway, don't worry about synchronization */
   1668 		pcpu_atomic_alloc_failed = false;
   1669 	} else {
   1670 		nr_to_pop = clamp(PCPU_EMPTY_POP_PAGES_HIGH -
   1671 				  pcpu_nr_empty_pop_pages,
   1672 				  0, PCPU_EMPTY_POP_PAGES_HIGH);
   1673 	}
   1674 
   1675 	for (slot = pcpu_size_to_slot(PAGE_SIZE); slot < pcpu_nr_slots; slot++) {
   1676 		int nr_unpop = 0, rs, re;
   1677 
   1678 		if (!nr_to_pop)
   1679 			break;
   1680 
   1681 		spin_lock_irq(&pcpu_lock);
   1682 		list_for_each_entry(chunk, &pcpu_slot[slot], list) {
   1683 			nr_unpop = chunk->nr_pages - chunk->nr_populated;
   1684 			if (nr_unpop)
   1685 				break;
   1686 		}
   1687 		spin_unlock_irq(&pcpu_lock);
   1688 
   1689 		if (!nr_unpop)
   1690 			continue;
   1691 
   1692 		/* @chunk can't go away while pcpu_alloc_mutex is held */
   1693 		pcpu_for_each_unpop_region(chunk->populated, rs, re, 0,
   1694 					   chunk->nr_pages) {
   1695 			int nr = min(re - rs, nr_to_pop);
   1696 
   1697 			ret = pcpu_populate_chunk(chunk, rs, rs + nr, gfp);
   1698 			if (!ret) {
   1699 				nr_to_pop -= nr;
   1700 				spin_lock_irq(&pcpu_lock);
   1701 				pcpu_chunk_populated(chunk, rs, rs + nr, false);
   1702 				spin_unlock_irq(&pcpu_lock);
   1703 			} else {
   1704 				nr_to_pop = 0;
   1705 			}
   1706 
   1707 			if (!nr_to_pop)
   1708 				break;
   1709 		}
   1710 	}
   1711 
   1712 	if (nr_to_pop) {
   1713 		/* ran out of chunks to populate, create a new one and retry */
   1714 		chunk = pcpu_create_chunk(gfp);
   1715 		if (chunk) {
   1716 			spin_lock_irq(&pcpu_lock);
   1717 			pcpu_chunk_relocate(chunk, -1);
   1718 			spin_unlock_irq(&pcpu_lock);
   1719 			goto retry_pop;
   1720 		}
   1721 	}
   1722 
   1723 	mutex_unlock(&pcpu_alloc_mutex);
   1724 }
   1725 
   1726 /**
   1727  * free_percpu - free percpu area
   1728  * @ptr: pointer to area to free
   1729  *
   1730  * Free percpu area @ptr.
   1731  *
   1732  * CONTEXT:
   1733  * Can be called from atomic context.
   1734  */
   1735 void free_percpu(void __percpu *ptr)
   1736 {
   1737 	void *addr;
   1738 	struct pcpu_chunk *chunk;
   1739 	unsigned long flags;
   1740 	int off;
   1741 
   1742 	if (!ptr)
   1743 		return;
   1744 
   1745 	kmemleak_free_percpu(ptr);
   1746 
   1747 	addr = __pcpu_ptr_to_addr(ptr);
   1748 
   1749 	spin_lock_irqsave(&pcpu_lock, flags);
   1750 
   1751 	chunk = pcpu_chunk_addr_search(addr);
   1752 	off = addr - chunk->base_addr;
   1753 
   1754 	pcpu_free_area(chunk, off);
   1755 
    1756 	/* if there is more than one fully free chunk, wake up the grim reaper */
   1757 	if (chunk->free_bytes == pcpu_unit_size) {
   1758 		struct pcpu_chunk *pos;
   1759 
   1760 		list_for_each_entry(pos, &pcpu_slot[pcpu_nr_slots - 1], list)
   1761 			if (pos != chunk) {
   1762 				pcpu_schedule_balance_work();
   1763 				break;
   1764 			}
   1765 	}
   1766 
   1767 	trace_percpu_free_percpu(chunk->base_addr, off, ptr);
   1768 
   1769 	spin_unlock_irqrestore(&pcpu_lock, flags);
   1770 }
   1771 EXPORT_SYMBOL_GPL(free_percpu);
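/*
 * Example (illustrative sketch, continuing the hypothetical counter
 * above): readers fold the per-cpu copies together with per_cpu_ptr(),
 * and teardown returns the area with free_percpu().
 *
 *	static long ev_sum(void)
 *	{
 *		long sum = 0;
 *		int cpu;
 *
 *		for_each_possible_cpu(cpu)
 *			sum += *per_cpu_ptr(ev_count, cpu);
 *		return sum;
 *	}
 *
 *	static void ev_exit(void)
 *	{
 *		free_percpu(ev_count);
 *	}
 */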
   1772 
   1773 bool __is_kernel_percpu_address(unsigned long addr, unsigned long *can_addr)
   1774 {
   1775 #ifdef CONFIG_SMP
   1776 	const size_t static_size = __per_cpu_end - __per_cpu_start;
   1777 	void __percpu *base = __addr_to_pcpu_ptr(pcpu_base_addr);
   1778 	unsigned int cpu;
   1779 
   1780 	for_each_possible_cpu(cpu) {
   1781 		void *start = per_cpu_ptr(base, cpu);
   1782 		void *va = (void *)addr;
   1783 
   1784 		if (va >= start && va < start + static_size) {
   1785 			if (can_addr) {
   1786 				*can_addr = (unsigned long) (va - start);
   1787 				*can_addr += (unsigned long)
   1788 					per_cpu_ptr(base, get_boot_cpu_id());
   1789 			}
   1790 			return true;
   1791 		}
   1792 	}
   1793 #endif
   1794 	/* on UP, can't distinguish from other static vars, always false */
   1795 	return false;
   1796 }
   1797 
   1798 /**
   1799  * is_kernel_percpu_address - test whether address is from static percpu area
   1800  * @addr: address to test
   1801  *
   1802  * Test whether @addr belongs to in-kernel static percpu area.  Module
   1803  * static percpu areas are not considered.  For those, use
   1804  * is_module_percpu_address().
   1805  *
   1806  * RETURNS:
   1807  * %true if @addr is from in-kernel static percpu area, %false otherwise.
   1808  */
   1809 bool is_kernel_percpu_address(unsigned long addr)
   1810 {
   1811 	return __is_kernel_percpu_address(addr, NULL);
   1812 }
   1813 
   1814 /**
   1815  * per_cpu_ptr_to_phys - convert translated percpu address to physical address
   1816  * @addr: the address to be converted to physical address
   1817  *
    1818  * Given @addr, which is a dereferenceable address obtained via one of
    1819  * the percpu access macros, this function translates it into its physical
   1820  * address.  The caller is responsible for ensuring @addr stays valid
   1821  * until this function finishes.
   1822  *
    1823  * The percpu allocator has a special setup for the first chunk, which
    1824  * currently supports either embedding in the linear address space or a
    1825  * vmalloc mapping.  From the second chunk on, the backing allocator
    1826  * (currently either vm or km) provides the translation.
   1827  *
    1828  * The addr could be translated without checking whether it falls into the
    1829  * first chunk, but the current code better reflects how the percpu
    1830  * allocator actually works, and the verification can catch bugs both in
    1831  * the percpu allocator itself and in per_cpu_ptr_to_phys() callers.  So we
    1832  * keep the current code.
   1833  *
   1834  * RETURNS:
   1835  * The physical address for @addr.
   1836  */
   1837 phys_addr_t per_cpu_ptr_to_phys(void *addr)
   1838 {
   1839 	void __percpu *base = __addr_to_pcpu_ptr(pcpu_base_addr);
   1840 	bool in_first_chunk = false;
   1841 	unsigned long first_low, first_high;
   1842 	unsigned int cpu;
   1843 
   1844 	/*
   1845 	 * The following test on unit_low/high isn't strictly
   1846 	 * necessary but will speed up lookups of addresses which
   1847 	 * aren't in the first chunk.
   1848 	 *
   1849 	 * The address check is against full chunk sizes.  pcpu_base_addr
   1850 	 * points to the beginning of the first chunk including the
   1851 	 * static region.  Assumes good intent as the first chunk may
   1852 	 * not be full (ie. < pcpu_unit_pages in size).
   1853 	 */
   1854 	first_low = (unsigned long)pcpu_base_addr +
   1855 		    pcpu_unit_page_offset(pcpu_low_unit_cpu, 0);
   1856 	first_high = (unsigned long)pcpu_base_addr +
   1857 		     pcpu_unit_page_offset(pcpu_high_unit_cpu, pcpu_unit_pages);
   1858 	if ((unsigned long)addr >= first_low &&
   1859 	    (unsigned long)addr < first_high) {
   1860 		for_each_possible_cpu(cpu) {
   1861 			void *start = per_cpu_ptr(base, cpu);
   1862 
   1863 			if (addr >= start && addr < start + pcpu_unit_size) {
   1864 				in_first_chunk = true;
   1865 				break;
   1866 			}
   1867 		}
   1868 	}
   1869 
   1870 	if (in_first_chunk) {
   1871 		if (!is_vmalloc_addr(addr))
   1872 			return __pa(addr);
   1873 		else
   1874 			return page_to_phys(vmalloc_to_page(addr)) +
   1875 			       offset_in_page(addr);
   1876 	} else
   1877 		return page_to_phys(pcpu_addr_to_page(addr)) +
   1878 		       offset_in_page(addr);
   1879 }
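/*
 * Example (illustrative sketch): translating each cpu's copy of a
 * dynamically allocated percpu buffer to a physical address, e.g. for
 * debugging or for handing per-cpu descriptors to a hypothetical device.
 *
 *	void __percpu *buf = __alloc_percpu(256, SMP_CACHE_BYTES);
 *	phys_addr_t phys;
 *	int cpu;
 *
 *	if (buf)
 *		for_each_possible_cpu(cpu)
 *			phys = per_cpu_ptr_to_phys(per_cpu_ptr(buf, cpu));
 */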
   1880 
   1881 /**
   1882  * pcpu_alloc_alloc_info - allocate percpu allocation info
   1883  * @nr_groups: the number of groups
   1884  * @nr_units: the number of units
   1885  *
   1886  * Allocate ai which is large enough for @nr_groups groups containing
   1887  * @nr_units units.  The returned ai's groups[0].cpu_map points to the
   1888  * cpu_map array which is long enough for @nr_units and filled with
   1889  * NR_CPUS.  It's the caller's responsibility to initialize cpu_map
   1890  * pointer of other groups.
   1891  *
   1892  * RETURNS:
   1893  * Pointer to the allocated pcpu_alloc_info on success, NULL on
   1894  * failure.
   1895  */
   1896 struct pcpu_alloc_info * __init pcpu_alloc_alloc_info(int nr_groups,
   1897 						      int nr_units)
   1898 {
   1899 	struct pcpu_alloc_info *ai;
   1900 	size_t base_size, ai_size;
   1901 	void *ptr;
   1902 	int unit;
   1903 
   1904 	base_size = ALIGN(sizeof(*ai) + nr_groups * sizeof(ai->groups[0]),
   1905 			  __alignof__(ai->groups[0].cpu_map[0]));
   1906 	ai_size = base_size + nr_units * sizeof(ai->groups[0].cpu_map[0]);
   1907 
   1908 	ptr = memblock_alloc(PFN_ALIGN(ai_size), PAGE_SIZE);
   1909 	if (!ptr)
   1910 		return NULL;
   1911 	ai = ptr;
   1912 	ptr += base_size;
   1913 
   1914 	ai->groups[0].cpu_map = ptr;
   1915 
   1916 	for (unit = 0; unit < nr_units; unit++)
   1917 		ai->groups[0].cpu_map[unit] = NR_CPUS;
   1918 
   1919 	ai->nr_groups = nr_groups;
   1920 	ai->__ai_size = PFN_ALIGN(ai_size);
   1921 
   1922 	return ai;
   1923 }
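/*
 * Example (illustrative): the simplest descriptor is a single group with
 * a single unit, which is what the UP setup_per_cpu_areas() at the bottom
 * of this file builds:
 *
 *	ai = pcpu_alloc_alloc_info(1, 1);
 *	ai->groups[0].nr_units = 1;
 *	ai->groups[0].cpu_map[0] = 0;
 *	(fill in the size fields, then)
 *	pcpu_setup_first_chunk(ai, fc);
 *	pcpu_free_alloc_info(ai);
 */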
   1924 
   1925 /**
   1926  * pcpu_free_alloc_info - free percpu allocation info
   1927  * @ai: pcpu_alloc_info to free
   1928  *
   1929  * Free @ai which was allocated by pcpu_alloc_alloc_info().
   1930  */
   1931 void __init pcpu_free_alloc_info(struct pcpu_alloc_info *ai)
   1932 {
   1933 	memblock_free_early(__pa(ai), ai->__ai_size);
   1934 }
   1935 
   1936 /**
   1937  * pcpu_dump_alloc_info - print out information about pcpu_alloc_info
   1938  * @lvl: loglevel
   1939  * @ai: allocation info to dump
   1940  *
   1941  * Print out information about @ai using loglevel @lvl.
   1942  */
   1943 static void pcpu_dump_alloc_info(const char *lvl,
   1944 				 const struct pcpu_alloc_info *ai)
   1945 {
   1946 	int group_width = 1, cpu_width = 1, width;
   1947 	char empty_str[] = "--------";
   1948 	int alloc = 0, alloc_end = 0;
   1949 	int group, v;
   1950 	int upa, apl;	/* units per alloc, allocs per line */
   1951 
   1952 	v = ai->nr_groups;
   1953 	while (v /= 10)
   1954 		group_width++;
   1955 
   1956 	v = num_possible_cpus();
   1957 	while (v /= 10)
   1958 		cpu_width++;
   1959 	empty_str[min_t(int, cpu_width, sizeof(empty_str) - 1)] = '\0';
   1960 
   1961 	upa = ai->alloc_size / ai->unit_size;
   1962 	width = upa * (cpu_width + 1) + group_width + 3;
   1963 	apl = rounddown_pow_of_two(max(60 / width, 1));
   1964 
   1965 	printk("%spcpu-alloc: s%zu r%zu d%zu u%zu alloc=%zu*%zu",
   1966 	       lvl, ai->static_size, ai->reserved_size, ai->dyn_size,
   1967 	       ai->unit_size, ai->alloc_size / ai->atom_size, ai->atom_size);
   1968 
   1969 	for (group = 0; group < ai->nr_groups; group++) {
   1970 		const struct pcpu_group_info *gi = &ai->groups[group];
   1971 		int unit = 0, unit_end = 0;
   1972 
   1973 		BUG_ON(gi->nr_units % upa);
   1974 		for (alloc_end += gi->nr_units / upa;
   1975 		     alloc < alloc_end; alloc++) {
   1976 			if (!(alloc % apl)) {
   1977 				pr_cont("\n");
   1978 				printk("%spcpu-alloc: ", lvl);
   1979 			}
   1980 			pr_cont("[%0*d] ", group_width, group);
   1981 
   1982 			for (unit_end += upa; unit < unit_end; unit++)
   1983 				if (gi->cpu_map[unit] != NR_CPUS)
   1984 					pr_cont("%0*d ",
   1985 						cpu_width, gi->cpu_map[unit]);
   1986 				else
   1987 					pr_cont("%s ", empty_str);
   1988 		}
   1989 	}
   1990 	pr_cont("\n");
   1991 }
   1992 
   1993 /**
   1994  * pcpu_setup_first_chunk - initialize the first percpu chunk
    1995  * @ai: pcpu_alloc_info describing how the percpu area is shaped
   1996  * @base_addr: mapped address
   1997  *
   1998  * Initialize the first percpu chunk which contains the kernel static
    1999  * percpu area.  This function is to be called from the arch percpu area
   2000  * setup path.
   2001  *
   2002  * @ai contains all information necessary to initialize the first
   2003  * chunk and prime the dynamic percpu allocator.
   2004  *
   2005  * @ai->static_size is the size of static percpu area.
   2006  *
   2007  * @ai->reserved_size, if non-zero, specifies the amount of bytes to
   2008  * reserve after the static area in the first chunk.  This reserves
    2009  * reserve after the static area in the first chunk.  This reserves
    2010  * that part of the first chunk so that it's available only through reserved
   2011  * static areas on architectures where the addressing model has
   2012  * limited offset range for symbol relocations to guarantee module
   2013  * percpu symbols fall inside the relocatable range.
   2014  *
   2015  * @ai->dyn_size determines the number of bytes available for dynamic
   2016  * allocation in the first chunk.  The area between @ai->static_size +
   2017  * @ai->reserved_size + @ai->dyn_size and @ai->unit_size is unused.
   2018  *
   2019  * @ai->unit_size specifies unit size and must be aligned to PAGE_SIZE
   2020  * and equal to or larger than @ai->static_size + @ai->reserved_size +
   2021  * @ai->dyn_size.
   2022  *
   2023  * @ai->atom_size is the allocation atom size and used as alignment
   2024  * for vm areas.
   2025  *
   2026  * @ai->alloc_size is the allocation size and always multiple of
   2027  * @ai->atom_size.  This is larger than @ai->atom_size if
   2028  * @ai->unit_size is larger than @ai->atom_size.
   2029  *
   2030  * @ai->nr_groups and @ai->groups describe virtual memory layout of
   2031  * percpu areas.  Units which should be colocated are put into the
   2032  * same group.  Dynamic VM areas will be allocated according to these
   2033  * groupings.  If @ai->nr_groups is zero, a single group containing
   2034  * all units is assumed.
   2035  *
   2036  * The caller should have mapped the first chunk at @base_addr and
   2037  * copied static data to each unit.
   2038  *
   2039  * The first chunk will always contain a static and a dynamic region.
   2040  * However, the static region is not managed by any chunk.  If the first
   2041  * chunk also contains a reserved region, it is served by two chunks -
   2042  * one for the reserved region and one for the dynamic region.  They
   2043  * share the same vm, but use offset regions in the area allocation map.
   2044  * The chunk serving the dynamic region is circulated in the chunk slots
   2045  * and available for dynamic allocation like any other chunk.
   2046  *
   2047  * RETURNS:
   2048  * 0 on success, -errno on failure.
   2049  */
   2050 int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
   2051 				  void *base_addr)
   2052 {
   2053 	size_t size_sum = ai->static_size + ai->reserved_size + ai->dyn_size;
   2054 	size_t static_size, dyn_size;
   2055 	struct pcpu_chunk *chunk;
   2056 	unsigned long *group_offsets;
   2057 	size_t *group_sizes;
   2058 	unsigned long *unit_off;
   2059 	unsigned int cpu;
   2060 	int *unit_map;
   2061 	int group, unit, i;
   2062 	int map_size;
   2063 	unsigned long tmp_addr;
   2064 	size_t alloc_size;
   2065 
   2066 #define PCPU_SETUP_BUG_ON(cond)	do {					\
   2067 	if (unlikely(cond)) {						\
   2068 		pr_emerg("failed to initialize, %s\n", #cond);		\
   2069 		pr_emerg("cpu_possible_mask=%*pb\n",			\
   2070 			 cpumask_pr_args(cpu_possible_mask));		\
   2071 		pcpu_dump_alloc_info(KERN_EMERG, ai);			\
   2072 		BUG();							\
   2073 	}								\
   2074 } while (0)
   2075 
   2076 	/* sanity checks */
   2077 	PCPU_SETUP_BUG_ON(ai->nr_groups <= 0);
   2078 #ifdef CONFIG_SMP
   2079 	PCPU_SETUP_BUG_ON(!ai->static_size);
   2080 	PCPU_SETUP_BUG_ON(offset_in_page(__per_cpu_start));
   2081 #endif
   2082 	PCPU_SETUP_BUG_ON(!base_addr);
   2083 	PCPU_SETUP_BUG_ON(offset_in_page(base_addr));
   2084 	PCPU_SETUP_BUG_ON(ai->unit_size < size_sum);
   2085 	PCPU_SETUP_BUG_ON(offset_in_page(ai->unit_size));
   2086 	PCPU_SETUP_BUG_ON(ai->unit_size < PCPU_MIN_UNIT_SIZE);
   2087 	PCPU_SETUP_BUG_ON(!IS_ALIGNED(ai->unit_size, PCPU_BITMAP_BLOCK_SIZE));
   2088 	PCPU_SETUP_BUG_ON(ai->dyn_size < PERCPU_DYNAMIC_EARLY_SIZE);
   2089 	PCPU_SETUP_BUG_ON(!ai->dyn_size);
   2090 	PCPU_SETUP_BUG_ON(!IS_ALIGNED(ai->reserved_size, PCPU_MIN_ALLOC_SIZE));
   2091 	PCPU_SETUP_BUG_ON(!(IS_ALIGNED(PCPU_BITMAP_BLOCK_SIZE, PAGE_SIZE) ||
   2092 			    IS_ALIGNED(PAGE_SIZE, PCPU_BITMAP_BLOCK_SIZE)));
   2093 	PCPU_SETUP_BUG_ON(pcpu_verify_alloc_info(ai) < 0);
   2094 
   2095 	/* process group information and build config tables accordingly */
   2096 	alloc_size = ai->nr_groups * sizeof(group_offsets[0]);
   2097 	group_offsets = memblock_alloc(alloc_size, SMP_CACHE_BYTES);
   2098 	if (!group_offsets)
   2099 		panic("%s: Failed to allocate %zu bytes\n", __func__,
   2100 		      alloc_size);
   2101 
   2102 	alloc_size = ai->nr_groups * sizeof(group_sizes[0]);
   2103 	group_sizes = memblock_alloc(alloc_size, SMP_CACHE_BYTES);
   2104 	if (!group_sizes)
   2105 		panic("%s: Failed to allocate %zu bytes\n", __func__,
   2106 		      alloc_size);
   2107 
   2108 	alloc_size = nr_cpu_ids * sizeof(unit_map[0]);
   2109 	unit_map = memblock_alloc(alloc_size, SMP_CACHE_BYTES);
   2110 	if (!unit_map)
   2111 		panic("%s: Failed to allocate %zu bytes\n", __func__,
   2112 		      alloc_size);
   2113 
   2114 	alloc_size = nr_cpu_ids * sizeof(unit_off[0]);
   2115 	unit_off = memblock_alloc(alloc_size, SMP_CACHE_BYTES);
   2116 	if (!unit_off)
   2117 		panic("%s: Failed to allocate %zu bytes\n", __func__,
   2118 		      alloc_size);
   2119 
   2120 	for (cpu = 0; cpu < nr_cpu_ids; cpu++)
   2121 		unit_map[cpu] = UINT_MAX;
   2122 
   2123 	pcpu_low_unit_cpu = NR_CPUS;
   2124 	pcpu_high_unit_cpu = NR_CPUS;
   2125 
   2126 	for (group = 0, unit = 0; group < ai->nr_groups; group++, unit += i) {
   2127 		const struct pcpu_group_info *gi = &ai->groups[group];
   2128 
   2129 		group_offsets[group] = gi->base_offset;
   2130 		group_sizes[group] = gi->nr_units * ai->unit_size;
   2131 
   2132 		for (i = 0; i < gi->nr_units; i++) {
   2133 			cpu = gi->cpu_map[i];
   2134 			if (cpu == NR_CPUS)
   2135 				continue;
   2136 
   2137 			PCPU_SETUP_BUG_ON(cpu >= nr_cpu_ids);
   2138 			PCPU_SETUP_BUG_ON(!cpu_possible(cpu));
   2139 			PCPU_SETUP_BUG_ON(unit_map[cpu] != UINT_MAX);
   2140 
   2141 			unit_map[cpu] = unit + i;
   2142 			unit_off[cpu] = gi->base_offset + i * ai->unit_size;
   2143 
   2144 			/* determine low/high unit_cpu */
   2145 			if (pcpu_low_unit_cpu == NR_CPUS ||
   2146 			    unit_off[cpu] < unit_off[pcpu_low_unit_cpu])
   2147 				pcpu_low_unit_cpu = cpu;
   2148 			if (pcpu_high_unit_cpu == NR_CPUS ||
   2149 			    unit_off[cpu] > unit_off[pcpu_high_unit_cpu])
   2150 				pcpu_high_unit_cpu = cpu;
   2151 		}
   2152 	}
   2153 	pcpu_nr_units = unit;
   2154 
   2155 	for_each_possible_cpu(cpu)
   2156 		PCPU_SETUP_BUG_ON(unit_map[cpu] == UINT_MAX);
   2157 
   2158 	/* we're done parsing the input, undefine BUG macro and dump config */
   2159 #undef PCPU_SETUP_BUG_ON
   2160 	pcpu_dump_alloc_info(KERN_DEBUG, ai);
   2161 
   2162 	pcpu_nr_groups = ai->nr_groups;
   2163 	pcpu_group_offsets = group_offsets;
   2164 	pcpu_group_sizes = group_sizes;
   2165 	pcpu_unit_map = unit_map;
   2166 	pcpu_unit_offsets = unit_off;
   2167 
   2168 	/* determine basic parameters */
   2169 	pcpu_unit_pages = ai->unit_size >> PAGE_SHIFT;
   2170 	pcpu_unit_size = pcpu_unit_pages << PAGE_SHIFT;
   2171 	pcpu_atom_size = ai->atom_size;
   2172 	pcpu_chunk_struct_size = sizeof(struct pcpu_chunk) +
   2173 		BITS_TO_LONGS(pcpu_unit_pages) * sizeof(unsigned long);
   2174 
   2175 	pcpu_stats_save_ai(ai);
   2176 
   2177 	/*
   2178 	 * Allocate chunk slots.  The additional last slot is for
   2179 	 * empty chunks.
   2180 	 */
   2181 	pcpu_nr_slots = __pcpu_size_to_slot(pcpu_unit_size) + 2;
   2182 	pcpu_slot = memblock_alloc(pcpu_nr_slots * sizeof(pcpu_slot[0]),
   2183 				   SMP_CACHE_BYTES);
   2184 	if (!pcpu_slot)
   2185 		panic("%s: Failed to allocate %zu bytes\n", __func__,
   2186 		      pcpu_nr_slots * sizeof(pcpu_slot[0]));
   2187 	for (i = 0; i < pcpu_nr_slots; i++)
   2188 		INIT_LIST_HEAD(&pcpu_slot[i]);
   2189 
   2190 	/*
   2191 	 * The end of the static region needs to be aligned with the
   2192 	 * minimum allocation size as this offsets the reserved and
   2193 	 * dynamic region.  The first chunk ends page aligned by
   2194 	 * expanding the dynamic region, therefore the dynamic region
   2195 	 * can be shrunk to compensate while still staying above the
   2196 	 * configured sizes.
   2197 	 */
   2198 	static_size = ALIGN(ai->static_size, PCPU_MIN_ALLOC_SIZE);
   2199 	dyn_size = ai->dyn_size - (static_size - ai->static_size);
   2200 
   2201 	/*
   2202 	 * Initialize first chunk.
   2203 	 * If the reserved_size is non-zero, this initializes the reserved
   2204 	 * chunk.  If the reserved_size is zero, the reserved chunk is NULL
   2205 	 * and the dynamic region is initialized here.  The first chunk,
   2206 	 * pcpu_first_chunk, will always point to the chunk that serves
   2207 	 * the dynamic region.
   2208 	 */
   2209 	tmp_addr = (unsigned long)base_addr + static_size;
   2210 	map_size = ai->reserved_size ?: dyn_size;
   2211 	chunk = pcpu_alloc_first_chunk(tmp_addr, map_size);
   2212 
   2213 	/* init dynamic chunk if necessary */
   2214 	if (ai->reserved_size) {
   2215 		pcpu_reserved_chunk = chunk;
   2216 
   2217 		tmp_addr = (unsigned long)base_addr + static_size +
   2218 			   ai->reserved_size;
   2219 		map_size = dyn_size;
   2220 		chunk = pcpu_alloc_first_chunk(tmp_addr, map_size);
   2221 	}
   2222 
   2223 	/* link the first chunk in */
   2224 	pcpu_first_chunk = chunk;
   2225 	pcpu_nr_empty_pop_pages = pcpu_first_chunk->nr_empty_pop_pages;
   2226 	pcpu_chunk_relocate(pcpu_first_chunk, -1);
   2227 
   2228 	/* include all regions of the first chunk */
   2229 	pcpu_nr_populated += PFN_DOWN(size_sum);
   2230 
   2231 	pcpu_stats_chunk_alloc();
   2232 	trace_percpu_create_chunk(base_addr);
   2233 
   2234 	/* we're done */
   2235 	pcpu_base_addr = base_addr;
   2236 	return 0;
   2237 }
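/*
 * Worked example (illustrative numbers only): with ai->static_size = 64K,
 * ai->reserved_size = 8K and ai->dyn_size = 28K, size_sum is 100K and a
 * page aligned unit_size of 128K leaves 28K at the end of each unit
 * unused.  The reserved chunk then manages [64K, 72K) of each unit while
 * pcpu_first_chunk manages [72K, 100K), both relative to @base_addr.
 */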
   2238 
   2239 #ifdef CONFIG_SMP
   2240 
   2241 const char * const pcpu_fc_names[PCPU_FC_NR] __initconst = {
   2242 	[PCPU_FC_AUTO]	= "auto",
   2243 	[PCPU_FC_EMBED]	= "embed",
   2244 	[PCPU_FC_PAGE]	= "page",
   2245 };
   2246 
   2247 enum pcpu_fc pcpu_chosen_fc __initdata = PCPU_FC_AUTO;
   2248 
   2249 static int __init percpu_alloc_setup(char *str)
   2250 {
   2251 	if (!str)
   2252 		return -EINVAL;
   2253 
   2254 	if (0)
   2255 		/* nada */;
   2256 #ifdef CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK
   2257 	else if (!strcmp(str, "embed"))
   2258 		pcpu_chosen_fc = PCPU_FC_EMBED;
   2259 #endif
   2260 #ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK
   2261 	else if (!strcmp(str, "page"))
   2262 		pcpu_chosen_fc = PCPU_FC_PAGE;
   2263 #endif
   2264 	else
   2265 		pr_warn("unknown allocator %s specified\n", str);
   2266 
   2267 	return 0;
   2268 }
   2269 early_param("percpu_alloc", percpu_alloc_setup);
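/*
 * Example: booting with "percpu_alloc=page" on the kernel command line
 * selects the page-based first chunk allocator on configs that provide
 * it; this can help when the embedding allocator's sparse cpu->unit
 * mapping would not fit into the vmalloc space (see the comment above
 * pcpu_embed_first_chunk() below).
 */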
   2270 
   2271 /*
   2272  * pcpu_embed_first_chunk() is used by the generic percpu setup.
    2273  * Build it if it is needed by the arch config or if the generic setup
    2274  * is going to be used.
   2275  */
   2276 #if defined(CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK) || \
   2277 	!defined(CONFIG_HAVE_SETUP_PER_CPU_AREA)
   2278 #define BUILD_EMBED_FIRST_CHUNK
   2279 #endif
   2280 
   2281 /* build pcpu_page_first_chunk() iff needed by the arch config */
   2282 #if defined(CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK)
   2283 #define BUILD_PAGE_FIRST_CHUNK
   2284 #endif
   2285 
   2286 /* pcpu_build_alloc_info() is used by both embed and page first chunk */
   2287 #if defined(BUILD_EMBED_FIRST_CHUNK) || defined(BUILD_PAGE_FIRST_CHUNK)
   2288 /**
   2289  * pcpu_build_alloc_info - build alloc_info considering distances between CPUs
   2290  * @reserved_size: the size of reserved percpu area in bytes
   2291  * @dyn_size: minimum free size for dynamic allocation in bytes
   2292  * @atom_size: allocation atom size
   2293  * @cpu_distance_fn: callback to determine distance between cpus, optional
   2294  *
   2295  * This function determines grouping of units, their mappings to cpus
   2296  * and other parameters considering needed percpu size, allocation
   2297  * atom size and distances between CPUs.
   2298  *
   2299  * Groups are always multiples of atom size and CPUs which are of
   2300  * LOCAL_DISTANCE both ways are grouped together and share space for
   2301  * units in the same group.  The returned configuration is guaranteed
   2302  * to have CPUs on different nodes on different groups and >=75% usage
   2303  * of allocated virtual address space.
   2304  *
   2305  * RETURNS:
   2306  * On success, pointer to the new allocation_info is returned.  On
   2307  * failure, ERR_PTR value is returned.
   2308  */
   2309 static struct pcpu_alloc_info * __init pcpu_build_alloc_info(
   2310 				size_t reserved_size, size_t dyn_size,
   2311 				size_t atom_size,
   2312 				pcpu_fc_cpu_distance_fn_t cpu_distance_fn)
   2313 {
   2314 	static int group_map[NR_CPUS] __initdata;
   2315 	static int group_cnt[NR_CPUS] __initdata;
   2316 	const size_t static_size = __per_cpu_end - __per_cpu_start;
   2317 	int nr_groups = 1, nr_units = 0;
   2318 	size_t size_sum, min_unit_size, alloc_size;
   2319 	int upa, max_upa, uninitialized_var(best_upa);	/* units_per_alloc */
   2320 	int last_allocs, group, unit;
   2321 	unsigned int cpu, tcpu;
   2322 	struct pcpu_alloc_info *ai;
   2323 	unsigned int *cpu_map;
   2324 
   2325 	/* this function may be called multiple times */
   2326 	memset(group_map, 0, sizeof(group_map));
   2327 	memset(group_cnt, 0, sizeof(group_cnt));
   2328 
   2329 	/* calculate size_sum and ensure dyn_size is enough for early alloc */
   2330 	size_sum = PFN_ALIGN(static_size + reserved_size +
   2331 			    max_t(size_t, dyn_size, PERCPU_DYNAMIC_EARLY_SIZE));
   2332 	dyn_size = size_sum - static_size - reserved_size;
   2333 
   2334 	/*
   2335 	 * Determine min_unit_size, alloc_size and max_upa such that
   2336 	 * alloc_size is multiple of atom_size and is the smallest
   2337 	 * which can accommodate 4k aligned segments which are equal to
   2338 	 * or larger than min_unit_size.
   2339 	 */
   2340 	min_unit_size = max_t(size_t, size_sum, PCPU_MIN_UNIT_SIZE);
   2341 
   2342 	/* determine the maximum # of units that can fit in an allocation */
   2343 	alloc_size = roundup(min_unit_size, atom_size);
   2344 	upa = alloc_size / min_unit_size;
   2345 	while (alloc_size % upa || (offset_in_page(alloc_size / upa)))
   2346 		upa--;
   2347 	max_upa = upa;
   2348 
   2349 	/* group cpus according to their proximity */
   2350 	for_each_possible_cpu(cpu) {
   2351 		group = 0;
   2352 	next_group:
   2353 		for_each_possible_cpu(tcpu) {
   2354 			if (cpu == tcpu)
   2355 				break;
   2356 			if (group_map[tcpu] == group && cpu_distance_fn &&
   2357 			    (cpu_distance_fn(cpu, tcpu) > LOCAL_DISTANCE ||
   2358 			     cpu_distance_fn(tcpu, cpu) > LOCAL_DISTANCE)) {
   2359 				group++;
   2360 				nr_groups = max(nr_groups, group + 1);
   2361 				goto next_group;
   2362 			}
   2363 		}
   2364 		group_map[cpu] = group;
   2365 		group_cnt[group]++;
   2366 	}
   2367 
   2368 	/*
   2369 	 * Wasted space is caused by a ratio imbalance of upa to group_cnt.
   2370 	 * Expand the unit_size until we use >= 75% of the units allocated.
   2371 	 * Related to atom_size, which could be much larger than the unit_size.
   2372 	 */
   2373 	last_allocs = INT_MAX;
   2374 	for (upa = max_upa; upa; upa--) {
   2375 		int allocs = 0, wasted = 0;
   2376 
   2377 		if (alloc_size % upa || (offset_in_page(alloc_size / upa)))
   2378 			continue;
   2379 
   2380 		for (group = 0; group < nr_groups; group++) {
   2381 			int this_allocs = DIV_ROUND_UP(group_cnt[group], upa);
   2382 			allocs += this_allocs;
   2383 			wasted += this_allocs * upa - group_cnt[group];
   2384 		}
   2385 
   2386 		/*
   2387 		 * Don't accept if wastage is over 1/3.  The
   2388 		 * greater-than comparison ensures upa==1 always
   2389 		 * passes the following check.
   2390 		 */
   2391 		if (wasted > num_possible_cpus() / 3)
   2392 			continue;
   2393 
   2394 		/* and then don't consume more memory */
   2395 		if (allocs > last_allocs)
   2396 			break;
   2397 		last_allocs = allocs;
   2398 		best_upa = upa;
   2399 	}
   2400 	upa = best_upa;
   2401 
   2402 	/* allocate and fill alloc_info */
   2403 	for (group = 0; group < nr_groups; group++)
   2404 		nr_units += roundup(group_cnt[group], upa);
   2405 
   2406 	ai = pcpu_alloc_alloc_info(nr_groups, nr_units);
   2407 	if (!ai)
   2408 		return ERR_PTR(-ENOMEM);
   2409 	cpu_map = ai->groups[0].cpu_map;
   2410 
   2411 	for (group = 0; group < nr_groups; group++) {
   2412 		ai->groups[group].cpu_map = cpu_map;
   2413 		cpu_map += roundup(group_cnt[group], upa);
   2414 	}
   2415 
   2416 	ai->static_size = static_size;
   2417 	ai->reserved_size = reserved_size;
   2418 	ai->dyn_size = dyn_size;
   2419 	ai->unit_size = alloc_size / upa;
   2420 	ai->atom_size = atom_size;
   2421 	ai->alloc_size = alloc_size;
   2422 
   2423 	for (group = 0, unit = 0; group < nr_groups; group++) {
   2424 		struct pcpu_group_info *gi = &ai->groups[group];
   2425 
   2426 		/*
   2427 		 * Initialize base_offset as if all groups are located
   2428 		 * back-to-back.  The caller should update this to
   2429 		 * reflect actual allocation.
   2430 		 */
   2431 		gi->base_offset = unit * ai->unit_size;
   2432 
   2433 		for_each_possible_cpu(cpu)
   2434 			if (group_map[cpu] == group)
   2435 				gi->cpu_map[gi->nr_units++] = cpu;
   2436 		gi->nr_units = roundup(gi->nr_units, upa);
   2437 		unit += gi->nr_units;
   2438 	}
   2439 	BUG_ON(unit != nr_units);
   2440 
   2441 	return ai;
   2442 }
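/*
 * Worked example (illustrative numbers only): with size_sum = 64K and
 * atom_size = 2M, alloc_size = roundup(64K, 2M) = 2M and max_upa =
 * 2M / 64K = 32.  Eight possible cpus in a single group would then waste
 * 32 - 8 = 24 units, more than a third, so the loop above keeps shrinking
 * upa and settles on upa = 8, i.e. a 256K unit_size with no wasted units.
 */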
   2443 #endif /* BUILD_EMBED_FIRST_CHUNK || BUILD_PAGE_FIRST_CHUNK */
   2444 
   2445 #if defined(BUILD_EMBED_FIRST_CHUNK)
   2446 /**
   2447  * pcpu_embed_first_chunk - embed the first percpu chunk into bootmem
   2448  * @reserved_size: the size of reserved percpu area in bytes
   2449  * @dyn_size: minimum free size for dynamic allocation in bytes
   2450  * @atom_size: allocation atom size
   2451  * @cpu_distance_fn: callback to determine distance between cpus, optional
   2452  * @alloc_fn: function to allocate percpu page
   2453  * @free_fn: function to free percpu page
   2454  *
   2455  * This is a helper to ease setting up embedded first percpu chunk and
   2456  * can be called where pcpu_setup_first_chunk() is expected.
   2457  *
   2458  * If this function is used to setup the first chunk, it is allocated
   2459  * by calling @alloc_fn and used as-is without being mapped into
   2460  * vmalloc area.  Allocations are always whole multiples of @atom_size
   2461  * aligned to @atom_size.
   2462  *
   2463  * This enables the first chunk to piggy back on the linear physical
   2464  * mapping which often uses larger page size.  Please note that this
   2465  * can result in very sparse cpu->unit mapping on NUMA machines thus
   2466  * requiring large vmalloc address space.  Don't use this allocator if
   2467  * vmalloc space is not orders of magnitude larger than distances
   2468  * between node memory addresses (ie. 32bit NUMA machines).
   2469  *
   2470  * @dyn_size specifies the minimum dynamic area size.
   2471  *
   2472  * If the needed size is smaller than the minimum or specified unit
   2473  * size, the leftover is returned using @free_fn.
   2474  *
   2475  * RETURNS:
   2476  * 0 on success, -errno on failure.
   2477  */
   2478 int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size,
   2479 				  size_t atom_size,
   2480 				  pcpu_fc_cpu_distance_fn_t cpu_distance_fn,
   2481 				  pcpu_fc_alloc_fn_t alloc_fn,
   2482 				  pcpu_fc_free_fn_t free_fn)
   2483 {
   2484 	void *base = (void *)ULONG_MAX;
   2485 	void **areas = NULL;
   2486 	struct pcpu_alloc_info *ai;
   2487 	size_t size_sum, areas_size;
   2488 	unsigned long max_distance;
   2489 	int group, i, highest_group, rc;
   2490 
   2491 	ai = pcpu_build_alloc_info(reserved_size, dyn_size, atom_size,
   2492 				   cpu_distance_fn);
   2493 	if (IS_ERR(ai))
   2494 		return PTR_ERR(ai);
   2495 
   2496 	size_sum = ai->static_size + ai->reserved_size + ai->dyn_size;
   2497 	areas_size = PFN_ALIGN(ai->nr_groups * sizeof(void *));
   2498 
   2499 	areas = memblock_alloc(areas_size, SMP_CACHE_BYTES);
   2500 	if (!areas) {
   2501 		rc = -ENOMEM;
   2502 		goto out_free;
   2503 	}
   2504 
   2505 	/* allocate, copy and determine base address & max_distance */
   2506 	highest_group = 0;
   2507 	for (group = 0; group < ai->nr_groups; group++) {
   2508 		struct pcpu_group_info *gi = &ai->groups[group];
   2509 		unsigned int cpu = NR_CPUS;
   2510 		void *ptr;
   2511 
   2512 		for (i = 0; i < gi->nr_units && cpu == NR_CPUS; i++)
   2513 			cpu = gi->cpu_map[i];
   2514 		BUG_ON(cpu == NR_CPUS);
   2515 
   2516 		/* allocate space for the whole group */
   2517 		ptr = alloc_fn(cpu, gi->nr_units * ai->unit_size, atom_size);
   2518 		if (!ptr) {
   2519 			rc = -ENOMEM;
   2520 			goto out_free_areas;
   2521 		}
   2522 		/* kmemleak tracks the percpu allocations separately */
   2523 		kmemleak_free(ptr);
   2524 		areas[group] = ptr;
   2525 
   2526 		base = min(ptr, base);
   2527 		if (ptr > areas[highest_group])
   2528 			highest_group = group;
   2529 	}
   2530 	max_distance = areas[highest_group] - base;
   2531 	max_distance += ai->unit_size * ai->groups[highest_group].nr_units;
   2532 
   2533 	/* warn if maximum distance is further than 75% of vmalloc space */
   2534 	if (max_distance > VMALLOC_TOTAL * 3 / 4) {
   2535 		pr_warn("max_distance=0x%lx too large for vmalloc space 0x%lx\n",
   2536 				max_distance, VMALLOC_TOTAL);
   2537 #ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK
   2538 		/* and fail if we have fallback */
   2539 		rc = -EINVAL;
   2540 		goto out_free_areas;
   2541 #endif
   2542 	}
   2543 
   2544 	/*
   2545 	 * Copy data and free unused parts.  This should happen after all
   2546 	 * allocations are complete; otherwise, we may end up with
   2547 	 * overlapping groups.
   2548 	 */
   2549 	for (group = 0; group < ai->nr_groups; group++) {
   2550 		struct pcpu_group_info *gi = &ai->groups[group];
   2551 		void *ptr = areas[group];
   2552 
   2553 		for (i = 0; i < gi->nr_units; i++, ptr += ai->unit_size) {
   2554 			if (gi->cpu_map[i] == NR_CPUS) {
   2555 				/* unused unit, free whole */
   2556 				free_fn(ptr, ai->unit_size);
   2557 				continue;
   2558 			}
   2559 			/* copy and return the unused part */
   2560 			memcpy(ptr, __per_cpu_load, ai->static_size);
   2561 			free_fn(ptr + size_sum, ai->unit_size - size_sum);
   2562 		}
   2563 	}
   2564 
   2565 	/* base address is now known, determine group base offsets */
   2566 	for (group = 0; group < ai->nr_groups; group++) {
   2567 		ai->groups[group].base_offset = areas[group] - base;
   2568 	}
   2569 
   2570 	pr_info("Embedded %zu pages/cpu s%zu r%zu d%zu u%zu\n",
   2571 		PFN_DOWN(size_sum), ai->static_size, ai->reserved_size,
   2572 		ai->dyn_size, ai->unit_size);
   2573 
   2574 	rc = pcpu_setup_first_chunk(ai, base);
   2575 	goto out_free;
   2576 
   2577 out_free_areas:
   2578 	for (group = 0; group < ai->nr_groups; group++)
   2579 		if (areas[group])
   2580 			free_fn(areas[group],
   2581 				ai->groups[group].nr_units * ai->unit_size);
   2582 out_free:
   2583 	pcpu_free_alloc_info(ai);
   2584 	if (areas)
   2585 		memblock_free_early(__pa(areas), areas_size);
   2586 	return rc;
   2587 }
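/*
 * Example (illustrative sketch, not part of this file): a NUMA-aware arch
 * might wire this up with node-local early allocations roughly as below.
 * "pcpu_fc_alloc", "pcpu_fc_free" and "pcpu_cpu_distance" are hypothetical
 * names; the node lookup and atom size are arch specific.
 *
 *	static void * __init pcpu_fc_alloc(unsigned int cpu, size_t size,
 *					   size_t align)
 *	{
 *		return memblock_alloc_try_nid(size, align,
 *					      __pa(MAX_DMA_ADDRESS),
 *					      MEMBLOCK_ALLOC_ACCESSIBLE,
 *					      cpu_to_node(cpu));
 *	}
 *
 *	static void __init pcpu_fc_free(void *ptr, size_t size)
 *	{
 *		memblock_free_early(__pa(ptr), size);
 *	}
 *
 *	rc = pcpu_embed_first_chunk(PERCPU_MODULE_RESERVE,
 *				    PERCPU_DYNAMIC_RESERVE, PMD_SIZE,
 *				    pcpu_cpu_distance, pcpu_fc_alloc,
 *				    pcpu_fc_free);
 */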
   2588 #endif /* BUILD_EMBED_FIRST_CHUNK */
   2589 
   2590 #ifdef BUILD_PAGE_FIRST_CHUNK
   2591 /**
   2592  * pcpu_page_first_chunk - map the first chunk using PAGE_SIZE pages
   2593  * @reserved_size: the size of reserved percpu area in bytes
   2594  * @alloc_fn: function to allocate percpu page, always called with PAGE_SIZE
   2595  * @free_fn: function to free percpu page, always called with PAGE_SIZE
   2596  * @populate_pte_fn: function to populate pte
   2597  *
   2598  * This is a helper to ease setting up page-remapped first percpu
   2599  * chunk and can be called where pcpu_setup_first_chunk() is expected.
   2600  *
   2601  * This is the basic allocator.  Static percpu area is allocated
   2602  * page-by-page into vmalloc area.
   2603  *
   2604  * RETURNS:
   2605  * 0 on success, -errno on failure.
   2606  */
   2607 int __init pcpu_page_first_chunk(size_t reserved_size,
   2608 				 pcpu_fc_alloc_fn_t alloc_fn,
   2609 				 pcpu_fc_free_fn_t free_fn,
   2610 				 pcpu_fc_populate_pte_fn_t populate_pte_fn)
   2611 {
   2612 	static struct vm_struct vm;
   2613 	struct pcpu_alloc_info *ai;
   2614 	char psize_str[16];
   2615 	int unit_pages;
   2616 	size_t pages_size;
   2617 	struct page **pages;
   2618 	int unit, i, j, rc;
   2619 	int upa;
   2620 	int nr_g0_units;
   2621 
   2622 	snprintf(psize_str, sizeof(psize_str), "%luK", PAGE_SIZE >> 10);
   2623 
   2624 	ai = pcpu_build_alloc_info(reserved_size, 0, PAGE_SIZE, NULL);
   2625 	if (IS_ERR(ai))
   2626 		return PTR_ERR(ai);
   2627 	BUG_ON(ai->nr_groups != 1);
   2628 	upa = ai->alloc_size/ai->unit_size;
   2629 	nr_g0_units = roundup(num_possible_cpus(), upa);
   2630 	if (WARN_ON(ai->groups[0].nr_units != nr_g0_units)) {
   2631 		pcpu_free_alloc_info(ai);
   2632 		return -EINVAL;
   2633 	}
   2634 
   2635 	unit_pages = ai->unit_size >> PAGE_SHIFT;
   2636 
   2637 	/* unaligned allocations can't be freed, round up to page size */
   2638 	pages_size = PFN_ALIGN(unit_pages * num_possible_cpus() *
   2639 			       sizeof(pages[0]));
   2640 	pages = memblock_alloc(pages_size, SMP_CACHE_BYTES);
   2641 	if (!pages)
   2642 		panic("%s: Failed to allocate %zu bytes\n", __func__,
   2643 		      pages_size);
   2644 
   2645 	/* allocate pages */
   2646 	j = 0;
   2647 	for (unit = 0; unit < num_possible_cpus(); unit++) {
   2648 		unsigned int cpu = ai->groups[0].cpu_map[unit];
   2649 		for (i = 0; i < unit_pages; i++) {
   2650 			void *ptr;
   2651 
   2652 			ptr = alloc_fn(cpu, PAGE_SIZE, PAGE_SIZE);
   2653 			if (!ptr) {
   2654 				pr_warn("failed to allocate %s page for cpu%u\n",
   2655 						psize_str, cpu);
   2656 				goto enomem;
   2657 			}
   2658 			/* kmemleak tracks the percpu allocations separately */
   2659 			kmemleak_free(ptr);
   2660 			pages[j++] = virt_to_page(ptr);
   2661 		}
   2662 	}
   2663 
   2664 	/* allocate vm area, map the pages and copy static data */
   2665 	vm.flags = VM_ALLOC;
   2666 	vm.size = num_possible_cpus() * ai->unit_size;
   2667 	vm_area_register_early(&vm, PAGE_SIZE);
   2668 
   2669 	for (unit = 0; unit < num_possible_cpus(); unit++) {
   2670 		unsigned long unit_addr =
   2671 			(unsigned long)vm.addr + unit * ai->unit_size;
   2672 
   2673 		for (i = 0; i < unit_pages; i++)
   2674 			populate_pte_fn(unit_addr + (i << PAGE_SHIFT));
   2675 
   2676 		/* pte already populated, the following shouldn't fail */
   2677 		rc = __pcpu_map_pages(unit_addr, &pages[unit * unit_pages],
   2678 				      unit_pages);
   2679 		if (rc < 0)
   2680 			panic("failed to map percpu area, err=%d\n", rc);
   2681 
   2682 		/*
   2683 		 * FIXME: Archs with virtual cache should flush local
   2684 		 * cache for the linear mapping here - something
   2685 		 * equivalent to flush_cache_vmap() on the local cpu.
   2686 		 * flush_cache_vmap() can't be used as most supporting
   2687 		 * data structures are not set up yet.
   2688 		 */
   2689 
   2690 		/* copy static data */
   2691 		memcpy((void *)unit_addr, __per_cpu_load, ai->static_size);
   2692 	}
   2693 
   2694 	/* we're ready, commit */
   2695 	pr_info("%d %s pages/cpu s%zu r%zu d%zu\n",
   2696 		unit_pages, psize_str, ai->static_size,
   2697 		ai->reserved_size, ai->dyn_size);
   2698 
   2699 	rc = pcpu_setup_first_chunk(ai, vm.addr);
   2700 	goto out_free_ar;
   2701 
   2702 enomem:
   2703 	while (--j >= 0)
   2704 		free_fn(page_address(pages[j]), PAGE_SIZE);
   2705 	rc = -ENOMEM;
   2706 out_free_ar:
   2707 	memblock_free_early(__pa(pages), pages_size);
   2708 	pcpu_free_alloc_info(ai);
   2709 	return rc;
   2710 }
   2711 #endif /* BUILD_PAGE_FIRST_CHUNK */
   2712 
   2713 #ifndef	CONFIG_HAVE_SETUP_PER_CPU_AREA
   2714 /*
   2715  * Generic SMP percpu area setup.
   2716  *
   2717  * The embedding helper is used because its behavior closely resembles
   2718  * the original non-dynamic generic percpu area setup.  This is
   2719  * important because many archs have addressing restrictions and might
   2720  * fail if the percpu area is located far away from the previous
   2721  * location.  As an added bonus, in non-NUMA cases, embedding is
   2722  * generally a good idea TLB-wise because percpu area can piggy back
   2723  * on the physical linear memory mapping which uses large page
   2724  * mappings on applicable archs.
   2725  */
   2726 unsigned long __per_cpu_offset[NR_CPUS] __read_mostly;
   2727 EXPORT_SYMBOL(__per_cpu_offset);
   2728 
   2729 static void * __init pcpu_dfl_fc_alloc(unsigned int cpu, size_t size,
   2730 				       size_t align)
   2731 {
   2732 	return  memblock_alloc_from(size, align, __pa(MAX_DMA_ADDRESS));
   2733 }
   2734 
   2735 static void __init pcpu_dfl_fc_free(void *ptr, size_t size)
   2736 {
   2737 	memblock_free_early(__pa(ptr), size);
   2738 }
   2739 
   2740 void __init setup_per_cpu_areas(void)
   2741 {
   2742 	unsigned long delta;
   2743 	unsigned int cpu;
   2744 	int rc;
   2745 
   2746 	/*
   2747 	 * Always reserve area for module percpu variables.  That's
   2748 	 * what the legacy allocator did.
   2749 	 */
   2750 	rc = pcpu_embed_first_chunk(PERCPU_MODULE_RESERVE,
   2751 				    PERCPU_DYNAMIC_RESERVE, PAGE_SIZE, NULL,
   2752 				    pcpu_dfl_fc_alloc, pcpu_dfl_fc_free);
   2753 	if (rc < 0)
   2754 		panic("Failed to initialize percpu areas.");
   2755 
   2756 	delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start;
   2757 	for_each_possible_cpu(cpu)
   2758 		__per_cpu_offset[cpu] = delta + pcpu_unit_offsets[cpu];
   2759 }
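/*
 * Example (illustrative): with the offsets above in place, the generic
 * accessors resolve a static percpu variable for a given cpu roughly as
 *
 *	per_cpu(var, cpu) == *SHIFT_PERCPU_PTR(&var, __per_cpu_offset[cpu]);
 *
 * i.e. &var, which lies between __per_cpu_start and __per_cpu_end, is
 * shifted into that cpu's unit inside the first chunk.
 */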
   2760 #endif	/* CONFIG_HAVE_SETUP_PER_CPU_AREA */
   2761 
   2762 #else	/* CONFIG_SMP */
   2763 
   2764 /*
   2765  * UP percpu area setup.
   2766  *
   2767  * UP always uses km-based percpu allocator with identity mapping.
   2768  * Static percpu variables are indistinguishable from the usual static
   2769  * variables and don't require any special preparation.
   2770  */
   2771 void __init setup_per_cpu_areas(void)
   2772 {
   2773 	const size_t unit_size =
   2774 		roundup_pow_of_two(max_t(size_t, PCPU_MIN_UNIT_SIZE,
   2775 					 PERCPU_DYNAMIC_RESERVE));
   2776 	struct pcpu_alloc_info *ai;
   2777 	void *fc;
   2778 
   2779 	ai = pcpu_alloc_alloc_info(1, 1);
   2780 	fc = memblock_alloc_from(unit_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS));
   2781 	if (!ai || !fc)
   2782 		panic("Failed to allocate memory for percpu areas.");
   2783 	/* kmemleak tracks the percpu allocations separately */
   2784 	kmemleak_free(fc);
   2785 
   2786 	ai->dyn_size = unit_size;
   2787 	ai->unit_size = unit_size;
   2788 	ai->atom_size = unit_size;
   2789 	ai->alloc_size = unit_size;
   2790 	ai->groups[0].nr_units = 1;
   2791 	ai->groups[0].cpu_map[0] = 0;
   2792 
   2793 	if (pcpu_setup_first_chunk(ai, fc) < 0)
   2794 		panic("Failed to initialize percpu areas.");
   2795 	pcpu_free_alloc_info(ai);
   2796 }
   2797 
   2798 #endif	/* CONFIG_SMP */
   2799 
   2800 /*
   2801  * pcpu_nr_pages - calculate total number of populated backing pages
   2802  *
   2803  * This reflects the number of pages populated to back chunks.  Metadata is
   2804  * excluded in the number exposed in meminfo as the number of backing pages
   2805  * scales with the number of cpus and can quickly outweigh the memory used for
   2806  * metadata.  It also keeps this calculation nice and simple.
   2807  *
   2808  * RETURNS:
   2809  * Total number of populated backing pages in use by the allocator.
   2810  */
   2811 unsigned long pcpu_nr_pages(void)
   2812 {
   2813 	return pcpu_nr_populated * pcpu_nr_units;
   2814 }
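/*
 * Example (illustrative): this count feeds the "Percpu:" line of
 * /proc/meminfo; e.g. 40 populated pages backing 8 units report
 * 40 * 8 * 4K = 1280 kB with 4K pages.
 */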
   2815 
   2816 /*
    2817  * The percpu allocator is initialized early during boot when neither slab nor
   2818  * workqueue is available.  Plug async management until everything is up
   2819  * and running.
   2820  */
   2821 static int __init percpu_enable_async(void)
   2822 {
   2823 	pcpu_async_enabled = true;
   2824 	return 0;
   2825 }
   2826 subsys_initcall(percpu_enable_async);