whiterose

linux unikernel
Log | Files | Refs | README | LICENSE | git clone https://git.ne02ptzero.me/git/whiterose

bio.c (55839B)


      1 /*
      2  * Copyright (C) 2001 Jens Axboe <axboe@kernel.dk>
      3  *
      4  * This program is free software; you can redistribute it and/or modify
      5  * it under the terms of the GNU General Public License version 2 as
      6  * published by the Free Software Foundation.
      7  *
      8  * This program is distributed in the hope that it will be useful,
      9  * but WITHOUT ANY WARRANTY; without even the implied warranty of
     10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     11  * GNU General Public License for more details.
     12  *
     13  * You should have received a copy of the GNU General Public License
     14  * along with this program; if not, write to the Free Software
     15  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
     16  *
     17  */
     18 #include <linux/mm.h>
     19 #include <linux/swap.h>
     20 #include <linux/bio.h>
     21 #include <linux/blkdev.h>
     22 #include <linux/uio.h>
     23 #include <linux/iocontext.h>
     24 #include <linux/slab.h>
     25 #include <linux/init.h>
     26 #include <linux/kernel.h>
     27 #include <linux/export.h>
     28 #include <linux/mempool.h>
     29 #include <linux/workqueue.h>
     30 #include <linux/cgroup.h>
     31 #include <linux/blk-cgroup.h>
     32 
     33 #include <trace/events/block.h>
     34 #include "blk.h"
     35 #include "blk-rq-qos.h"
     36 
     37 /*
     38  * Test patch to inline a certain number of bi_io_vec's inside the bio
     39  * itself, to shrink a bio data allocation from two mempool calls to one
     40  */
     41 #define BIO_INLINE_VECS		4
     42 
     43 /*
     44  * if you change this list, also change bvec_alloc or things will
     45  * break badly! cannot be bigger than what you can fit into an
     46  * unsigned short
     47  */
     48 #define BV(x, n) { .nr_vecs = x, .name = "biovec-"#n }
     49 static struct biovec_slab bvec_slabs[BVEC_POOL_NR] __read_mostly = {
     50 	BV(1, 1), BV(4, 4), BV(16, 16), BV(64, 64), BV(128, 128), BV(BIO_MAX_PAGES, max),
     51 };
     52 #undef BV
     53 
     54 /*
     55  * fs_bio_set is the bio_set containing bio and iovec memory pools used by
     56  * IO code that does not need private memory pools.
     57  */
     58 struct bio_set fs_bio_set;
     59 EXPORT_SYMBOL(fs_bio_set);
     60 
     61 /*
     62  * Our slab pool management
     63  */
     64 struct bio_slab {
     65 	struct kmem_cache *slab;
     66 	unsigned int slab_ref;
     67 	unsigned int slab_size;
     68 	char name[8];
     69 };
     70 static DEFINE_MUTEX(bio_slab_lock);
     71 static struct bio_slab *bio_slabs;
     72 static unsigned int bio_slab_nr, bio_slab_max;
     73 
     74 static struct kmem_cache *bio_find_or_create_slab(unsigned int extra_size)
     75 {
     76 	unsigned int sz = sizeof(struct bio) + extra_size;
     77 	struct kmem_cache *slab = NULL;
     78 	struct bio_slab *bslab, *new_bio_slabs;
     79 	unsigned int new_bio_slab_max;
     80 	unsigned int i, entry = -1;
     81 
     82 	mutex_lock(&bio_slab_lock);
     83 
     84 	i = 0;
     85 	while (i < bio_slab_nr) {
     86 		bslab = &bio_slabs[i];
     87 
     88 		if (!bslab->slab && entry == -1)
     89 			entry = i;
     90 		else if (bslab->slab_size == sz) {
     91 			slab = bslab->slab;
     92 			bslab->slab_ref++;
     93 			break;
     94 		}
     95 		i++;
     96 	}
     97 
     98 	if (slab)
     99 		goto out_unlock;
    100 
    101 	if (bio_slab_nr == bio_slab_max && entry == -1) {
    102 		new_bio_slab_max = bio_slab_max << 1;
    103 		new_bio_slabs = krealloc(bio_slabs,
    104 					 new_bio_slab_max * sizeof(struct bio_slab),
    105 					 GFP_KERNEL);
    106 		if (!new_bio_slabs)
    107 			goto out_unlock;
    108 		bio_slab_max = new_bio_slab_max;
    109 		bio_slabs = new_bio_slabs;
    110 	}
    111 	if (entry == -1)
    112 		entry = bio_slab_nr++;
    113 
    114 	bslab = &bio_slabs[entry];
    115 
    116 	snprintf(bslab->name, sizeof(bslab->name), "bio-%d", entry);
    117 	slab = kmem_cache_create(bslab->name, sz, ARCH_KMALLOC_MINALIGN,
    118 				 SLAB_HWCACHE_ALIGN, NULL);
    119 	if (!slab)
    120 		goto out_unlock;
    121 
    122 	bslab->slab = slab;
    123 	bslab->slab_ref = 1;
    124 	bslab->slab_size = sz;
    125 out_unlock:
    126 	mutex_unlock(&bio_slab_lock);
    127 	return slab;
    128 }
    129 
    130 static void bio_put_slab(struct bio_set *bs)
    131 {
    132 	struct bio_slab *bslab = NULL;
    133 	unsigned int i;
    134 
    135 	mutex_lock(&bio_slab_lock);
    136 
    137 	for (i = 0; i < bio_slab_nr; i++) {
    138 		if (bs->bio_slab == bio_slabs[i].slab) {
    139 			bslab = &bio_slabs[i];
    140 			break;
    141 		}
    142 	}
    143 
    144 	if (WARN(!bslab, KERN_ERR "bio: unable to find slab!\n"))
    145 		goto out;
    146 
    147 	WARN_ON(!bslab->slab_ref);
    148 
    149 	if (--bslab->slab_ref)
    150 		goto out;
    151 
    152 	kmem_cache_destroy(bslab->slab);
    153 	bslab->slab = NULL;
    154 
    155 out:
    156 	mutex_unlock(&bio_slab_lock);
    157 }
    158 
    159 unsigned int bvec_nr_vecs(unsigned short idx)
    160 {
    161 	return bvec_slabs[--idx].nr_vecs;
    162 }
    163 
    164 void bvec_free(mempool_t *pool, struct bio_vec *bv, unsigned int idx)
    165 {
    166 	if (!idx)
    167 		return;
    168 	idx--;
    169 
    170 	BIO_BUG_ON(idx >= BVEC_POOL_NR);
    171 
    172 	if (idx == BVEC_POOL_MAX) {
    173 		mempool_free(bv, pool);
    174 	} else {
    175 		struct biovec_slab *bvs = bvec_slabs + idx;
    176 
    177 		kmem_cache_free(bvs->slab, bv);
    178 	}
    179 }
    180 
    181 struct bio_vec *bvec_alloc(gfp_t gfp_mask, int nr, unsigned long *idx,
    182 			   mempool_t *pool)
    183 {
    184 	struct bio_vec *bvl;
    185 
    186 	/*
    187 	 * see comment near bvec_array define!
    188 	 */
    189 	switch (nr) {
    190 	case 1:
    191 		*idx = 0;
    192 		break;
    193 	case 2 ... 4:
    194 		*idx = 1;
    195 		break;
    196 	case 5 ... 16:
    197 		*idx = 2;
    198 		break;
    199 	case 17 ... 64:
    200 		*idx = 3;
    201 		break;
    202 	case 65 ... 128:
    203 		*idx = 4;
    204 		break;
    205 	case 129 ... BIO_MAX_PAGES:
    206 		*idx = 5;
    207 		break;
    208 	default:
    209 		return NULL;
    210 	}
    211 
    212 	/*
    213 	 * idx now points to the pool we want to allocate from. only the
    214 	 * 1-vec entry pool is mempool backed.
    215 	 */
    216 	if (*idx == BVEC_POOL_MAX) {
    217 fallback:
    218 		bvl = mempool_alloc(pool, gfp_mask);
    219 	} else {
    220 		struct biovec_slab *bvs = bvec_slabs + *idx;
    221 		gfp_t __gfp_mask = gfp_mask & ~(__GFP_DIRECT_RECLAIM | __GFP_IO);
    222 
    223 		/*
    224 		 * Make this allocation restricted and don't dump info on
    225 		 * allocation failures, since we'll fallback to the mempool
    226 		 * in case of failure.
    227 		 */
    228 		__gfp_mask |= __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN;
    229 
    230 		/*
    231 		 * Try a slab allocation. If this fails and __GFP_DIRECT_RECLAIM
    232 		 * is set, retry with the 1-entry mempool
    233 		 */
    234 		bvl = kmem_cache_alloc(bvs->slab, __gfp_mask);
    235 		if (unlikely(!bvl && (gfp_mask & __GFP_DIRECT_RECLAIM))) {
    236 			*idx = BVEC_POOL_MAX;
    237 			goto fallback;
    238 		}
    239 	}
    240 
    241 	(*idx)++;
    242 	return bvl;
    243 }
    244 
    245 void bio_uninit(struct bio *bio)
    246 {
    247 	bio_disassociate_blkg(bio);
    248 }
    249 EXPORT_SYMBOL(bio_uninit);
    250 
    251 static void bio_free(struct bio *bio)
    252 {
    253 	struct bio_set *bs = bio->bi_pool;
    254 	void *p;
    255 
    256 	bio_uninit(bio);
    257 
    258 	if (bs) {
    259 		bvec_free(&bs->bvec_pool, bio->bi_io_vec, BVEC_POOL_IDX(bio));
    260 
    261 		/*
    262 		 * If we have front padding, adjust the bio pointer before freeing
    263 		 */
    264 		p = bio;
    265 		p -= bs->front_pad;
    266 
    267 		mempool_free(p, &bs->bio_pool);
    268 	} else {
    269 		/* Bio was allocated by bio_kmalloc() */
    270 		kfree(bio);
    271 	}
    272 }
    273 
    274 /*
    275  * Users of this function have their own bio allocation. Subsequently,
    276  * they must remember to pair any call to bio_init() with bio_uninit()
    277  * when IO has completed, or when the bio is released.
    278  */
    279 void bio_init(struct bio *bio, struct bio_vec *table,
    280 	      unsigned short max_vecs)
    281 {
    282 	memset(bio, 0, sizeof(*bio));
    283 	atomic_set(&bio->__bi_remaining, 1);
    284 	atomic_set(&bio->__bi_cnt, 1);
    285 
    286 	bio->bi_io_vec = table;
    287 	bio->bi_max_vecs = max_vecs;
    288 }
    289 EXPORT_SYMBOL(bio_init);
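
/*
 * Illustrative sketch, not part of bio.c: a caller that embeds a bio plus an
 * inline bvec table, pairing bio_init() with bio_uninit() as the comment
 * above requires.  The example_* names are hypothetical.
 */
#if 0	/* usage sketch only */
struct example_request {
	struct bio	ex_bio;
	struct bio_vec	ex_vecs[4];
};

static void example_req_init(struct example_request *req)
{
	bio_init(&req->ex_bio, req->ex_vecs, ARRAY_SIZE(req->ex_vecs));
}

static void example_req_done(struct example_request *req)
{
	/* must run once IO has completed or the bio is released */
	bio_uninit(&req->ex_bio);
}
#endif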
    290 
    291 /**
    292  * bio_reset - reinitialize a bio
    293  * @bio:	bio to reset
    294  *
    295  * Description:
    296  *   After calling bio_reset(), @bio will be in the same state as a freshly
    297  *   allocated bio returned by bio_alloc_bioset() - the only fields that are
    298  *   preserved are the ones that are initialized by bio_alloc_bioset(). See
    299  *   comment in struct bio.
    300  */
    301 void bio_reset(struct bio *bio)
    302 {
    303 	unsigned long flags = bio->bi_flags & (~0UL << BIO_RESET_BITS);
    304 
    305 	bio_uninit(bio);
    306 
    307 	memset(bio, 0, BIO_RESET_BYTES);
    308 	bio->bi_flags = flags;
    309 	atomic_set(&bio->__bi_remaining, 1);
    310 }
    311 EXPORT_SYMBOL(bio_reset);
    312 
    313 static struct bio *__bio_chain_endio(struct bio *bio)
    314 {
    315 	struct bio *parent = bio->bi_private;
    316 
    317 	if (!parent->bi_status)
    318 		parent->bi_status = bio->bi_status;
    319 	bio_put(bio);
    320 	return parent;
    321 }
    322 
    323 static void bio_chain_endio(struct bio *bio)
    324 {
    325 	bio_endio(__bio_chain_endio(bio));
    326 }
    327 
    328 /**
    329  * bio_chain - chain bio completions
    330  * @bio: the target bio
    331  * @parent: the @bio's parent bio
    332  *
    333  * The caller won't have a bi_end_io called when @bio completes - instead,
    334  * @parent's bi_end_io won't be called until both @parent and @bio have
    335  * completed; the chained bio will also be freed when it completes.
    336  *
    337  * The caller must not set bi_private or bi_end_io in @bio.
    338  */
    339 void bio_chain(struct bio *bio, struct bio *parent)
    340 {
    341 	BUG_ON(bio->bi_private || bio->bi_end_io);
    342 
    343 	bio->bi_private = parent;
    344 	bio->bi_end_io	= bio_chain_endio;
    345 	bio_inc_remaining(parent);
    346 }
    347 EXPORT_SYMBOL(bio_chain);
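
/*
 * Illustrative sketch, not part of bio.c: the common split-and-chain pattern
 * in a driver.  bio_split() and the caller-owned bio_set are assumptions
 * about that driver, not something this file provides.
 */
#if 0	/* usage sketch only */
static struct bio *example_split(struct bio *bio, unsigned int max_sectors,
				 struct bio_set *bs)
{
	if (bio_sectors(bio) > max_sectors) {
		struct bio *split = bio_split(bio, max_sectors, GFP_NOIO, bs);

		/* the remainder (@bio) becomes the parent of @split */
		bio_chain(split, bio);
		/* resubmit the remainder; keep working on the front part */
		generic_make_request(bio);
		bio = split;
	}
	return bio;
}
#endif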
    348 
    349 static void bio_alloc_rescue(struct work_struct *work)
    350 {
    351 	struct bio_set *bs = container_of(work, struct bio_set, rescue_work);
    352 	struct bio *bio;
    353 
    354 	while (1) {
    355 		spin_lock(&bs->rescue_lock);
    356 		bio = bio_list_pop(&bs->rescue_list);
    357 		spin_unlock(&bs->rescue_lock);
    358 
    359 		if (!bio)
    360 			break;
    361 
    362 		generic_make_request(bio);
    363 	}
    364 }
    365 
    366 static void punt_bios_to_rescuer(struct bio_set *bs)
    367 {
    368 	struct bio_list punt, nopunt;
    369 	struct bio *bio;
    370 
    371 	if (WARN_ON_ONCE(!bs->rescue_workqueue))
    372 		return;
    373 	/*
    374 	 * In order to guarantee forward progress we must punt only bios that
    375 	 * were allocated from this bio_set; otherwise, if there was a bio on
    376 	 * there for a stacking driver higher up in the stack, processing it
    377 	 * could require allocating bios from this bio_set, and doing that from
    378 	 * our own rescuer would be bad.
    379 	 *
    380 	 * Since bio lists are singly linked, pop them all instead of trying to
    381 	 * remove from the middle of the list:
    382 	 */
    383 
    384 	bio_list_init(&punt);
    385 	bio_list_init(&nopunt);
    386 
    387 	while ((bio = bio_list_pop(&current->bio_list[0])))
    388 		bio_list_add(bio->bi_pool == bs ? &punt : &nopunt, bio);
    389 	current->bio_list[0] = nopunt;
    390 
    391 	bio_list_init(&nopunt);
    392 	while ((bio = bio_list_pop(&current->bio_list[1])))
    393 		bio_list_add(bio->bi_pool == bs ? &punt : &nopunt, bio);
    394 	current->bio_list[1] = nopunt;
    395 
    396 	spin_lock(&bs->rescue_lock);
    397 	bio_list_merge(&bs->rescue_list, &punt);
    398 	spin_unlock(&bs->rescue_lock);
    399 
    400 	queue_work(bs->rescue_workqueue, &bs->rescue_work);
    401 }
    402 
    403 /**
    404  * bio_alloc_bioset - allocate a bio for I/O
    405  * @gfp_mask:   the GFP_* mask given to the slab allocator
    406  * @nr_iovecs:	number of iovecs to pre-allocate
    407  * @bs:		the bio_set to allocate from.
    408  *
    409  * Description:
    410  *   If @bs is NULL, uses kmalloc() to allocate the bio; else the allocation is
    411  *   backed by the @bs's mempool.
    412  *
    413  *   When @bs is not NULL, if %__GFP_DIRECT_RECLAIM is set then bio_alloc will
    414  *   always be able to allocate a bio. This is due to the mempool guarantees.
    415  *   To make this work, callers must never allocate more than 1 bio at a time
    416  *   from this pool. Callers that need to allocate more than 1 bio must always
    417  *   submit the previously allocated bio for IO before attempting to allocate
    418  *   a new one. Failure to do so can cause deadlocks under memory pressure.
    419  *
    420  *   Note that when running under generic_make_request() (i.e. any block
    421  *   driver), bios are not submitted until after you return - see the code in
    422  *   generic_make_request() that converts recursion into iteration, to prevent
    423  *   stack overflows.
    424  *
    425  *   This would normally mean allocating multiple bios under
    426  *   generic_make_request() would be susceptible to deadlocks, but we have
    427  *   deadlock avoidance code that resubmits any blocked bios from a rescuer
    428  *   thread.
    429  *
    430  *   However, we do not guarantee forward progress for allocations from other
    431  *   mempools. Doing multiple allocations from the same mempool under
    432  *   generic_make_request() should be avoided - instead, use bio_set's front_pad
    433  *   for per bio allocations.
    434  *
    435  *   RETURNS:
    436  *   Pointer to new bio on success, NULL on failure.
    437  */
    438 struct bio *bio_alloc_bioset(gfp_t gfp_mask, unsigned int nr_iovecs,
    439 			     struct bio_set *bs)
    440 {
    441 	gfp_t saved_gfp = gfp_mask;
    442 	unsigned front_pad;
    443 	unsigned inline_vecs;
    444 	struct bio_vec *bvl = NULL;
    445 	struct bio *bio;
    446 	void *p;
    447 
    448 	if (!bs) {
    449 		if (nr_iovecs > UIO_MAXIOV)
    450 			return NULL;
    451 
    452 		p = kmalloc(sizeof(struct bio) +
    453 			    nr_iovecs * sizeof(struct bio_vec),
    454 			    gfp_mask);
    455 		front_pad = 0;
    456 		inline_vecs = nr_iovecs;
    457 	} else {
    458 		/* should not use nobvec bioset for nr_iovecs > 0 */
    459 		if (WARN_ON_ONCE(!mempool_initialized(&bs->bvec_pool) &&
    460 				 nr_iovecs > 0))
    461 			return NULL;
    462 		/*
    463 		 * generic_make_request() converts recursion to iteration; this
    464 		 * means if we're running beneath it, any bios we allocate and
    465 		 * submit will not be submitted (and thus freed) until after we
    466 		 * return.
    467 		 *
    468 		 * This exposes us to a potential deadlock if we allocate
    469 		 * multiple bios from the same bio_set() while running
    470 		 * underneath generic_make_request(). If we were to allocate
    471 		 * multiple bios (say a stacking block driver that was splitting
    472 		 * bios), we would deadlock if we exhausted the mempool's
    473 		 * reserve.
    474 		 *
    475 		 * We solve this, and guarantee forward progress, with a rescuer
    476 		 * workqueue per bio_set. If we go to allocate and there are
    477 		 * bios on current->bio_list, we first try the allocation
    478 		 * without __GFP_DIRECT_RECLAIM; if that fails, we punt those
    479 		 * bios we would be blocking to the rescuer workqueue before
    480 		 * we retry with the original gfp_flags.
    481 		 */
    482 
    483 		if (current->bio_list &&
    484 		    (!bio_list_empty(&current->bio_list[0]) ||
    485 		     !bio_list_empty(&current->bio_list[1])) &&
    486 		    bs->rescue_workqueue)
    487 			gfp_mask &= ~__GFP_DIRECT_RECLAIM;
    488 
    489 		p = mempool_alloc(&bs->bio_pool, gfp_mask);
    490 		if (!p && gfp_mask != saved_gfp) {
    491 			punt_bios_to_rescuer(bs);
    492 			gfp_mask = saved_gfp;
    493 			p = mempool_alloc(&bs->bio_pool, gfp_mask);
    494 		}
    495 
    496 		front_pad = bs->front_pad;
    497 		inline_vecs = BIO_INLINE_VECS;
    498 	}
    499 
    500 	if (unlikely(!p))
    501 		return NULL;
    502 
    503 	bio = p + front_pad;
    504 	bio_init(bio, NULL, 0);
    505 
    506 	if (nr_iovecs > inline_vecs) {
    507 		unsigned long idx = 0;
    508 
    509 		bvl = bvec_alloc(gfp_mask, nr_iovecs, &idx, &bs->bvec_pool);
    510 		if (!bvl && gfp_mask != saved_gfp) {
    511 			punt_bios_to_rescuer(bs);
    512 			gfp_mask = saved_gfp;
    513 			bvl = bvec_alloc(gfp_mask, nr_iovecs, &idx, &bs->bvec_pool);
    514 		}
    515 
    516 		if (unlikely(!bvl))
    517 			goto err_free;
    518 
    519 		bio->bi_flags |= idx << BVEC_POOL_OFFSET;
    520 	} else if (nr_iovecs) {
    521 		bvl = bio->bi_inline_vecs;
    522 	}
    523 
    524 	bio->bi_pool = bs;
    525 	bio->bi_max_vecs = nr_iovecs;
    526 	bio->bi_io_vec = bvl;
    527 	return bio;
    528 
    529 err_free:
    530 	mempool_free(p, &bs->bio_pool);
    531 	return NULL;
    532 }
    533 EXPORT_SYMBOL(bio_alloc_bioset);
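
/*
 * Illustrative sketch, not part of bio.c: a driver-private bio_set set up
 * with bioset_init(), so bio_alloc_bioset() can fall back to a dedicated
 * mempool.  The example_* names are hypothetical; flag choice depends on
 * whether the caller needs bvecs and a rescuer, as the comment above
 * discusses.
 */
#if 0	/* usage sketch only */
static struct bio_set example_bio_set;

static int example_setup(void)
{
	return bioset_init(&example_bio_set, BIO_POOL_SIZE, 0,
			   BIOSET_NEED_BVECS | BIOSET_NEED_RESCUER);
}

static struct bio *example_alloc(unsigned int nr_vecs)
{
	/*
	 * With __GFP_DIRECT_RECLAIM this should not fail, but only one bio
	 * may be held from this pool before it is submitted.
	 */
	return bio_alloc_bioset(GFP_NOIO, nr_vecs, &example_bio_set);
}

static void example_teardown(void)
{
	bioset_exit(&example_bio_set);
}
#endif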
    534 
    535 void zero_fill_bio_iter(struct bio *bio, struct bvec_iter start)
    536 {
    537 	unsigned long flags;
    538 	struct bio_vec bv;
    539 	struct bvec_iter iter;
    540 
    541 	__bio_for_each_segment(bv, bio, iter, start) {
    542 		char *data = bvec_kmap_irq(&bv, &flags);
    543 		memset(data, 0, bv.bv_len);
    544 		flush_dcache_page(bv.bv_page);
    545 		bvec_kunmap_irq(data, &flags);
    546 	}
    547 }
    548 EXPORT_SYMBOL(zero_fill_bio_iter);
    549 
    550 /**
    551  * bio_put - release a reference to a bio
    552  * @bio:   bio to release reference to
    553  *
    554  * Description:
    555  *   Put a reference to a &struct bio, either one you have gotten with
    556  *   bio_alloc, bio_get or bio_clone_*. The last put of a bio will free it.
    557  **/
    558 void bio_put(struct bio *bio)
    559 {
    560 	if (!bio_flagged(bio, BIO_REFFED))
    561 		bio_free(bio);
    562 	else {
    563 		BIO_BUG_ON(!atomic_read(&bio->__bi_cnt));
    564 
    565 		/*
    566 		 * last put frees it
    567 		 */
    568 		if (atomic_dec_and_test(&bio->__bi_cnt))
    569 			bio_free(bio);
    570 	}
    571 }
    572 EXPORT_SYMBOL(bio_put);
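
/*
 * Illustrative sketch, not part of bio.c: taking an extra reference with
 * bio_get() so the bio outlives its completion, then dropping it with
 * bio_put().  The completion object and names are hypothetical.
 */
#if 0	/* usage sketch only */
static blk_status_t example_submit_and_inspect(struct bio *bio,
					       struct completion *done)
{
	blk_status_t status;

	bio_get(bio);			/* keep @bio valid after bi_end_io */
	submit_bio(bio);
	wait_for_completion_io(done);	/* signalled by the bi_end_io handler */
	status = bio->bi_status;	/* safe only because of the extra ref */
	bio_put(bio);			/* drop our extra reference */
	return status;
}
#endif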
    573 
    574 int bio_phys_segments(struct request_queue *q, struct bio *bio)
    575 {
    576 	if (unlikely(!bio_flagged(bio, BIO_SEG_VALID)))
    577 		blk_recount_segments(q, bio);
    578 
    579 	return bio->bi_phys_segments;
    580 }
    581 
    582 /**
    583  * 	__bio_clone_fast - clone a bio that shares the original bio's biovec
    584  * 	@bio: destination bio
    585  * 	@bio_src: bio to clone
    586  *
    587  *	Clone a &bio. Caller will own the returned bio, but not
    588  *	the actual data it points to. Reference count of returned
    589  * 	bio will be one.
    590  *
    591  * 	Caller must ensure that @bio_src is not freed before @bio.
    592  */
    593 void __bio_clone_fast(struct bio *bio, struct bio *bio_src)
    594 {
    595 	BUG_ON(bio->bi_pool && BVEC_POOL_IDX(bio));
    596 
    597 	/*
    598 	 * most users will be overriding ->bi_disk with a new target,
    599 	 * so we don't set nor calculate new physical/hw segment counts here
    600 	 */
    601 	bio->bi_disk = bio_src->bi_disk;
    602 	bio->bi_partno = bio_src->bi_partno;
    603 	bio_set_flag(bio, BIO_CLONED);
    604 	if (bio_flagged(bio_src, BIO_THROTTLED))
    605 		bio_set_flag(bio, BIO_THROTTLED);
    606 	bio->bi_opf = bio_src->bi_opf;
    607 	bio->bi_ioprio = bio_src->bi_ioprio;
    608 	bio->bi_write_hint = bio_src->bi_write_hint;
    609 	bio->bi_iter = bio_src->bi_iter;
    610 	bio->bi_io_vec = bio_src->bi_io_vec;
    611 
    612 	bio_clone_blkg_association(bio, bio_src);
    613 	blkcg_bio_issue_init(bio);
    614 }
    615 EXPORT_SYMBOL(__bio_clone_fast);
    616 
    617 /**
    618  *	bio_clone_fast - clone a bio that shares the original bio's biovec
    619  *	@bio: bio to clone
    620  *	@gfp_mask: allocation priority
    621  *	@bs: bio_set to allocate from
    622  *
    623  * 	Like __bio_clone_fast, but also allocates the returned bio
    624  */
    625 struct bio *bio_clone_fast(struct bio *bio, gfp_t gfp_mask, struct bio_set *bs)
    626 {
    627 	struct bio *b;
    628 
    629 	b = bio_alloc_bioset(gfp_mask, 0, bs);
    630 	if (!b)
    631 		return NULL;
    632 
    633 	__bio_clone_fast(b, bio);
    634 
    635 	if (bio_integrity(bio)) {
    636 		int ret;
    637 
    638 		ret = bio_integrity_clone(b, bio, gfp_mask);
    639 
    640 		if (ret < 0) {
    641 			bio_put(b);
    642 			return NULL;
    643 		}
    644 	}
    645 
    646 	return b;
    647 }
    648 EXPORT_SYMBOL(bio_clone_fast);
    649 
    650 /**
    651  *	bio_add_pc_page	-	attempt to add page to bio
    652  *	@q: the target queue
    653  *	@bio: destination bio
    654  *	@page: page to add
    655  *	@len: vec entry length
    656  *	@offset: vec entry offset
    657  *
    658  *	Attempt to add a page to the bio_vec maplist. This can fail for a
    659  *	number of reasons, such as the bio being full or target block device
    660  *	limitations. The target block device must allow bios up to PAGE_SIZE,
    661  *	so it is always possible to add a single page to an empty bio.
    662  *
    663  *	This should only be used by REQ_PC bios.
    664  */
    665 int bio_add_pc_page(struct request_queue *q, struct bio *bio, struct page
    666 		    *page, unsigned int len, unsigned int offset)
    667 {
    668 	int retried_segments = 0;
    669 	struct bio_vec *bvec;
    670 
    671 	/*
    672 	 * cloned bio must not modify vec list
    673 	 */
    674 	if (unlikely(bio_flagged(bio, BIO_CLONED)))
    675 		return 0;
    676 
    677 	if (((bio->bi_iter.bi_size + len) >> 9) > queue_max_hw_sectors(q))
    678 		return 0;
    679 
    680 	/*
    681 	 * For filesystems with a blocksize smaller than the pagesize
    682 	 * we will often be called with the same page as last time and
    683 	 * a consecutive offset.  Optimize this special case.
    684 	 */
    685 	if (bio->bi_vcnt > 0) {
    686 		struct bio_vec *prev = &bio->bi_io_vec[bio->bi_vcnt - 1];
    687 
    688 		if (page == prev->bv_page &&
    689 		    offset == prev->bv_offset + prev->bv_len) {
    690 			prev->bv_len += len;
    691 			bio->bi_iter.bi_size += len;
    692 			goto done;
    693 		}
    694 
    695 		/*
    696 		 * If the queue doesn't support SG gaps and adding this
    697 		 * offset would create a gap, disallow it.
    698 		 */
    699 		if (bvec_gap_to_prev(q, prev, offset))
    700 			return 0;
    701 	}
    702 
    703 	if (bio_full(bio))
    704 		return 0;
    705 
    706 	/*
    707 	 * setup the new entry, we might clear it again later if we
    708 	 * cannot add the page
    709 	 */
    710 	bvec = &bio->bi_io_vec[bio->bi_vcnt];
    711 	bvec->bv_page = page;
    712 	bvec->bv_len = len;
    713 	bvec->bv_offset = offset;
    714 	bio->bi_vcnt++;
    715 	bio->bi_phys_segments++;
    716 	bio->bi_iter.bi_size += len;
    717 
    718 	/*
    719 	 * Perform a recount if the number of segments is greater
    720 	 * than queue_max_segments(q).
    721 	 */
    722 
    723 	while (bio->bi_phys_segments > queue_max_segments(q)) {
    724 
    725 		if (retried_segments)
    726 			goto failed;
    727 
    728 		retried_segments = 1;
    729 		blk_recount_segments(q, bio);
    730 	}
    731 
    732 	/* If we may be able to merge these biovecs, force a recount */
    733 	if (bio->bi_vcnt > 1 && biovec_phys_mergeable(q, bvec - 1, bvec))
    734 		bio_clear_flag(bio, BIO_SEG_VALID);
    735 
    736  done:
    737 	return len;
    738 
    739  failed:
    740 	bvec->bv_page = NULL;
    741 	bvec->bv_len = 0;
    742 	bvec->bv_offset = 0;
    743 	bio->bi_vcnt--;
    744 	bio->bi_iter.bi_size -= len;
    745 	blk_recount_segments(q, bio);
    746 	return 0;
    747 }
    748 EXPORT_SYMBOL(bio_add_pc_page);
    749 
    750 /**
    751  * __bio_try_merge_page - try appending data to an existing bvec.
    752  * @bio: destination bio
    753  * @page: page to add
    754  * @len: length of the data to add
    755  * @off: offset of the data in @page
    756  * @same_page: if %true only merge if the new data is in the same physical
    757  *		page as the last segment of the bio.
    758  *
    759  * Try to add the data at @page + @off to the last bvec of @bio.  This is
    760  * a useful optimisation for file systems with a block size smaller than the
    761  * page size.
    762  *
    763  * Return %true on success or %false on failure.
    764  */
    765 bool __bio_try_merge_page(struct bio *bio, struct page *page,
    766 		unsigned int len, unsigned int off, bool same_page)
    767 {
    768 	if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED)))
    769 		return false;
    770 
    771 	if (bio->bi_vcnt > 0) {
    772 		struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt - 1];
    773 		phys_addr_t vec_end_addr = page_to_phys(bv->bv_page) +
    774 			bv->bv_offset + bv->bv_len - 1;
    775 		phys_addr_t page_addr = page_to_phys(page);
    776 
    777 		if (vec_end_addr + 1 != page_addr + off)
    778 			return false;
    779 		if (same_page && (vec_end_addr & PAGE_MASK) != page_addr)
    780 			return false;
    781 
    782 		bv->bv_len += len;
    783 		bio->bi_iter.bi_size += len;
    784 		return true;
    785 	}
    786 	return false;
    787 }
    788 EXPORT_SYMBOL_GPL(__bio_try_merge_page);
    789 
    790 /**
    791  * __bio_add_page - add page to a bio in a new segment
    792  * @bio: destination bio
    793  * @page: page to add
    794  * @len: length of the data to add
    795  * @off: offset of the data in @page
    796  *
    797  * Add the data at @page + @off to @bio as a new bvec.  The caller must ensure
    798  * that @bio has space for another bvec.
    799  */
    800 void __bio_add_page(struct bio *bio, struct page *page,
    801 		unsigned int len, unsigned int off)
    802 {
    803 	struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt];
    804 
    805 	WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED));
    806 	WARN_ON_ONCE(bio_full(bio));
    807 
    808 	bv->bv_page = page;
    809 	bv->bv_offset = off;
    810 	bv->bv_len = len;
    811 
    812 	bio->bi_iter.bi_size += len;
    813 	bio->bi_vcnt++;
    814 }
    815 EXPORT_SYMBOL_GPL(__bio_add_page);
    816 
    817 /**
    818  *	bio_add_page	-	attempt to add page to bio
    819  *	@bio: destination bio
    820  *	@page: page to add
    821  *	@len: vec entry length
    822  *	@offset: vec entry offset
    823  *
    824  *	Attempt to add a page to the bio_vec maplist. This will only fail
    825  *	if either bio->bi_vcnt == bio->bi_max_vecs or it's a cloned bio.
    826  */
    827 int bio_add_page(struct bio *bio, struct page *page,
    828 		 unsigned int len, unsigned int offset)
    829 {
    830 	if (!__bio_try_merge_page(bio, page, len, offset, false)) {
    831 		if (bio_full(bio))
    832 			return 0;
    833 		__bio_add_page(bio, page, len, offset);
    834 	}
    835 	return len;
    836 }
    837 EXPORT_SYMBOL(bio_add_page);
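
/*
 * Illustrative sketch, not part of bio.c: filling a bio with pages while
 * honouring the return convention of bio_add_page().  The page array and
 * names are hypothetical.
 */
#if 0	/* usage sketch only */
static unsigned int example_fill_bio(struct bio *bio, struct page **pages,
				     unsigned int nr_pages)
{
	unsigned int i;

	for (i = 0; i < nr_pages; i++) {
		/* returns the length added, or 0 once the bio is full */
		if (bio_add_page(bio, pages[i], PAGE_SIZE, 0) != PAGE_SIZE)
			break;
	}
	return i;	/* pages actually added; caller submits and retries */
}
#endif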
    838 
    839 static int __bio_iov_bvec_add_pages(struct bio *bio, struct iov_iter *iter)
    840 {
    841 	const struct bio_vec *bv = iter->bvec;
    842 	unsigned int len;
    843 	size_t size;
    844 
    845 	if (WARN_ON_ONCE(iter->iov_offset > bv->bv_len))
    846 		return -EINVAL;
    847 
    848 	len = min_t(size_t, bv->bv_len - iter->iov_offset, iter->count);
    849 	size = bio_add_page(bio, bv->bv_page, len,
    850 				bv->bv_offset + iter->iov_offset);
    851 	if (size == len) {
    852 		if (!bio_flagged(bio, BIO_NO_PAGE_REF)) {
    853 			struct page *page;
    854 			int i;
    855 
    856 			mp_bvec_for_each_page(page, bv, i)
    857 				get_page(page);
    858 		}
    859 
    860 		iov_iter_advance(iter, size);
    861 		return 0;
    862 	}
    863 
    864 	return -EINVAL;
    865 }
    866 
    867 #define PAGE_PTRS_PER_BVEC     (sizeof(struct bio_vec) / sizeof(struct page *))
    868 
    869 /**
    870  * __bio_iov_iter_get_pages - pin user or kernel pages and add them to a bio
    871  * @bio: bio to add pages to
    872  * @iter: iov iterator describing the region to be mapped
    873  *
    874  * Pins pages from *iter and appends them to @bio's bvec array. The
    875  * pages will have to be released using put_page() when done.
    876  * For multi-segment *iter, this function only adds pages from
    877  * the next non-empty segment of the iov iterator.
    878  */
    879 static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
    880 {
    881 	unsigned short nr_pages = bio->bi_max_vecs - bio->bi_vcnt;
    882 	unsigned short entries_left = bio->bi_max_vecs - bio->bi_vcnt;
    883 	struct bio_vec *bv = bio->bi_io_vec + bio->bi_vcnt;
    884 	struct page **pages = (struct page **)bv;
    885 	ssize_t size, left;
    886 	unsigned len, i;
    887 	size_t offset;
    888 
    889 	/*
    890 	 * Move page array up in the allocated memory for the bio vecs as far as
    891 	 * possible so that we can start filling biovecs from the beginning
    892 	 * without overwriting the temporary page array.
    893 	*/
    894 	BUILD_BUG_ON(PAGE_PTRS_PER_BVEC < 2);
    895 	pages += entries_left * (PAGE_PTRS_PER_BVEC - 1);
    896 
    897 	size = iov_iter_get_pages(iter, pages, LONG_MAX, nr_pages, &offset);
    898 	if (unlikely(size <= 0))
    899 		return size ? size : -EFAULT;
    900 
    901 	for (left = size, i = 0; left > 0; left -= len, i++) {
    902 		struct page *page = pages[i];
    903 
    904 		len = min_t(size_t, PAGE_SIZE - offset, left);
    905 		if (WARN_ON_ONCE(bio_add_page(bio, page, len, offset) != len))
    906 			return -EINVAL;
    907 		offset = 0;
    908 	}
    909 
    910 	iov_iter_advance(iter, size);
    911 	return 0;
    912 }
    913 
    914 /**
    915  * bio_iov_iter_get_pages - add user or kernel pages to a bio
    916  * @bio: bio to add pages to
    917  * @iter: iov iterator describing the region to be added
    918  *
    919  * This takes either an iterator pointing to user memory, or one pointing to
    920  * kernel pages (BVEC iterator). If we're adding user pages, we pin them and
    921  * map them into the kernel. On IO completion, the caller should put those
    922  * pages. If we're adding kernel pages, and the caller told us it's safe to
    923  * do so, we just have to add the pages to the bio directly. We don't grab an
    924  * extra reference to those pages (the user should already have that), and we
    925  * don't put the page on IO completion. The caller needs to check if the bio is
    926  * flagged BIO_NO_PAGE_REF on IO completion. If it isn't, then pages should be
    927  * released.
    928  *
    929  * The function tries, but does not guarantee, to pin as many pages as
    930  * fit into the bio, or are requested in *iter, whatever is smaller. If
    931  * MM encounters an error pinning the requested pages, it stops. Error
    932  * is returned only if 0 pages could be pinned.
    933  */
    934 int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
    935 {
    936 	const bool is_bvec = iov_iter_is_bvec(iter);
    937 	unsigned short orig_vcnt = bio->bi_vcnt;
    938 
    939 	/*
    940 	 * If this is a BVEC iter, the pages are kernel pages. If the caller
    941 	 * asked us not to take references, don't release them on IO completion.
    942 	 */
    943 	if (is_bvec && iov_iter_bvec_no_ref(iter))
    944 		bio_set_flag(bio, BIO_NO_PAGE_REF);
    945 
    946 	do {
    947 		int ret;
    948 
    949 		if (is_bvec)
    950 			ret = __bio_iov_bvec_add_pages(bio, iter);
    951 		else
    952 			ret = __bio_iov_iter_get_pages(bio, iter);
    953 
    954 		if (unlikely(ret))
    955 			return bio->bi_vcnt > orig_vcnt ? 0 : ret;
    956 
    957 	} while (iov_iter_count(iter) && !bio_full(bio));
    958 
    959 	return 0;
    960 }
    961 
    962 static void submit_bio_wait_endio(struct bio *bio)
    963 {
    964 	complete(bio->bi_private);
    965 }
    966 
    967 /**
    968  * submit_bio_wait - submit a bio, and wait until it completes
    969  * @bio: The &struct bio which describes the I/O
    970  *
    971  * Simple wrapper around submit_bio(). Returns 0 on success, or the error from
    972  * bio_endio() on failure.
    973  *
    974  * WARNING: Unlike how submit_bio() is usually used, this function does not
    975  * consume the bio reference. The caller must drop the reference on its
    976  * own.
    977  */
    978 int submit_bio_wait(struct bio *bio)
    979 {
    980 	DECLARE_COMPLETION_ONSTACK_MAP(done, bio->bi_disk->lockdep_map);
    981 
    982 	bio->bi_private = &done;
    983 	bio->bi_end_io = submit_bio_wait_endio;
    984 	bio->bi_opf |= REQ_SYNC;
    985 	submit_bio(bio);
    986 	wait_for_completion_io(&done);
    987 
    988 	return blk_status_to_errno(bio->bi_status);
    989 }
    990 EXPORT_SYMBOL(submit_bio_wait);
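
/*
 * Illustrative sketch, not part of bio.c: a synchronous one-page read built
 * on bio_alloc()/bio_add_page()/submit_bio_wait().  Per the warning above,
 * the caller still owns the bio reference and must bio_put() it.  Names are
 * hypothetical.
 */
#if 0	/* usage sketch only */
static int example_sync_read_page(struct block_device *bdev, sector_t sector,
				  struct page *page)
{
	struct bio *bio;
	int ret;

	bio = bio_alloc(GFP_KERNEL, 1);
	bio_set_dev(bio, bdev);
	bio->bi_iter.bi_sector = sector;
	bio->bi_opf = REQ_OP_READ;
	/* a single page always fits in a freshly allocated 1-vec bio */
	bio_add_page(bio, page, PAGE_SIZE, 0);

	ret = submit_bio_wait(bio);
	bio_put(bio);
	return ret;
}
#endif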
    991 
    992 /**
    993  * bio_advance - increment/complete a bio by some number of bytes
    994  * @bio:	bio to advance
    995  * @bytes:	number of bytes to complete
    996  *
    997  * This updates bi_sector, bi_size and bi_idx; if the number of bytes to
    998  * complete doesn't align with a bvec boundary, then bv_len and bv_offset will
    999  * be updated on the last bvec as well.
   1000  *
   1001  * @bio will then represent the remaining, uncompleted portion of the io.
   1002  */
   1003 void bio_advance(struct bio *bio, unsigned bytes)
   1004 {
   1005 	if (bio_integrity(bio))
   1006 		bio_integrity_advance(bio, bytes);
   1007 
   1008 	bio_advance_iter(bio, &bio->bi_iter, bytes);
   1009 }
   1010 EXPORT_SYMBOL(bio_advance);
   1011 
   1012 void bio_copy_data_iter(struct bio *dst, struct bvec_iter *dst_iter,
   1013 			struct bio *src, struct bvec_iter *src_iter)
   1014 {
   1015 	struct bio_vec src_bv, dst_bv;
   1016 	void *src_p, *dst_p;
   1017 	unsigned bytes;
   1018 
   1019 	while (src_iter->bi_size && dst_iter->bi_size) {
   1020 		src_bv = bio_iter_iovec(src, *src_iter);
   1021 		dst_bv = bio_iter_iovec(dst, *dst_iter);
   1022 
   1023 		bytes = min(src_bv.bv_len, dst_bv.bv_len);
   1024 
   1025 		src_p = kmap_atomic(src_bv.bv_page);
   1026 		dst_p = kmap_atomic(dst_bv.bv_page);
   1027 
   1028 		memcpy(dst_p + dst_bv.bv_offset,
   1029 		       src_p + src_bv.bv_offset,
   1030 		       bytes);
   1031 
   1032 		kunmap_atomic(dst_p);
   1033 		kunmap_atomic(src_p);
   1034 
   1035 		flush_dcache_page(dst_bv.bv_page);
   1036 
   1037 		bio_advance_iter(src, src_iter, bytes);
   1038 		bio_advance_iter(dst, dst_iter, bytes);
   1039 	}
   1040 }
   1041 EXPORT_SYMBOL(bio_copy_data_iter);
   1042 
   1043 /**
   1044  * bio_copy_data - copy contents of data buffers from one bio to another
   1045  * @src: source bio
   1046  * @dst: destination bio
   1047  *
   1048  * Stops when it reaches the end of either @src or @dst - that is, copies
   1049  * min(src->bi_size, dst->bi_size) bytes (or the equivalent for lists of bios).
   1050  */
   1051 void bio_copy_data(struct bio *dst, struct bio *src)
   1052 {
   1053 	struct bvec_iter src_iter = src->bi_iter;
   1054 	struct bvec_iter dst_iter = dst->bi_iter;
   1055 
   1056 	bio_copy_data_iter(dst, &dst_iter, src, &src_iter);
   1057 }
   1058 EXPORT_SYMBOL(bio_copy_data);
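
/*
 * Illustrative sketch, not part of bio.c: a bounce-style completion handler
 * that copies data read into a private bio back into the original bio with
 * bio_copy_data().  The example_* names are hypothetical.
 */
#if 0	/* usage sketch only */
static void example_bounce_end_io(struct bio *bounce)
{
	struct bio *orig = bounce->bi_private;

	if (!bounce->bi_status && bio_data_dir(orig) == READ)
		bio_copy_data(orig, bounce);

	orig->bi_status = bounce->bi_status;
	bio_free_pages(bounce);
	bio_put(bounce);
	bio_endio(orig);
}
#endif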
   1059 
   1060 /**
   1061  * bio_list_copy_data - copy contents of data buffers from one chain of bios to
   1062  * another
   1063  * @src: source bio list
   1064  * @dst: destination bio list
   1065  *
   1066  * Stops when it reaches the end of either the @src list or @dst list - that is,
   1067  * copies min(src->bi_size, dst->bi_size) bytes (or the equivalent for lists of
   1068  * bios).
   1069  */
   1070 void bio_list_copy_data(struct bio *dst, struct bio *src)
   1071 {
   1072 	struct bvec_iter src_iter = src->bi_iter;
   1073 	struct bvec_iter dst_iter = dst->bi_iter;
   1074 
   1075 	while (1) {
   1076 		if (!src_iter.bi_size) {
   1077 			src = src->bi_next;
   1078 			if (!src)
   1079 				break;
   1080 
   1081 			src_iter = src->bi_iter;
   1082 		}
   1083 
   1084 		if (!dst_iter.bi_size) {
   1085 			dst = dst->bi_next;
   1086 			if (!dst)
   1087 				break;
   1088 
   1089 			dst_iter = dst->bi_iter;
   1090 		}
   1091 
   1092 		bio_copy_data_iter(dst, &dst_iter, src, &src_iter);
   1093 	}
   1094 }
   1095 EXPORT_SYMBOL(bio_list_copy_data);
   1096 
   1097 struct bio_map_data {
   1098 	int is_our_pages;
   1099 	struct iov_iter iter;
   1100 	struct iovec iov[];
   1101 };
   1102 
   1103 static struct bio_map_data *bio_alloc_map_data(struct iov_iter *data,
   1104 					       gfp_t gfp_mask)
   1105 {
   1106 	struct bio_map_data *bmd;
   1107 	if (data->nr_segs > UIO_MAXIOV)
   1108 		return NULL;
   1109 
   1110 	bmd = kmalloc(sizeof(struct bio_map_data) +
   1111 		       sizeof(struct iovec) * data->nr_segs, gfp_mask);
   1112 	if (!bmd)
   1113 		return NULL;
   1114 	memcpy(bmd->iov, data->iov, sizeof(struct iovec) * data->nr_segs);
   1115 	bmd->iter = *data;
   1116 	bmd->iter.iov = bmd->iov;
   1117 	return bmd;
   1118 }
   1119 
   1120 /**
   1121  * bio_copy_from_iter - copy all pages from iov_iter to bio
   1122  * @bio: The &struct bio which describes the I/O as destination
   1123  * @iter: iov_iter as source
   1124  *
   1125  * Copy all pages from iov_iter to bio.
   1126  * Returns 0 on success, or error on failure.
   1127  */
   1128 static int bio_copy_from_iter(struct bio *bio, struct iov_iter *iter)
   1129 {
   1130 	int i;
   1131 	struct bio_vec *bvec;
   1132 	struct bvec_iter_all iter_all;
   1133 
   1134 	bio_for_each_segment_all(bvec, bio, i, iter_all) {
   1135 		ssize_t ret;
   1136 
   1137 		ret = copy_page_from_iter(bvec->bv_page,
   1138 					  bvec->bv_offset,
   1139 					  bvec->bv_len,
   1140 					  iter);
   1141 
   1142 		if (!iov_iter_count(iter))
   1143 			break;
   1144 
   1145 		if (ret < bvec->bv_len)
   1146 			return -EFAULT;
   1147 	}
   1148 
   1149 	return 0;
   1150 }
   1151 
   1152 /**
   1153  * bio_copy_to_iter - copy all pages from bio to iov_iter
   1154  * @bio: The &struct bio which describes the I/O as source
   1155  * @iter: iov_iter as destination
   1156  *
   1157  * Copy all pages from bio to iov_iter.
   1158  * Returns 0 on success, or error on failure.
   1159  */
   1160 static int bio_copy_to_iter(struct bio *bio, struct iov_iter iter)
   1161 {
   1162 	int i;
   1163 	struct bio_vec *bvec;
   1164 	struct bvec_iter_all iter_all;
   1165 
   1166 	bio_for_each_segment_all(bvec, bio, i, iter_all) {
   1167 		ssize_t ret;
   1168 
   1169 		ret = copy_page_to_iter(bvec->bv_page,
   1170 					bvec->bv_offset,
   1171 					bvec->bv_len,
   1172 					&iter);
   1173 
   1174 		if (!iov_iter_count(&iter))
   1175 			break;
   1176 
   1177 		if (ret < bvec->bv_len)
   1178 			return -EFAULT;
   1179 	}
   1180 
   1181 	return 0;
   1182 }
   1183 
   1184 void bio_free_pages(struct bio *bio)
   1185 {
   1186 	struct bio_vec *bvec;
   1187 	int i;
   1188 	struct bvec_iter_all iter_all;
   1189 
   1190 	bio_for_each_segment_all(bvec, bio, i, iter_all)
   1191 		__free_page(bvec->bv_page);
   1192 }
   1193 EXPORT_SYMBOL(bio_free_pages);
   1194 
   1195 /**
   1196  *	bio_uncopy_user	-	finish previously mapped bio
   1197  *	@bio: bio being terminated
   1198  *
   1199  *	Free pages allocated from bio_copy_user_iov() and write back data
   1200  *	to user space in case of a read.
   1201  */
   1202 int bio_uncopy_user(struct bio *bio)
   1203 {
   1204 	struct bio_map_data *bmd = bio->bi_private;
   1205 	int ret = 0;
   1206 
   1207 	if (!bio_flagged(bio, BIO_NULL_MAPPED)) {
   1208 		/*
   1209 		 * if we're in a workqueue, the request is orphaned, so
   1210 		 * don't copy into a random user address space, just free
   1211 		 * and return -EINTR so user space doesn't expect any data.
   1212 		 */
   1213 		if (!current->mm)
   1214 			ret = -EINTR;
   1215 		else if (bio_data_dir(bio) == READ)
   1216 			ret = bio_copy_to_iter(bio, bmd->iter);
   1217 		if (bmd->is_our_pages)
   1218 			bio_free_pages(bio);
   1219 	}
   1220 	kfree(bmd);
   1221 	bio_put(bio);
   1222 	return ret;
   1223 }
   1224 
   1225 /**
   1226  *	bio_copy_user_iov	-	copy user data to bio
   1227  *	@q:		destination block queue
   1228  *	@map_data:	pointer to the rq_map_data holding pages (if necessary)
   1229  *	@iter:		iovec iterator
   1230  *	@gfp_mask:	memory allocation flags
   1231  *
   1232  *	Prepares and returns a bio for indirect user io, bouncing data
   1233  *	to/from kernel pages as necessary. Must be paired with a call to
   1234  *	bio_uncopy_user() on io completion.
   1235  */
   1236 struct bio *bio_copy_user_iov(struct request_queue *q,
   1237 			      struct rq_map_data *map_data,
   1238 			      struct iov_iter *iter,
   1239 			      gfp_t gfp_mask)
   1240 {
   1241 	struct bio_map_data *bmd;
   1242 	struct page *page;
   1243 	struct bio *bio;
   1244 	int i = 0, ret;
   1245 	int nr_pages;
   1246 	unsigned int len = iter->count;
   1247 	unsigned int offset = map_data ? offset_in_page(map_data->offset) : 0;
   1248 
   1249 	bmd = bio_alloc_map_data(iter, gfp_mask);
   1250 	if (!bmd)
   1251 		return ERR_PTR(-ENOMEM);
   1252 
   1253 	/*
   1254 	 * We need to do a deep copy of the iov_iter including the iovecs.
   1255 	 * The caller provided iov might point to an on-stack or otherwise
   1256 	 * shortlived one.
   1257 	 */
   1258 	bmd->is_our_pages = map_data ? 0 : 1;
   1259 
   1260 	nr_pages = DIV_ROUND_UP(offset + len, PAGE_SIZE);
   1261 	if (nr_pages > BIO_MAX_PAGES)
   1262 		nr_pages = BIO_MAX_PAGES;
   1263 
   1264 	ret = -ENOMEM;
   1265 	bio = bio_kmalloc(gfp_mask, nr_pages);
   1266 	if (!bio)
   1267 		goto out_bmd;
   1268 
   1269 	ret = 0;
   1270 
   1271 	if (map_data) {
   1272 		nr_pages = 1 << map_data->page_order;
   1273 		i = map_data->offset / PAGE_SIZE;
   1274 	}
   1275 	while (len) {
   1276 		unsigned int bytes = PAGE_SIZE;
   1277 
   1278 		bytes -= offset;
   1279 
   1280 		if (bytes > len)
   1281 			bytes = len;
   1282 
   1283 		if (map_data) {
   1284 			if (i == map_data->nr_entries * nr_pages) {
   1285 				ret = -ENOMEM;
   1286 				break;
   1287 			}
   1288 
   1289 			page = map_data->pages[i / nr_pages];
   1290 			page += (i % nr_pages);
   1291 
   1292 			i++;
   1293 		} else {
   1294 			page = alloc_page(q->bounce_gfp | gfp_mask);
   1295 			if (!page) {
   1296 				ret = -ENOMEM;
   1297 				break;
   1298 			}
   1299 		}
   1300 
   1301 		if (bio_add_pc_page(q, bio, page, bytes, offset) < bytes) {
   1302 			if (!map_data)
   1303 				__free_page(page);
   1304 			break;
   1305 		}
   1306 
   1307 		len -= bytes;
   1308 		offset = 0;
   1309 	}
   1310 
   1311 	if (ret)
   1312 		goto cleanup;
   1313 
   1314 	if (map_data)
   1315 		map_data->offset += bio->bi_iter.bi_size;
   1316 
   1317 	/*
   1318 	 * success
   1319 	 */
   1320 	if ((iov_iter_rw(iter) == WRITE && (!map_data || !map_data->null_mapped)) ||
   1321 	    (map_data && map_data->from_user)) {
   1322 		ret = bio_copy_from_iter(bio, iter);
   1323 		if (ret)
   1324 			goto cleanup;
   1325 	} else {
   1326 		if (bmd->is_our_pages)
   1327 			zero_fill_bio(bio);
   1328 		iov_iter_advance(iter, bio->bi_iter.bi_size);
   1329 	}
   1330 
   1331 	bio->bi_private = bmd;
   1332 	if (map_data && map_data->null_mapped)
   1333 		bio_set_flag(bio, BIO_NULL_MAPPED);
   1334 	return bio;
   1335 cleanup:
   1336 	if (!map_data)
   1337 		bio_free_pages(bio);
   1338 	bio_put(bio);
   1339 out_bmd:
   1340 	kfree(bmd);
   1341 	return ERR_PTR(ret);
   1342 }
   1343 
   1344 /**
   1345  *	bio_map_user_iov - map user iovec into bio
   1346  *	@q:		the struct request_queue for the bio
   1347  *	@iter:		iovec iterator
   1348  *	@gfp_mask:	memory allocation flags
   1349  *
   1350  *	Map the user space address into a bio suitable for io to a block
   1351  *	device. Returns an error pointer in case of error.
   1352  */
   1353 struct bio *bio_map_user_iov(struct request_queue *q,
   1354 			     struct iov_iter *iter,
   1355 			     gfp_t gfp_mask)
   1356 {
   1357 	int j;
   1358 	struct bio *bio;
   1359 	int ret;
   1360 	struct bio_vec *bvec;
   1361 	struct bvec_iter_all iter_all;
   1362 
   1363 	if (!iov_iter_count(iter))
   1364 		return ERR_PTR(-EINVAL);
   1365 
   1366 	bio = bio_kmalloc(gfp_mask, iov_iter_npages(iter, BIO_MAX_PAGES));
   1367 	if (!bio)
   1368 		return ERR_PTR(-ENOMEM);
   1369 
   1370 	while (iov_iter_count(iter)) {
   1371 		struct page **pages;
   1372 		ssize_t bytes;
   1373 		size_t offs, added = 0;
   1374 		int npages;
   1375 
   1376 		bytes = iov_iter_get_pages_alloc(iter, &pages, LONG_MAX, &offs);
   1377 		if (unlikely(bytes <= 0)) {
   1378 			ret = bytes ? bytes : -EFAULT;
   1379 			goto out_unmap;
   1380 		}
   1381 
   1382 		npages = DIV_ROUND_UP(offs + bytes, PAGE_SIZE);
   1383 
   1384 		if (unlikely(offs & queue_dma_alignment(q))) {
   1385 			ret = -EINVAL;
   1386 			j = 0;
   1387 		} else {
   1388 			for (j = 0; j < npages; j++) {
   1389 				struct page *page = pages[j];
   1390 				unsigned int n = PAGE_SIZE - offs;
   1391 				unsigned short prev_bi_vcnt = bio->bi_vcnt;
   1392 
   1393 				if (n > bytes)
   1394 					n = bytes;
   1395 
   1396 				if (!bio_add_pc_page(q, bio, page, n, offs))
   1397 					break;
   1398 
   1399 				/*
   1400 				 * check if vector was merged with previous
   1401 				 * drop page reference if needed
   1402 				 */
   1403 				if (bio->bi_vcnt == prev_bi_vcnt)
   1404 					put_page(page);
   1405 
   1406 				added += n;
   1407 				bytes -= n;
   1408 				offs = 0;
   1409 			}
   1410 			iov_iter_advance(iter, added);
   1411 		}
   1412 		/*
   1413 		 * release the pages we didn't map into the bio, if any
   1414 		 */
   1415 		while (j < npages)
   1416 			put_page(pages[j++]);
   1417 		kvfree(pages);
   1418 		/* couldn't stuff something into bio? */
   1419 		if (bytes)
   1420 			break;
   1421 	}
   1422 
   1423 	bio_set_flag(bio, BIO_USER_MAPPED);
   1424 
   1425 	/*
   1426 	 * subtle -- if bio_map_user_iov() ended up bouncing a bio,
   1427 	 * it would normally disappear when its bi_end_io is run.
   1428 	 * however, we need it for the unmap, so grab an extra
   1429 	 * reference to it
   1430 	 */
   1431 	bio_get(bio);
   1432 	return bio;
   1433 
   1434  out_unmap:
   1435 	bio_for_each_segment_all(bvec, bio, j, iter_all) {
   1436 		put_page(bvec->bv_page);
   1437 	}
   1438 	bio_put(bio);
   1439 	return ERR_PTR(ret);
   1440 }
   1441 
   1442 static void __bio_unmap_user(struct bio *bio)
   1443 {
   1444 	struct bio_vec *bvec;
   1445 	int i;
   1446 	struct bvec_iter_all iter_all;
   1447 
   1448 	/*
   1449 	 * make sure we dirty pages we wrote to
   1450 	 */
   1451 	bio_for_each_segment_all(bvec, bio, i, iter_all) {
   1452 		if (bio_data_dir(bio) == READ)
   1453 			set_page_dirty_lock(bvec->bv_page);
   1454 
   1455 		put_page(bvec->bv_page);
   1456 	}
   1457 
   1458 	bio_put(bio);
   1459 }
   1460 
   1461 /**
   1462  *	bio_unmap_user	-	unmap a bio
   1463  *	@bio:		the bio being unmapped
   1464  *
   1465  *	Unmap a bio previously mapped by bio_map_user_iov(). Must be called from
   1466  *	process context.
   1467  *
   1468  *	bio_unmap_user() may sleep.
   1469  */
   1470 void bio_unmap_user(struct bio *bio)
   1471 {
   1472 	__bio_unmap_user(bio);
   1473 	bio_put(bio);
   1474 }
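
/*
 * Illustrative sketch, not part of bio.c: mapping a single user buffer into
 * a bio for a passthrough-style request, then unmapping it again.
 * import_single_range() builds the iov_iter; the example_* names are
 * hypothetical.
 */
#if 0	/* usage sketch only */
static int example_map_user_buf(struct request_queue *q,
				void __user *ubuf, unsigned long len)
{
	struct iovec iov;
	struct iov_iter i;
	struct bio *bio;
	int ret;

	ret = import_single_range(READ, ubuf, len, &iov, &i);
	if (ret)
		return ret;

	bio = bio_map_user_iov(q, &i, GFP_KERNEL);
	if (IS_ERR(bio))
		return PTR_ERR(bio);

	/* ... issue the bio and wait for it to complete ... */

	bio_unmap_user(bio);	/* drops the mapping and the extra reference */
	return 0;
}
#endif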
   1475 
   1476 static void bio_map_kern_endio(struct bio *bio)
   1477 {
   1478 	bio_put(bio);
   1479 }
   1480 
   1481 /**
   1482  *	bio_map_kern	-	map kernel address into bio
   1483  *	@q: the struct request_queue for the bio
   1484  *	@data: pointer to buffer to map
   1485  *	@len: length in bytes
   1486  *	@gfp_mask: allocation flags for bio allocation
   1487  *
   1488  *	Map the kernel address into a bio suitable for io to a block
   1489  *	device. Returns an error pointer in case of error.
   1490  */
   1491 struct bio *bio_map_kern(struct request_queue *q, void *data, unsigned int len,
   1492 			 gfp_t gfp_mask)
   1493 {
   1494 	unsigned long kaddr = (unsigned long)data;
   1495 	unsigned long end = (kaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
   1496 	unsigned long start = kaddr >> PAGE_SHIFT;
   1497 	const int nr_pages = end - start;
   1498 	int offset, i;
   1499 	struct bio *bio;
   1500 
   1501 	bio = bio_kmalloc(gfp_mask, nr_pages);
   1502 	if (!bio)
   1503 		return ERR_PTR(-ENOMEM);
   1504 
   1505 	offset = offset_in_page(kaddr);
   1506 	for (i = 0; i < nr_pages; i++) {
   1507 		unsigned int bytes = PAGE_SIZE - offset;
   1508 
   1509 		if (len <= 0)
   1510 			break;
   1511 
   1512 		if (bytes > len)
   1513 			bytes = len;
   1514 
   1515 		if (bio_add_pc_page(q, bio, virt_to_page(data), bytes,
   1516 				    offset) < bytes) {
   1517 			/* we don't support partial mappings */
   1518 			bio_put(bio);
   1519 			return ERR_PTR(-EINVAL);
   1520 		}
   1521 
   1522 		data += bytes;
   1523 		len -= bytes;
   1524 		offset = 0;
   1525 	}
   1526 
   1527 	bio->bi_end_io = bio_map_kern_endio;
   1528 	return bio;
   1529 }
   1530 EXPORT_SYMBOL(bio_map_kern);
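
/*
 * Illustrative sketch, not part of bio.c: wrapping a kernel buffer in a bio
 * with bio_map_kern().  The buffer must stay allocated until the bio
 * completes; the name and direction choice are hypothetical.
 */
#if 0	/* usage sketch only */
static struct bio *example_map_kernel_buf(struct request_queue *q,
					  void *buf, unsigned int len)
{
	struct bio *bio;

	bio = bio_map_kern(q, buf, len, GFP_KERNEL);
	if (IS_ERR(bio))
		return bio;

	bio->bi_opf = REQ_OP_WRITE;	/* direction chosen by the caller */
	return bio;
}
#endif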
   1531 
   1532 static void bio_copy_kern_endio(struct bio *bio)
   1533 {
   1534 	bio_free_pages(bio);
   1535 	bio_put(bio);
   1536 }
   1537 
   1538 static void bio_copy_kern_endio_read(struct bio *bio)
   1539 {
   1540 	char *p = bio->bi_private;
   1541 	struct bio_vec *bvec;
   1542 	int i;
   1543 	struct bvec_iter_all iter_all;
   1544 
   1545 	bio_for_each_segment_all(bvec, bio, i, iter_all) {
   1546 		memcpy(p, page_address(bvec->bv_page), bvec->bv_len);
   1547 		p += bvec->bv_len;
   1548 	}
   1549 
   1550 	bio_copy_kern_endio(bio);
   1551 }
   1552 
   1553 /**
   1554  *	bio_copy_kern	-	copy kernel address into bio
   1555  *	@q: the struct request_queue for the bio
   1556  *	@data: pointer to buffer to copy
   1557  *	@len: length in bytes
   1558  *	@gfp_mask: allocation flags for bio and page allocation
   1559  *	@reading: data direction is READ
   1560  *
   1561  *	copy the kernel address into a bio suitable for io to a block
   1562  *	device. Returns an error pointer in case of error.
   1563  */
   1564 struct bio *bio_copy_kern(struct request_queue *q, void *data, unsigned int len,
   1565 			  gfp_t gfp_mask, int reading)
   1566 {
   1567 	unsigned long kaddr = (unsigned long)data;
   1568 	unsigned long end = (kaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
   1569 	unsigned long start = kaddr >> PAGE_SHIFT;
   1570 	struct bio *bio;
   1571 	void *p = data;
   1572 	int nr_pages = 0;
   1573 
   1574 	/*
   1575 	 * Overflow, abort
   1576 	 */
   1577 	if (end < start)
   1578 		return ERR_PTR(-EINVAL);
   1579 
   1580 	nr_pages = end - start;
   1581 	bio = bio_kmalloc(gfp_mask, nr_pages);
   1582 	if (!bio)
   1583 		return ERR_PTR(-ENOMEM);
   1584 
   1585 	while (len) {
   1586 		struct page *page;
   1587 		unsigned int bytes = PAGE_SIZE;
   1588 
   1589 		if (bytes > len)
   1590 			bytes = len;
   1591 
   1592 		page = alloc_page(q->bounce_gfp | gfp_mask);
   1593 		if (!page)
   1594 			goto cleanup;
   1595 
   1596 		if (!reading)
   1597 			memcpy(page_address(page), p, bytes);
   1598 
   1599 		if (bio_add_pc_page(q, bio, page, bytes, 0) < bytes)
   1600 			break;
   1601 
   1602 		len -= bytes;
   1603 		p += bytes;
   1604 	}
   1605 
   1606 	if (reading) {
   1607 		bio->bi_end_io = bio_copy_kern_endio_read;
   1608 		bio->bi_private = data;
   1609 	} else {
   1610 		bio->bi_end_io = bio_copy_kern_endio;
   1611 	}
   1612 
   1613 	return bio;
   1614 
   1615 cleanup:
   1616 	bio_free_pages(bio);
   1617 	bio_put(bio);
   1618 	return ERR_PTR(-ENOMEM);
   1619 }
   1620 
   1621 /*
   1622  * bio_set_pages_dirty() and bio_check_pages_dirty() are support functions
   1623  * for performing direct-IO in BIOs.
   1624  *
   1625  * The problem is that we cannot run set_page_dirty() from interrupt context
   1626  * because the required locks are not interrupt-safe.  So what we can do is to
   1627  * mark the pages dirty _before_ performing IO.  And in interrupt context,
   1628  * check that the pages are still dirty.   If so, fine.  If not, redirty them
   1629  * in process context.
   1630  *
   1631  * We special-case compound pages here: normally this means reads into hugetlb
   1632  * pages.  The logic in here doesn't really work right for compound pages
   1633  * because the VM does not uniformly chase down the head page in all cases.
   1634  * But dirtiness of compound pages is pretty meaningless anyway: the VM doesn't
   1635  * handle them at all.  So we skip compound pages here at an early stage.
   1636  *
   1637  * Note that this code is very hard to test under normal circumstances because
   1638  * direct-io pins the pages with get_user_pages().  This makes
   1639  * is_page_cache_freeable return false, and the VM will not clean the pages.
   1640  * But other code (eg, flusher threads) could clean the pages if they are mapped
   1641  * pagecache.
   1642  *
   1643  * Simply disabling the call to bio_set_pages_dirty() is a good way to test the
   1644  * deferred bio dirtying paths.
   1645  */
   1646 
   1647 /*
   1648  * bio_set_pages_dirty() will mark all the bio's pages as dirty.
   1649  */
   1650 void bio_set_pages_dirty(struct bio *bio)
   1651 {
   1652 	struct bio_vec *bvec;
   1653 	int i;
   1654 	struct bvec_iter_all iter_all;
   1655 
   1656 	bio_for_each_segment_all(bvec, bio, i, iter_all) {
   1657 		if (!PageCompound(bvec->bv_page))
   1658 			set_page_dirty_lock(bvec->bv_page);
   1659 	}
   1660 }
   1661 
   1662 static void bio_release_pages(struct bio *bio)
   1663 {
   1664 	struct bio_vec *bvec;
   1665 	int i;
   1666 	struct bvec_iter_all iter_all;
   1667 
   1668 	bio_for_each_segment_all(bvec, bio, i, iter_all)
   1669 		put_page(bvec->bv_page);
   1670 }
   1671 
   1672 /*
   1673  * bio_check_pages_dirty() will check that all the BIO's pages are still dirty.
   1674  * If they are, then fine.  If, however, some pages are clean then they must
   1675  * have been written out during the direct-IO read.  So we take another ref on
   1676  * the BIO and re-dirty the pages in process context.
   1677  *
   1678  * It is expected that bio_check_pages_dirty() will wholly own the BIO from
   1679  * here on.  It will run one put_page() against each page and will run one
   1680  * bio_put() against the BIO.
   1681  */
   1682 
   1683 static void bio_dirty_fn(struct work_struct *work);
   1684 
   1685 static DECLARE_WORK(bio_dirty_work, bio_dirty_fn);
   1686 static DEFINE_SPINLOCK(bio_dirty_lock);
   1687 static struct bio *bio_dirty_list;
   1688 
   1689 /*
   1690  * This runs in process context
   1691  */
   1692 static void bio_dirty_fn(struct work_struct *work)
   1693 {
   1694 	struct bio *bio, *next;
   1695 
   1696 	spin_lock_irq(&bio_dirty_lock);
   1697 	next = bio_dirty_list;
   1698 	bio_dirty_list = NULL;
   1699 	spin_unlock_irq(&bio_dirty_lock);
   1700 
   1701 	while ((bio = next) != NULL) {
   1702 		next = bio->bi_private;
   1703 
   1704 		bio_set_pages_dirty(bio);
   1705 		if (!bio_flagged(bio, BIO_NO_PAGE_REF))
   1706 			bio_release_pages(bio);
   1707 		bio_put(bio);
   1708 	}
   1709 }
   1710 
   1711 void bio_check_pages_dirty(struct bio *bio)
   1712 {
   1713 	struct bio_vec *bvec;
   1714 	unsigned long flags;
   1715 	int i;
   1716 	struct bvec_iter_all iter_all;
   1717 
   1718 	bio_for_each_segment_all(bvec, bio, i, iter_all) {
   1719 		if (!PageDirty(bvec->bv_page) && !PageCompound(bvec->bv_page))
   1720 			goto defer;
   1721 	}
   1722 
   1723 	if (!bio_flagged(bio, BIO_NO_PAGE_REF))
   1724 		bio_release_pages(bio);
   1725 	bio_put(bio);
   1726 	return;
   1727 defer:
   1728 	spin_lock_irqsave(&bio_dirty_lock, flags);
   1729 	bio->bi_private = bio_dirty_list;
   1730 	bio_dirty_list = bio;
   1731 	spin_unlock_irqrestore(&bio_dirty_lock, flags);
   1732 	schedule_work(&bio_dirty_work);
   1733 }
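
        /*
         * Illustrative sketch (hypothetical names): how a direct-IO style
         * completion handler might use the helpers above.  It assumes page
         * references were taken at submission time (e.g. via
         * bio_iov_iter_get_pages()).  For READs into user pages the bio is
         * handed to bio_check_pages_dirty(), which re-dirties the pages
         * (deferring to process context if needed) and then owns the final
         * put_page()/bio_put(); for WRITEs the references are dropped here.
         */
        #if 0
        static void example_dio_end_io(struct bio *bio)
        {
        	if (bio_data_dir(bio) == READ) {
        		/* Transfers ownership of the pages and the bio. */
        		bio_check_pages_dirty(bio);
        	} else {
        		struct bio_vec *bvec;
        		int i;
        		struct bvec_iter_all iter_all;

        		bio_for_each_segment_all(bvec, bio, i, iter_all)
        			put_page(bvec->bv_page);
        		bio_put(bio);
        	}
        }
        #endif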
   1734 
   1735 void update_io_ticks(struct hd_struct *part, unsigned long now)
   1736 {
   1737 	unsigned long stamp;
   1738 again:
   1739 	stamp = READ_ONCE(part->stamp);
   1740 	if (unlikely(stamp != now)) {
   1741 		if (likely(cmpxchg(&part->stamp, stamp, now) == stamp)) {
   1742 			__part_stat_add(part, io_ticks, 1);
   1743 		}
   1744 	}
   1745 	if (part->partno) {
   1746 		part = &part_to_disk(part)->part0;
   1747 		goto again;
   1748 	}
   1749 }
   1750 
   1751 void generic_start_io_acct(struct request_queue *q, int op,
   1752 			   unsigned long sectors, struct hd_struct *part)
   1753 {
   1754 	const int sgrp = op_stat_group(op);
   1755 
   1756 	part_stat_lock();
   1757 
   1758 	update_io_ticks(part, jiffies);
   1759 	part_stat_inc(part, ios[sgrp]);
   1760 	part_stat_add(part, sectors[sgrp], sectors);
   1761 	part_inc_in_flight(q, part, op_is_write(op));
   1762 
   1763 	part_stat_unlock();
   1764 }
   1765 EXPORT_SYMBOL(generic_start_io_acct);
   1766 
   1767 void generic_end_io_acct(struct request_queue *q, int req_op,
   1768 			 struct hd_struct *part, unsigned long start_time)
   1769 {
   1770 	unsigned long now = jiffies;
   1771 	unsigned long duration = now - start_time;
   1772 	const int sgrp = op_stat_group(req_op);
   1773 
   1774 	part_stat_lock();
   1775 
   1776 	update_io_ticks(part, now);
   1777 	part_stat_add(part, nsecs[sgrp], jiffies_to_nsecs(duration));
   1778 	part_stat_add(part, time_in_queue, duration);
   1779 	part_dec_in_flight(q, part, op_is_write(req_op));
   1780 
   1781 	part_stat_unlock();
   1782 }
   1783 EXPORT_SYMBOL(generic_end_io_acct);
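
        /*
         * Illustrative sketch (hypothetical example_dev and request handler):
         * per-bio disk statistics in a synchronous bio-based driver,
         * bracketing the work with generic_start_io_acct() and
         * generic_end_io_acct() against the whole-device partition.
         */
        #if 0
        struct example_dev {
        	struct gendisk *disk;
        };

        static blk_qc_t example_make_request(struct request_queue *q, struct bio *bio)
        {
        	struct example_dev *dev = q->queuedata;	/* hypothetical driver state */
        	struct hd_struct *part = &dev->disk->part0;
        	unsigned long start_time = jiffies;

        	generic_start_io_acct(q, bio_op(bio), bio_sectors(bio), part);

        	/* ... synchronously service the bio here (device-specific) ... */

        	generic_end_io_acct(q, bio_op(bio), part, start_time);
        	bio_endio(bio);
        	return BLK_QC_T_NONE;
        }
        #endif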
   1784 
   1785 #if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE
   1786 void bio_flush_dcache_pages(struct bio *bi)
   1787 {
   1788 	struct bio_vec bvec;
   1789 	struct bvec_iter iter;
   1790 
   1791 	bio_for_each_segment(bvec, bi, iter)
   1792 		flush_dcache_page(bvec.bv_page);
   1793 }
   1794 EXPORT_SYMBOL(bio_flush_dcache_pages);
   1795 #endif
   1796 
   1797 static inline bool bio_remaining_done(struct bio *bio)
   1798 {
   1799 	/*
   1800 	 * If we're not chaining, then ->__bi_remaining is always 1 and
   1801 	 * we always end io on the first invocation.
   1802 	 */
   1803 	if (!bio_flagged(bio, BIO_CHAIN))
   1804 		return true;
   1805 
   1806 	BUG_ON(atomic_read(&bio->__bi_remaining) <= 0);
   1807 
   1808 	if (atomic_dec_and_test(&bio->__bi_remaining)) {
   1809 		bio_clear_flag(bio, BIO_CHAIN);
   1810 		return true;
   1811 	}
   1812 
   1813 	return false;
   1814 }
   1815 
   1816 /**
   1817  * bio_endio - end I/O on a bio
   1818  * @bio:	bio
   1819  *
   1820  * Description:
   1821  *   bio_endio() will end I/O on the whole bio. bio_endio() is the preferred
   1822  *   way to end I/O on a bio. No one should call bi_end_io() directly on a
   1823  *   bio unless they own it and thus know that it has an end_io function.
   1824  *
   1825  *   bio_endio() can be called several times on a bio that has been chained
   1826  *   using bio_chain().  The ->bi_end_io() function will only be called the
   1827  *   last time.  At this point the BLK_TA_COMPLETE tracing event will be
   1828  *   generated if BIO_TRACE_COMPLETION is set.
   1829  **/
   1830 void bio_endio(struct bio *bio)
   1831 {
   1832 again:
   1833 	if (!bio_remaining_done(bio))
   1834 		return;
   1835 	if (!bio_integrity_endio(bio))
   1836 		return;
   1837 
   1838 	if (bio->bi_disk)
   1839 		rq_qos_done_bio(bio->bi_disk->queue, bio);
   1840 
   1841 	/*
   1842 	 * Need to have a real endio function for chained bios, otherwise
   1843 	 * various corner cases will break (like stacking block devices that
   1844 	 * save/restore bi_end_io) - however, we want to avoid unbounded
   1845 	 * recursion and blowing the stack. Tail call optimization would
   1846 	 * handle this, but compiling with frame pointers also disables
   1847 	 * gcc's sibling call optimization.
   1848 	 */
   1849 	if (bio->bi_end_io == bio_chain_endio) {
   1850 		bio = __bio_chain_endio(bio);
   1851 		goto again;
   1852 	}
   1853 
   1854 	if (bio->bi_disk && bio_flagged(bio, BIO_TRACE_COMPLETION)) {
   1855 		trace_block_bio_complete(bio->bi_disk->queue, bio,
   1856 					 blk_status_to_errno(bio->bi_status));
   1857 		bio_clear_flag(bio, BIO_TRACE_COMPLETION);
   1858 	}
   1859 
   1860 	blk_throtl_bio_endio(bio);
   1861 	/* release cgroup info */
   1862 	bio_uninit(bio);
   1863 	if (bio->bi_end_io)
   1864 		bio->bi_end_io(bio);
   1865 }
   1866 EXPORT_SYMBOL(bio_endio);
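
        /*
         * Illustrative sketch (hypothetical names): a typical ->bi_end_io
         * handler and a synchronous submitter, roughly the pattern used by
         * submit_bio_wait().  bio_endio() is what eventually invokes the
         * handler; the handler records bio->bi_status and wakes the waiter,
         * while the submitter keeps ownership of the bio and frees it.
         */
        #if 0
        struct example_io {
        	struct completion done;
        	blk_status_t status;
        };

        static void example_end_io(struct bio *bio)
        {
        	struct example_io *io = bio->bi_private;

        	io->status = bio->bi_status;	/* record the result */
        	complete(&io->done);		/* wake the submitter */
        }

        static blk_status_t example_sync_rw(struct bio *bio)
        {
        	struct example_io io;

        	init_completion(&io.done);
        	bio->bi_private = &io;
        	bio->bi_end_io = example_end_io;

        	submit_bio(bio);
        	wait_for_completion_io(&io.done);
        	return io.status;		/* caller still owns and puts @bio */
        }
        #endif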
   1867 
   1868 /**
   1869  * bio_split - split a bio
   1870  * @bio:	bio to split
   1871  * @sectors:	number of sectors to split from the front of @bio
   1872  * @gfp:	gfp mask
   1873  * @bs:		bio set to allocate from
   1874  *
   1875  * Allocates and returns a new bio which represents @sectors from the start of
   1876  * @bio, and updates @bio to represent the remaining sectors.
   1877  *
   1878  * Unless this is a discard request, the newly allocated bio will point
   1879  * to @bio's bi_io_vec; it is the caller's responsibility to ensure that
   1880  * @bio is not freed before the split.
   1881  */
   1882 struct bio *bio_split(struct bio *bio, int sectors,
   1883 		      gfp_t gfp, struct bio_set *bs)
   1884 {
   1885 	struct bio *split;
   1886 
   1887 	BUG_ON(sectors <= 0);
   1888 	BUG_ON(sectors >= bio_sectors(bio));
   1889 
   1890 	split = bio_clone_fast(bio, gfp, bs);
   1891 	if (!split)
   1892 		return NULL;
   1893 
   1894 	split->bi_iter.bi_size = sectors << 9;
   1895 
   1896 	if (bio_integrity(split))
   1897 		bio_integrity_trim(split);
   1898 
   1899 	bio_advance(bio, split->bi_iter.bi_size);
   1900 
   1901 	if (bio_flagged(bio, BIO_TRACE_COMPLETION))
   1902 		bio_set_flag(split, BIO_TRACE_COMPLETION);
   1903 
   1904 	return split;
   1905 }
   1906 EXPORT_SYMBOL(bio_split);
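
        /*
         * Illustrative sketch (hypothetical function and limit): bounding a
         * bio to a per-device sector limit, the same pattern blk_queue_split()
         * uses.  The front part is chained to the remainder so the original
         * completion fires only once both halves are done; the remainder is
         * requeued and the front part submitted.  A real driver would
         * normally split from its own bio_set rather than fs_bio_set.
         */
        #if 0
        static void example_submit_bounded(struct bio *bio, unsigned int example_max_sectors)
        {
        	if (bio_sectors(bio) > example_max_sectors) {
        		struct bio *split;

        		split = bio_split(bio, example_max_sectors, GFP_NOIO,
        				  &fs_bio_set);
        		bio_chain(split, bio);
        		generic_make_request(bio);	/* requeue the remainder */
        		bio = split;
        	}
        	generic_make_request(bio);
        }
        #endif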
   1907 
   1908 /**
   1909  * bio_trim - trim a bio
   1910  * @bio:	bio to trim
   1911  * @offset:	number of sectors to trim from the front of @bio
   1912  * @size:	size we want to trim @bio to, in sectors
   1913  */
   1914 void bio_trim(struct bio *bio, int offset, int size)
   1915 {
   1916 	/* 'bio' is a cloned bio which we need to trim to match
   1917 	 * the given offset and size.
   1918 	 */
   1919 
   1920 	size <<= 9;
   1921 	if (offset == 0 && size == bio->bi_iter.bi_size)
   1922 		return;
   1923 
   1924 	bio_clear_flag(bio, BIO_SEG_VALID);
   1925 
   1926 	bio_advance(bio, offset << 9);
   1927 
   1928 	bio->bi_iter.bi_size = size;
   1929 
   1930 	if (bio_integrity(bio))
   1931 		bio_integrity_trim(bio);
   1932 
   1933 }
   1934 EXPORT_SYMBOL_GPL(bio_trim);
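
        /*
         * Illustrative sketch (hypothetical names): cloning a bio and trimming
         * the clone to a sub-range before remapping it, roughly what stacking
         * drivers do with their own bio_set.  @example_offset and @example_len
         * are in sectors.
         */
        #if 0
        static struct bio *example_clone_subrange(struct bio *bio,
        					  int example_offset, int example_len)
        {
        	struct bio *clone = bio_clone_fast(bio, GFP_NOIO, &fs_bio_set);

        	if (!clone)
        		return NULL;

        	/* Keep only example_len sectors, starting example_offset in. */
        	bio_trim(clone, example_offset, example_len);
        	return clone;
        }
        #endif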
   1935 
   1936 /*
   1937  * Create memory pools for biovecs in a bio_set.
   1938  * Use the global biovec slabs created for general use.
   1939  */
   1940 int biovec_init_pool(mempool_t *pool, int pool_entries)
   1941 {
   1942 	struct biovec_slab *bp = bvec_slabs + BVEC_POOL_MAX;
   1943 
   1944 	return mempool_init_slab_pool(pool, pool_entries, bp->slab);
   1945 }
   1946 
   1947 /*
   1948  * bioset_exit - exit a bioset initialized with bioset_init()
   1949  *
   1950  * May be called on a zeroed but uninitialized bioset (i.e. allocated with
   1951  * kzalloc()).
   1952  */
   1953 void bioset_exit(struct bio_set *bs)
   1954 {
   1955 	if (bs->rescue_workqueue)
   1956 		destroy_workqueue(bs->rescue_workqueue);
   1957 	bs->rescue_workqueue = NULL;
   1958 
   1959 	mempool_exit(&bs->bio_pool);
   1960 	mempool_exit(&bs->bvec_pool);
   1961 
   1962 	bioset_integrity_free(bs);
   1963 	if (bs->bio_slab)
   1964 		bio_put_slab(bs);
   1965 	bs->bio_slab = NULL;
   1966 }
   1967 EXPORT_SYMBOL(bioset_exit);
   1968 
   1969 /**
   1970  * bioset_init - Initialize a bio_set
   1971  * @bs:		pool to initialize
   1972  * @pool_size:	Number of bio and bio_vecs to cache in the mempool
   1973  * @front_pad:	Number of bytes to allocate in front of the returned bio
   1974  * @flags:	Flags to modify behavior, currently %BIOSET_NEED_BVECS
   1975  *              and %BIOSET_NEED_RESCUER
   1976  *
   1977  * Description:
   1978  *    Set up a bio_set to be used with @bio_alloc_bioset. Allows the caller
   1979  *    to ask for a number of bytes to be allocated in front of the bio.
   1980  *    Front pad allocation is useful for embedding the bio inside
   1981  *    another structure, to avoid allocating extra data to go with the bio.
   1982  *    Note that the bio must always be embedded at the END of that structure,
   1983  *    or things will break badly.
   1984  *    If %BIOSET_NEED_BVECS is set in @flags, a separate pool will be allocated
   1985  *    for allocating iovecs.  This pool is not needed e.g. for bio_clone_fast().
   1986  *    If %BIOSET_NEED_RESCUER is set, a workqueue is created which can be used to
   1987  *    dispatch queued requests when the mempool runs out of space.
   1988  *
   1989  */
   1990 int bioset_init(struct bio_set *bs,
   1991 		unsigned int pool_size,
   1992 		unsigned int front_pad,
   1993 		int flags)
   1994 {
   1995 	unsigned int back_pad = BIO_INLINE_VECS * sizeof(struct bio_vec);
   1996 
   1997 	bs->front_pad = front_pad;
   1998 
   1999 	spin_lock_init(&bs->rescue_lock);
   2000 	bio_list_init(&bs->rescue_list);
   2001 	INIT_WORK(&bs->rescue_work, bio_alloc_rescue);
   2002 
   2003 	bs->bio_slab = bio_find_or_create_slab(front_pad + back_pad);
   2004 	if (!bs->bio_slab)
   2005 		return -ENOMEM;
   2006 
   2007 	if (mempool_init_slab_pool(&bs->bio_pool, pool_size, bs->bio_slab))
   2008 		goto bad;
   2009 
   2010 	if ((flags & BIOSET_NEED_BVECS) &&
   2011 	    biovec_init_pool(&bs->bvec_pool, pool_size))
   2012 		goto bad;
   2013 
   2014 	if (!(flags & BIOSET_NEED_RESCUER))
   2015 		return 0;
   2016 
   2017 	bs->rescue_workqueue = alloc_workqueue("bioset", WQ_MEM_RECLAIM, 0);
   2018 	if (!bs->rescue_workqueue)
   2019 		goto bad;
   2020 
   2021 	return 0;
   2022 bad:
   2023 	bioset_exit(bs);
   2024 	return -ENOMEM;
   2025 }
   2026 EXPORT_SYMBOL(bioset_init);
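
        /*
         * Illustrative sketch (hypothetical structure and bio_set): using
         * front_pad to co-allocate per-I/O driver state with the bio.  Because
         * the bio must sit at the end of the containing structure, front_pad
         * is simply offsetof() of the bio member, and container_of() recovers
         * the driver state from the bio returned by bio_alloc_bioset().
         */
        #if 0
        struct example_request {
        	void *private_data;
        	struct bio bio;		/* must be the last member */
        };

        static struct bio_set example_bs;

        static int __init example_init(void)
        {
        	return bioset_init(&example_bs, BIO_POOL_SIZE,
        			   offsetof(struct example_request, bio),
        			   BIOSET_NEED_BVECS);
        }

        static struct example_request *example_alloc(unsigned int nr_vecs)
        {
        	struct bio *bio = bio_alloc_bioset(GFP_NOIO, nr_vecs, &example_bs);

        	return bio ? container_of(bio, struct example_request, bio) : NULL;
        }
        #endif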
   2027 
   2028 /*
   2029  * Initialize and setup a new bio_set, based on the settings from
   2030  * another bio_set.
   2031  */
   2032 int bioset_init_from_src(struct bio_set *bs, struct bio_set *src)
   2033 {
   2034 	int flags;
   2035 
   2036 	flags = 0;
   2037 	if (src->bvec_pool.min_nr)
   2038 		flags |= BIOSET_NEED_BVECS;
   2039 	if (src->rescue_workqueue)
   2040 		flags |= BIOSET_NEED_RESCUER;
   2041 
   2042 	return bioset_init(bs, src->bio_pool.min_nr, src->front_pad, flags);
   2043 }
   2044 EXPORT_SYMBOL(bioset_init_from_src);
   2045 
   2046 #ifdef CONFIG_BLK_CGROUP
   2047 
   2048 /**
   2049  * bio_disassociate_blkg - puts back the blkg reference if associated
   2050  * @bio: target bio
   2051  *
   2052  * Helper to disassociate the blkg from @bio if a blkg is associated.
   2053  */
   2054 void bio_disassociate_blkg(struct bio *bio)
   2055 {
   2056 	if (bio->bi_blkg) {
   2057 		blkg_put(bio->bi_blkg);
   2058 		bio->bi_blkg = NULL;
   2059 	}
   2060 }
   2061 EXPORT_SYMBOL_GPL(bio_disassociate_blkg);
   2062 
   2063 /**
   2064  * __bio_associate_blkg - associate a bio with a blkg
   2065  * @bio: target bio
   2066  * @blkg: the blkg to associate
   2067  *
   2068  * This tries to associate @bio with the specified @blkg.  Association failure
   2069  * is handled by walking up the blkg tree.  Therefore, the blkg associated can
   2070  * be anything between @blkg and the root_blkg.  This situation only happens
   2071  * when a cgroup is dying and then the remaining bios will spill to the closest
   2072  * alive blkg.
   2073  *
   2074  * A reference will be taken on the @blkg and will be released when @bio is
   2075  * freed.
   2076  */
   2077 static void __bio_associate_blkg(struct bio *bio, struct blkcg_gq *blkg)
   2078 {
   2079 	bio_disassociate_blkg(bio);
   2080 
   2081 	bio->bi_blkg = blkg_tryget_closest(blkg);
   2082 }
   2083 
   2084 /**
   2085  * bio_associate_blkg_from_css - associate a bio with a specified css
   2086  * @bio: target bio
   2087  * @css: target css
   2088  *
   2089  * Associate @bio with the blkg found by combining the css's blkg and the
   2090  * request_queue of the @bio.  This falls back to the queue's root_blkg if
   2091  * the association fails with the css.
   2092  */
   2093 void bio_associate_blkg_from_css(struct bio *bio,
   2094 				 struct cgroup_subsys_state *css)
   2095 {
   2096 	struct request_queue *q = bio->bi_disk->queue;
   2097 	struct blkcg_gq *blkg;
   2098 
   2099 	rcu_read_lock();
   2100 
   2101 	if (!css || !css->parent)
   2102 		blkg = q->root_blkg;
   2103 	else
   2104 		blkg = blkg_lookup_create(css_to_blkcg(css), q);
   2105 
   2106 	__bio_associate_blkg(bio, blkg);
   2107 
   2108 	rcu_read_unlock();
   2109 }
   2110 EXPORT_SYMBOL_GPL(bio_associate_blkg_from_css);
   2111 
   2112 #ifdef CONFIG_MEMCG
   2113 /**
   2114  * bio_associate_blkg_from_page - associate a bio with the page's blkg
   2115  * @bio: target bio
   2116  * @page: the page to lookup the blkcg from
   2117  *
   2118  * Associate @bio with the blkg from @page's owning memcg and the respective
   2119  * request_queue.  If cgroup_e_css returns %NULL, fall back to the queue's
   2120  * root_blkg.
   2121  */
   2122 void bio_associate_blkg_from_page(struct bio *bio, struct page *page)
   2123 {
   2124 	struct cgroup_subsys_state *css;
   2125 
   2126 	if (!page->mem_cgroup)
   2127 		return;
   2128 
   2129 	rcu_read_lock();
   2130 
   2131 	css = cgroup_e_css(page->mem_cgroup->css.cgroup, &io_cgrp_subsys);
   2132 	bio_associate_blkg_from_css(bio, css);
   2133 
   2134 	rcu_read_unlock();
   2135 }
   2136 #endif /* CONFIG_MEMCG */
   2137 
   2138 /**
   2139  * bio_associate_blkg - associate a bio with a blkg
   2140  * @bio: target bio
   2141  *
   2142  * Associate @bio with the blkg found from the bio's css and request_queue.
   2143  * If one is not found, blkg_lookup_create() creates the blkg.  If a blkg is
   2144  * already associated, the css is reused and association redone as the
   2145  * request_queue may have changed.
   2146  */
   2147 void bio_associate_blkg(struct bio *bio)
   2148 {
   2149 	struct cgroup_subsys_state *css;
   2150 
   2151 	rcu_read_lock();
   2152 
   2153 	if (bio->bi_blkg)
   2154 		css = &bio_blkcg(bio)->css;
   2155 	else
   2156 		css = blkcg_css();
   2157 
   2158 	bio_associate_blkg_from_css(bio, css);
   2159 
   2160 	rcu_read_unlock();
   2161 }
   2162 EXPORT_SYMBOL_GPL(bio_associate_blkg);
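
        /*
         * Illustrative sketch (hypothetical remap helper): a stacking driver
         * redirecting a bio to a lower device.  After bio_set_dev() switches
         * bio->bi_disk, bio_associate_blkg() redoes the association against
         * the new request_queue while reusing the existing css, as described
         * above.  (On kernels where bio_set_dev() already calls
         * bio_associate_blkg(), the explicit call is redundant but harmless.)
         */
        #if 0
        static void example_remap(struct bio *bio, struct block_device *lower_bdev)
        {
        	bio_set_dev(bio, lower_bdev);
        	bio_associate_blkg(bio);	/* re-associate with the new queue */
        	generic_make_request(bio);
        }
        #endif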
   2163 
   2164 /**
   2165  * bio_clone_blkg_association - clone blkg association from src to dst bio
   2166  * @dst: destination bio
   2167  * @src: source bio
   2168  */
   2169 void bio_clone_blkg_association(struct bio *dst, struct bio *src)
   2170 {
   2171 	rcu_read_lock();
   2172 
   2173 	if (src->bi_blkg)
   2174 		__bio_associate_blkg(dst, src->bi_blkg);
   2175 
   2176 	rcu_read_unlock();
   2177 }
   2178 EXPORT_SYMBOL_GPL(bio_clone_blkg_association);
   2179 #endif /* CONFIG_BLK_CGROUP */
   2180 
   2181 static void __init biovec_init_slabs(void)
   2182 {
   2183 	int i;
   2184 
   2185 	for (i = 0; i < BVEC_POOL_NR; i++) {
   2186 		int size;
   2187 		struct biovec_slab *bvs = bvec_slabs + i;
   2188 
   2189 		if (bvs->nr_vecs <= BIO_INLINE_VECS) {
   2190 			bvs->slab = NULL;
   2191 			continue;
   2192 		}
   2193 
   2194 		size = bvs->nr_vecs * sizeof(struct bio_vec);
   2195 		bvs->slab = kmem_cache_create(bvs->name, size, 0,
   2196 					      SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
   2197 	}
   2198 }
   2199 
   2200 static int __init init_bio(void)
   2201 {
   2202 	bio_slab_max = 2;
   2203 	bio_slab_nr = 0;
   2204 	bio_slabs = kcalloc(bio_slab_max, sizeof(struct bio_slab),
   2205 			    GFP_KERNEL);
   2206 	if (!bio_slabs)
   2207 		panic("bio: can't allocate bios\n");
   2208 
   2209 	bio_integrity_init();
   2210 	biovec_init_slabs();
   2211 
   2212 	if (bioset_init(&fs_bio_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS))
   2213 		panic("bio: can't allocate bios\n");
   2214 
   2215 	if (bioset_integrity_create(&fs_bio_set, BIO_POOL_SIZE))
   2216 		panic("bio: can't create integrity pool\n");
   2217 
   2218 	return 0;
   2219 }
   2220 subsys_initcall(init_bio);