whiterose

linux unikernel
Log | Files | Refs | README | LICENSE | git clone https://git.ne02ptzero.me/git/whiterose

commit 32ee34eddad13cd44ad0cb3e659fe6fd49143b62
parent 7bbbf2c2fc84dd24cae8615b11a0c6cac12cbe94
Author: Linus Torvalds <torvalds@linux-foundation.org>
Date:   Thu, 27 Dec 2018 16:44:40 -0800

Merge tag 'for-4.21-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux

Pull btrfs updates from David Sterba:
 "New features:

   - swapfile support - after a long time it's here, with some
     limitations where COW design does not work well with the swap
     implementation (nodatacow file, no compression, cannot be
     snapshotted, not possible on multiple devices, ...), as this is the
     most restricted but working setup, we'll try to improve that in the
     future

   - metadata uuid - an optional incompat feature to assign a new
     filesystem UUID without overwriting all metadata blocks, stored
     only in superblock

   - more balance messages are printed to system log, initial is in the
     format of the command line that would be used to start it

  Fixes:

   - tag pages of a snapshot to better separate pages that are involved
     in the snapshot (and need to get synced) from newly dirtied pages
     that could slow down or even livelock the snapshot operation

   - improved check of filesystem id associated with a device during
     scan to detect duplicate devices that could be mixed up during
     mount

   - fix device replace state transitions, eg. when it ends up
     interrupted and reboot tries to restart balance too, or when
     start/cancel ioctls race

   - fix a crash due to a race when quotas are enabled during snapshot
     creation

   - GFP_NOFS/memalloc_nofs_* fixes due to GFP_KERNEL allocations in
     transaction context

   - fix fsync of files with multiple hard links in new directories

   - fix race of send with transaction commits that create snapshots

  Core changes:

   - cleanups:
      * further removals of now-dead fsync code
      * core function for finding free extent has been split and
        provides a base for further cleanups to make the logic more
        understandable
      * removed lot of indirect callbacks for data and metadata inodes
      * simplified refcounting and locking for cloned extent buffers
      * removed redundant function arguments
      * defines converted to enums where appropriate

   - separate reserve for delayed refs from global reserve, update logic
     to do less trickery and ad-hoc heuristics, move out some related
     expensive operations from transaction commit or file truncate

   - dev-replace switched from custom locking scheme to semaphore

   - remove first phase of balance that tried to make some space for the
     relocation by calling shrink and grow, this did not work as
     expected and only introduced more error states due to potential
     resize failures, slightly improves the runtime as the chunks on all
     devices are not needlessly enumerated

   - clone and deduplication now use generic helper that adds a few more
     checks that were missing from the original btrfs implementation of
     the ioctls"

* tag 'for-4.21-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux: (125 commits)
  btrfs: Fix typos in comments and strings
  btrfs: improve error handling of btrfs_add_link
  Btrfs: use generic_remap_file_range_prep() for cloning and deduplication
  btrfs: Refactor main loop in extent_readpages
  btrfs: Remove 1st shrink/grow phase from balance
  Btrfs: send, fix race with transaction commits that create snapshots
  Btrfs: use nofs context when initializing security xattrs to avoid deadlock
  btrfs: run delayed items before dropping the snapshot
  btrfs: catch cow on deleting snapshots
  btrfs: extent-tree: cleanup one-shot usage of @blocksize in do_walk_down
  Btrfs: scrub, move setup of nofs contexts higher in the stack
  btrfs: scrub: move scrub_setup_ctx allocation out of device_list_mutex
  btrfs: scrub: pass fs_info to scrub_setup_ctx
  btrfs: fix truncate throttling
  btrfs: don't run delayed refs in the end transaction logic
  btrfs: rework btrfs_check_space_for_delayed_refs
  btrfs: add new flushing states for the delayed refs rsv
  btrfs: update may_commit_transaction to use the delayed refs rsv
  btrfs: introduce delayed_refs_rsv
  btrfs: only track ref_heads in delayed_ref_updates
  ...

Diffstat:
Mfs/btrfs/backref.c | 13++-----------
Mfs/btrfs/btrfs_inode.h | 14+++++++++++++-
Mfs/btrfs/check-integrity.c | 24++++++++++++------------
Mfs/btrfs/compression.c | 26+++++++++++---------------
Mfs/btrfs/ctree.c | 46++++++++++++++++++++++++++++++++--------------
Mfs/btrfs/ctree.h | 263++++++++++++++++++++++++++++++++++++++++++++++++++++---------------------------
Mfs/btrfs/delayed-ref.c | 61++++++++++++++++++++++++++++++++++++++++++++++++++++---------
Mfs/btrfs/delayed-ref.h | 3++-
Mfs/btrfs/dev-replace.c | 191+++++++++++++++++++++++++++++++++++++++++--------------------------------------
Mfs/btrfs/dev-replace.h | 8--------
Mfs/btrfs/disk-io.c | 117+++++++++++++++++++++++++++++++++++++++++++++++++++++++------------------------
Mfs/btrfs/disk-io.h | 10+++++-----
Mfs/btrfs/extent-tree.c | 1254++++++++++++++++++++++++++++++++++++++++++++++++++-----------------------------
Mfs/btrfs/extent_io.c | 414+++++++++++++++++++++++++++++++++++--------------------------------------------
Mfs/btrfs/extent_io.h | 66++++++++++++++++++++++--------------------------------------------
Mfs/btrfs/extent_map.c | 3++-
Mfs/btrfs/extent_map.h | 21++++++++++++++-------
Mfs/btrfs/file-item.c | 13+++----------
Mfs/btrfs/file.c | 29+++++------------------------
Mfs/btrfs/free-space-tree.c | 15++++++++++-----
Mfs/btrfs/inode.c | 665+++++++++++++++++++++++++++++++++++++++++++++++++++++++------------------------
Mfs/btrfs/ioctl.c | 643+++++++++++++++++++------------------------------------------------------------
Mfs/btrfs/lzo.c | 2+-
Mfs/btrfs/ordered-data.c | 30------------------------------
Mfs/btrfs/ordered-data.h | 47+++++++++++++++++++++++++----------------------
Mfs/btrfs/qgroup.c | 35++++++++++++++++++-----------------
Mfs/btrfs/qgroup.h | 6+++---
Mfs/btrfs/raid56.c | 2+-
Mfs/btrfs/reada.c | 16+++++++++-------
Mfs/btrfs/ref-verify.c | 6+++---
Mfs/btrfs/relocation.c | 50++++++++++++++++++--------------------------------
Mfs/btrfs/scrub.c | 85+++++++++++++++++++++++++++++++++++++++++++++++++++++++------------------------
Mfs/btrfs/send.c | 6+++---
Mfs/btrfs/super.c | 10+++++-----
Mfs/btrfs/sysfs.c | 14++++++++++++++
Mfs/btrfs/sysfs.h | 2+-
Mfs/btrfs/tests/btrfs-tests.c | 4+++-
Mfs/btrfs/tests/extent-io-tests.c | 29+++++++++++++++--------------
Mfs/btrfs/tests/inode-tests.c | 6------
Mfs/btrfs/transaction.c | 93++++++++++++++++++++++++++++++++-----------------------------------------------
Mfs/btrfs/transaction.h | 16+++++++---------
Mfs/btrfs/tree-checker.c | 6+++---
Mfs/btrfs/tree-log.c | 44++++++++++++++++++++------------------------
Mfs/btrfs/tree-log.h | 2--
Mfs/btrfs/volumes.c | 779+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++------------------
Mfs/btrfs/volumes.h | 25+++++++++++++++++++------
Mfs/btrfs/xattr.c | 8++++++++
Minclude/trace/events/btrfs.h | 4+++-
Minclude/uapi/linux/btrfs.h | 1+
Minclude/uapi/linux/btrfs_tree.h | 1+
50 files changed, 3030 insertions(+), 2198 deletions(-)

diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c @@ -591,7 +591,7 @@ unode_aux_to_inode_list(struct ulist_node *node) } /* - * We maintain three seperate rbtrees: one for direct refs, one for + * We maintain three separate rbtrees: one for direct refs, one for * indirect refs which have a key, and one for indirect refs which do not * have a key. Each tree does merge on insertion. * @@ -695,7 +695,7 @@ static int resolve_indirect_refs(struct btrfs_fs_info *fs_info, } /* - * Now it's a direct ref, put it in the the direct tree. We must + * Now it's a direct ref, put it in the direct tree. We must * do this last because the ref could be merged/freed here. */ prelim_ref_insert(fs_info, &preftrees->direct, ref, NULL); @@ -2020,9 +2020,6 @@ static int iterate_inode_refs(u64 inum, struct btrfs_root *fs_root, ret = -ENOMEM; break; } - extent_buffer_get(eb); - btrfs_tree_read_lock(eb); - btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK); btrfs_release_path(path); item = btrfs_item_nr(slot); @@ -2042,7 +2039,6 @@ static int iterate_inode_refs(u64 inum, struct btrfs_root *fs_root, len = sizeof(*iref) + name_len; iref = (struct btrfs_inode_ref *)((char *)iref + len); } - btrfs_tree_read_unlock_blocking(eb); free_extent_buffer(eb); } @@ -2083,10 +2079,6 @@ static int iterate_inode_extrefs(u64 inum, struct btrfs_root *fs_root, ret = -ENOMEM; break; } - extent_buffer_get(eb); - - btrfs_tree_read_lock(eb); - btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK); btrfs_release_path(path); item_size = btrfs_item_size_nr(eb, slot); @@ -2107,7 +2099,6 @@ static int iterate_inode_extrefs(u64 inum, struct btrfs_root *fs_root, cur_offset += btrfs_inode_extref_name_len(eb, extref); cur_offset += sizeof(*extref); } - btrfs_tree_read_unlock_blocking(eb); free_extent_buffer(eb); offset++; diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h @@ -20,7 +20,7 @@ * new data the application may have written before commit. */ enum { - BTRFS_INODE_ORDERED_DATA_CLOSE = 0, + BTRFS_INODE_ORDERED_DATA_CLOSE, BTRFS_INODE_DUMMY, BTRFS_INODE_IN_DEFRAG, BTRFS_INODE_HAS_ASYNC_EXTENT, @@ -29,6 +29,7 @@ enum { BTRFS_INODE_IN_DELALLOC_LIST, BTRFS_INODE_READDIO_NEED_LOCK, BTRFS_INODE_HAS_PROPS, + BTRFS_INODE_SNAPSHOT_FLUSH, }; /* in memory btrfs inode */ @@ -147,6 +148,12 @@ struct btrfs_inode { u64 last_unlink_trans; /* + * Track the transaction id of the last transaction used to create a + * hard link for the inode. This is used by the log tree (fsync). + */ + u64 last_link_trans; + + /* * Number of bytes outstanding that are going to need csums. This is * used in ENOSPC accounting. */ @@ -253,6 +260,11 @@ static inline bool btrfs_is_free_space_inode(struct btrfs_inode *inode) return false; } +static inline bool is_data_inode(struct inode *inode) +{ + return btrfs_ino(BTRFS_I(inode)) != BTRFS_BTREE_INODE_OBJECTID; +} + static inline void btrfs_mod_outstanding_extents(struct btrfs_inode *inode, int mod) { diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c @@ -1202,24 +1202,24 @@ static void btrfsic_read_from_block_data( void *dstv, u32 offset, size_t len) { size_t cur; - size_t offset_in_page; + size_t pgoff; char *kaddr; char *dst = (char *)dstv; - size_t start_offset = block_ctx->start & ((u64)PAGE_SIZE - 1); + size_t start_offset = offset_in_page(block_ctx->start); unsigned long i = (start_offset + offset) >> PAGE_SHIFT; WARN_ON(offset + len > block_ctx->len); - offset_in_page = (start_offset + offset) & (PAGE_SIZE - 1); + pgoff = offset_in_page(start_offset + offset); while (len > 0) { - cur = min(len, ((size_t)PAGE_SIZE - offset_in_page)); + cur = min(len, ((size_t)PAGE_SIZE - pgoff)); BUG_ON(i >= DIV_ROUND_UP(block_ctx->len, PAGE_SIZE)); kaddr = block_ctx->datav[i]; - memcpy(dst, kaddr + offset_in_page, cur); + memcpy(dst, kaddr + pgoff, cur); dst += cur; len -= cur; - offset_in_page = 0; + pgoff = 0; i++; } } @@ -1601,7 +1601,7 @@ static int btrfsic_read_block(struct btrfsic_state *state, BUG_ON(block_ctx->datav); BUG_ON(block_ctx->pagev); BUG_ON(block_ctx->mem_to_free); - if (block_ctx->dev_bytenr & ((u64)PAGE_SIZE - 1)) { + if (!PAGE_ALIGNED(block_ctx->dev_bytenr)) { pr_info("btrfsic: read_block() with unaligned bytenr %llu\n", block_ctx->dev_bytenr); return -1; @@ -1720,7 +1720,7 @@ static int btrfsic_test_for_metadata(struct btrfsic_state *state, num_pages = state->metablock_size >> PAGE_SHIFT; h = (struct btrfs_header *)datav[0]; - if (memcmp(h->fsid, fs_info->fsid, BTRFS_FSID_SIZE)) + if (memcmp(h->fsid, fs_info->fs_devices->fsid, BTRFS_FSID_SIZE)) return 1; for (i = 0; i < num_pages; i++) { @@ -1778,7 +1778,7 @@ again: return; } is_metadata = 1; - BUG_ON(BTRFS_SUPER_INFO_SIZE & (PAGE_SIZE - 1)); + BUG_ON(!PAGE_ALIGNED(BTRFS_SUPER_INFO_SIZE)); processed_len = BTRFS_SUPER_INFO_SIZE; if (state->print_mask & BTRFSIC_PRINT_MASK_TREE_BEFORE_SB_WRITE) { @@ -2327,7 +2327,7 @@ static int btrfsic_check_all_ref_blocks(struct btrfsic_state *state, * write operations. Therefore it keeps the linkage * information for a block until a block is * rewritten. This can temporarily cause incorrect - * and even circular linkage informations. This + * and even circular linkage information. This * causes no harm unless such blocks are referenced * by the most recent super block. */ @@ -2892,12 +2892,12 @@ int btrfsic_mount(struct btrfs_fs_info *fs_info, struct list_head *dev_head = &fs_devices->devices; struct btrfs_device *device; - if (fs_info->nodesize & ((u64)PAGE_SIZE - 1)) { + if (!PAGE_ALIGNED(fs_info->nodesize)) { pr_info("btrfsic: cannot handle nodesize %d not being a multiple of PAGE_SIZE %ld!\n", fs_info->nodesize, PAGE_SIZE); return -1; } - if (fs_info->sectorsize & ((u64)PAGE_SIZE - 1)) { + if (!PAGE_ALIGNED(fs_info->sectorsize)) { pr_info("btrfsic: cannot handle sectorsize %d not being a multiple of PAGE_SIZE %ld!\n", fs_info->sectorsize, PAGE_SIZE); return -1; diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c @@ -229,7 +229,6 @@ static noinline void end_compressed_writeback(struct inode *inode, */ static void end_compressed_bio_write(struct bio *bio) { - struct extent_io_tree *tree; struct compressed_bio *cb = bio->bi_private; struct inode *inode; struct page *page; @@ -248,14 +247,10 @@ static void end_compressed_bio_write(struct bio *bio) * call back into the FS and do all the end_io operations */ inode = cb->inode; - tree = &BTRFS_I(inode)->io_tree; cb->compressed_pages[0]->mapping = cb->inode->i_mapping; - tree->ops->writepage_end_io_hook(cb->compressed_pages[0], - cb->start, - cb->start + cb->len - 1, - NULL, - bio->bi_status ? - BLK_STS_OK : BLK_STS_NOTSUPP); + btrfs_writepage_endio_finish_ordered(cb->compressed_pages[0], + cb->start, cb->start + cb->len - 1, + bio->bi_status ? BLK_STS_OK : BLK_STS_NOTSUPP); cb->compressed_pages[0]->mapping = NULL; end_compressed_writeback(inode, cb); @@ -306,7 +301,7 @@ blk_status_t btrfs_submit_compressed_write(struct inode *inode, u64 start, blk_status_t ret; int skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; - WARN_ON(start & ((u64)PAGE_SIZE - 1)); + WARN_ON(!PAGE_ALIGNED(start)); cb = kmalloc(compressed_bio_size(fs_info, compressed_len), GFP_NOFS); if (!cb) return BLK_STS_RESOURCE; @@ -337,7 +332,8 @@ blk_status_t btrfs_submit_compressed_write(struct inode *inode, u64 start, page = compressed_pages[pg_index]; page->mapping = inode->i_mapping; if (bio->bi_iter.bi_size) - submit = btrfs_merge_bio_hook(page, 0, PAGE_SIZE, bio, 0); + submit = btrfs_bio_fits_in_stripe(page, PAGE_SIZE, bio, + 0); page->mapping = NULL; if (submit || bio_add_page(bio, page, PAGE_SIZE, 0) < @@ -481,7 +477,7 @@ static noinline int add_ra_bio_pages(struct inode *inode, if (page->index == end_index) { char *userpage; - size_t zero_offset = isize & (PAGE_SIZE - 1); + size_t zero_offset = offset_in_page(isize); if (zero_offset) { int zeros; @@ -615,8 +611,8 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, page->index = em_start >> PAGE_SHIFT; if (comp_bio->bi_iter.bi_size) - submit = btrfs_merge_bio_hook(page, 0, PAGE_SIZE, - comp_bio, 0); + submit = btrfs_bio_fits_in_stripe(page, PAGE_SIZE, + comp_bio, 0); page->mapping = NULL; if (submit || bio_add_page(comp_bio, page, PAGE_SIZE, 0) < @@ -1207,7 +1203,7 @@ int btrfs_decompress_buf2page(const char *buf, unsigned long buf_start, /* * Shannon Entropy calculation * - * Pure byte distribution analysis fails to determine compressiability of data. + * Pure byte distribution analysis fails to determine compressibility of data. * Try calculating entropy to estimate the average minimum number of bits * needed to encode the sampled data. * @@ -1271,7 +1267,7 @@ static u8 get4bits(u64 num, int shift) { /* * Use 4 bits as radix base - * Use 16 u32 counters for calculating new possition in buf array + * Use 16 u32 counters for calculating new position in buf array * * @array - array that will be sorted * @array_buf - buffer array to store sorting results diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c @@ -12,6 +12,7 @@ #include "transaction.h" #include "print-tree.h" #include "locking.h" +#include "volumes.h" static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, int level); @@ -224,7 +225,7 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans, else btrfs_set_header_owner(cow, new_root_objectid); - write_extent_buffer_fsid(cow, fs_info->fsid); + write_extent_buffer_fsid(cow, fs_info->fs_devices->metadata_uuid); WARN_ON(btrfs_header_generation(buf) > trans->transid); if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID) @@ -1050,7 +1051,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, else btrfs_set_header_owner(cow, root->root_key.objectid); - write_extent_buffer_fsid(cow, fs_info->fsid); + write_extent_buffer_fsid(cow, fs_info->fs_devices->metadata_uuid); ret = update_ref_for_cow(trans, root, buf, cow, &last_ref); if (ret) { @@ -1290,7 +1291,6 @@ tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct btrfs_path *path, btrfs_tree_read_unlock_blocking(eb); free_extent_buffer(eb); - extent_buffer_get(eb_rewin); btrfs_tree_read_lock(eb_rewin); __tree_mod_log_rewind(fs_info, eb_rewin, time_seq, tm); WARN_ON(btrfs_header_nritems(eb_rewin) > @@ -1362,7 +1362,6 @@ get_old_root(struct btrfs_root *root, u64 time_seq) if (!eb) return NULL; - extent_buffer_get(eb); btrfs_tree_read_lock(eb); if (old_root) { btrfs_set_header_bytenr(eb, eb->start); @@ -1415,7 +1414,7 @@ static inline int should_cow_block(struct btrfs_trans_handle *trans, * * What is forced COW: * when we create snapshot during committing the transaction, - * after we've finished coping src root, we must COW the shared + * after we've finished copying src root, we must COW the shared * block to ensure the metadata consistency. */ if (btrfs_header_generation(buf) == trans->transid && @@ -1441,6 +1440,10 @@ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans, u64 search_start; int ret; + if (test_bit(BTRFS_ROOT_DELETING, &root->state)) + btrfs_err(fs_info, + "COW'ing blocks on a fs root that's being dropped"); + if (trans->transaction != fs_info->running_transaction) WARN(1, KERN_CRIT "trans %llu running %llu\n", trans->transid, @@ -2584,14 +2587,27 @@ static struct extent_buffer *btrfs_search_slot_get_root(struct btrfs_root *root, root_lock = BTRFS_READ_LOCK; if (p->search_commit_root) { - /* The commit roots are read only so we always do read locks */ - if (p->need_commit_sem) + /* + * The commit roots are read only so we always do read locks, + * and we always must hold the commit_root_sem when doing + * searches on them, the only exception is send where we don't + * want to block transaction commits for a long time, so + * we need to clone the commit root in order to avoid races + * with transaction commits that create a snapshot of one of + * the roots used by a send operation. + */ + if (p->need_commit_sem) { down_read(&fs_info->commit_root_sem); - b = root->commit_root; - extent_buffer_get(b); - level = btrfs_header_level(b); - if (p->need_commit_sem) + b = btrfs_clone_extent_buffer(root->commit_root); up_read(&fs_info->commit_root_sem); + if (!b) + return ERR_PTR(-ENOMEM); + + } else { + b = root->commit_root; + extent_buffer_get(b); + } + level = btrfs_header_level(b); /* * Ensure that all callers have set skip_locking when * p->search_commit_root = 1. @@ -2717,6 +2733,10 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root, again: prev_cmp = -1; b = btrfs_search_slot_get_root(root, p, write_lock_level); + if (IS_ERR(b)) { + ret = PTR_ERR(b); + goto done; + } while (b) { level = btrfs_header_level(b); @@ -3751,7 +3771,7 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root /* Key greater than all keys in the leaf, right neighbor has * enough room for it and we're not emptying our leaf to delete * it, therefore use right neighbor to insert the new item and - * no need to touch/dirty our left leaft. */ + * no need to touch/dirty our left leaf. */ btrfs_tree_unlock(left); free_extent_buffer(left); path->nodes[0] = right; @@ -5390,7 +5410,6 @@ int btrfs_compare_trees(struct btrfs_root *left_root, ret = -ENOMEM; goto out; } - extent_buffer_get(left_path->nodes[left_level]); right_level = btrfs_header_level(right_root->commit_root); right_root_level = right_level; @@ -5401,7 +5420,6 @@ int btrfs_compare_trees(struct btrfs_root *left_root, ret = -ENOMEM; goto out; } - extent_buffer_get(right_path->nodes[right_level]); up_read(&fs_info->commit_root_sem); if (left_level == 0) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h @@ -109,13 +109,26 @@ static inline unsigned long btrfs_chunk_item_size(int num_stripes) } /* - * File system states + * Runtime (in-memory) states of filesystem */ -#define BTRFS_FS_STATE_ERROR 0 -#define BTRFS_FS_STATE_REMOUNTING 1 -#define BTRFS_FS_STATE_TRANS_ABORTED 2 -#define BTRFS_FS_STATE_DEV_REPLACING 3 -#define BTRFS_FS_STATE_DUMMY_FS_INFO 4 +enum { + /* Global indicator of serious filesystem errors */ + BTRFS_FS_STATE_ERROR, + /* + * Filesystem is being remounted, allow to skip some operations, like + * defrag + */ + BTRFS_FS_STATE_REMOUNTING, + /* Track if a transaction abort has been reported on this filesystem */ + BTRFS_FS_STATE_TRANS_ABORTED, + /* + * Bio operations should be blocked on this filesystem because a source + * or target device is being destroyed as part of a device replace + */ + BTRFS_FS_STATE_DEV_REPLACING, + /* The btrfs_fs_info created for self-tests */ + BTRFS_FS_STATE_DUMMY_FS_INFO, +}; #define BTRFS_BACKREF_REV_MAX 256 #define BTRFS_BACKREF_REV_SHIFT 56 @@ -195,9 +208,10 @@ struct btrfs_root_backup { * it currently lacks any block count etc etc */ struct btrfs_super_block { - u8 csum[BTRFS_CSUM_SIZE]; /* the first 4 fields must match struct btrfs_header */ - u8 fsid[BTRFS_FSID_SIZE]; /* FS specific uuid */ + u8 csum[BTRFS_CSUM_SIZE]; + /* FS specific UUID, visible to user */ + u8 fsid[BTRFS_FSID_SIZE]; __le64 bytenr; /* this block number */ __le64 flags; @@ -234,8 +248,11 @@ struct btrfs_super_block { __le64 cache_generation; __le64 uuid_tree_generation; + /* the UUID written into btree blocks */ + u8 metadata_uuid[BTRFS_FSID_SIZE]; + /* future expansion */ - __le64 reserved[30]; + __le64 reserved[28]; u8 sys_chunk_array[BTRFS_SYSTEM_CHUNK_ARRAY_SIZE]; struct btrfs_root_backup super_roots[BTRFS_NUM_BACKUP_ROOTS]; } __attribute__ ((__packed__)); @@ -265,7 +282,8 @@ struct btrfs_super_block { BTRFS_FEATURE_INCOMPAT_RAID56 | \ BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF | \ BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA | \ - BTRFS_FEATURE_INCOMPAT_NO_HOLES) + BTRFS_FEATURE_INCOMPAT_NO_HOLES | \ + BTRFS_FEATURE_INCOMPAT_METADATA_UUID) #define BTRFS_FEATURE_INCOMPAT_SAFE_SET \ (BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF) @@ -316,7 +334,7 @@ struct btrfs_node { * The slots array records the index of the item or block pointer * used while walking the tree. */ -enum { READA_NONE = 0, READA_BACK, READA_FORWARD }; +enum { READA_NONE, READA_BACK, READA_FORWARD }; struct btrfs_path { struct extent_buffer *nodes[BTRFS_MAX_LEVEL]; int slots[BTRFS_MAX_LEVEL]; @@ -360,9 +378,7 @@ struct btrfs_dev_replace { struct btrfs_device *tgtdev; struct mutex lock_finishing_cancel_unmount; - rwlock_t lock; - atomic_t blocking_readers; - wait_queue_head_t read_lock_wq; + struct rw_semaphore rwsem; struct btrfs_scrub_progress scrub_progress; @@ -443,13 +459,19 @@ struct btrfs_space_info { struct kobject *block_group_kobjs[BTRFS_NR_RAID_TYPES]; }; -#define BTRFS_BLOCK_RSV_GLOBAL 1 -#define BTRFS_BLOCK_RSV_DELALLOC 2 -#define BTRFS_BLOCK_RSV_TRANS 3 -#define BTRFS_BLOCK_RSV_CHUNK 4 -#define BTRFS_BLOCK_RSV_DELOPS 5 -#define BTRFS_BLOCK_RSV_EMPTY 6 -#define BTRFS_BLOCK_RSV_TEMP 7 +/* + * Types of block reserves + */ +enum { + BTRFS_BLOCK_RSV_GLOBAL, + BTRFS_BLOCK_RSV_DELALLOC, + BTRFS_BLOCK_RSV_TRANS, + BTRFS_BLOCK_RSV_CHUNK, + BTRFS_BLOCK_RSV_DELOPS, + BTRFS_BLOCK_RSV_DELREFS, + BTRFS_BLOCK_RSV_EMPTY, + BTRFS_BLOCK_RSV_TEMP, +}; struct btrfs_block_rsv { u64 size; @@ -509,18 +531,18 @@ struct btrfs_free_cluster { }; enum btrfs_caching_type { - BTRFS_CACHE_NO = 0, - BTRFS_CACHE_STARTED = 1, - BTRFS_CACHE_FAST = 2, - BTRFS_CACHE_FINISHED = 3, - BTRFS_CACHE_ERROR = 4, + BTRFS_CACHE_NO, + BTRFS_CACHE_STARTED, + BTRFS_CACHE_FAST, + BTRFS_CACHE_FINISHED, + BTRFS_CACHE_ERROR, }; enum btrfs_disk_cache_state { - BTRFS_DC_WRITTEN = 0, - BTRFS_DC_ERROR = 1, - BTRFS_DC_CLEAR = 2, - BTRFS_DC_SETUP = 3, + BTRFS_DC_WRITTEN, + BTRFS_DC_ERROR, + BTRFS_DC_CLEAR, + BTRFS_DC_SETUP, }; struct btrfs_caching_control { @@ -712,41 +734,61 @@ struct btrfs_fs_devices; struct btrfs_balance_control; struct btrfs_delayed_root; -#define BTRFS_FS_BARRIER 1 -#define BTRFS_FS_CLOSING_START 2 -#define BTRFS_FS_CLOSING_DONE 3 -#define BTRFS_FS_LOG_RECOVERING 4 -#define BTRFS_FS_OPEN 5 -#define BTRFS_FS_QUOTA_ENABLED 6 -#define BTRFS_FS_UPDATE_UUID_TREE_GEN 9 -#define BTRFS_FS_CREATING_FREE_SPACE_TREE 10 -#define BTRFS_FS_BTREE_ERR 11 -#define BTRFS_FS_LOG1_ERR 12 -#define BTRFS_FS_LOG2_ERR 13 -#define BTRFS_FS_QUOTA_OVERRIDE 14 -/* Used to record internally whether fs has been frozen */ -#define BTRFS_FS_FROZEN 15 - -/* - * Indicate that a whole-filesystem exclusive operation is running - * (device replace, resize, device add/delete, balance) - */ -#define BTRFS_FS_EXCL_OP 16 - /* - * To info transaction_kthread we need an immediate commit so it doesn't - * need to wait for commit_interval + * Block group or device which contains an active swapfile. Used for preventing + * unsafe operations while a swapfile is active. + * + * These are sorted on (ptr, inode) (note that a block group or device can + * contain more than one swapfile). We compare the pointer values because we + * don't actually care what the object is, we just need a quick check whether + * the object exists in the rbtree. */ -#define BTRFS_FS_NEED_ASYNC_COMMIT 17 +struct btrfs_swapfile_pin { + struct rb_node node; + void *ptr; + struct inode *inode; + /* + * If true, ptr points to a struct btrfs_block_group_cache. Otherwise, + * ptr points to a struct btrfs_device. + */ + bool is_block_group; +}; -/* - * Indicate that balance has been set up from the ioctl and is in the main - * phase. The fs_info::balance_ctl is initialized. - */ -#define BTRFS_FS_BALANCE_RUNNING 18 +bool btrfs_pinned_by_swapfile(struct btrfs_fs_info *fs_info, void *ptr); + +enum { + BTRFS_FS_BARRIER, + BTRFS_FS_CLOSING_START, + BTRFS_FS_CLOSING_DONE, + BTRFS_FS_LOG_RECOVERING, + BTRFS_FS_OPEN, + BTRFS_FS_QUOTA_ENABLED, + BTRFS_FS_UPDATE_UUID_TREE_GEN, + BTRFS_FS_CREATING_FREE_SPACE_TREE, + BTRFS_FS_BTREE_ERR, + BTRFS_FS_LOG1_ERR, + BTRFS_FS_LOG2_ERR, + BTRFS_FS_QUOTA_OVERRIDE, + /* Used to record internally whether fs has been frozen */ + BTRFS_FS_FROZEN, + /* + * Indicate that a whole-filesystem exclusive operation is running + * (device replace, resize, device add/delete, balance) + */ + BTRFS_FS_EXCL_OP, + /* + * To info transaction_kthread we need an immediate commit so it + * doesn't need to wait for commit_interval + */ + BTRFS_FS_NEED_ASYNC_COMMIT, + /* + * Indicate that balance has been set up from the ioctl and is in the + * main phase. The fs_info::balance_ctl is initialized. + */ + BTRFS_FS_BALANCE_RUNNING, +}; struct btrfs_fs_info { - u8 fsid[BTRFS_FSID_SIZE]; u8 chunk_tree_uuid[BTRFS_UUID_SIZE]; unsigned long flags; struct btrfs_root *extent_root; @@ -790,6 +832,8 @@ struct btrfs_fs_info { struct btrfs_block_rsv chunk_block_rsv; /* block reservation for delayed operations */ struct btrfs_block_rsv delayed_block_rsv; + /* block reservation for delayed refs */ + struct btrfs_block_rsv delayed_refs_rsv; struct btrfs_block_rsv empty_block_rsv; @@ -1114,6 +1158,10 @@ struct btrfs_fs_info { u32 sectorsize; u32 stripesize; + /* Block groups and devices containing active swapfiles. */ + spinlock_t swapfile_pins_lock; + struct rb_root swapfile_pins; + #ifdef CONFIG_BTRFS_FS_REF_VERIFY spinlock_t ref_verify_lock; struct rb_root block_tree; @@ -1133,22 +1181,24 @@ struct btrfs_subvolume_writers { /* * The state of btrfs root */ -/* - * btrfs_record_root_in_trans is a multi-step process, - * and it can race with the balancing code. But the - * race is very small, and only the first time the root - * is added to each transaction. So IN_TRANS_SETUP - * is used to tell us when more checks are required - */ -#define BTRFS_ROOT_IN_TRANS_SETUP 0 -#define BTRFS_ROOT_REF_COWS 1 -#define BTRFS_ROOT_TRACK_DIRTY 2 -#define BTRFS_ROOT_IN_RADIX 3 -#define BTRFS_ROOT_ORPHAN_ITEM_INSERTED 4 -#define BTRFS_ROOT_DEFRAG_RUNNING 5 -#define BTRFS_ROOT_FORCE_COW 6 -#define BTRFS_ROOT_MULTI_LOG_TASKS 7 -#define BTRFS_ROOT_DIRTY 8 +enum { + /* + * btrfs_record_root_in_trans is a multi-step process, and it can race + * with the balancing code. But the race is very small, and only the + * first time the root is added to each transaction. So IN_TRANS_SETUP + * is used to tell us when more checks are required + */ + BTRFS_ROOT_IN_TRANS_SETUP, + BTRFS_ROOT_REF_COWS, + BTRFS_ROOT_TRACK_DIRTY, + BTRFS_ROOT_IN_RADIX, + BTRFS_ROOT_ORPHAN_ITEM_INSERTED, + BTRFS_ROOT_DEFRAG_RUNNING, + BTRFS_ROOT_FORCE_COW, + BTRFS_ROOT_MULTI_LOG_TASKS, + BTRFS_ROOT_DIRTY, + BTRFS_ROOT_DELETING, +}; /* * in ram representation of the tree. extent_root is used for all allocations @@ -1274,6 +1324,9 @@ struct btrfs_root { u64 qgroup_meta_rsv_pertrans; u64 qgroup_meta_rsv_prealloc; + /* Number of active swapfiles */ + atomic_t nr_swapfiles; + #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS u64 alloc_bytenr; #endif @@ -2570,10 +2623,10 @@ static inline gfp_t btrfs_alloc_write_mask(struct address_space *mapping) /* extent-tree.c */ enum btrfs_inline_ref_type { - BTRFS_REF_TYPE_INVALID = 0, - BTRFS_REF_TYPE_BLOCK = 1, - BTRFS_REF_TYPE_DATA = 2, - BTRFS_REF_TYPE_ANY = 3, + BTRFS_REF_TYPE_INVALID, + BTRFS_REF_TYPE_BLOCK, + BTRFS_REF_TYPE_DATA, + BTRFS_REF_TYPE_ANY, }; int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb, @@ -2599,7 +2652,7 @@ static inline u64 btrfs_calc_trunc_metadata_size(struct btrfs_fs_info *fs_info, } int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans); -int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans); +bool btrfs_check_space_for_delayed_refs(struct btrfs_fs_info *fs_info); void btrfs_dec_block_group_reservations(struct btrfs_fs_info *fs_info, const u64 start); void btrfs_wait_block_group_reservations(struct btrfs_block_group_cache *bg); @@ -2713,10 +2766,12 @@ enum btrfs_reserve_flush_enum { enum btrfs_flush_state { FLUSH_DELAYED_ITEMS_NR = 1, FLUSH_DELAYED_ITEMS = 2, - FLUSH_DELALLOC = 3, - FLUSH_DELALLOC_WAIT = 4, - ALLOC_CHUNK = 5, - COMMIT_TRANS = 6, + FLUSH_DELAYED_REFS_NR = 3, + FLUSH_DELAYED_REFS = 4, + FLUSH_DELALLOC = 5, + FLUSH_DELALLOC_WAIT = 6, + ALLOC_CHUNK = 7, + COMMIT_TRANS = 8, }; int btrfs_alloc_data_chunk_ondemand(struct btrfs_inode *inode, u64 bytes); @@ -2767,6 +2822,13 @@ int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info, void btrfs_block_rsv_release(struct btrfs_fs_info *fs_info, struct btrfs_block_rsv *block_rsv, u64 num_bytes); +void btrfs_delayed_refs_rsv_release(struct btrfs_fs_info *fs_info, int nr); +void btrfs_update_delayed_refs_rsv(struct btrfs_trans_handle *trans); +int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info, + enum btrfs_reserve_flush_enum flush); +void btrfs_migrate_to_delayed_refs_rsv(struct btrfs_fs_info *fs_info, + struct btrfs_block_rsv *src, + u64 num_bytes); int btrfs_inc_block_group_ro(struct btrfs_block_group_cache *cache); void btrfs_dec_block_group_ro(struct btrfs_block_group_cache *cache); void btrfs_put_block_group_cache(struct btrfs_fs_info *info); @@ -3141,7 +3203,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, struct inode *inode, u64 new_size, u32 min_type); -int btrfs_start_delalloc_inodes(struct btrfs_root *root); +int btrfs_start_delalloc_snapshot(struct btrfs_root *root); int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int nr); int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end, unsigned int extra_bits, @@ -3150,9 +3212,16 @@ int btrfs_create_subvol_root(struct btrfs_trans_handle *trans, struct btrfs_root *new_root, struct btrfs_root *parent_root, u64 new_dirid); -int btrfs_merge_bio_hook(struct page *page, unsigned long offset, - size_t size, struct bio *bio, - unsigned long bio_flags); + void btrfs_set_delalloc_extent(struct inode *inode, struct extent_state *state, + unsigned *bits); +void btrfs_clear_delalloc_extent(struct inode *inode, + struct extent_state *state, unsigned *bits); +void btrfs_merge_delalloc_extent(struct inode *inode, struct extent_state *new, + struct extent_state *other); +void btrfs_split_delalloc_extent(struct inode *inode, + struct extent_state *orig, u64 split); +int btrfs_bio_fits_in_stripe(struct page *page, size_t size, struct bio *bio, + unsigned long bio_flags); void btrfs_set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end); vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf); int btrfs_readpage(struct file *file, struct page *page); @@ -3189,6 +3258,12 @@ int btrfs_prealloc_file_range_trans(struct inode *inode, struct btrfs_trans_handle *trans, int mode, u64 start, u64 num_bytes, u64 min_size, loff_t actual_len, u64 *alloc_hint); +int btrfs_run_delalloc_range(void *private_data, struct page *locked_page, + u64 start, u64 end, int *page_started, unsigned long *nr_written, + struct writeback_control *wbc); +int btrfs_writepage_cow_fixup(struct page *page, u64 start, u64 end); +void btrfs_writepage_endio_finish_ordered(struct page *page, u64 start, + u64 end, int uptodate); extern const struct dentry_operations btrfs_dentry_operations; /* ioctl.c */ @@ -3428,6 +3503,16 @@ static inline void assfail(const char *expr, const char *file, int line) #define ASSERT(expr) ((void)0) #endif +/* + * Use that for functions that are conditionally exported for sanity tests but + * otherwise static + */ +#ifndef CONFIG_BTRFS_FS_RUN_SANITY_TESTS +#define EXPORT_FOR_TESTS static +#else +#define EXPORT_FOR_TESTS +#endif + __cold static inline void btrfs_print_v0_err(struct btrfs_fs_info *fs_info) { diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c @@ -251,8 +251,6 @@ static inline void drop_delayed_ref(struct btrfs_trans_handle *trans, ref->in_tree = 0; btrfs_put_delayed_ref(ref); atomic_dec(&delayed_refs->num_entries); - if (trans->delayed_ref_updates) - trans->delayed_ref_updates--; } static bool merge_ref(struct btrfs_trans_handle *trans, @@ -400,6 +398,20 @@ again: return head; } +void btrfs_delete_ref_head(struct btrfs_delayed_ref_root *delayed_refs, + struct btrfs_delayed_ref_head *head) +{ + lockdep_assert_held(&delayed_refs->lock); + lockdep_assert_held(&head->lock); + + rb_erase_cached(&head->href_node, &delayed_refs->href_root); + RB_CLEAR_NODE(&head->href_node); + atomic_dec(&delayed_refs->num_entries); + delayed_refs->num_heads--; + if (head->processing == 0) + delayed_refs->num_heads_ready--; +} + /* * Helper to insert the ref_node to the tail or merge with tail. * @@ -453,7 +465,6 @@ inserted: if (ref->action == BTRFS_ADD_DELAYED_REF) list_add_tail(&ref->add_list, &href->ref_add_list); atomic_inc(&root->num_entries); - trans->delayed_ref_updates++; spin_unlock(&href->lock); return ret; } @@ -462,12 +473,14 @@ inserted: * helper function to update the accounting in the head ref * existing and update must have the same bytenr */ -static noinline void -update_existing_head_ref(struct btrfs_delayed_ref_root *delayed_refs, +static noinline void update_existing_head_ref(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_head *existing, struct btrfs_delayed_ref_head *update, int *old_ref_mod_ret) { + struct btrfs_delayed_ref_root *delayed_refs = + &trans->transaction->delayed_refs; + struct btrfs_fs_info *fs_info = trans->fs_info; int old_ref_mod; BUG_ON(existing->is_data != update->is_data); @@ -525,10 +538,18 @@ update_existing_head_ref(struct btrfs_delayed_ref_root *delayed_refs, * versa we need to make sure to adjust pending_csums accordingly. */ if (existing->is_data) { - if (existing->total_ref_mod >= 0 && old_ref_mod < 0) + u64 csum_leaves = + btrfs_csum_bytes_to_leaves(fs_info, + existing->num_bytes); + + if (existing->total_ref_mod >= 0 && old_ref_mod < 0) { delayed_refs->pending_csums -= existing->num_bytes; - if (existing->total_ref_mod < 0 && old_ref_mod >= 0) + btrfs_delayed_refs_rsv_release(fs_info, csum_leaves); + } + if (existing->total_ref_mod < 0 && old_ref_mod >= 0) { delayed_refs->pending_csums += existing->num_bytes; + trans->delayed_ref_updates += csum_leaves; + } } spin_unlock(&existing->lock); } @@ -634,7 +655,7 @@ add_delayed_ref_head(struct btrfs_trans_handle *trans, && head_ref->qgroup_reserved && existing->qgroup_ref_root && existing->qgroup_reserved); - update_existing_head_ref(delayed_refs, existing, head_ref, + update_existing_head_ref(trans, existing, head_ref, old_ref_mod); /* * we've updated the existing ref, free the newly @@ -645,8 +666,12 @@ add_delayed_ref_head(struct btrfs_trans_handle *trans, } else { if (old_ref_mod) *old_ref_mod = 0; - if (head_ref->is_data && head_ref->ref_mod < 0) + if (head_ref->is_data && head_ref->ref_mod < 0) { delayed_refs->pending_csums += head_ref->num_bytes; + trans->delayed_ref_updates += + btrfs_csum_bytes_to_leaves(trans->fs_info, + head_ref->num_bytes); + } delayed_refs->num_heads++; delayed_refs->num_heads_ready++; atomic_inc(&delayed_refs->num_entries); @@ -782,6 +807,12 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans, ret = insert_delayed_ref(trans, delayed_refs, head_ref, &ref->node); spin_unlock(&delayed_refs->lock); + /* + * Need to update the delayed_refs_rsv with any changes we may have + * made. + */ + btrfs_update_delayed_refs_rsv(trans); + trace_add_delayed_tree_ref(fs_info, &ref->node, ref, action == BTRFS_ADD_DELAYED_EXTENT ? BTRFS_ADD_DELAYED_REF : action); @@ -863,6 +894,12 @@ int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans, ret = insert_delayed_ref(trans, delayed_refs, head_ref, &ref->node); spin_unlock(&delayed_refs->lock); + /* + * Need to update the delayed_refs_rsv with any changes we may have + * made. + */ + btrfs_update_delayed_refs_rsv(trans); + trace_add_delayed_data_ref(trans->fs_info, &ref->node, ref, action == BTRFS_ADD_DELAYED_EXTENT ? BTRFS_ADD_DELAYED_REF : action); @@ -899,6 +936,12 @@ int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info, NULL, NULL, NULL); spin_unlock(&delayed_refs->lock); + + /* + * Need to update the delayed_refs_rsv with any changes we may have + * made. + */ + btrfs_update_delayed_refs_rsv(trans); return 0; } diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h @@ -261,7 +261,8 @@ static inline void btrfs_delayed_ref_unlock(struct btrfs_delayed_ref_head *head) { mutex_unlock(&head->mutex); } - +void btrfs_delete_ref_head(struct btrfs_delayed_ref_root *delayed_refs, + struct btrfs_delayed_ref_head *head); struct btrfs_delayed_ref_head *btrfs_select_ref_head( struct btrfs_delayed_ref_root *delayed_refs); diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c @@ -59,7 +59,6 @@ no_valid_dev_replace_entry_found: BTRFS_DEV_REPLACE_ITEM_STATE_NEVER_STARTED; dev_replace->cont_reading_from_srcdev_mode = BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_ALWAYS; - dev_replace->replace_state = 0; dev_replace->time_started = 0; dev_replace->time_stopped = 0; atomic64_set(&dev_replace->num_write_errors, 0); @@ -285,13 +284,13 @@ int btrfs_run_dev_replace(struct btrfs_trans_handle *trans, struct btrfs_dev_replace_item *ptr; struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; - btrfs_dev_replace_read_lock(dev_replace); + down_read(&dev_replace->rwsem); if (!dev_replace->is_valid || !dev_replace->item_needs_writeback) { - btrfs_dev_replace_read_unlock(dev_replace); + up_read(&dev_replace->rwsem); return 0; } - btrfs_dev_replace_read_unlock(dev_replace); + up_read(&dev_replace->rwsem); key.objectid = 0; key.type = BTRFS_DEV_REPLACE_KEY; @@ -349,7 +348,7 @@ int btrfs_run_dev_replace(struct btrfs_trans_handle *trans, ptr = btrfs_item_ptr(eb, path->slots[0], struct btrfs_dev_replace_item); - btrfs_dev_replace_write_lock(dev_replace); + down_write(&dev_replace->rwsem); if (dev_replace->srcdev) btrfs_set_dev_replace_src_devid(eb, ptr, dev_replace->srcdev->devid); @@ -372,7 +371,7 @@ int btrfs_run_dev_replace(struct btrfs_trans_handle *trans, btrfs_set_dev_replace_cursor_right(eb, ptr, dev_replace->cursor_right); dev_replace->item_needs_writeback = 0; - btrfs_dev_replace_write_unlock(dev_replace); + up_write(&dev_replace->rwsem); btrfs_mark_buffer_dirty(eb); @@ -390,7 +389,7 @@ static char* btrfs_dev_name(struct btrfs_device *device) return rcu_str_deref(device->name); } -int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info, +static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info, const char *tgtdev_name, u64 srcdevid, const char *srcdev_name, int read_src) { @@ -407,6 +406,13 @@ int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info, if (IS_ERR(src_device)) return PTR_ERR(src_device); + if (btrfs_pinned_by_swapfile(fs_info, src_device)) { + btrfs_warn_in_rcu(fs_info, + "cannot replace device %s (devid %llu) due to active swapfile", + btrfs_dev_name(src_device), src_device->devid); + return -ETXTBSY; + } + ret = btrfs_init_dev_replace_tgtdev(fs_info, tgtdev_name, src_device, &tgt_device); if (ret) @@ -426,7 +432,7 @@ int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info, } need_unlock = true; - btrfs_dev_replace_write_lock(dev_replace); + down_write(&dev_replace->rwsem); switch (dev_replace->replace_state) { case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED: case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED: @@ -464,7 +470,7 @@ int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info, dev_replace->item_needs_writeback = 1; atomic64_set(&dev_replace->num_write_errors, 0); atomic64_set(&dev_replace->num_uncorrectable_read_errors, 0); - btrfs_dev_replace_write_unlock(dev_replace); + up_write(&dev_replace->rwsem); need_unlock = false; ret = btrfs_sysfs_add_device_link(tgt_device->fs_devices, tgt_device); @@ -478,7 +484,7 @@ int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info, if (IS_ERR(trans)) { ret = PTR_ERR(trans); need_unlock = true; - btrfs_dev_replace_write_lock(dev_replace); + down_write(&dev_replace->rwsem); dev_replace->replace_state = BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED; dev_replace->srcdev = NULL; @@ -497,7 +503,7 @@ int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info, ret = btrfs_dev_replace_finishing(fs_info, ret); if (ret == -EINPROGRESS) { ret = BTRFS_IOCTL_DEV_REPLACE_RESULT_SCRUB_INPROGRESS; - } else { + } else if (ret != -ECANCELED) { WARN_ON(ret); } @@ -505,7 +511,7 @@ int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info, leave: if (need_unlock) - btrfs_dev_replace_write_unlock(dev_replace); + up_write(&dev_replace->rwsem); btrfs_destroy_dev_replace_tgtdev(tgt_device); return ret; } @@ -533,8 +539,9 @@ int btrfs_dev_replace_by_ioctl(struct btrfs_fs_info *fs_info, args->start.cont_reading_from_srcdev_mode); args->result = ret; /* don't warn if EINPROGRESS, someone else might be running scrub */ - if (ret == BTRFS_IOCTL_DEV_REPLACE_RESULT_SCRUB_INPROGRESS) - ret = 0; + if (ret == BTRFS_IOCTL_DEV_REPLACE_RESULT_SCRUB_INPROGRESS || + ret == BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR) + return 0; return ret; } @@ -572,18 +579,18 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, /* don't allow cancel or unmount to disturb the finishing procedure */ mutex_lock(&dev_replace->lock_finishing_cancel_unmount); - btrfs_dev_replace_read_lock(dev_replace); + down_read(&dev_replace->rwsem); /* was the operation canceled, or is it finished? */ if (dev_replace->replace_state != BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED) { - btrfs_dev_replace_read_unlock(dev_replace); + up_read(&dev_replace->rwsem); mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); return 0; } tgt_device = dev_replace->tgtdev; src_device = dev_replace->srcdev; - btrfs_dev_replace_read_unlock(dev_replace); + up_read(&dev_replace->rwsem); /* * flush all outstanding I/O and inode extent mappings before the @@ -607,7 +614,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, /* keep away write_all_supers() during the finishing procedure */ mutex_lock(&fs_info->fs_devices->device_list_mutex); mutex_lock(&fs_info->chunk_mutex); - btrfs_dev_replace_write_lock(dev_replace); + down_write(&dev_replace->rwsem); dev_replace->replace_state = scrub_ret ? BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED : BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED; @@ -622,12 +629,13 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, src_device, tgt_device); } else { - btrfs_err_in_rcu(fs_info, + if (scrub_ret != -ECANCELED) + btrfs_err_in_rcu(fs_info, "btrfs_scrub_dev(%s, %llu, %s) failed %d", btrfs_dev_name(src_device), src_device->devid, rcu_str_deref(tgt_device->name), scrub_ret); - btrfs_dev_replace_write_unlock(dev_replace); + up_write(&dev_replace->rwsem); mutex_unlock(&fs_info->chunk_mutex); mutex_unlock(&fs_info->fs_devices->device_list_mutex); btrfs_rm_dev_replace_blocked(fs_info); @@ -663,8 +671,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, list_add(&tgt_device->dev_alloc_list, &fs_info->fs_devices->alloc_list); fs_info->fs_devices->rw_devices++; - btrfs_dev_replace_write_unlock(dev_replace); - + up_write(&dev_replace->rwsem); btrfs_rm_dev_replace_blocked(fs_info); btrfs_rm_dev_replace_remove_srcdev(src_device); @@ -761,7 +768,7 @@ void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info, { struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; - btrfs_dev_replace_read_lock(dev_replace); + down_read(&dev_replace->rwsem); /* even if !dev_replace_is_valid, the values are good enough for * the replace_status ioctl */ args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR; @@ -773,7 +780,7 @@ void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info, args->status.num_uncorrectable_read_errors = atomic64_read(&dev_replace->num_uncorrectable_read_errors); args->status.progress_1000 = btrfs_dev_replace_progress(fs_info); - btrfs_dev_replace_read_unlock(dev_replace); + up_read(&dev_replace->rwsem); } int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info) @@ -790,46 +797,74 @@ int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info) return -EROFS; mutex_lock(&dev_replace->lock_finishing_cancel_unmount); - btrfs_dev_replace_write_lock(dev_replace); + down_write(&dev_replace->rwsem); switch (dev_replace->replace_state) { case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED: case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED: case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED: result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NOT_STARTED; - btrfs_dev_replace_write_unlock(dev_replace); - goto leave; + up_write(&dev_replace->rwsem); + break; case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED: + tgt_device = dev_replace->tgtdev; + src_device = dev_replace->srcdev; + up_write(&dev_replace->rwsem); + ret = btrfs_scrub_cancel(fs_info); + if (ret < 0) { + result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NOT_STARTED; + } else { + result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR; + /* + * btrfs_dev_replace_finishing() will handle the + * cleanup part + */ + btrfs_info_in_rcu(fs_info, + "dev_replace from %s (devid %llu) to %s canceled", + btrfs_dev_name(src_device), src_device->devid, + btrfs_dev_name(tgt_device)); + } + break; case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED: + /* + * Scrub doing the replace isn't running so we need to do the + * cleanup step of btrfs_dev_replace_finishing() here + */ result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR; tgt_device = dev_replace->tgtdev; src_device = dev_replace->srcdev; dev_replace->tgtdev = NULL; dev_replace->srcdev = NULL; - break; - } - dev_replace->replace_state = BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED; - dev_replace->time_stopped = ktime_get_real_seconds(); - dev_replace->item_needs_writeback = 1; - btrfs_dev_replace_write_unlock(dev_replace); - btrfs_scrub_cancel(fs_info); + dev_replace->replace_state = + BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED; + dev_replace->time_stopped = ktime_get_real_seconds(); + dev_replace->item_needs_writeback = 1; - trans = btrfs_start_transaction(root, 0); - if (IS_ERR(trans)) { - mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); - return PTR_ERR(trans); - } - ret = btrfs_commit_transaction(trans); - WARN_ON(ret); + up_write(&dev_replace->rwsem); - btrfs_info_in_rcu(fs_info, - "dev_replace from %s (devid %llu) to %s canceled", - btrfs_dev_name(src_device), src_device->devid, - btrfs_dev_name(tgt_device)); + /* Scrub for replace must not be running in suspended state */ + ret = btrfs_scrub_cancel(fs_info); + ASSERT(ret != -ENOTCONN); + + trans = btrfs_start_transaction(root, 0); + if (IS_ERR(trans)) { + mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); + return PTR_ERR(trans); + } + ret = btrfs_commit_transaction(trans); + WARN_ON(ret); - if (tgt_device) - btrfs_destroy_dev_replace_tgtdev(tgt_device); + btrfs_info_in_rcu(fs_info, + "suspended dev_replace from %s (devid %llu) to %s canceled", + btrfs_dev_name(src_device), src_device->devid, + btrfs_dev_name(tgt_device)); + + if (tgt_device) + btrfs_destroy_dev_replace_tgtdev(tgt_device); + break; + default: + result = -EINVAL; + } -leave: mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); return result; } @@ -839,7 +874,8 @@ void btrfs_dev_replace_suspend_for_unmount(struct btrfs_fs_info *fs_info) struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; mutex_lock(&dev_replace->lock_finishing_cancel_unmount); - btrfs_dev_replace_write_lock(dev_replace); + down_write(&dev_replace->rwsem); + switch (dev_replace->replace_state) { case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED: case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED: @@ -855,7 +891,7 @@ void btrfs_dev_replace_suspend_for_unmount(struct btrfs_fs_info *fs_info) break; } - btrfs_dev_replace_write_unlock(dev_replace); + up_write(&dev_replace->rwsem); mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); } @@ -865,12 +901,13 @@ int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info) struct task_struct *task; struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; - btrfs_dev_replace_write_lock(dev_replace); + down_write(&dev_replace->rwsem); + switch (dev_replace->replace_state) { case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED: case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED: case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED: - btrfs_dev_replace_write_unlock(dev_replace); + up_write(&dev_replace->rwsem); return 0; case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED: break; @@ -884,10 +921,12 @@ int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info) "cannot continue dev_replace, tgtdev is missing"); btrfs_info(fs_info, "you may cancel the operation after 'mount -o degraded'"); - btrfs_dev_replace_write_unlock(dev_replace); + dev_replace->replace_state = + BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED; + up_write(&dev_replace->rwsem); return 0; } - btrfs_dev_replace_write_unlock(dev_replace); + up_write(&dev_replace->rwsem); /* * This could collide with a paused balance, but the exclusive op logic @@ -895,6 +934,10 @@ int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info) * dev-replace to start anyway. */ if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) { + down_write(&dev_replace->rwsem); + dev_replace->replace_state = + BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED; + up_write(&dev_replace->rwsem); btrfs_info(fs_info, "cannot resume dev-replace, other exclusive operation running"); return 0; @@ -925,7 +968,7 @@ static int btrfs_dev_replace_kthread(void *data) btrfs_device_get_total_bytes(dev_replace->srcdev), &dev_replace->scrub_progress, 0, 1); ret = btrfs_dev_replace_finishing(fs_info, ret); - WARN_ON(ret); + WARN_ON(ret && ret != -ECANCELED); clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags); return 0; @@ -948,7 +991,7 @@ int btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace) * something that can happen if the dev_replace * procedure is suspended by an umount and then * the tgtdev is missing (or "btrfs dev scan") was - * not called and the the filesystem is remounted + * not called and the filesystem is remounted * in degraded state. This does not stop the * dev_replace procedure. It needs to be canceled * manually if the cancellation is wanted. @@ -958,42 +1001,6 @@ int btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace) return 1; } -void btrfs_dev_replace_read_lock(struct btrfs_dev_replace *dev_replace) -{ - read_lock(&dev_replace->lock); -} - -void btrfs_dev_replace_read_unlock(struct btrfs_dev_replace *dev_replace) -{ - read_unlock(&dev_replace->lock); -} - -void btrfs_dev_replace_write_lock(struct btrfs_dev_replace *dev_replace) -{ -again: - wait_event(dev_replace->read_lock_wq, - atomic_read(&dev_replace->blocking_readers) == 0); - write_lock(&dev_replace->lock); - if (atomic_read(&dev_replace->blocking_readers)) { - write_unlock(&dev_replace->lock); - goto again; - } -} - -void btrfs_dev_replace_write_unlock(struct btrfs_dev_replace *dev_replace) -{ - write_unlock(&dev_replace->lock); -} - -/* inc blocking cnt and release read lock */ -void btrfs_dev_replace_set_lock_blocking( - struct btrfs_dev_replace *dev_replace) -{ - /* only set blocking for read lock */ - atomic_inc(&dev_replace->blocking_readers); - read_unlock(&dev_replace->lock); -} - void btrfs_bio_counter_inc_noblocked(struct btrfs_fs_info *fs_info) { percpu_counter_inc(&fs_info->dev_replace.bio_counter); diff --git a/fs/btrfs/dev-replace.h b/fs/btrfs/dev-replace.h @@ -13,19 +13,11 @@ int btrfs_run_dev_replace(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info); int btrfs_dev_replace_by_ioctl(struct btrfs_fs_info *fs_info, struct btrfs_ioctl_dev_replace_args *args); -int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info, - const char *tgtdev_name, u64 srcdevid, const char *srcdev_name, - int read_src); void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info, struct btrfs_ioctl_dev_replace_args *args); int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info); void btrfs_dev_replace_suspend_for_unmount(struct btrfs_fs_info *fs_info); int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info); int btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace); -void btrfs_dev_replace_read_lock(struct btrfs_dev_replace *dev_replace); -void btrfs_dev_replace_read_unlock(struct btrfs_dev_replace *dev_replace); -void btrfs_dev_replace_write_lock(struct btrfs_dev_replace *dev_replace); -void btrfs_dev_replace_write_unlock(struct btrfs_dev_replace *dev_replace); -void btrfs_dev_replace_set_lock_blocking(struct btrfs_dev_replace *dev_replace); #endif diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c @@ -279,6 +279,12 @@ static int csum_tree_block(struct btrfs_fs_info *fs_info, len = buf->len - offset; while (len > 0) { + /* + * Note: we don't need to check for the err == 1 case here, as + * with the given combination of 'start = BTRFS_CSUM_SIZE (32)' + * and 'min_len = 32' and the currently implemented mapping + * algorithm we cannot cross a page boundary. + */ err = map_private_extent_buffer(buf, offset, 32, &kaddr, &map_start, &map_len); if (err) @@ -542,7 +548,7 @@ static int csum_dirty_buffer(struct btrfs_fs_info *fs_info, struct page *page) if (WARN_ON(!PageUptodate(page))) return -EUCLEAN; - ASSERT(memcmp_extent_buffer(eb, fs_info->fsid, + ASSERT(memcmp_extent_buffer(eb, fs_info->fs_devices->metadata_uuid, btrfs_header_fsid(), BTRFS_FSID_SIZE) == 0); return csum_tree_block(fs_info, eb, 0); @@ -557,7 +563,20 @@ static int check_tree_block_fsid(struct btrfs_fs_info *fs_info, read_extent_buffer(eb, fsid, btrfs_header_fsid(), BTRFS_FSID_SIZE); while (fs_devices) { - if (!memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE)) { + u8 *metadata_uuid; + + /* + * Checking the incompat flag is only valid for the current + * fs. For seed devices it's forbidden to have their uuid + * changed so reading ->fsid in this case is fine + */ + if (fs_devices == fs_info->fs_devices && + btrfs_fs_incompat(fs_info, METADATA_UUID)) + metadata_uuid = fs_devices->metadata_uuid; + else + metadata_uuid = fs_devices->fsid; + + if (!memcmp(fsid, metadata_uuid, BTRFS_FSID_SIZE)) { ret = 0; break; } @@ -660,19 +679,6 @@ out: return ret; } -static int btree_io_failed_hook(struct page *page, int failed_mirror) -{ - struct extent_buffer *eb; - - eb = (struct extent_buffer *)page->private; - set_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags); - eb->read_mirror = failed_mirror; - atomic_dec(&eb->io_pages); - if (test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags)) - btree_readahead_hook(eb, -EIO); - return -EIO; /* we fixed nothing */ -} - static void end_workqueue_bio(struct bio *bio) { struct btrfs_end_io_wq *end_io_wq = bio->bi_private; @@ -751,11 +757,22 @@ static void run_one_async_start(struct btrfs_work *work) async->status = ret; } +/* + * In order to insert checksums into the metadata in large chunks, we wait + * until bio submission time. All the pages in the bio are checksummed and + * sums are attached onto the ordered extent record. + * + * At IO completion time the csums attached on the ordered extent record are + * inserted into the tree. + */ static void run_one_async_done(struct btrfs_work *work) { struct async_submit_bio *async; + struct inode *inode; + blk_status_t ret; async = container_of(work, struct async_submit_bio, work); + inode = async->private_data; /* If an error occurred we just want to clean up the bio and move on */ if (async->status) { @@ -764,7 +781,12 @@ static void run_one_async_done(struct btrfs_work *work) return; } - btrfs_submit_bio_done(async->private_data, async->bio, async->mirror_num); + ret = btrfs_map_bio(btrfs_sb(inode->i_sb), async->bio, + async->mirror_num, 1); + if (ret) { + async->bio->bi_status = ret; + bio_endio(async->bio); + } } static void run_one_async_free(struct btrfs_work *work) @@ -1178,6 +1200,7 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info, refcount_set(&root->refs, 1); atomic_set(&root->will_be_snapshotted, 0); atomic_set(&root->snapshot_force_cow, 0); + atomic_set(&root->nr_swapfiles, 0); root->log_transid = 0; root->log_transid_committed = -1; root->last_log_commit = 0; @@ -2118,10 +2141,8 @@ static void btrfs_init_btree_inode(struct btrfs_fs_info *fs_info) static void btrfs_init_dev_replace_locks(struct btrfs_fs_info *fs_info) { mutex_init(&fs_info->dev_replace.lock_finishing_cancel_unmount); - rwlock_init(&fs_info->dev_replace.lock); - atomic_set(&fs_info->dev_replace.blocking_readers, 0); + init_rwsem(&fs_info->dev_replace.rwsem); init_waitqueue_head(&fs_info->dev_replace.replace_wait); - init_waitqueue_head(&fs_info->dev_replace.read_lock_wq); } static void btrfs_init_qgroup(struct btrfs_fs_info *fs_info) @@ -2442,10 +2463,11 @@ static int validate_super(struct btrfs_fs_info *fs_info, ret = -EINVAL; } - if (memcmp(fs_info->fsid, sb->dev_item.fsid, BTRFS_FSID_SIZE) != 0) { + if (memcmp(fs_info->fs_devices->metadata_uuid, sb->dev_item.fsid, + BTRFS_FSID_SIZE) != 0) { btrfs_err(fs_info, - "dev_item UUID does not match fsid: %pU != %pU", - fs_info->fsid, sb->dev_item.fsid); + "dev_item UUID does not match metadata fsid: %pU != %pU", + fs_info->fs_devices->metadata_uuid, sb->dev_item.fsid); ret = -EINVAL; } @@ -2656,6 +2678,9 @@ int open_ctree(struct super_block *sb, btrfs_init_block_rsv(&fs_info->empty_block_rsv, BTRFS_BLOCK_RSV_EMPTY); btrfs_init_block_rsv(&fs_info->delayed_block_rsv, BTRFS_BLOCK_RSV_DELOPS); + btrfs_init_block_rsv(&fs_info->delayed_refs_rsv, + BTRFS_BLOCK_RSV_DELREFS); + atomic_set(&fs_info->async_delalloc_pages, 0); atomic_set(&fs_info->defrag_running, 0); atomic_set(&fs_info->qgroup_op_seq, 0); @@ -2745,6 +2770,9 @@ int open_ctree(struct super_block *sb, fs_info->sectorsize = 4096; fs_info->stripesize = 4096; + spin_lock_init(&fs_info->swapfile_pins_lock); + fs_info->swapfile_pins = RB_ROOT; + ret = btrfs_alloc_stripe_hash_table(fs_info); if (ret) { err = ret; @@ -2781,11 +2809,29 @@ int open_ctree(struct super_block *sb, * the whole block of INFO_SIZE */ memcpy(fs_info->super_copy, bh->b_data, sizeof(*fs_info->super_copy)); - memcpy(fs_info->super_for_commit, fs_info->super_copy, - sizeof(*fs_info->super_for_commit)); brelse(bh); - memcpy(fs_info->fsid, fs_info->super_copy->fsid, BTRFS_FSID_SIZE); + disk_super = fs_info->super_copy; + + ASSERT(!memcmp(fs_info->fs_devices->fsid, fs_info->super_copy->fsid, + BTRFS_FSID_SIZE)); + + if (btrfs_fs_incompat(fs_info, METADATA_UUID)) { + ASSERT(!memcmp(fs_info->fs_devices->metadata_uuid, + fs_info->super_copy->metadata_uuid, + BTRFS_FSID_SIZE)); + } + + features = btrfs_super_flags(disk_super); + if (features & BTRFS_SUPER_FLAG_CHANGING_FSID_V2) { + features &= ~BTRFS_SUPER_FLAG_CHANGING_FSID_V2; + btrfs_set_super_flags(disk_super, features); + btrfs_info(fs_info, + "found metadata UUID change in progress flag, clearing"); + } + + memcpy(fs_info->super_for_commit, fs_info->super_copy, + sizeof(*fs_info->super_for_commit)); ret = btrfs_validate_mount_super(fs_info); if (ret) { @@ -2794,7 +2840,6 @@ int open_ctree(struct super_block *sb, goto fail_alloc; } - disk_super = fs_info->super_copy; if (!btrfs_super_root(disk_super)) goto fail_alloc; @@ -2906,7 +2951,7 @@ int open_ctree(struct super_block *sb, sb->s_blocksize = sectorsize; sb->s_blocksize_bits = blksize_bits(sectorsize); - memcpy(&sb->s_uuid, fs_info->fsid, BTRFS_FSID_SIZE); + memcpy(&sb->s_uuid, fs_info->fs_devices->fsid, BTRFS_FSID_SIZE); mutex_lock(&fs_info->chunk_mutex); ret = btrfs_read_sys_array(fs_info); @@ -3055,7 +3100,7 @@ retry_root_backup: if (!sb_rdonly(sb) && !btrfs_check_rw_degradable(fs_info, NULL)) { btrfs_warn(fs_info, - "writeable mount is not allowed due to too many missing devices"); + "writable mount is not allowed due to too many missing devices"); goto fail_sysfs; } @@ -3724,7 +3769,8 @@ int write_all_supers(struct btrfs_fs_info *fs_info, int max_mirrors) btrfs_set_stack_device_io_width(dev_item, dev->io_width); btrfs_set_stack_device_sector_size(dev_item, dev->sector_size); memcpy(dev_item->uuid, dev->uuid, BTRFS_UUID_SIZE); - memcpy(dev_item->fsid, dev->fs_devices->fsid, BTRFS_FSID_SIZE); + memcpy(dev_item->fsid, dev->fs_devices->metadata_uuid, + BTRFS_FSID_SIZE); flags = btrfs_super_flags(sb); btrfs_set_super_flags(sb, flags | BTRFS_HEADER_FLAG_WRITTEN); @@ -4031,7 +4077,7 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf) #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS /* * This is a fast path so only do this check if we have sanity tests - * enabled. Normal people shouldn't be using umapped buffers as dirty + * enabled. Normal people shouldn't be using unmapped buffers as dirty * outside of the sanity tests. */ if (unlikely(test_bit(EXTENT_BUFFER_UNMAPPED, &buf->bflags))) @@ -4329,6 +4375,8 @@ static int btrfs_destroy_pinned_extent(struct btrfs_fs_info *fs_info, unpin = pinned_extents; again: while (1) { + struct extent_state *cached_state = NULL; + /* * The btrfs_finish_extent_commit() may get the same range as * ours between find_first_extent_bit and clear_extent_dirty. @@ -4337,13 +4385,14 @@ again: */ mutex_lock(&fs_info->unused_bg_unpin_mutex); ret = find_first_extent_bit(unpin, 0, &start, &end, - EXTENT_DIRTY, NULL); + EXTENT_DIRTY, &cached_state); if (ret) { mutex_unlock(&fs_info->unused_bg_unpin_mutex); break; } - clear_extent_dirty(unpin, start, end); + clear_extent_dirty(unpin, start, end, &cached_state); + free_extent_state(cached_state); btrfs_error_unpin_extent_range(fs_info, start, end); mutex_unlock(&fs_info->unused_bg_unpin_mutex); cond_resched(); @@ -4400,6 +4449,7 @@ void btrfs_cleanup_dirty_bgs(struct btrfs_transaction *cur_trans, spin_unlock(&cur_trans->dirty_bgs_lock); btrfs_put_block_group(cache); + btrfs_delayed_refs_rsv_release(fs_info, 1); spin_lock(&cur_trans->dirty_bgs_lock); } spin_unlock(&cur_trans->dirty_bgs_lock); @@ -4505,7 +4555,4 @@ static const struct extent_io_ops btree_extent_io_ops = { /* mandatory callbacks */ .submit_bio_hook = btree_submit_bio_hook, .readpage_end_io_hook = btree_readpage_end_io_hook, - .readpage_io_failed_hook = btree_io_failed_hook, - - /* optional callbacks */ }; diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h @@ -21,11 +21,11 @@ #define BTRFS_BDEV_BLOCKSIZE (4096) enum btrfs_wq_endio_type { - BTRFS_WQ_ENDIO_DATA = 0, - BTRFS_WQ_ENDIO_METADATA = 1, - BTRFS_WQ_ENDIO_FREE_SPACE = 2, - BTRFS_WQ_ENDIO_RAID56 = 3, - BTRFS_WQ_ENDIO_DIO_REPAIR = 4, + BTRFS_WQ_ENDIO_DATA, + BTRFS_WQ_ENDIO_METADATA, + BTRFS_WQ_ENDIO_FREE_SPACE, + BTRFS_WQ_ENDIO_RAID56, + BTRFS_WQ_ENDIO_DIO_REPAIR, }; static inline u64 btrfs_sb_offset(int mirror) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c @@ -51,6 +51,24 @@ enum { CHUNK_ALLOC_FORCE = 2, }; +/* + * Declare a helper function to detect underflow of various space info members + */ +#define DECLARE_SPACE_INFO_UPDATE(name) \ +static inline void update_##name(struct btrfs_space_info *sinfo, \ + s64 bytes) \ +{ \ + if (bytes < 0 && sinfo->name < -bytes) { \ + WARN_ON(1); \ + sinfo->name = 0; \ + return; \ + } \ + sinfo->name += bytes; \ +} + +DECLARE_SPACE_INFO_UPDATE(bytes_may_use); +DECLARE_SPACE_INFO_UPDATE(bytes_pinned); + static int __btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_node *node, u64 parent, u64 root_objectid, u64 owner_objectid, @@ -1037,7 +1055,7 @@ out_free: /* * is_data == BTRFS_REF_TYPE_BLOCK, tree block type is required, - * is_data == BTRFS_REF_TYPE_DATA, data type is requried, + * is_data == BTRFS_REF_TYPE_DATA, data type is requiried, * is_data == BTRFS_REF_TYPE_ANY, either type is OK. */ int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb, @@ -2406,25 +2424,82 @@ static void unselect_delayed_ref_head(struct btrfs_delayed_ref_root *delayed_ref btrfs_delayed_ref_unlock(head); } -static int cleanup_extent_op(struct btrfs_trans_handle *trans, - struct btrfs_delayed_ref_head *head) +static struct btrfs_delayed_extent_op *cleanup_extent_op( + struct btrfs_delayed_ref_head *head) { struct btrfs_delayed_extent_op *extent_op = head->extent_op; - int ret; if (!extent_op) - return 0; - head->extent_op = NULL; + return NULL; + if (head->must_insert_reserved) { + head->extent_op = NULL; btrfs_free_delayed_extent_op(extent_op); - return 0; + return NULL; } + return extent_op; +} + +static int run_and_cleanup_extent_op(struct btrfs_trans_handle *trans, + struct btrfs_delayed_ref_head *head) +{ + struct btrfs_delayed_extent_op *extent_op; + int ret; + + extent_op = cleanup_extent_op(head); + if (!extent_op) + return 0; + head->extent_op = NULL; spin_unlock(&head->lock); ret = run_delayed_extent_op(trans, head, extent_op); btrfs_free_delayed_extent_op(extent_op); return ret ? ret : 1; } +static void cleanup_ref_head_accounting(struct btrfs_trans_handle *trans, + struct btrfs_delayed_ref_head *head) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_delayed_ref_root *delayed_refs = + &trans->transaction->delayed_refs; + int nr_items = 1; /* Dropping this ref head update. */ + + if (head->total_ref_mod < 0) { + struct btrfs_space_info *space_info; + u64 flags; + + if (head->is_data) + flags = BTRFS_BLOCK_GROUP_DATA; + else if (head->is_system) + flags = BTRFS_BLOCK_GROUP_SYSTEM; + else + flags = BTRFS_BLOCK_GROUP_METADATA; + space_info = __find_space_info(fs_info, flags); + ASSERT(space_info); + percpu_counter_add_batch(&space_info->total_bytes_pinned, + -head->num_bytes, + BTRFS_TOTAL_BYTES_PINNED_BATCH); + + /* + * We had csum deletions accounted for in our delayed refs rsv, + * we need to drop the csum leaves for this update from our + * delayed_refs_rsv. + */ + if (head->is_data) { + spin_lock(&delayed_refs->lock); + delayed_refs->pending_csums -= head->num_bytes; + spin_unlock(&delayed_refs->lock); + nr_items += btrfs_csum_bytes_to_leaves(fs_info, + head->num_bytes); + } + } + + /* Also free its reserved qgroup space */ + btrfs_qgroup_free_delayed_ref(fs_info, head->qgroup_ref_root, + head->qgroup_reserved); + btrfs_delayed_refs_rsv_release(fs_info, nr_items); +} + static int cleanup_ref_head(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_head *head) { @@ -2435,7 +2510,7 @@ static int cleanup_ref_head(struct btrfs_trans_handle *trans, delayed_refs = &trans->transaction->delayed_refs; - ret = cleanup_extent_op(trans, head); + ret = run_and_cleanup_extent_op(trans, head); if (ret < 0) { unselect_delayed_ref_head(delayed_refs, head); btrfs_debug(fs_info, "run_delayed_extent_op returned %d", ret); @@ -2456,37 +2531,9 @@ static int cleanup_ref_head(struct btrfs_trans_handle *trans, spin_unlock(&delayed_refs->lock); return 1; } - delayed_refs->num_heads--; - rb_erase_cached(&head->href_node, &delayed_refs->href_root); - RB_CLEAR_NODE(&head->href_node); + btrfs_delete_ref_head(delayed_refs, head); spin_unlock(&head->lock); spin_unlock(&delayed_refs->lock); - atomic_dec(&delayed_refs->num_entries); - - trace_run_delayed_ref_head(fs_info, head, 0); - - if (head->total_ref_mod < 0) { - struct btrfs_space_info *space_info; - u64 flags; - - if (head->is_data) - flags = BTRFS_BLOCK_GROUP_DATA; - else if (head->is_system) - flags = BTRFS_BLOCK_GROUP_SYSTEM; - else - flags = BTRFS_BLOCK_GROUP_METADATA; - space_info = __find_space_info(fs_info, flags); - ASSERT(space_info); - percpu_counter_add_batch(&space_info->total_bytes_pinned, - -head->num_bytes, - BTRFS_TOTAL_BYTES_PINNED_BATCH); - - if (head->is_data) { - spin_lock(&delayed_refs->lock); - delayed_refs->pending_csums -= head->num_bytes; - spin_unlock(&delayed_refs->lock); - } - } if (head->must_insert_reserved) { btrfs_pin_extent(fs_info, head->bytenr, @@ -2497,9 +2544,9 @@ static int cleanup_ref_head(struct btrfs_trans_handle *trans, } } - /* Also free its reserved qgroup space */ - btrfs_qgroup_free_delayed_ref(fs_info, head->qgroup_ref_root, - head->qgroup_reserved); + cleanup_ref_head_accounting(trans, head); + + trace_run_delayed_ref_head(fs_info, head, 0); btrfs_delayed_ref_unlock(head); btrfs_put_delayed_ref_head(head); return 0; @@ -2792,40 +2839,28 @@ u64 btrfs_csum_bytes_to_leaves(struct btrfs_fs_info *fs_info, u64 csum_bytes) return num_csums; } -int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans) +bool btrfs_check_space_for_delayed_refs(struct btrfs_fs_info *fs_info) { - struct btrfs_fs_info *fs_info = trans->fs_info; - struct btrfs_block_rsv *global_rsv; - u64 num_heads = trans->transaction->delayed_refs.num_heads_ready; - u64 csum_bytes = trans->transaction->delayed_refs.pending_csums; - unsigned int num_dirty_bgs = trans->transaction->num_dirty_bgs; - u64 num_bytes, num_dirty_bgs_bytes; - int ret = 0; + struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv; + struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; + bool ret = false; + u64 reserved; - num_bytes = btrfs_calc_trans_metadata_size(fs_info, 1); - num_heads = heads_to_leaves(fs_info, num_heads); - if (num_heads > 1) - num_bytes += (num_heads - 1) * fs_info->nodesize; - num_bytes <<= 1; - num_bytes += btrfs_csum_bytes_to_leaves(fs_info, csum_bytes) * - fs_info->nodesize; - num_dirty_bgs_bytes = btrfs_calc_trans_metadata_size(fs_info, - num_dirty_bgs); - global_rsv = &fs_info->global_block_rsv; + spin_lock(&global_rsv->lock); + reserved = global_rsv->reserved; + spin_unlock(&global_rsv->lock); /* - * If we can't allocate any more chunks lets make sure we have _lots_ of - * wiggle room since running delayed refs can create more delayed refs. + * Since the global reserve is just kind of magic we don't really want + * to rely on it to save our bacon, so if our size is more than the + * delayed_refs_rsv and the global rsv then it's time to think about + * bailing. */ - if (global_rsv->space_info->full) { - num_dirty_bgs_bytes <<= 1; - num_bytes <<= 1; - } - - spin_lock(&global_rsv->lock); - if (global_rsv->reserved <= num_bytes + num_dirty_bgs_bytes) - ret = 1; - spin_unlock(&global_rsv->lock); + spin_lock(&delayed_refs_rsv->lock); + reserved += delayed_refs_rsv->reserved; + if (delayed_refs_rsv->size >= reserved) + ret = true; + spin_unlock(&delayed_refs_rsv->lock); return ret; } @@ -2844,7 +2879,7 @@ int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans) if (val >= NSEC_PER_SEC / 2) return 2; - return btrfs_check_space_for_delayed_refs(trans); + return btrfs_check_space_for_delayed_refs(trans->fs_info); } struct async_delayed_refs { @@ -3588,6 +3623,8 @@ again: */ mutex_lock(&trans->transaction->cache_write_mutex); while (!list_empty(&dirty)) { + bool drop_reserve = true; + cache = list_first_entry(&dirty, struct btrfs_block_group_cache, dirty_list); @@ -3660,6 +3697,7 @@ again: list_add_tail(&cache->dirty_list, &cur_trans->dirty_bgs); btrfs_get_block_group(cache); + drop_reserve = false; } spin_unlock(&cur_trans->dirty_bgs_lock); } else if (ret) { @@ -3667,9 +3705,11 @@ again: } } - /* if its not on the io list, we need to put the block group */ + /* if it's not on the io list, we need to put the block group */ if (should_put) btrfs_put_block_group(cache); + if (drop_reserve) + btrfs_delayed_refs_rsv_release(fs_info, 1); if (ret) break; @@ -3818,6 +3858,7 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, /* if its not on the io list, we need to put the block group */ if (should_put) btrfs_put_block_group(cache); + btrfs_delayed_refs_rsv_release(fs_info, 1); spin_lock(&cur_trans->dirty_bgs_lock); } spin_unlock(&cur_trans->dirty_bgs_lock); @@ -4256,7 +4297,7 @@ commit_trans: data_sinfo->flags, bytes, 1); return -ENOSPC; } - data_sinfo->bytes_may_use += bytes; + update_bytes_may_use(data_sinfo, bytes); trace_btrfs_space_reservation(fs_info, "space_info", data_sinfo->flags, bytes, 1); spin_unlock(&data_sinfo->lock); @@ -4309,10 +4350,7 @@ void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start, data_sinfo = fs_info->data_sinfo; spin_lock(&data_sinfo->lock); - if (WARN_ON(data_sinfo->bytes_may_use < len)) - data_sinfo->bytes_may_use = 0; - else - data_sinfo->bytes_may_use -= len; + update_bytes_may_use(data_sinfo, -len); trace_btrfs_space_reservation(fs_info, "space_info", data_sinfo->flags, len, 0); spin_unlock(&data_sinfo->lock); @@ -4637,7 +4675,7 @@ static int can_overcommit(struct btrfs_fs_info *fs_info, /* * If we have dup, raid1 or raid10 then only half of the free - * space is actually useable. For raid56, the space info used + * space is actually usable. For raid56, the space info used * doesn't include the parity drive, so we don't have to * change the math */ @@ -4793,8 +4831,10 @@ static int may_commit_transaction(struct btrfs_fs_info *fs_info, { struct reserve_ticket *ticket = NULL; struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_block_rsv; + struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv; struct btrfs_trans_handle *trans; - u64 bytes; + u64 bytes_needed; + u64 reclaim_bytes = 0; trans = (struct btrfs_trans_handle *)current->journal_info; if (trans) @@ -4807,15 +4847,15 @@ static int may_commit_transaction(struct btrfs_fs_info *fs_info, else if (!list_empty(&space_info->tickets)) ticket = list_first_entry(&space_info->tickets, struct reserve_ticket, list); - bytes = (ticket) ? ticket->bytes : 0; + bytes_needed = (ticket) ? ticket->bytes : 0; spin_unlock(&space_info->lock); - if (!bytes) + if (!bytes_needed) return 0; /* See if there is enough pinned space to make this reservation */ if (__percpu_counter_compare(&space_info->total_bytes_pinned, - bytes, + bytes_needed, BTRFS_TOTAL_BYTES_PINNED_BATCH) >= 0) goto commit; @@ -4827,14 +4867,18 @@ static int may_commit_transaction(struct btrfs_fs_info *fs_info, return -ENOSPC; spin_lock(&delayed_rsv->lock); - if (delayed_rsv->size > bytes) - bytes = 0; - else - bytes -= delayed_rsv->size; + reclaim_bytes += delayed_rsv->reserved; spin_unlock(&delayed_rsv->lock); + spin_lock(&delayed_refs_rsv->lock); + reclaim_bytes += delayed_refs_rsv->reserved; + spin_unlock(&delayed_refs_rsv->lock); + if (reclaim_bytes >= bytes_needed) + goto commit; + bytes_needed -= reclaim_bytes; + if (__percpu_counter_compare(&space_info->total_bytes_pinned, - bytes, + bytes_needed, BTRFS_TOTAL_BYTES_PINNED_BATCH) < 0) { return -ENOSPC; } @@ -4882,6 +4926,20 @@ static void flush_space(struct btrfs_fs_info *fs_info, shrink_delalloc(fs_info, num_bytes * 2, num_bytes, state == FLUSH_DELALLOC_WAIT); break; + case FLUSH_DELAYED_REFS_NR: + case FLUSH_DELAYED_REFS: + trans = btrfs_join_transaction(root); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + break; + } + if (state == FLUSH_DELAYED_REFS_NR) + nr = calc_reclaim_items_nr(fs_info, num_bytes); + else + nr = 0; + btrfs_run_delayed_refs(trans, nr); + btrfs_end_transaction(trans); + break; case ALLOC_CHUNK: trans = btrfs_join_transaction(root); if (IS_ERR(trans)) { @@ -5108,7 +5166,7 @@ static int wait_reserve_ticket(struct btrfs_fs_info *fs_info, list_del_init(&ticket->list); if (ticket->bytes && ticket->bytes < orig_bytes) { u64 num_bytes = orig_bytes - ticket->bytes; - space_info->bytes_may_use -= num_bytes; + update_bytes_may_use(space_info, -num_bytes); trace_btrfs_space_reservation(fs_info, "space_info", space_info->flags, num_bytes, 0); } @@ -5154,13 +5212,13 @@ static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info, * If not things get more complicated. */ if (used + orig_bytes <= space_info->total_bytes) { - space_info->bytes_may_use += orig_bytes; + update_bytes_may_use(space_info, orig_bytes); trace_btrfs_space_reservation(fs_info, "space_info", space_info->flags, orig_bytes, 1); ret = 0; } else if (can_overcommit(fs_info, space_info, orig_bytes, flush, system_chunk)) { - space_info->bytes_may_use += orig_bytes; + update_bytes_may_use(space_info, orig_bytes); trace_btrfs_space_reservation(fs_info, "space_info", space_info->flags, orig_bytes, 1); ret = 0; @@ -5223,7 +5281,7 @@ static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info, if (ticket.bytes) { if (ticket.bytes < orig_bytes) { u64 num_bytes = orig_bytes - ticket.bytes; - space_info->bytes_may_use -= num_bytes; + update_bytes_may_use(space_info, -num_bytes); trace_btrfs_space_reservation(fs_info, "space_info", space_info->flags, num_bytes, 0); @@ -5244,7 +5302,7 @@ static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info, * @orig_bytes - the number of bytes we want * @flush - whether or not we can flush to make our reservation * - * This will reserve orgi_bytes number of bytes from the space info associated + * This will reserve orig_bytes number of bytes from the space info associated * with the block_rsv. If there is not enough space it will make an attempt to * flush out space to make room. It will do this by flushing delalloc if * possible or committing the transaction. If flush is 0 then no attempts to @@ -5354,6 +5412,90 @@ int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info, return 0; } +/** + * btrfs_migrate_to_delayed_refs_rsv - transfer bytes to our delayed refs rsv. + * @fs_info - the fs info for our fs. + * @src - the source block rsv to transfer from. + * @num_bytes - the number of bytes to transfer. + * + * This transfers up to the num_bytes amount from the src rsv to the + * delayed_refs_rsv. Any extra bytes are returned to the space info. + */ +void btrfs_migrate_to_delayed_refs_rsv(struct btrfs_fs_info *fs_info, + struct btrfs_block_rsv *src, + u64 num_bytes) +{ + struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv; + u64 to_free = 0; + + spin_lock(&src->lock); + src->reserved -= num_bytes; + src->size -= num_bytes; + spin_unlock(&src->lock); + + spin_lock(&delayed_refs_rsv->lock); + if (delayed_refs_rsv->size > delayed_refs_rsv->reserved) { + u64 delta = delayed_refs_rsv->size - + delayed_refs_rsv->reserved; + if (num_bytes > delta) { + to_free = num_bytes - delta; + num_bytes = delta; + } + } else { + to_free = num_bytes; + num_bytes = 0; + } + + if (num_bytes) + delayed_refs_rsv->reserved += num_bytes; + if (delayed_refs_rsv->reserved >= delayed_refs_rsv->size) + delayed_refs_rsv->full = 1; + spin_unlock(&delayed_refs_rsv->lock); + + if (num_bytes) + trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv", + 0, num_bytes, 1); + if (to_free) + space_info_add_old_bytes(fs_info, delayed_refs_rsv->space_info, + to_free); +} + +/** + * btrfs_delayed_refs_rsv_refill - refill based on our delayed refs usage. + * @fs_info - the fs_info for our fs. + * @flush - control how we can flush for this reservation. + * + * This will refill the delayed block_rsv up to 1 items size worth of space and + * will return -ENOSPC if we can't make the reservation. + */ +int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info, + enum btrfs_reserve_flush_enum flush) +{ + struct btrfs_block_rsv *block_rsv = &fs_info->delayed_refs_rsv; + u64 limit = btrfs_calc_trans_metadata_size(fs_info, 1); + u64 num_bytes = 0; + int ret = -ENOSPC; + + spin_lock(&block_rsv->lock); + if (block_rsv->reserved < block_rsv->size) { + num_bytes = block_rsv->size - block_rsv->reserved; + num_bytes = min(num_bytes, limit); + } + spin_unlock(&block_rsv->lock); + + if (!num_bytes) + return 0; + + ret = reserve_metadata_bytes(fs_info->extent_root, block_rsv, + num_bytes, flush); + if (ret) + return ret; + block_rsv_add_bytes(block_rsv, num_bytes, 0); + trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv", + 0, num_bytes, 1); + return 0; +} + /* * This is for space we already have accounted in space_info->bytes_may_use, so * basically when we're returning space from block_rsv's. @@ -5407,7 +5549,7 @@ again: flush = BTRFS_RESERVE_FLUSH_ALL; goto again; } - space_info->bytes_may_use -= num_bytes; + update_bytes_may_use(space_info, -num_bytes); trace_btrfs_space_reservation(fs_info, "space_info", space_info->flags, num_bytes, 0); spin_unlock(&space_info->lock); @@ -5435,7 +5577,7 @@ again: ticket->bytes, 1); list_del_init(&ticket->list); num_bytes -= ticket->bytes; - space_info->bytes_may_use += ticket->bytes; + update_bytes_may_use(space_info, ticket->bytes); ticket->bytes = 0; space_info->tickets_id++; wake_up(&ticket->wait); @@ -5443,7 +5585,7 @@ again: trace_btrfs_space_reservation(fs_info, "space_info", space_info->flags, num_bytes, 1); - space_info->bytes_may_use += num_bytes; + update_bytes_may_use(space_info, num_bytes); ticket->bytes -= num_bytes; num_bytes = 0; } @@ -5629,11 +5771,11 @@ int btrfs_block_rsv_refill(struct btrfs_root *root, /** * btrfs_inode_rsv_refill - refill the inode block rsv. * @inode - the inode we are refilling. - * @flush - the flusing restriction. + * @flush - the flushing restriction. * * Essentially the same as btrfs_block_rsv_refill, except it uses the * block_rsv->size as the minimum size. We'll either refill the missing amount - * or return if we already have enough space. This will also handle the resreve + * or return if we already have enough space. This will also handle the reserve * tracepoint for the reserved amount. */ static int btrfs_inode_rsv_refill(struct btrfs_inode *inode, @@ -5674,6 +5816,31 @@ static int btrfs_inode_rsv_refill(struct btrfs_inode *inode, return ret; } +static u64 __btrfs_block_rsv_release(struct btrfs_fs_info *fs_info, + struct btrfs_block_rsv *block_rsv, + u64 num_bytes, u64 *qgroup_to_release) +{ + struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; + struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_refs_rsv; + struct btrfs_block_rsv *target = delayed_rsv; + + if (target->full || target == block_rsv) + target = global_rsv; + + if (block_rsv->space_info != target->space_info) + target = NULL; + + return block_rsv_release_bytes(fs_info, block_rsv, target, num_bytes, + qgroup_to_release); +} + +void btrfs_block_rsv_release(struct btrfs_fs_info *fs_info, + struct btrfs_block_rsv *block_rsv, + u64 num_bytes) +{ + __btrfs_block_rsv_release(fs_info, block_rsv, num_bytes, NULL); +} + /** * btrfs_inode_rsv_release - release any excessive reservation. * @inode - the inode we need to release from. @@ -5688,7 +5855,6 @@ static int btrfs_inode_rsv_refill(struct btrfs_inode *inode, static void btrfs_inode_rsv_release(struct btrfs_inode *inode, bool qgroup_free) { struct btrfs_fs_info *fs_info = inode->root->fs_info; - struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; struct btrfs_block_rsv *block_rsv = &inode->block_rsv; u64 released = 0; u64 qgroup_to_release = 0; @@ -5698,8 +5864,8 @@ static void btrfs_inode_rsv_release(struct btrfs_inode *inode, bool qgroup_free) * are releasing 0 bytes, and then we'll just get the reservation over * the size free'd. */ - released = block_rsv_release_bytes(fs_info, block_rsv, global_rsv, 0, - &qgroup_to_release); + released = __btrfs_block_rsv_release(fs_info, block_rsv, 0, + &qgroup_to_release); if (released > 0) trace_btrfs_space_reservation(fs_info, "delalloc", btrfs_ino(inode), released, 0); @@ -5710,16 +5876,26 @@ static void btrfs_inode_rsv_release(struct btrfs_inode *inode, bool qgroup_free) qgroup_to_release); } -void btrfs_block_rsv_release(struct btrfs_fs_info *fs_info, - struct btrfs_block_rsv *block_rsv, - u64 num_bytes) +/** + * btrfs_delayed_refs_rsv_release - release a ref head's reservation. + * @fs_info - the fs_info for our fs. + * @nr - the number of items to drop. + * + * This drops the delayed ref head's count from the delayed refs rsv and frees + * any excess reservation we had. + */ +void btrfs_delayed_refs_rsv_release(struct btrfs_fs_info *fs_info, int nr) { + struct btrfs_block_rsv *block_rsv = &fs_info->delayed_refs_rsv; struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; + u64 num_bytes = btrfs_calc_trans_metadata_size(fs_info, nr); + u64 released = 0; - if (global_rsv == block_rsv || - block_rsv->space_info != global_rsv->space_info) - global_rsv = NULL; - block_rsv_release_bytes(fs_info, block_rsv, global_rsv, num_bytes, NULL); + released = block_rsv_release_bytes(fs_info, block_rsv, global_rsv, + num_bytes, NULL); + if (released) + trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv", + 0, released, 0); } static void update_global_block_rsv(struct btrfs_fs_info *fs_info) @@ -5750,14 +5926,14 @@ static void update_global_block_rsv(struct btrfs_fs_info *fs_info) num_bytes = min(num_bytes, block_rsv->size - block_rsv->reserved); block_rsv->reserved += num_bytes; - sinfo->bytes_may_use += num_bytes; + update_bytes_may_use(sinfo, num_bytes); trace_btrfs_space_reservation(fs_info, "space_info", sinfo->flags, num_bytes, 1); } } else if (block_rsv->reserved > block_rsv->size) { num_bytes = block_rsv->reserved - block_rsv->size; - sinfo->bytes_may_use -= num_bytes; + update_bytes_may_use(sinfo, -num_bytes); trace_btrfs_space_reservation(fs_info, "space_info", sinfo->flags, num_bytes, 0); block_rsv->reserved = block_rsv->size; @@ -5784,9 +5960,10 @@ static void init_global_block_rsv(struct btrfs_fs_info *fs_info) fs_info->trans_block_rsv.space_info = space_info; fs_info->empty_block_rsv.space_info = space_info; fs_info->delayed_block_rsv.space_info = space_info; + fs_info->delayed_refs_rsv.space_info = space_info; - fs_info->extent_root->block_rsv = &fs_info->global_block_rsv; - fs_info->csum_root->block_rsv = &fs_info->global_block_rsv; + fs_info->extent_root->block_rsv = &fs_info->delayed_refs_rsv; + fs_info->csum_root->block_rsv = &fs_info->delayed_refs_rsv; fs_info->dev_root->block_rsv = &fs_info->global_block_rsv; fs_info->tree_root->block_rsv = &fs_info->global_block_rsv; if (fs_info->quota_root) @@ -5806,8 +5983,34 @@ static void release_global_block_rsv(struct btrfs_fs_info *fs_info) WARN_ON(fs_info->chunk_block_rsv.reserved > 0); WARN_ON(fs_info->delayed_block_rsv.size > 0); WARN_ON(fs_info->delayed_block_rsv.reserved > 0); + WARN_ON(fs_info->delayed_refs_rsv.reserved > 0); + WARN_ON(fs_info->delayed_refs_rsv.size > 0); } +/* + * btrfs_update_delayed_refs_rsv - adjust the size of the delayed refs rsv + * @trans - the trans that may have generated delayed refs + * + * This is to be called anytime we may have adjusted trans->delayed_ref_updates, + * it'll calculate the additional size and add it to the delayed_refs_rsv. + */ +void btrfs_update_delayed_refs_rsv(struct btrfs_trans_handle *trans) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_refs_rsv; + u64 num_bytes; + + if (!trans->delayed_ref_updates) + return; + + num_bytes = btrfs_calc_trans_metadata_size(fs_info, + trans->delayed_ref_updates); + spin_lock(&delayed_rsv->lock); + delayed_rsv->size += num_bytes; + delayed_rsv->full = 0; + spin_unlock(&delayed_rsv->lock); + trans->delayed_ref_updates = 0; +} /* * To be called after all the new block groups attached to the transaction @@ -6100,6 +6303,7 @@ static int update_block_group(struct btrfs_trans_handle *trans, u64 old_val; u64 byte_in_group; int factor; + int ret = 0; /* block accounting for super block */ spin_lock(&info->delalloc_root_lock); @@ -6113,8 +6317,10 @@ static int update_block_group(struct btrfs_trans_handle *trans, while (total) { cache = btrfs_lookup_block_group(info, bytenr); - if (!cache) - return -ENOENT; + if (!cache) { + ret = -ENOENT; + break; + } factor = btrfs_bg_type_to_factor(cache->flags); /* @@ -6151,7 +6357,7 @@ static int update_block_group(struct btrfs_trans_handle *trans, old_val -= num_bytes; btrfs_set_block_group_used(&cache->item, old_val); cache->pinned += num_bytes; - cache->space_info->bytes_pinned += num_bytes; + update_bytes_pinned(cache->space_info, num_bytes); cache->space_info->bytes_used -= num_bytes; cache->space_info->disk_used -= num_bytes * factor; spin_unlock(&cache->lock); @@ -6173,6 +6379,7 @@ static int update_block_group(struct btrfs_trans_handle *trans, list_add_tail(&cache->dirty_list, &trans->transaction->dirty_bgs); trans->transaction->num_dirty_bgs++; + trans->delayed_ref_updates++; btrfs_get_block_group(cache); } spin_unlock(&trans->transaction->dirty_bgs_lock); @@ -6190,7 +6397,10 @@ static int update_block_group(struct btrfs_trans_handle *trans, total -= num_bytes; bytenr += num_bytes; } - return 0; + + /* Modified block groups are accounted for in the delayed_refs_rsv. */ + btrfs_update_delayed_refs_rsv(trans); + return ret; } static u64 first_logical_byte(struct btrfs_fs_info *fs_info, u64 search_start) @@ -6222,7 +6432,7 @@ static int pin_down_extent(struct btrfs_fs_info *fs_info, spin_lock(&cache->space_info->lock); spin_lock(&cache->lock); cache->pinned += num_bytes; - cache->space_info->bytes_pinned += num_bytes; + update_bytes_pinned(cache->space_info, num_bytes); if (reserved) { cache->reserved -= num_bytes; cache->space_info->bytes_reserved -= num_bytes; @@ -6431,7 +6641,7 @@ static int btrfs_add_reserved_bytes(struct btrfs_block_group_cache *cache, } else { cache->reserved += num_bytes; space_info->bytes_reserved += num_bytes; - space_info->bytes_may_use -= ram_bytes; + update_bytes_may_use(space_info, -ram_bytes); if (delalloc) cache->delalloc_bytes += num_bytes; } @@ -6587,7 +6797,7 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info, spin_lock(&space_info->lock); spin_lock(&cache->lock); cache->pinned -= len; - space_info->bytes_pinned -= len; + update_bytes_pinned(space_info, -len); trace_btrfs_space_reservation(fs_info, "pinned", space_info->flags, len, 0); @@ -6608,7 +6818,7 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info, to_add = min(len, global_rsv->size - global_rsv->reserved); global_rsv->reserved += to_add; - space_info->bytes_may_use += to_add; + update_bytes_may_use(space_info, to_add); if (global_rsv->reserved >= global_rsv->size) global_rsv->full = 1; trace_btrfs_space_reservation(fs_info, @@ -6647,9 +6857,11 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans) unpin = &fs_info->freed_extents[0]; while (!trans->aborted) { + struct extent_state *cached_state = NULL; + mutex_lock(&fs_info->unused_bg_unpin_mutex); ret = find_first_extent_bit(unpin, 0, &start, &end, - EXTENT_DIRTY, NULL); + EXTENT_DIRTY, &cached_state); if (ret) { mutex_unlock(&fs_info->unused_bg_unpin_mutex); break; @@ -6659,9 +6871,10 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans) ret = btrfs_discard_extent(fs_info, start, end + 1 - start, NULL); - clear_extent_dirty(unpin, start, end); + clear_extent_dirty(unpin, start, end, &cached_state); unpin_extent_range(fs_info, start, end, true); mutex_unlock(&fs_info->unused_bg_unpin_mutex); + free_extent_state(cached_state); cond_resched(); } @@ -6955,12 +7168,8 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans, if (!RB_EMPTY_ROOT(&head->ref_tree.rb_root)) goto out; - if (head->extent_op) { - if (!head->must_insert_reserved) - goto out; - btrfs_free_delayed_extent_op(head->extent_op); - head->extent_op = NULL; - } + if (cleanup_extent_op(head) != NULL) + goto out; /* * waiting for the lock here would deadlock. If someone else has it @@ -6969,22 +7178,9 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans, if (!mutex_trylock(&head->mutex)) goto out; - /* - * at this point we have a head with no other entries. Go - * ahead and process it. - */ - rb_erase_cached(&head->href_node, &delayed_refs->href_root); - RB_CLEAR_NODE(&head->href_node); - atomic_dec(&delayed_refs->num_entries); - - /* - * we don't take a ref on the node because we're removing it from the - * tree, so we just steal the ref the tree was holding. - */ - delayed_refs->num_heads--; - if (head->processing == 0) - delayed_refs->num_heads_ready--; + btrfs_delete_ref_head(delayed_refs, head); head->processing = 0; + spin_unlock(&head->lock); spin_unlock(&delayed_refs->lock); @@ -6992,6 +7188,7 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans, if (head->must_insert_reserved) ret = 1; + cleanup_ref_head_accounting(trans, head); mutex_unlock(&head->mutex); btrfs_put_delayed_ref_head(head); return ret; @@ -7239,6 +7436,345 @@ btrfs_release_block_group(struct btrfs_block_group_cache *cache, } /* + * Structure used internally for find_free_extent() function. Wraps needed + * parameters. + */ +struct find_free_extent_ctl { + /* Basic allocation info */ + u64 ram_bytes; + u64 num_bytes; + u64 empty_size; + u64 flags; + int delalloc; + + /* Where to start the search inside the bg */ + u64 search_start; + + /* For clustered allocation */ + u64 empty_cluster; + + bool have_caching_bg; + bool orig_have_caching_bg; + + /* RAID index, converted from flags */ + int index; + + /* + * Current loop number, check find_free_extent_update_loop() for details + */ + int loop; + + /* + * Whether we're refilling a cluster, if true we need to re-search + * current block group but don't try to refill the cluster again. + */ + bool retry_clustered; + + /* + * Whether we're updating free space cache, if true we need to re-search + * current block group but don't try updating free space cache again. + */ + bool retry_unclustered; + + /* If current block group is cached */ + int cached; + + /* Max contiguous hole found */ + u64 max_extent_size; + + /* Total free space from free space cache, not always contiguous */ + u64 total_free_space; + + /* Found result */ + u64 found_offset; +}; + + +/* + * Helper function for find_free_extent(). + * + * Return -ENOENT to inform caller that we need fallback to unclustered mode. + * Return -EAGAIN to inform caller that we need to re-search this block group + * Return >0 to inform caller that we find nothing + * Return 0 means we have found a location and set ffe_ctl->found_offset. + */ +static int find_free_extent_clustered(struct btrfs_block_group_cache *bg, + struct btrfs_free_cluster *last_ptr, + struct find_free_extent_ctl *ffe_ctl, + struct btrfs_block_group_cache **cluster_bg_ret) +{ + struct btrfs_fs_info *fs_info = bg->fs_info; + struct btrfs_block_group_cache *cluster_bg; + u64 aligned_cluster; + u64 offset; + int ret; + + cluster_bg = btrfs_lock_cluster(bg, last_ptr, ffe_ctl->delalloc); + if (!cluster_bg) + goto refill_cluster; + if (cluster_bg != bg && (cluster_bg->ro || + !block_group_bits(cluster_bg, ffe_ctl->flags))) + goto release_cluster; + + offset = btrfs_alloc_from_cluster(cluster_bg, last_ptr, + ffe_ctl->num_bytes, cluster_bg->key.objectid, + &ffe_ctl->max_extent_size); + if (offset) { + /* We have a block, we're done */ + spin_unlock(&last_ptr->refill_lock); + trace_btrfs_reserve_extent_cluster(cluster_bg, + ffe_ctl->search_start, ffe_ctl->num_bytes); + *cluster_bg_ret = cluster_bg; + ffe_ctl->found_offset = offset; + return 0; + } + WARN_ON(last_ptr->block_group != cluster_bg); + +release_cluster: + /* + * If we are on LOOP_NO_EMPTY_SIZE, we can't set up a new clusters, so + * lets just skip it and let the allocator find whatever block it can + * find. If we reach this point, we will have tried the cluster + * allocator plenty of times and not have found anything, so we are + * likely way too fragmented for the clustering stuff to find anything. + * + * However, if the cluster is taken from the current block group, + * release the cluster first, so that we stand a better chance of + * succeeding in the unclustered allocation. + */ + if (ffe_ctl->loop >= LOOP_NO_EMPTY_SIZE && cluster_bg != bg) { + spin_unlock(&last_ptr->refill_lock); + btrfs_release_block_group(cluster_bg, ffe_ctl->delalloc); + return -ENOENT; + } + + /* This cluster didn't work out, free it and start over */ + btrfs_return_cluster_to_free_space(NULL, last_ptr); + + if (cluster_bg != bg) + btrfs_release_block_group(cluster_bg, ffe_ctl->delalloc); + +refill_cluster: + if (ffe_ctl->loop >= LOOP_NO_EMPTY_SIZE) { + spin_unlock(&last_ptr->refill_lock); + return -ENOENT; + } + + aligned_cluster = max_t(u64, + ffe_ctl->empty_cluster + ffe_ctl->empty_size, + bg->full_stripe_len); + ret = btrfs_find_space_cluster(fs_info, bg, last_ptr, + ffe_ctl->search_start, ffe_ctl->num_bytes, + aligned_cluster); + if (ret == 0) { + /* Now pull our allocation out of this cluster */ + offset = btrfs_alloc_from_cluster(bg, last_ptr, + ffe_ctl->num_bytes, ffe_ctl->search_start, + &ffe_ctl->max_extent_size); + if (offset) { + /* We found one, proceed */ + spin_unlock(&last_ptr->refill_lock); + trace_btrfs_reserve_extent_cluster(bg, + ffe_ctl->search_start, + ffe_ctl->num_bytes); + ffe_ctl->found_offset = offset; + return 0; + } + } else if (!ffe_ctl->cached && ffe_ctl->loop > LOOP_CACHING_NOWAIT && + !ffe_ctl->retry_clustered) { + spin_unlock(&last_ptr->refill_lock); + + ffe_ctl->retry_clustered = true; + wait_block_group_cache_progress(bg, ffe_ctl->num_bytes + + ffe_ctl->empty_cluster + ffe_ctl->empty_size); + return -EAGAIN; + } + /* + * At this point we either didn't find a cluster or we weren't able to + * allocate a block from our cluster. Free the cluster we've been + * trying to use, and go to the next block group. + */ + btrfs_return_cluster_to_free_space(NULL, last_ptr); + spin_unlock(&last_ptr->refill_lock); + return 1; +} + +/* + * Return >0 to inform caller that we find nothing + * Return 0 when we found an free extent and set ffe_ctrl->found_offset + * Return -EAGAIN to inform caller that we need to re-search this block group + */ +static int find_free_extent_unclustered(struct btrfs_block_group_cache *bg, + struct btrfs_free_cluster *last_ptr, + struct find_free_extent_ctl *ffe_ctl) +{ + u64 offset; + + /* + * We are doing an unclustered allocation, set the fragmented flag so + * we don't bother trying to setup a cluster again until we get more + * space. + */ + if (unlikely(last_ptr)) { + spin_lock(&last_ptr->lock); + last_ptr->fragmented = 1; + spin_unlock(&last_ptr->lock); + } + if (ffe_ctl->cached) { + struct btrfs_free_space_ctl *free_space_ctl; + + free_space_ctl = bg->free_space_ctl; + spin_lock(&free_space_ctl->tree_lock); + if (free_space_ctl->free_space < + ffe_ctl->num_bytes + ffe_ctl->empty_cluster + + ffe_ctl->empty_size) { + ffe_ctl->total_free_space = max_t(u64, + ffe_ctl->total_free_space, + free_space_ctl->free_space); + spin_unlock(&free_space_ctl->tree_lock); + return 1; + } + spin_unlock(&free_space_ctl->tree_lock); + } + + offset = btrfs_find_space_for_alloc(bg, ffe_ctl->search_start, + ffe_ctl->num_bytes, ffe_ctl->empty_size, + &ffe_ctl->max_extent_size); + + /* + * If we didn't find a chunk, and we haven't failed on this block group + * before, and this block group is in the middle of caching and we are + * ok with waiting, then go ahead and wait for progress to be made, and + * set @retry_unclustered to true. + * + * If @retry_unclustered is true then we've already waited on this + * block group once and should move on to the next block group. + */ + if (!offset && !ffe_ctl->retry_unclustered && !ffe_ctl->cached && + ffe_ctl->loop > LOOP_CACHING_NOWAIT) { + wait_block_group_cache_progress(bg, ffe_ctl->num_bytes + + ffe_ctl->empty_size); + ffe_ctl->retry_unclustered = true; + return -EAGAIN; + } else if (!offset) { + return 1; + } + ffe_ctl->found_offset = offset; + return 0; +} + +/* + * Return >0 means caller needs to re-search for free extent + * Return 0 means we have the needed free extent. + * Return <0 means we failed to locate any free extent. + */ +static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info, + struct btrfs_free_cluster *last_ptr, + struct btrfs_key *ins, + struct find_free_extent_ctl *ffe_ctl, + int full_search, bool use_cluster) +{ + struct btrfs_root *root = fs_info->extent_root; + int ret; + + if ((ffe_ctl->loop == LOOP_CACHING_NOWAIT) && + ffe_ctl->have_caching_bg && !ffe_ctl->orig_have_caching_bg) + ffe_ctl->orig_have_caching_bg = true; + + if (!ins->objectid && ffe_ctl->loop >= LOOP_CACHING_WAIT && + ffe_ctl->have_caching_bg) + return 1; + + if (!ins->objectid && ++(ffe_ctl->index) < BTRFS_NR_RAID_TYPES) + return 1; + + if (ins->objectid) { + if (!use_cluster && last_ptr) { + spin_lock(&last_ptr->lock); + last_ptr->window_start = ins->objectid; + spin_unlock(&last_ptr->lock); + } + return 0; + } + + /* + * LOOP_CACHING_NOWAIT, search partially cached block groups, kicking + * caching kthreads as we move along + * LOOP_CACHING_WAIT, search everything, and wait if our bg is caching + * LOOP_ALLOC_CHUNK, force a chunk allocation and try again + * LOOP_NO_EMPTY_SIZE, set empty_size and empty_cluster to 0 and try + * again + */ + if (ffe_ctl->loop < LOOP_NO_EMPTY_SIZE) { + ffe_ctl->index = 0; + if (ffe_ctl->loop == LOOP_CACHING_NOWAIT) { + /* + * We want to skip the LOOP_CACHING_WAIT step if we + * don't have any uncached bgs and we've already done a + * full search through. + */ + if (ffe_ctl->orig_have_caching_bg || !full_search) + ffe_ctl->loop = LOOP_CACHING_WAIT; + else + ffe_ctl->loop = LOOP_ALLOC_CHUNK; + } else { + ffe_ctl->loop++; + } + + if (ffe_ctl->loop == LOOP_ALLOC_CHUNK) { + struct btrfs_trans_handle *trans; + int exist = 0; + + trans = current->journal_info; + if (trans) + exist = 1; + else + trans = btrfs_join_transaction(root); + + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + return ret; + } + + ret = do_chunk_alloc(trans, ffe_ctl->flags, + CHUNK_ALLOC_FORCE); + + /* + * If we can't allocate a new chunk we've already looped + * through at least once, move on to the NO_EMPTY_SIZE + * case. + */ + if (ret == -ENOSPC) + ffe_ctl->loop = LOOP_NO_EMPTY_SIZE; + + /* Do not bail out on ENOSPC since we can do more. */ + if (ret < 0 && ret != -ENOSPC) + btrfs_abort_transaction(trans, ret); + else + ret = 0; + if (!exist) + btrfs_end_transaction(trans); + if (ret) + return ret; + } + + if (ffe_ctl->loop == LOOP_NO_EMPTY_SIZE) { + /* + * Don't loop again if we already have no empty_size and + * no empty_cluster. + */ + if (ffe_ctl->empty_size == 0 && + ffe_ctl->empty_cluster == 0) + return -ENOSPC; + ffe_ctl->empty_size = 0; + ffe_ctl->empty_cluster = 0; + } + return 1; + } + return -ENOSPC; +} + +/* * walks the btree of allocated extents and find a hole of a given size. * The key ins is changed to record the hole: * ins->objectid == start position @@ -7248,6 +7784,20 @@ btrfs_release_block_group(struct btrfs_block_group_cache *cache, * * If there is no suitable free space, we will record the max size of * the free space extent currently. + * + * The overall logic and call chain: + * + * find_free_extent() + * |- Iterate through all block groups + * | |- Get a valid block group + * | |- Try to do clustered allocation in that block group + * | |- Try to do unclustered allocation in that block group + * | |- Check if the result is valid + * | | |- If valid, then exit + * | |- Jump to next block group + * | + * |- Push harder to find free extents + * |- If not found, re-iterate all block groups */ static noinline int find_free_extent(struct btrfs_fs_info *fs_info, u64 ram_bytes, u64 num_bytes, u64 empty_size, @@ -7255,24 +7805,28 @@ static noinline int find_free_extent(struct btrfs_fs_info *fs_info, u64 flags, int delalloc) { int ret = 0; - struct btrfs_root *root = fs_info->extent_root; struct btrfs_free_cluster *last_ptr = NULL; struct btrfs_block_group_cache *block_group = NULL; - u64 search_start = 0; - u64 max_extent_size = 0; - u64 max_free_space = 0; - u64 empty_cluster = 0; + struct find_free_extent_ctl ffe_ctl = {0}; struct btrfs_space_info *space_info; - int loop = 0; - int index = btrfs_bg_flags_to_raid_index(flags); - bool failed_cluster_refill = false; - bool failed_alloc = false; bool use_cluster = true; - bool have_caching_bg = false; - bool orig_have_caching_bg = false; bool full_search = false; WARN_ON(num_bytes < fs_info->sectorsize); + + ffe_ctl.ram_bytes = ram_bytes; + ffe_ctl.num_bytes = num_bytes; + ffe_ctl.empty_size = empty_size; + ffe_ctl.flags = flags; + ffe_ctl.search_start = 0; + ffe_ctl.retry_clustered = false; + ffe_ctl.retry_unclustered = false; + ffe_ctl.delalloc = delalloc; + ffe_ctl.index = btrfs_bg_flags_to_raid_index(flags); + ffe_ctl.have_caching_bg = false; + ffe_ctl.orig_have_caching_bg = false; + ffe_ctl.found_offset = 0; + ins->type = BTRFS_EXTENT_ITEM_KEY; ins->objectid = 0; ins->offset = 0; @@ -7308,7 +7862,8 @@ static noinline int find_free_extent(struct btrfs_fs_info *fs_info, spin_unlock(&space_info->lock); } - last_ptr = fetch_cluster_info(fs_info, space_info, &empty_cluster); + last_ptr = fetch_cluster_info(fs_info, space_info, + &ffe_ctl.empty_cluster); if (last_ptr) { spin_lock(&last_ptr->lock); if (last_ptr->block_group) @@ -7325,10 +7880,12 @@ static noinline int find_free_extent(struct btrfs_fs_info *fs_info, spin_unlock(&last_ptr->lock); } - search_start = max(search_start, first_logical_byte(fs_info, 0)); - search_start = max(search_start, hint_byte); - if (search_start == hint_byte) { - block_group = btrfs_lookup_block_group(fs_info, search_start); + ffe_ctl.search_start = max(ffe_ctl.search_start, + first_logical_byte(fs_info, 0)); + ffe_ctl.search_start = max(ffe_ctl.search_start, hint_byte); + if (ffe_ctl.search_start == hint_byte) { + block_group = btrfs_lookup_block_group(fs_info, + ffe_ctl.search_start); /* * we don't want to use the block group if it doesn't match our * allocation bits, or if its not cached. @@ -7350,7 +7907,7 @@ static noinline int find_free_extent(struct btrfs_fs_info *fs_info, btrfs_put_block_group(block_group); up_read(&space_info->groups_sem); } else { - index = btrfs_bg_flags_to_raid_index( + ffe_ctl.index = btrfs_bg_flags_to_raid_index( block_group->flags); btrfs_lock_block_group(block_group, delalloc); goto have_block_group; @@ -7360,21 +7917,19 @@ static noinline int find_free_extent(struct btrfs_fs_info *fs_info, } } search: - have_caching_bg = false; - if (index == 0 || index == btrfs_bg_flags_to_raid_index(flags)) + ffe_ctl.have_caching_bg = false; + if (ffe_ctl.index == btrfs_bg_flags_to_raid_index(flags) || + ffe_ctl.index == 0) full_search = true; down_read(&space_info->groups_sem); - list_for_each_entry(block_group, &space_info->block_groups[index], - list) { - u64 offset; - int cached; - + list_for_each_entry(block_group, + &space_info->block_groups[ffe_ctl.index], list) { /* If the block group is read-only, we can skip it entirely. */ if (unlikely(block_group->ro)) continue; btrfs_grab_block_group(block_group, delalloc); - search_start = block_group->key.objectid; + ffe_ctl.search_start = block_group->key.objectid; /* * this can happen if we end up cycling through all the @@ -7398,9 +7953,9 @@ search: } have_block_group: - cached = block_group_cache_done(block_group); - if (unlikely(!cached)) { - have_caching_bg = true; + ffe_ctl.cached = block_group_cache_done(block_group); + if (unlikely(!ffe_ctl.cached)) { + ffe_ctl.have_caching_bg = true; ret = cache_block_group(block_group, 0); BUG_ON(ret < 0); ret = 0; @@ -7414,322 +7969,92 @@ have_block_group: * lets look there */ if (last_ptr && use_cluster) { - struct btrfs_block_group_cache *used_block_group; - unsigned long aligned_cluster; - /* - * the refill lock keeps out other - * people trying to start a new cluster - */ - used_block_group = btrfs_lock_cluster(block_group, - last_ptr, - delalloc); - if (!used_block_group) - goto refill_cluster; - - if (used_block_group != block_group && - (used_block_group->ro || - !block_group_bits(used_block_group, flags))) - goto release_cluster; - - offset = btrfs_alloc_from_cluster(used_block_group, - last_ptr, - num_bytes, - used_block_group->key.objectid, - &max_extent_size); - if (offset) { - /* we have a block, we're done */ - spin_unlock(&last_ptr->refill_lock); - trace_btrfs_reserve_extent_cluster( - used_block_group, - search_start, num_bytes); - if (used_block_group != block_group) { - btrfs_release_block_group(block_group, - delalloc); - block_group = used_block_group; - } - goto checks; - } - - WARN_ON(last_ptr->block_group != used_block_group); -release_cluster: - /* If we are on LOOP_NO_EMPTY_SIZE, we can't - * set up a new clusters, so lets just skip it - * and let the allocator find whatever block - * it can find. If we reach this point, we - * will have tried the cluster allocator - * plenty of times and not have found - * anything, so we are likely way too - * fragmented for the clustering stuff to find - * anything. - * - * However, if the cluster is taken from the - * current block group, release the cluster - * first, so that we stand a better chance of - * succeeding in the unclustered - * allocation. */ - if (loop >= LOOP_NO_EMPTY_SIZE && - used_block_group != block_group) { - spin_unlock(&last_ptr->refill_lock); - btrfs_release_block_group(used_block_group, - delalloc); - goto unclustered_alloc; - } + struct btrfs_block_group_cache *cluster_bg = NULL; - /* - * this cluster didn't work out, free it and - * start over - */ - btrfs_return_cluster_to_free_space(NULL, last_ptr); - - if (used_block_group != block_group) - btrfs_release_block_group(used_block_group, - delalloc); -refill_cluster: - if (loop >= LOOP_NO_EMPTY_SIZE) { - spin_unlock(&last_ptr->refill_lock); - goto unclustered_alloc; - } - - aligned_cluster = max_t(unsigned long, - empty_cluster + empty_size, - block_group->full_stripe_len); + ret = find_free_extent_clustered(block_group, last_ptr, + &ffe_ctl, &cluster_bg); - /* allocate a cluster in this block group */ - ret = btrfs_find_space_cluster(fs_info, block_group, - last_ptr, search_start, - num_bytes, - aligned_cluster); if (ret == 0) { - /* - * now pull our allocation out of this - * cluster - */ - offset = btrfs_alloc_from_cluster(block_group, - last_ptr, - num_bytes, - search_start, - &max_extent_size); - if (offset) { - /* we found one, proceed */ - spin_unlock(&last_ptr->refill_lock); - trace_btrfs_reserve_extent_cluster( - block_group, search_start, - num_bytes); - goto checks; + if (cluster_bg && cluster_bg != block_group) { + btrfs_release_block_group(block_group, + delalloc); + block_group = cluster_bg; } - } else if (!cached && loop > LOOP_CACHING_NOWAIT - && !failed_cluster_refill) { - spin_unlock(&last_ptr->refill_lock); - - failed_cluster_refill = true; - wait_block_group_cache_progress(block_group, - num_bytes + empty_cluster + empty_size); + goto checks; + } else if (ret == -EAGAIN) { goto have_block_group; - } - - /* - * at this point we either didn't find a cluster - * or we weren't able to allocate a block from our - * cluster. Free the cluster we've been trying - * to use, and go to the next block group - */ - btrfs_return_cluster_to_free_space(NULL, last_ptr); - spin_unlock(&last_ptr->refill_lock); - goto loop; - } - -unclustered_alloc: - /* - * We are doing an unclustered alloc, set the fragmented flag so - * we don't bother trying to setup a cluster again until we get - * more space. - */ - if (unlikely(last_ptr)) { - spin_lock(&last_ptr->lock); - last_ptr->fragmented = 1; - spin_unlock(&last_ptr->lock); - } - if (cached) { - struct btrfs_free_space_ctl *ctl = - block_group->free_space_ctl; - - spin_lock(&ctl->tree_lock); - if (ctl->free_space < - num_bytes + empty_cluster + empty_size) { - max_free_space = max(max_free_space, - ctl->free_space); - spin_unlock(&ctl->tree_lock); + } else if (ret > 0) { goto loop; } - spin_unlock(&ctl->tree_lock); + /* ret == -ENOENT case falls through */ } - offset = btrfs_find_space_for_alloc(block_group, search_start, - num_bytes, empty_size, - &max_extent_size); - /* - * If we didn't find a chunk, and we haven't failed on this - * block group before, and this block group is in the middle of - * caching and we are ok with waiting, then go ahead and wait - * for progress to be made, and set failed_alloc to true. - * - * If failed_alloc is true then we've already waited on this - * block group once and should move on to the next block group. - */ - if (!offset && !failed_alloc && !cached && - loop > LOOP_CACHING_NOWAIT) { - wait_block_group_cache_progress(block_group, - num_bytes + empty_size); - failed_alloc = true; + ret = find_free_extent_unclustered(block_group, last_ptr, + &ffe_ctl); + if (ret == -EAGAIN) goto have_block_group; - } else if (!offset) { + else if (ret > 0) goto loop; - } + /* ret == 0 case falls through */ checks: - search_start = round_up(offset, fs_info->stripesize); + ffe_ctl.search_start = round_up(ffe_ctl.found_offset, + fs_info->stripesize); /* move on to the next group */ - if (search_start + num_bytes > + if (ffe_ctl.search_start + num_bytes > block_group->key.objectid + block_group->key.offset) { - btrfs_add_free_space(block_group, offset, num_bytes); + btrfs_add_free_space(block_group, ffe_ctl.found_offset, + num_bytes); goto loop; } - if (offset < search_start) - btrfs_add_free_space(block_group, offset, - search_start - offset); + if (ffe_ctl.found_offset < ffe_ctl.search_start) + btrfs_add_free_space(block_group, ffe_ctl.found_offset, + ffe_ctl.search_start - ffe_ctl.found_offset); ret = btrfs_add_reserved_bytes(block_group, ram_bytes, num_bytes, delalloc); if (ret == -EAGAIN) { - btrfs_add_free_space(block_group, offset, num_bytes); + btrfs_add_free_space(block_group, ffe_ctl.found_offset, + num_bytes); goto loop; } btrfs_inc_block_group_reservations(block_group); /* we are all good, lets return */ - ins->objectid = search_start; + ins->objectid = ffe_ctl.search_start; ins->offset = num_bytes; - trace_btrfs_reserve_extent(block_group, search_start, num_bytes); + trace_btrfs_reserve_extent(block_group, ffe_ctl.search_start, + num_bytes); btrfs_release_block_group(block_group, delalloc); break; loop: - failed_cluster_refill = false; - failed_alloc = false; + ffe_ctl.retry_clustered = false; + ffe_ctl.retry_unclustered = false; BUG_ON(btrfs_bg_flags_to_raid_index(block_group->flags) != - index); + ffe_ctl.index); btrfs_release_block_group(block_group, delalloc); cond_resched(); } up_read(&space_info->groups_sem); - if ((loop == LOOP_CACHING_NOWAIT) && have_caching_bg - && !orig_have_caching_bg) - orig_have_caching_bg = true; - - if (!ins->objectid && loop >= LOOP_CACHING_WAIT && have_caching_bg) - goto search; - - if (!ins->objectid && ++index < BTRFS_NR_RAID_TYPES) + ret = find_free_extent_update_loop(fs_info, last_ptr, ins, &ffe_ctl, + full_search, use_cluster); + if (ret > 0) goto search; - /* - * LOOP_CACHING_NOWAIT, search partially cached block groups, kicking - * caching kthreads as we move along - * LOOP_CACHING_WAIT, search everything, and wait if our bg is caching - * LOOP_ALLOC_CHUNK, force a chunk allocation and try again - * LOOP_NO_EMPTY_SIZE, set empty_size and empty_cluster to 0 and try - * again - */ - if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE) { - index = 0; - if (loop == LOOP_CACHING_NOWAIT) { - /* - * We want to skip the LOOP_CACHING_WAIT step if we - * don't have any uncached bgs and we've already done a - * full search through. - */ - if (orig_have_caching_bg || !full_search) - loop = LOOP_CACHING_WAIT; - else - loop = LOOP_ALLOC_CHUNK; - } else { - loop++; - } - - if (loop == LOOP_ALLOC_CHUNK) { - struct btrfs_trans_handle *trans; - int exist = 0; - - trans = current->journal_info; - if (trans) - exist = 1; - else - trans = btrfs_join_transaction(root); - - if (IS_ERR(trans)) { - ret = PTR_ERR(trans); - goto out; - } - - ret = do_chunk_alloc(trans, flags, CHUNK_ALLOC_FORCE); - - /* - * If we can't allocate a new chunk we've already looped - * through at least once, move on to the NO_EMPTY_SIZE - * case. - */ - if (ret == -ENOSPC) - loop = LOOP_NO_EMPTY_SIZE; - - /* - * Do not bail out on ENOSPC since we - * can do more things. - */ - if (ret < 0 && ret != -ENOSPC) - btrfs_abort_transaction(trans, ret); - else - ret = 0; - if (!exist) - btrfs_end_transaction(trans); - if (ret) - goto out; - } - - if (loop == LOOP_NO_EMPTY_SIZE) { - /* - * Don't loop again if we already have no empty_size and - * no empty_cluster. - */ - if (empty_size == 0 && - empty_cluster == 0) { - ret = -ENOSPC; - goto out; - } - empty_size = 0; - empty_cluster = 0; - } - - goto search; - } else if (!ins->objectid) { - ret = -ENOSPC; - } else if (ins->objectid) { - if (!use_cluster && last_ptr) { - spin_lock(&last_ptr->lock); - last_ptr->window_start = ins->objectid; - spin_unlock(&last_ptr->lock); - } - ret = 0; - } -out: if (ret == -ENOSPC) { - if (!max_extent_size) - max_extent_size = max_free_space; + /* + * Use ffe_ctl->total_free_space as fallback if we can't find + * any contiguous hole. + */ + if (!ffe_ctl.max_extent_size) + ffe_ctl.max_extent_size = ffe_ctl.total_free_space; spin_lock(&space_info->lock); - space_info->max_extent_size = max_extent_size; + space_info->max_extent_size = ffe_ctl.max_extent_size; spin_unlock(&space_info->lock); - ins->offset = max_extent_size; + ins->offset = ffe_ctl.max_extent_size; } return ret; } @@ -8169,13 +8494,13 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root, btrfs_set_header_generation(buf, trans->transid); btrfs_set_header_backref_rev(buf, BTRFS_MIXED_BACKREF_REV); btrfs_set_header_owner(buf, owner); - write_extent_buffer_fsid(buf, fs_info->fsid); + write_extent_buffer_fsid(buf, fs_info->fs_devices->metadata_uuid); write_extent_buffer_chunk_tree_uuid(buf, fs_info->chunk_tree_uuid); if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) { buf->log_index = root->log_transid % 2; /* * we allow two log transactions at a time, use different - * EXENT bit to differentiate dirty pages. + * EXTENT bit to differentiate dirty pages. */ if (buf->log_index == 0) set_extent_dirty(&root->dirty_log_pages, buf->start, @@ -8221,7 +8546,12 @@ again: goto again; } - if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) { + /* + * The global reserve still exists to save us from ourselves, so don't + * warn_on if we are short on our delayed refs reserve. + */ + if (block_rsv->type != BTRFS_BLOCK_RSV_DELREFS && + btrfs_test_opt(fs_info, ENOSPC_DEBUG)) { static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL * 10, /*DEFAULT_RATELIMIT_BURST*/ 1); @@ -8544,7 +8874,6 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans, u64 bytenr; u64 generation; u64 parent; - u32 blocksize; struct btrfs_key key; struct btrfs_key first_key; struct extent_buffer *next; @@ -8569,7 +8898,6 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans, bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]); btrfs_node_key_to_cpu(path->nodes[level], &first_key, path->slots[level]); - blocksize = fs_info->nodesize; next = find_extent_buffer(fs_info, bytenr); if (!next) { @@ -8693,7 +9021,7 @@ skip: ret); } } - ret = btrfs_free_extent(trans, root, bytenr, blocksize, + ret = btrfs_free_extent(trans, root, bytenr, fs_info->nodesize, parent, root->root_key.objectid, level - 1, 0); if (ret) @@ -8944,9 +9272,22 @@ int btrfs_drop_snapshot(struct btrfs_root *root, goto out_free; } + err = btrfs_run_delayed_items(trans); + if (err) + goto out_end_trans; + if (block_rsv) trans->block_rsv = block_rsv; + /* + * This will help us catch people modifying the fs tree while we're + * dropping it. It is unsafe to mess with the fs tree while it's being + * dropped as we unlock the root node and parent nodes as we walk down + * the tree, assuming nothing will change. If something does change + * then we'll have stale information and drop references to blocks we've + * already dropped. + */ + set_bit(BTRFS_ROOT_DELETING, &root->state); if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) { level = btrfs_header_level(root->node); path->nodes[level] = btrfs_lock_root_node(root); @@ -9421,7 +9762,7 @@ void btrfs_dec_block_group_ro(struct btrfs_block_group_cache *cache) } /* - * checks to see if its even possible to relocate this block group. + * Checks to see if it's even possible to relocate this block group. * * @return - -1 if it's not a good idea to relocate this block group, 0 if its * ok to go ahead and try. @@ -10049,7 +10390,7 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info) * check for two cases, either we are full, and therefore * don't need to bother with the caching work since we won't * find any space, or we are empty, and we can just add all - * the space in and be done with it. This saves us _alot_ of + * the space in and be done with it. This saves us _a_lot_ of * time, particularly in the full case. */ if (found_key.offset == btrfs_block_group_used(&cache->item)) { @@ -10154,6 +10495,7 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans) add_block_group_free_space(trans, block_group); /* already aborted the transaction if it failed. */ next: + btrfs_delayed_refs_rsv_release(fs_info, 1); list_del_init(&block_group->bg_list); } btrfs_trans_release_chunk_metadata(trans); @@ -10231,6 +10573,8 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 bytes_used, link_block_group(cache); list_add_tail(&cache->bg_list, &trans->new_bgs); + trans->delayed_ref_updates++; + btrfs_update_delayed_refs_rsv(trans); set_avail_alloc_bits(fs_info, type); return 0; @@ -10268,6 +10612,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, int factor; struct btrfs_caching_control *caching_ctl = NULL; bool remove_em; + bool remove_rsv = false; block_group = btrfs_lookup_block_group(fs_info, group_start); BUG_ON(!block_group); @@ -10315,7 +10660,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, mutex_lock(&trans->transaction->cache_write_mutex); /* - * make sure our free spache cache IO is done before remove the + * Make sure our free space cache IO is done before removing the * free space inode */ spin_lock(&trans->transaction->dirty_bgs_lock); @@ -10332,6 +10677,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, if (!list_empty(&block_group->dirty_list)) { list_del_init(&block_group->dirty_list); + remove_rsv = true; btrfs_put_block_group(block_group); } spin_unlock(&trans->transaction->dirty_bgs_lock); @@ -10541,6 +10887,8 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, ret = btrfs_del_item(trans, root, path); out: + if (remove_rsv) + btrfs_delayed_refs_rsv_release(fs_info, 1); btrfs_free_path(path); return ret; } @@ -10698,7 +11046,7 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) spin_lock(&space_info->lock); spin_lock(&block_group->lock); - space_info->bytes_pinned -= block_group->pinned; + update_bytes_pinned(space_info, -block_group->pinned); space_info->bytes_readonly += block_group->pinned; percpu_counter_add_batch(&space_info->total_bytes_pinned, -block_group->pinned, @@ -10829,7 +11177,7 @@ static int btrfs_trim_free_extents(struct btrfs_device *device, if (!blk_queue_discard(bdev_get_queue(device->bdev))) return 0; - /* Not writeable = nothing to do. */ + /* Not writable = nothing to do. */ if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) return 0; diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c @@ -89,9 +89,18 @@ void btrfs_leak_debug_check(void) static inline void __btrfs_debug_check_extent_io_range(const char *caller, struct extent_io_tree *tree, u64 start, u64 end) { - if (tree->ops && tree->ops->check_extent_io_range) - tree->ops->check_extent_io_range(tree->private_data, caller, - start, end); + struct inode *inode = tree->private_data; + u64 isize; + + if (!inode || !is_data_inode(inode)) + return; + + isize = i_size_read(inode); + if (end >= PAGE_SIZE && (end % 2) == 0 && end != isize - 1) { + btrfs_debug_rl(BTRFS_I(inode)->root->fs_info, + "%s: ino %llu isize %llu odd range [%llu,%llu]", + caller, btrfs_ino(BTRFS_I(inode)), isize, start, end); + } } #else #define btrfs_leak_debug_add(new, head) do {} while (0) @@ -344,13 +353,6 @@ static inline struct rb_node *tree_search(struct extent_io_tree *tree, return tree_search_for_insert(tree, offset, NULL, NULL); } -static void merge_cb(struct extent_io_tree *tree, struct extent_state *new, - struct extent_state *other) -{ - if (tree->ops && tree->ops->merge_extent_hook) - tree->ops->merge_extent_hook(tree->private_data, new, other); -} - /* * utility function to look for merge candidates inside a given range. * Any extents with matching state are merged together into a single @@ -374,7 +376,10 @@ static void merge_state(struct extent_io_tree *tree, other = rb_entry(other_node, struct extent_state, rb_node); if (other->end == state->start - 1 && other->state == state->state) { - merge_cb(tree, state, other); + if (tree->private_data && + is_data_inode(tree->private_data)) + btrfs_merge_delalloc_extent(tree->private_data, + state, other); state->start = other->start; rb_erase(&other->rb_node, &tree->state); RB_CLEAR_NODE(&other->rb_node); @@ -386,7 +391,10 @@ static void merge_state(struct extent_io_tree *tree, other = rb_entry(other_node, struct extent_state, rb_node); if (other->start == state->end + 1 && other->state == state->state) { - merge_cb(tree, state, other); + if (tree->private_data && + is_data_inode(tree->private_data)) + btrfs_merge_delalloc_extent(tree->private_data, + state, other); state->end = other->end; rb_erase(&other->rb_node, &tree->state); RB_CLEAR_NODE(&other->rb_node); @@ -395,20 +403,6 @@ static void merge_state(struct extent_io_tree *tree, } } -static void set_state_cb(struct extent_io_tree *tree, - struct extent_state *state, unsigned *bits) -{ - if (tree->ops && tree->ops->set_bit_hook) - tree->ops->set_bit_hook(tree->private_data, state, bits); -} - -static void clear_state_cb(struct extent_io_tree *tree, - struct extent_state *state, unsigned *bits) -{ - if (tree->ops && tree->ops->clear_bit_hook) - tree->ops->clear_bit_hook(tree->private_data, state, bits); -} - static void set_state_bits(struct extent_io_tree *tree, struct extent_state *state, unsigned *bits, struct extent_changeset *changeset); @@ -451,13 +445,6 @@ static int insert_state(struct extent_io_tree *tree, return 0; } -static void split_cb(struct extent_io_tree *tree, struct extent_state *orig, - u64 split) -{ - if (tree->ops && tree->ops->split_extent_hook) - tree->ops->split_extent_hook(tree->private_data, orig, split); -} - /* * split a given extent state struct in two, inserting the preallocated * struct 'prealloc' as the newly created second half. 'split' indicates an @@ -477,7 +464,8 @@ static int split_state(struct extent_io_tree *tree, struct extent_state *orig, { struct rb_node *node; - split_cb(tree, orig, split); + if (tree->private_data && is_data_inode(tree->private_data)) + btrfs_split_delalloc_extent(tree->private_data, orig, split); prealloc->start = orig->start; prealloc->end = split - 1; @@ -504,7 +492,7 @@ static struct extent_state *next_state(struct extent_state *state) /* * utility function to clear some bits in an extent state struct. - * it will optionally wake up any one waiting on this state (wake == 1). + * it will optionally wake up anyone waiting on this state (wake == 1). * * If no bits are set on the state struct after clearing things, the * struct is freed and removed from the tree @@ -523,7 +511,10 @@ static struct extent_state *clear_state_bit(struct extent_io_tree *tree, WARN_ON(range > tree->dirty_bytes); tree->dirty_bytes -= range; } - clear_state_cb(tree, state, bits); + + if (tree->private_data && is_data_inode(tree->private_data)) + btrfs_clear_delalloc_extent(tree->private_data, state, bits); + ret = add_extent_changeset(state, bits_to_clear, changeset, 0); BUG_ON(ret < 0); state->state &= ~bits_to_clear; @@ -800,7 +791,9 @@ static void set_state_bits(struct extent_io_tree *tree, unsigned bits_to_set = *bits & ~EXTENT_CTLBITS; int ret; - set_state_cb(tree, state, bits); + if (tree->private_data && is_data_inode(tree->private_data)) + btrfs_set_delalloc_extent(tree->private_data, state, bits); + if ((bits_to_set & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) { u64 range = state->end - state->start + 1; tree->dirty_bytes += range; @@ -1459,16 +1452,16 @@ out: * find a contiguous range of bytes in the file marked as delalloc, not * more than 'max_bytes'. start and end are used to return the range, * - * 1 is returned if we find something, 0 if nothing was in the tree + * true is returned if we find something, false if nothing was in the tree */ -static noinline u64 find_delalloc_range(struct extent_io_tree *tree, +static noinline bool find_delalloc_range(struct extent_io_tree *tree, u64 *start, u64 *end, u64 max_bytes, struct extent_state **cached_state) { struct rb_node *node; struct extent_state *state; u64 cur_start = *start; - u64 found = 0; + bool found = false; u64 total_bytes = 0; spin_lock(&tree->lock); @@ -1479,8 +1472,7 @@ static noinline u64 find_delalloc_range(struct extent_io_tree *tree, */ node = tree_search(tree, cur_start); if (!node) { - if (!found) - *end = (u64)-1; + *end = (u64)-1; goto out; } @@ -1500,7 +1492,7 @@ static noinline u64 find_delalloc_range(struct extent_io_tree *tree, *cached_state = state; refcount_inc(&state->refs); } - found++; + found = true; *end = state->end; cur_start = state->end + 1; node = rb_next(node); @@ -1558,19 +1550,22 @@ static noinline int lock_delalloc_pages(struct inode *inode, } /* - * find a contiguous range of bytes in the file marked as delalloc, not - * more than 'max_bytes'. start and end are used to return the range, + * Find and lock a contiguous range of bytes in the file marked as delalloc, no + * more than @max_bytes. @Start and @end are used to return the range, * - * 1 is returned if we find something, 0 if nothing was in the tree + * Return: true if we find something + * false if nothing was in the tree */ -static noinline_for_stack u64 find_lock_delalloc_range(struct inode *inode, +EXPORT_FOR_TESTS +noinline_for_stack bool find_lock_delalloc_range(struct inode *inode, struct extent_io_tree *tree, struct page *locked_page, u64 *start, - u64 *end, u64 max_bytes) + u64 *end) { + u64 max_bytes = BTRFS_MAX_EXTENT_SIZE; u64 delalloc_start; u64 delalloc_end; - u64 found; + bool found; struct extent_state *cached_state = NULL; int ret; int loops = 0; @@ -1585,7 +1580,7 @@ again: *start = delalloc_start; *end = delalloc_end; free_extent_state(cached_state); - return 0; + return false; } /* @@ -1605,6 +1600,7 @@ again: /* step two, lock all the pages after the page that has start */ ret = lock_delalloc_pages(inode, locked_page, delalloc_start, delalloc_end); + ASSERT(!ret || ret == -EAGAIN); if (ret == -EAGAIN) { /* some of the pages are gone, lets avoid looping by * shortening the size of the delalloc range we're searching @@ -1616,11 +1612,10 @@ again: loops = 1; goto again; } else { - found = 0; + found = false; goto out_failed; } } - BUG_ON(ret); /* Only valid values are 0 and -EAGAIN */ /* step three, lock the state bits for the whole range */ lock_extent_bits(tree, delalloc_start, delalloc_end, &cached_state); @@ -1643,17 +1638,6 @@ out_failed: return found; } -#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS -u64 btrfs_find_lock_delalloc_range(struct inode *inode, - struct extent_io_tree *tree, - struct page *locked_page, u64 *start, - u64 *end, u64 max_bytes) -{ - return find_lock_delalloc_range(inode, tree, locked_page, start, end, - max_bytes); -} -#endif - static int __process_pages_contig(struct address_space *mapping, struct page *locked_page, pgoff_t start_index, pgoff_t end_index, @@ -2349,13 +2333,11 @@ struct bio *btrfs_create_repair_bio(struct inode *inode, struct bio *failed_bio, } /* - * this is a generic handler for readpage errors (default - * readpage_io_failed_hook). if other copies exist, read those and write back - * good data to the failed position. does not investigate in remapping the - * failed extent elsewhere, hoping the device will be smart enough to do this as - * needed + * This is a generic handler for readpage errors. If other copies exist, read + * those and write back good data to the failed position. Does not investigate + * in remapping the failed extent elsewhere, hoping the device will be smart + * enough to do this as needed */ - static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset, struct page *page, u64 start, u64 end, int failed_mirror) @@ -2412,14 +2394,9 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset, void end_extent_writepage(struct page *page, int err, u64 start, u64 end) { int uptodate = (err == 0); - struct extent_io_tree *tree; int ret = 0; - tree = &BTRFS_I(page->mapping->host)->io_tree; - - if (tree->ops && tree->ops->writepage_end_io_hook) - tree->ops->writepage_end_io_hook(page, start, end, NULL, - uptodate); + btrfs_writepage_endio_finish_ordered(page, start, end, uptodate); if (!uptodate) { ClearPageUptodate(page); @@ -2522,6 +2499,8 @@ static void end_bio_extent_readpage(struct bio *bio) struct page *page = bvec->bv_page; struct inode *inode = page->mapping->host; struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + bool data_inode = btrfs_ino(BTRFS_I(inode)) + != BTRFS_BTREE_INODE_OBJECTID; btrfs_debug(fs_info, "end_bio_extent_readpage: bi_sector=%llu, err=%d, mirror=%u", @@ -2551,7 +2530,7 @@ static void end_bio_extent_readpage(struct bio *bio) len = bvec->bv_len; mirror = io_bio->mirror_num; - if (likely(uptodate && tree->ops)) { + if (likely(uptodate)) { ret = tree->ops->readpage_end_io_hook(io_bio, offset, page, start, end, mirror); @@ -2567,38 +2546,37 @@ static void end_bio_extent_readpage(struct bio *bio) if (likely(uptodate)) goto readpage_ok; - if (tree->ops) { - ret = tree->ops->readpage_io_failed_hook(page, mirror); - if (ret == -EAGAIN) { - /* - * Data inode's readpage_io_failed_hook() always - * returns -EAGAIN. - * - * The generic bio_readpage_error handles errors - * the following way: If possible, new read - * requests are created and submitted and will - * end up in end_bio_extent_readpage as well (if - * we're lucky, not in the !uptodate case). In - * that case it returns 0 and we just go on with - * the next page in our bio. If it can't handle - * the error it will return -EIO and we remain - * responsible for that page. - */ - ret = bio_readpage_error(bio, offset, page, - start, end, mirror); - if (ret == 0) { - uptodate = !bio->bi_status; - offset += len; - continue; - } - } + if (data_inode) { /* - * metadata's readpage_io_failed_hook() always returns - * -EIO and fixes nothing. -EIO is also returned if - * data inode error could not be fixed. + * The generic bio_readpage_error handles errors the + * following way: If possible, new read requests are + * created and submitted and will end up in + * end_bio_extent_readpage as well (if we're lucky, + * not in the !uptodate case). In that case it returns + * 0 and we just go on with the next page in our bio. + * If it can't handle the error it will return -EIO and + * we remain responsible for that page. */ - ASSERT(ret == -EIO); + ret = bio_readpage_error(bio, offset, page, start, end, + mirror); + if (ret == 0) { + uptodate = !bio->bi_status; + offset += len; + continue; + } + } else { + struct extent_buffer *eb; + + eb = (struct extent_buffer *)page->private; + set_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags); + eb->read_mirror = mirror; + atomic_dec(&eb->io_pages); + if (test_and_clear_bit(EXTENT_BUFFER_READAHEAD, + &eb->bflags)) + btree_readahead_hook(eb, -EIO); + + ret = -EIO; } readpage_ok: if (likely(uptodate)) { @@ -2607,7 +2585,7 @@ readpage_ok: unsigned off; /* Zero out the end if this page straddles i_size */ - off = i_size & (PAGE_SIZE-1); + off = offset_in_page(i_size); if (page->index == end_index && off) zero_user_segment(page, off, PAGE_SIZE); SetPageUptodate(page); @@ -2644,8 +2622,7 @@ readpage_ok: if (extent_len) endio_readpage_release_extent(tree, extent_start, extent_len, uptodate); - if (io_bio->end_io) - io_bio->end_io(io_bio, blk_status_to_errno(bio->bi_status)); + btrfs_io_bio_free_csum(io_bio); bio_put(bio); } @@ -2782,8 +2759,8 @@ static int submit_extent_page(unsigned int opf, struct extent_io_tree *tree, else contig = bio_end_sector(bio) == sector; - if (tree->ops && btrfs_merge_bio_hook(page, offset, page_size, - bio, bio_flags)) + ASSERT(tree->ops); + if (btrfs_bio_fits_in_stripe(page, page_size, bio, bio_flags)) can_merge = false; if (prev_bio_flags != bio_flags || !contig || !can_merge || @@ -2911,7 +2888,7 @@ static int __do_readpage(struct extent_io_tree *tree, if (page->index == last_byte >> PAGE_SHIFT) { char *userpage; - size_t zero_offset = last_byte & (PAGE_SIZE - 1); + size_t zero_offset = offset_in_page(last_byte); if (zero_offset) { iosize = PAGE_SIZE - zero_offset; @@ -3205,7 +3182,7 @@ static void update_nr_written(struct writeback_control *wbc, /* * helper for __extent_writepage, doing all of the delayed allocation setup. * - * This returns 1 if our fill_delalloc function did all the work required + * This returns 1 if btrfs_run_delalloc_range function did all the work required * to write the page (copy into inline extent). In this case the IO has * been started and the page is already unlocked. * @@ -3213,44 +3190,37 @@ static void update_nr_written(struct writeback_control *wbc, * This returns < 0 if there were errors (page still locked) */ static noinline_for_stack int writepage_delalloc(struct inode *inode, - struct page *page, struct writeback_control *wbc, - struct extent_page_data *epd, - u64 delalloc_start, - unsigned long *nr_written) + struct page *page, struct writeback_control *wbc, + u64 delalloc_start, unsigned long *nr_written) { - struct extent_io_tree *tree = epd->tree; + struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; u64 page_end = delalloc_start + PAGE_SIZE - 1; - u64 nr_delalloc; + bool found; u64 delalloc_to_write = 0; u64 delalloc_end = 0; int ret; int page_started = 0; - if (epd->extent_locked || !tree->ops || !tree->ops->fill_delalloc) - return 0; while (delalloc_end < page_end) { - nr_delalloc = find_lock_delalloc_range(inode, tree, + found = find_lock_delalloc_range(inode, tree, page, &delalloc_start, - &delalloc_end, - BTRFS_MAX_EXTENT_SIZE); - if (nr_delalloc == 0) { + &delalloc_end); + if (!found) { delalloc_start = delalloc_end + 1; continue; } - ret = tree->ops->fill_delalloc(inode, page, - delalloc_start, - delalloc_end, - &page_started, - nr_written, wbc); + ret = btrfs_run_delalloc_range(inode, page, delalloc_start, + delalloc_end, &page_started, nr_written, wbc); /* File system has been set read-only */ if (ret) { SetPageError(page); - /* fill_delalloc should be return < 0 for error - * but just in case, we use > 0 here meaning the - * IO is started, so we don't want to return > 0 - * unless things are going well. + /* + * btrfs_run_delalloc_range should return < 0 for error + * but just in case, we use > 0 here meaning the IO is + * started, so we don't want to return > 0 unless + * things are going well. */ ret = ret < 0 ? ret : -EIO; goto done; @@ -3323,20 +3293,17 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode, int nr = 0; bool compressed; - if (tree->ops && tree->ops->writepage_start_hook) { - ret = tree->ops->writepage_start_hook(page, start, - page_end); - if (ret) { - /* Fixup worker will requeue */ - if (ret == -EBUSY) - wbc->pages_skipped++; - else - redirty_page_for_writepage(wbc, page); + ret = btrfs_writepage_cow_fixup(page, start, page_end); + if (ret) { + /* Fixup worker will requeue */ + if (ret == -EBUSY) + wbc->pages_skipped++; + else + redirty_page_for_writepage(wbc, page); - update_nr_written(wbc, nr_written); - unlock_page(page); - return 1; - } + update_nr_written(wbc, nr_written); + unlock_page(page); + return 1; } /* @@ -3347,9 +3314,7 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode, end = page_end; if (i_size <= start) { - if (tree->ops && tree->ops->writepage_end_io_hook) - tree->ops->writepage_end_io_hook(page, start, - page_end, NULL, 1); + btrfs_writepage_endio_finish_ordered(page, start, page_end, 1); goto done; } @@ -3360,9 +3325,8 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode, u64 offset; if (cur >= i_size) { - if (tree->ops && tree->ops->writepage_end_io_hook) - tree->ops->writepage_end_io_hook(page, cur, - page_end, NULL, 1); + btrfs_writepage_endio_finish_ordered(page, cur, + page_end, 1); break; } em = btrfs_get_extent(BTRFS_I(inode), page, pg_offset, cur, @@ -3396,11 +3360,10 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode, * end_io notification does not happen here for * compressed extents */ - if (!compressed && tree->ops && - tree->ops->writepage_end_io_hook) - tree->ops->writepage_end_io_hook(page, cur, - cur + iosize - 1, - NULL, 1); + if (!compressed) + btrfs_writepage_endio_finish_ordered(page, cur, + cur + iosize - 1, + 1); else if (compressed) { /* we don't want to end_page_writeback on * a compressed extent. this happens @@ -3469,7 +3432,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, ClearPageError(page); - pg_offset = i_size & (PAGE_SIZE - 1); + pg_offset = offset_in_page(i_size); if (page->index > end_index || (page->index == end_index && !pg_offset)) { page->mapping->a_ops->invalidatepage(page, 0, PAGE_SIZE); @@ -3491,11 +3454,13 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, set_page_extent_mapped(page); - ret = writepage_delalloc(inode, page, wbc, epd, start, &nr_written); - if (ret == 1) - goto done_unlocked; - if (ret) - goto done; + if (!epd->extent_locked) { + ret = writepage_delalloc(inode, page, wbc, start, &nr_written); + if (ret == 1) + goto done_unlocked; + if (ret) + goto done; + } ret = __extent_writepage_io(inode, page, wbc, epd, i_size, nr_written, write_flags, &nr); @@ -3934,12 +3899,25 @@ static int extent_write_cache_pages(struct address_space *mapping, range_whole = 1; scanned = 1; } - if (wbc->sync_mode == WB_SYNC_ALL) + + /* + * We do the tagged writepage as long as the snapshot flush bit is set + * and we are the first one who do the filemap_flush() on this inode. + * + * The nr_to_write == LONG_MAX is needed to make sure other flushers do + * not race in and drop the bit. + */ + if (range_whole && wbc->nr_to_write == LONG_MAX && + test_and_clear_bit(BTRFS_INODE_SNAPSHOT_FLUSH, + &BTRFS_I(inode)->runtime_flags)) + wbc->tagged_writepages = 1; + + if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) tag = PAGECACHE_TAG_TOWRITE; else tag = PAGECACHE_TAG_DIRTY; retry: - if (wbc->sync_mode == WB_SYNC_ALL) + if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) tag_pages_for_writeback(mapping, index, end); done_index = index; while (!done && !nr_to_write_done && (index <= end) && @@ -4084,10 +4062,8 @@ int extent_write_locked_range(struct inode *inode, u64 start, u64 end, if (clear_page_dirty_for_io(page)) ret = __extent_writepage(page, &wbc_writepages, &epd); else { - if (tree->ops && tree->ops->writepage_end_io_hook) - tree->ops->writepage_end_io_hook(page, start, - start + PAGE_SIZE - 1, - NULL, 1); + btrfs_writepage_endio_finish_ordered(page, start, + start + PAGE_SIZE - 1, 1); unlock_page(page); } put_page(page); @@ -4118,42 +4094,36 @@ int extent_readpages(struct address_space *mapping, struct list_head *pages, unsigned nr_pages) { struct bio *bio = NULL; - unsigned page_idx; unsigned long bio_flags = 0; struct page *pagepool[16]; - struct page *page; struct extent_map *em_cached = NULL; struct extent_io_tree *tree = &BTRFS_I(mapping->host)->io_tree; int nr = 0; u64 prev_em_start = (u64)-1; - for (page_idx = 0; page_idx < nr_pages; page_idx++) { - page = list_entry(pages->prev, struct page, lru); + while (!list_empty(pages)) { + for (nr = 0; nr < ARRAY_SIZE(pagepool) && !list_empty(pages);) { + struct page *page = list_entry(pages->prev, + struct page, lru); - prefetchw(&page->flags); - list_del(&page->lru); - if (add_to_page_cache_lru(page, mapping, - page->index, - readahead_gfp_mask(mapping))) { - put_page(page); - continue; + prefetchw(&page->flags); + list_del(&page->lru); + if (add_to_page_cache_lru(page, mapping, page->index, + readahead_gfp_mask(mapping))) { + put_page(page); + continue; + } + + pagepool[nr++] = page; } - pagepool[nr++] = page; - if (nr < ARRAY_SIZE(pagepool)) - continue; __extent_readpages(tree, pagepool, nr, &em_cached, &bio, - &bio_flags, &prev_em_start); - nr = 0; + &bio_flags, &prev_em_start); } - if (nr) - __extent_readpages(tree, pagepool, nr, &em_cached, &bio, - &bio_flags, &prev_em_start); if (em_cached) free_extent_map(em_cached); - BUG_ON(!list_empty(pages)); if (bio) return submit_one_bio(bio, 0, bio_flags); return 0; @@ -4342,7 +4312,7 @@ static int emit_fiemap_extent(struct fiemap_extent_info *fieinfo, /* * Sanity check, extent_fiemap() should have ensured that new - * fiemap extent won't overlap with cahced one. + * fiemap extent won't overlap with cached one. * Not recoverable. * * NOTE: Physical address can overlap, due to compression @@ -4914,13 +4884,6 @@ again: check_buffer_tree_ref(eb); set_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags); - /* - * We will free dummy extent buffer's if they come into - * free_extent_buffer with a ref count of 2, but if we are using this we - * want the buffers to stay in memory until we're done with them, so - * bump the ref count again. - */ - atomic_inc(&eb->refs); return eb; free_eb: btrfs_release_extent_buffer(eb); @@ -5102,7 +5065,9 @@ void free_extent_buffer(struct extent_buffer *eb) while (1) { refs = atomic_read(&eb->refs); - if (refs <= 3) + if ((!test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags) && refs <= 3) + || (test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags) && + refs == 1)) break; old = atomic_cmpxchg(&eb->refs, refs, refs - 1); if (old == refs) @@ -5111,10 +5076,6 @@ void free_extent_buffer(struct extent_buffer *eb) spin_lock(&eb->refs_lock); if (atomic_read(&eb->refs) == 2 && - test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags)) - atomic_dec(&eb->refs); - - if (atomic_read(&eb->refs) == 2 && test_bit(EXTENT_BUFFER_STALE, &eb->bflags) && !extent_buffer_under_io(eb) && test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) @@ -5340,7 +5301,7 @@ void read_extent_buffer(const struct extent_buffer *eb, void *dstv, struct page *page; char *kaddr; char *dst = (char *)dstv; - size_t start_offset = eb->start & ((u64)PAGE_SIZE - 1); + size_t start_offset = offset_in_page(eb->start); unsigned long i = (start_offset + start) >> PAGE_SHIFT; if (start + len > eb->len) { @@ -5350,7 +5311,7 @@ void read_extent_buffer(const struct extent_buffer *eb, void *dstv, return; } - offset = (start_offset + start) & (PAGE_SIZE - 1); + offset = offset_in_page(start_offset + start); while (len > 0) { page = eb->pages[i]; @@ -5375,14 +5336,14 @@ int read_extent_buffer_to_user(const struct extent_buffer *eb, struct page *page; char *kaddr; char __user *dst = (char __user *)dstv; - size_t start_offset = eb->start & ((u64)PAGE_SIZE - 1); + size_t start_offset = offset_in_page(eb->start); unsigned long i = (start_offset + start) >> PAGE_SHIFT; int ret = 0; WARN_ON(start > eb->len); WARN_ON(start + len > eb->start + eb->len); - offset = (start_offset + start) & (PAGE_SIZE - 1); + offset = offset_in_page(start_offset + start); while (len > 0) { page = eb->pages[i]; @@ -5413,10 +5374,10 @@ int map_private_extent_buffer(const struct extent_buffer *eb, char **map, unsigned long *map_start, unsigned long *map_len) { - size_t offset = start & (PAGE_SIZE - 1); + size_t offset; char *kaddr; struct page *p; - size_t start_offset = eb->start & ((u64)PAGE_SIZE - 1); + size_t start_offset = offset_in_page(eb->start); unsigned long i = (start_offset + start) >> PAGE_SHIFT; unsigned long end_i = (start_offset + start + min_len - 1) >> PAGE_SHIFT; @@ -5453,14 +5414,14 @@ int memcmp_extent_buffer(const struct extent_buffer *eb, const void *ptrv, struct page *page; char *kaddr; char *ptr = (char *)ptrv; - size_t start_offset = eb->start & ((u64)PAGE_SIZE - 1); + size_t start_offset = offset_in_page(eb->start); unsigned long i = (start_offset + start) >> PAGE_SHIFT; int ret = 0; WARN_ON(start > eb->len); WARN_ON(start + len > eb->start + eb->len); - offset = (start_offset + start) & (PAGE_SIZE - 1); + offset = offset_in_page(start_offset + start); while (len > 0) { page = eb->pages[i]; @@ -5509,13 +5470,13 @@ void write_extent_buffer(struct extent_buffer *eb, const void *srcv, struct page *page; char *kaddr; char *src = (char *)srcv; - size_t start_offset = eb->start & ((u64)PAGE_SIZE - 1); + size_t start_offset = offset_in_page(eb->start); unsigned long i = (start_offset + start) >> PAGE_SHIFT; WARN_ON(start > eb->len); WARN_ON(start + len > eb->start + eb->len); - offset = (start_offset + start) & (PAGE_SIZE - 1); + offset = offset_in_page(start_offset + start); while (len > 0) { page = eb->pages[i]; @@ -5539,13 +5500,13 @@ void memzero_extent_buffer(struct extent_buffer *eb, unsigned long start, size_t offset; struct page *page; char *kaddr; - size_t start_offset = eb->start & ((u64)PAGE_SIZE - 1); + size_t start_offset = offset_in_page(eb->start); unsigned long i = (start_offset + start) >> PAGE_SHIFT; WARN_ON(start > eb->len); WARN_ON(start + len > eb->start + eb->len); - offset = (start_offset + start) & (PAGE_SIZE - 1); + offset = offset_in_page(start_offset + start); while (len > 0) { page = eb->pages[i]; @@ -5584,13 +5545,12 @@ void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src, size_t offset; struct page *page; char *kaddr; - size_t start_offset = dst->start & ((u64)PAGE_SIZE - 1); + size_t start_offset = offset_in_page(dst->start); unsigned long i = (start_offset + dst_offset) >> PAGE_SHIFT; WARN_ON(src->len != dst_len); - offset = (start_offset + dst_offset) & - (PAGE_SIZE - 1); + offset = offset_in_page(start_offset + dst_offset); while (len > 0) { page = dst->pages[i]; @@ -5626,7 +5586,7 @@ static inline void eb_bitmap_offset(struct extent_buffer *eb, unsigned long *page_index, size_t *page_offset) { - size_t start_offset = eb->start & ((u64)PAGE_SIZE - 1); + size_t start_offset = offset_in_page(eb->start); size_t byte_offset = BIT_BYTE(nr); size_t offset; @@ -5638,7 +5598,7 @@ static inline void eb_bitmap_offset(struct extent_buffer *eb, offset = start_offset + start + byte_offset; *page_index = offset >> PAGE_SHIFT; - *page_offset = offset & (PAGE_SIZE - 1); + *page_offset = offset_in_page(offset); } /** @@ -5780,7 +5740,7 @@ void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, size_t cur; size_t dst_off_in_page; size_t src_off_in_page; - size_t start_offset = dst->start & ((u64)PAGE_SIZE - 1); + size_t start_offset = offset_in_page(dst->start); unsigned long dst_i; unsigned long src_i; @@ -5798,10 +5758,8 @@ void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, } while (len > 0) { - dst_off_in_page = (start_offset + dst_offset) & - (PAGE_SIZE - 1); - src_off_in_page = (start_offset + src_offset) & - (PAGE_SIZE - 1); + dst_off_in_page = offset_in_page(start_offset + dst_offset); + src_off_in_page = offset_in_page(start_offset + src_offset); dst_i = (start_offset + dst_offset) >> PAGE_SHIFT; src_i = (start_offset + src_offset) >> PAGE_SHIFT; @@ -5829,7 +5787,7 @@ void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, size_t src_off_in_page; unsigned long dst_end = dst_offset + len - 1; unsigned long src_end = src_offset + len - 1; - size_t start_offset = dst->start & ((u64)PAGE_SIZE - 1); + size_t start_offset = offset_in_page(dst->start); unsigned long dst_i; unsigned long src_i; @@ -5853,10 +5811,8 @@ void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, dst_i = (start_offset + dst_end) >> PAGE_SHIFT; src_i = (start_offset + src_end) >> PAGE_SHIFT; - dst_off_in_page = (start_offset + dst_end) & - (PAGE_SIZE - 1); - src_off_in_page = (start_offset + src_end) & - (PAGE_SIZE - 1); + dst_off_in_page = offset_in_page(start_offset + dst_end); + src_off_in_page = offset_in_page(start_offset + src_end); cur = min_t(unsigned long, len, src_off_in_page + 1); cur = min(cur, dst_off_in_page + 1); diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h @@ -37,18 +37,22 @@ #define EXTENT_BIO_COMPRESSED 1 #define EXTENT_BIO_FLAG_SHIFT 16 -/* these are bit numbers for test/set bit */ -#define EXTENT_BUFFER_UPTODATE 0 -#define EXTENT_BUFFER_DIRTY 2 -#define EXTENT_BUFFER_CORRUPT 3 -#define EXTENT_BUFFER_READAHEAD 4 /* this got triggered by readahead */ -#define EXTENT_BUFFER_TREE_REF 5 -#define EXTENT_BUFFER_STALE 6 -#define EXTENT_BUFFER_WRITEBACK 7 -#define EXTENT_BUFFER_READ_ERR 8 /* read IO error */ -#define EXTENT_BUFFER_UNMAPPED 9 -#define EXTENT_BUFFER_IN_TREE 10 -#define EXTENT_BUFFER_WRITE_ERR 11 /* write IO error */ +enum { + EXTENT_BUFFER_UPTODATE, + EXTENT_BUFFER_DIRTY, + EXTENT_BUFFER_CORRUPT, + /* this got triggered by readahead */ + EXTENT_BUFFER_READAHEAD, + EXTENT_BUFFER_TREE_REF, + EXTENT_BUFFER_STALE, + EXTENT_BUFFER_WRITEBACK, + /* read IO error */ + EXTENT_BUFFER_READ_ERR, + EXTENT_BUFFER_UNMAPPED, + EXTENT_BUFFER_IN_TREE, + /* write IO error */ + EXTENT_BUFFER_WRITE_ERR, +}; /* these are flags for __process_pages_contig */ #define PAGE_UNLOCK (1 << 0) @@ -94,38 +98,13 @@ typedef blk_status_t (extent_submit_bio_start_t)(void *private_data, struct extent_io_ops { /* - * The following callbacks must be allways defined, the function + * The following callbacks must be always defined, the function * pointer will be called unconditionally. */ extent_submit_bio_hook_t *submit_bio_hook; int (*readpage_end_io_hook)(struct btrfs_io_bio *io_bio, u64 phy_offset, struct page *page, u64 start, u64 end, int mirror); - int (*readpage_io_failed_hook)(struct page *page, int failed_mirror); - - /* - * Optional hooks, called if the pointer is not NULL - */ - int (*fill_delalloc)(void *private_data, struct page *locked_page, - u64 start, u64 end, int *page_started, - unsigned long *nr_written, - struct writeback_control *wbc); - - int (*writepage_start_hook)(struct page *page, u64 start, u64 end); - void (*writepage_end_io_hook)(struct page *page, u64 start, u64 end, - struct extent_state *state, int uptodate); - void (*set_bit_hook)(void *private_data, struct extent_state *state, - unsigned *bits); - void (*clear_bit_hook)(void *private_data, - struct extent_state *state, - unsigned *bits); - void (*merge_extent_hook)(void *private_data, - struct extent_state *new, - struct extent_state *other); - void (*split_extent_hook)(void *private_data, - struct extent_state *orig, u64 split); - void (*check_extent_io_range)(void *private_data, const char *caller, - u64 start, u64 end); }; struct extent_io_tree { @@ -353,11 +332,11 @@ static inline int set_extent_dirty(struct extent_io_tree *tree, u64 start, } static inline int clear_extent_dirty(struct extent_io_tree *tree, u64 start, - u64 end) + u64 end, struct extent_state **cached) { return clear_extent_bit(tree, start, end, EXTENT_DIRTY | EXTENT_DELALLOC | - EXTENT_DO_ACCOUNTING, 0, 0, NULL); + EXTENT_DO_ACCOUNTING, 0, 0, cached); } int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, @@ -546,10 +525,9 @@ int free_io_failure(struct extent_io_tree *failure_tree, struct extent_io_tree *io_tree, struct io_failure_record *rec); #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS -u64 btrfs_find_lock_delalloc_range(struct inode *inode, - struct extent_io_tree *tree, - struct page *locked_page, u64 *start, - u64 *end, u64 max_bytes); +bool find_lock_delalloc_range(struct inode *inode, struct extent_io_tree *tree, + struct page *locked_page, u64 *start, + u64 *end); #endif struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info, u64 start); diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c @@ -475,7 +475,8 @@ static struct extent_map *prev_extent_map(struct extent_map *em) return container_of(prev, struct extent_map, rb_node); } -/* helper for btfs_get_extent. Given an existing extent in the tree, +/* + * Helper for btrfs_get_extent. Given an existing extent in the tree, * the existing extent is the nearest extent to map_start, * and an extent that you want to insert, deal with overlap and insert * the best fitted new extent into the tree. diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h @@ -11,13 +11,20 @@ #define EXTENT_MAP_INLINE ((u64)-2) #define EXTENT_MAP_DELALLOC ((u64)-1) -/* bits for the flags field */ -#define EXTENT_FLAG_PINNED 0 /* this entry not yet on disk, don't free it */ -#define EXTENT_FLAG_COMPRESSED 1 -#define EXTENT_FLAG_PREALLOC 3 /* pre-allocated extent */ -#define EXTENT_FLAG_LOGGING 4 /* Logging this extent */ -#define EXTENT_FLAG_FILLING 5 /* Filling in a preallocated extent */ -#define EXTENT_FLAG_FS_MAPPING 6 /* filesystem extent mapping type */ +/* bits for the extent_map::flags field */ +enum { + /* this entry not yet on disk, don't free it */ + EXTENT_FLAG_PINNED, + EXTENT_FLAG_COMPRESSED, + /* pre-allocated extent */ + EXTENT_FLAG_PREALLOC, + /* Logging this extent */ + EXTENT_FLAG_LOGGING, + /* Filling in a preallocated extent */ + EXTENT_FLAG_FILLING, + /* filesystem extent mapping type */ + EXTENT_FLAG_FS_MAPPING, +}; struct extent_map { struct rb_node rb_node; diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c @@ -142,11 +142,6 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans, return ret; } -static void btrfs_io_bio_endio_readpage(struct btrfs_io_bio *bio, int err) -{ - kfree(bio->csum_allocated); -} - static blk_status_t __btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u64 logical_offset, u32 *dst, int dio) { @@ -175,14 +170,12 @@ static blk_status_t __btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio nblocks = bio->bi_iter.bi_size >> inode->i_sb->s_blocksize_bits; if (!dst) { if (nblocks * csum_size > BTRFS_BIO_INLINE_CSUM_SIZE) { - btrfs_bio->csum_allocated = kmalloc_array(nblocks, - csum_size, GFP_NOFS); - if (!btrfs_bio->csum_allocated) { + btrfs_bio->csum = kmalloc_array(nblocks, csum_size, + GFP_NOFS); + if (!btrfs_bio->csum) { btrfs_free_path(path); return BLK_STS_RESOURCE; } - btrfs_bio->csum = btrfs_bio->csum_allocated; - btrfs_bio->end_io = btrfs_io_bio_endio_readpage; } else { btrfs_bio->csum = btrfs_bio->csum_inline; } diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c @@ -399,7 +399,7 @@ static noinline int btrfs_copy_from_user(loff_t pos, size_t write_bytes, size_t copied = 0; size_t total_copied = 0; int pg = 0; - int offset = pos & (PAGE_SIZE - 1); + int offset = offset_in_page(pos); while (write_bytes > 0) { size_t count = min_t(size_t, @@ -1611,7 +1611,7 @@ static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb, return -ENOMEM; while (iov_iter_count(i) > 0) { - size_t offset = pos & (PAGE_SIZE - 1); + size_t offset = offset_in_page(pos); size_t sector_offset; size_t write_bytes = min(iov_iter_count(i), nrptrs * (size_t)PAGE_SIZE - @@ -2005,7 +2005,7 @@ int btrfs_release_file(struct inode *inode, struct file *filp) filp->private_data = NULL; /* - * ordered_data_close is set by settattr when we are about to truncate + * ordered_data_close is set by setattr when we are about to truncate * a file from a non-zero size to a zero size. This tries to * flush down new bytes that may have been written if the * application were using truncate to replace a file in place. @@ -2114,7 +2114,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) /* * We have to do this here to avoid the priority inversion of waiting on - * IO of a lower priority task while holding a transaciton open. + * IO of a lower priority task while holding a transaction open. */ ret = btrfs_wait_ordered_range(inode, start, len); if (ret) { @@ -2154,7 +2154,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) * here we could get into a situation where we're waiting on IO to * happen that is blocked on a transaction trying to commit. With start * we inc the extwriter counter, so we wait for all extwriters to exit - * before we start blocking join'ers. This comment is to keep somebody + * before we start blocking joiners. This comment is to keep somebody * from thinking they are super smart and changing this to * btrfs_join_transaction *cough*Josef*cough*. */ @@ -2186,25 +2186,6 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) up_write(&BTRFS_I(inode)->dio_sem); inode_unlock(inode); - /* - * If any of the ordered extents had an error, just return it to user - * space, so that the application knows some writes didn't succeed and - * can take proper action (retry for e.g.). Blindly committing the - * transaction in this case, would fool userspace that everything was - * successful. And we also want to make sure our log doesn't contain - * file extent items pointing to extents that weren't fully written to - - * just like in the non fast fsync path, where we check for the ordered - * operation's error flag before writing to the log tree and return -EIO - * if any of them had this flag set (btrfs_wait_ordered_range) - - * therefore we need to check for errors in the ordered operations, - * which are indicated by ctx.io_err. - */ - if (ctx.io_err) { - btrfs_end_transaction(trans); - ret = ctx.io_err; - goto out; - } - if (ret != BTRFS_NO_LOG_SYNC) { if (!ret) { ret = btrfs_sync_log(trans, root, &ctx); diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c @@ -74,11 +74,11 @@ out: return ret; } -struct btrfs_free_space_info * -search_free_space_info(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, - struct btrfs_block_group_cache *block_group, - struct btrfs_path *path, int cow) +EXPORT_FOR_TESTS +struct btrfs_free_space_info *search_free_space_info( + struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, + struct btrfs_block_group_cache *block_group, + struct btrfs_path *path, int cow) { struct btrfs_root *root = fs_info->free_space_root; struct btrfs_key key; @@ -176,6 +176,7 @@ static void le_bitmap_set(unsigned long *map, unsigned int start, int len) } } +EXPORT_FOR_TESTS int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans, struct btrfs_block_group_cache *block_group, struct btrfs_path *path) @@ -315,6 +316,7 @@ out: return ret; } +EXPORT_FOR_TESTS int convert_free_space_to_extents(struct btrfs_trans_handle *trans, struct btrfs_block_group_cache *block_group, struct btrfs_path *path) @@ -487,6 +489,7 @@ out: return ret; } +EXPORT_FOR_TESTS int free_space_test_bit(struct btrfs_block_group_cache *block_group, struct btrfs_path *path, u64 offset) { @@ -775,6 +778,7 @@ out: return ret; } +EXPORT_FOR_TESTS int __remove_from_free_space_tree(struct btrfs_trans_handle *trans, struct btrfs_block_group_cache *block_group, struct btrfs_path *path, u64 start, u64 size) @@ -968,6 +972,7 @@ out: return ret; } +EXPORT_FOR_TESTS int __add_to_free_space_tree(struct btrfs_trans_handle *trans, struct btrfs_block_group_cache *block_group, struct btrfs_path *path, u64 start, u64 size) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c @@ -27,6 +27,7 @@ #include <linux/uio.h> #include <linux/magic.h> #include <linux/iversion.h> +#include <linux/swap.h> #include <asm/unaligned.h> #include "ctree.h" #include "disk-io.h" @@ -103,23 +104,23 @@ static void __endio_write_update_ordered(struct inode *inode, /* * Cleanup all submitted ordered extents in specified range to handle errors - * from the fill_dellaloc() callback. + * from the btrfs_run_delalloc_range() callback. * * NOTE: caller must ensure that when an error happens, it can not call * extent_clear_unlock_delalloc() to clear both the bits EXTENT_DO_ACCOUNTING * and EXTENT_DELALLOC simultaneously, because that causes the reserved metadata * to be released, which we want to happen only when finishing the ordered - * extent (btrfs_finish_ordered_io()). Also note that the caller of the - * fill_delalloc() callback already does proper cleanup for the first page of - * the range, that is, it invokes the callback writepage_end_io_hook() for the - * range of the first page. + * extent (btrfs_finish_ordered_io()). */ static inline void btrfs_cleanup_ordered_extents(struct inode *inode, - const u64 offset, - const u64 bytes) + struct page *locked_page, + u64 offset, u64 bytes) { unsigned long index = offset >> PAGE_SHIFT; unsigned long end_index = (offset + bytes - 1) >> PAGE_SHIFT; + u64 page_start = page_offset(locked_page); + u64 page_end = page_start + PAGE_SIZE - 1; + struct page *page; while (index <= end_index) { @@ -130,8 +131,18 @@ static inline void btrfs_cleanup_ordered_extents(struct inode *inode, ClearPagePrivate2(page); put_page(page); } - return __endio_write_update_ordered(inode, offset + PAGE_SIZE, - bytes - PAGE_SIZE, false); + + /* + * In case this page belongs to the delalloc range being instantiated + * then skip it, since the first page of a range is going to be + * properly cleaned up by the caller of run_delalloc_range + */ + if (page_start >= offset && page_end <= (offset + bytes - 1)) { + offset += PAGE_SIZE; + bytes -= PAGE_SIZE; + } + + return __endio_write_update_ordered(inode, offset, bytes, false); } static int btrfs_dirty_inode(struct inode *inode); @@ -229,7 +240,7 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans, start >> PAGE_SHIFT); btrfs_set_file_extent_compression(leaf, ei, 0); kaddr = kmap_atomic(page); - offset = start & (PAGE_SIZE - 1); + offset = offset_in_page(start); write_extent_buffer(leaf, kaddr + offset, ptr, size); kunmap_atomic(kaddr); put_page(page); @@ -357,7 +368,7 @@ struct async_extent { struct async_cow { struct inode *inode; - struct btrfs_root *root; + struct btrfs_fs_info *fs_info; struct page *locked_page; u64 start; u64 end; @@ -538,8 +549,7 @@ again: &total_compressed); if (!ret) { - unsigned long offset = total_compressed & - (PAGE_SIZE - 1); + unsigned long offset = offset_in_page(total_compressed); struct page *page = pages[nr_pages - 1]; char *kaddr; @@ -847,14 +857,13 @@ retry: ins.offset, async_extent->pages, async_extent->nr_pages, async_cow->write_flags)) { - struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; struct page *p = async_extent->pages[0]; const u64 start = async_extent->start; const u64 end = start + async_extent->ram_size - 1; p->mapping = inode->i_mapping; - tree->ops->writepage_end_io_hook(p, start, end, - NULL, 0); + btrfs_writepage_endio_finish_ordered(p, start, end, 0); + p->mapping = NULL; extent_clear_unlock_delalloc(inode, start, end, end, NULL, 0, @@ -1144,13 +1153,11 @@ static noinline void async_cow_submit(struct btrfs_work *work) { struct btrfs_fs_info *fs_info; struct async_cow *async_cow; - struct btrfs_root *root; unsigned long nr_pages; async_cow = container_of(work, struct async_cow, work); - root = async_cow->root; - fs_info = root->fs_info; + fs_info = async_cow->fs_info; nr_pages = (async_cow->end - async_cow->start + PAGE_SIZE) >> PAGE_SHIFT; @@ -1179,7 +1186,6 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page, { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct async_cow *async_cow; - struct btrfs_root *root = BTRFS_I(inode)->root; unsigned long nr_pages; u64 cur_end; @@ -1189,7 +1195,7 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page, async_cow = kmalloc(sizeof(*async_cow), GFP_NOFS); BUG_ON(!async_cow); /* -ENOMEM */ async_cow->inode = igrab(inode); - async_cow->root = root; + async_cow->fs_info = fs_info; async_cow->locked_page = locked_page; async_cow->start = start; async_cow->write_flags = write_flags; @@ -1372,7 +1378,8 @@ next_slot: * Do the same check as in btrfs_cross_ref_exist but * without the unnecessary search. */ - if (btrfs_file_extent_generation(leaf, fi) <= + if (!nolock && + btrfs_file_extent_generation(leaf, fi) <= btrfs_root_last_snapshot(&root->root_item)) goto out_check; if (extent_type == BTRFS_FILE_EXTENT_REG && !force) @@ -1576,12 +1583,12 @@ static inline int need_force_cow(struct inode *inode, u64 start, u64 end) } /* - * extent_io.c call back to do delayed allocation processing + * Function to process delayed allocation (create CoW) for ranges which are + * being touched for the first time. */ -static int run_delalloc_range(void *private_data, struct page *locked_page, - u64 start, u64 end, int *page_started, - unsigned long *nr_written, - struct writeback_control *wbc) +int btrfs_run_delalloc_range(void *private_data, struct page *locked_page, + u64 start, u64 end, int *page_started, unsigned long *nr_written, + struct writeback_control *wbc) { struct inode *inode = private_data; int ret; @@ -1605,14 +1612,14 @@ static int run_delalloc_range(void *private_data, struct page *locked_page, write_flags); } if (ret) - btrfs_cleanup_ordered_extents(inode, start, end - start + 1); + btrfs_cleanup_ordered_extents(inode, locked_page, start, + end - start + 1); return ret; } -static void btrfs_split_extent_hook(void *private_data, - struct extent_state *orig, u64 split) +void btrfs_split_delalloc_extent(struct inode *inode, + struct extent_state *orig, u64 split) { - struct inode *inode = private_data; u64 size; /* not delalloc, ignore it */ @@ -1625,7 +1632,7 @@ static void btrfs_split_extent_hook(void *private_data, u64 new_size; /* - * See the explanation in btrfs_merge_extent_hook, the same + * See the explanation in btrfs_merge_delalloc_extent, the same * applies here, just in reverse. */ new_size = orig->end - split + 1; @@ -1642,16 +1649,13 @@ static void btrfs_split_extent_hook(void *private_data, } /* - * extent_io.c merge_extent_hook, used to track merged delayed allocation - * extents so we can keep track of new extents that are just merged onto old - * extents, such as when we are doing sequential writes, so we can properly - * account for the metadata space we'll need. + * Handle merged delayed allocation extents so we can keep track of new extents + * that are just merged onto old extents, such as when we are doing sequential + * writes, so we can properly account for the metadata space we'll need. */ -static void btrfs_merge_extent_hook(void *private_data, - struct extent_state *new, - struct extent_state *other) +void btrfs_merge_delalloc_extent(struct inode *inode, struct extent_state *new, + struct extent_state *other) { - struct inode *inode = private_data; u64 new_size, old_size; u32 num_extents; @@ -1755,15 +1759,12 @@ static void btrfs_del_delalloc_inode(struct btrfs_root *root, } /* - * extent_io.c set_bit_hook, used to track delayed allocation - * bytes in this file, and to maintain the list of inodes that - * have pending delalloc work to be done. + * Properly track delayed allocation bytes in the inode and to maintain the + * list of inodes that have pending delalloc work to be done. */ -static void btrfs_set_bit_hook(void *private_data, - struct extent_state *state, unsigned *bits) +void btrfs_set_delalloc_extent(struct inode *inode, struct extent_state *state, + unsigned *bits) { - struct inode *inode = private_data; - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); if ((*bits & EXTENT_DEFRAG) && !(*bits & EXTENT_DELALLOC)) @@ -1809,14 +1810,14 @@ static void btrfs_set_bit_hook(void *private_data, } /* - * extent_io.c clear_bit_hook, see set_bit_hook for why + * Once a range is no longer delalloc this function ensures that proper + * accounting happens. */ -static void btrfs_clear_bit_hook(void *private_data, - struct extent_state *state, - unsigned *bits) +void btrfs_clear_delalloc_extent(struct inode *vfs_inode, + struct extent_state *state, unsigned *bits) { - struct btrfs_inode *inode = BTRFS_I((struct inode *)private_data); - struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb); + struct btrfs_inode *inode = BTRFS_I(vfs_inode); + struct btrfs_fs_info *fs_info = btrfs_sb(vfs_inode->i_sb); u64 len = state->end + 1 - state->start; u32 num_extents = count_max_extents(len); @@ -1841,7 +1842,7 @@ static void btrfs_clear_bit_hook(void *private_data, /* * We don't reserve metadata space for space cache inodes so we - * don't need to call dellalloc_release_metadata if there is an + * don't need to call delalloc_release_metadata if there is an * error. */ if (*bits & EXTENT_CLEAR_META_RESV && @@ -1880,16 +1881,21 @@ static void btrfs_clear_bit_hook(void *private_data, } /* - * Merge bio hook, this must check the chunk tree to make sure we don't create - * bios that span stripes or chunks + * btrfs_bio_fits_in_stripe - Checks whether the size of the given bio will fit + * in a chunk's stripe. This function ensures that bios do not span a + * stripe/chunk * - * return 1 if page cannot be merged to bio - * return 0 if page can be merged to bio + * @page - The page we are about to add to the bio + * @size - size we want to add to the bio + * @bio - bio we want to ensure is smaller than a stripe + * @bio_flags - flags of the bio + * + * return 1 if page cannot be added to the bio + * return 0 if page can be added to the bio * return error otherwise */ -int btrfs_merge_bio_hook(struct page *page, unsigned long offset, - size_t size, struct bio *bio, - unsigned long bio_flags) +int btrfs_bio_fits_in_stripe(struct page *page, size_t size, struct bio *bio, + unsigned long bio_flags) { struct inode *inode = page->mapping->host; struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); @@ -1932,29 +1938,6 @@ static blk_status_t btrfs_submit_bio_start(void *private_data, struct bio *bio, } /* - * in order to insert checksums into the metadata in large chunks, - * we wait until bio submission time. All the pages in the bio are - * checksummed and sums are attached onto the ordered extent record. - * - * At IO completion time the cums attached on the ordered extent record - * are inserted into the btree - */ -blk_status_t btrfs_submit_bio_done(void *private_data, struct bio *bio, - int mirror_num) -{ - struct inode *inode = private_data; - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - blk_status_t ret; - - ret = btrfs_map_bio(fs_info, bio, mirror_num, 1); - if (ret) { - bio->bi_status = ret; - bio_endio(bio); - } - return ret; -} - -/* * extent_io.c submission hook. This does the right thing for csum calculation * on write, or reading the csums from the tree before a read. * @@ -2056,7 +2039,7 @@ int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end, unsigned int extra_bits, struct extent_state **cached_state, int dedupe) { - WARN_ON((end & (PAGE_SIZE - 1)) == 0); + WARN_ON(PAGE_ALIGNED(end)); return set_extent_delalloc(&BTRFS_I(inode)->io_tree, start, end, extra_bits, cached_state); } @@ -2152,7 +2135,7 @@ out_page: * to fix it up. The async helper will wait for ordered extents, set * the delalloc bit and make it safe to write the page. */ -static int btrfs_writepage_start_hook(struct page *page, u64 start, u64 end) +int btrfs_writepage_cow_fixup(struct page *page, u64 start, u64 end) { struct inode *inode = page->mapping->host; struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); @@ -3159,8 +3142,8 @@ static void finish_ordered_fn(struct btrfs_work *work) btrfs_finish_ordered_io(ordered_extent); } -static void btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end, - struct extent_state *state, int uptodate) +void btrfs_writepage_endio_finish_ordered(struct page *page, u64 start, + u64 end, int uptodate) { struct inode *inode = page->mapping->host; struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); @@ -3686,6 +3669,21 @@ cache_index: * inode is not a directory, logging its parent unnecessarily. */ BTRFS_I(inode)->last_unlink_trans = BTRFS_I(inode)->last_trans; + /* + * Similar reasoning for last_link_trans, needs to be set otherwise + * for a case like the following: + * + * mkdir A + * touch foo + * ln foo A/bar + * echo 2 > /proc/sys/vm/drop_caches + * fsync foo + * <power failure> + * + * Would result in link bar and directory A not existing after the power + * failure. + */ + BTRFS_I(inode)->last_link_trans = BTRFS_I(inode)->last_trans; path->slots[0]++; if (inode->i_nlink != 1 || @@ -4444,31 +4442,6 @@ out: return err; } -static int truncate_space_check(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - u64 bytes_deleted) -{ - struct btrfs_fs_info *fs_info = root->fs_info; - int ret; - - /* - * This is only used to apply pressure to the enospc system, we don't - * intend to use this reservation at all. - */ - bytes_deleted = btrfs_csum_bytes_to_leaves(fs_info, bytes_deleted); - bytes_deleted *= fs_info->nodesize; - ret = btrfs_block_rsv_add(root, &fs_info->trans_block_rsv, - bytes_deleted, BTRFS_RESERVE_NO_FLUSH); - if (!ret) { - trace_btrfs_space_reservation(fs_info, "transaction", - trans->transid, - bytes_deleted, 1); - trans->bytes_reserved += bytes_deleted; - } - return ret; - -} - /* * Return this if we need to call truncate_block for the last bit of the * truncate. @@ -4513,7 +4486,6 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, u64 bytes_deleted = 0; bool be_nice = false; bool should_throttle = false; - bool should_end = false; BUG_ON(new_size > 0 && min_type != BTRFS_EXTENT_DATA_KEY); @@ -4544,7 +4516,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, /* * This function is also used to drop the items in the log tree before * we relog the inode, so if root != BTRFS_I(inode)->root, it means - * it is used to drop the loged items. So we shouldn't kill the delayed + * it is used to drop the logged items. So we shouldn't kill the delayed * items. */ if (min_type == 0 && root == BTRFS_I(inode)->root) @@ -4726,15 +4698,7 @@ delete: btrfs_abort_transaction(trans, ret); break; } - if (btrfs_should_throttle_delayed_refs(trans)) - btrfs_async_run_delayed_refs(fs_info, - trans->delayed_ref_updates * 2, - trans->transid, 0); if (be_nice) { - if (truncate_space_check(trans, root, - extent_num_bytes)) { - should_end = true; - } if (btrfs_should_throttle_delayed_refs(trans)) should_throttle = true; } @@ -4745,7 +4709,7 @@ delete: if (path->slots[0] == 0 || path->slots[0] != pending_del_slot || - should_throttle || should_end) { + should_throttle) { if (pending_del_nr) { ret = btrfs_del_items(trans, root, path, pending_del_slot, @@ -4757,23 +4721,24 @@ delete: pending_del_nr = 0; } btrfs_release_path(path); - if (should_throttle) { - unsigned long updates = trans->delayed_ref_updates; - if (updates) { - trans->delayed_ref_updates = 0; - ret = btrfs_run_delayed_refs(trans, - updates * 2); - if (ret) - break; - } - } + /* - * if we failed to refill our space rsv, bail out - * and let the transaction restart + * We can generate a lot of delayed refs, so we need to + * throttle every once and a while and make sure we're + * adding enough space to keep up with the work we are + * generating. Since we hold a transaction here we + * can't flush, and we don't want to FLUSH_LIMIT because + * we could have generated too many delayed refs to + * actually allocate, so just bail if we're short and + * let the normal reservation dance happen higher up. */ - if (should_end) { - ret = -EAGAIN; - break; + if (should_throttle) { + ret = btrfs_delayed_refs_rsv_refill(fs_info, + BTRFS_RESERVE_NO_FLUSH); + if (ret) { + ret = -EAGAIN; + break; + } } goto search_again; } else { @@ -4799,18 +4764,6 @@ out: } btrfs_free_path(path); - - if (be_nice && bytes_deleted > SZ_32M && (ret >= 0 || ret == -EAGAIN)) { - unsigned long updates = trans->delayed_ref_updates; - int err; - - if (updates) { - trans->delayed_ref_updates = 0; - err = btrfs_run_delayed_refs(trans, updates * 2); - if (err) - ret = err; - } - } return ret; } @@ -5155,7 +5108,7 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr) truncate_setsize(inode, newsize); - /* Disable nonlocked read DIO to avoid the end less truncate */ + /* Disable nonlocked read DIO to avoid the endless truncate */ btrfs_inode_block_unlocked_dio(BTRFS_I(inode)); inode_dio_wait(inode); btrfs_inode_resume_unlocked_dio(BTRFS_I(inode)); @@ -5333,8 +5286,8 @@ static struct btrfs_trans_handle *evict_refill_and_join(struct btrfs_root *root, * Try to steal from the global reserve if there is space for * it. */ - if (!btrfs_check_space_for_delayed_refs(trans) && - !btrfs_block_rsv_migrate(global_rsv, rsv, rsv->size, false)) + if (!btrfs_check_space_for_delayed_refs(fs_info) && + !btrfs_block_rsv_migrate(global_rsv, rsv, rsv->size, 0)) return trans; /* If not, commit and try again. */ @@ -6406,14 +6359,19 @@ fail_dir_item: err = btrfs_del_root_ref(trans, key.objectid, root->root_key.objectid, parent_ino, &local_index, name, name_len); - + if (err) + btrfs_abort_transaction(trans, err); } else if (add_backref) { u64 local_index; int err; err = btrfs_del_inode_ref(trans, root, name, name_len, ino, parent_ino, &local_index); + if (err) + btrfs_abort_transaction(trans, err); } + + /* Return the original error code */ return ret; } @@ -6625,6 +6583,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, if (err) goto fail; } + BTRFS_I(inode)->last_link_trans = trans->transid; d_instantiate(dentry, inode); ret = btrfs_log_new_name(trans, BTRFS_I(inode), NULL, parent, true, NULL); @@ -6652,7 +6611,6 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) struct btrfs_trans_handle *trans; struct btrfs_root *root = BTRFS_I(dir)->root; int err = 0; - int drop_on_err = 0; u64 objectid = 0; u64 index = 0; @@ -6678,7 +6636,6 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) goto out_fail; } - drop_on_err = 1; /* these must be set before we unlock the inode */ inode->i_op = &btrfs_dir_inode_operations; inode->i_fop = &btrfs_dir_file_operations; @@ -6699,7 +6656,6 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) goto out_fail; d_instantiate_new(dentry, inode); - drop_on_err = 0; out_fail: btrfs_end_transaction(trans); @@ -8053,9 +8009,7 @@ static void btrfs_endio_direct_read(struct bio *bio) dio_bio->bi_status = err; dio_end_io(dio_bio); - - if (io_bio->end_io) - io_bio->end_io(io_bio, blk_status_to_errno(err)); + btrfs_io_bio_free_csum(io_bio); bio_put(bio); } @@ -8098,7 +8052,7 @@ static void __endio_write_update_ordered(struct inode *inode, return; /* * Our bio might span multiple ordered extents. In this case - * we keep goin until we have accounted the whole dio. + * we keep going until we have accounted the whole dio. */ if (ordered_offset < offset + bytes) { ordered_bytes = offset + bytes - ordered_offset; @@ -8408,8 +8362,7 @@ static void btrfs_submit_direct(struct bio *dio_bio, struct inode *inode, if (!ret) return; - if (io_bio->end_io) - io_bio->end_io(io_bio, ret); + btrfs_io_bio_free_csum(io_bio); free_ordered: /* @@ -8912,7 +8865,7 @@ again: /* page is wholly or partially inside EOF */ if (page_start + PAGE_SIZE > size) - zero_start = size & ~PAGE_MASK; + zero_start = offset_in_page(size); else zero_start = PAGE_SIZE; @@ -9157,6 +9110,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) ei->index_cnt = (u64)-1; ei->dir_index = 0; ei->last_unlink_trans = 0; + ei->last_link_trans = 0; ei->last_log_commit = 0; spin_lock_init(&ei->lock); @@ -9968,7 +9922,7 @@ static struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode * some fairly slow code that needs optimization. This walks the list * of all the inodes with pending delalloc and forces them to disk. */ -static int start_delalloc_inodes(struct btrfs_root *root, int nr) +static int start_delalloc_inodes(struct btrfs_root *root, int nr, bool snapshot) { struct btrfs_inode *binode; struct inode *inode; @@ -9996,6 +9950,9 @@ static int start_delalloc_inodes(struct btrfs_root *root, int nr) } spin_unlock(&root->delalloc_lock); + if (snapshot) + set_bit(BTRFS_INODE_SNAPSHOT_FLUSH, + &binode->runtime_flags); work = btrfs_alloc_delalloc_work(inode); if (!work) { iput(inode); @@ -10029,7 +9986,7 @@ out: return ret; } -int btrfs_start_delalloc_inodes(struct btrfs_root *root) +int btrfs_start_delalloc_snapshot(struct btrfs_root *root) { struct btrfs_fs_info *fs_info = root->fs_info; int ret; @@ -10037,7 +9994,7 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root) if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) return -EROFS; - ret = start_delalloc_inodes(root, -1); + ret = start_delalloc_inodes(root, -1, true); if (ret > 0) ret = 0; return ret; @@ -10066,7 +10023,7 @@ int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int nr) &fs_info->delalloc_roots); spin_unlock(&fs_info->delalloc_root_lock); - ret = start_delalloc_inodes(root, nr); + ret = start_delalloc_inodes(root, nr, false); btrfs_put_fs_root(root); if (ret < 0) goto out; @@ -10445,26 +10402,6 @@ out: return ret; } -__attribute__((const)) -static int btrfs_readpage_io_failed_hook(struct page *page, int failed_mirror) -{ - return -EAGAIN; -} - -static void btrfs_check_extent_io_range(void *private_data, const char *caller, - u64 start, u64 end) -{ - struct inode *inode = private_data; - u64 isize; - - isize = i_size_read(inode); - if (end >= PAGE_SIZE && (end % 2) == 0 && end != isize - 1) { - btrfs_debug_rl(BTRFS_I(inode)->root->fs_info, - "%s: ino %llu isize %llu odd range [%llu,%llu]", - caller, btrfs_ino(BTRFS_I(inode)), isize, start, end); - } -} - void btrfs_set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end) { struct inode *inode = tree->private_data; @@ -10481,6 +10418,343 @@ void btrfs_set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end) } } +#ifdef CONFIG_SWAP +/* + * Add an entry indicating a block group or device which is pinned by a + * swapfile. Returns 0 on success, 1 if there is already an entry for it, or a + * negative errno on failure. + */ +static int btrfs_add_swapfile_pin(struct inode *inode, void *ptr, + bool is_block_group) +{ + struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; + struct btrfs_swapfile_pin *sp, *entry; + struct rb_node **p; + struct rb_node *parent = NULL; + + sp = kmalloc(sizeof(*sp), GFP_NOFS); + if (!sp) + return -ENOMEM; + sp->ptr = ptr; + sp->inode = inode; + sp->is_block_group = is_block_group; + + spin_lock(&fs_info->swapfile_pins_lock); + p = &fs_info->swapfile_pins.rb_node; + while (*p) { + parent = *p; + entry = rb_entry(parent, struct btrfs_swapfile_pin, node); + if (sp->ptr < entry->ptr || + (sp->ptr == entry->ptr && sp->inode < entry->inode)) { + p = &(*p)->rb_left; + } else if (sp->ptr > entry->ptr || + (sp->ptr == entry->ptr && sp->inode > entry->inode)) { + p = &(*p)->rb_right; + } else { + spin_unlock(&fs_info->swapfile_pins_lock); + kfree(sp); + return 1; + } + } + rb_link_node(&sp->node, parent, p); + rb_insert_color(&sp->node, &fs_info->swapfile_pins); + spin_unlock(&fs_info->swapfile_pins_lock); + return 0; +} + +/* Free all of the entries pinned by this swapfile. */ +static void btrfs_free_swapfile_pins(struct inode *inode) +{ + struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; + struct btrfs_swapfile_pin *sp; + struct rb_node *node, *next; + + spin_lock(&fs_info->swapfile_pins_lock); + node = rb_first(&fs_info->swapfile_pins); + while (node) { + next = rb_next(node); + sp = rb_entry(node, struct btrfs_swapfile_pin, node); + if (sp->inode == inode) { + rb_erase(&sp->node, &fs_info->swapfile_pins); + if (sp->is_block_group) + btrfs_put_block_group(sp->ptr); + kfree(sp); + } + node = next; + } + spin_unlock(&fs_info->swapfile_pins_lock); +} + +struct btrfs_swap_info { + u64 start; + u64 block_start; + u64 block_len; + u64 lowest_ppage; + u64 highest_ppage; + unsigned long nr_pages; + int nr_extents; +}; + +static int btrfs_add_swap_extent(struct swap_info_struct *sis, + struct btrfs_swap_info *bsi) +{ + unsigned long nr_pages; + u64 first_ppage, first_ppage_reported, next_ppage; + int ret; + + first_ppage = ALIGN(bsi->block_start, PAGE_SIZE) >> PAGE_SHIFT; + next_ppage = ALIGN_DOWN(bsi->block_start + bsi->block_len, + PAGE_SIZE) >> PAGE_SHIFT; + + if (first_ppage >= next_ppage) + return 0; + nr_pages = next_ppage - first_ppage; + + first_ppage_reported = first_ppage; + if (bsi->start == 0) + first_ppage_reported++; + if (bsi->lowest_ppage > first_ppage_reported) + bsi->lowest_ppage = first_ppage_reported; + if (bsi->highest_ppage < (next_ppage - 1)) + bsi->highest_ppage = next_ppage - 1; + + ret = add_swap_extent(sis, bsi->nr_pages, nr_pages, first_ppage); + if (ret < 0) + return ret; + bsi->nr_extents += ret; + bsi->nr_pages += nr_pages; + return 0; +} + +static void btrfs_swap_deactivate(struct file *file) +{ + struct inode *inode = file_inode(file); + + btrfs_free_swapfile_pins(inode); + atomic_dec(&BTRFS_I(inode)->root->nr_swapfiles); +} + +static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, + sector_t *span) +{ + struct inode *inode = file_inode(file); + struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; + struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; + struct extent_state *cached_state = NULL; + struct extent_map *em = NULL; + struct btrfs_device *device = NULL; + struct btrfs_swap_info bsi = { + .lowest_ppage = (sector_t)-1ULL, + }; + int ret = 0; + u64 isize; + u64 start; + + /* + * If the swap file was just created, make sure delalloc is done. If the + * file changes again after this, the user is doing something stupid and + * we don't really care. + */ + ret = btrfs_wait_ordered_range(inode, 0, (u64)-1); + if (ret) + return ret; + + /* + * The inode is locked, so these flags won't change after we check them. + */ + if (BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS) { + btrfs_warn(fs_info, "swapfile must not be compressed"); + return -EINVAL; + } + if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW)) { + btrfs_warn(fs_info, "swapfile must not be copy-on-write"); + return -EINVAL; + } + if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) { + btrfs_warn(fs_info, "swapfile must not be checksummed"); + return -EINVAL; + } + + /* + * Balance or device remove/replace/resize can move stuff around from + * under us. The EXCL_OP flag makes sure they aren't running/won't run + * concurrently while we are mapping the swap extents, and + * fs_info->swapfile_pins prevents them from running while the swap file + * is active and moving the extents. Note that this also prevents a + * concurrent device add which isn't actually necessary, but it's not + * really worth the trouble to allow it. + */ + if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) { + btrfs_warn(fs_info, + "cannot activate swapfile while exclusive operation is running"); + return -EBUSY; + } + /* + * Snapshots can create extents which require COW even if NODATACOW is + * set. We use this counter to prevent snapshots. We must increment it + * before walking the extents because we don't want a concurrent + * snapshot to run after we've already checked the extents. + */ + atomic_inc(&BTRFS_I(inode)->root->nr_swapfiles); + + isize = ALIGN_DOWN(inode->i_size, fs_info->sectorsize); + + lock_extent_bits(io_tree, 0, isize - 1, &cached_state); + start = 0; + while (start < isize) { + u64 logical_block_start, physical_block_start; + struct btrfs_block_group_cache *bg; + u64 len = isize - start; + + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, start, len, 0); + if (IS_ERR(em)) { + ret = PTR_ERR(em); + goto out; + } + + if (em->block_start == EXTENT_MAP_HOLE) { + btrfs_warn(fs_info, "swapfile must not have holes"); + ret = -EINVAL; + goto out; + } + if (em->block_start == EXTENT_MAP_INLINE) { + /* + * It's unlikely we'll ever actually find ourselves + * here, as a file small enough to fit inline won't be + * big enough to store more than the swap header, but in + * case something changes in the future, let's catch it + * here rather than later. + */ + btrfs_warn(fs_info, "swapfile must not be inline"); + ret = -EINVAL; + goto out; + } + if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) { + btrfs_warn(fs_info, "swapfile must not be compressed"); + ret = -EINVAL; + goto out; + } + + logical_block_start = em->block_start + (start - em->start); + len = min(len, em->len - (start - em->start)); + free_extent_map(em); + em = NULL; + + ret = can_nocow_extent(inode, start, &len, NULL, NULL, NULL); + if (ret < 0) { + goto out; + } else if (ret) { + ret = 0; + } else { + btrfs_warn(fs_info, + "swapfile must not be copy-on-write"); + ret = -EINVAL; + goto out; + } + + em = btrfs_get_chunk_map(fs_info, logical_block_start, len); + if (IS_ERR(em)) { + ret = PTR_ERR(em); + goto out; + } + + if (em->map_lookup->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) { + btrfs_warn(fs_info, + "swapfile must have single data profile"); + ret = -EINVAL; + goto out; + } + + if (device == NULL) { + device = em->map_lookup->stripes[0].dev; + ret = btrfs_add_swapfile_pin(inode, device, false); + if (ret == 1) + ret = 0; + else if (ret) + goto out; + } else if (device != em->map_lookup->stripes[0].dev) { + btrfs_warn(fs_info, "swapfile must be on one device"); + ret = -EINVAL; + goto out; + } + + physical_block_start = (em->map_lookup->stripes[0].physical + + (logical_block_start - em->start)); + len = min(len, em->len - (logical_block_start - em->start)); + free_extent_map(em); + em = NULL; + + bg = btrfs_lookup_block_group(fs_info, logical_block_start); + if (!bg) { + btrfs_warn(fs_info, + "could not find block group containing swapfile"); + ret = -EINVAL; + goto out; + } + + ret = btrfs_add_swapfile_pin(inode, bg, true); + if (ret) { + btrfs_put_block_group(bg); + if (ret == 1) + ret = 0; + else + goto out; + } + + if (bsi.block_len && + bsi.block_start + bsi.block_len == physical_block_start) { + bsi.block_len += len; + } else { + if (bsi.block_len) { + ret = btrfs_add_swap_extent(sis, &bsi); + if (ret) + goto out; + } + bsi.start = start; + bsi.block_start = physical_block_start; + bsi.block_len = len; + } + + start += len; + } + + if (bsi.block_len) + ret = btrfs_add_swap_extent(sis, &bsi); + +out: + if (!IS_ERR_OR_NULL(em)) + free_extent_map(em); + + unlock_extent_cached(io_tree, 0, isize - 1, &cached_state); + + if (ret) + btrfs_swap_deactivate(file); + + clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags); + + if (ret) + return ret; + + if (device) + sis->bdev = device->bdev; + *span = bsi.highest_ppage - bsi.lowest_ppage + 1; + sis->max = bsi.nr_pages; + sis->pages = bsi.nr_pages - 1; + sis->highest_bit = bsi.nr_pages - 1; + return bsi.nr_extents; +} +#else +static void btrfs_swap_deactivate(struct file *file) +{ +} + +static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, + sector_t *span) +{ + return -EOPNOTSUPP; +} +#endif + static const struct inode_operations btrfs_dir_inode_operations = { .getattr = btrfs_getattr, .lookup = btrfs_lookup, @@ -10523,17 +10797,6 @@ static const struct extent_io_ops btrfs_extent_io_ops = { /* mandatory callbacks */ .submit_bio_hook = btrfs_submit_bio_hook, .readpage_end_io_hook = btrfs_readpage_end_io_hook, - .readpage_io_failed_hook = btrfs_readpage_io_failed_hook, - - /* optional callbacks */ - .fill_delalloc = run_delalloc_range, - .writepage_end_io_hook = btrfs_writepage_end_io_hook, - .writepage_start_hook = btrfs_writepage_start_hook, - .set_bit_hook = btrfs_set_bit_hook, - .clear_bit_hook = btrfs_clear_bit_hook, - .merge_extent_hook = btrfs_merge_extent_hook, - .split_extent_hook = btrfs_split_extent_hook, - .check_extent_io_range = btrfs_check_extent_io_range, }; /* @@ -10558,6 +10821,8 @@ static const struct address_space_operations btrfs_aops = { .releasepage = btrfs_releasepage, .set_page_dirty = btrfs_set_page_dirty, .error_remove_page = generic_error_remove_page, + .swap_activate = btrfs_swap_activate, + .swap_deactivate = btrfs_swap_deactivate, }; static const struct inode_operations btrfs_file_inode_operations = { diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c @@ -290,6 +290,11 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg) } else if (fsflags & FS_COMPR_FL) { const char *comp; + if (IS_SWAPFILE(inode)) { + ret = -ETXTBSY; + goto out_unlock; + } + binode->flags |= BTRFS_INODE_COMPRESS; binode->flags &= ~BTRFS_INODE_NOCOMPRESS; @@ -754,6 +759,12 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir, if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state)) return -EINVAL; + if (atomic_read(&root->nr_swapfiles)) { + btrfs_warn(fs_info, + "cannot snapshot subvolume with active swapfile"); + return -ETXTBSY; + } + pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_KERNEL); if (!pending_snapshot) return -ENOMEM; @@ -777,7 +788,7 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir, wait_event(root->subv_writers->wait, percpu_counter_sum(&root->subv_writers->counter) == 0); - ret = btrfs_start_delalloc_inodes(root); + ret = btrfs_start_delalloc_snapshot(root); if (ret) goto dec_and_free; @@ -1505,9 +1516,13 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, } inode_lock(inode); - if (do_compress) - BTRFS_I(inode)->defrag_compress = compress_type; - ret = cluster_pages_for_defrag(inode, pages, i, cluster); + if (IS_SWAPFILE(inode)) { + ret = -ETXTBSY; + } else { + if (do_compress) + BTRFS_I(inode)->defrag_compress = compress_type; + ret = cluster_pages_for_defrag(inode, pages, i, cluster); + } if (ret < 0) { inode_unlock(inode); goto out_ra; @@ -3135,7 +3150,7 @@ static long btrfs_ioctl_fs_info(struct btrfs_fs_info *fs_info, } rcu_read_unlock(); - memcpy(&fi_args->fsid, fs_info->fsid, sizeof(fi_args->fsid)); + memcpy(&fi_args->fsid, fs_devices->fsid, sizeof(fi_args->fsid)); fi_args->nodesize = fs_info->nodesize; fi_args->sectorsize = fs_info->sectorsize; fi_args->clone_alignment = fs_info->sectorsize; @@ -3191,92 +3206,6 @@ out: return ret; } -static struct page *extent_same_get_page(struct inode *inode, pgoff_t index) -{ - struct page *page; - - page = grab_cache_page(inode->i_mapping, index); - if (!page) - return ERR_PTR(-ENOMEM); - - if (!PageUptodate(page)) { - int ret; - - ret = btrfs_readpage(NULL, page); - if (ret) - return ERR_PTR(ret); - lock_page(page); - if (!PageUptodate(page)) { - unlock_page(page); - put_page(page); - return ERR_PTR(-EIO); - } - if (page->mapping != inode->i_mapping) { - unlock_page(page); - put_page(page); - return ERR_PTR(-EAGAIN); - } - } - - return page; -} - -static int gather_extent_pages(struct inode *inode, struct page **pages, - int num_pages, u64 off) -{ - int i; - pgoff_t index = off >> PAGE_SHIFT; - - for (i = 0; i < num_pages; i++) { -again: - pages[i] = extent_same_get_page(inode, index + i); - if (IS_ERR(pages[i])) { - int err = PTR_ERR(pages[i]); - - if (err == -EAGAIN) - goto again; - pages[i] = NULL; - return err; - } - } - return 0; -} - -static int lock_extent_range(struct inode *inode, u64 off, u64 len, - bool retry_range_locking) -{ - /* - * Do any pending delalloc/csum calculations on inode, one way or - * another, and lock file content. - * The locking order is: - * - * 1) pages - * 2) range in the inode's io tree - */ - while (1) { - struct btrfs_ordered_extent *ordered; - lock_extent(&BTRFS_I(inode)->io_tree, off, off + len - 1); - ordered = btrfs_lookup_first_ordered_extent(inode, - off + len - 1); - if ((!ordered || - ordered->file_offset + ordered->len <= off || - ordered->file_offset >= off + len) && - !test_range_bit(&BTRFS_I(inode)->io_tree, off, - off + len - 1, EXTENT_DELALLOC, 0, NULL)) { - if (ordered) - btrfs_put_ordered_extent(ordered); - break; - } - unlock_extent(&BTRFS_I(inode)->io_tree, off, off + len - 1); - if (ordered) - btrfs_put_ordered_extent(ordered); - if (!retry_range_locking) - return -EAGAIN; - btrfs_wait_ordered_range(inode, off, len); - } - return 0; -} - static void btrfs_double_inode_unlock(struct inode *inode1, struct inode *inode2) { inode_unlock(inode1); @@ -3292,261 +3221,32 @@ static void btrfs_double_inode_lock(struct inode *inode1, struct inode *inode2) inode_lock_nested(inode2, I_MUTEX_CHILD); } -static void btrfs_double_extent_unlock(struct inode *inode1, u64 loff1, - struct inode *inode2, u64 loff2, u64 len) -{ - unlock_extent(&BTRFS_I(inode1)->io_tree, loff1, loff1 + len - 1); - unlock_extent(&BTRFS_I(inode2)->io_tree, loff2, loff2 + len - 1); -} - -static int btrfs_double_extent_lock(struct inode *inode1, u64 loff1, - struct inode *inode2, u64 loff2, u64 len, - bool retry_range_locking) -{ - int ret; - - if (inode1 < inode2) { - swap(inode1, inode2); - swap(loff1, loff2); - } - ret = lock_extent_range(inode1, loff1, len, retry_range_locking); - if (ret) - return ret; - ret = lock_extent_range(inode2, loff2, len, retry_range_locking); - if (ret) - unlock_extent(&BTRFS_I(inode1)->io_tree, loff1, - loff1 + len - 1); - return ret; -} - -struct cmp_pages { - int num_pages; - struct page **src_pages; - struct page **dst_pages; -}; - -static void btrfs_cmp_data_free(struct cmp_pages *cmp) -{ - int i; - struct page *pg; - - for (i = 0; i < cmp->num_pages; i++) { - pg = cmp->src_pages[i]; - if (pg) { - unlock_page(pg); - put_page(pg); - cmp->src_pages[i] = NULL; - } - pg = cmp->dst_pages[i]; - if (pg) { - unlock_page(pg); - put_page(pg); - cmp->dst_pages[i] = NULL; - } - } -} - -static int btrfs_cmp_data_prepare(struct inode *src, u64 loff, - struct inode *dst, u64 dst_loff, - u64 len, struct cmp_pages *cmp) -{ - int ret; - int num_pages = PAGE_ALIGN(len) >> PAGE_SHIFT; - - cmp->num_pages = num_pages; - - ret = gather_extent_pages(src, cmp->src_pages, num_pages, loff); - if (ret) - goto out; - - ret = gather_extent_pages(dst, cmp->dst_pages, num_pages, dst_loff); - -out: - if (ret) - btrfs_cmp_data_free(cmp); - return ret; -} - -static int btrfs_cmp_data(u64 len, struct cmp_pages *cmp) -{ - int ret = 0; - int i; - struct page *src_page, *dst_page; - unsigned int cmp_len = PAGE_SIZE; - void *addr, *dst_addr; - - i = 0; - while (len) { - if (len < PAGE_SIZE) - cmp_len = len; - - BUG_ON(i >= cmp->num_pages); - - src_page = cmp->src_pages[i]; - dst_page = cmp->dst_pages[i]; - ASSERT(PageLocked(src_page)); - ASSERT(PageLocked(dst_page)); - - addr = kmap_atomic(src_page); - dst_addr = kmap_atomic(dst_page); - - flush_dcache_page(src_page); - flush_dcache_page(dst_page); - - if (memcmp(addr, dst_addr, cmp_len)) - ret = -EBADE; - - kunmap_atomic(addr); - kunmap_atomic(dst_addr); - - if (ret) - break; - - len -= cmp_len; - i++; - } - - return ret; -} - -static int extent_same_check_offsets(struct inode *inode, u64 off, u64 *plen, - u64 olen) -{ - u64 len = *plen; - u64 bs = BTRFS_I(inode)->root->fs_info->sb->s_blocksize; - - if (off + olen > inode->i_size || off + olen < off) - return -EINVAL; - - /* if we extend to eof, continue to block boundary */ - if (off + len == inode->i_size) - *plen = len = ALIGN(inode->i_size, bs) - off; - - /* Check that we are block aligned - btrfs_clone() requires this */ - if (!IS_ALIGNED(off, bs) || !IS_ALIGNED(off + len, bs)) - return -EINVAL; - - return 0; -} - static int btrfs_extent_same_range(struct inode *src, u64 loff, u64 olen, - struct inode *dst, u64 dst_loff, - struct cmp_pages *cmp) + struct inode *dst, u64 dst_loff) { + u64 bs = BTRFS_I(src)->root->fs_info->sb->s_blocksize; int ret; u64 len = olen; - bool same_inode = (src == dst); - u64 same_lock_start = 0; - u64 same_lock_len = 0; - - ret = extent_same_check_offsets(src, loff, &len, olen); - if (ret) - return ret; - - ret = extent_same_check_offsets(dst, dst_loff, &len, olen); - if (ret) - return ret; - - if (same_inode) { - /* - * Single inode case wants the same checks, except we - * don't want our length pushed out past i_size as - * comparing that data range makes no sense. - * - * extent_same_check_offsets() will do this for an - * unaligned length at i_size, so catch it here and - * reject the request. - * - * This effectively means we require aligned extents - * for the single-inode case, whereas the other cases - * allow an unaligned length so long as it ends at - * i_size. - */ - if (len != olen) - return -EINVAL; - - /* Check for overlapping ranges */ - if (dst_loff + len > loff && dst_loff < loff + len) - return -EINVAL; - - same_lock_start = min_t(u64, loff, dst_loff); - same_lock_len = max_t(u64, loff, dst_loff) + len - same_lock_start; - } else { - /* - * If the source and destination inodes are different, the - * source's range end offset matches the source's i_size, that - * i_size is not a multiple of the sector size, and the - * destination range does not go past the destination's i_size, - * we must round down the length to the nearest sector size - * multiple. If we don't do this adjustment we end replacing - * with zeroes the bytes in the range that starts at the - * deduplication range's end offset and ends at the next sector - * size multiple. - */ - if (loff + olen == i_size_read(src) && - dst_loff + len < i_size_read(dst)) { - const u64 sz = BTRFS_I(src)->root->fs_info->sectorsize; - - len = round_down(i_size_read(src), sz) - loff; - if (len == 0) - return 0; - olen = len; - } - } - -again: - ret = btrfs_cmp_data_prepare(src, loff, dst, dst_loff, olen, cmp); - if (ret) - return ret; - if (same_inode) - ret = lock_extent_range(src, same_lock_start, same_lock_len, - false); - else - ret = btrfs_double_extent_lock(src, loff, dst, dst_loff, len, - false); + if (loff + len == src->i_size) + len = ALIGN(src->i_size, bs) - loff; /* - * If one of the inodes has dirty pages in the respective range or - * ordered extents, we need to flush dellaloc and wait for all ordered - * extents in the range. We must unlock the pages and the ranges in the - * io trees to avoid deadlocks when flushing delalloc (requires locking - * pages) and when waiting for ordered extents to complete (they require - * range locking). + * For same inode case we don't want our length pushed out past i_size + * as comparing that data range makes no sense. + * + * This effectively means we require aligned extents for the single + * inode case, whereas the other cases allow an unaligned length so long + * as it ends at i_size. */ - if (ret == -EAGAIN) { - /* - * Ranges in the io trees already unlocked. Now unlock all - * pages before waiting for all IO to complete. - */ - btrfs_cmp_data_free(cmp); - if (same_inode) { - btrfs_wait_ordered_range(src, same_lock_start, - same_lock_len); - } else { - btrfs_wait_ordered_range(src, loff, len); - btrfs_wait_ordered_range(dst, dst_loff, len); - } - goto again; - } - ASSERT(ret == 0); - if (WARN_ON(ret)) { - /* ranges in the io trees already unlocked */ - btrfs_cmp_data_free(cmp); - return ret; - } - - /* pass original length for comparison so we stay within i_size */ - ret = btrfs_cmp_data(olen, cmp); - if (ret == 0) - ret = btrfs_clone(src, dst, loff, olen, len, dst_loff, 1); - - if (same_inode) - unlock_extent(&BTRFS_I(src)->io_tree, same_lock_start, - same_lock_start + same_lock_len - 1); - else - btrfs_double_extent_unlock(src, loff, dst, dst_loff, len); + if (dst == src && len != olen) + return -EINVAL; - btrfs_cmp_data_free(cmp); + /* + * Lock destination range to serialize with concurrent readpages(). + */ + lock_extent(&BTRFS_I(dst)->io_tree, dst_loff, dst_loff + len - 1); + ret = btrfs_clone(src, dst, loff, olen, len, dst_loff, 1); + unlock_extent(&BTRFS_I(dst)->io_tree, dst_loff, dst_loff + len - 1); return ret; } @@ -3557,58 +3257,27 @@ static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen, struct inode *dst, u64 dst_loff) { int ret; - struct cmp_pages cmp; int num_pages = PAGE_ALIGN(BTRFS_MAX_DEDUPE_LEN) >> PAGE_SHIFT; - bool same_inode = (src == dst); u64 i, tail_len, chunk_count; - if (olen == 0) - return 0; - - if (same_inode) - inode_lock(src); - else - btrfs_double_inode_lock(src, dst); - /* don't make the dst file partly checksummed */ if ((BTRFS_I(src)->flags & BTRFS_INODE_NODATASUM) != - (BTRFS_I(dst)->flags & BTRFS_INODE_NODATASUM)) { - ret = -EINVAL; - goto out_unlock; - } + (BTRFS_I(dst)->flags & BTRFS_INODE_NODATASUM)) + return -EINVAL; + + if (IS_SWAPFILE(src) || IS_SWAPFILE(dst)) + return -ETXTBSY; tail_len = olen % BTRFS_MAX_DEDUPE_LEN; chunk_count = div_u64(olen, BTRFS_MAX_DEDUPE_LEN); if (chunk_count == 0) num_pages = PAGE_ALIGN(tail_len) >> PAGE_SHIFT; - /* - * If deduping ranges in the same inode, locking rules make it - * mandatory to always lock pages in ascending order to avoid deadlocks - * with concurrent tasks (such as starting writeback/delalloc). - */ - if (same_inode && dst_loff < loff) - swap(loff, dst_loff); - - /* - * We must gather up all the pages before we initiate our extent - * locking. We use an array for the page pointers. Size of the array is - * bounded by len, which is in turn bounded by BTRFS_MAX_DEDUPE_LEN. - */ - cmp.src_pages = kvmalloc_array(num_pages, sizeof(struct page *), - GFP_KERNEL | __GFP_ZERO); - cmp.dst_pages = kvmalloc_array(num_pages, sizeof(struct page *), - GFP_KERNEL | __GFP_ZERO); - if (!cmp.src_pages || !cmp.dst_pages) { - ret = -ENOMEM; - goto out_free; - } - for (i = 0; i < chunk_count; i++) { ret = btrfs_extent_same_range(src, loff, BTRFS_MAX_DEDUPE_LEN, - dst, dst_loff, &cmp); + dst, dst_loff); if (ret) - goto out_free; + return ret; loff += BTRFS_MAX_DEDUPE_LEN; dst_loff += BTRFS_MAX_DEDUPE_LEN; @@ -3616,17 +3285,7 @@ static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen, if (tail_len > 0) ret = btrfs_extent_same_range(src, loff, tail_len, dst, - dst_loff, &cmp); - -out_free: - kvfree(cmp.src_pages); - kvfree(cmp.dst_pages); - -out_unlock: - if (same_inode) - inode_unlock(src); - else - btrfs_double_inode_unlock(src, dst); + dst_loff); return ret; } @@ -4213,11 +3872,9 @@ static noinline int btrfs_clone_files(struct file *file, struct file *file_src, struct inode *inode = file_inode(file); struct inode *src = file_inode(file_src); struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - struct btrfs_root *root = BTRFS_I(inode)->root; int ret; u64 len = olen; u64 bs = fs_info->sb->s_blocksize; - int same_inode = src == inode; /* * TODO: @@ -4230,101 +3887,35 @@ static noinline int btrfs_clone_files(struct file *file, struct file *file_src, * be either compressed or non-compressed. */ - if (btrfs_root_readonly(root)) - return -EROFS; - - if (file_src->f_path.mnt != file->f_path.mnt || - src->i_sb != inode->i_sb) - return -EXDEV; - - if (S_ISDIR(src->i_mode) || S_ISDIR(inode->i_mode)) - return -EISDIR; - - if (!same_inode) { - btrfs_double_inode_lock(src, inode); - } else { - inode_lock(src); - } - /* don't make the dst file partly checksummed */ if ((BTRFS_I(src)->flags & BTRFS_INODE_NODATASUM) != - (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) { - ret = -EINVAL; - goto out_unlock; - } + (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) + return -EINVAL; + + if (IS_SWAPFILE(src) || IS_SWAPFILE(inode)) + return -ETXTBSY; - /* determine range to clone */ - ret = -EINVAL; - if (off + len > src->i_size || off + len < off) - goto out_unlock; - if (len == 0) - olen = len = src->i_size - off; /* - * If we extend to eof, continue to block boundary if and only if the - * destination end offset matches the destination file's size, otherwise - * we would be corrupting data by placing the eof block into the middle - * of a file. + * VFS's generic_remap_file_range_prep() protects us from cloning the + * eof block into the middle of a file, which would result in corruption + * if the file size is not blocksize aligned. So we don't need to check + * for that case here. */ - if (off + len == src->i_size) { - if (!IS_ALIGNED(len, bs) && destoff + len < inode->i_size) - goto out_unlock; + if (off + len == src->i_size) len = ALIGN(src->i_size, bs) - off; - } - - if (len == 0) { - ret = 0; - goto out_unlock; - } - - /* verify the end result is block aligned */ - if (!IS_ALIGNED(off, bs) || !IS_ALIGNED(off + len, bs) || - !IS_ALIGNED(destoff, bs)) - goto out_unlock; - - /* verify if ranges are overlapped within the same file */ - if (same_inode) { - if (destoff + len > off && destoff < off + len) - goto out_unlock; - } if (destoff > inode->i_size) { ret = btrfs_cont_expand(inode, inode->i_size, destoff); if (ret) - goto out_unlock; + return ret; } /* - * Lock the target range too. Right after we replace the file extent - * items in the fs tree (which now point to the cloned data), we might - * have a worker replace them with extent items relative to a write - * operation that was issued before this clone operation (i.e. confront - * with inode.c:btrfs_finish_ordered_io). + * Lock destination range to serialize with concurrent readpages(). */ - if (same_inode) { - u64 lock_start = min_t(u64, off, destoff); - u64 lock_len = max_t(u64, off, destoff) + len - lock_start; - - ret = lock_extent_range(src, lock_start, lock_len, true); - } else { - ret = btrfs_double_extent_lock(src, off, inode, destoff, len, - true); - } - ASSERT(ret == 0); - if (WARN_ON(ret)) { - /* ranges in the io trees already unlocked */ - goto out_unlock; - } - + lock_extent(&BTRFS_I(inode)->io_tree, destoff, destoff + len - 1); ret = btrfs_clone(src, inode, off, olen, len, destoff, 0); - - if (same_inode) { - u64 lock_start = min_t(u64, off, destoff); - u64 lock_end = max_t(u64, off, destoff) + len - 1; - - unlock_extent(&BTRFS_I(src)->io_tree, lock_start, lock_end); - } else { - btrfs_double_extent_unlock(src, off, inode, destoff, len); - } + unlock_extent(&BTRFS_I(inode)->io_tree, destoff, destoff + len - 1); /* * Truncate page cache pages so that future reads will see the cloned * data immediately and not the previous data. @@ -4332,11 +3923,87 @@ static noinline int btrfs_clone_files(struct file *file, struct file *file_src, truncate_inode_pages_range(&inode->i_data, round_down(destoff, PAGE_SIZE), round_up(destoff + len, PAGE_SIZE) - 1); -out_unlock: + + return ret; +} + +static int btrfs_remap_file_range_prep(struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, + loff_t *len, unsigned int remap_flags) +{ + struct inode *inode_in = file_inode(file_in); + struct inode *inode_out = file_inode(file_out); + u64 bs = BTRFS_I(inode_out)->root->fs_info->sb->s_blocksize; + bool same_inode = inode_out == inode_in; + u64 wb_len; + int ret; + + if (!(remap_flags & REMAP_FILE_DEDUP)) { + struct btrfs_root *root_out = BTRFS_I(inode_out)->root; + + if (btrfs_root_readonly(root_out)) + return -EROFS; + + if (file_in->f_path.mnt != file_out->f_path.mnt || + inode_in->i_sb != inode_out->i_sb) + return -EXDEV; + } + + if (same_inode) + inode_lock(inode_in); + else + btrfs_double_inode_lock(inode_in, inode_out); + + /* + * Now that the inodes are locked, we need to start writeback ourselves + * and can not rely on the writeback from the VFS's generic helper + * generic_remap_file_range_prep() because: + * + * 1) For compression we must call filemap_fdatawrite_range() range + * twice (btrfs_fdatawrite_range() does it for us), and the generic + * helper only calls it once; + * + * 2) filemap_fdatawrite_range(), called by the generic helper only + * waits for the writeback to complete, i.e. for IO to be done, and + * not for the ordered extents to complete. We need to wait for them + * to complete so that new file extent items are in the fs tree. + */ + if (*len == 0 && !(remap_flags & REMAP_FILE_DEDUP)) + wb_len = ALIGN(inode_in->i_size, bs) - ALIGN_DOWN(pos_in, bs); + else + wb_len = ALIGN(*len, bs); + + /* + * Since we don't lock ranges, wait for ongoing lockless dio writes (as + * any in progress could create its ordered extents after we wait for + * existing ordered extents below). + */ + inode_dio_wait(inode_in); if (!same_inode) - btrfs_double_inode_unlock(src, inode); + inode_dio_wait(inode_out); + + ret = btrfs_wait_ordered_range(inode_in, ALIGN_DOWN(pos_in, bs), + wb_len); + if (ret < 0) + goto out_unlock; + ret = btrfs_wait_ordered_range(inode_out, ALIGN_DOWN(pos_out, bs), + wb_len); + if (ret < 0) + goto out_unlock; + + ret = generic_remap_file_range_prep(file_in, pos_in, file_out, pos_out, + len, remap_flags); + if (ret < 0 || *len == 0) + goto out_unlock; + + return 0; + + out_unlock: + if (same_inode) + inode_unlock(inode_in); else - inode_unlock(src); + btrfs_double_inode_unlock(inode_in, inode_out); + return ret; } @@ -4344,29 +4011,29 @@ loff_t btrfs_remap_file_range(struct file *src_file, loff_t off, struct file *dst_file, loff_t destoff, loff_t len, unsigned int remap_flags) { + struct inode *src_inode = file_inode(src_file); + struct inode *dst_inode = file_inode(dst_file); + bool same_inode = dst_inode == src_inode; int ret; if (remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_ADVISORY)) return -EINVAL; - if (remap_flags & REMAP_FILE_DEDUP) { - struct inode *src = file_inode(src_file); - struct inode *dst = file_inode(dst_file); - u64 bs = BTRFS_I(src)->root->fs_info->sb->s_blocksize; - - if (WARN_ON_ONCE(bs < PAGE_SIZE)) { - /* - * Btrfs does not support blocksize < page_size. As a - * result, btrfs_cmp_data() won't correctly handle - * this situation without an update. - */ - return -EINVAL; - } + ret = btrfs_remap_file_range_prep(src_file, off, dst_file, destoff, + &len, remap_flags); + if (ret < 0 || len == 0) + return ret; - ret = btrfs_extent_same(src, off, len, dst, destoff); - } else { + if (remap_flags & REMAP_FILE_DEDUP) + ret = btrfs_extent_same(src_inode, off, len, dst_inode, destoff); + else ret = btrfs_clone_files(dst_file, src_file, off, len, destoff); - } + + if (same_inode) + inode_unlock(src_inode); + else + btrfs_double_inode_unlock(src_inode, dst_inode); + return ret < 0 ? ret : len; } diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c @@ -27,7 +27,7 @@ * Records the total size (including the header) of compressed data. * * 2. Segment(s) - * Variable size. Each segment includes one segment header, followd by data + * Variable size. Each segment includes one segment header, followed by data * payload. * One regular LZO compressed extent can have one or more segments. * For inlined LZO compressed extent, only one segment is allowed. diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c @@ -460,7 +460,6 @@ void btrfs_remove_ordered_extent(struct inode *inode, struct btrfs_inode *btrfs_inode = BTRFS_I(inode); struct btrfs_root *root = btrfs_inode->root; struct rb_node *node; - bool dec_pending_ordered = false; /* This is paired with btrfs_add_ordered_extent. */ spin_lock(&btrfs_inode->lock); @@ -477,37 +476,8 @@ void btrfs_remove_ordered_extent(struct inode *inode, if (tree->last == node) tree->last = NULL; set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags); - if (test_and_clear_bit(BTRFS_ORDERED_PENDING, &entry->flags)) - dec_pending_ordered = true; spin_unlock_irq(&tree->lock); - /* - * The current running transaction is waiting on us, we need to let it - * know that we're complete and wake it up. - */ - if (dec_pending_ordered) { - struct btrfs_transaction *trans; - - /* - * The checks for trans are just a formality, it should be set, - * but if it isn't we don't want to deref/assert under the spin - * lock, so be nice and check if trans is set, but ASSERT() so - * if it isn't set a developer will notice. - */ - spin_lock(&fs_info->trans_lock); - trans = fs_info->running_transaction; - if (trans) - refcount_inc(&trans->use_count); - spin_unlock(&fs_info->trans_lock); - - ASSERT(trans); - if (trans) { - if (atomic_dec_and_test(&trans->pending_ordered)) - wake_up(&trans->pending_wait); - btrfs_put_transaction(trans); - } - } - spin_lock(&root->ordered_extent_lock); list_del_init(&entry->root_extent_list); root->nr_ordered_extents--; diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h @@ -37,28 +37,31 @@ struct btrfs_ordered_sum { * rbtree, just before waking any waiters. It is used to indicate the * IO is done and any metadata is inserted into the tree. */ -#define BTRFS_ORDERED_IO_DONE 0 /* set when all the pages are written */ - -#define BTRFS_ORDERED_COMPLETE 1 /* set when removed from the tree */ - -#define BTRFS_ORDERED_NOCOW 2 /* set when we want to write in place */ - -#define BTRFS_ORDERED_COMPRESSED 3 /* writing a zlib compressed extent */ - -#define BTRFS_ORDERED_PREALLOC 4 /* set when writing to preallocated extent */ - -#define BTRFS_ORDERED_DIRECT 5 /* set when we're doing DIO with this extent */ - -#define BTRFS_ORDERED_IOERR 6 /* We had an io error when writing this out */ - -#define BTRFS_ORDERED_UPDATED_ISIZE 7 /* indicates whether this ordered extent - * has done its due diligence in updating - * the isize. */ -#define BTRFS_ORDERED_TRUNCATED 8 /* Set when we have to truncate an extent */ - -#define BTRFS_ORDERED_PENDING 9 /* We are waiting for this ordered extent to - * complete in the current transaction. */ -#define BTRFS_ORDERED_REGULAR 10 /* Regular IO for COW */ +enum { + /* set when all the pages are written */ + BTRFS_ORDERED_IO_DONE, + /* set when removed from the tree */ + BTRFS_ORDERED_COMPLETE, + /* set when we want to write in place */ + BTRFS_ORDERED_NOCOW, + /* writing a zlib compressed extent */ + BTRFS_ORDERED_COMPRESSED, + /* set when writing to preallocated extent */ + BTRFS_ORDERED_PREALLOC, + /* set when we're doing DIO with this extent */ + BTRFS_ORDERED_DIRECT, + /* We had an io error when writing this out */ + BTRFS_ORDERED_IOERR, + /* + * indicates whether this ordered extent has done its due diligence in + * updating the isize + */ + BTRFS_ORDERED_UPDATED_ISIZE, + /* Set when we have to truncate an extent */ + BTRFS_ORDERED_TRUNCATED, + /* Regular IO for COW */ + BTRFS_ORDERED_REGULAR, +}; struct btrfs_ordered_extent { /* logical offset in the file */ diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c @@ -30,7 +30,7 @@ * - sync * - copy also limits on subvol creation * - limit - * - caches fuer ulists + * - caches for ulists * - performance benchmarks * - check all ioctl parameters */ @@ -522,7 +522,7 @@ void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info) __del_qgroup_rb(qgroup); } /* - * we call btrfs_free_qgroup_config() when umounting + * We call btrfs_free_qgroup_config() when unmounting * filesystem and disabling quota, so we set qgroup_ulist * to be null here to avoid double free. */ @@ -1013,16 +1013,22 @@ out_add_root: btrfs_abort_transaction(trans, ret); goto out_free_path; } - spin_lock(&fs_info->qgroup_lock); - fs_info->quota_root = quota_root; - set_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags); - spin_unlock(&fs_info->qgroup_lock); ret = btrfs_commit_transaction(trans); trans = NULL; if (ret) goto out_free_path; + /* + * Set quota enabled flag after committing the transaction, to avoid + * deadlocks on fs_info->qgroup_ioctl_lock with concurrent snapshot + * creation. + */ + spin_lock(&fs_info->qgroup_lock); + fs_info->quota_root = quota_root; + set_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags); + spin_unlock(&fs_info->qgroup_lock); + ret = qgroup_rescan_init(fs_info, 0, 1); if (!ret) { qgroup_rescan_zero_tracking(fs_info); @@ -1122,7 +1128,7 @@ static void qgroup_dirty(struct btrfs_fs_info *fs_info, * The easy accounting, we're updating qgroup relationship whose child qgroup * only has exclusive extents. * - * In this case, all exclsuive extents will also be exlusive for parent, so + * In this case, all exclusive extents will also be exclusive for parent, so * excl/rfer just get added/removed. * * So is qgroup reservation space, which should also be added/removed to @@ -1749,14 +1755,14 @@ static int adjust_slots_upwards(struct btrfs_path *path, int root_level) * * 2) Mark the final tree blocks in @src_path and @dst_path qgroup dirty * NOTE: In above case, OO(a) and NN(a) won't be marked qgroup dirty. - * They should be marked during preivous (@dst_level = 1) iteration. + * They should be marked during previous (@dst_level = 1) iteration. * * 3) Mark file extents in leaves dirty * We don't have good way to pick out new file extents only. * So we still follow the old method by scanning all file extents in * the leave. * - * This function can free us from keeping two pathes, thus later we only need + * This function can free us from keeping two paths, thus later we only need * to care about how to iterate all new tree blocks in reloc tree. */ static int qgroup_trace_extent_swap(struct btrfs_trans_handle* trans, @@ -1895,7 +1901,7 @@ out: * * We will iterate through tree blocks NN(b), NN(d) and info qgroup to trace * above tree blocks along with their counter parts in file tree. - * While during search, old tree blocsk OO(c) will be skiped as tree block swap + * While during search, old tree blocks OO(c) will be skipped as tree block swap * won't affect OO(c). */ static int qgroup_trace_new_subtree_blocks(struct btrfs_trans_handle* trans, @@ -2020,7 +2026,7 @@ out: * Will go down the tree block pointed by @dst_eb (pointed by @dst_parent and * @dst_slot), and find any tree blocks whose generation is at @last_snapshot, * and then go down @src_eb (pointed by @src_parent and @src_slot) to find - * the conterpart of the tree block, then mark both tree blocks as qgroup dirty, + * the counterpart of the tree block, then mark both tree blocks as qgroup dirty, * and skip all tree blocks whose generation is smaller than last_snapshot. * * This would skip tons of tree blocks of original btrfs_qgroup_trace_subtree(), @@ -3104,9 +3110,6 @@ static int qgroup_rescan_leaf(struct btrfs_trans_handle *trans, mutex_unlock(&fs_info->qgroup_rescan_lock); goto out; } - extent_buffer_get(scratch_leaf); - btrfs_tree_read_lock(scratch_leaf); - btrfs_set_lock_blocking_rw(scratch_leaf, BTRFS_READ_LOCK); slot = path->slots[0]; btrfs_release_path(path); mutex_unlock(&fs_info->qgroup_rescan_lock); @@ -3132,10 +3135,8 @@ static int qgroup_rescan_leaf(struct btrfs_trans_handle *trans, goto out; } out: - if (scratch_leaf) { - btrfs_tree_read_unlock_blocking(scratch_leaf); + if (scratch_leaf) free_extent_buffer(scratch_leaf); - } if (done && !ret) { ret = 1; diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h @@ -70,7 +70,7 @@ struct btrfs_qgroup_extent_record { * be converted into META_PERTRANS. */ enum btrfs_qgroup_rsv_type { - BTRFS_QGROUP_RSV_DATA = 0, + BTRFS_QGROUP_RSV_DATA, BTRFS_QGROUP_RSV_META_PERTRANS, BTRFS_QGROUP_RSV_META_PREALLOC, BTRFS_QGROUP_RSV_LAST, @@ -81,10 +81,10 @@ enum btrfs_qgroup_rsv_type { * * Each type should have different reservation behavior. * E.g, data follows its io_tree flag modification, while - * *currently* meta is just reserve-and-clear during transcation. + * *currently* meta is just reserve-and-clear during transaction. * * TODO: Add new type for reservation which can survive transaction commit. - * Currect metadata reservation behavior is not suitable for such case. + * Current metadata reservation behavior is not suitable for such case. */ struct btrfs_qgroup_rsv { u64 values[BTRFS_QGROUP_RSV_LAST]; diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c @@ -1980,7 +1980,7 @@ cleanup_io: * - In case of single failure, where rbio->failb == -1: * * Cache this rbio iff the above read reconstruction is - * excuted without problems. + * executed without problems. */ if (err == BLK_STS_OK && rbio->failb < 0) cache_rbio_pages(rbio); diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c @@ -376,26 +376,28 @@ static struct reada_extent *reada_find_extent(struct btrfs_fs_info *fs_info, goto error; } + /* Insert extent in reada tree + all per-device trees, all or nothing */ + down_read(&fs_info->dev_replace.rwsem); ret = radix_tree_preload(GFP_KERNEL); - if (ret) + if (ret) { + up_read(&fs_info->dev_replace.rwsem); goto error; + } - /* insert extent in reada_tree + all per-device trees, all or nothing */ - btrfs_dev_replace_read_lock(&fs_info->dev_replace); spin_lock(&fs_info->reada_lock); ret = radix_tree_insert(&fs_info->reada_tree, index, re); if (ret == -EEXIST) { re_exist = radix_tree_lookup(&fs_info->reada_tree, index); re_exist->refcnt++; spin_unlock(&fs_info->reada_lock); - btrfs_dev_replace_read_unlock(&fs_info->dev_replace); radix_tree_preload_end(); + up_read(&fs_info->dev_replace.rwsem); goto error; } if (ret) { spin_unlock(&fs_info->reada_lock); - btrfs_dev_replace_read_unlock(&fs_info->dev_replace); radix_tree_preload_end(); + up_read(&fs_info->dev_replace.rwsem); goto error; } radix_tree_preload_end(); @@ -437,13 +439,13 @@ static struct reada_extent *reada_find_extent(struct btrfs_fs_info *fs_info, } radix_tree_delete(&fs_info->reada_tree, index); spin_unlock(&fs_info->reada_lock); - btrfs_dev_replace_read_unlock(&fs_info->dev_replace); + up_read(&fs_info->dev_replace.rwsem); goto error; } have_zone = 1; } spin_unlock(&fs_info->reada_lock); - btrfs_dev_replace_read_unlock(&fs_info->dev_replace); + up_read(&fs_info->dev_replace.rwsem); if (!have_zone) goto error; diff --git a/fs/btrfs/ref-verify.c b/fs/btrfs/ref-verify.c @@ -43,7 +43,7 @@ struct ref_entry { * back to the delayed ref action. We hold the ref we are changing in the * action so we can account for the history properly, and we record the root we * were called with since it could be different from ref_root. We also store - * stack traces because thats how I roll. + * stack traces because that's how I roll. */ struct ref_action { int action; @@ -56,7 +56,7 @@ struct ref_action { /* * One of these for every block we reference, it holds the roots and references - * to it as well as all of the ref actions that have occured to it. We never + * to it as well as all of the ref actions that have occurred to it. We never * free it until we unmount the file system in order to make sure re-allocations * are happening properly. */ @@ -859,7 +859,7 @@ int btrfs_ref_tree_mod(struct btrfs_root *root, u64 bytenr, u64 num_bytes, * This shouldn't happen because we will add our re * above when we lookup the be with !parent, but just in * case catch this case so we don't panic because I - * didn't thik of some other corner case. + * didn't think of some other corner case. */ btrfs_err(fs_info, "failed to find root %llu for %llu", root->root_key.objectid, be->bytenr); diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c @@ -2631,7 +2631,7 @@ static int reserve_metadata_space(struct btrfs_trans_handle *trans, * only one thread can access block_rsv at this point, * so we don't need hold lock to protect block_rsv. * we expand more reservation size here to allow enough - * space for relocation and we will return eailer in + * space for relocation and we will return earlier in * enospc case. */ rc->block_rsv->size = tmp + fs_info->nodesize * @@ -4185,37 +4185,13 @@ static struct reloc_control *alloc_reloc_control(void) static void describe_relocation(struct btrfs_fs_info *fs_info, struct btrfs_block_group_cache *block_group) { - char buf[128]; /* prefixed by a '|' that'll be dropped */ - u64 flags = block_group->flags; + char buf[128] = {'\0'}; - /* Shouldn't happen */ - if (!flags) { - strcpy(buf, "|NONE"); - } else { - char *bp = buf; - -#define DESCRIBE_FLAG(f, d) \ - if (flags & BTRFS_BLOCK_GROUP_##f) { \ - bp += snprintf(bp, buf - bp + sizeof(buf), "|%s", d); \ - flags &= ~BTRFS_BLOCK_GROUP_##f; \ - } - DESCRIBE_FLAG(DATA, "data"); - DESCRIBE_FLAG(SYSTEM, "system"); - DESCRIBE_FLAG(METADATA, "metadata"); - DESCRIBE_FLAG(RAID0, "raid0"); - DESCRIBE_FLAG(RAID1, "raid1"); - DESCRIBE_FLAG(DUP, "dup"); - DESCRIBE_FLAG(RAID10, "raid10"); - DESCRIBE_FLAG(RAID5, "raid5"); - DESCRIBE_FLAG(RAID6, "raid6"); - if (flags) - snprintf(bp, buf - bp + sizeof(buf), "|0x%llx", flags); -#undef DESCRIBE_FLAG - } + btrfs_describe_block_groups(block_group->flags, buf, sizeof(buf)); btrfs_info(fs_info, "relocating block group %llu flags %s", - block_group->key.objectid, buf + 1); + block_group->key.objectid, buf); } /* @@ -4223,6 +4199,7 @@ static void describe_relocation(struct btrfs_fs_info *fs_info, */ int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start) { + struct btrfs_block_group_cache *bg; struct btrfs_root *extent_root = fs_info->extent_root; struct reloc_control *rc; struct inode *inode; @@ -4231,14 +4208,23 @@ int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start) int rw = 0; int err = 0; + bg = btrfs_lookup_block_group(fs_info, group_start); + if (!bg) + return -ENOENT; + + if (btrfs_pinned_by_swapfile(fs_info, bg)) { + btrfs_put_block_group(bg); + return -ETXTBSY; + } + rc = alloc_reloc_control(); - if (!rc) + if (!rc) { + btrfs_put_block_group(bg); return -ENOMEM; + } rc->extent_root = extent_root; - - rc->block_group = btrfs_lookup_block_group(fs_info, group_start); - BUG_ON(!rc->block_group); + rc->block_group = bg; ret = btrfs_inc_block_group_ro(rc->block_group); if (ret) { diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c @@ -339,7 +339,9 @@ static struct full_stripe_lock *insert_full_stripe_lock( } } - /* Insert new lock */ + /* + * Insert new lock. + */ ret = kmalloc(sizeof(*ret), GFP_KERNEL); if (!ret) return ERR_PTR(-ENOMEM); @@ -568,12 +570,11 @@ static void scrub_put_ctx(struct scrub_ctx *sctx) scrub_free_ctx(sctx); } -static noinline_for_stack -struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace) +static noinline_for_stack struct scrub_ctx *scrub_setup_ctx( + struct btrfs_fs_info *fs_info, int is_dev_replace) { struct scrub_ctx *sctx; int i; - struct btrfs_fs_info *fs_info = dev->fs_info; sctx = kzalloc(sizeof(*sctx), GFP_KERNEL); if (!sctx) @@ -582,7 +583,7 @@ struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace) sctx->is_dev_replace = is_dev_replace; sctx->pages_per_rd_bio = SCRUB_PAGES_PER_RD_BIO; sctx->curr = -1; - sctx->fs_info = dev->fs_info; + sctx->fs_info = fs_info; for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) { struct scrub_bio *sbio; @@ -832,6 +833,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) int page_num; int success; bool full_stripe_locked; + unsigned int nofs_flag; static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST); @@ -857,6 +859,16 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) dev = sblock_to_check->pagev[0]->dev; /* + * We must use GFP_NOFS because the scrub task might be waiting for a + * worker task executing this function and in turn a transaction commit + * might be waiting the scrub task to pause (which needs to wait for all + * the worker tasks to complete before pausing). + * We do allocations in the workers through insert_full_stripe_lock() + * and scrub_add_page_to_wr_bio(), which happens down the call chain of + * this function. + */ + nofs_flag = memalloc_nofs_save(); + /* * For RAID5/6, race can happen for a different device scrub thread. * For data corruption, Parity and Data threads will both try * to recovery the data. @@ -865,6 +877,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) */ ret = lock_full_stripe(fs_info, logical, &full_stripe_locked); if (ret < 0) { + memalloc_nofs_restore(nofs_flag); spin_lock(&sctx->stat_lock); if (ret == -ENOMEM) sctx->stat.malloc_errors++; @@ -904,7 +917,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) */ sblocks_for_recheck = kcalloc(BTRFS_MAX_MIRRORS, - sizeof(*sblocks_for_recheck), GFP_NOFS); + sizeof(*sblocks_for_recheck), GFP_KERNEL); if (!sblocks_for_recheck) { spin_lock(&sctx->stat_lock); sctx->stat.malloc_errors++; @@ -1202,6 +1215,7 @@ out: } ret = unlock_full_stripe(fs_info, logical, full_stripe_locked); + memalloc_nofs_restore(nofs_flag); if (ret < 0) return ret; return 0; @@ -3540,7 +3554,7 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, if (!ret && sctx->is_dev_replace) { /* * If we are doing a device replace wait for any tasks - * that started dellaloc right before we set the block + * that started delalloc right before we set the block * group to RO mode, as they might have just allocated * an extent from it or decided they could do a nocow * write. And if any such tasks did that, wait for their @@ -3596,11 +3610,12 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, break; } - btrfs_dev_replace_write_lock(&fs_info->dev_replace); + down_write(&fs_info->dev_replace.rwsem); dev_replace->cursor_right = found_key.offset + length; dev_replace->cursor_left = found_key.offset; dev_replace->item_needs_writeback = 1; - btrfs_dev_replace_write_unlock(&fs_info->dev_replace); + up_write(&dev_replace->rwsem); + ret = scrub_chunk(sctx, scrub_dev, chunk_offset, length, found_key.offset, cache); @@ -3636,10 +3651,10 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, scrub_pause_off(fs_info); - btrfs_dev_replace_write_lock(&fs_info->dev_replace); + down_write(&fs_info->dev_replace.rwsem); dev_replace->cursor_left = dev_replace->cursor_right; dev_replace->item_needs_writeback = 1; - btrfs_dev_replace_write_unlock(&fs_info->dev_replace); + up_write(&fs_info->dev_replace.rwsem); if (ro_set) btrfs_dec_block_group_ro(cache); @@ -3772,6 +3787,7 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, struct scrub_ctx *sctx; int ret; struct btrfs_device *dev; + unsigned int nofs_flag; if (btrfs_fs_closing(fs_info)) return -EINVAL; @@ -3813,13 +3829,18 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, return -EINVAL; } + /* Allocate outside of device_list_mutex */ + sctx = scrub_setup_ctx(fs_info, is_dev_replace); + if (IS_ERR(sctx)) + return PTR_ERR(sctx); mutex_lock(&fs_info->fs_devices->device_list_mutex); dev = btrfs_find_device(fs_info, devid, NULL, NULL); if (!dev || (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) && !is_dev_replace)) { mutex_unlock(&fs_info->fs_devices->device_list_mutex); - return -ENODEV; + ret = -ENODEV; + goto out_free_ctx; } if (!is_dev_replace && !readonly && @@ -3827,7 +3848,8 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, mutex_unlock(&fs_info->fs_devices->device_list_mutex); btrfs_err_in_rcu(fs_info, "scrub: device %s is not writable", rcu_str_deref(dev->name)); - return -EROFS; + ret = -EROFS; + goto out_free_ctx; } mutex_lock(&fs_info->scrub_lock); @@ -3835,34 +3857,29 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &dev->dev_state)) { mutex_unlock(&fs_info->scrub_lock); mutex_unlock(&fs_info->fs_devices->device_list_mutex); - return -EIO; + ret = -EIO; + goto out_free_ctx; } - btrfs_dev_replace_read_lock(&fs_info->dev_replace); + down_read(&fs_info->dev_replace.rwsem); if (dev->scrub_ctx || (!is_dev_replace && btrfs_dev_replace_is_ongoing(&fs_info->dev_replace))) { - btrfs_dev_replace_read_unlock(&fs_info->dev_replace); + up_read(&fs_info->dev_replace.rwsem); mutex_unlock(&fs_info->scrub_lock); mutex_unlock(&fs_info->fs_devices->device_list_mutex); - return -EINPROGRESS; + ret = -EINPROGRESS; + goto out_free_ctx; } - btrfs_dev_replace_read_unlock(&fs_info->dev_replace); + up_read(&fs_info->dev_replace.rwsem); ret = scrub_workers_get(fs_info, is_dev_replace); if (ret) { mutex_unlock(&fs_info->scrub_lock); mutex_unlock(&fs_info->fs_devices->device_list_mutex); - return ret; + goto out_free_ctx; } - sctx = scrub_setup_ctx(dev, is_dev_replace); - if (IS_ERR(sctx)) { - mutex_unlock(&fs_info->scrub_lock); - mutex_unlock(&fs_info->fs_devices->device_list_mutex); - scrub_workers_put(fs_info); - return PTR_ERR(sctx); - } sctx->readonly = readonly; dev->scrub_ctx = sctx; mutex_unlock(&fs_info->fs_devices->device_list_mutex); @@ -3875,6 +3892,16 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, atomic_inc(&fs_info->scrubs_running); mutex_unlock(&fs_info->scrub_lock); + /* + * In order to avoid deadlock with reclaim when there is a transaction + * trying to pause scrub, make sure we use GFP_NOFS for all the + * allocations done at btrfs_scrub_pages() and scrub_pages_for_parity() + * invoked by our callees. The pausing request is done when the + * transaction commit starts, and it blocks the transaction until scrub + * is paused (done at specific points at scrub_stripe() or right above + * before incrementing fs_info->scrubs_running). + */ + nofs_flag = memalloc_nofs_save(); if (!is_dev_replace) { /* * by holding device list mutex, we can @@ -3887,6 +3914,7 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, if (!ret) ret = scrub_enumerate_chunks(sctx, dev, start, end); + memalloc_nofs_restore(nofs_flag); wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0); atomic_dec(&fs_info->scrubs_running); @@ -3905,6 +3933,11 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, scrub_put_ctx(sctx); return ret; + +out_free_ctx: + scrub_free_ctx(sctx); + + return ret; } void btrfs_scrub_pause(struct btrfs_fs_info *fs_info) diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c @@ -2238,7 +2238,7 @@ out: * inodes "orphan" name instead of the real name and stop. Same with new inodes * that were not created yet and overwritten inodes/refs. * - * When do we have have orphan inodes: + * When do we have orphan inodes: * 1. When an inode is freshly created and thus no valid refs are available yet * 2. When a directory lost all it's refs (deleted) but still has dir items * inside which were not processed yet (pending for move/delete). If anyone @@ -3854,7 +3854,7 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) /* * We may have refs where the parent directory does not exist * yet. This happens if the parent directories inum is higher - * the the current inum. To handle this case, we create the + * than the current inum. To handle this case, we create the * parent directory out of order. But we need to check if this * did already happen before due to other refs in the same dir. */ @@ -4775,7 +4775,7 @@ static ssize_t fill_read_buf(struct send_ctx *sctx, u64 offset, u32 len) struct btrfs_key key; pgoff_t index = offset >> PAGE_SHIFT; pgoff_t last_index; - unsigned pg_offset = offset & ~PAGE_MASK; + unsigned pg_offset = offset_in_page(offset); ssize_t ret = 0; key.objectid = sctx->cur_ino; diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c @@ -93,7 +93,7 @@ const char *btrfs_decode_error(int errno) /* * __btrfs_handle_fs_error decodes expected errors from the caller and - * invokes the approciate error response. + * invokes the appropriate error response. */ __cold void __btrfs_handle_fs_error(struct btrfs_fs_info *fs_info, const char *function, @@ -151,7 +151,7 @@ void __btrfs_handle_fs_error(struct btrfs_fs_info *fs_info, const char *function * although there is no way to update the progress. It would add the * risk of a deadlock, therefore the canceling is omitted. The only * penalty is that some I/O remains active until the procedure - * completes. The next time when the filesystem is mounted writeable + * completes. The next time when the filesystem is mounted writable * again, the device replace operation continues. */ } @@ -1848,7 +1848,7 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) if (!btrfs_check_rw_degradable(fs_info, NULL)) { btrfs_warn(fs_info, - "too many missing devices, writeable remount is not allowed"); + "too many missing devices, writable remount is not allowed"); ret = -EACCES; goto restore; } @@ -2090,7 +2090,7 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) u64 total_free_data = 0; u64 total_free_meta = 0; int bits = dentry->d_sb->s_blocksize_bits; - __be32 *fsid = (__be32 *)fs_info->fsid; + __be32 *fsid = (__be32 *)fs_info->fs_devices->fsid; unsigned factor = 1; struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv; int ret; @@ -2312,7 +2312,7 @@ static int btrfs_show_devname(struct seq_file *m, struct dentry *root) * device_list_mutex here as we only read the device data and the list * is protected by RCU. Even if a device is deleted during the list * traversals, we'll get valid data, the freeing callback will wait at - * least until until the rcu_read_unlock. + * least until the rcu_read_unlock. */ rcu_read_lock(); cur_devices = fs_info->fs_devices; diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c @@ -191,6 +191,7 @@ BTRFS_FEAT_ATTR_INCOMPAT(extended_iref, EXTENDED_IREF); BTRFS_FEAT_ATTR_INCOMPAT(raid56, RAID56); BTRFS_FEAT_ATTR_INCOMPAT(skinny_metadata, SKINNY_METADATA); BTRFS_FEAT_ATTR_INCOMPAT(no_holes, NO_HOLES); +BTRFS_FEAT_ATTR_INCOMPAT(metadata_uuid, METADATA_UUID); BTRFS_FEAT_ATTR_COMPAT_RO(free_space_tree, FREE_SPACE_TREE); static struct attribute *btrfs_supported_feature_attrs[] = { @@ -204,6 +205,7 @@ static struct attribute *btrfs_supported_feature_attrs[] = { BTRFS_FEAT_ATTR_PTR(raid56), BTRFS_FEAT_ATTR_PTR(skinny_metadata), BTRFS_FEAT_ATTR_PTR(no_holes), + BTRFS_FEAT_ATTR_PTR(metadata_uuid), BTRFS_FEAT_ATTR_PTR(free_space_tree), NULL }; @@ -505,12 +507,24 @@ static ssize_t quota_override_store(struct kobject *kobj, BTRFS_ATTR_RW(, quota_override, quota_override_show, quota_override_store); +static ssize_t btrfs_metadata_uuid_show(struct kobject *kobj, + struct kobj_attribute *a, char *buf) +{ + struct btrfs_fs_info *fs_info = to_fs_info(kobj); + + return snprintf(buf, PAGE_SIZE, "%pU\n", + fs_info->fs_devices->metadata_uuid); +} + +BTRFS_ATTR(, metadata_uuid, btrfs_metadata_uuid_show); + static const struct attribute *btrfs_attrs[] = { BTRFS_ATTR_PTR(, label), BTRFS_ATTR_PTR(, nodesize), BTRFS_ATTR_PTR(, sectorsize), BTRFS_ATTR_PTR(, clone_alignment), BTRFS_ATTR_PTR(, quota_override), + BTRFS_ATTR_PTR(, metadata_uuid), NULL, }; diff --git a/fs/btrfs/sysfs.h b/fs/btrfs/sysfs.h @@ -9,7 +9,7 @@ extern u64 btrfs_debugfs_test; enum btrfs_feature_set { - FEAT_COMPAT = 0, + FEAT_COMPAT, FEAT_COMPAT_RO, FEAT_INCOMPAT, FEAT_MAX diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c @@ -174,8 +174,10 @@ void btrfs_free_dummy_root(struct btrfs_root *root) /* Will be freed by btrfs_free_fs_roots */ if (WARN_ON(test_bit(BTRFS_ROOT_IN_RADIX, &root->state))) return; - if (root->node) + if (root->node) { + /* One for allocate_extent_buffer */ free_extent_buffer(root->node); + } kfree(root); } diff --git a/fs/btrfs/tests/extent-io-tests.c b/fs/btrfs/tests/extent-io-tests.c @@ -62,10 +62,11 @@ static int test_find_delalloc(u32 sectorsize) struct page *page; struct page *locked_page = NULL; unsigned long index = 0; - u64 total_dirty = SZ_256M; - u64 max_bytes = SZ_128M; + /* In this test we need at least 2 file extents at its maximum size */ + u64 max_bytes = BTRFS_MAX_EXTENT_SIZE; + u64 total_dirty = 2 * max_bytes; u64 start, end, test_start; - u64 found; + bool found; int ret = -EINVAL; test_msg("running find delalloc tests"); @@ -76,7 +77,7 @@ static int test_find_delalloc(u32 sectorsize) return -ENOMEM; } - extent_io_tree_init(&tmp, inode); + extent_io_tree_init(&tmp, NULL); /* * First go through and create and mark all of our pages dirty, we pin @@ -106,8 +107,8 @@ static int test_find_delalloc(u32 sectorsize) set_extent_delalloc(&tmp, 0, sectorsize - 1, 0, NULL); start = 0; end = 0; - found = btrfs_find_lock_delalloc_range(inode, &tmp, locked_page, &start, - &end, max_bytes); + found = find_lock_delalloc_range(inode, &tmp, locked_page, &start, + &end); if (!found) { test_err("should have found at least one delalloc"); goto out_bits; @@ -137,8 +138,8 @@ static int test_find_delalloc(u32 sectorsize) set_extent_delalloc(&tmp, sectorsize, max_bytes - 1, 0, NULL); start = test_start; end = 0; - found = btrfs_find_lock_delalloc_range(inode, &tmp, locked_page, &start, - &end, max_bytes); + found = find_lock_delalloc_range(inode, &tmp, locked_page, &start, + &end); if (!found) { test_err("couldn't find delalloc in our range"); goto out_bits; @@ -171,8 +172,8 @@ static int test_find_delalloc(u32 sectorsize) } start = test_start; end = 0; - found = btrfs_find_lock_delalloc_range(inode, &tmp, locked_page, &start, - &end, max_bytes); + found = find_lock_delalloc_range(inode, &tmp, locked_page, &start, + &end); if (found) { test_err("found range when we shouldn't have"); goto out_bits; @@ -192,8 +193,8 @@ static int test_find_delalloc(u32 sectorsize) set_extent_delalloc(&tmp, max_bytes, total_dirty - 1, 0, NULL); start = test_start; end = 0; - found = btrfs_find_lock_delalloc_range(inode, &tmp, locked_page, &start, - &end, max_bytes); + found = find_lock_delalloc_range(inode, &tmp, locked_page, &start, + &end); if (!found) { test_err("didn't find our range"); goto out_bits; @@ -233,8 +234,8 @@ static int test_find_delalloc(u32 sectorsize) * this changes at any point in the future we will need to fix this * tests expected behavior. */ - found = btrfs_find_lock_delalloc_range(inode, &tmp, locked_page, &start, - &end, max_bytes); + found = find_lock_delalloc_range(inode, &tmp, locked_page, &start, + &end); if (!found) { test_err("didn't find our range"); goto out_bits; diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c @@ -254,11 +254,6 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) goto out; } - /* - * We will just free a dummy node if it's ref count is 2 so we need an - * extra ref so our searches don't accidentally release our page. - */ - extent_buffer_get(root->node); btrfs_set_header_nritems(root->node, 0); btrfs_set_header_level(root->node, 0); ret = -EINVAL; @@ -860,7 +855,6 @@ static int test_hole_first(u32 sectorsize, u32 nodesize) goto out; } - extent_buffer_get(root->node); btrfs_set_header_nritems(root->node, 0); btrfs_set_header_level(root->node, 0); BTRFS_I(inode)->root = root; diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c @@ -233,14 +233,12 @@ loop: extwriter_counter_init(cur_trans, type); init_waitqueue_head(&cur_trans->writer_wait); init_waitqueue_head(&cur_trans->commit_wait); - init_waitqueue_head(&cur_trans->pending_wait); cur_trans->state = TRANS_STATE_RUNNING; /* * One for this trans handle, one so it will live on until we * commit the transaction. */ refcount_set(&cur_trans->use_count, 2); - atomic_set(&cur_trans->pending_ordered, 0); cur_trans->flags = 0; cur_trans->start_time = ktime_get_seconds(); @@ -456,7 +454,7 @@ start_transaction(struct btrfs_root *root, unsigned int num_items, bool enforce_qgroups) { struct btrfs_fs_info *fs_info = root->fs_info; - + struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv; struct btrfs_trans_handle *h; struct btrfs_transaction *cur_trans; u64 num_bytes = 0; @@ -485,13 +483,28 @@ start_transaction(struct btrfs_root *root, unsigned int num_items, * the appropriate flushing if need be. */ if (num_items && root != fs_info->chunk_root) { + struct btrfs_block_rsv *rsv = &fs_info->trans_block_rsv; + u64 delayed_refs_bytes = 0; + qgroup_reserved = num_items * fs_info->nodesize; ret = btrfs_qgroup_reserve_meta_pertrans(root, qgroup_reserved, enforce_qgroups); if (ret) return ERR_PTR(ret); + /* + * We want to reserve all the bytes we may need all at once, so + * we only do 1 enospc flushing cycle per transaction start. We + * accomplish this by simply assuming we'll do 2 x num_items + * worth of delayed refs updates in this trans handle, and + * refill that amount for whatever is missing in the reserve. + */ num_bytes = btrfs_calc_trans_metadata_size(fs_info, num_items); + if (delayed_refs_rsv->full == 0) { + delayed_refs_bytes = num_bytes; + num_bytes <<= 1; + } + /* * Do the reservation for the relocation root creation */ @@ -500,8 +513,24 @@ start_transaction(struct btrfs_root *root, unsigned int num_items, reloc_reserved = true; } - ret = btrfs_block_rsv_add(root, &fs_info->trans_block_rsv, - num_bytes, flush); + ret = btrfs_block_rsv_add(root, rsv, num_bytes, flush); + if (ret) + goto reserve_fail; + if (delayed_refs_bytes) { + btrfs_migrate_to_delayed_refs_rsv(fs_info, rsv, + delayed_refs_bytes); + num_bytes -= delayed_refs_bytes; + } + } else if (num_items == 0 && flush == BTRFS_RESERVE_FLUSH_ALL && + !delayed_refs_rsv->full) { + /* + * Some people call with btrfs_start_transaction(root, 0) + * because they can be throttled, but have some other mechanism + * for reserving space. We still want these guys to refill the + * delayed block_rsv so just add 1 items worth of reservation + * here. + */ + ret = btrfs_delayed_refs_rsv_refill(fs_info, flush); if (ret) goto reserve_fail; } @@ -670,7 +699,7 @@ struct btrfs_trans_handle *btrfs_attach_transaction(struct btrfs_root *root) /* * btrfs_attach_transaction_barrier() - catch the running transaction * - * It is similar to the above function, the differentia is this one + * It is similar to the above function, the difference is this one * will wait for all the inactive transactions until they fully * complete. */ @@ -760,7 +789,7 @@ static int should_end_transaction(struct btrfs_trans_handle *trans) { struct btrfs_fs_info *fs_info = trans->fs_info; - if (btrfs_check_space_for_delayed_refs(trans)) + if (btrfs_check_space_for_delayed_refs(fs_info)) return 1; return !!btrfs_block_rsv_check(&fs_info->global_block_rsv, 5); @@ -769,22 +798,12 @@ static int should_end_transaction(struct btrfs_trans_handle *trans) int btrfs_should_end_transaction(struct btrfs_trans_handle *trans) { struct btrfs_transaction *cur_trans = trans->transaction; - int updates; - int err; smp_mb(); if (cur_trans->state >= TRANS_STATE_BLOCKED || cur_trans->delayed_refs.flushing) return 1; - updates = trans->delayed_ref_updates; - trans->delayed_ref_updates = 0; - if (updates) { - err = btrfs_run_delayed_refs(trans, updates * 2); - if (err) /* Error code will also eval true */ - return err; - } - return should_end_transaction(trans); } @@ -814,11 +833,8 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, { struct btrfs_fs_info *info = trans->fs_info; struct btrfs_transaction *cur_trans = trans->transaction; - u64 transid = trans->transid; - unsigned long cur = trans->delayed_ref_updates; int lock = (trans->type != TRANS_JOIN_NOLOCK); int err = 0; - int must_run_delayed_refs = 0; if (refcount_read(&trans->use_count) > 1) { refcount_dec(&trans->use_count); @@ -832,27 +848,6 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, if (!list_empty(&trans->new_bgs)) btrfs_create_pending_block_groups(trans); - trans->delayed_ref_updates = 0; - if (!trans->sync) { - must_run_delayed_refs = - btrfs_should_throttle_delayed_refs(trans); - cur = max_t(unsigned long, cur, 32); - - /* - * don't make the caller wait if they are from a NOLOCK - * or ATTACH transaction, it will deadlock with commit - */ - if (must_run_delayed_refs == 1 && - (trans->type & (__TRANS_JOIN_NOLOCK | __TRANS_ATTACH))) - must_run_delayed_refs = 2; - } - - btrfs_trans_release_metadata(trans); - trans->block_rsv = NULL; - - if (!list_empty(&trans->new_bgs)) - btrfs_create_pending_block_groups(trans); - btrfs_trans_release_chunk_metadata(trans); if (lock && should_end_transaction(trans) && @@ -894,10 +889,6 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, } kmem_cache_free(btrfs_trans_handle_cachep, trans); - if (must_run_delayed_refs) { - btrfs_async_run_delayed_refs(info, cur, transid, - must_run_delayed_refs == 1); - } return err; } @@ -1338,7 +1329,7 @@ static int qgroup_account_snapshot(struct btrfs_trans_handle *trans, return 0; /* - * Ensure dirty @src will be commited. Or, after comming + * Ensure dirty @src will be committed. Or, after coming * commit_fs_roots() and switch_commit_roots(), any dirty but not * recorded root will never be updated again, causing an outdated root * item. @@ -1842,7 +1833,6 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans, int err) { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_transaction *cur_trans = trans->transaction; - DEFINE_WAIT(wait); WARN_ON(refcount_read(&trans->use_count) > 1); @@ -1911,13 +1901,6 @@ static inline void btrfs_wait_delalloc_flush(struct btrfs_fs_info *fs_info) btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1); } -static inline void -btrfs_wait_pending_ordered(struct btrfs_transaction *cur_trans) -{ - wait_event(cur_trans->pending_wait, - atomic_read(&cur_trans->pending_ordered) == 0); -} - int btrfs_commit_transaction(struct btrfs_trans_handle *trans) { struct btrfs_fs_info *fs_info = trans->fs_info; @@ -2052,8 +2035,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) btrfs_wait_delalloc_flush(fs_info); - btrfs_wait_pending_ordered(cur_trans); - btrfs_scrub_pause(fs_info); /* * Ok now we need to make sure to block out any other joins while we diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h @@ -12,13 +12,13 @@ #include "ctree.h" enum btrfs_trans_state { - TRANS_STATE_RUNNING = 0, - TRANS_STATE_BLOCKED = 1, - TRANS_STATE_COMMIT_START = 2, - TRANS_STATE_COMMIT_DOING = 3, - TRANS_STATE_UNBLOCKED = 4, - TRANS_STATE_COMPLETED = 5, - TRANS_STATE_MAX = 6, + TRANS_STATE_RUNNING, + TRANS_STATE_BLOCKED, + TRANS_STATE_COMMIT_START, + TRANS_STATE_COMMIT_DOING, + TRANS_STATE_UNBLOCKED, + TRANS_STATE_COMPLETED, + TRANS_STATE_MAX, }; #define BTRFS_TRANS_HAVE_FREE_BGS 0 @@ -39,7 +39,6 @@ struct btrfs_transaction { */ atomic_t num_writers; refcount_t use_count; - atomic_t pending_ordered; unsigned long flags; @@ -51,7 +50,6 @@ struct btrfs_transaction { time64_t start_time; wait_queue_head_t writer_wait; wait_queue_head_t commit_wait; - wait_queue_head_t pending_wait; struct list_head pending_snapshots; struct list_head pending_chunks; struct list_head switch_commits; diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c @@ -27,10 +27,10 @@ * * @type: leaf or node * @identifier: the necessary info to locate the leaf/node. - * It's recommened to decode key.objecitd/offset if it's + * It's recommended to decode key.objecitd/offset if it's * meaningful. * @reason: describe the error - * @bad_value: optional, it's recommened to output bad value and its + * @bad_value: optional, it's recommended to output bad value and its * expected value (range). * * Since comma is used to separate the components, only space is allowed @@ -130,7 +130,7 @@ static int check_extent_data_item(struct btrfs_fs_info *fs_info, } /* - * Support for new compression/encrption must introduce incompat flag, + * Support for new compression/encryption must introduce incompat flag, * and must be caught in open_ctree(). */ if (btrfs_file_extent_compression(leaf, fi) > BTRFS_COMPRESS_TYPES) { diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c @@ -1144,7 +1144,7 @@ next: } btrfs_release_path(path); - /* look for a conflicing name */ + /* look for a conflicting name */ di = btrfs_lookup_dir_item(trans, root, path, btrfs_ino(dir), name, namelen, 0); if (di && !IS_ERR(di)) { @@ -3149,7 +3149,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, mutex_unlock(&log_root_tree->log_mutex); /* - * nobody else is going to jump in and write the the ctree + * Nobody else is going to jump in and write the ctree * super here because the log_commit atomic below is protecting * us. We must be called with a transaction handle pinning * the running transaction open, so a full commit can't hop @@ -3201,8 +3201,6 @@ static void free_log_tree(struct btrfs_trans_handle *trans, struct btrfs_root *log) { int ret; - u64 start; - u64 end; struct walk_control wc = { .free = 1, .process_func = process_one_buffer @@ -3216,18 +3214,8 @@ static void free_log_tree(struct btrfs_trans_handle *trans, btrfs_handle_fs_error(log->fs_info, ret, NULL); } - while (1) { - ret = find_first_extent_bit(&log->dirty_log_pages, - 0, &start, &end, - EXTENT_DIRTY | EXTENT_NEW | EXTENT_NEED_WAIT, - NULL); - if (ret) - break; - - clear_extent_bits(&log->dirty_log_pages, start, end, - EXTENT_DIRTY | EXTENT_NEW | EXTENT_NEED_WAIT); - } - + clear_extent_bits(&log->dirty_log_pages, 0, (u64)-1, + EXTENT_DIRTY | EXTENT_NEW | EXTENT_NEED_WAIT); free_extent_buffer(log->node); kfree(log); } @@ -4383,7 +4371,6 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, struct extent_map *em, *n; struct list_head extents; struct extent_map_tree *tree = &inode->extent_tree; - u64 logged_start, logged_end; u64 test_gen; int ret = 0; int num = 0; @@ -4392,8 +4379,6 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, write_lock(&tree->lock); test_gen = root->fs_info->last_trans_committed; - logged_start = start; - logged_end = end; list_for_each_entry_safe(em, n, &tree->modified_extents, list) { /* @@ -4434,11 +4419,6 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, em->start >= i_size_read(&inode->vfs_inode)) continue; - if (em->start < logged_start) - logged_start = em->start; - if ((em->start + em->len - 1) > logged_end) - logged_end = em->start + em->len - 1; - /* Need a ref to keep it from getting evicted from cache */ refcount_inc(&em->refs); set_bit(EXTENT_FLAG_LOGGING, &em->flags); @@ -5778,6 +5758,22 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, goto end_trans; } + /* + * If a new hard link was added to the inode in the current transaction + * and its link count is now greater than 1, we need to fallback to a + * transaction commit, otherwise we can end up not logging all its new + * parents for all the hard links. Here just from the dentry used to + * fsync, we can not visit the ancestor inodes for all the other hard + * links to figure out if any is new, so we fallback to a transaction + * commit (instead of adding a lot of complexity of scanning a btree, + * since this scenario is not a common use case). + */ + if (inode->vfs_inode.i_nlink > 1 && + inode->last_link_trans > last_committed) { + ret = -EMLINK; + goto end_trans; + } + while (1) { if (!parent || d_really_is_negative(parent) || sb != parent->d_sb) break; diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h @@ -15,7 +15,6 @@ struct btrfs_log_ctx { int log_ret; int log_transid; - int io_err; bool log_new_dentries; struct inode *inode; struct list_head list; @@ -26,7 +25,6 @@ static inline void btrfs_init_log_ctx(struct btrfs_log_ctx *ctx, { ctx->log_ret = 0; ctx->log_transid = 0; - ctx->io_err = 0; ctx->log_new_dentries = false; ctx->inode = inode; INIT_LIST_HEAD(&ctx->list); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c @@ -37,6 +37,7 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { .tolerated_failures = 1, .devs_increment = 2, .ncopies = 2, + .nparity = 0, .raid_name = "raid10", .bg_flag = BTRFS_BLOCK_GROUP_RAID10, .mindev_error = BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET, @@ -49,6 +50,7 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { .tolerated_failures = 1, .devs_increment = 2, .ncopies = 2, + .nparity = 0, .raid_name = "raid1", .bg_flag = BTRFS_BLOCK_GROUP_RAID1, .mindev_error = BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET, @@ -61,6 +63,7 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { .tolerated_failures = 0, .devs_increment = 1, .ncopies = 2, + .nparity = 0, .raid_name = "dup", .bg_flag = BTRFS_BLOCK_GROUP_DUP, .mindev_error = 0, @@ -73,6 +76,7 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { .tolerated_failures = 0, .devs_increment = 1, .ncopies = 1, + .nparity = 0, .raid_name = "raid0", .bg_flag = BTRFS_BLOCK_GROUP_RAID0, .mindev_error = 0, @@ -85,6 +89,7 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { .tolerated_failures = 0, .devs_increment = 1, .ncopies = 1, + .nparity = 0, .raid_name = "single", .bg_flag = 0, .mindev_error = 0, @@ -96,7 +101,8 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { .devs_min = 2, .tolerated_failures = 1, .devs_increment = 1, - .ncopies = 2, + .ncopies = 1, + .nparity = 1, .raid_name = "raid5", .bg_flag = BTRFS_BLOCK_GROUP_RAID5, .mindev_error = BTRFS_ERROR_DEV_RAID5_MIN_NOT_MET, @@ -108,7 +114,8 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { .devs_min = 3, .tolerated_failures = 2, .devs_increment = 1, - .ncopies = 3, + .ncopies = 1, + .nparity = 2, .raid_name = "raid6", .bg_flag = BTRFS_BLOCK_GROUP_RAID6, .mindev_error = BTRFS_ERROR_DEV_RAID6_MIN_NOT_MET, @@ -123,6 +130,60 @@ const char *get_raid_name(enum btrfs_raid_types type) return btrfs_raid_array[type].raid_name; } +/* + * Fill @buf with textual description of @bg_flags, no more than @size_buf + * bytes including terminating null byte. + */ +void btrfs_describe_block_groups(u64 bg_flags, char *buf, u32 size_buf) +{ + int i; + int ret; + char *bp = buf; + u64 flags = bg_flags; + u32 size_bp = size_buf; + + if (!flags) { + strcpy(bp, "NONE"); + return; + } + +#define DESCRIBE_FLAG(flag, desc) \ + do { \ + if (flags & (flag)) { \ + ret = snprintf(bp, size_bp, "%s|", (desc)); \ + if (ret < 0 || ret >= size_bp) \ + goto out_overflow; \ + size_bp -= ret; \ + bp += ret; \ + flags &= ~(flag); \ + } \ + } while (0) + + DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_DATA, "data"); + DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_SYSTEM, "system"); + DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_METADATA, "metadata"); + + DESCRIBE_FLAG(BTRFS_AVAIL_ALLOC_BIT_SINGLE, "single"); + for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) + DESCRIBE_FLAG(btrfs_raid_array[i].bg_flag, + btrfs_raid_array[i].raid_name); +#undef DESCRIBE_FLAG + + if (flags) { + ret = snprintf(bp, size_bp, "0x%llx|", flags); + size_bp -= ret; + } + + if (size_bp < size_buf) + buf[size_buf - size_bp - 1] = '\0'; /* remove last | */ + + /* + * The text is trimmed, it's up to the caller to provide sufficiently + * large buffer + */ +out_overflow:; +} + static int init_first_rw_device(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info); static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info); @@ -151,7 +212,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, * the mutex can be very coarse and can cover long-running operations * * protects: updates to fs_devices counters like missing devices, rw devices, - * seeding, structure cloning, openning/closing devices at mount/umount time + * seeding, structure cloning, opening/closing devices at mount/umount time * * global::fs_devs - add, remove, updates to the global list * @@ -238,13 +299,15 @@ struct list_head *btrfs_get_fs_uuids(void) /* * alloc_fs_devices - allocate struct btrfs_fs_devices - * @fsid: if not NULL, copy the uuid to fs_devices::fsid + * @fsid: if not NULL, copy the UUID to fs_devices::fsid + * @metadata_fsid: if not NULL, copy the UUID to fs_devices::metadata_fsid * * Return a pointer to a new struct btrfs_fs_devices on success, or ERR_PTR(). * The returned struct is not linked onto any lists and can be destroyed with * kfree() right away. */ -static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid) +static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid, + const u8 *metadata_fsid) { struct btrfs_fs_devices *fs_devs; @@ -261,6 +324,11 @@ static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid) if (fsid) memcpy(fs_devs->fsid, fsid, BTRFS_FSID_SIZE); + if (metadata_fsid) + memcpy(fs_devs->metadata_uuid, metadata_fsid, BTRFS_FSID_SIZE); + else if (fsid) + memcpy(fs_devs->metadata_uuid, fsid, BTRFS_FSID_SIZE); + return fs_devs; } @@ -368,13 +436,57 @@ static struct btrfs_device *find_device(struct btrfs_fs_devices *fs_devices, return NULL; } -static noinline struct btrfs_fs_devices *find_fsid(u8 *fsid) +static noinline struct btrfs_fs_devices *find_fsid( + const u8 *fsid, const u8 *metadata_fsid) { struct btrfs_fs_devices *fs_devices; + ASSERT(fsid); + + if (metadata_fsid) { + /* + * Handle scanned device having completed its fsid change but + * belonging to a fs_devices that was created by first scanning + * a device which didn't have its fsid/metadata_uuid changed + * at all and the CHANGING_FSID_V2 flag set. + */ + list_for_each_entry(fs_devices, &fs_uuids, fs_list) { + if (fs_devices->fsid_change && + memcmp(metadata_fsid, fs_devices->fsid, + BTRFS_FSID_SIZE) == 0 && + memcmp(fs_devices->fsid, fs_devices->metadata_uuid, + BTRFS_FSID_SIZE) == 0) { + return fs_devices; + } + } + /* + * Handle scanned device having completed its fsid change but + * belonging to a fs_devices that was created by a device that + * has an outdated pair of fsid/metadata_uuid and + * CHANGING_FSID_V2 flag set. + */ + list_for_each_entry(fs_devices, &fs_uuids, fs_list) { + if (fs_devices->fsid_change && + memcmp(fs_devices->metadata_uuid, + fs_devices->fsid, BTRFS_FSID_SIZE) != 0 && + memcmp(metadata_fsid, fs_devices->metadata_uuid, + BTRFS_FSID_SIZE) == 0) { + return fs_devices; + } + } + } + + /* Handle non-split brain cases */ list_for_each_entry(fs_devices, &fs_uuids, fs_list) { - if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0) - return fs_devices; + if (metadata_fsid) { + if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0 + && memcmp(metadata_fsid, fs_devices->metadata_uuid, + BTRFS_FSID_SIZE) == 0) + return fs_devices; + } else { + if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0) + return fs_devices; + } } return NULL; } @@ -709,6 +821,13 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices, device->generation = btrfs_super_generation(disk_super); if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) { + if (btrfs_super_incompat_flags(disk_super) & + BTRFS_FEATURE_INCOMPAT_METADATA_UUID) { + pr_err( + "BTRFS: Invalid seeding and uuid-changed device detected\n"); + goto error_brelse; + } + clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); fs_devices->seeding = 1; } else { @@ -744,6 +863,51 @@ error_brelse: } /* + * Handle scanned device having its CHANGING_FSID_V2 flag set and the fs_devices + * being created with a disk that has already completed its fsid change. + */ +static struct btrfs_fs_devices *find_fsid_inprogress( + struct btrfs_super_block *disk_super) +{ + struct btrfs_fs_devices *fs_devices; + + list_for_each_entry(fs_devices, &fs_uuids, fs_list) { + if (memcmp(fs_devices->metadata_uuid, fs_devices->fsid, + BTRFS_FSID_SIZE) != 0 && + memcmp(fs_devices->metadata_uuid, disk_super->fsid, + BTRFS_FSID_SIZE) == 0 && !fs_devices->fsid_change) { + return fs_devices; + } + } + + return NULL; +} + + +static struct btrfs_fs_devices *find_fsid_changed( + struct btrfs_super_block *disk_super) +{ + struct btrfs_fs_devices *fs_devices; + + /* + * Handles the case where scanned device is part of an fs that had + * multiple successful changes of FSID but curently device didn't + * observe it. Meaning our fsid will be different than theirs. + */ + list_for_each_entry(fs_devices, &fs_uuids, fs_list) { + if (memcmp(fs_devices->metadata_uuid, fs_devices->fsid, + BTRFS_FSID_SIZE) != 0 && + memcmp(fs_devices->metadata_uuid, disk_super->metadata_uuid, + BTRFS_FSID_SIZE) == 0 && + memcmp(fs_devices->fsid, disk_super->fsid, + BTRFS_FSID_SIZE) != 0) { + return fs_devices; + } + } + + return NULL; +} +/* * Add new device to list of registered devices * * Returns: @@ -755,14 +919,46 @@ static noinline struct btrfs_device *device_list_add(const char *path, bool *new_device_added) { struct btrfs_device *device; - struct btrfs_fs_devices *fs_devices; + struct btrfs_fs_devices *fs_devices = NULL; struct rcu_string *name; u64 found_transid = btrfs_super_generation(disk_super); u64 devid = btrfs_stack_device_id(&disk_super->dev_item); + bool has_metadata_uuid = (btrfs_super_incompat_flags(disk_super) & + BTRFS_FEATURE_INCOMPAT_METADATA_UUID); + bool fsid_change_in_progress = (btrfs_super_flags(disk_super) & + BTRFS_SUPER_FLAG_CHANGING_FSID_V2); + + if (fsid_change_in_progress) { + if (!has_metadata_uuid) { + /* + * When we have an image which has CHANGING_FSID_V2 set + * it might belong to either a filesystem which has + * disks with completed fsid change or it might belong + * to fs with no UUID changes in effect, handle both. + */ + fs_devices = find_fsid_inprogress(disk_super); + if (!fs_devices) + fs_devices = find_fsid(disk_super->fsid, NULL); + } else { + fs_devices = find_fsid_changed(disk_super); + } + } else if (has_metadata_uuid) { + fs_devices = find_fsid(disk_super->fsid, + disk_super->metadata_uuid); + } else { + fs_devices = find_fsid(disk_super->fsid, NULL); + } + - fs_devices = find_fsid(disk_super->fsid); if (!fs_devices) { - fs_devices = alloc_fs_devices(disk_super->fsid); + if (has_metadata_uuid) + fs_devices = alloc_fs_devices(disk_super->fsid, + disk_super->metadata_uuid); + else + fs_devices = alloc_fs_devices(disk_super->fsid, NULL); + + fs_devices->fsid_change = fsid_change_in_progress; + if (IS_ERR(fs_devices)) return ERR_CAST(fs_devices); @@ -774,6 +970,21 @@ static noinline struct btrfs_device *device_list_add(const char *path, mutex_lock(&fs_devices->device_list_mutex); device = find_device(fs_devices, devid, disk_super->dev_item.uuid); + + /* + * If this disk has been pulled into an fs devices created by + * a device which had the CHANGING_FSID_V2 flag then replace the + * metadata_uuid/fsid values of the fs_devices. + */ + if (has_metadata_uuid && fs_devices->fsid_change && + found_transid > fs_devices->latest_generation) { + memcpy(fs_devices->fsid, disk_super->fsid, + BTRFS_FSID_SIZE); + memcpy(fs_devices->metadata_uuid, + disk_super->metadata_uuid, BTRFS_FSID_SIZE); + + fs_devices->fsid_change = false; + } } if (!device) { @@ -850,6 +1061,35 @@ static noinline struct btrfs_device *device_list_add(const char *path, return ERR_PTR(-EEXIST); } + /* + * We are going to replace the device path for a given devid, + * make sure it's the same device if the device is mounted + */ + if (device->bdev) { + struct block_device *path_bdev; + + path_bdev = lookup_bdev(path); + if (IS_ERR(path_bdev)) { + mutex_unlock(&fs_devices->device_list_mutex); + return ERR_CAST(path_bdev); + } + + if (device->bdev != path_bdev) { + bdput(path_bdev); + mutex_unlock(&fs_devices->device_list_mutex); + btrfs_warn_in_rcu(device->fs_info, + "duplicate device fsid:devid for %pU:%llu old:%s new:%s", + disk_super->fsid, devid, + rcu_str_deref(device->name), path); + return ERR_PTR(-EEXIST); + } + bdput(path_bdev); + btrfs_info_in_rcu(device->fs_info, + "device fsid %pU devid %llu moved old:%s new:%s", + disk_super->fsid, devid, + rcu_str_deref(device->name), path); + } + name = rcu_string_strdup(path, GFP_NOFS); if (!name) { mutex_unlock(&fs_devices->device_list_mutex); @@ -869,8 +1109,11 @@ static noinline struct btrfs_device *device_list_add(const char *path, * it back. We need it to pick the disk with largest generation * (as above). */ - if (!fs_devices->opened) + if (!fs_devices->opened) { device->generation = found_transid; + fs_devices->latest_generation = max_t(u64, found_transid, + fs_devices->latest_generation); + } fs_devices->total_devices = btrfs_super_num_devices(disk_super); @@ -884,7 +1127,7 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig) struct btrfs_device *device; struct btrfs_device *orig_dev; - fs_devices = alloc_fs_devices(orig->fsid); + fs_devices = alloc_fs_devices(orig->fsid, NULL); if (IS_ERR(fs_devices)) return fs_devices; @@ -1193,7 +1436,7 @@ static int btrfs_read_disk_super(struct block_device *bdev, u64 bytenr, p = kmap(*page); /* align our pointer to the offset of the super block */ - *disk_super = p + (bytenr & ~PAGE_MASK); + *disk_super = p + offset_in_page(bytenr); if (btrfs_super_bytenr(*disk_super) != bytenr || btrfs_super_magic(*disk_super) != BTRFS_MAGIC) { @@ -1709,7 +1952,8 @@ static int btrfs_add_dev_item(struct btrfs_trans_handle *trans, ptr = btrfs_device_uuid(dev_item); write_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE); ptr = btrfs_device_fsid(dev_item); - write_extent_buffer(leaf, trans->fs_info->fsid, ptr, BTRFS_FSID_SIZE); + write_extent_buffer(leaf, trans->fs_info->fs_devices->metadata_uuid, + ptr, BTRFS_FSID_SIZE); btrfs_mark_buffer_dirty(leaf); ret = 0; @@ -1862,12 +2106,12 @@ static u64 btrfs_num_devices(struct btrfs_fs_info *fs_info) { u64 num_devices = fs_info->fs_devices->num_devices; - btrfs_dev_replace_read_lock(&fs_info->dev_replace); + down_read(&fs_info->dev_replace.rwsem); if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) { ASSERT(num_devices > 1); num_devices--; } - btrfs_dev_replace_read_unlock(&fs_info->dev_replace); + up_read(&fs_info->dev_replace.rwsem); return num_devices; } @@ -1900,6 +2144,14 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path, goto out; } + if (btrfs_pinned_by_swapfile(fs_info, device)) { + btrfs_warn_in_rcu(fs_info, + "cannot remove device %s (devid %llu) due to active swapfile", + rcu_str_deref(device->name), device->devid); + ret = -ETXTBSY; + goto out; + } + if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) { ret = BTRFS_ERROR_DEV_TGT_REPLACE; goto out; @@ -2132,7 +2384,13 @@ static struct btrfs_device *btrfs_find_device_by_path( disk_super = (struct btrfs_super_block *)bh->b_data; devid = btrfs_stack_device_id(&disk_super->dev_item); dev_uuid = disk_super->dev_item.uuid; - device = btrfs_find_device(fs_info, devid, dev_uuid, disk_super->fsid); + if (btrfs_fs_incompat(fs_info, METADATA_UUID)) + device = btrfs_find_device(fs_info, devid, dev_uuid, + disk_super->metadata_uuid); + else + device = btrfs_find_device(fs_info, devid, + dev_uuid, disk_super->fsid); + brelse(bh); if (!device) device = ERR_PTR(-ENOENT); @@ -2202,7 +2460,7 @@ static int btrfs_prepare_sprout(struct btrfs_fs_info *fs_info) if (!fs_devices->seeding) return -EINVAL; - seed_devices = alloc_fs_devices(NULL); + seed_devices = alloc_fs_devices(NULL, NULL); if (IS_ERR(seed_devices)) return PTR_ERR(seed_devices); @@ -2238,7 +2496,7 @@ static int btrfs_prepare_sprout(struct btrfs_fs_info *fs_info) fs_devices->seed = seed_devices; generate_random_uuid(fs_devices->fsid); - memcpy(fs_info->fsid, fs_devices->fsid, BTRFS_FSID_SIZE); + memcpy(fs_devices->metadata_uuid, fs_devices->fsid, BTRFS_FSID_SIZE); memcpy(disk_super->fsid, fs_devices->fsid, BTRFS_FSID_SIZE); mutex_unlock(&fs_devices->device_list_mutex); @@ -2480,7 +2738,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path * so rename the fsid on the sysfs */ snprintf(fsid_buf, BTRFS_UUID_UNPARSED_SIZE, "%pU", - fs_info->fsid); + fs_info->fs_devices->fsid); if (kobject_rename(&fs_devices->fsid_kobj, fsid_buf)) btrfs_warn(fs_info, "sysfs: failed to create fsid for sprout"); @@ -2718,8 +2976,15 @@ static int btrfs_del_sys_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset) return ret; } -static struct extent_map *get_chunk_map(struct btrfs_fs_info *fs_info, - u64 logical, u64 length) +/* + * btrfs_get_chunk_map() - Find the mapping containing the given logical extent. + * @logical: Logical block offset in bytes. + * @length: Length of extent in bytes. + * + * Return: Chunk mapping or ERR_PTR. + */ +struct extent_map *btrfs_get_chunk_map(struct btrfs_fs_info *fs_info, + u64 logical, u64 length) { struct extent_map_tree *em_tree; struct extent_map *em; @@ -2756,7 +3021,7 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset) int i, ret = 0; struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; - em = get_chunk_map(fs_info, chunk_offset, 1); + em = btrfs_get_chunk_map(fs_info, chunk_offset, 1); if (IS_ERR(em)) { /* * This is a logic error, but we don't want to just rely on the @@ -2797,13 +3062,11 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset) mutex_unlock(&fs_info->chunk_mutex); } - if (map->stripes[i].dev) { - ret = btrfs_update_device(trans, map->stripes[i].dev); - if (ret) { - mutex_unlock(&fs_devices->device_list_mutex); - btrfs_abort_transaction(trans, ret); - goto out; - } + ret = btrfs_update_device(trans, device); + if (ret) { + mutex_unlock(&fs_devices->device_list_mutex); + btrfs_abort_transaction(trans, ret); + goto out; } } mutex_unlock(&fs_devices->device_list_mutex); @@ -3437,17 +3700,11 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info) { struct btrfs_balance_control *bctl = fs_info->balance_ctl; struct btrfs_root *chunk_root = fs_info->chunk_root; - struct btrfs_root *dev_root = fs_info->dev_root; - struct list_head *devices; - struct btrfs_device *device; - u64 old_size; - u64 size_to_free; u64 chunk_type; struct btrfs_chunk *chunk; struct btrfs_path *path = NULL; struct btrfs_key key; struct btrfs_key found_key; - struct btrfs_trans_handle *trans; struct extent_buffer *leaf; int slot; int ret; @@ -3462,53 +3719,6 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info) u32 count_sys = 0; int chunk_reserved = 0; - /* step one make some room on all the devices */ - devices = &fs_info->fs_devices->devices; - list_for_each_entry(device, devices, dev_list) { - old_size = btrfs_device_get_total_bytes(device); - size_to_free = div_factor(old_size, 1); - size_to_free = min_t(u64, size_to_free, SZ_1M); - if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) || - btrfs_device_get_total_bytes(device) - - btrfs_device_get_bytes_used(device) > size_to_free || - test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) - continue; - - ret = btrfs_shrink_device(device, old_size - size_to_free); - if (ret == -ENOSPC) - break; - if (ret) { - /* btrfs_shrink_device never returns ret > 0 */ - WARN_ON(ret > 0); - goto error; - } - - trans = btrfs_start_transaction(dev_root, 0); - if (IS_ERR(trans)) { - ret = PTR_ERR(trans); - btrfs_info_in_rcu(fs_info, - "resize: unable to start transaction after shrinking device %s (error %d), old size %llu, new size %llu", - rcu_str_deref(device->name), ret, - old_size, old_size - size_to_free); - goto error; - } - - ret = btrfs_grow_device(trans, device, old_size); - if (ret) { - btrfs_end_transaction(trans); - /* btrfs_grow_device never returns ret > 0 */ - WARN_ON(ret > 0); - btrfs_info_in_rcu(fs_info, - "resize: unable to grow device after shrinking device %s (error %d), old size %llu, new size %llu", - rcu_str_deref(device->name), ret, - old_size, old_size - size_to_free); - goto error; - } - - btrfs_end_transaction(trans); - } - - /* step two, relocate all the chunks */ path = btrfs_alloc_path(); if (!path) { ret = -ENOMEM; @@ -3638,10 +3848,15 @@ again: ret = btrfs_relocate_chunk(fs_info, found_key.offset); mutex_unlock(&fs_info->delete_unused_bgs_mutex); - if (ret && ret != -ENOSPC) - goto error; if (ret == -ENOSPC) { enospc_errors++; + } else if (ret == -ETXTBSY) { + btrfs_info(fs_info, + "skipping relocation of block group %llu due to active swapfile", + found_key.offset); + ret = 0; + } else if (ret) { + goto error; } else { spin_lock(&fs_info->balance_lock); bctl->stat.completed++; @@ -3712,6 +3927,162 @@ static inline int validate_convert_profile(struct btrfs_balance_args *bctl_arg, } /* + * Fill @buf with textual description of balance filter flags @bargs, up to + * @size_buf including the terminating null. The output may be trimmed if it + * does not fit into the provided buffer. + */ +static void describe_balance_args(struct btrfs_balance_args *bargs, char *buf, + u32 size_buf) +{ + int ret; + u32 size_bp = size_buf; + char *bp = buf; + u64 flags = bargs->flags; + char tmp_buf[128] = {'\0'}; + + if (!flags) + return; + +#define CHECK_APPEND_NOARG(a) \ + do { \ + ret = snprintf(bp, size_bp, (a)); \ + if (ret < 0 || ret >= size_bp) \ + goto out_overflow; \ + size_bp -= ret; \ + bp += ret; \ + } while (0) + +#define CHECK_APPEND_1ARG(a, v1) \ + do { \ + ret = snprintf(bp, size_bp, (a), (v1)); \ + if (ret < 0 || ret >= size_bp) \ + goto out_overflow; \ + size_bp -= ret; \ + bp += ret; \ + } while (0) + +#define CHECK_APPEND_2ARG(a, v1, v2) \ + do { \ + ret = snprintf(bp, size_bp, (a), (v1), (v2)); \ + if (ret < 0 || ret >= size_bp) \ + goto out_overflow; \ + size_bp -= ret; \ + bp += ret; \ + } while (0) + + if (flags & BTRFS_BALANCE_ARGS_CONVERT) { + int index = btrfs_bg_flags_to_raid_index(bargs->target); + + CHECK_APPEND_1ARG("convert=%s,", get_raid_name(index)); + } + + if (flags & BTRFS_BALANCE_ARGS_SOFT) + CHECK_APPEND_NOARG("soft,"); + + if (flags & BTRFS_BALANCE_ARGS_PROFILES) { + btrfs_describe_block_groups(bargs->profiles, tmp_buf, + sizeof(tmp_buf)); + CHECK_APPEND_1ARG("profiles=%s,", tmp_buf); + } + + if (flags & BTRFS_BALANCE_ARGS_USAGE) + CHECK_APPEND_1ARG("usage=%llu,", bargs->usage); + + if (flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) + CHECK_APPEND_2ARG("usage=%u..%u,", + bargs->usage_min, bargs->usage_max); + + if (flags & BTRFS_BALANCE_ARGS_DEVID) + CHECK_APPEND_1ARG("devid=%llu,", bargs->devid); + + if (flags & BTRFS_BALANCE_ARGS_DRANGE) + CHECK_APPEND_2ARG("drange=%llu..%llu,", + bargs->pstart, bargs->pend); + + if (flags & BTRFS_BALANCE_ARGS_VRANGE) + CHECK_APPEND_2ARG("vrange=%llu..%llu,", + bargs->vstart, bargs->vend); + + if (flags & BTRFS_BALANCE_ARGS_LIMIT) + CHECK_APPEND_1ARG("limit=%llu,", bargs->limit); + + if (flags & BTRFS_BALANCE_ARGS_LIMIT_RANGE) + CHECK_APPEND_2ARG("limit=%u..%u,", + bargs->limit_min, bargs->limit_max); + + if (flags & BTRFS_BALANCE_ARGS_STRIPES_RANGE) + CHECK_APPEND_2ARG("stripes=%u..%u,", + bargs->stripes_min, bargs->stripes_max); + +#undef CHECK_APPEND_2ARG +#undef CHECK_APPEND_1ARG +#undef CHECK_APPEND_NOARG + +out_overflow: + + if (size_bp < size_buf) + buf[size_buf - size_bp - 1] = '\0'; /* remove last , */ + else + buf[0] = '\0'; +} + +static void describe_balance_start_or_resume(struct btrfs_fs_info *fs_info) +{ + u32 size_buf = 1024; + char tmp_buf[192] = {'\0'}; + char *buf; + char *bp; + u32 size_bp = size_buf; + int ret; + struct btrfs_balance_control *bctl = fs_info->balance_ctl; + + buf = kzalloc(size_buf, GFP_KERNEL); + if (!buf) + return; + + bp = buf; + +#define CHECK_APPEND_1ARG(a, v1) \ + do { \ + ret = snprintf(bp, size_bp, (a), (v1)); \ + if (ret < 0 || ret >= size_bp) \ + goto out_overflow; \ + size_bp -= ret; \ + bp += ret; \ + } while (0) + + if (bctl->flags & BTRFS_BALANCE_FORCE) + CHECK_APPEND_1ARG("%s", "-f "); + + if (bctl->flags & BTRFS_BALANCE_DATA) { + describe_balance_args(&bctl->data, tmp_buf, sizeof(tmp_buf)); + CHECK_APPEND_1ARG("-d%s ", tmp_buf); + } + + if (bctl->flags & BTRFS_BALANCE_METADATA) { + describe_balance_args(&bctl->meta, tmp_buf, sizeof(tmp_buf)); + CHECK_APPEND_1ARG("-m%s ", tmp_buf); + } + + if (bctl->flags & BTRFS_BALANCE_SYSTEM) { + describe_balance_args(&bctl->sys, tmp_buf, sizeof(tmp_buf)); + CHECK_APPEND_1ARG("-s%s ", tmp_buf); + } + +#undef CHECK_APPEND_1ARG + +out_overflow: + + if (size_bp < size_buf) + buf[size_buf - size_bp - 1] = '\0'; /* remove last " " */ + btrfs_info(fs_info, "balance: %s %s", + (bctl->flags & BTRFS_BALANCE_RESUME) ? + "resume" : "start", buf); + + kfree(buf); +} + +/* * Should be called with balance mutexe held */ int btrfs_balance(struct btrfs_fs_info *fs_info, @@ -3724,6 +4095,7 @@ int btrfs_balance(struct btrfs_fs_info *fs_info, int ret; u64 num_devices; unsigned seq; + bool reducing_integrity; if (btrfs_fs_closing(fs_info) || atomic_read(&fs_info->balance_pause_req) || @@ -3803,24 +4175,30 @@ int btrfs_balance(struct btrfs_fs_info *fs_info, !(bctl->sys.target & allowed)) || ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) && (fs_info->avail_metadata_alloc_bits & allowed) && - !(bctl->meta.target & allowed))) { - if (bctl->flags & BTRFS_BALANCE_FORCE) { - btrfs_info(fs_info, - "balance: force reducing metadata integrity"); - } else { - btrfs_err(fs_info, - "balance: reduces metadata integrity, use --force if you want this"); - ret = -EINVAL; - goto out; - } - } + !(bctl->meta.target & allowed))) + reducing_integrity = true; + else + reducing_integrity = false; + + /* if we're not converting, the target field is uninitialized */ + meta_target = (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) ? + bctl->meta.target : fs_info->avail_metadata_alloc_bits; + data_target = (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) ? + bctl->data.target : fs_info->avail_data_alloc_bits; } while (read_seqretry(&fs_info->profiles_lock, seq)); - /* if we're not converting, the target field is uninitialized */ - meta_target = (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) ? - bctl->meta.target : fs_info->avail_metadata_alloc_bits; - data_target = (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) ? - bctl->data.target : fs_info->avail_data_alloc_bits; + if (reducing_integrity) { + if (bctl->flags & BTRFS_BALANCE_FORCE) { + btrfs_info(fs_info, + "balance: force reducing metadata integrity"); + } else { + btrfs_err(fs_info, + "balance: reduces metadata integrity, use --force if you want this"); + ret = -EINVAL; + goto out; + } + } + if (btrfs_get_num_tolerated_disk_barrier_failures(meta_target) < btrfs_get_num_tolerated_disk_barrier_failures(data_target)) { int meta_index = btrfs_bg_flags_to_raid_index(meta_target); @@ -3850,11 +4228,19 @@ int btrfs_balance(struct btrfs_fs_info *fs_info, ASSERT(!test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)); set_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags); + describe_balance_start_or_resume(fs_info); mutex_unlock(&fs_info->balance_mutex); ret = __btrfs_balance(fs_info); mutex_lock(&fs_info->balance_mutex); + if (ret == -ECANCELED && atomic_read(&fs_info->balance_pause_req)) + btrfs_info(fs_info, "balance: paused"); + else if (ret == -ECANCELED && atomic_read(&fs_info->balance_cancel_req)) + btrfs_info(fs_info, "balance: canceled"); + else + btrfs_info(fs_info, "balance: ended with status: %d", ret); + clear_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags); if (bargs) { @@ -3887,10 +4273,8 @@ static int balance_kthread(void *data) int ret = 0; mutex_lock(&fs_info->balance_mutex); - if (fs_info->balance_ctl) { - btrfs_info(fs_info, "balance: resuming"); + if (fs_info->balance_ctl) ret = btrfs_balance(fs_info, fs_info->balance_ctl, NULL); - } mutex_unlock(&fs_info->balance_mutex); return ret; @@ -4433,10 +4817,16 @@ again: ret = btrfs_relocate_chunk(fs_info, chunk_offset); mutex_unlock(&fs_info->delete_unused_bgs_mutex); - if (ret && ret != -ENOSPC) - goto done; - if (ret == -ENOSPC) + if (ret == -ENOSPC) { failed++; + } else if (ret) { + if (ret == -ETXTBSY) { + btrfs_warn(fs_info, + "could not shrink block group %llu due to active swapfile", + chunk_offset); + } + goto done; + } } while (key.offset-- > 0); if (failed && !retried) { @@ -4602,11 +4992,13 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, int devs_min; /* min devs needed */ int devs_increment; /* ndevs has to be a multiple of this */ int ncopies; /* how many copies to data has */ + int nparity; /* number of stripes worth of bytes to + store parity information */ int ret; u64 max_stripe_size; u64 max_chunk_size; u64 stripe_size; - u64 num_bytes; + u64 chunk_size; int ndevs; int i; int j; @@ -4628,6 +5020,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, devs_min = btrfs_raid_array[index].devs_min; devs_increment = btrfs_raid_array[index].devs_increment; ncopies = btrfs_raid_array[index].ncopies; + nparity = btrfs_raid_array[index].nparity; if (type & BTRFS_BLOCK_GROUP_DATA) { max_stripe_size = SZ_1G; @@ -4654,7 +5047,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, BUG_ON(1); } - /* we don't want a chunk larger than 10% of writeable space */ + /* We don't want a chunk larger than 10% of writable space */ max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1), max_chunk_size); @@ -4757,30 +5150,22 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, * this will have to be fixed for RAID1 and RAID10 over * more drives */ - data_stripes = num_stripes / ncopies; - - if (type & BTRFS_BLOCK_GROUP_RAID5) - data_stripes = num_stripes - 1; - - if (type & BTRFS_BLOCK_GROUP_RAID6) - data_stripes = num_stripes - 2; + data_stripes = (num_stripes - nparity) / ncopies; /* * Use the number of data stripes to figure out how big this chunk * is really going to be in terms of logical address space, - * and compare that answer with the max chunk size + * and compare that answer with the max chunk size. If it's higher, + * we try to reduce stripe_size. */ if (stripe_size * data_stripes > max_chunk_size) { - stripe_size = div_u64(max_chunk_size, data_stripes); - - /* bump the answer up to a 16MB boundary */ - stripe_size = round_up(stripe_size, SZ_16M); - /* - * But don't go higher than the limits we found while searching - * for free extents + * Reduce stripe_size, round it up to a 16MB boundary again and + * then use it, unless it ends up being even bigger than the + * previous value we had already. */ - stripe_size = min(devices_info[ndevs - 1].max_avail, + stripe_size = min(round_up(div_u64(max_chunk_size, + data_stripes), SZ_16M), stripe_size); } @@ -4808,9 +5193,9 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, map->type = type; map->sub_stripes = sub_stripes; - num_bytes = stripe_size * data_stripes; + chunk_size = stripe_size * data_stripes; - trace_btrfs_chunk_alloc(info, map, start, num_bytes); + trace_btrfs_chunk_alloc(info, map, start, chunk_size); em = alloc_extent_map(); if (!em) { @@ -4821,7 +5206,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags); em->map_lookup = map; em->start = start; - em->len = num_bytes; + em->len = chunk_size; em->block_start = 0; em->block_len = em->len; em->orig_block_len = stripe_size; @@ -4839,14 +5224,13 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, refcount_inc(&em->refs); write_unlock(&em_tree->lock); - ret = btrfs_make_block_group(trans, 0, type, start, num_bytes); + ret = btrfs_make_block_group(trans, 0, type, start, chunk_size); if (ret) goto error_del_extent; - for (i = 0; i < map->num_stripes; i++) { - num_bytes = map->stripes[i].dev->bytes_used + stripe_size; - btrfs_device_set_bytes_used(map->stripes[i].dev, num_bytes); - } + for (i = 0; i < map->num_stripes; i++) + btrfs_device_set_bytes_used(map->stripes[i].dev, + map->stripes[i].dev->bytes_used + stripe_size); atomic64_sub(stripe_size * map->num_stripes, &info->free_chunk_space); @@ -4890,7 +5274,7 @@ int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans, int i = 0; int ret = 0; - em = get_chunk_map(fs_info, chunk_offset, chunk_size); + em = btrfs_get_chunk_map(fs_info, chunk_offset, chunk_size); if (IS_ERR(em)) return PTR_ERR(em); @@ -4971,10 +5355,10 @@ out: } /* - * Chunk allocation falls into two parts. The first part does works - * that make the new allocated chunk useable, but not do any operation - * that modifies the chunk tree. The second part does the works that - * require modifying the chunk tree. This division is important for the + * Chunk allocation falls into two parts. The first part does work + * that makes the new allocated chunk usable, but does not do any operation + * that modifies the chunk tree. The second part does the work that + * requires modifying the chunk tree. This division is important for the * bootstrap process of adding storage to a seed btrfs. */ int btrfs_alloc_chunk(struct btrfs_trans_handle *trans, u64 type) @@ -5032,7 +5416,7 @@ int btrfs_chunk_readonly(struct btrfs_fs_info *fs_info, u64 chunk_offset) int miss_ndevs = 0; int i; - em = get_chunk_map(fs_info, chunk_offset, 1); + em = btrfs_get_chunk_map(fs_info, chunk_offset, 1); if (IS_ERR(em)) return 1; @@ -5092,7 +5476,7 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len) struct map_lookup *map; int ret; - em = get_chunk_map(fs_info, logical, len); + em = btrfs_get_chunk_map(fs_info, logical, len); if (IS_ERR(em)) /* * We could return errors for these cases, but that could get @@ -5122,11 +5506,11 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len) ret = 1; free_extent_map(em); - btrfs_dev_replace_read_lock(&fs_info->dev_replace); + down_read(&fs_info->dev_replace.rwsem); if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace) && fs_info->dev_replace.tgtdev) ret++; - btrfs_dev_replace_read_unlock(&fs_info->dev_replace); + up_read(&fs_info->dev_replace.rwsem); return ret; } @@ -5138,7 +5522,7 @@ unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info, struct map_lookup *map; unsigned long len = fs_info->sectorsize; - em = get_chunk_map(fs_info, logical, len); + em = btrfs_get_chunk_map(fs_info, logical, len); if (!WARN_ON(IS_ERR(em))) { map = em->map_lookup; @@ -5155,7 +5539,7 @@ int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info, u64 logical, u64 len) struct map_lookup *map; int ret = 0; - em = get_chunk_map(fs_info, logical, len); + em = btrfs_get_chunk_map(fs_info, logical, len); if(!WARN_ON(IS_ERR(em))) { map = em->map_lookup; @@ -5314,7 +5698,7 @@ static int __btrfs_map_block_for_discard(struct btrfs_fs_info *fs_info, /* discard always return a bbio */ ASSERT(bbio_ret); - em = get_chunk_map(fs_info, logical, length); + em = btrfs_get_chunk_map(fs_info, logical, length); if (IS_ERR(em)) return PTR_ERR(em); @@ -5640,7 +6024,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, return __btrfs_map_block_for_discard(fs_info, logical, *length, bbio_ret); - em = get_chunk_map(fs_info, logical, *length); + em = btrfs_get_chunk_map(fs_info, logical, *length); if (IS_ERR(em)) return PTR_ERR(em); @@ -5699,17 +6083,21 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, *length = em->len - offset; } - /* This is for when we're called from btrfs_merge_bio_hook() and all - it cares about is the length */ + /* + * This is for when we're called from btrfs_bio_fits_in_stripe and all + * it cares about is the length + */ if (!bbio_ret) goto out; - btrfs_dev_replace_read_lock(dev_replace); + down_read(&dev_replace->rwsem); dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace); + /* + * Hold the semaphore for read during the whole operation, write is + * requested at commit time but must wait. + */ if (!dev_replace_is_ongoing) - btrfs_dev_replace_read_unlock(dev_replace); - else - btrfs_dev_replace_set_lock_blocking(dev_replace); + up_read(&dev_replace->rwsem); if (dev_replace_is_ongoing && mirror_num == map->num_stripes + 1 && !need_full_stripe(op) && dev_replace->tgtdev != NULL) { @@ -5904,12 +6292,9 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, } out: if (dev_replace_is_ongoing) { - ASSERT(atomic_read(&dev_replace->blocking_readers) > 0); - btrfs_dev_replace_read_lock(dev_replace); - /* Barrier implied by atomic_dec_and_test */ - if (atomic_dec_and_test(&dev_replace->blocking_readers)) - cond_wake_up_nomb(&dev_replace->read_lock_wq); - btrfs_dev_replace_read_unlock(dev_replace); + lockdep_assert_held(&dev_replace->rwsem); + /* Unlock and let waiting writers proceed */ + up_read(&dev_replace->rwsem); } free_extent_map(em); return ret; @@ -5943,7 +6328,7 @@ int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start, u64 rmap_len; int i, j, nr = 0; - em = get_chunk_map(fs_info, chunk_start, 1); + em = btrfs_get_chunk_map(fs_info, chunk_start, 1); if (IS_ERR(em)) return -EIO; @@ -6083,12 +6468,6 @@ static noinline void btrfs_schedule_bio(struct btrfs_device *device, int should_queue = 1; struct btrfs_pending_bios *pending_bios; - if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state) || - !device->bdev) { - bio_io_error(bio); - return; - } - /* don't bother with additional async steps for reads, right now */ if (bio_op(bio) == REQ_OP_READ) { btrfsic_submit_bio(bio); @@ -6217,7 +6596,8 @@ blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio, for (dev_nr = 0; dev_nr < total_devs; dev_nr++) { dev = bbio->stripes[dev_nr].dev; - if (!dev || !dev->bdev || + if (!dev || !dev->bdev || test_bit(BTRFS_DEV_STATE_MISSING, + &dev->dev_state) || (bio_op(first_bio) == REQ_OP_WRITE && !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))) { bbio_error(bbio, first_bio, logical); @@ -6245,7 +6625,7 @@ struct btrfs_device *btrfs_find_device(struct btrfs_fs_info *fs_info, u64 devid, cur_devices = fs_info->fs_devices; while (cur_devices) { if (!fsid || - !memcmp(cur_devices->fsid, fsid, BTRFS_FSID_SIZE)) { + !memcmp(cur_devices->metadata_uuid, fsid, BTRFS_FSID_SIZE)) { device = find_device(cur_devices, devid, uuid); if (device) return device; @@ -6574,12 +6954,12 @@ static struct btrfs_fs_devices *open_seed_devices(struct btrfs_fs_info *fs_info, fs_devices = fs_devices->seed; } - fs_devices = find_fsid(fsid); + fs_devices = find_fsid(fsid, NULL); if (!fs_devices) { if (!btrfs_test_opt(fs_info, DEGRADED)) return ERR_PTR(-ENOENT); - fs_devices = alloc_fs_devices(fsid); + fs_devices = alloc_fs_devices(fsid, NULL); if (IS_ERR(fs_devices)) return fs_devices; @@ -6629,7 +7009,7 @@ static int read_one_dev(struct btrfs_fs_info *fs_info, read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item), BTRFS_FSID_SIZE); - if (memcmp(fs_uuid, fs_info->fsid, BTRFS_FSID_SIZE)) { + if (memcmp(fs_uuid, fs_devices->metadata_uuid, BTRFS_FSID_SIZE)) { fs_devices = open_seed_devices(fs_info, fs_uuid); if (IS_ERR(fs_devices)) return PTR_ERR(fs_devices); @@ -6876,7 +7256,7 @@ bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info, if (missing > max_tolerated) { if (!failing_dev) btrfs_warn(fs_info, - "chunk %llu missing %d devices, max tolerance is %d for writeable mount", + "chunk %llu missing %d devices, max tolerance is %d for writable mount", em->start, missing, max_tolerated); free_extent_map(em); ret = false; @@ -7387,6 +7767,7 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info, struct extent_map_tree *em_tree = &fs_info->mapping_tree.map_tree; struct extent_map *em; struct map_lookup *map; + struct btrfs_device *dev; u64 stripe_len; bool found = false; int ret = 0; @@ -7436,6 +7817,22 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info, physical_offset, devid); ret = -EUCLEAN; } + + /* Make sure no dev extent is beyond device bondary */ + dev = btrfs_find_device(fs_info, devid, NULL, NULL); + if (!dev) { + btrfs_err(fs_info, "failed to find devid %llu", devid); + ret = -EUCLEAN; + goto out; + } + if (physical_offset + physical_len > dev->disk_total_bytes) { + btrfs_err(fs_info, +"dev extent devid %llu physical offset %llu len %llu is beyond device boundary %llu", + devid, physical_offset, physical_len, + dev->disk_total_bytes); + ret = -EUCLEAN; + goto out; + } out: free_extent_map(em); return ret; @@ -7478,6 +7875,8 @@ int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info) struct btrfs_path *path; struct btrfs_root *root = fs_info->dev_root; struct btrfs_key key; + u64 prev_devid = 0; + u64 prev_dev_ext_end = 0; int ret = 0; key.objectid = 1; @@ -7522,10 +7921,22 @@ int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info) chunk_offset = btrfs_dev_extent_chunk_offset(leaf, dext); physical_len = btrfs_dev_extent_length(leaf, dext); + /* Check if this dev extent overlaps with the previous one */ + if (devid == prev_devid && physical_offset < prev_dev_ext_end) { + btrfs_err(fs_info, +"dev extent devid %llu physical offset %llu overlap with previous dev extent end %llu", + devid, physical_offset, prev_dev_ext_end); + ret = -EUCLEAN; + goto out; + } + ret = verify_one_dev_extent(fs_info, chunk_offset, devid, physical_offset, physical_len); if (ret < 0) goto out; + prev_devid = devid; + prev_dev_ext_end = physical_offset + physical_len; + ret = btrfs_next_item(root, path); if (ret < 0) goto out; @@ -7541,3 +7952,27 @@ out: btrfs_free_path(path); return ret; } + +/* + * Check whether the given block group or device is pinned by any inode being + * used as a swapfile. + */ +bool btrfs_pinned_by_swapfile(struct btrfs_fs_info *fs_info, void *ptr) +{ + struct btrfs_swapfile_pin *sp; + struct rb_node *node; + + spin_lock(&fs_info->swapfile_pins_lock); + node = fs_info->swapfile_pins.rb_node; + while (node) { + sp = rb_entry(node, struct btrfs_swapfile_pin, node); + if (ptr < sp->ptr) + node = node->rb_left; + else if (ptr > sp->ptr) + node = node->rb_right; + else + break; + } + spin_unlock(&fs_info->swapfile_pins_lock); + return node != NULL; +} diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h @@ -210,6 +210,8 @@ BTRFS_DEVICE_GETSET_FUNCS(bytes_used); struct btrfs_fs_devices { u8 fsid[BTRFS_FSID_SIZE]; /* FS specific uuid */ + u8 metadata_uuid[BTRFS_FSID_SIZE]; + bool fsid_change; struct list_head fs_list; u64 num_devices; @@ -218,6 +220,10 @@ struct btrfs_fs_devices { u64 missing_devices; u64 total_rw_bytes; u64 total_devices; + + /* Highest generation number of seen devices */ + u64 latest_generation; + struct block_device *latest_bdev; /* all of the devices in the FS, protected by a mutex @@ -261,15 +267,12 @@ struct btrfs_fs_devices { * we allocate are actually btrfs_io_bios. We'll cram as much of * struct btrfs_bio as we can into this over time. */ -typedef void (btrfs_io_bio_end_io_t) (struct btrfs_io_bio *bio, int err); struct btrfs_io_bio { unsigned int mirror_num; unsigned int stripe_index; u64 logical; u8 *csum; u8 csum_inline[BTRFS_BIO_INLINE_CSUM_SIZE]; - u8 *csum_allocated; - btrfs_io_bio_end_io_t *end_io; struct bvec_iter iter; /* * This member must come last, bio_alloc_bioset will allocate enough @@ -283,15 +286,20 @@ static inline struct btrfs_io_bio *btrfs_io_bio(struct bio *bio) return container_of(bio, struct btrfs_io_bio, bio); } +static inline void btrfs_io_bio_free_csum(struct btrfs_io_bio *io_bio) +{ + if (io_bio->csum != io_bio->csum_inline) { + kfree(io_bio->csum); + io_bio->csum = NULL; + } +} + struct btrfs_bio_stripe { struct btrfs_device *dev; u64 physical; u64 length; /* only used for discard mappings */ }; -struct btrfs_bio; -typedef void (btrfs_bio_end_io_t) (struct btrfs_bio *bio, int err); - struct btrfs_bio { refcount_t refs; atomic_t stripes_pending; @@ -331,6 +339,8 @@ struct btrfs_raid_attr { int tolerated_failures; /* max tolerated fail devs */ int devs_increment; /* ndevs has to be a multiple of this */ int ncopies; /* how many copies to data has */ + int nparity; /* number of stripes worth of bytes to store + * parity information */ int mindev_error; /* error code if min devs requisite is unmet */ const char raid_name[8]; /* name of the raid */ u64 bg_flag; /* block group flag of the raid */ @@ -430,6 +440,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *path); int btrfs_balance(struct btrfs_fs_info *fs_info, struct btrfs_balance_control *bctl, struct btrfs_ioctl_balance_args *bargs); +void btrfs_describe_block_groups(u64 flags, char *buf, u32 size_buf); int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info); int btrfs_recover_balance(struct btrfs_fs_info *fs_info); int btrfs_pause_balance(struct btrfs_fs_info *fs_info); @@ -462,6 +473,8 @@ unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info, int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans, u64 chunk_offset, u64 chunk_size); int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset); +struct extent_map *btrfs_get_chunk_map(struct btrfs_fs_info *fs_info, + u64 logical, u64 length); static inline void btrfs_dev_stat_inc(struct btrfs_device *dev, int index) diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c @@ -11,6 +11,7 @@ #include <linux/security.h> #include <linux/posix_acl_xattr.h> #include <linux/iversion.h> +#include <linux/sched/mm.h> #include "ctree.h" #include "btrfs_inode.h" #include "transaction.h" @@ -422,9 +423,15 @@ static int btrfs_initxattrs(struct inode *inode, { const struct xattr *xattr; struct btrfs_trans_handle *trans = fs_info; + unsigned int nofs_flag; char *name; int err = 0; + /* + * We're holding a transaction handle, so use a NOFS memory allocation + * context to avoid deadlock if reclaim happens. + */ + nofs_flag = memalloc_nofs_save(); for (xattr = xattr_array; xattr->name != NULL; xattr++) { name = kmalloc(XATTR_SECURITY_PREFIX_LEN + strlen(xattr->name) + 1, GFP_KERNEL); @@ -440,6 +447,7 @@ static int btrfs_initxattrs(struct inode *inode, if (err < 0) break; } + memalloc_nofs_restore(nofs_flag); return err; } diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h @@ -92,7 +92,7 @@ TRACE_DEFINE_ENUM(COMMIT_TRANS); #define TP_STRUCT__entry_fsid __array(u8, fsid, BTRFS_FSID_SIZE) #define TP_fast_assign_fsid(fs_info) \ - memcpy(__entry->fsid, fs_info->fsid, BTRFS_FSID_SIZE) + memcpy(__entry->fsid, fs_info->fs_devices->fsid, BTRFS_FSID_SIZE) #define TP_STRUCT__entry_btrfs(args...) \ TP_STRUCT__entry( \ @@ -1048,6 +1048,8 @@ TRACE_EVENT(btrfs_trigger_flush, { FLUSH_DELAYED_ITEMS, "FLUSH_DELAYED_ITEMS"}, \ { FLUSH_DELALLOC, "FLUSH_DELALLOC"}, \ { FLUSH_DELALLOC_WAIT, "FLUSH_DELALLOC_WAIT"}, \ + { FLUSH_DELAYED_REFS_NR, "FLUSH_DELAYED_REFS_NR"}, \ + { FLUSH_DELAYED_REFS, "FLUSH_ELAYED_REFS"}, \ { ALLOC_CHUNK, "ALLOC_CHUNK"}, \ { COMMIT_TRANS, "COMMIT_TRANS"}) diff --git a/include/uapi/linux/btrfs.h b/include/uapi/linux/btrfs.h @@ -269,6 +269,7 @@ struct btrfs_ioctl_fs_info_args { #define BTRFS_FEATURE_INCOMPAT_RAID56 (1ULL << 7) #define BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA (1ULL << 8) #define BTRFS_FEATURE_INCOMPAT_NO_HOLES (1ULL << 9) +#define BTRFS_FEATURE_INCOMPAT_METADATA_UUID (1ULL << 10) struct btrfs_ioctl_feature_flags { __u64 compat_flags; diff --git a/include/uapi/linux/btrfs_tree.h b/include/uapi/linux/btrfs_tree.h @@ -458,6 +458,7 @@ struct btrfs_free_space_header { #define BTRFS_SUPER_FLAG_METADUMP (1ULL << 33) #define BTRFS_SUPER_FLAG_METADUMP_V2 (1ULL << 34) #define BTRFS_SUPER_FLAG_CHANGING_FSID (1ULL << 35) +#define BTRFS_SUPER_FLAG_CHANGING_FSID_V2 (1ULL << 36) /*