whiterose

linux unikernel
Log | Files | Refs | README | LICENSE | git clone https://git.ne02ptzero.me/git/whiterose

commit 85b5d4bcab8b46664f8e1993bd5919cb0f24a3ca
parent 11743c56785c751c087eecdb98713eef796609e0
Author: Linus Torvalds <torvalds@linux-foundation.org>
Date:   Tue, 30 Oct 2018 08:27:13 -0700

Merge tag 'for-4.20-part2-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux

Pull more btrfs updates from David Sterba:
 "This contains a few minor updates and fixes that were under testing or
  arrived shortly after the merge window freeze, mostly stable material"

* tag 'for-4.20-part2-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux:
  Btrfs: fix use-after-free when dumping free space
  Btrfs: fix use-after-free during inode eviction
  btrfs: move the dio_sem higher up the callchain
  btrfs: don't run delayed_iputs in commit
  btrfs: fix insert_reserved error handling
  btrfs: only free reserved extent if we didn't insert it
  btrfs: don't use ctl->free_space for max_extent_size
  btrfs: set max_extent_size properly
  btrfs: reset max_extent_size properly
  MAINTAINERS: update my email address for btrfs
  btrfs: delayed-ref: extract find_first_ref_head from find_ref_head
  Btrfs: fix deadlock when writing out free space caches
  Btrfs: fix assertion on fsync of regular file when using no-holes feature
  Btrfs: fix null pointer dereference on compressed write path error

Diffstat:
MMAINTAINERS | 2+-
Mfs/btrfs/ctree.c | 17+++++++++++++++++
Mfs/btrfs/delayed-ref.c | 50+++++++++++++++++++++++++++-----------------------
Mfs/btrfs/extent-tree.c | 37+++++++++++++++++--------------------
Mfs/btrfs/file.c | 12++++++++++++
Mfs/btrfs/free-space-cache.c | 32++++++++++++++++++++++----------
Mfs/btrfs/inode.c | 15+++++++++++++--
Mfs/btrfs/transaction.c | 9---------
Mfs/btrfs/tree-log.c | 5++---
9 files changed, 111 insertions(+), 68 deletions(-)

diff --git a/MAINTAINERS b/MAINTAINERS @@ -3160,7 +3160,7 @@ F: drivers/gpio/gpio-bt8xx.c BTRFS FILE SYSTEM M: Chris Mason <clm@fb.com> -M: Josef Bacik <jbacik@fb.com> +M: Josef Bacik <josef@toxicpanda.com> M: David Sterba <dsterba@suse.com> L: linux-btrfs@vger.kernel.org W: http://btrfs.wiki.kernel.org/ diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c @@ -1014,9 +1014,26 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, if ((root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) && parent) parent_start = parent->start; + /* + * If we are COWing a node/leaf from the extent, chunk or device trees, + * make sure that we do not finish block group creation of pending block + * groups. We do this to avoid a deadlock. + * COWing can result in allocation of a new chunk, and flushing pending + * block groups (btrfs_create_pending_block_groups()) can be triggered + * when finishing allocation of a new chunk. Creation of a pending block + * group modifies the extent, chunk and device trees, therefore we could + * deadlock with ourselves since we are holding a lock on an extent + * buffer that btrfs_create_pending_block_groups() may try to COW later. + */ + if (root == fs_info->extent_root || + root == fs_info->chunk_root || + root == fs_info->dev_root) + trans->can_flush_pending_bgs = false; + cow = btrfs_alloc_tree_block(trans, root, parent_start, root->root_key.objectid, &disk_key, level, search_start, empty_size); + trans->can_flush_pending_bgs = true; if (IS_ERR(cow)) return PTR_ERR(cow); diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c @@ -164,14 +164,27 @@ static struct btrfs_delayed_ref_node* tree_insert(struct rb_root_cached *root, return NULL; } +static struct btrfs_delayed_ref_head *find_first_ref_head( + struct btrfs_delayed_ref_root *dr) +{ + struct rb_node *n; + struct btrfs_delayed_ref_head *entry; + + n = rb_first_cached(&dr->href_root); + if (!n) + return NULL; + + entry = rb_entry(n, struct btrfs_delayed_ref_head, href_node); + + return entry; +} + /* - * find an head entry based on bytenr. This returns the delayed ref - * head if it was able to find one, or NULL if nothing was in that spot. - * If return_bigger is given, the next bigger entry is returned if no exact - * match is found. But if no bigger one is found then the first node of the - * ref head tree will be returned. + * Find a head entry based on bytenr. This returns the delayed ref head if it + * was able to find one, or NULL if nothing was in that spot. If return_bigger + * is given, the next bigger entry is returned if no exact match is found. */ -static struct btrfs_delayed_ref_head* find_ref_head( +static struct btrfs_delayed_ref_head *find_ref_head( struct btrfs_delayed_ref_root *dr, u64 bytenr, bool return_bigger) { @@ -195,10 +208,9 @@ static struct btrfs_delayed_ref_head* find_ref_head( if (bytenr > entry->bytenr) { n = rb_next(&entry->href_node); if (!n) - n = rb_first_cached(&dr->href_root); + return NULL; entry = rb_entry(n, struct btrfs_delayed_ref_head, href_node); - return entry; } return entry; } @@ -355,33 +367,25 @@ struct btrfs_delayed_ref_head *btrfs_select_ref_head( struct btrfs_delayed_ref_root *delayed_refs) { struct btrfs_delayed_ref_head *head; - u64 start; - bool loop = false; again: - start = delayed_refs->run_delayed_start; - head = find_ref_head(delayed_refs, start, true); - if (!head && !loop) { + head = find_ref_head(delayed_refs, delayed_refs->run_delayed_start, + true); + if (!head && delayed_refs->run_delayed_start != 0) { delayed_refs->run_delayed_start = 0; - start = 0; - loop = true; - head = find_ref_head(delayed_refs, start, true); - if (!head) - return NULL; - } else if (!head && loop) { - return NULL; + head = find_first_ref_head(delayed_refs); } + if (!head) + return NULL; while (head->processing) { struct rb_node *node; node = rb_next(&head->href_node); if (!node) { - if (loop) + if (delayed_refs->run_delayed_start == 0) return NULL; delayed_refs->run_delayed_start = 0; - start = 0; - loop = true; goto again; } head = rb_entry(node, struct btrfs_delayed_ref_head, diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c @@ -2366,6 +2366,9 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans, insert_reserved); else BUG(); + if (ret && insert_reserved) + btrfs_pin_extent(trans->fs_info, node->bytenr, + node->num_bytes, 1); return ret; } @@ -2954,7 +2957,6 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_head *head; int ret; int run_all = count == (unsigned long)-1; - bool can_flush_pending_bgs = trans->can_flush_pending_bgs; /* We'll clean this up in btrfs_cleanup_transaction */ if (trans->aborted) @@ -2971,7 +2973,6 @@ again: #ifdef SCRAMBLE_DELAYED_REFS delayed_refs->run_delayed_start = find_middle(&delayed_refs->root); #endif - trans->can_flush_pending_bgs = false; ret = __btrfs_run_delayed_refs(trans, count); if (ret < 0) { btrfs_abort_transaction(trans, ret); @@ -3002,7 +3003,6 @@ again: goto again; } out: - trans->can_flush_pending_bgs = can_flush_pending_bgs; return 0; } @@ -4568,6 +4568,7 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags, goto out; } else { ret = 1; + space_info->max_extent_size = 0; } space_info->force_alloc = CHUNK_ALLOC_NO_FORCE; @@ -4589,11 +4590,9 @@ out: * the block groups that were made dirty during the lifetime of the * transaction. */ - if (trans->can_flush_pending_bgs && - trans->chunk_bytes_reserved >= (u64)SZ_2M) { + if (trans->chunk_bytes_reserved >= (u64)SZ_2M) btrfs_create_pending_block_groups(trans); - btrfs_trans_release_chunk_metadata(trans); - } + return ret; } @@ -6464,6 +6463,7 @@ static void btrfs_free_reserved_bytes(struct btrfs_block_group_cache *cache, space_info->bytes_readonly += num_bytes; cache->reserved -= num_bytes; space_info->bytes_reserved -= num_bytes; + space_info->max_extent_size = 0; if (delalloc) cache->delalloc_bytes -= num_bytes; @@ -7260,6 +7260,7 @@ static noinline int find_free_extent(struct btrfs_fs_info *fs_info, struct btrfs_block_group_cache *block_group = NULL; u64 search_start = 0; u64 max_extent_size = 0; + u64 max_free_space = 0; u64 empty_cluster = 0; struct btrfs_space_info *space_info; int loop = 0; @@ -7555,8 +7556,8 @@ unclustered_alloc: spin_lock(&ctl->tree_lock); if (ctl->free_space < num_bytes + empty_cluster + empty_size) { - if (ctl->free_space > max_extent_size) - max_extent_size = ctl->free_space; + max_free_space = max(max_free_space, + ctl->free_space); spin_unlock(&ctl->tree_lock); goto loop; } @@ -7723,6 +7724,8 @@ loop: } out: if (ret == -ENOSPC) { + if (!max_extent_size) + max_extent_size = max_free_space; spin_lock(&space_info->lock); space_info->max_extent_size = max_extent_size; spin_unlock(&space_info->lock); @@ -8004,21 +8007,14 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, } path = btrfs_alloc_path(); - if (!path) { - btrfs_free_and_pin_reserved_extent(fs_info, - extent_key.objectid, - fs_info->nodesize); + if (!path) return -ENOMEM; - } path->leave_spinning = 1; ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path, &extent_key, size); if (ret) { btrfs_free_path(path); - btrfs_free_and_pin_reserved_extent(fs_info, - extent_key.objectid, - fs_info->nodesize); return ret; } @@ -10132,9 +10128,10 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans) struct btrfs_block_group_item item; struct btrfs_key key; int ret = 0; - bool can_flush_pending_bgs = trans->can_flush_pending_bgs; - trans->can_flush_pending_bgs = false; + if (!trans->can_flush_pending_bgs) + return; + while (!list_empty(&trans->new_bgs)) { block_group = list_first_entry(&trans->new_bgs, struct btrfs_block_group_cache, @@ -10159,7 +10156,7 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans) next: list_del_init(&block_group->bg_list); } - trans->can_flush_pending_bgs = can_flush_pending_bgs; + btrfs_trans_release_chunk_metadata(trans); } int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 bytes_used, diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c @@ -2078,6 +2078,14 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) goto out; inode_lock(inode); + + /* + * We take the dio_sem here because the tree log stuff can race with + * lockless dio writes and get an extent map logged for an extent we + * never waited on. We need it this high up for lockdep reasons. + */ + down_write(&BTRFS_I(inode)->dio_sem); + atomic_inc(&root->log_batch); /* @@ -2086,6 +2094,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) */ ret = btrfs_wait_ordered_range(inode, start, len); if (ret) { + up_write(&BTRFS_I(inode)->dio_sem); inode_unlock(inode); goto out; } @@ -2109,6 +2118,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) * checked called fsync. */ ret = filemap_check_wb_err(inode->i_mapping, file->f_wb_err); + up_write(&BTRFS_I(inode)->dio_sem); inode_unlock(inode); goto out; } @@ -2127,6 +2137,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) trans = btrfs_start_transaction(root, 0); if (IS_ERR(trans)) { ret = PTR_ERR(trans); + up_write(&BTRFS_I(inode)->dio_sem); inode_unlock(inode); goto out; } @@ -2148,6 +2159,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) * file again, but that will end up using the synchronization * inside btrfs_sync_log to keep things safe. */ + up_write(&BTRFS_I(inode)->dio_sem); inode_unlock(inode); /* diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c @@ -1772,6 +1772,13 @@ static int search_bitmap(struct btrfs_free_space_ctl *ctl, return -1; } +static inline u64 get_max_extent_size(struct btrfs_free_space *entry) +{ + if (entry->bitmap) + return entry->max_extent_size; + return entry->bytes; +} + /* Cache the size of the max extent in bytes */ static struct btrfs_free_space * find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes, @@ -1793,8 +1800,8 @@ find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes, for (node = &entry->offset_index; node; node = rb_next(node)) { entry = rb_entry(node, struct btrfs_free_space, offset_index); if (entry->bytes < *bytes) { - if (entry->bytes > *max_extent_size) - *max_extent_size = entry->bytes; + *max_extent_size = max(get_max_extent_size(entry), + *max_extent_size); continue; } @@ -1812,8 +1819,8 @@ find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes, } if (entry->bytes < *bytes + align_off) { - if (entry->bytes > *max_extent_size) - *max_extent_size = entry->bytes; + *max_extent_size = max(get_max_extent_size(entry), + *max_extent_size); continue; } @@ -1825,8 +1832,10 @@ find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes, *offset = tmp; *bytes = size; return entry; - } else if (size > *max_extent_size) { - *max_extent_size = size; + } else { + *max_extent_size = + max(get_max_extent_size(entry), + *max_extent_size); } continue; } @@ -2449,6 +2458,7 @@ void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group, struct rb_node *n; int count = 0; + spin_lock(&ctl->tree_lock); for (n = rb_first(&ctl->free_space_offset); n; n = rb_next(n)) { info = rb_entry(n, struct btrfs_free_space, offset_index); if (info->bytes >= bytes && !block_group->ro) @@ -2457,6 +2467,7 @@ void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group, info->offset, info->bytes, (info->bitmap) ? "yes" : "no"); } + spin_unlock(&ctl->tree_lock); btrfs_info(fs_info, "block group has cluster?: %s", list_empty(&block_group->cluster_list) ? "no" : "yes"); btrfs_info(fs_info, @@ -2685,8 +2696,8 @@ static u64 btrfs_alloc_from_bitmap(struct btrfs_block_group_cache *block_group, err = search_bitmap(ctl, entry, &search_start, &search_bytes, true); if (err) { - if (search_bytes > *max_extent_size) - *max_extent_size = search_bytes; + *max_extent_size = max(get_max_extent_size(entry), + *max_extent_size); return 0; } @@ -2723,8 +2734,9 @@ u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group, entry = rb_entry(node, struct btrfs_free_space, offset_index); while (1) { - if (entry->bytes < bytes && entry->bytes > *max_extent_size) - *max_extent_size = entry->bytes; + if (entry->bytes < bytes) + *max_extent_size = max(get_max_extent_size(entry), + *max_extent_size); if (entry->bytes < bytes || (!entry->bitmap && entry->offset < min_start)) { diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c @@ -502,6 +502,7 @@ again: pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS); if (!pages) { /* just bail out to the uncompressed code */ + nr_pages = 0; goto cont; } @@ -2940,6 +2941,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) bool truncated = false; bool range_locked = false; bool clear_new_delalloc_bytes = false; + bool clear_reserved_extent = true; if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) && !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags) && @@ -3043,10 +3045,12 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) logical_len, logical_len, compress_type, 0, 0, BTRFS_FILE_EXTENT_REG); - if (!ret) + if (!ret) { + clear_reserved_extent = false; btrfs_release_delalloc_bytes(fs_info, ordered_extent->start, ordered_extent->disk_len); + } } unpin_extent_cache(&BTRFS_I(inode)->extent_tree, ordered_extent->file_offset, ordered_extent->len, @@ -3107,8 +3111,13 @@ out: * wrong we need to return the space for this ordered extent * back to the allocator. We only free the extent in the * truncated case if we didn't write out the extent at all. + * + * If we made it past insert_reserved_file_extent before we + * errored out then we don't need to do this as the accounting + * has already been done. */ if ((ret || !logical_len) && + clear_reserved_extent && !test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) && !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) btrfs_free_reserved_extent(fs_info, @@ -5259,11 +5268,13 @@ static void evict_inode_truncate_pages(struct inode *inode) struct extent_state *cached_state = NULL; u64 start; u64 end; + unsigned state_flags; node = rb_first(&io_tree->state); state = rb_entry(node, struct extent_state, rb_node); start = state->start; end = state->end; + state_flags = state->state; spin_unlock(&io_tree->lock); lock_extent_bits(io_tree, start, end, &cached_state); @@ -5276,7 +5287,7 @@ static void evict_inode_truncate_pages(struct inode *inode) * * Note, end is the bytenr of last byte, so we need + 1 here. */ - if (state->state & EXTENT_DELALLOC) + if (state_flags & EXTENT_DELALLOC) btrfs_qgroup_free_data(inode, NULL, start, end - start + 1); clear_extent_bit(io_tree, start, end, diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c @@ -2283,15 +2283,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) kmem_cache_free(btrfs_trans_handle_cachep, trans); - /* - * If fs has been frozen, we can not handle delayed iputs, otherwise - * it'll result in deadlock about SB_FREEZE_FS. - */ - if (current != fs_info->transaction_kthread && - current != fs_info->cleaner_kthread && - !test_bit(BTRFS_FS_FROZEN, &fs_info->flags)) - btrfs_run_delayed_iputs(fs_info); - return ret; scrub_continue: diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c @@ -4390,7 +4390,6 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, INIT_LIST_HEAD(&extents); - down_write(&inode->dio_sem); write_lock(&tree->lock); test_gen = root->fs_info->last_trans_committed; logged_start = start; @@ -4456,7 +4455,6 @@ process: } WARN_ON(!list_empty(&extents)); write_unlock(&tree->lock); - up_write(&inode->dio_sem); btrfs_release_path(path); if (!ret) @@ -4652,7 +4650,8 @@ static int btrfs_log_trailing_hole(struct btrfs_trans_handle *trans, ASSERT(len == i_size || (len == fs_info->sectorsize && btrfs_file_extent_compression(leaf, extent) != - BTRFS_COMPRESS_NONE)); + BTRFS_COMPRESS_NONE) || + (len < i_size && i_size < fs_info->sectorsize)); return 0; }