whiterose

linux unikernel
Log | Files | Refs | README | LICENSE | git clone https://git.ne02ptzero.me/git/whiterose

commit 5404525b98c2fa18b8bd47047f9bf2e67825c857
parent b36fdc6853a38a6f8749897a33435635019e0647
Author: Linus Torvalds <torvalds@linux-foundation.org>
Date:   Thu,  6 Sep 2018 09:04:45 -0700

Merge tag 'for-4.19-rc2-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux

Pull btrfs fixes from David Sterba:

 - fix for improper fsync after hardlink

 - fix for a corruption during file deduplication

 - use after free fixes

 - RCU warning fix

 - fix for buffered write to nodatacow file

* tag 'for-4.19-rc2-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux:
  btrfs: Fix suspicious RCU usage warning in btrfs_debug_in_rcu
  btrfs: use after free in btrfs_quota_enable
  btrfs: btrfs_shrink_device should call commit transaction at the end
  btrfs: fix qgroup_free wrong num_bytes in btrfs_subvolume_reserve_metadata
  Btrfs: fix data corruption when deduplicating between different files
  Btrfs: sync log after logging new name
  Btrfs: fix unexpected failure of nocow buffered writes after snapshotting when low on space

Diffstat:
Mfs/btrfs/ctree.h | 12++++++++++--
Mfs/btrfs/disk-io.c | 1+
Mfs/btrfs/extent-tree.c | 17++++++++---------
Mfs/btrfs/inode.c | 117+++++++++++++++++++++++++++++++++++++++++++++++++++++++++-----------------------
Mfs/btrfs/ioctl.c | 35+++++++++++++++++++++++++++++++++++
Mfs/btrfs/qgroup.c | 5++---
Mfs/btrfs/tree-log.c | 48++++++++++++++++++++++++++++++++++++++++++------
Mfs/btrfs/tree-log.h | 10+++++++++-
Mfs/btrfs/volumes.c | 7++++++-
9 files changed, 197 insertions(+), 55 deletions(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h @@ -1280,6 +1280,7 @@ struct btrfs_root { int send_in_progress; struct btrfs_subvolume_writers *subv_writers; atomic_t will_be_snapshotted; + atomic_t snapshot_force_cow; /* For qgroup metadata reserved space */ spinlock_t qgroup_meta_rsv_lock; @@ -3390,9 +3391,9 @@ do { \ #define btrfs_debug(fs_info, fmt, args...) \ btrfs_no_printk(fs_info, KERN_DEBUG fmt, ##args) #define btrfs_debug_in_rcu(fs_info, fmt, args...) \ - btrfs_no_printk(fs_info, KERN_DEBUG fmt, ##args) + btrfs_no_printk_in_rcu(fs_info, KERN_DEBUG fmt, ##args) #define btrfs_debug_rl_in_rcu(fs_info, fmt, args...) \ - btrfs_no_printk(fs_info, KERN_DEBUG fmt, ##args) + btrfs_no_printk_in_rcu(fs_info, KERN_DEBUG fmt, ##args) #define btrfs_debug_rl(fs_info, fmt, args...) \ btrfs_no_printk(fs_info, KERN_DEBUG fmt, ##args) #endif @@ -3404,6 +3405,13 @@ do { \ rcu_read_unlock(); \ } while (0) +#define btrfs_no_printk_in_rcu(fs_info, fmt, args...) \ +do { \ + rcu_read_lock(); \ + btrfs_no_printk(fs_info, fmt, ##args); \ + rcu_read_unlock(); \ +} while (0) + #define btrfs_printk_ratelimited(fs_info, fmt, args...) \ do { \ static DEFINE_RATELIMIT_STATE(_rs, \ diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c @@ -1187,6 +1187,7 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info, atomic_set(&root->log_batch, 0); refcount_set(&root->refs, 1); atomic_set(&root->will_be_snapshotted, 0); + atomic_set(&root->snapshot_force_cow, 0); root->log_transid = 0; root->log_transid_committed = -1; root->last_log_commit = 0; diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c @@ -5800,7 +5800,7 @@ void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans) * root: the root of the parent directory * rsv: block reservation * items: the number of items that we need do reservation - * qgroup_reserved: used to return the reserved size in qgroup + * use_global_rsv: allow fallback to the global block reservation * * This function is used to reserve the space for snapshot/subvolume * creation and deletion. Those operations are different with the @@ -5810,10 +5810,10 @@ void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans) * the space reservation mechanism in start_transaction(). */ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, - struct btrfs_block_rsv *rsv, - int items, + struct btrfs_block_rsv *rsv, int items, bool use_global_rsv) { + u64 qgroup_num_bytes = 0; u64 num_bytes; int ret; struct btrfs_fs_info *fs_info = root->fs_info; @@ -5821,12 +5821,11 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) { /* One for parent inode, two for dir entries */ - num_bytes = 3 * fs_info->nodesize; - ret = btrfs_qgroup_reserve_meta_prealloc(root, num_bytes, true); + qgroup_num_bytes = 3 * fs_info->nodesize; + ret = btrfs_qgroup_reserve_meta_prealloc(root, + qgroup_num_bytes, true); if (ret) return ret; - } else { - num_bytes = 0; } num_bytes = btrfs_calc_trans_metadata_size(fs_info, items); @@ -5838,8 +5837,8 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, if (ret == -ENOSPC && use_global_rsv) ret = btrfs_block_rsv_migrate(global_rsv, rsv, num_bytes, 1); - if (ret && num_bytes) - btrfs_qgroup_free_meta_prealloc(root, num_bytes); + if (ret && qgroup_num_bytes) + btrfs_qgroup_free_meta_prealloc(root, qgroup_num_bytes); return ret; } diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c @@ -1271,7 +1271,7 @@ static noinline int run_delalloc_nocow(struct inode *inode, u64 disk_num_bytes; u64 ram_bytes; int extent_type; - int ret, err; + int ret; int type; int nocow; int check_prev = 1; @@ -1403,11 +1403,8 @@ next_slot: * if there are pending snapshots for this root, * we fall into common COW way. */ - if (!nolock) { - err = btrfs_start_write_no_snapshotting(root); - if (!err) - goto out_check; - } + if (!nolock && atomic_read(&root->snapshot_force_cow)) + goto out_check; /* * force cow if csum exists in the range. * this ensure that csum for a given extent are @@ -1416,9 +1413,6 @@ next_slot: ret = csum_exist_in_range(fs_info, disk_bytenr, num_bytes); if (ret) { - if (!nolock) - btrfs_end_write_no_snapshotting(root); - /* * ret could be -EIO if the above fails to read * metadata. @@ -1431,11 +1425,8 @@ next_slot: WARN_ON_ONCE(nolock); goto out_check; } - if (!btrfs_inc_nocow_writers(fs_info, disk_bytenr)) { - if (!nolock) - btrfs_end_write_no_snapshotting(root); + if (!btrfs_inc_nocow_writers(fs_info, disk_bytenr)) goto out_check; - } nocow = 1; } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { extent_end = found_key.offset + @@ -1448,8 +1439,6 @@ next_slot: out_check: if (extent_end <= start) { path->slots[0]++; - if (!nolock && nocow) - btrfs_end_write_no_snapshotting(root); if (nocow) btrfs_dec_nocow_writers(fs_info, disk_bytenr); goto next_slot; @@ -1471,8 +1460,6 @@ out_check: end, page_started, nr_written, 1, NULL); if (ret) { - if (!nolock && nocow) - btrfs_end_write_no_snapshotting(root); if (nocow) btrfs_dec_nocow_writers(fs_info, disk_bytenr); @@ -1492,8 +1479,6 @@ out_check: ram_bytes, BTRFS_COMPRESS_NONE, BTRFS_ORDERED_PREALLOC); if (IS_ERR(em)) { - if (!nolock && nocow) - btrfs_end_write_no_snapshotting(root); if (nocow) btrfs_dec_nocow_writers(fs_info, disk_bytenr); @@ -1532,8 +1517,6 @@ out_check: EXTENT_CLEAR_DATA_RESV, PAGE_UNLOCK | PAGE_SET_PRIVATE2); - if (!nolock && nocow) - btrfs_end_write_no_snapshotting(root); cur_offset = extent_end; /* @@ -6639,6 +6622,8 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, drop_inode = 1; } else { struct dentry *parent = dentry->d_parent; + int ret; + err = btrfs_update_inode(trans, root, inode); if (err) goto fail; @@ -6652,7 +6637,12 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, goto fail; } d_instantiate(dentry, inode); - btrfs_log_new_name(trans, BTRFS_I(inode), NULL, parent); + ret = btrfs_log_new_name(trans, BTRFS_I(inode), NULL, parent, + true, NULL); + if (ret == BTRFS_NEED_TRANS_COMMIT) { + err = btrfs_commit_transaction(trans); + trans = NULL; + } } fail: @@ -9388,14 +9378,21 @@ static int btrfs_rename_exchange(struct inode *old_dir, u64 new_idx = 0; u64 root_objectid; int ret; - int ret2; bool root_log_pinned = false; bool dest_log_pinned = false; + struct btrfs_log_ctx ctx_root; + struct btrfs_log_ctx ctx_dest; + bool sync_log_root = false; + bool sync_log_dest = false; + bool commit_transaction = false; /* we only allow rename subvolume link between subvolumes */ if (old_ino != BTRFS_FIRST_FREE_OBJECTID && root != dest) return -EXDEV; + btrfs_init_log_ctx(&ctx_root, old_inode); + btrfs_init_log_ctx(&ctx_dest, new_inode); + /* close the race window with snapshot create/destroy ioctl */ if (old_ino == BTRFS_FIRST_FREE_OBJECTID) down_read(&fs_info->subvol_sem); @@ -9542,15 +9539,29 @@ static int btrfs_rename_exchange(struct inode *old_dir, if (root_log_pinned) { parent = new_dentry->d_parent; - btrfs_log_new_name(trans, BTRFS_I(old_inode), BTRFS_I(old_dir), - parent); + ret = btrfs_log_new_name(trans, BTRFS_I(old_inode), + BTRFS_I(old_dir), parent, + false, &ctx_root); + if (ret == BTRFS_NEED_LOG_SYNC) + sync_log_root = true; + else if (ret == BTRFS_NEED_TRANS_COMMIT) + commit_transaction = true; + ret = 0; btrfs_end_log_trans(root); root_log_pinned = false; } if (dest_log_pinned) { - parent = old_dentry->d_parent; - btrfs_log_new_name(trans, BTRFS_I(new_inode), BTRFS_I(new_dir), - parent); + if (!commit_transaction) { + parent = old_dentry->d_parent; + ret = btrfs_log_new_name(trans, BTRFS_I(new_inode), + BTRFS_I(new_dir), parent, + false, &ctx_dest); + if (ret == BTRFS_NEED_LOG_SYNC) + sync_log_dest = true; + else if (ret == BTRFS_NEED_TRANS_COMMIT) + commit_transaction = true; + ret = 0; + } btrfs_end_log_trans(dest); dest_log_pinned = false; } @@ -9583,8 +9594,26 @@ out_fail: dest_log_pinned = false; } } - ret2 = btrfs_end_transaction(trans); - ret = ret ? ret : ret2; + if (!ret && sync_log_root && !commit_transaction) { + ret = btrfs_sync_log(trans, BTRFS_I(old_inode)->root, + &ctx_root); + if (ret) + commit_transaction = true; + } + if (!ret && sync_log_dest && !commit_transaction) { + ret = btrfs_sync_log(trans, BTRFS_I(new_inode)->root, + &ctx_dest); + if (ret) + commit_transaction = true; + } + if (commit_transaction) { + ret = btrfs_commit_transaction(trans); + } else { + int ret2; + + ret2 = btrfs_end_transaction(trans); + ret = ret ? ret : ret2; + } out_notrans: if (new_ino == BTRFS_FIRST_FREE_OBJECTID) up_read(&fs_info->subvol_sem); @@ -9661,6 +9690,9 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, int ret; u64 old_ino = btrfs_ino(BTRFS_I(old_inode)); bool log_pinned = false; + struct btrfs_log_ctx ctx; + bool sync_log = false; + bool commit_transaction = false; if (btrfs_ino(BTRFS_I(new_dir)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) return -EPERM; @@ -9818,8 +9850,15 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, if (log_pinned) { struct dentry *parent = new_dentry->d_parent; - btrfs_log_new_name(trans, BTRFS_I(old_inode), BTRFS_I(old_dir), - parent); + btrfs_init_log_ctx(&ctx, old_inode); + ret = btrfs_log_new_name(trans, BTRFS_I(old_inode), + BTRFS_I(old_dir), parent, + false, &ctx); + if (ret == BTRFS_NEED_LOG_SYNC) + sync_log = true; + else if (ret == BTRFS_NEED_TRANS_COMMIT) + commit_transaction = true; + ret = 0; btrfs_end_log_trans(root); log_pinned = false; } @@ -9856,7 +9895,19 @@ out_fail: btrfs_end_log_trans(root); log_pinned = false; } - btrfs_end_transaction(trans); + if (!ret && sync_log) { + ret = btrfs_sync_log(trans, BTRFS_I(old_inode)->root, &ctx); + if (ret) + commit_transaction = true; + } + if (commit_transaction) { + ret = btrfs_commit_transaction(trans); + } else { + int ret2; + + ret2 = btrfs_end_transaction(trans); + ret = ret ? ret : ret2; + } out_notrans: if (old_ino == BTRFS_FIRST_FREE_OBJECTID) up_read(&fs_info->subvol_sem); diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c @@ -747,6 +747,7 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir, struct btrfs_pending_snapshot *pending_snapshot; struct btrfs_trans_handle *trans; int ret; + bool snapshot_force_cow = false; if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state)) return -EINVAL; @@ -763,6 +764,11 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir, goto free_pending; } + /* + * Force new buffered writes to reserve space even when NOCOW is + * possible. This is to avoid later writeback (running dealloc) to + * fallback to COW mode and unexpectedly fail with ENOSPC. + */ atomic_inc(&root->will_be_snapshotted); smp_mb__after_atomic(); /* wait for no snapshot writes */ @@ -773,6 +779,14 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir, if (ret) goto dec_and_free; + /* + * All previous writes have started writeback in NOCOW mode, so now + * we force future writes to fallback to COW mode during snapshot + * creation. + */ + atomic_inc(&root->snapshot_force_cow); + snapshot_force_cow = true; + btrfs_wait_ordered_extents(root, U64_MAX, 0, (u64)-1); btrfs_init_block_rsv(&pending_snapshot->block_rsv, @@ -837,6 +851,8 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir, fail: btrfs_subvolume_release_metadata(fs_info, &pending_snapshot->block_rsv); dec_and_free: + if (snapshot_force_cow) + atomic_dec(&root->snapshot_force_cow); if (atomic_dec_and_test(&root->will_be_snapshotted)) wake_up_var(&root->will_be_snapshotted); free_pending: @@ -3453,6 +3469,25 @@ static int btrfs_extent_same_range(struct inode *src, u64 loff, u64 olen, same_lock_start = min_t(u64, loff, dst_loff); same_lock_len = max_t(u64, loff, dst_loff) + len - same_lock_start; + } else { + /* + * If the source and destination inodes are different, the + * source's range end offset matches the source's i_size, that + * i_size is not a multiple of the sector size, and the + * destination range does not go past the destination's i_size, + * we must round down the length to the nearest sector size + * multiple. If we don't do this adjustment we end replacing + * with zeroes the bytes in the range that starts at the + * deduplication range's end offset and ends at the next sector + * size multiple. + */ + if (loff + olen == i_size_read(src) && + dst_loff + len < i_size_read(dst)) { + const u64 sz = BTRFS_I(src)->root->fs_info->sectorsize; + + len = round_down(i_size_read(src), sz) - loff; + olen = len; + } } again: diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c @@ -1019,10 +1019,9 @@ out_add_root: spin_unlock(&fs_info->qgroup_lock); ret = btrfs_commit_transaction(trans); - if (ret) { - trans = NULL; + trans = NULL; + if (ret) goto out_free_path; - } ret = qgroup_rescan_init(fs_info, 0, 1); if (!ret) { diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c @@ -6025,14 +6025,25 @@ void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans, * Call this after adding a new name for a file and it will properly * update the log to reflect the new name. * - * It will return zero if all goes well, and it will return 1 if a - * full transaction commit is required. + * @ctx can not be NULL when @sync_log is false, and should be NULL when it's + * true (because it's not used). + * + * Return value depends on whether @sync_log is true or false. + * When true: returns BTRFS_NEED_TRANS_COMMIT if the transaction needs to be + * committed by the caller, and BTRFS_DONT_NEED_TRANS_COMMIT + * otherwise. + * When false: returns BTRFS_DONT_NEED_LOG_SYNC if the caller does not need to + * to sync the log, BTRFS_NEED_LOG_SYNC if it needs to sync the log, + * or BTRFS_NEED_TRANS_COMMIT if the transaction needs to be + * committed (without attempting to sync the log). */ int btrfs_log_new_name(struct btrfs_trans_handle *trans, struct btrfs_inode *inode, struct btrfs_inode *old_dir, - struct dentry *parent) + struct dentry *parent, + bool sync_log, struct btrfs_log_ctx *ctx) { struct btrfs_fs_info *fs_info = trans->fs_info; + int ret; /* * this will force the logging code to walk the dentry chain @@ -6047,9 +6058,34 @@ int btrfs_log_new_name(struct btrfs_trans_handle *trans, */ if (inode->logged_trans <= fs_info->last_trans_committed && (!old_dir || old_dir->logged_trans <= fs_info->last_trans_committed)) - return 0; + return sync_log ? BTRFS_DONT_NEED_TRANS_COMMIT : + BTRFS_DONT_NEED_LOG_SYNC; + + if (sync_log) { + struct btrfs_log_ctx ctx2; + + btrfs_init_log_ctx(&ctx2, &inode->vfs_inode); + ret = btrfs_log_inode_parent(trans, inode, parent, 0, LLONG_MAX, + LOG_INODE_EXISTS, &ctx2); + if (ret == BTRFS_NO_LOG_SYNC) + return BTRFS_DONT_NEED_TRANS_COMMIT; + else if (ret) + return BTRFS_NEED_TRANS_COMMIT; + + ret = btrfs_sync_log(trans, inode->root, &ctx2); + if (ret) + return BTRFS_NEED_TRANS_COMMIT; + return BTRFS_DONT_NEED_TRANS_COMMIT; + } + + ASSERT(ctx); + ret = btrfs_log_inode_parent(trans, inode, parent, 0, LLONG_MAX, + LOG_INODE_EXISTS, ctx); + if (ret == BTRFS_NO_LOG_SYNC) + return BTRFS_DONT_NEED_LOG_SYNC; + else if (ret) + return BTRFS_NEED_TRANS_COMMIT; - return btrfs_log_inode_parent(trans, inode, parent, 0, LLONG_MAX, - LOG_INODE_EXISTS, NULL); + return BTRFS_NEED_LOG_SYNC; } diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h @@ -71,8 +71,16 @@ void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans, int for_rename); void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans, struct btrfs_inode *dir); +/* Return values for btrfs_log_new_name() */ +enum { + BTRFS_DONT_NEED_TRANS_COMMIT, + BTRFS_NEED_TRANS_COMMIT, + BTRFS_DONT_NEED_LOG_SYNC, + BTRFS_NEED_LOG_SYNC, +}; int btrfs_log_new_name(struct btrfs_trans_handle *trans, struct btrfs_inode *inode, struct btrfs_inode *old_dir, - struct dentry *parent); + struct dentry *parent, + bool sync_log, struct btrfs_log_ctx *ctx); #endif diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c @@ -4491,7 +4491,12 @@ again: /* Now btrfs_update_device() will change the on-disk size. */ ret = btrfs_update_device(trans, device); - btrfs_end_transaction(trans); + if (ret < 0) { + btrfs_abort_transaction(trans, ret); + btrfs_end_transaction(trans); + } else { + ret = btrfs_commit_transaction(trans); + } done: btrfs_free_path(path); if (ret) {