whiterose

linux unikernel
Log | Files | Refs | README | LICENSE | git clone https://git.ne02ptzero.me/git/whiterose

commit f6b1495fba0b66cfa05efa0ca2370513b79b45b6
parent bc77789a491cdc6f47e5bbd1d04ddd283d64658b
Author: Linus Torvalds <torvalds@linux-foundation.org>
Date:   Thu, 27 Dec 2018 17:09:41 -0800

Merge tag 'ext4_for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4

Pull ext4 updates from Ted Ts'o:
 "All cleanups and bug fixes; most notably, fix some problems discovered
  in ext4's NFS support, and fix an ioctl (EXT4_IOC_GROUP_ADD) used by
  old versions of e2fsprogs which we accidentally broke a while back.

  Also fixed some error paths in ext4's quota and inline data support.

  Finally, improve tail latency in jbd2's commit code"

* tag 'ext4_for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4:
  ext4: check for shutdown and r/o file system in ext4_write_inode()
  ext4: force inode writes when nfsd calls commit_metadata()
  ext4: avoid declaring fs inconsistent due to invalid file handles
  ext4: include terminating u32 in size of xattr entries when expanding inodes
  ext4: compare old and new mode before setting update_mode flag
  ext4: fix EXT4_IOC_GROUP_ADD ioctl
  ext4: hard fail dax mount on unsupported devices
  jbd2: update locking documentation for transaction_t
  ext4: remove redundant condition check
  jbd2: clean up indentation issue, replace spaces with tab
  ext4: clean up indentation issues, remove extraneous tabs
  ext4: missing unlock/put_page() in ext4_try_to_write_inline_data()
  ext4: fix possible use after free in ext4_quota_enable
  jbd2: avoid long hold times of j_state_lock while committing a transaction
  ext4: add ext4_sb_bread() to disambiguate ENOMEM cases

Diffstat:
Mfs/ext4/acl.c | 3++-
Mfs/ext4/ext4.h | 17+++++++++++++++--
Mfs/ext4/ialloc.c | 2+-
Mfs/ext4/inline.c | 5++++-
Mfs/ext4/inode.c | 63++++++++++++++++++++++++++++++++++++++++++++-------------------
Mfs/ext4/ioctl.c | 2+-
Mfs/ext4/migrate.c | 48++++++++++++++++++++++++------------------------
Mfs/ext4/namei.c | 4++--
Mfs/ext4/resize.c | 79++++++++++++++++++++++++++++++++++++++++---------------------------------------
Mfs/ext4/super.c | 92+++++++++++++++++++++++++++++++++++++++++++++++++------------------------------
Mfs/ext4/xattr.c | 83++++++++++++++++++++++++++++++++++++++-----------------------------------------
Mfs/jbd2/commit.c | 3+++
Mfs/jbd2/transaction.c | 45+++++++++++++++++++++++++++++++++++++++------
Minclude/linux/jbd2.h | 7++++---
Minclude/trace/events/ext4.h | 20++++++++++++++++++++
15 files changed, 296 insertions(+), 177 deletions(-)

diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c @@ -248,7 +248,8 @@ retry: error = posix_acl_update_mode(inode, &mode, &acl); if (error) goto out_stop; - update_mode = 1; + if (mode != inode->i_mode) + update_mode = 1; } error = __ext4_set_acl(handle, inode, type, acl, 0 /* xattr_flags */); diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h @@ -2454,8 +2454,19 @@ int do_journal_get_write_access(handle_t *handle, #define FALL_BACK_TO_NONDELALLOC 1 #define CONVERT_INLINE_DATA 2 -extern struct inode *ext4_iget(struct super_block *, unsigned long); -extern struct inode *ext4_iget_normal(struct super_block *, unsigned long); +typedef enum { + EXT4_IGET_NORMAL = 0, + EXT4_IGET_SPECIAL = 0x0001, /* OK to iget a system inode */ + EXT4_IGET_HANDLE = 0x0002 /* Inode # is from a handle */ +} ext4_iget_flags; + +extern struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, + ext4_iget_flags flags, const char *function, + unsigned int line); + +#define ext4_iget(sb, ino, flags) \ + __ext4_iget((sb), (ino), (flags), __func__, __LINE__) + extern int ext4_write_inode(struct inode *, struct writeback_control *); extern int ext4_setattr(struct dentry *, struct iattr *); extern int ext4_getattr(const struct path *, struct kstat *, u32, unsigned int); @@ -2538,6 +2549,8 @@ extern int ext4_group_extend(struct super_block *sb, extern int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count); /* super.c */ +extern struct buffer_head *ext4_sb_bread(struct super_block *sb, + sector_t block, int op_flags); extern int ext4_seq_options_show(struct seq_file *seq, void *offset); extern int ext4_calculate_overhead(struct super_block *sb); extern void ext4_superblock_csum_set(struct super_block *sb); diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c @@ -1225,7 +1225,7 @@ struct inode *ext4_orphan_get(struct super_block *sb, unsigned long ino) if (!ext4_test_bit(bit, bitmap_bh->b_data)) goto bad_orphan; - inode = ext4_iget(sb, ino); + inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL); if (IS_ERR(inode)) { err = PTR_ERR(inode); ext4_error(sb, "couldn't read orphan inode %lu (err %d)", diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c @@ -705,8 +705,11 @@ int ext4_try_to_write_inline_data(struct address_space *mapping, if (!PageUptodate(page)) { ret = ext4_read_inline_page(inode, page); - if (ret < 0) + if (ret < 0) { + unlock_page(page); + put_page(page); goto out_up_read; + } } ret = 1; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c @@ -4817,7 +4817,9 @@ static inline u64 ext4_inode_peek_iversion(const struct inode *inode) return inode_peek_iversion(inode); } -struct inode *ext4_iget(struct super_block *sb, unsigned long ino) +struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, + ext4_iget_flags flags, const char *function, + unsigned int line) { struct ext4_iloc iloc; struct ext4_inode *raw_inode; @@ -4831,6 +4833,18 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) gid_t i_gid; projid_t i_projid; + if (((flags & EXT4_IGET_NORMAL) && + (ino < EXT4_FIRST_INO(sb) && ino != EXT4_ROOT_INO)) || + (ino < EXT4_ROOT_INO) || + (ino > le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count))) { + if (flags & EXT4_IGET_HANDLE) + return ERR_PTR(-ESTALE); + __ext4_error(sb, function, line, + "inode #%lu: comm %s: iget: illegal inode #", + ino, current->comm); + return ERR_PTR(-EFSCORRUPTED); + } + inode = iget_locked(sb, ino); if (!inode) return ERR_PTR(-ENOMEM); @@ -4846,18 +4860,26 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) raw_inode = ext4_raw_inode(&iloc); if ((ino == EXT4_ROOT_INO) && (raw_inode->i_links_count == 0)) { - EXT4_ERROR_INODE(inode, "root inode unallocated"); + ext4_error_inode(inode, function, line, 0, + "iget: root inode unallocated"); ret = -EFSCORRUPTED; goto bad_inode; } + if ((flags & EXT4_IGET_HANDLE) && + (raw_inode->i_links_count == 0) && (raw_inode->i_mode == 0)) { + ret = -ESTALE; + goto bad_inode; + } + if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) { ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize); if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize > EXT4_INODE_SIZE(inode->i_sb) || (ei->i_extra_isize & 3)) { - EXT4_ERROR_INODE(inode, - "bad extra_isize %u (inode size %u)", + ext4_error_inode(inode, function, line, 0, + "iget: bad extra_isize %u " + "(inode size %u)", ei->i_extra_isize, EXT4_INODE_SIZE(inode->i_sb)); ret = -EFSCORRUPTED; @@ -4879,7 +4901,8 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) } if (!ext4_inode_csum_verify(inode, raw_inode, ei)) { - EXT4_ERROR_INODE(inode, "checksum invalid"); + ext4_error_inode(inode, function, line, 0, + "iget: checksum invalid"); ret = -EFSBADCRC; goto bad_inode; } @@ -4936,7 +4959,8 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) ((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32; inode->i_size = ext4_isize(sb, raw_inode); if ((size = i_size_read(inode)) < 0) { - EXT4_ERROR_INODE(inode, "bad i_size value: %lld", size); + ext4_error_inode(inode, function, line, 0, + "iget: bad i_size value: %lld", size); ret = -EFSCORRUPTED; goto bad_inode; } @@ -5012,7 +5036,8 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) ret = 0; if (ei->i_file_acl && !ext4_data_block_valid(EXT4_SB(sb), ei->i_file_acl, 1)) { - EXT4_ERROR_INODE(inode, "bad extended attribute block %llu", + ext4_error_inode(inode, function, line, 0, + "iget: bad extended attribute block %llu", ei->i_file_acl); ret = -EFSCORRUPTED; goto bad_inode; @@ -5040,8 +5065,9 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) } else if (S_ISLNK(inode->i_mode)) { /* VFS does not allow setting these so must be corruption */ if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) { - EXT4_ERROR_INODE(inode, - "immutable or append flags not allowed on symlinks"); + ext4_error_inode(inode, function, line, 0, + "iget: immutable or append flags " + "not allowed on symlinks"); ret = -EFSCORRUPTED; goto bad_inode; } @@ -5071,7 +5097,8 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) make_bad_inode(inode); } else { ret = -EFSCORRUPTED; - EXT4_ERROR_INODE(inode, "bogus i_mode (%o)", inode->i_mode); + ext4_error_inode(inode, function, line, 0, + "iget: bogus i_mode (%o)", inode->i_mode); goto bad_inode; } brelse(iloc.bh); @@ -5085,13 +5112,6 @@ bad_inode: return ERR_PTR(ret); } -struct inode *ext4_iget_normal(struct super_block *sb, unsigned long ino) -{ - if (ino < EXT4_FIRST_INO(sb) && ino != EXT4_ROOT_INO) - return ERR_PTR(-EFSCORRUPTED); - return ext4_iget(sb, ino); -} - static int ext4_inode_blocks_set(handle_t *handle, struct ext4_inode *raw_inode, struct ext4_inode_info *ei) @@ -5380,9 +5400,13 @@ int ext4_write_inode(struct inode *inode, struct writeback_control *wbc) { int err; - if (WARN_ON_ONCE(current->flags & PF_MEMALLOC)) + if (WARN_ON_ONCE(current->flags & PF_MEMALLOC) || + sb_rdonly(inode->i_sb)) return 0; + if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) + return -EIO; + if (EXT4_SB(inode->i_sb)->s_journal) { if (ext4_journal_current_handle()) { jbd_debug(1, "called recursively, non-PF_MEMALLOC!\n"); @@ -5398,7 +5422,8 @@ int ext4_write_inode(struct inode *inode, struct writeback_control *wbc) if (wbc->sync_mode != WB_SYNC_ALL || wbc->for_sync) return 0; - err = ext4_force_commit(inode->i_sb); + err = jbd2_complete_transaction(EXT4_SB(inode->i_sb)->s_journal, + EXT4_I(inode)->i_sync_tid); } else { struct ext4_iloc iloc; diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c @@ -125,7 +125,7 @@ static long swap_inode_boot_loader(struct super_block *sb, !inode_owner_or_capable(inode) || !capable(CAP_SYS_ADMIN)) return -EPERM; - inode_bl = ext4_iget(sb, EXT4_BOOT_LOADER_INO); + inode_bl = ext4_iget(sb, EXT4_BOOT_LOADER_INO, EXT4_IGET_SPECIAL); if (IS_ERR(inode_bl)) return PTR_ERR(inode_bl); ei_bl = EXT4_I(inode_bl); diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c @@ -116,9 +116,9 @@ static int update_ind_extent_range(handle_t *handle, struct inode *inode, int i, retval = 0; unsigned long max_entries = inode->i_sb->s_blocksize >> 2; - bh = sb_bread(inode->i_sb, pblock); - if (!bh) - return -EIO; + bh = ext4_sb_bread(inode->i_sb, pblock, 0); + if (IS_ERR(bh)) + return PTR_ERR(bh); i_data = (__le32 *)bh->b_data; for (i = 0; i < max_entries; i++) { @@ -145,9 +145,9 @@ static int update_dind_extent_range(handle_t *handle, struct inode *inode, int i, retval = 0; unsigned long max_entries = inode->i_sb->s_blocksize >> 2; - bh = sb_bread(inode->i_sb, pblock); - if (!bh) - return -EIO; + bh = ext4_sb_bread(inode->i_sb, pblock, 0); + if (IS_ERR(bh)) + return PTR_ERR(bh); i_data = (__le32 *)bh->b_data; for (i = 0; i < max_entries; i++) { @@ -175,9 +175,9 @@ static int update_tind_extent_range(handle_t *handle, struct inode *inode, int i, retval = 0; unsigned long max_entries = inode->i_sb->s_blocksize >> 2; - bh = sb_bread(inode->i_sb, pblock); - if (!bh) - return -EIO; + bh = ext4_sb_bread(inode->i_sb, pblock, 0); + if (IS_ERR(bh)) + return PTR_ERR(bh); i_data = (__le32 *)bh->b_data; for (i = 0; i < max_entries; i++) { @@ -224,9 +224,9 @@ static int free_dind_blocks(handle_t *handle, struct buffer_head *bh; unsigned long max_entries = inode->i_sb->s_blocksize >> 2; - bh = sb_bread(inode->i_sb, le32_to_cpu(i_data)); - if (!bh) - return -EIO; + bh = ext4_sb_bread(inode->i_sb, le32_to_cpu(i_data), 0); + if (IS_ERR(bh)) + return PTR_ERR(bh); tmp_idata = (__le32 *)bh->b_data; for (i = 0; i < max_entries; i++) { @@ -254,9 +254,9 @@ static int free_tind_blocks(handle_t *handle, struct buffer_head *bh; unsigned long max_entries = inode->i_sb->s_blocksize >> 2; - bh = sb_bread(inode->i_sb, le32_to_cpu(i_data)); - if (!bh) - return -EIO; + bh = ext4_sb_bread(inode->i_sb, le32_to_cpu(i_data), 0); + if (IS_ERR(bh)) + return PTR_ERR(bh); tmp_idata = (__le32 *)bh->b_data; for (i = 0; i < max_entries; i++) { @@ -382,9 +382,9 @@ static int free_ext_idx(handle_t *handle, struct inode *inode, struct ext4_extent_header *eh; block = ext4_idx_pblock(ix); - bh = sb_bread(inode->i_sb, block); - if (!bh) - return -EIO; + bh = ext4_sb_bread(inode->i_sb, block, 0); + if (IS_ERR(bh)) + return PTR_ERR(bh); eh = (struct ext4_extent_header *)bh->b_data; if (eh->eh_depth != 0) { @@ -535,22 +535,22 @@ int ext4_ext_migrate(struct inode *inode) if (i_data[EXT4_IND_BLOCK]) { retval = update_ind_extent_range(handle, tmp_inode, le32_to_cpu(i_data[EXT4_IND_BLOCK]), &lb); - if (retval) - goto err_out; + if (retval) + goto err_out; } else lb.curr_block += max_entries; if (i_data[EXT4_DIND_BLOCK]) { retval = update_dind_extent_range(handle, tmp_inode, le32_to_cpu(i_data[EXT4_DIND_BLOCK]), &lb); - if (retval) - goto err_out; + if (retval) + goto err_out; } else lb.curr_block += max_entries * max_entries; if (i_data[EXT4_TIND_BLOCK]) { retval = update_tind_extent_range(handle, tmp_inode, le32_to_cpu(i_data[EXT4_TIND_BLOCK]), &lb); - if (retval) - goto err_out; + if (retval) + goto err_out; } /* * Build the last extent diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c @@ -1571,7 +1571,7 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, unsi dentry); return ERR_PTR(-EFSCORRUPTED); } - inode = ext4_iget_normal(dir->i_sb, ino); + inode = ext4_iget(dir->i_sb, ino, EXT4_IGET_NORMAL); if (inode == ERR_PTR(-ESTALE)) { EXT4_ERROR_INODE(dir, "deleted inode referenced: %u", @@ -1613,7 +1613,7 @@ struct dentry *ext4_get_parent(struct dentry *child) return ERR_PTR(-EFSCORRUPTED); } - return d_obtain_alias(ext4_iget_normal(child->d_sb, ino)); + return d_obtain_alias(ext4_iget(child->d_sb, ino, EXT4_IGET_NORMAL)); } /* diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c @@ -127,10 +127,12 @@ static int verify_group_input(struct super_block *sb, else if (free_blocks_count < 0) ext4_warning(sb, "Bad blocks count %u", input->blocks_count); - else if (!(bh = sb_bread(sb, end - 1))) + else if (IS_ERR(bh = ext4_sb_bread(sb, end - 1, 0))) { + err = PTR_ERR(bh); + bh = NULL; ext4_warning(sb, "Cannot read last block (%llu)", end - 1); - else if (outside(input->block_bitmap, start, end)) + } else if (outside(input->block_bitmap, start, end)) ext4_warning(sb, "Block bitmap not in group (block %llu)", (unsigned long long)input->block_bitmap); else if (outside(input->inode_bitmap, start, end)) @@ -781,11 +783,11 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, struct ext4_super_block *es = EXT4_SB(sb)->s_es; unsigned long gdb_num = group / EXT4_DESC_PER_BLOCK(sb); ext4_fsblk_t gdblock = EXT4_SB(sb)->s_sbh->b_blocknr + 1 + gdb_num; - struct buffer_head **o_group_desc, **n_group_desc; - struct buffer_head *dind; - struct buffer_head *gdb_bh; + struct buffer_head **o_group_desc, **n_group_desc = NULL; + struct buffer_head *dind = NULL; + struct buffer_head *gdb_bh = NULL; int gdbackups; - struct ext4_iloc iloc; + struct ext4_iloc iloc = { .bh = NULL }; __le32 *data; int err; @@ -794,21 +796,22 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, "EXT4-fs: ext4_add_new_gdb: adding group block %lu\n", gdb_num); - gdb_bh = sb_bread(sb, gdblock); - if (!gdb_bh) - return -EIO; + gdb_bh = ext4_sb_bread(sb, gdblock, 0); + if (IS_ERR(gdb_bh)) + return PTR_ERR(gdb_bh); gdbackups = verify_reserved_gdb(sb, group, gdb_bh); if (gdbackups < 0) { err = gdbackups; - goto exit_bh; + goto errout; } data = EXT4_I(inode)->i_data + EXT4_DIND_BLOCK; - dind = sb_bread(sb, le32_to_cpu(*data)); - if (!dind) { - err = -EIO; - goto exit_bh; + dind = ext4_sb_bread(sb, le32_to_cpu(*data), 0); + if (IS_ERR(dind)) { + err = PTR_ERR(dind); + dind = NULL; + goto errout; } data = (__le32 *)dind->b_data; @@ -816,18 +819,18 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, ext4_warning(sb, "new group %u GDT block %llu not reserved", group, gdblock); err = -EINVAL; - goto exit_dind; + goto errout; } BUFFER_TRACE(EXT4_SB(sb)->s_sbh, "get_write_access"); err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh); if (unlikely(err)) - goto exit_dind; + goto errout; BUFFER_TRACE(gdb_bh, "get_write_access"); err = ext4_journal_get_write_access(handle, gdb_bh); if (unlikely(err)) - goto exit_dind; + goto errout; BUFFER_TRACE(dind, "get_write_access"); err = ext4_journal_get_write_access(handle, dind); @@ -837,7 +840,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, /* ext4_reserve_inode_write() gets a reference on the iloc */ err = ext4_reserve_inode_write(handle, inode, &iloc); if (unlikely(err)) - goto exit_dind; + goto errout; n_group_desc = ext4_kvmalloc((gdb_num + 1) * sizeof(struct buffer_head *), @@ -846,7 +849,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, err = -ENOMEM; ext4_warning(sb, "not enough memory for %lu groups", gdb_num + 1); - goto exit_inode; + goto errout; } /* @@ -862,7 +865,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, err = ext4_handle_dirty_metadata(handle, NULL, dind); if (unlikely(err)) { ext4_std_error(sb, err); - goto exit_inode; + goto errout; } inode->i_blocks -= (gdbackups + 1) * sb->s_blocksize >> (9 - EXT4_SB(sb)->s_cluster_bits); @@ -871,8 +874,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, err = ext4_handle_dirty_metadata(handle, NULL, gdb_bh); if (unlikely(err)) { ext4_std_error(sb, err); - iloc.bh = NULL; - goto exit_inode; + goto errout; } brelse(dind); @@ -888,15 +890,11 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, err = ext4_handle_dirty_super(handle, sb); if (err) ext4_std_error(sb, err); - return err; - -exit_inode: +errout: kvfree(n_group_desc); brelse(iloc.bh); -exit_dind: brelse(dind); -exit_bh: brelse(gdb_bh); ext4_debug("leaving with error %d\n", err); @@ -916,9 +914,9 @@ static int add_new_gdb_meta_bg(struct super_block *sb, gdblock = ext4_meta_bg_first_block_no(sb, group) + ext4_bg_has_super(sb, group); - gdb_bh = sb_bread(sb, gdblock); - if (!gdb_bh) - return -EIO; + gdb_bh = ext4_sb_bread(sb, gdblock, 0); + if (IS_ERR(gdb_bh)) + return PTR_ERR(gdb_bh); n_group_desc = ext4_kvmalloc((gdb_num + 1) * sizeof(struct buffer_head *), GFP_NOFS); @@ -975,9 +973,10 @@ static int reserve_backup_gdb(handle_t *handle, struct inode *inode, return -ENOMEM; data = EXT4_I(inode)->i_data + EXT4_DIND_BLOCK; - dind = sb_bread(sb, le32_to_cpu(*data)); - if (!dind) { - err = -EIO; + dind = ext4_sb_bread(sb, le32_to_cpu(*data), 0); + if (IS_ERR(dind)) { + err = PTR_ERR(dind); + dind = NULL; goto exit_free; } @@ -996,9 +995,10 @@ static int reserve_backup_gdb(handle_t *handle, struct inode *inode, err = -EINVAL; goto exit_bh; } - primary[res] = sb_bread(sb, blk); - if (!primary[res]) { - err = -EIO; + primary[res] = ext4_sb_bread(sb, blk, 0); + if (IS_ERR(primary[res])) { + err = PTR_ERR(primary[res]); + primary[res] = NULL; goto exit_bh; } gdbackups = verify_reserved_gdb(sb, group, primary[res]); @@ -1631,13 +1631,13 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) } if (reserved_gdb || gdb_off == 0) { - if (ext4_has_feature_resize_inode(sb) || + if (!ext4_has_feature_resize_inode(sb) || !le16_to_cpu(es->s_reserved_gdt_blocks)) { ext4_warning(sb, "No reserved GDT blocks, can't resize"); return -EPERM; } - inode = ext4_iget(sb, EXT4_RESIZE_INO); + inode = ext4_iget(sb, EXT4_RESIZE_INO, EXT4_IGET_SPECIAL); if (IS_ERR(inode)) { ext4_warning(sb, "Error opening resize inode"); return PTR_ERR(inode); @@ -1965,7 +1965,8 @@ retry: } if (!resize_inode) - resize_inode = ext4_iget(sb, EXT4_RESIZE_INO); + resize_inode = ext4_iget(sb, EXT4_RESIZE_INO, + EXT4_IGET_SPECIAL); if (IS_ERR(resize_inode)) { ext4_warning(sb, "Error opening resize inode"); return PTR_ERR(resize_inode); diff --git a/fs/ext4/super.c b/fs/ext4/super.c @@ -140,6 +140,29 @@ MODULE_ALIAS_FS("ext3"); MODULE_ALIAS("ext3"); #define IS_EXT3_SB(sb) ((sb)->s_bdev->bd_holder == &ext3_fs_type) +/* + * This works like sb_bread() except it uses ERR_PTR for error + * returns. Currently with sb_bread it's impossible to distinguish + * between ENOMEM and EIO situations (since both result in a NULL + * return. + */ +struct buffer_head * +ext4_sb_bread(struct super_block *sb, sector_t block, int op_flags) +{ + struct buffer_head *bh = sb_getblk(sb, block); + + if (bh == NULL) + return ERR_PTR(-ENOMEM); + if (buffer_uptodate(bh)) + return bh; + ll_rw_block(REQ_OP_READ, REQ_META | op_flags, 1, &bh); + wait_on_buffer(bh); + if (buffer_uptodate(bh)) + return bh; + put_bh(bh); + return ERR_PTR(-EIO); +} + static int ext4_verify_csum_type(struct super_block *sb, struct ext4_super_block *es) { @@ -1000,14 +1023,13 @@ static void ext4_put_super(struct super_block *sb) invalidate_bdev(sbi->journal_bdev); ext4_blkdev_remove(sbi); } - if (sbi->s_ea_inode_cache) { - ext4_xattr_destroy_cache(sbi->s_ea_inode_cache); - sbi->s_ea_inode_cache = NULL; - } - if (sbi->s_ea_block_cache) { - ext4_xattr_destroy_cache(sbi->s_ea_block_cache); - sbi->s_ea_block_cache = NULL; - } + + ext4_xattr_destroy_cache(sbi->s_ea_inode_cache); + sbi->s_ea_inode_cache = NULL; + + ext4_xattr_destroy_cache(sbi->s_ea_block_cache); + sbi->s_ea_block_cache = NULL; + if (sbi->s_mmp_tsk) kthread_stop(sbi->s_mmp_tsk); brelse(sbi->s_sbh); @@ -1151,20 +1173,11 @@ static struct inode *ext4_nfs_get_inode(struct super_block *sb, { struct inode *inode; - if (ino < EXT4_FIRST_INO(sb) && ino != EXT4_ROOT_INO) - return ERR_PTR(-ESTALE); - if (ino > le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count)) - return ERR_PTR(-ESTALE); - - /* iget isn't really right if the inode is currently unallocated!! - * - * ext4_read_inode will return a bad_inode if the inode had been - * deleted, so we should be safe. - * + /* * Currently we don't know the generation for parent directory, so * a generation of 0 means "accept any" */ - inode = ext4_iget_normal(sb, ino); + inode = ext4_iget(sb, ino, EXT4_IGET_HANDLE); if (IS_ERR(inode)) return ERR_CAST(inode); if (generation && inode->i_generation != generation) { @@ -1189,6 +1202,16 @@ static struct dentry *ext4_fh_to_parent(struct super_block *sb, struct fid *fid, ext4_nfs_get_inode); } +static int ext4_nfs_commit_metadata(struct inode *inode) +{ + struct writeback_control wbc = { + .sync_mode = WB_SYNC_ALL + }; + + trace_ext4_nfs_commit_metadata(inode); + return ext4_write_inode(inode, &wbc); +} + /* * Try to release metadata pages (indirect blocks, directories) which are * mapped via the block device. Since these pages could have journal heads @@ -1393,6 +1416,7 @@ static const struct export_operations ext4_export_ops = { .fh_to_dentry = ext4_fh_to_dentry, .fh_to_parent = ext4_fh_to_parent, .get_parent = ext4_get_parent, + .commit_metadata = ext4_nfs_commit_metadata, }; enum { @@ -1939,7 +1963,7 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token, #ifdef CONFIG_FS_DAX ext4_msg(sb, KERN_WARNING, "DAX enabled. Warning: EXPERIMENTAL, use at your own risk"); - sbi->s_mount_opt |= m->mount_opt; + sbi->s_mount_opt |= m->mount_opt; #else ext4_msg(sb, KERN_INFO, "dax option not supported"); return -1; @@ -3842,12 +3866,12 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) if (ext4_has_feature_inline_data(sb)) { ext4_msg(sb, KERN_ERR, "Cannot use DAX on a filesystem" " that may contain inline data"); - sbi->s_mount_opt &= ~EXT4_MOUNT_DAX; + goto failed_mount; } if (!bdev_dax_supported(sb->s_bdev, blocksize)) { ext4_msg(sb, KERN_ERR, - "DAX unsupported by block device. Turning off DAX."); - sbi->s_mount_opt &= ~EXT4_MOUNT_DAX; + "DAX unsupported by block device."); + goto failed_mount; } } @@ -4328,7 +4352,7 @@ no_journal: * so we can safely mount the rest of the filesystem now. */ - root = ext4_iget(sb, EXT4_ROOT_INO); + root = ext4_iget(sb, EXT4_ROOT_INO, EXT4_IGET_SPECIAL); if (IS_ERR(root)) { ext4_msg(sb, KERN_ERR, "get root inode failed"); ret = PTR_ERR(root); @@ -4522,14 +4546,12 @@ failed_mount4: if (EXT4_SB(sb)->rsv_conversion_wq) destroy_workqueue(EXT4_SB(sb)->rsv_conversion_wq); failed_mount_wq: - if (sbi->s_ea_inode_cache) { - ext4_xattr_destroy_cache(sbi->s_ea_inode_cache); - sbi->s_ea_inode_cache = NULL; - } - if (sbi->s_ea_block_cache) { - ext4_xattr_destroy_cache(sbi->s_ea_block_cache); - sbi->s_ea_block_cache = NULL; - } + ext4_xattr_destroy_cache(sbi->s_ea_inode_cache); + sbi->s_ea_inode_cache = NULL; + + ext4_xattr_destroy_cache(sbi->s_ea_block_cache); + sbi->s_ea_block_cache = NULL; + if (sbi->s_journal) { jbd2_journal_destroy(sbi->s_journal); sbi->s_journal = NULL; @@ -4598,7 +4620,7 @@ static struct inode *ext4_get_journal_inode(struct super_block *sb, * happen if we iget() an unused inode, as the subsequent iput() * will try to delete it. */ - journal_inode = ext4_iget(sb, journal_inum); + journal_inode = ext4_iget(sb, journal_inum, EXT4_IGET_SPECIAL); if (IS_ERR(journal_inode)) { ext4_msg(sb, KERN_ERR, "no journal found"); return NULL; @@ -5680,7 +5702,7 @@ static int ext4_quota_enable(struct super_block *sb, int type, int format_id, if (!qf_inums[type]) return -EPERM; - qf_inode = ext4_iget(sb, qf_inums[type]); + qf_inode = ext4_iget(sb, qf_inums[type], EXT4_IGET_SPECIAL); if (IS_ERR(qf_inode)) { ext4_error(sb, "Bad quota inode # %lu", qf_inums[type]); return PTR_ERR(qf_inode); @@ -5690,9 +5712,9 @@ static int ext4_quota_enable(struct super_block *sb, int type, int format_id, qf_inode->i_flags |= S_NOQUOTA; lockdep_set_quota_inode(qf_inode, I_DATA_SEM_QUOTA); err = dquot_enable(qf_inode, type, format_id, flags); - iput(qf_inode); if (err) lockdep_set_quota_inode(qf_inode, I_DATA_SEM_NORMAL); + iput(qf_inode); return err; } diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c @@ -384,7 +384,7 @@ static int ext4_xattr_inode_iget(struct inode *parent, unsigned long ea_ino, struct inode *inode; int err; - inode = ext4_iget(parent->i_sb, ea_ino); + inode = ext4_iget(parent->i_sb, ea_ino, EXT4_IGET_NORMAL); if (IS_ERR(inode)) { err = PTR_ERR(inode); ext4_error(parent->i_sb, @@ -522,14 +522,13 @@ ext4_xattr_block_get(struct inode *inode, int name_index, const char *name, ea_idebug(inode, "name=%d.%s, buffer=%p, buffer_size=%ld", name_index, name, buffer, (long)buffer_size); - error = -ENODATA; if (!EXT4_I(inode)->i_file_acl) - goto cleanup; + return -ENODATA; ea_idebug(inode, "reading block %llu", (unsigned long long)EXT4_I(inode)->i_file_acl); - bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl); - if (!bh) - goto cleanup; + bh = ext4_sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl, REQ_PRIO); + if (IS_ERR(bh)) + return PTR_ERR(bh); ea_bdebug(bh, "b_count=%d, refcount=%d", atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount)); error = ext4_xattr_check_block(inode, bh); @@ -696,26 +695,23 @@ ext4_xattr_block_list(struct dentry *dentry, char *buffer, size_t buffer_size) ea_idebug(inode, "buffer=%p, buffer_size=%ld", buffer, (long)buffer_size); - error = 0; if (!EXT4_I(inode)->i_file_acl) - goto cleanup; + return 0; ea_idebug(inode, "reading block %llu", (unsigned long long)EXT4_I(inode)->i_file_acl); - bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl); - error = -EIO; - if (!bh) - goto cleanup; + bh = ext4_sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl, REQ_PRIO); + if (IS_ERR(bh)) + return PTR_ERR(bh); ea_bdebug(bh, "b_count=%d, refcount=%d", atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount)); error = ext4_xattr_check_block(inode, bh); if (error) goto cleanup; ext4_xattr_block_cache_insert(EA_BLOCK_CACHE(inode), bh); - error = ext4_xattr_list_entries(dentry, BFIRST(bh), buffer, buffer_size); - + error = ext4_xattr_list_entries(dentry, BFIRST(bh), buffer, + buffer_size); cleanup: brelse(bh); - return error; } @@ -830,9 +826,9 @@ int ext4_get_inode_usage(struct inode *inode, qsize_t *usage) } if (EXT4_I(inode)->i_file_acl) { - bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl); - if (!bh) { - ret = -EIO; + bh = ext4_sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl, REQ_PRIO); + if (IS_ERR(bh)) { + ret = PTR_ERR(bh); goto out; } @@ -1486,7 +1482,8 @@ ext4_xattr_inode_cache_find(struct inode *inode, const void *value, } while (ce) { - ea_inode = ext4_iget(inode->i_sb, ce->e_value); + ea_inode = ext4_iget(inode->i_sb, ce->e_value, + EXT4_IGET_NORMAL); if (!IS_ERR(ea_inode) && !is_bad_inode(ea_inode) && (EXT4_I(ea_inode)->i_flags & EXT4_EA_INODE_FL) && @@ -1821,16 +1818,15 @@ ext4_xattr_block_find(struct inode *inode, struct ext4_xattr_info *i, if (EXT4_I(inode)->i_file_acl) { /* The inode already has an extended attribute block. */ - bs->bh = sb_bread(sb, EXT4_I(inode)->i_file_acl); - error = -EIO; - if (!bs->bh) - goto cleanup; + bs->bh = ext4_sb_bread(sb, EXT4_I(inode)->i_file_acl, REQ_PRIO); + if (IS_ERR(bs->bh)) + return PTR_ERR(bs->bh); ea_bdebug(bs->bh, "b_count=%d, refcount=%d", atomic_read(&(bs->bh->b_count)), le32_to_cpu(BHDR(bs->bh)->h_refcount)); error = ext4_xattr_check_block(inode, bs->bh); if (error) - goto cleanup; + return error; /* Find the named attribute. */ bs->s.base = BHDR(bs->bh); bs->s.first = BFIRST(bs->bh); @@ -1839,13 +1835,10 @@ ext4_xattr_block_find(struct inode *inode, struct ext4_xattr_info *i, error = xattr_find_entry(inode, &bs->s.here, bs->s.end, i->name_index, i->name, 1); if (error && error != -ENODATA) - goto cleanup; + return error; bs->s.not_found = error; } - error = 0; - -cleanup: - return error; + return 0; } static int @@ -2274,9 +2267,9 @@ static struct buffer_head *ext4_xattr_get_block(struct inode *inode) if (!EXT4_I(inode)->i_file_acl) return NULL; - bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl); - if (!bh) - return ERR_PTR(-EIO); + bh = ext4_sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl, REQ_PRIO); + if (IS_ERR(bh)) + return bh; error = ext4_xattr_check_block(inode, bh); if (error) { brelse(bh); @@ -2729,7 +2722,7 @@ retry: base = IFIRST(header); end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size; min_offs = end - base; - total_ino = sizeof(struct ext4_xattr_ibody_header); + total_ino = sizeof(struct ext4_xattr_ibody_header) + sizeof(u32); error = xattr_check_inode(inode, header, end); if (error) @@ -2746,10 +2739,11 @@ retry: if (EXT4_I(inode)->i_file_acl) { struct buffer_head *bh; - bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl); - error = -EIO; - if (!bh) + bh = ext4_sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl, REQ_PRIO); + if (IS_ERR(bh)) { + error = PTR_ERR(bh); goto cleanup; + } error = ext4_xattr_check_block(inode, bh); if (error) { brelse(bh); @@ -2903,11 +2897,12 @@ int ext4_xattr_delete_inode(handle_t *handle, struct inode *inode, } if (EXT4_I(inode)->i_file_acl) { - bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl); - if (!bh) { - EXT4_ERROR_INODE(inode, "block %llu read error", - EXT4_I(inode)->i_file_acl); - error = -EIO; + bh = ext4_sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl, REQ_PRIO); + if (IS_ERR(bh)) { + error = PTR_ERR(bh); + if (error == -EIO) + EXT4_ERROR_INODE(inode, "block %llu read error", + EXT4_I(inode)->i_file_acl); goto cleanup; } error = ext4_xattr_check_block(inode, bh); @@ -3060,8 +3055,10 @@ ext4_xattr_block_cache_find(struct inode *inode, while (ce) { struct buffer_head *bh; - bh = sb_bread(inode->i_sb, ce->e_value); - if (!bh) { + bh = ext4_sb_bread(inode->i_sb, ce->e_value, REQ_PRIO); + if (IS_ERR(bh)) { + if (PTR_ERR(bh) == -ENOMEM) + return NULL; EXT4_ERROR_INODE(inode, "block %lu read error", (unsigned long)ce->e_value); } else if (ext4_xattr_cmp(header, BHDR(bh)) == 0) { diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c @@ -439,6 +439,8 @@ void jbd2_journal_commit_transaction(journal_t *journal) finish_wait(&journal->j_wait_updates, &wait); } spin_unlock(&commit_transaction->t_handle_lock); + commit_transaction->t_state = T_SWITCH; + write_unlock(&journal->j_state_lock); J_ASSERT (atomic_read(&commit_transaction->t_outstanding_credits) <= journal->j_max_transaction_buffers); @@ -505,6 +507,7 @@ void jbd2_journal_commit_transaction(journal_t *journal) atomic_sub(atomic_read(&journal->j_reserved_credits), &commit_transaction->t_outstanding_credits); + write_lock(&journal->j_state_lock); trace_jbd2_commit_flushing(journal, commit_transaction); stats.run.rs_flushing = jiffies; stats.run.rs_locked = jbd2_time_diff(stats.run.rs_locked, diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c @@ -138,9 +138,9 @@ static inline void update_t_max_wait(transaction_t *transaction, } /* - * Wait until running transaction passes T_LOCKED state. Also starts the commit - * if needed. The function expects running transaction to exist and releases - * j_state_lock. + * Wait until running transaction passes to T_FLUSH state and new transaction + * can thus be started. Also starts the commit if needed. The function expects + * running transaction to exist and releases j_state_lock. */ static void wait_transaction_locked(journal_t *journal) __releases(journal->j_state_lock) @@ -160,6 +160,32 @@ static void wait_transaction_locked(journal_t *journal) finish_wait(&journal->j_wait_transaction_locked, &wait); } +/* + * Wait until running transaction transitions from T_SWITCH to T_FLUSH + * state and new transaction can thus be started. The function releases + * j_state_lock. + */ +static void wait_transaction_switching(journal_t *journal) + __releases(journal->j_state_lock) +{ + DEFINE_WAIT(wait); + + if (WARN_ON(!journal->j_running_transaction || + journal->j_running_transaction->t_state != T_SWITCH)) + return; + prepare_to_wait(&journal->j_wait_transaction_locked, &wait, + TASK_UNINTERRUPTIBLE); + read_unlock(&journal->j_state_lock); + /* + * We don't call jbd2_might_wait_for_commit() here as there's no + * waiting for outstanding handles happening anymore in T_SWITCH state + * and handling of reserved handles actually relies on that for + * correctness. + */ + schedule(); + finish_wait(&journal->j_wait_transaction_locked, &wait); +} + static void sub_reserved_credits(journal_t *journal, int blocks) { atomic_sub(blocks, &journal->j_reserved_credits); @@ -183,7 +209,8 @@ static int add_transaction_credits(journal_t *journal, int blocks, * If the current transaction is locked down for commit, wait * for the lock to be released. */ - if (t->t_state == T_LOCKED) { + if (t->t_state != T_RUNNING) { + WARN_ON_ONCE(t->t_state >= T_FLUSH); wait_transaction_locked(journal); return 1; } @@ -360,8 +387,14 @@ repeat: /* * We have handle reserved so we are allowed to join T_LOCKED * transaction and we don't have to check for transaction size - * and journal space. + * and journal space. But we still have to wait while running + * transaction is being switched to a committing one as it + * won't wait for any handles anymore. */ + if (transaction->t_state == T_SWITCH) { + wait_transaction_switching(journal); + goto repeat; + } sub_reserved_credits(journal, blocks); handle->h_reserved = 0; } @@ -910,7 +943,7 @@ repeat: * this is the first time this transaction is touching this buffer, * reset the modified flag */ - jh->b_modified = 0; + jh->b_modified = 0; /* * If the buffer is not journaled right now, we need to make sure it diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h @@ -575,6 +575,7 @@ struct transaction_s enum { T_RUNNING, T_LOCKED, + T_SWITCH, T_FLUSH, T_COMMIT, T_COMMIT_DFLUSH, @@ -662,13 +663,13 @@ struct transaction_s /* * Number of outstanding updates running on this transaction - * [t_handle_lock] + * [none] */ atomic_t t_updates; /* * Number of buffers reserved for use by all handles in this transaction - * handle but not yet modified. [t_handle_lock] + * handle but not yet modified. [none] */ atomic_t t_outstanding_credits; @@ -690,7 +691,7 @@ struct transaction_s ktime_t t_start_time; /* - * How many handles used this transaction? [t_handle_lock] + * How many handles used this transaction? [none] */ atomic_t t_handle_count; diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h @@ -226,6 +226,26 @@ TRACE_EVENT(ext4_drop_inode, (unsigned long) __entry->ino, __entry->drop) ); +TRACE_EVENT(ext4_nfs_commit_metadata, + TP_PROTO(struct inode *inode), + + TP_ARGS(inode), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( ino_t, ino ) + ), + + TP_fast_assign( + __entry->dev = inode->i_sb->s_dev; + __entry->ino = inode->i_ino; + ), + + TP_printk("dev %d,%d ino %lu", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long) __entry->ino) +); + TRACE_EVENT(ext4_mark_inode_dirty, TP_PROTO(struct inode *inode, unsigned long IP),