whiterose

linux unikernel
Log | Files | Refs | README | LICENSE | git clone https://git.ne02ptzero.me/git/whiterose

commit 9e1fd794cb6bf813a40849a1fc236703bdcbc1a7
parent b1e243957e9b3ba8e820fb8583bdf18e7c737aa2
Author: Linus Torvalds <torvalds@linux-foundation.org>
Date:   Thu,  7 Mar 2019 09:38:51 -0800

Merge tag 'xfs-5.1-merge-4' of git://git.kernel.org/pub/scm/fs/xfs/xfs-linux

Pull xfs updates from Darrick Wong:
 "Here are a number of new features and bug fixes for 5.1

  They've undergone a week's worth of fstesting and merge cleanly with
  master as of this morning

  Most of the changes center on improving metadata validation and fixing
  problems with online fsck, though there's also a new cache to speed up
  unlinked inode handling and cleanup of the copy on write code in
  preparation for future features

  Changes for Linux 5.1:

   - Fix online fsck to handle inode btrees correctly on 64k block
     filesystems

   - Teach online fsck to check directory and attribute names for
     invalid characters

   - Miscellanous fixes for online fsck

   - Introduce a new panic mask so that we can halt immediately on
     metadata corruption (for debugging purposes)

   - Fix a block mapping race during writeback

   - Cache unlinked inode list backrefs in memory to speed up list
     processing

   - Separate the bnobt/cntbt and inobt/finobt buffer verifiers so that
     we can detect crosslinked btrees

   - Refactor magic number verification so that we can standardize it

   - Strengthen ondisk metadata structure offset build time verification

   - Fix a memory corruption problem in the listxattr code

   - Fix a shutdown problem during log recovery due to unreserved finobt
     expansion

   - Fix a referential integrity problem where O_TMPFILE inodes were put
     on the unlinked list with nlink > 0 which would cause asserts
     during log recovery if the system went down immediately

   - Refactor the delayed allocation allocator to be more clever about
     the possibility that its mapping might be stale

   - Various fixes to the copy on write mechanism

   - Make CoW preallocation suitable for use even with writes that
     wouldn't otherwise require it

   - Refactor an internal API

   - Fix some statx implementation bugs

   - Fix miscellaneous compiler and static checker complaints"

* tag 'xfs-5.1-merge-4' of git://git.kernel.org/pub/scm/fs/xfs/xfs-linux: (70 commits)
  xfs: fix reporting supported extra file attributes for statx()
  xfs: fix backwards endian conversion in scrub
  xfs: fix uninitialized error variables
  xfs: rework breaking of shared extents in xfs_file_iomap_begin
  xfs: don't pass iomap flags to xfs_reflink_allocate_cow
  xfs: fix uninitialized error variable
  xfs: introduce an always_cow mode
  xfs: report IOMAP_F_SHARED from xfs_file_iomap_begin_delay
  xfs: make COW fork unwritten extent conversions more robust
  xfs: merge COW handling into xfs_file_iomap_begin_delay
  xfs: also truncate holes covered by COW blocks
  xfs: don't use delalloc extents for COW on files with extsize hints
  xfs: fix SEEK_DATA for speculative COW fork preallocation
  xfs: make xfs_bmbt_to_iomap more useful
  xfs: fix xfs_buf magic number endian checks
  xfs: retry COW fork delalloc conversion when no extent was found
  xfs: remove the truncate short cut in xfs_map_blocks
  xfs: move xfs_iomap_write_allocate to xfs_aops.c
  xfs: move stat accounting to xfs_bmapi_convert_delalloc
  xfs: move transaction handling to xfs_bmapi_convert_delalloc
  ..

Diffstat:
MDocumentation/filesystems/xfs.txt | 3++-
Mfs/xfs/libxfs/xfs_ag.c | 6+++---
Mfs/xfs/libxfs/xfs_ag_resv.c | 2+-
Mfs/xfs/libxfs/xfs_alloc.c | 12++++++++----
Mfs/xfs/libxfs/xfs_alloc_btree.c | 74+++++++++++++++++++++++++++++++++++---------------------------------------
Mfs/xfs/libxfs/xfs_attr.c | 17+++++++++++++++++
Mfs/xfs/libxfs/xfs_attr.h | 2+-
Mfs/xfs/libxfs/xfs_attr_leaf.c | 21++++++---------------
Mfs/xfs/libxfs/xfs_attr_remote.c | 8+++++---
Mfs/xfs/libxfs/xfs_bmap.c | 302+++++++++++++++++++++++++++++++++++++++++++++++--------------------------------
Mfs/xfs/libxfs/xfs_bmap.h | 16+++++++---------
Mfs/xfs/libxfs/xfs_bmap_btree.c | 13++++++-------
Mfs/xfs/libxfs/xfs_da_btree.c | 49++++++++++++++++++++++++++++++++++---------------
Mfs/xfs/libxfs/xfs_da_format.h | 3+++
Mfs/xfs/libxfs/xfs_dir2.c | 17+++++++++++++++++
Mfs/xfs/libxfs/xfs_dir2.h | 1+
Mfs/xfs/libxfs/xfs_dir2_block.c | 10+++++-----
Mfs/xfs/libxfs/xfs_dir2_data.c | 12+++++++-----
Mfs/xfs/libxfs/xfs_dir2_leaf.c | 100+++++++++++++++++--------------------------------------------------------------
Mfs/xfs/libxfs/xfs_dir2_node.c | 10+++++-----
Mfs/xfs/libxfs/xfs_dquot_buf.c | 4++++
Mfs/xfs/libxfs/xfs_errortag.h | 4+++-
Mfs/xfs/libxfs/xfs_ialloc.c | 3++-
Mfs/xfs/libxfs/xfs_ialloc_btree.c | 29+++++++++++++++++------------
Mfs/xfs/libxfs/xfs_iext_tree.c | 13++++++-------
Mfs/xfs/libxfs/xfs_inode_buf.c | 11+++++++----
Mfs/xfs/libxfs/xfs_inode_fork.h | 2+-
Mfs/xfs/libxfs/xfs_refcount_btree.c | 3++-
Mfs/xfs/libxfs/xfs_rmap_btree.c | 3++-
Mfs/xfs/libxfs/xfs_sb.c | 7+++++--
Mfs/xfs/libxfs/xfs_shared.h | 4+++-
Mfs/xfs/libxfs/xfs_symlink_remote.c | 3++-
Mfs/xfs/libxfs/xfs_types.c | 24++++++++++++++++++++++++
Mfs/xfs/libxfs/xfs_types.h | 3+++
Mfs/xfs/scrub/agheader.c | 10++++------
Mfs/xfs/scrub/agheader_repair.c | 12+++---------
Mfs/xfs/scrub/attr.c | 11+++++++++++
Mfs/xfs/scrub/bmap.c | 27+++++++++++++++++++++++++++
Mfs/xfs/scrub/dir.c | 6++++++
Mfs/xfs/scrub/ialloc.c | 330++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---------------------
Mfs/xfs/scrub/repair.c | 3++-
Mfs/xfs/scrub/repair.h | 3---
Mfs/xfs/scrub/rtbitmap.c | 5++---
Mfs/xfs/scrub/trace.h | 45+++++++++++++++++++++++++++++++++++++++++++++
Mfs/xfs/xfs_aops.c | 266+++++++++++++++++++++++++++++++++++++++++++++----------------------------------
Mfs/xfs/xfs_aops.h | 24++----------------------
Mfs/xfs/xfs_attr_list.c | 1+
Mfs/xfs/xfs_bmap_util.c | 9+++------
Mfs/xfs/xfs_buf.c | 72++++++++++++++++++++++++++++++++++++++++++++++++++++--------------------
Mfs/xfs/xfs_buf.h | 8+++++++-
Mfs/xfs/xfs_error.c | 6+++++-
Mfs/xfs/xfs_error.h | 1+
Mfs/xfs/xfs_file.c | 31++++++++++++++++++++++---------
Mfs/xfs/xfs_fsops.c | 1+
Mfs/xfs/xfs_globals.c | 2+-
Mfs/xfs/xfs_inode.c | 769+++++++++++++++++++++++++++++++++++++++++++++++++++++++++----------------------
Mfs/xfs/xfs_inode.h | 3+++
Mfs/xfs/xfs_iomap.c | 518++++++++++++++++++++++++++++++++++++++-----------------------------------------
Mfs/xfs/xfs_iomap.h | 7+++----
Mfs/xfs/xfs_iops.c | 21+++++++++++++++++++--
Mfs/xfs/xfs_log_recover.c | 14+++++++++-----
Mfs/xfs/xfs_mount.c | 5+++++
Mfs/xfs/xfs_mount.h | 10+++++++++-
Mfs/xfs/xfs_ondisk.h | 21+++++++++++++++++++++
Mfs/xfs/xfs_pnfs.c | 2+-
Mfs/xfs/xfs_reflink.c | 150+++++++++++++++++++++++++++++++++----------------------------------------------
Mfs/xfs/xfs_reflink.h | 18+++++++++++++++---
Mfs/xfs/xfs_super.c | 22++++++++++++++++++----
Mfs/xfs/xfs_sysctl.h | 1+
Mfs/xfs/xfs_sysfs.c | 24++++++++++++++++++++++++
Mfs/xfs/xfs_trace.h | 115+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----------------
Mfs/xfs/xfs_trans_bmap.c | 1-
Mfs/xfs/xfs_trans_buf.c | 2+-
Mfs/xfs/xfs_trans_extfree.c | 1-
Mfs/xfs/xfs_trans_refcount.c | 1-
Mfs/xfs/xfs_trans_rmap.c | 1-
Mfs/xfs/xfs_xattr.c | 3+++
77 files changed, 2134 insertions(+), 1239 deletions(-)

diff --git a/Documentation/filesystems/xfs.txt b/Documentation/filesystems/xfs.txt @@ -272,7 +272,7 @@ The following sysctls are available for the XFS filesystem: XFS_ERRLEVEL_LOW: 1 XFS_ERRLEVEL_HIGH: 5 - fs.xfs.panic_mask (Min: 0 Default: 0 Max: 255) + fs.xfs.panic_mask (Min: 0 Default: 0 Max: 256) Causes certain error conditions to call BUG(). Value is a bitmask; OR together the tags which represent errors which should cause panics: @@ -285,6 +285,7 @@ The following sysctls are available for the XFS filesystem: XFS_PTAG_SHUTDOWN_IOERROR 0x00000020 XFS_PTAG_SHUTDOWN_LOGERROR 0x00000040 XFS_PTAG_FSBLOCK_ZERO 0x00000080 + XFS_PTAG_VERIFIER_ERROR 0x00000100 This option is intended for debugging only. diff --git a/fs/xfs/libxfs/xfs_ag.c b/fs/xfs/libxfs/xfs_ag.c @@ -339,14 +339,14 @@ xfs_ag_init_headers( { /* BNO root block */ .daddr = XFS_AGB_TO_DADDR(mp, id->agno, XFS_BNO_BLOCK(mp)), .numblks = BTOBB(mp->m_sb.sb_blocksize), - .ops = &xfs_allocbt_buf_ops, + .ops = &xfs_bnobt_buf_ops, .work = &xfs_bnoroot_init, .need_init = true }, { /* CNT root block */ .daddr = XFS_AGB_TO_DADDR(mp, id->agno, XFS_CNT_BLOCK(mp)), .numblks = BTOBB(mp->m_sb.sb_blocksize), - .ops = &xfs_allocbt_buf_ops, + .ops = &xfs_cntbt_buf_ops, .work = &xfs_cntroot_init, .need_init = true }, @@ -361,7 +361,7 @@ xfs_ag_init_headers( { /* FINO root block */ .daddr = XFS_AGB_TO_DADDR(mp, id->agno, XFS_FIBT_BLOCK(mp)), .numblks = BTOBB(mp->m_sb.sb_blocksize), - .ops = &xfs_inobt_buf_ops, + .ops = &xfs_finobt_buf_ops, .work = &xfs_btroot_init, .type = XFS_BTNUM_FINO, .need_init = xfs_sb_version_hasfinobt(&mp->m_sb) diff --git a/fs/xfs/libxfs/xfs_ag_resv.c b/fs/xfs/libxfs/xfs_ag_resv.c @@ -281,7 +281,7 @@ xfs_ag_resv_init( */ ask = used = 0; - mp->m_inotbt_nores = true; + mp->m_finobt_nores = true; error = xfs_refcountbt_calc_reserves(mp, tp, agno, &ask, &used); diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c @@ -568,9 +568,9 @@ xfs_agfl_verify( if (!xfs_sb_version_hascrc(&mp->m_sb)) return NULL; - if (!uuid_equal(&agfl->agfl_uuid, &mp->m_sb.sb_meta_uuid)) + if (!xfs_verify_magic(bp, agfl->agfl_magicnum)) return __this_address; - if (be32_to_cpu(agfl->agfl_magicnum) != XFS_AGFL_MAGIC) + if (!uuid_equal(&agfl->agfl_uuid, &mp->m_sb.sb_meta_uuid)) return __this_address; /* * during growfs operations, the perag is not fully initialised, @@ -643,6 +643,7 @@ xfs_agfl_write_verify( const struct xfs_buf_ops xfs_agfl_buf_ops = { .name = "xfs_agfl", + .magic = { cpu_to_be32(XFS_AGFL_MAGIC), cpu_to_be32(XFS_AGFL_MAGIC) }, .verify_read = xfs_agfl_read_verify, .verify_write = xfs_agfl_write_verify, .verify_struct = xfs_agfl_verify, @@ -2587,8 +2588,10 @@ xfs_agf_verify( return __this_address; } - if (!(agf->agf_magicnum == cpu_to_be32(XFS_AGF_MAGIC) && - XFS_AGF_GOOD_VERSION(be32_to_cpu(agf->agf_versionnum)) && + if (!xfs_verify_magic(bp, agf->agf_magicnum)) + return __this_address; + + if (!(XFS_AGF_GOOD_VERSION(be32_to_cpu(agf->agf_versionnum)) && be32_to_cpu(agf->agf_freeblks) <= be32_to_cpu(agf->agf_length) && be32_to_cpu(agf->agf_flfirst) < xfs_agfl_size(mp) && be32_to_cpu(agf->agf_fllast) < xfs_agfl_size(mp) && @@ -2670,6 +2673,7 @@ xfs_agf_write_verify( const struct xfs_buf_ops xfs_agf_buf_ops = { .name = "xfs_agf", + .magic = { cpu_to_be32(XFS_AGF_MAGIC), cpu_to_be32(XFS_AGF_MAGIC) }, .verify_read = xfs_agf_read_verify, .verify_write = xfs_agf_write_verify, .verify_struct = xfs_agf_verify, diff --git a/fs/xfs/libxfs/xfs_alloc_btree.c b/fs/xfs/libxfs/xfs_alloc_btree.c @@ -297,48 +297,34 @@ xfs_allocbt_verify( struct xfs_perag *pag = bp->b_pag; xfs_failaddr_t fa; unsigned int level; + xfs_btnum_t btnum = XFS_BTNUM_BNOi; + + if (!xfs_verify_magic(bp, block->bb_magic)) + return __this_address; + + if (xfs_sb_version_hascrc(&mp->m_sb)) { + fa = xfs_btree_sblock_v5hdr_verify(bp); + if (fa) + return fa; + } /* - * magic number and level verification - * - * During growfs operations, we can't verify the exact level or owner as - * the perag is not fully initialised and hence not attached to the - * buffer. In this case, check against the maximum tree depth. + * The perag may not be attached during grow operations or fully + * initialized from the AGF during log recovery. Therefore we can only + * check against maximum tree depth from those contexts. * - * Similarly, during log recovery we will have a perag structure - * attached, but the agf information will not yet have been initialised - * from the on disk AGF. Again, we can only check against maximum limits - * in this case. + * Otherwise check against the per-tree limit. Peek at one of the + * verifier magic values to determine the type of tree we're verifying + * against. */ level = be16_to_cpu(block->bb_level); - switch (block->bb_magic) { - case cpu_to_be32(XFS_ABTB_CRC_MAGIC): - fa = xfs_btree_sblock_v5hdr_verify(bp); - if (fa) - return fa; - /* fall through */ - case cpu_to_be32(XFS_ABTB_MAGIC): - if (pag && pag->pagf_init) { - if (level >= pag->pagf_levels[XFS_BTNUM_BNOi]) - return __this_address; - } else if (level >= mp->m_ag_maxlevels) + if (bp->b_ops->magic[0] == cpu_to_be32(XFS_ABTC_MAGIC)) + btnum = XFS_BTNUM_CNTi; + if (pag && pag->pagf_init) { + if (level >= pag->pagf_levels[btnum]) return __this_address; - break; - case cpu_to_be32(XFS_ABTC_CRC_MAGIC): - fa = xfs_btree_sblock_v5hdr_verify(bp); - if (fa) - return fa; - /* fall through */ - case cpu_to_be32(XFS_ABTC_MAGIC): - if (pag && pag->pagf_init) { - if (level >= pag->pagf_levels[XFS_BTNUM_CNTi]) - return __this_address; - } else if (level >= mp->m_ag_maxlevels) - return __this_address; - break; - default: + } else if (level >= mp->m_ag_maxlevels) return __this_address; - } return xfs_btree_sblock_verify(bp, mp->m_alloc_mxr[level != 0]); } @@ -377,13 +363,23 @@ xfs_allocbt_write_verify( } -const struct xfs_buf_ops xfs_allocbt_buf_ops = { - .name = "xfs_allocbt", +const struct xfs_buf_ops xfs_bnobt_buf_ops = { + .name = "xfs_bnobt", + .magic = { cpu_to_be32(XFS_ABTB_MAGIC), + cpu_to_be32(XFS_ABTB_CRC_MAGIC) }, .verify_read = xfs_allocbt_read_verify, .verify_write = xfs_allocbt_write_verify, .verify_struct = xfs_allocbt_verify, }; +const struct xfs_buf_ops xfs_cntbt_buf_ops = { + .name = "xfs_cntbt", + .magic = { cpu_to_be32(XFS_ABTC_MAGIC), + cpu_to_be32(XFS_ABTC_CRC_MAGIC) }, + .verify_read = xfs_allocbt_read_verify, + .verify_write = xfs_allocbt_write_verify, + .verify_struct = xfs_allocbt_verify, +}; STATIC int xfs_bnobt_keys_inorder( @@ -448,7 +444,7 @@ static const struct xfs_btree_ops xfs_bnobt_ops = { .init_rec_from_cur = xfs_allocbt_init_rec_from_cur, .init_ptr_from_cur = xfs_allocbt_init_ptr_from_cur, .key_diff = xfs_bnobt_key_diff, - .buf_ops = &xfs_allocbt_buf_ops, + .buf_ops = &xfs_bnobt_buf_ops, .diff_two_keys = xfs_bnobt_diff_two_keys, .keys_inorder = xfs_bnobt_keys_inorder, .recs_inorder = xfs_bnobt_recs_inorder, @@ -470,7 +466,7 @@ static const struct xfs_btree_ops xfs_cntbt_ops = { .init_rec_from_cur = xfs_allocbt_init_rec_from_cur, .init_ptr_from_cur = xfs_allocbt_init_ptr_from_cur, .key_diff = xfs_cntbt_key_diff, - .buf_ops = &xfs_allocbt_buf_ops, + .buf_ops = &xfs_cntbt_buf_ops, .diff_two_keys = xfs_cntbt_diff_two_keys, .keys_inorder = xfs_cntbt_keys_inorder, .recs_inorder = xfs_cntbt_recs_inorder, diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c @@ -1336,3 +1336,20 @@ xfs_attr_node_get(xfs_da_args_t *args) xfs_da_state_free(state); return retval; } + +/* Returns true if the attribute entry name is valid. */ +bool +xfs_attr_namecheck( + const void *name, + size_t length) +{ + /* + * MAXNAMELEN includes the trailing null, but (name/length) leave it + * out, so use >= for the length check. + */ + if (length >= MAXNAMELEN) + return false; + + /* There shouldn't be any nulls here */ + return !memchr(name, 0, length); +} diff --git a/fs/xfs/libxfs/xfs_attr.h b/fs/xfs/libxfs/xfs_attr.h @@ -145,6 +145,6 @@ int xfs_attr_remove(struct xfs_inode *dp, const unsigned char *name, int flags); int xfs_attr_remove_args(struct xfs_da_args *args); int xfs_attr_list(struct xfs_inode *dp, char *buffer, int bufsize, int flags, struct attrlist_cursor_kern *cursor); - +bool xfs_attr_namecheck(const void *name, size_t length); #endif /* __XFS_ATTR_H__ */ diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c @@ -245,25 +245,14 @@ xfs_attr3_leaf_verify( struct xfs_attr_leaf_entry *entries; uint32_t end; /* must be 32bit - see below */ int i; + xfs_failaddr_t fa; xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &ichdr, leaf); - if (xfs_sb_version_hascrc(&mp->m_sb)) { - struct xfs_da3_node_hdr *hdr3 = bp->b_addr; - - if (ichdr.magic != XFS_ATTR3_LEAF_MAGIC) - return __this_address; + fa = xfs_da3_blkinfo_verify(bp, bp->b_addr); + if (fa) + return fa; - if (!uuid_equal(&hdr3->info.uuid, &mp->m_sb.sb_meta_uuid)) - return __this_address; - if (be64_to_cpu(hdr3->info.blkno) != bp->b_bn) - return __this_address; - if (!xfs_log_check_lsn(mp, be64_to_cpu(hdr3->info.lsn))) - return __this_address; - } else { - if (ichdr.magic != XFS_ATTR_LEAF_MAGIC) - return __this_address; - } /* * In recovery there is a transient state where count == 0 is valid * because we may have transitioned an empty shortform attr to a leaf @@ -369,6 +358,8 @@ xfs_attr3_leaf_read_verify( const struct xfs_buf_ops xfs_attr3_leaf_buf_ops = { .name = "xfs_attr3_leaf", + .magic16 = { cpu_to_be16(XFS_ATTR_LEAF_MAGIC), + cpu_to_be16(XFS_ATTR3_LEAF_MAGIC) }, .verify_read = xfs_attr3_leaf_read_verify, .verify_write = xfs_attr3_leaf_write_verify, .verify_struct = xfs_attr3_leaf_verify, diff --git a/fs/xfs/libxfs/xfs_attr_remote.c b/fs/xfs/libxfs/xfs_attr_remote.c @@ -79,6 +79,7 @@ xfs_attr3_rmt_hdr_ok( static xfs_failaddr_t xfs_attr3_rmt_verify( struct xfs_mount *mp, + struct xfs_buf *bp, void *ptr, int fsbsize, xfs_daddr_t bno) @@ -87,7 +88,7 @@ xfs_attr3_rmt_verify( if (!xfs_sb_version_hascrc(&mp->m_sb)) return __this_address; - if (rmt->rm_magic != cpu_to_be32(XFS_ATTR3_RMT_MAGIC)) + if (!xfs_verify_magic(bp, rmt->rm_magic)) return __this_address; if (!uuid_equal(&rmt->rm_uuid, &mp->m_sb.sb_meta_uuid)) return __this_address; @@ -131,7 +132,7 @@ __xfs_attr3_rmt_read_verify( *failaddr = __this_address; return -EFSBADCRC; } - *failaddr = xfs_attr3_rmt_verify(mp, ptr, blksize, bno); + *failaddr = xfs_attr3_rmt_verify(mp, bp, ptr, blksize, bno); if (*failaddr) return -EFSCORRUPTED; len -= blksize; @@ -193,7 +194,7 @@ xfs_attr3_rmt_write_verify( while (len > 0) { struct xfs_attr3_rmt_hdr *rmt = (struct xfs_attr3_rmt_hdr *)ptr; - fa = xfs_attr3_rmt_verify(mp, ptr, blksize, bno); + fa = xfs_attr3_rmt_verify(mp, bp, ptr, blksize, bno); if (fa) { xfs_verifier_error(bp, -EFSCORRUPTED, fa); return; @@ -220,6 +221,7 @@ xfs_attr3_rmt_write_verify( const struct xfs_buf_ops xfs_attr3_rmt_buf_ops = { .name = "xfs_attr3_rmt", + .magic = { 0, cpu_to_be32(XFS_ATTR3_RMT_MAGIC) }, .verify_read = xfs_attr3_rmt_read_verify, .verify_write = xfs_attr3_rmt_write_verify, .verify_struct = xfs_attr3_rmt_verify_struct, diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c @@ -577,42 +577,44 @@ __xfs_bmap_add_free( */ /* - * Transform a btree format file with only one leaf node, where the - * extents list will fit in the inode, into an extents format file. - * Since the file extents are already in-core, all we have to do is - * give up the space for the btree root and pitch the leaf block. + * Convert the inode format to extent format if it currently is in btree format, + * but the extent list is small enough that it fits into the extent format. + * + * Since the extents are already in-core, all we have to do is give up the space + * for the btree root and pitch the leaf block. */ STATIC int /* error */ xfs_bmap_btree_to_extents( - xfs_trans_t *tp, /* transaction pointer */ - xfs_inode_t *ip, /* incore inode pointer */ - xfs_btree_cur_t *cur, /* btree cursor */ + struct xfs_trans *tp, /* transaction pointer */ + struct xfs_inode *ip, /* incore inode pointer */ + struct xfs_btree_cur *cur, /* btree cursor */ int *logflagsp, /* inode logging flags */ int whichfork) /* data or attr fork */ { - /* REFERENCED */ + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + struct xfs_mount *mp = ip->i_mount; + struct xfs_btree_block *rblock = ifp->if_broot; struct xfs_btree_block *cblock;/* child btree block */ xfs_fsblock_t cbno; /* child block number */ xfs_buf_t *cbp; /* child block's buffer */ int error; /* error return value */ - struct xfs_ifork *ifp; /* inode fork data */ - xfs_mount_t *mp; /* mount point structure */ __be64 *pp; /* ptr to block address */ - struct xfs_btree_block *rblock;/* root btree block */ struct xfs_owner_info oinfo; - mp = ip->i_mount; - ifp = XFS_IFORK_PTR(ip, whichfork); + /* check if we actually need the extent format first: */ + if (!xfs_bmap_wants_extents(ip, whichfork)) + return 0; + + ASSERT(cur); ASSERT(whichfork != XFS_COW_FORK); ASSERT(ifp->if_flags & XFS_IFEXTENTS); ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE); - rblock = ifp->if_broot; ASSERT(be16_to_cpu(rblock->bb_level) == 1); ASSERT(be16_to_cpu(rblock->bb_numrecs) == 1); ASSERT(xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, 0) == 1); + pp = XFS_BMAP_BROOT_PTR_ADDR(mp, rblock, 1, ifp->if_broot_bytes); cbno = be64_to_cpu(*pp); - *logflagsp = 0; #ifdef DEBUG XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, xfs_btree_check_lptr(cur, cbno, 1)); @@ -635,7 +637,7 @@ xfs_bmap_btree_to_extents( ASSERT(ifp->if_broot == NULL); ASSERT((ifp->if_flags & XFS_IFBROOT) == 0); XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_EXTENTS); - *logflagsp = XFS_ILOG_CORE | xfs_ilog_fext(whichfork); + *logflagsp |= XFS_ILOG_CORE | xfs_ilog_fext(whichfork); return 0; } @@ -2029,7 +2031,7 @@ done: /* * Convert an unwritten allocation to a real allocation or vice versa. */ -STATIC int /* error */ +int /* error */ xfs_bmap_add_extent_unwritten_real( struct xfs_trans *tp, xfs_inode_t *ip, /* incore inode pointer */ @@ -3685,17 +3687,6 @@ xfs_trim_extent( } } -/* trim extent to within eof */ -void -xfs_trim_extent_eof( - struct xfs_bmbt_irec *irec, - struct xfs_inode *ip) - -{ - xfs_trim_extent(irec, 0, XFS_B_TO_FSB(ip->i_mount, - i_size_read(VFS_I(ip)))); -} - /* * Trim the returned map to the required bounds */ @@ -4203,6 +4194,44 @@ xfs_bmapi_convert_unwritten( return 0; } +static inline xfs_extlen_t +xfs_bmapi_minleft( + struct xfs_trans *tp, + struct xfs_inode *ip, + int fork) +{ + if (tp && tp->t_firstblock != NULLFSBLOCK) + return 0; + if (XFS_IFORK_FORMAT(ip, fork) != XFS_DINODE_FMT_BTREE) + return 1; + return be16_to_cpu(XFS_IFORK_PTR(ip, fork)->if_broot->bb_level) + 1; +} + +/* + * Log whatever the flags say, even if error. Otherwise we might miss detecting + * a case where the data is changed, there's an error, and it's not logged so we + * don't shutdown when we should. Don't bother logging extents/btree changes if + * we converted to the other format. + */ +static void +xfs_bmapi_finish( + struct xfs_bmalloca *bma, + int whichfork, + int error) +{ + if ((bma->logflags & xfs_ilog_fext(whichfork)) && + XFS_IFORK_FORMAT(bma->ip, whichfork) != XFS_DINODE_FMT_EXTENTS) + bma->logflags &= ~xfs_ilog_fext(whichfork); + else if ((bma->logflags & xfs_ilog_fbroot(whichfork)) && + XFS_IFORK_FORMAT(bma->ip, whichfork) != XFS_DINODE_FMT_BTREE) + bma->logflags &= ~xfs_ilog_fbroot(whichfork); + + if (bma->logflags) + xfs_trans_log_inode(bma->tp, bma->ip, bma->logflags); + if (bma->cur) + xfs_btree_del_cursor(bma->cur, error); +} + /* * Map file blocks to filesystem blocks, and allocate blocks or convert the * extent state if necessary. Details behaviour is controlled by the flags @@ -4247,9 +4276,7 @@ xfs_bmapi_write( ASSERT(*nmap >= 1); ASSERT(*nmap <= XFS_BMAP_MAX_NMAP); - ASSERT(tp != NULL || - (flags & (XFS_BMAPI_CONVERT | XFS_BMAPI_COWFORK)) == - (XFS_BMAPI_CONVERT | XFS_BMAPI_COWFORK)); + ASSERT(tp != NULL); ASSERT(len > 0); ASSERT(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL); ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); @@ -4282,25 +4309,12 @@ xfs_bmapi_write( XFS_STATS_INC(mp, xs_blk_mapw); - if (!tp || tp->t_firstblock == NULLFSBLOCK) { - if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE) - bma.minleft = be16_to_cpu(ifp->if_broot->bb_level) + 1; - else - bma.minleft = 1; - } else { - bma.minleft = 0; - } - if (!(ifp->if_flags & XFS_IFEXTENTS)) { error = xfs_iread_extents(tp, ip, whichfork); if (error) goto error0; } - n = 0; - end = bno + len; - obno = bno; - if (!xfs_iext_lookup_extent(ip, ifp, bno, &bma.icur, &bma.got)) eof = true; if (!xfs_iext_peek_prev_extent(ifp, &bma.icur, &bma.prev)) @@ -4309,7 +4323,11 @@ xfs_bmapi_write( bma.ip = ip; bma.total = total; bma.datatype = 0; + bma.minleft = xfs_bmapi_minleft(tp, ip, whichfork); + n = 0; + end = bno + len; + obno = bno; while (bno < end && n < *nmap) { bool need_alloc = false, wasdelay = false; @@ -4323,26 +4341,7 @@ xfs_bmapi_write( ASSERT(!((flags & XFS_BMAPI_CONVERT) && (flags & XFS_BMAPI_COWFORK))); - if (flags & XFS_BMAPI_DELALLOC) { - /* - * For the COW fork we can reasonably get a - * request for converting an extent that races - * with other threads already having converted - * part of it, as there converting COW to - * regular blocks is not protected using the - * IOLOCK. - */ - ASSERT(flags & XFS_BMAPI_COWFORK); - if (!(flags & XFS_BMAPI_COWFORK)) { - error = -EIO; - goto error0; - } - - if (eof || bno >= end) - break; - } else { - need_alloc = true; - } + need_alloc = true; } else if (isnullstartblock(bma.got.br_startblock)) { wasdelay = true; } @@ -4351,8 +4350,7 @@ xfs_bmapi_write( * First, deal with the hole before the allocated space * that we found, if any. */ - if ((need_alloc || wasdelay) && - !(flags & XFS_BMAPI_CONVERT_ONLY)) { + if (need_alloc || wasdelay) { bma.eof = eof; bma.conv = !!(flags & XFS_BMAPI_CONVERT); bma.wasdel = wasdelay; @@ -4420,49 +4418,130 @@ xfs_bmapi_write( } *nmap = n; - /* - * Transform from btree to extents, give it cur. - */ - if (xfs_bmap_wants_extents(ip, whichfork)) { - int tmp_logflags = 0; - - ASSERT(bma.cur); - error = xfs_bmap_btree_to_extents(tp, ip, bma.cur, - &tmp_logflags, whichfork); - bma.logflags |= tmp_logflags; - if (error) - goto error0; - } + error = xfs_bmap_btree_to_extents(tp, ip, bma.cur, &bma.logflags, + whichfork); + if (error) + goto error0; ASSERT(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE || XFS_IFORK_NEXTENTS(ip, whichfork) > XFS_IFORK_MAXEXT(ip, whichfork)); - error = 0; + xfs_bmapi_finish(&bma, whichfork, 0); + xfs_bmap_validate_ret(orig_bno, orig_len, orig_flags, orig_mval, + orig_nmap, *nmap); + return 0; error0: + xfs_bmapi_finish(&bma, whichfork, error); + return error; +} + +/* + * Convert an existing delalloc extent to real blocks based on file offset. This + * attempts to allocate the entire delalloc extent and may require multiple + * invocations to allocate the target offset if a large enough physical extent + * is not available. + */ +int +xfs_bmapi_convert_delalloc( + struct xfs_inode *ip, + int whichfork, + xfs_fileoff_t offset_fsb, + struct xfs_bmbt_irec *imap, + unsigned int *seq) +{ + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + struct xfs_mount *mp = ip->i_mount; + struct xfs_bmalloca bma = { NULL }; + struct xfs_trans *tp; + int error; + /* - * Log everything. Do this after conversion, there's no point in - * logging the extent records if we've converted to btree format. + * Space for the extent and indirect blocks was reserved when the + * delalloc extent was created so there's no need to do so here. */ - if ((bma.logflags & xfs_ilog_fext(whichfork)) && - XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS) - bma.logflags &= ~xfs_ilog_fext(whichfork); - else if ((bma.logflags & xfs_ilog_fbroot(whichfork)) && - XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE) - bma.logflags &= ~xfs_ilog_fbroot(whichfork); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, 0, 0, + XFS_TRANS_RESERVE, &tp); + if (error) + return error; + + xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, 0); + + if (!xfs_iext_lookup_extent(ip, ifp, offset_fsb, &bma.icur, &bma.got) || + bma.got.br_startoff > offset_fsb) { + /* + * No extent found in the range we are trying to convert. This + * should only happen for the COW fork, where another thread + * might have moved the extent to the data fork in the meantime. + */ + WARN_ON_ONCE(whichfork != XFS_COW_FORK); + error = -EAGAIN; + goto out_trans_cancel; + } + /* - * Log whatever the flags say, even if error. Otherwise we might miss - * detecting a case where the data is changed, there's an error, - * and it's not logged so we don't shutdown when we should. + * If we find a real extent here we raced with another thread converting + * the extent. Just return the real extent at this offset. */ - if (bma.logflags) - xfs_trans_log_inode(tp, ip, bma.logflags); + if (!isnullstartblock(bma.got.br_startblock)) { + *imap = bma.got; + *seq = READ_ONCE(ifp->if_seq); + goto out_trans_cancel; + } + + bma.tp = tp; + bma.ip = ip; + bma.wasdel = true; + bma.offset = bma.got.br_startoff; + bma.length = max_t(xfs_filblks_t, bma.got.br_blockcount, MAXEXTLEN); + bma.total = XFS_EXTENTADD_SPACE_RES(ip->i_mount, XFS_DATA_FORK); + bma.minleft = xfs_bmapi_minleft(tp, ip, whichfork); + if (whichfork == XFS_COW_FORK) + bma.flags = XFS_BMAPI_COWFORK | XFS_BMAPI_PREALLOC; - if (bma.cur) { - xfs_btree_del_cursor(bma.cur, error); + if (!xfs_iext_peek_prev_extent(ifp, &bma.icur, &bma.prev)) + bma.prev.br_startoff = NULLFILEOFF; + + error = xfs_bmapi_allocate(&bma); + if (error) + goto out_finish; + + error = -ENOSPC; + if (WARN_ON_ONCE(bma.blkno == NULLFSBLOCK)) + goto out_finish; + error = -EFSCORRUPTED; + if (WARN_ON_ONCE(!bma.got.br_startblock && !XFS_IS_REALTIME_INODE(ip))) + goto out_finish; + + XFS_STATS_ADD(mp, xs_xstrat_bytes, XFS_FSB_TO_B(mp, bma.length)); + XFS_STATS_INC(mp, xs_xstrat_quick); + + ASSERT(!isnullstartblock(bma.got.br_startblock)); + *imap = bma.got; + *seq = READ_ONCE(ifp->if_seq); + + if (whichfork == XFS_COW_FORK) { + error = xfs_refcount_alloc_cow_extent(tp, bma.blkno, + bma.length); + if (error) + goto out_finish; } - if (!error) - xfs_bmap_validate_ret(orig_bno, orig_len, orig_flags, orig_mval, - orig_nmap, *nmap); + + error = xfs_bmap_btree_to_extents(tp, ip, bma.cur, &bma.logflags, + whichfork); + if (error) + goto out_finish; + + xfs_bmapi_finish(&bma, whichfork, 0); + error = xfs_trans_commit(tp); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + return error; + +out_finish: + xfs_bmapi_finish(&bma, whichfork, error); +out_trans_cancel: + xfs_trans_cancel(tp); + xfs_iunlock(ip, XFS_ILOCK_EXCL); return error; } @@ -4536,13 +4615,7 @@ xfs_bmapi_remap( if (error) goto error0; - if (xfs_bmap_wants_extents(ip, whichfork)) { - int tmp_logflags = 0; - - error = xfs_bmap_btree_to_extents(tp, ip, cur, - &tmp_logflags, whichfork); - logflags |= tmp_logflags; - } + error = xfs_bmap_btree_to_extents(tp, ip, cur, &logflags, whichfork); error0: if (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS) @@ -5406,24 +5479,11 @@ nodelete: error = xfs_bmap_extents_to_btree(tp, ip, &cur, 0, &tmp_logflags, whichfork); logflags |= tmp_logflags; - if (error) - goto error0; - } - /* - * transform from btree to extents, give it cur - */ - else if (xfs_bmap_wants_extents(ip, whichfork)) { - ASSERT(cur != NULL); - error = xfs_bmap_btree_to_extents(tp, ip, cur, &tmp_logflags, + } else { + error = xfs_bmap_btree_to_extents(tp, ip, cur, &logflags, whichfork); - logflags |= tmp_logflags; - if (error) - goto error0; } - /* - * transform from extents to local? - */ - error = 0; + error0: /* * Log everything. Do this after conversion, there's no point in diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h @@ -95,12 +95,6 @@ struct xfs_extent_free_item /* Map something in the CoW fork. */ #define XFS_BMAPI_COWFORK 0x200 -/* Only convert delalloc space, don't allocate entirely new extents */ -#define XFS_BMAPI_DELALLOC 0x400 - -/* Only convert unwritten extents, don't allocate new blocks */ -#define XFS_BMAPI_CONVERT_ONLY 0x800 - /* Skip online discard of freed extents */ #define XFS_BMAPI_NODISCARD 0x1000 @@ -117,8 +111,6 @@ struct xfs_extent_free_item { XFS_BMAPI_ZERO, "ZERO" }, \ { XFS_BMAPI_REMAP, "REMAP" }, \ { XFS_BMAPI_COWFORK, "COWFORK" }, \ - { XFS_BMAPI_DELALLOC, "DELALLOC" }, \ - { XFS_BMAPI_CONVERT_ONLY, "CONVERT_ONLY" }, \ { XFS_BMAPI_NODISCARD, "NODISCARD" }, \ { XFS_BMAPI_NORMAP, "NORMAP" } @@ -181,7 +173,6 @@ static inline bool xfs_bmap_is_real_extent(struct xfs_bmbt_irec *irec) void xfs_trim_extent(struct xfs_bmbt_irec *irec, xfs_fileoff_t bno, xfs_filblks_t len); -void xfs_trim_extent_eof(struct xfs_bmbt_irec *, struct xfs_inode *); int xfs_bmap_add_attrfork(struct xfs_inode *ip, int size, int rsvd); int xfs_bmap_set_attrforkoff(struct xfs_inode *ip, int size, int *version); void xfs_bmap_local_to_extents_empty(struct xfs_inode *ip, int whichfork); @@ -228,6 +219,13 @@ int xfs_bmapi_reserve_delalloc(struct xfs_inode *ip, int whichfork, xfs_fileoff_t off, xfs_filblks_t len, xfs_filblks_t prealloc, struct xfs_bmbt_irec *got, struct xfs_iext_cursor *cur, int eof); +int xfs_bmapi_convert_delalloc(struct xfs_inode *ip, int whichfork, + xfs_fileoff_t offset_fsb, struct xfs_bmbt_irec *imap, + unsigned int *seq); +int xfs_bmap_add_extent_unwritten_real(struct xfs_trans *tp, + struct xfs_inode *ip, int whichfork, + struct xfs_iext_cursor *icur, struct xfs_btree_cur **curp, + struct xfs_bmbt_irec *new, int *logflagsp); static inline void xfs_bmap_add_free( diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c @@ -416,8 +416,10 @@ xfs_bmbt_verify( xfs_failaddr_t fa; unsigned int level; - switch (block->bb_magic) { - case cpu_to_be32(XFS_BMAP_CRC_MAGIC): + if (!xfs_verify_magic(bp, block->bb_magic)) + return __this_address; + + if (xfs_sb_version_hascrc(&mp->m_sb)) { /* * XXX: need a better way of verifying the owner here. Right now * just make sure there has been one set. @@ -425,11 +427,6 @@ xfs_bmbt_verify( fa = xfs_btree_lblock_v5hdr_verify(bp, XFS_RMAP_OWN_UNKNOWN); if (fa) return fa; - /* fall through */ - case cpu_to_be32(XFS_BMAP_MAGIC): - break; - default: - return __this_address; } /* @@ -481,6 +478,8 @@ xfs_bmbt_write_verify( const struct xfs_buf_ops xfs_bmbt_buf_ops = { .name = "xfs_bmbt", + .magic = { cpu_to_be32(XFS_BMAP_MAGIC), + cpu_to_be32(XFS_BMAP_CRC_MAGIC) }, .verify_read = xfs_bmbt_read_verify, .verify_write = xfs_bmbt_write_verify, .verify_struct = xfs_bmbt_verify, diff --git a/fs/xfs/libxfs/xfs_da_btree.c b/fs/xfs/libxfs/xfs_da_btree.c @@ -116,6 +116,34 @@ xfs_da_state_free(xfs_da_state_t *state) kmem_zone_free(xfs_da_state_zone, state); } +/* + * Verify an xfs_da3_blkinfo structure. Note that the da3 fields are only + * accessible on v5 filesystems. This header format is common across da node, + * attr leaf and dir leaf blocks. + */ +xfs_failaddr_t +xfs_da3_blkinfo_verify( + struct xfs_buf *bp, + struct xfs_da3_blkinfo *hdr3) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_da_blkinfo *hdr = &hdr3->hdr; + + if (!xfs_verify_magic16(bp, hdr->magic)) + return __this_address; + + if (xfs_sb_version_hascrc(&mp->m_sb)) { + if (!uuid_equal(&hdr3->uuid, &mp->m_sb.sb_meta_uuid)) + return __this_address; + if (be64_to_cpu(hdr3->blkno) != bp->b_bn) + return __this_address; + if (!xfs_log_check_lsn(mp, be64_to_cpu(hdr3->lsn))) + return __this_address; + } + + return NULL; +} + static xfs_failaddr_t xfs_da3_node_verify( struct xfs_buf *bp) @@ -124,27 +152,16 @@ xfs_da3_node_verify( struct xfs_da_intnode *hdr = bp->b_addr; struct xfs_da3_icnode_hdr ichdr; const struct xfs_dir_ops *ops; + xfs_failaddr_t fa; ops = xfs_dir_get_ops(mp, NULL); ops->node_hdr_from_disk(&ichdr, hdr); - if (xfs_sb_version_hascrc(&mp->m_sb)) { - struct xfs_da3_node_hdr *hdr3 = bp->b_addr; - - if (ichdr.magic != XFS_DA3_NODE_MAGIC) - return __this_address; + fa = xfs_da3_blkinfo_verify(bp, bp->b_addr); + if (fa) + return fa; - if (!uuid_equal(&hdr3->info.uuid, &mp->m_sb.sb_meta_uuid)) - return __this_address; - if (be64_to_cpu(hdr3->info.blkno) != bp->b_bn) - return __this_address; - if (!xfs_log_check_lsn(mp, be64_to_cpu(hdr3->info.lsn))) - return __this_address; - } else { - if (ichdr.magic != XFS_DA_NODE_MAGIC) - return __this_address; - } if (ichdr.level == 0) return __this_address; if (ichdr.level > XFS_DA_NODE_MAXDEPTH) @@ -257,6 +274,8 @@ xfs_da3_node_verify_struct( const struct xfs_buf_ops xfs_da3_node_buf_ops = { .name = "xfs_da3_node", + .magic16 = { cpu_to_be16(XFS_DA_NODE_MAGIC), + cpu_to_be16(XFS_DA3_NODE_MAGIC) }, .verify_read = xfs_da3_node_read_verify, .verify_write = xfs_da3_node_write_verify, .verify_struct = xfs_da3_node_verify_struct, diff --git a/fs/xfs/libxfs/xfs_da_format.h b/fs/xfs/libxfs/xfs_da_format.h @@ -869,4 +869,7 @@ static inline unsigned int xfs_dir2_dirblock_bytes(struct xfs_sb *sbp) return 1 << (sbp->sb_blocklog + sbp->sb_dirblklog); } +xfs_failaddr_t xfs_da3_blkinfo_verify(struct xfs_buf *bp, + struct xfs_da3_blkinfo *hdr3); + #endif /* __XFS_DA_FORMAT_H__ */ diff --git a/fs/xfs/libxfs/xfs_dir2.c b/fs/xfs/libxfs/xfs_dir2.c @@ -703,3 +703,20 @@ xfs_dir2_shrink_inode( xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE); return 0; } + +/* Returns true if the directory entry name is valid. */ +bool +xfs_dir2_namecheck( + const void *name, + size_t length) +{ + /* + * MAXNAMELEN includes the trailing null, but (name/length) leave it + * out, so use >= for the length check. + */ + if (length >= MAXNAMELEN) + return false; + + /* There shouldn't be any slashes or nulls here */ + return !memchr(name, '/', length) && !memchr(name, 0, length); +} diff --git a/fs/xfs/libxfs/xfs_dir2.h b/fs/xfs/libxfs/xfs_dir2.h @@ -326,5 +326,6 @@ xfs_dir2_leaf_tail_p(struct xfs_da_geometry *geo, struct xfs_dir2_leaf *lp) unsigned char xfs_dir3_get_dtype(struct xfs_mount *mp, uint8_t filetype); void *xfs_dir3_data_endp(struct xfs_da_geometry *geo, struct xfs_dir2_data_hdr *hdr); +bool xfs_dir2_namecheck(const void *name, size_t length); #endif /* __XFS_DIR2_H__ */ diff --git a/fs/xfs/libxfs/xfs_dir2_block.c b/fs/xfs/libxfs/xfs_dir2_block.c @@ -53,18 +53,16 @@ xfs_dir3_block_verify( struct xfs_mount *mp = bp->b_target->bt_mount; struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr; + if (!xfs_verify_magic(bp, hdr3->magic)) + return __this_address; + if (xfs_sb_version_hascrc(&mp->m_sb)) { - if (hdr3->magic != cpu_to_be32(XFS_DIR3_BLOCK_MAGIC)) - return __this_address; if (!uuid_equal(&hdr3->uuid, &mp->m_sb.sb_meta_uuid)) return __this_address; if (be64_to_cpu(hdr3->blkno) != bp->b_bn) return __this_address; if (!xfs_log_check_lsn(mp, be64_to_cpu(hdr3->lsn))) return __this_address; - } else { - if (hdr3->magic != cpu_to_be32(XFS_DIR2_BLOCK_MAGIC)) - return __this_address; } return __xfs_dir3_data_check(NULL, bp); } @@ -112,6 +110,8 @@ xfs_dir3_block_write_verify( const struct xfs_buf_ops xfs_dir3_block_buf_ops = { .name = "xfs_dir3_block", + .magic = { cpu_to_be32(XFS_DIR2_BLOCK_MAGIC), + cpu_to_be32(XFS_DIR3_BLOCK_MAGIC) }, .verify_read = xfs_dir3_block_read_verify, .verify_write = xfs_dir3_block_write_verify, .verify_struct = xfs_dir3_block_verify, diff --git a/fs/xfs/libxfs/xfs_dir2_data.c b/fs/xfs/libxfs/xfs_dir2_data.c @@ -252,18 +252,16 @@ xfs_dir3_data_verify( struct xfs_mount *mp = bp->b_target->bt_mount; struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr; + if (!xfs_verify_magic(bp, hdr3->magic)) + return __this_address; + if (xfs_sb_version_hascrc(&mp->m_sb)) { - if (hdr3->magic != cpu_to_be32(XFS_DIR3_DATA_MAGIC)) - return __this_address; if (!uuid_equal(&hdr3->uuid, &mp->m_sb.sb_meta_uuid)) return __this_address; if (be64_to_cpu(hdr3->blkno) != bp->b_bn) return __this_address; if (!xfs_log_check_lsn(mp, be64_to_cpu(hdr3->lsn))) return __this_address; - } else { - if (hdr3->magic != cpu_to_be32(XFS_DIR2_DATA_MAGIC)) - return __this_address; } return __xfs_dir3_data_check(NULL, bp); } @@ -339,6 +337,8 @@ xfs_dir3_data_write_verify( const struct xfs_buf_ops xfs_dir3_data_buf_ops = { .name = "xfs_dir3_data", + .magic = { cpu_to_be32(XFS_DIR2_DATA_MAGIC), + cpu_to_be32(XFS_DIR3_DATA_MAGIC) }, .verify_read = xfs_dir3_data_read_verify, .verify_write = xfs_dir3_data_write_verify, .verify_struct = xfs_dir3_data_verify, @@ -346,6 +346,8 @@ const struct xfs_buf_ops xfs_dir3_data_buf_ops = { static const struct xfs_buf_ops xfs_dir3_data_reada_buf_ops = { .name = "xfs_dir3_data_reada", + .magic = { cpu_to_be32(XFS_DIR2_DATA_MAGIC), + cpu_to_be32(XFS_DIR3_DATA_MAGIC) }, .verify_read = xfs_dir3_data_reada_verify, .verify_write = xfs_dir3_data_write_verify, }; diff --git a/fs/xfs/libxfs/xfs_dir2_leaf.c b/fs/xfs/libxfs/xfs_dir2_leaf.c @@ -142,41 +142,22 @@ xfs_dir3_leaf_check_int( */ static xfs_failaddr_t xfs_dir3_leaf_verify( - struct xfs_buf *bp, - uint16_t magic) + struct xfs_buf *bp) { struct xfs_mount *mp = bp->b_target->bt_mount; struct xfs_dir2_leaf *leaf = bp->b_addr; + xfs_failaddr_t fa; - ASSERT(magic == XFS_DIR2_LEAF1_MAGIC || magic == XFS_DIR2_LEAFN_MAGIC); - - if (xfs_sb_version_hascrc(&mp->m_sb)) { - struct xfs_dir3_leaf_hdr *leaf3 = bp->b_addr; - uint16_t magic3; - - magic3 = (magic == XFS_DIR2_LEAF1_MAGIC) ? XFS_DIR3_LEAF1_MAGIC - : XFS_DIR3_LEAFN_MAGIC; - - if (leaf3->info.hdr.magic != cpu_to_be16(magic3)) - return __this_address; - if (!uuid_equal(&leaf3->info.uuid, &mp->m_sb.sb_meta_uuid)) - return __this_address; - if (be64_to_cpu(leaf3->info.blkno) != bp->b_bn) - return __this_address; - if (!xfs_log_check_lsn(mp, be64_to_cpu(leaf3->info.lsn))) - return __this_address; - } else { - if (leaf->hdr.info.magic != cpu_to_be16(magic)) - return __this_address; - } + fa = xfs_da3_blkinfo_verify(bp, bp->b_addr); + if (fa) + return fa; return xfs_dir3_leaf_check_int(mp, NULL, NULL, leaf); } static void -__read_verify( - struct xfs_buf *bp, - uint16_t magic) +xfs_dir3_leaf_read_verify( + struct xfs_buf *bp) { struct xfs_mount *mp = bp->b_target->bt_mount; xfs_failaddr_t fa; @@ -185,23 +166,22 @@ __read_verify( !xfs_buf_verify_cksum(bp, XFS_DIR3_LEAF_CRC_OFF)) xfs_verifier_error(bp, -EFSBADCRC, __this_address); else { - fa = xfs_dir3_leaf_verify(bp, magic); + fa = xfs_dir3_leaf_verify(bp); if (fa) xfs_verifier_error(bp, -EFSCORRUPTED, fa); } } static void -__write_verify( - struct xfs_buf *bp, - uint16_t magic) +xfs_dir3_leaf_write_verify( + struct xfs_buf *bp) { struct xfs_mount *mp = bp->b_target->bt_mount; struct xfs_buf_log_item *bip = bp->b_log_item; struct xfs_dir3_leaf_hdr *hdr3 = bp->b_addr; xfs_failaddr_t fa; - fa = xfs_dir3_leaf_verify(bp, magic); + fa = xfs_dir3_leaf_verify(bp); if (fa) { xfs_verifier_error(bp, -EFSCORRUPTED, fa); return; @@ -216,60 +196,22 @@ __write_verify( xfs_buf_update_cksum(bp, XFS_DIR3_LEAF_CRC_OFF); } -static xfs_failaddr_t -xfs_dir3_leaf1_verify( - struct xfs_buf *bp) -{ - return xfs_dir3_leaf_verify(bp, XFS_DIR2_LEAF1_MAGIC); -} - -static void -xfs_dir3_leaf1_read_verify( - struct xfs_buf *bp) -{ - __read_verify(bp, XFS_DIR2_LEAF1_MAGIC); -} - -static void -xfs_dir3_leaf1_write_verify( - struct xfs_buf *bp) -{ - __write_verify(bp, XFS_DIR2_LEAF1_MAGIC); -} - -static xfs_failaddr_t -xfs_dir3_leafn_verify( - struct xfs_buf *bp) -{ - return xfs_dir3_leaf_verify(bp, XFS_DIR2_LEAFN_MAGIC); -} - -static void -xfs_dir3_leafn_read_verify( - struct xfs_buf *bp) -{ - __read_verify(bp, XFS_DIR2_LEAFN_MAGIC); -} - -static void -xfs_dir3_leafn_write_verify( - struct xfs_buf *bp) -{ - __write_verify(bp, XFS_DIR2_LEAFN_MAGIC); -} - const struct xfs_buf_ops xfs_dir3_leaf1_buf_ops = { .name = "xfs_dir3_leaf1", - .verify_read = xfs_dir3_leaf1_read_verify, - .verify_write = xfs_dir3_leaf1_write_verify, - .verify_struct = xfs_dir3_leaf1_verify, + .magic16 = { cpu_to_be16(XFS_DIR2_LEAF1_MAGIC), + cpu_to_be16(XFS_DIR3_LEAF1_MAGIC) }, + .verify_read = xfs_dir3_leaf_read_verify, + .verify_write = xfs_dir3_leaf_write_verify, + .verify_struct = xfs_dir3_leaf_verify, }; const struct xfs_buf_ops xfs_dir3_leafn_buf_ops = { .name = "xfs_dir3_leafn", - .verify_read = xfs_dir3_leafn_read_verify, - .verify_write = xfs_dir3_leafn_write_verify, - .verify_struct = xfs_dir3_leafn_verify, + .magic16 = { cpu_to_be16(XFS_DIR2_LEAFN_MAGIC), + cpu_to_be16(XFS_DIR3_LEAFN_MAGIC) }, + .verify_read = xfs_dir3_leaf_read_verify, + .verify_write = xfs_dir3_leaf_write_verify, + .verify_struct = xfs_dir3_leaf_verify, }; int diff --git a/fs/xfs/libxfs/xfs_dir2_node.c b/fs/xfs/libxfs/xfs_dir2_node.c @@ -87,20 +87,18 @@ xfs_dir3_free_verify( struct xfs_mount *mp = bp->b_target->bt_mount; struct xfs_dir2_free_hdr *hdr = bp->b_addr; + if (!xfs_verify_magic(bp, hdr->magic)) + return __this_address; + if (xfs_sb_version_hascrc(&mp->m_sb)) { struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr; - if (hdr3->magic != cpu_to_be32(XFS_DIR3_FREE_MAGIC)) - return __this_address; if (!uuid_equal(&hdr3->uuid, &mp->m_sb.sb_meta_uuid)) return __this_address; if (be64_to_cpu(hdr3->blkno) != bp->b_bn) return __this_address; if (!xfs_log_check_lsn(mp, be64_to_cpu(hdr3->lsn))) return __this_address; - } else { - if (hdr->magic != cpu_to_be32(XFS_DIR2_FREE_MAGIC)) - return __this_address; } /* XXX: should bounds check the xfs_dir3_icfree_hdr here */ @@ -151,6 +149,8 @@ xfs_dir3_free_write_verify( const struct xfs_buf_ops xfs_dir3_free_buf_ops = { .name = "xfs_dir3_free", + .magic = { cpu_to_be32(XFS_DIR2_FREE_MAGIC), + cpu_to_be32(XFS_DIR3_FREE_MAGIC) }, .verify_read = xfs_dir3_free_read_verify, .verify_write = xfs_dir3_free_write_verify, .verify_struct = xfs_dir3_free_verify, diff --git a/fs/xfs/libxfs/xfs_dquot_buf.c b/fs/xfs/libxfs/xfs_dquot_buf.c @@ -277,6 +277,8 @@ xfs_dquot_buf_write_verify( const struct xfs_buf_ops xfs_dquot_buf_ops = { .name = "xfs_dquot", + .magic16 = { cpu_to_be16(XFS_DQUOT_MAGIC), + cpu_to_be16(XFS_DQUOT_MAGIC) }, .verify_read = xfs_dquot_buf_read_verify, .verify_write = xfs_dquot_buf_write_verify, .verify_struct = xfs_dquot_buf_verify_struct, @@ -284,6 +286,8 @@ const struct xfs_buf_ops xfs_dquot_buf_ops = { const struct xfs_buf_ops xfs_dquot_buf_ra_ops = { .name = "xfs_dquot_ra", + .magic16 = { cpu_to_be16(XFS_DQUOT_MAGIC), + cpu_to_be16(XFS_DQUOT_MAGIC) }, .verify_read = xfs_dquot_buf_readahead_verify, .verify_write = xfs_dquot_buf_write_verify, }; diff --git a/fs/xfs/libxfs/xfs_errortag.h b/fs/xfs/libxfs/xfs_errortag.h @@ -54,7 +54,8 @@ #define XFS_ERRTAG_BUF_LRU_REF 31 #define XFS_ERRTAG_FORCE_SCRUB_REPAIR 32 #define XFS_ERRTAG_FORCE_SUMMARY_RECALC 33 -#define XFS_ERRTAG_MAX 34 +#define XFS_ERRTAG_IUNLINK_FALLBACK 34 +#define XFS_ERRTAG_MAX 35 /* * Random factors for above tags, 1 means always, 2 means 1/2 time, etc. @@ -93,5 +94,6 @@ #define XFS_RANDOM_BUF_LRU_REF 2 #define XFS_RANDOM_FORCE_SCRUB_REPAIR 1 #define XFS_RANDOM_FORCE_SUMMARY_RECALC 1 +#define XFS_RANDOM_IUNLINK_FALLBACK (XFS_RANDOM_DEFAULT/10) #endif /* __XFS_ERRORTAG_H_ */ diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c @@ -2508,7 +2508,7 @@ xfs_agi_verify( /* * Validate the magic number of the agi block. */ - if (agi->agi_magicnum != cpu_to_be32(XFS_AGI_MAGIC)) + if (!xfs_verify_magic(bp, agi->agi_magicnum)) return __this_address; if (!XFS_AGI_GOOD_VERSION(be32_to_cpu(agi->agi_versionnum))) return __this_address; @@ -2582,6 +2582,7 @@ xfs_agi_write_verify( const struct xfs_buf_ops xfs_agi_buf_ops = { .name = "xfs_agi", + .magic = { cpu_to_be32(XFS_AGI_MAGIC), cpu_to_be32(XFS_AGI_MAGIC) }, .verify_read = xfs_agi_read_verify, .verify_write = xfs_agi_write_verify, .verify_struct = xfs_agi_verify, diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c @@ -124,7 +124,7 @@ xfs_finobt_alloc_block( union xfs_btree_ptr *new, int *stat) { - if (cur->bc_mp->m_inotbt_nores) + if (cur->bc_mp->m_finobt_nores) return xfs_inobt_alloc_block(cur, start, new, stat); return __xfs_inobt_alloc_block(cur, start, new, stat, XFS_AG_RESV_METADATA); @@ -154,7 +154,7 @@ xfs_finobt_free_block( struct xfs_btree_cur *cur, struct xfs_buf *bp) { - if (cur->bc_mp->m_inotbt_nores) + if (cur->bc_mp->m_finobt_nores) return xfs_inobt_free_block(cur, bp); return __xfs_inobt_free_block(cur, bp, XFS_AG_RESV_METADATA); } @@ -260,6 +260,9 @@ xfs_inobt_verify( xfs_failaddr_t fa; unsigned int level; + if (!xfs_verify_magic(bp, block->bb_magic)) + return __this_address; + /* * During growfs operations, we can't verify the exact owner as the * perag is not fully initialised and hence not attached to the buffer. @@ -270,18 +273,10 @@ xfs_inobt_verify( * but beware of the landmine (i.e. need to check pag->pagi_init) if we * ever do. */ - switch (block->bb_magic) { - case cpu_to_be32(XFS_IBT_CRC_MAGIC): - case cpu_to_be32(XFS_FIBT_CRC_MAGIC): + if (xfs_sb_version_hascrc(&mp->m_sb)) { fa = xfs_btree_sblock_v5hdr_verify(bp); if (fa) return fa; - /* fall through */ - case cpu_to_be32(XFS_IBT_MAGIC): - case cpu_to_be32(XFS_FIBT_MAGIC): - break; - default: - return __this_address; } /* level verification */ @@ -328,6 +323,16 @@ xfs_inobt_write_verify( const struct xfs_buf_ops xfs_inobt_buf_ops = { .name = "xfs_inobt", + .magic = { cpu_to_be32(XFS_IBT_MAGIC), cpu_to_be32(XFS_IBT_CRC_MAGIC) }, + .verify_read = xfs_inobt_read_verify, + .verify_write = xfs_inobt_write_verify, + .verify_struct = xfs_inobt_verify, +}; + +const struct xfs_buf_ops xfs_finobt_buf_ops = { + .name = "xfs_finobt", + .magic = { cpu_to_be32(XFS_FIBT_MAGIC), + cpu_to_be32(XFS_FIBT_CRC_MAGIC) }, .verify_read = xfs_inobt_read_verify, .verify_write = xfs_inobt_write_verify, .verify_struct = xfs_inobt_verify, @@ -389,7 +394,7 @@ static const struct xfs_btree_ops xfs_finobt_ops = { .init_rec_from_cur = xfs_inobt_init_rec_from_cur, .init_ptr_from_cur = xfs_finobt_init_ptr_from_cur, .key_diff = xfs_inobt_key_diff, - .buf_ops = &xfs_inobt_buf_ops, + .buf_ops = &xfs_finobt_buf_ops, .diff_two_keys = xfs_inobt_diff_two_keys, .keys_inorder = xfs_inobt_keys_inorder, .recs_inorder = xfs_inobt_recs_inorder, diff --git a/fs/xfs/libxfs/xfs_iext_tree.c b/fs/xfs/libxfs/xfs_iext_tree.c @@ -614,16 +614,15 @@ xfs_iext_realloc_root( } /* - * Increment the sequence counter if we are on a COW fork. This allows - * the writeback code to skip looking for a COW extent if the COW fork - * hasn't changed. We use WRITE_ONCE here to ensure the update to the - * sequence counter is seen before the modifications to the extent - * tree itself take effect. + * Increment the sequence counter on extent tree changes. If we are on a COW + * fork, this allows the writeback code to skip looking for a COW extent if the + * COW fork hasn't changed. We use WRITE_ONCE here to ensure the update to the + * sequence counter is seen before the modifications to the extent tree itself + * take effect. */ static inline void xfs_iext_inc_seq(struct xfs_ifork *ifp, int state) { - if (state & BMAP_COWFORK) - WRITE_ONCE(ifp->if_seq, READ_ONCE(ifp->if_seq) + 1); + WRITE_ONCE(ifp->if_seq, READ_ONCE(ifp->if_seq) + 1); } void diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c @@ -97,10 +97,9 @@ xfs_inode_buf_verify( dip = xfs_buf_offset(bp, (i << mp->m_sb.sb_inodelog)); unlinked_ino = be32_to_cpu(dip->di_next_unlinked); - di_ok = dip->di_magic == cpu_to_be16(XFS_DINODE_MAGIC) && + di_ok = xfs_verify_magic16(bp, dip->di_magic) && xfs_dinode_good_version(mp, dip->di_version) && - (unlinked_ino == NULLAGINO || - xfs_verify_agino(mp, agno, unlinked_ino)); + xfs_verify_agino_or_null(mp, agno, unlinked_ino); if (unlikely(XFS_TEST_ERROR(!di_ok, mp, XFS_ERRTAG_ITOBP_INOTOBP))) { if (readahead) { @@ -147,12 +146,16 @@ xfs_inode_buf_write_verify( const struct xfs_buf_ops xfs_inode_buf_ops = { .name = "xfs_inode", + .magic16 = { cpu_to_be16(XFS_DINODE_MAGIC), + cpu_to_be16(XFS_DINODE_MAGIC) }, .verify_read = xfs_inode_buf_read_verify, .verify_write = xfs_inode_buf_write_verify, }; const struct xfs_buf_ops xfs_inode_buf_ra_ops = { - .name = "xxfs_inode_ra", + .name = "xfs_inode_ra", + .magic16 = { cpu_to_be16(XFS_DINODE_MAGIC), + cpu_to_be16(XFS_DINODE_MAGIC) }, .verify_read = xfs_inode_buf_readahead_verify, .verify_write = xfs_inode_buf_write_verify, }; diff --git a/fs/xfs/libxfs/xfs_inode_fork.h b/fs/xfs/libxfs/xfs_inode_fork.h @@ -14,7 +14,7 @@ struct xfs_dinode; */ struct xfs_ifork { int if_bytes; /* bytes in if_u1 */ - unsigned int if_seq; /* cow fork mod counter */ + unsigned int if_seq; /* fork mod counter */ struct xfs_btree_block *if_broot; /* file's incore btree root */ short if_broot_bytes; /* bytes allocated for root */ unsigned char if_flags; /* per-fork flags */ diff --git a/fs/xfs/libxfs/xfs_refcount_btree.c b/fs/xfs/libxfs/xfs_refcount_btree.c @@ -209,7 +209,7 @@ xfs_refcountbt_verify( xfs_failaddr_t fa; unsigned int level; - if (block->bb_magic != cpu_to_be32(XFS_REFC_CRC_MAGIC)) + if (!xfs_verify_magic(bp, block->bb_magic)) return __this_address; if (!xfs_sb_version_hasreflink(&mp->m_sb)) @@ -264,6 +264,7 @@ xfs_refcountbt_write_verify( const struct xfs_buf_ops xfs_refcountbt_buf_ops = { .name = "xfs_refcountbt", + .magic = { 0, cpu_to_be32(XFS_REFC_CRC_MAGIC) }, .verify_read = xfs_refcountbt_read_verify, .verify_write = xfs_refcountbt_write_verify, .verify_struct = xfs_refcountbt_verify, diff --git a/fs/xfs/libxfs/xfs_rmap_btree.c b/fs/xfs/libxfs/xfs_rmap_btree.c @@ -310,7 +310,7 @@ xfs_rmapbt_verify( * from the on disk AGF. Again, we can only check against maximum limits * in this case. */ - if (block->bb_magic != cpu_to_be32(XFS_RMAP_CRC_MAGIC)) + if (!xfs_verify_magic(bp, block->bb_magic)) return __this_address; if (!xfs_sb_version_hasrmapbt(&mp->m_sb)) @@ -365,6 +365,7 @@ xfs_rmapbt_write_verify( const struct xfs_buf_ops xfs_rmapbt_buf_ops = { .name = "xfs_rmapbt", + .magic = { 0, cpu_to_be32(XFS_RMAP_CRC_MAGIC) }, .verify_read = xfs_rmapbt_read_verify, .verify_write = xfs_rmapbt_write_verify, .verify_struct = xfs_rmapbt_verify, diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c @@ -225,10 +225,11 @@ xfs_validate_sb_common( struct xfs_buf *bp, struct xfs_sb *sbp) { + struct xfs_dsb *dsb = XFS_BUF_TO_SBP(bp); uint32_t agcount = 0; uint32_t rem; - if (sbp->sb_magicnum != XFS_SB_MAGIC) { + if (!xfs_verify_magic(bp, dsb->sb_magicnum)) { xfs_warn(mp, "bad magic number"); return -EWRONGFS; } @@ -781,12 +782,14 @@ out_error: const struct xfs_buf_ops xfs_sb_buf_ops = { .name = "xfs_sb", + .magic = { cpu_to_be32(XFS_SB_MAGIC), cpu_to_be32(XFS_SB_MAGIC) }, .verify_read = xfs_sb_read_verify, .verify_write = xfs_sb_write_verify, }; const struct xfs_buf_ops xfs_sb_quiet_buf_ops = { .name = "xfs_sb_quiet", + .magic = { cpu_to_be32(XFS_SB_MAGIC), cpu_to_be32(XFS_SB_MAGIC) }, .verify_read = xfs_sb_quiet_read_verify, .verify_write = xfs_sb_write_verify, }; @@ -874,7 +877,7 @@ xfs_initialize_perag_data( uint64_t bfreelst = 0; uint64_t btree = 0; uint64_t fdblocks; - int error; + int error = 0; for (index = 0; index < agcount; index++) { /* diff --git a/fs/xfs/libxfs/xfs_shared.h b/fs/xfs/libxfs/xfs_shared.h @@ -25,7 +25,8 @@ extern const struct xfs_buf_ops xfs_agf_buf_ops; extern const struct xfs_buf_ops xfs_agi_buf_ops; extern const struct xfs_buf_ops xfs_agf_buf_ops; extern const struct xfs_buf_ops xfs_agfl_buf_ops; -extern const struct xfs_buf_ops xfs_allocbt_buf_ops; +extern const struct xfs_buf_ops xfs_bnobt_buf_ops; +extern const struct xfs_buf_ops xfs_cntbt_buf_ops; extern const struct xfs_buf_ops xfs_rmapbt_buf_ops; extern const struct xfs_buf_ops xfs_refcountbt_buf_ops; extern const struct xfs_buf_ops xfs_attr3_leaf_buf_ops; @@ -36,6 +37,7 @@ extern const struct xfs_buf_ops xfs_dquot_buf_ops; extern const struct xfs_buf_ops xfs_symlink_buf_ops; extern const struct xfs_buf_ops xfs_agi_buf_ops; extern const struct xfs_buf_ops xfs_inobt_buf_ops; +extern const struct xfs_buf_ops xfs_finobt_buf_ops; extern const struct xfs_buf_ops xfs_inode_buf_ops; extern const struct xfs_buf_ops xfs_inode_buf_ra_ops; extern const struct xfs_buf_ops xfs_dquot_buf_ops; diff --git a/fs/xfs/libxfs/xfs_symlink_remote.c b/fs/xfs/libxfs/xfs_symlink_remote.c @@ -95,7 +95,7 @@ xfs_symlink_verify( if (!xfs_sb_version_hascrc(&mp->m_sb)) return __this_address; - if (dsl->sl_magic != cpu_to_be32(XFS_SYMLINK_MAGIC)) + if (!xfs_verify_magic(bp, dsl->sl_magic)) return __this_address; if (!uuid_equal(&dsl->sl_uuid, &mp->m_sb.sb_meta_uuid)) return __this_address; @@ -159,6 +159,7 @@ xfs_symlink_write_verify( const struct xfs_buf_ops xfs_symlink_buf_ops = { .name = "xfs_symlink", + .magic = { 0, cpu_to_be32(XFS_SYMLINK_MAGIC) }, .verify_read = xfs_symlink_read_verify, .verify_write = xfs_symlink_write_verify, .verify_struct = xfs_symlink_verify, diff --git a/fs/xfs/libxfs/xfs_types.c b/fs/xfs/libxfs/xfs_types.c @@ -116,6 +116,19 @@ xfs_verify_agino( } /* + * Verify that an AG inode number pointer neither points outside the AG + * nor points at static metadata, or is NULLAGINO. + */ +bool +xfs_verify_agino_or_null( + struct xfs_mount *mp, + xfs_agnumber_t agno, + xfs_agino_t agino) +{ + return agino == NULLAGINO || xfs_verify_agino(mp, agno, agino); +} + +/* * Verify that an FS inode number pointer neither points outside the * filesystem nor points at static AG metadata. */ @@ -204,3 +217,14 @@ xfs_verify_icount( xfs_icount_range(mp, &min, &max); return icount >= min && icount <= max; } + +/* Sanity-checking of dir/attr block offsets. */ +bool +xfs_verify_dablk( + struct xfs_mount *mp, + xfs_fileoff_t dabno) +{ + xfs_dablk_t max_dablk = -1U; + + return dabno <= max_dablk; +} diff --git a/fs/xfs/libxfs/xfs_types.h b/fs/xfs/libxfs/xfs_types.h @@ -183,10 +183,13 @@ void xfs_agino_range(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agino_t *first, xfs_agino_t *last); bool xfs_verify_agino(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agino_t agino); +bool xfs_verify_agino_or_null(struct xfs_mount *mp, xfs_agnumber_t agno, + xfs_agino_t agino); bool xfs_verify_ino(struct xfs_mount *mp, xfs_ino_t ino); bool xfs_internal_inum(struct xfs_mount *mp, xfs_ino_t ino); bool xfs_verify_dir_ino(struct xfs_mount *mp, xfs_ino_t ino); bool xfs_verify_rtbno(struct xfs_mount *mp, xfs_rtblock_t rtbno); bool xfs_verify_icount(struct xfs_mount *mp, unsigned long long icount); +bool xfs_verify_dablk(struct xfs_mount *mp, xfs_fileoff_t off); #endif /* __XFS_TYPES_H__ */ diff --git a/fs/xfs/scrub/agheader.c b/fs/xfs/scrub/agheader.c @@ -399,7 +399,7 @@ xchk_agf_xref_cntbt( if (!xchk_should_check_xref(sc, &error, &sc->sa.cnt_cur)) return; if (!have) { - if (agf->agf_freeblks != be32_to_cpu(0)) + if (agf->agf_freeblks != cpu_to_be32(0)) xchk_block_xref_set_corrupt(sc, sc->sa.agf_bp); return; } @@ -864,19 +864,17 @@ xchk_agi( /* Check inode pointers */ agino = be32_to_cpu(agi->agi_newino); - if (agino != NULLAGINO && !xfs_verify_agino(mp, agno, agino)) + if (!xfs_verify_agino_or_null(mp, agno, agino)) xchk_block_set_corrupt(sc, sc->sa.agi_bp); agino = be32_to_cpu(agi->agi_dirino); - if (agino != NULLAGINO && !xfs_verify_agino(mp, agno, agino)) + if (!xfs_verify_agino_or_null(mp, agno, agino)) xchk_block_set_corrupt(sc, sc->sa.agi_bp); /* Check unlinked inode buckets */ for (i = 0; i < XFS_AGI_UNLINKED_BUCKETS; i++) { agino = be32_to_cpu(agi->agi_unlinked[i]); - if (agino == NULLAGINO) - continue; - if (!xfs_verify_agino(mp, agno, agino)) + if (!xfs_verify_agino_or_null(mp, agno, agino)) xchk_block_set_corrupt(sc, sc->sa.agi_bp); } diff --git a/fs/xfs/scrub/agheader_repair.c b/fs/xfs/scrub/agheader_repair.c @@ -341,23 +341,19 @@ xrep_agf( struct xrep_find_ag_btree fab[XREP_AGF_MAX] = { [XREP_AGF_BNOBT] = { .rmap_owner = XFS_RMAP_OWN_AG, - .buf_ops = &xfs_allocbt_buf_ops, - .magic = XFS_ABTB_CRC_MAGIC, + .buf_ops = &xfs_bnobt_buf_ops, }, [XREP_AGF_CNTBT] = { .rmap_owner = XFS_RMAP_OWN_AG, - .buf_ops = &xfs_allocbt_buf_ops, - .magic = XFS_ABTC_CRC_MAGIC, + .buf_ops = &xfs_cntbt_buf_ops, }, [XREP_AGF_RMAPBT] = { .rmap_owner = XFS_RMAP_OWN_AG, .buf_ops = &xfs_rmapbt_buf_ops, - .magic = XFS_RMAP_CRC_MAGIC, }, [XREP_AGF_REFCOUNTBT] = { .rmap_owner = XFS_RMAP_OWN_REFC, .buf_ops = &xfs_refcountbt_buf_ops, - .magic = XFS_REFC_CRC_MAGIC, }, [XREP_AGF_END] = { .buf_ops = NULL, @@ -875,12 +871,10 @@ xrep_agi( [XREP_AGI_INOBT] = { .rmap_owner = XFS_RMAP_OWN_INOBT, .buf_ops = &xfs_inobt_buf_ops, - .magic = XFS_IBT_CRC_MAGIC, }, [XREP_AGI_FINOBT] = { .rmap_owner = XFS_RMAP_OWN_INOBT, - .buf_ops = &xfs_inobt_buf_ops, - .magic = XFS_FIBT_CRC_MAGIC, + .buf_ops = &xfs_finobt_buf_ops, }, [XREP_AGI_END] = { .buf_ops = NULL diff --git a/fs/xfs/scrub/attr.c b/fs/xfs/scrub/attr.c @@ -82,12 +82,23 @@ xchk_xattr_listent( sx = container_of(context, struct xchk_xattr, context); + if (xchk_should_terminate(sx->sc, &error)) { + context->seen_enough = 1; + return; + } + if (flags & XFS_ATTR_INCOMPLETE) { /* Incomplete attr key, just mark the inode for preening. */ xchk_ino_set_preen(sx->sc, context->dp->i_ino); return; } + /* Does this name make sense? */ + if (!xfs_attr_namecheck(name, namelen)) { + xchk_fblock_set_corrupt(sx->sc, XFS_ATTR_FORK, args.blkno); + return; + } + args.flags = ATTR_KERNOTIME; if (flags & XFS_ATTR_ROOT) args.flags |= ATTR_ROOT; diff --git a/fs/xfs/scrub/bmap.c b/fs/xfs/scrub/bmap.c @@ -281,6 +281,31 @@ xchk_bmap_extent_xref( xchk_ag_free(info->sc, &info->sc->sa); } +/* + * Directories and attr forks should never have blocks that can't be addressed + * by a xfs_dablk_t. + */ +STATIC void +xchk_bmap_dirattr_extent( + struct xfs_inode *ip, + struct xchk_bmap_info *info, + struct xfs_bmbt_irec *irec) +{ + struct xfs_mount *mp = ip->i_mount; + xfs_fileoff_t off; + + if (!S_ISDIR(VFS_I(ip)->i_mode) && info->whichfork != XFS_ATTR_FORK) + return; + + if (!xfs_verify_dablk(mp, irec->br_startoff)) + xchk_fblock_set_corrupt(info->sc, info->whichfork, + irec->br_startoff); + + off = irec->br_startoff + irec->br_blockcount - 1; + if (!xfs_verify_dablk(mp, off)) + xchk_fblock_set_corrupt(info->sc, info->whichfork, off); +} + /* Scrub a single extent record. */ STATIC int xchk_bmap_extent( @@ -305,6 +330,8 @@ xchk_bmap_extent( xchk_fblock_set_corrupt(info->sc, info->whichfork, irec->br_startoff); + xchk_bmap_dirattr_extent(ip, info, irec); + /* There should never be a "hole" extent in either extent list. */ if (irec->br_startblock == HOLESTARTBLOCK) xchk_fblock_set_corrupt(info->sc, info->whichfork, diff --git a/fs/xfs/scrub/dir.c b/fs/xfs/scrub/dir.c @@ -129,6 +129,12 @@ xchk_dir_actor( goto out; } + /* Does this name make sense? */ + if (!xfs_dir2_namecheck(name, namelen)) { + xchk_fblock_set_corrupt(sdc->sc, XFS_DATA_FORK, offset); + goto out; + } + if (!strncmp(".", name, namelen)) { /* If this is "." then check that the inum matches the dir. */ if (xfs_sb_version_hasftype(&mp->m_sb) && type != DT_DIR) diff --git a/fs/xfs/scrub/ialloc.c b/fs/xfs/scrub/ialloc.c @@ -47,6 +47,12 @@ xchk_setup_ag_iallocbt( struct xchk_iallocbt { /* Number of inodes we see while scanning inobt. */ unsigned long long inodes; + + /* Expected next startino, for big block filesystems. */ + xfs_agino_t next_startino; + + /* Expected end of the current inode cluster. */ + xfs_agino_t next_cluster_ino; }; /* @@ -128,41 +134,57 @@ xchk_iallocbt_freecount( return hweight64(freemask); } -/* Check a particular inode with ir_free. */ +/* + * Check that an inode's allocation status matches ir_free in the inobt + * record. First we try querying the in-core inode state, and if the inode + * isn't loaded we examine the on-disk inode directly. + * + * Since there can be 1:M and M:1 mappings between inobt records and inode + * clusters, we pass in the inode location information as an inobt record; + * the index of an inode cluster within the inobt record (as well as the + * cluster buffer itself); and the index of the inode within the cluster. + * + * @irec is the inobt record. + * @irec_ino is the inode offset from the start of the record. + * @dip is the on-disk inode. + */ STATIC int -xchk_iallocbt_check_cluster_freemask( +xchk_iallocbt_check_cluster_ifree( struct xchk_btree *bs, - xfs_ino_t fsino, - xfs_agino_t chunkino, - xfs_agino_t clusterino, struct xfs_inobt_rec_incore *irec, - struct xfs_buf *bp) + unsigned int irec_ino, + struct xfs_dinode *dip) { - struct xfs_dinode *dip; struct xfs_mount *mp = bs->cur->bc_mp; - bool inode_is_free = false; + xfs_ino_t fsino; + xfs_agino_t agino; + bool irec_free; + bool ino_inuse; bool freemask_ok; - bool inuse; int error = 0; if (xchk_should_terminate(bs->sc, &error)) return error; - dip = xfs_buf_offset(bp, clusterino * mp->m_sb.sb_inodesize); + /* + * Given an inobt record and the offset of an inode from the start of + * the record, compute which fs inode we're talking about. + */ + agino = irec->ir_startino + irec_ino; + fsino = XFS_AGINO_TO_INO(mp, bs->cur->bc_private.a.agno, agino); + irec_free = (irec->ir_free & XFS_INOBT_MASK(irec_ino)); + if (be16_to_cpu(dip->di_magic) != XFS_DINODE_MAGIC || - (dip->di_version >= 3 && - be64_to_cpu(dip->di_ino) != fsino + clusterino)) { + (dip->di_version >= 3 && be64_to_cpu(dip->di_ino) != fsino)) { xchk_btree_set_corrupt(bs->sc, bs->cur, 0); goto out; } - if (irec->ir_free & XFS_INOBT_MASK(chunkino + clusterino)) - inode_is_free = true; - error = xfs_icache_inode_is_allocated(mp, bs->cur->bc_tp, - fsino + clusterino, &inuse); + error = xfs_icache_inode_is_allocated(mp, bs->cur->bc_tp, fsino, + &ino_inuse); if (error == -ENODATA) { /* Not cached, just read the disk buffer */ - freemask_ok = inode_is_free ^ !!(dip->di_mode); + freemask_ok = irec_free ^ !!(dip->di_mode); if (!bs->sc->try_harder && !freemask_ok) return -EDEADLOCK; } else if (error < 0) { @@ -174,7 +196,7 @@ xchk_iallocbt_check_cluster_freemask( goto out; } else { /* Inode is all there. */ - freemask_ok = inode_is_free ^ inuse; + freemask_ok = irec_free ^ ino_inuse; } if (!freemask_ok) xchk_btree_set_corrupt(bs->sc, bs->cur, 0); @@ -182,86 +204,221 @@ out: return 0; } -/* Make sure the free mask is consistent with what the inodes think. */ +/* + * Check that the holemask and freemask of a hypothetical inode cluster match + * what's actually on disk. If sparse inodes are enabled, the cluster does + * not actually have to map to inodes if the corresponding holemask bit is set. + * + * @cluster_base is the first inode in the cluster within the @irec. + */ STATIC int -xchk_iallocbt_check_freemask( +xchk_iallocbt_check_cluster( struct xchk_btree *bs, - struct xfs_inobt_rec_incore *irec) + struct xfs_inobt_rec_incore *irec, + unsigned int cluster_base) { struct xfs_imap imap; struct xfs_mount *mp = bs->cur->bc_mp; struct xfs_dinode *dip; - struct xfs_buf *bp; - xfs_ino_t fsino; - xfs_agino_t nr_inodes; - xfs_agino_t agino; - xfs_agino_t chunkino; - xfs_agino_t clusterino; + struct xfs_buf *cluster_bp; + unsigned int nr_inodes; + xfs_agnumber_t agno = bs->cur->bc_private.a.agno; xfs_agblock_t agbno; - uint16_t holemask; + unsigned int cluster_index; + uint16_t cluster_mask = 0; uint16_t ir_holemask; int error = 0; - /* Make sure the freemask matches the inode records. */ - nr_inodes = mp->m_inodes_per_cluster; - - for (agino = irec->ir_startino; - agino < irec->ir_startino + XFS_INODES_PER_CHUNK; - agino += mp->m_inodes_per_cluster) { - fsino = XFS_AGINO_TO_INO(mp, bs->cur->bc_private.a.agno, agino); - chunkino = agino - irec->ir_startino; - agbno = XFS_AGINO_TO_AGBNO(mp, agino); - - /* Compute the holemask mask for this cluster. */ - for (clusterino = 0, holemask = 0; clusterino < nr_inodes; - clusterino += XFS_INODES_PER_HOLEMASK_BIT) - holemask |= XFS_INOBT_MASK((chunkino + clusterino) / - XFS_INODES_PER_HOLEMASK_BIT); - - /* The whole cluster must be a hole or not a hole. */ - ir_holemask = (irec->ir_holemask & holemask); - if (ir_holemask != holemask && ir_holemask != 0) { + nr_inodes = min_t(unsigned int, XFS_INODES_PER_CHUNK, + mp->m_inodes_per_cluster); + + /* Map this inode cluster */ + agbno = XFS_AGINO_TO_AGBNO(mp, irec->ir_startino + cluster_base); + + /* Compute a bitmask for this cluster that can be used for holemask. */ + for (cluster_index = 0; + cluster_index < nr_inodes; + cluster_index += XFS_INODES_PER_HOLEMASK_BIT) + cluster_mask |= XFS_INOBT_MASK((cluster_base + cluster_index) / + XFS_INODES_PER_HOLEMASK_BIT); + + /* + * Map the first inode of this cluster to a buffer and offset. + * Be careful about inobt records that don't align with the start of + * the inode buffer when block sizes are large enough to hold multiple + * inode chunks. When this happens, cluster_base will be zero but + * ir_startino can be large enough to make im_boffset nonzero. + */ + ir_holemask = (irec->ir_holemask & cluster_mask); + imap.im_blkno = XFS_AGB_TO_DADDR(mp, agno, agbno); + imap.im_len = XFS_FSB_TO_BB(mp, mp->m_blocks_per_cluster); + imap.im_boffset = XFS_INO_TO_OFFSET(mp, irec->ir_startino); + + if (imap.im_boffset != 0 && cluster_base != 0) { + ASSERT(imap.im_boffset == 0 || cluster_base == 0); + xchk_btree_set_corrupt(bs->sc, bs->cur, 0); + return 0; + } + + trace_xchk_iallocbt_check_cluster(mp, agno, irec->ir_startino, + imap.im_blkno, imap.im_len, cluster_base, nr_inodes, + cluster_mask, ir_holemask, + XFS_INO_TO_OFFSET(mp, irec->ir_startino + + cluster_base)); + + /* The whole cluster must be a hole or not a hole. */ + if (ir_holemask != cluster_mask && ir_holemask != 0) { + xchk_btree_set_corrupt(bs->sc, bs->cur, 0); + return 0; + } + + /* If any part of this is a hole, skip it. */ + if (ir_holemask) { + xchk_xref_is_not_owned_by(bs->sc, agbno, + mp->m_blocks_per_cluster, + &XFS_RMAP_OINFO_INODES); + return 0; + } + + xchk_xref_is_owned_by(bs->sc, agbno, mp->m_blocks_per_cluster, + &XFS_RMAP_OINFO_INODES); + + /* Grab the inode cluster buffer. */ + error = xfs_imap_to_bp(mp, bs->cur->bc_tp, &imap, &dip, &cluster_bp, + 0, 0); + if (!xchk_btree_xref_process_error(bs->sc, bs->cur, 0, &error)) + return error; + + /* Check free status of each inode within this cluster. */ + for (cluster_index = 0; cluster_index < nr_inodes; cluster_index++) { + struct xfs_dinode *dip; + + if (imap.im_boffset >= BBTOB(cluster_bp->b_length)) { xchk_btree_set_corrupt(bs->sc, bs->cur, 0); - continue; + break; } - /* If any part of this is a hole, skip it. */ - if (ir_holemask) { - xchk_xref_is_not_owned_by(bs->sc, agbno, - mp->m_blocks_per_cluster, - &XFS_RMAP_OINFO_INODES); - continue; + dip = xfs_buf_offset(cluster_bp, imap.im_boffset); + error = xchk_iallocbt_check_cluster_ifree(bs, irec, + cluster_base + cluster_index, dip); + if (error) + break; + imap.im_boffset += mp->m_sb.sb_inodesize; + } + + xfs_trans_brelse(bs->cur->bc_tp, cluster_bp); + return error; +} + +/* + * For all the inode clusters that could map to this inobt record, make sure + * that the holemask makes sense and that the allocation status of each inode + * matches the freemask. + */ +STATIC int +xchk_iallocbt_check_clusters( + struct xchk_btree *bs, + struct xfs_inobt_rec_incore *irec) +{ + unsigned int cluster_base; + int error = 0; + + /* + * For the common case where this inobt record maps to multiple inode + * clusters this will call _check_cluster for each cluster. + * + * For the case that multiple inobt records map to a single cluster, + * this will call _check_cluster once. + */ + for (cluster_base = 0; + cluster_base < XFS_INODES_PER_CHUNK; + cluster_base += bs->sc->mp->m_inodes_per_cluster) { + error = xchk_iallocbt_check_cluster(bs, irec, cluster_base); + if (error) + break; + } + + return error; +} + +/* + * Make sure this inode btree record is aligned properly. Because a fs block + * contains multiple inodes, we check that the inobt record is aligned to the + * correct inode, not just the correct block on disk. This results in a finer + * grained corruption check. + */ +STATIC void +xchk_iallocbt_rec_alignment( + struct xchk_btree *bs, + struct xfs_inobt_rec_incore *irec) +{ + struct xfs_mount *mp = bs->sc->mp; + struct xchk_iallocbt *iabt = bs->private; + + /* + * finobt records have different positioning requirements than inobt + * records: each finobt record must have a corresponding inobt record. + * That is checked in the xref function, so for now we only catch the + * obvious case where the record isn't at all aligned properly. + * + * Note that if a fs block contains more than a single chunk of inodes, + * we will have finobt records only for those chunks containing free + * inodes, and therefore expect chunk alignment of finobt records. + * Otherwise, we expect that the finobt record is aligned to the + * cluster alignment as told by the superblock. + */ + if (bs->cur->bc_btnum == XFS_BTNUM_FINO) { + unsigned int imask; + + imask = min_t(unsigned int, XFS_INODES_PER_CHUNK, + mp->m_cluster_align_inodes) - 1; + if (irec->ir_startino & imask) + xchk_btree_set_corrupt(bs->sc, bs->cur, 0); + return; + } + + if (iabt->next_startino != NULLAGINO) { + /* + * We're midway through a cluster of inodes that is mapped by + * multiple inobt records. Did we get the record for the next + * irec in the sequence? + */ + if (irec->ir_startino != iabt->next_startino) { + xchk_btree_set_corrupt(bs->sc, bs->cur, 0); + return; } - xchk_xref_is_owned_by(bs->sc, agbno, mp->m_blocks_per_cluster, - &XFS_RMAP_OINFO_INODES); + iabt->next_startino += XFS_INODES_PER_CHUNK; - /* Grab the inode cluster buffer. */ - imap.im_blkno = XFS_AGB_TO_DADDR(mp, bs->cur->bc_private.a.agno, - agbno); - imap.im_len = XFS_FSB_TO_BB(mp, mp->m_blocks_per_cluster); - imap.im_boffset = 0; - - error = xfs_imap_to_bp(mp, bs->cur->bc_tp, &imap, - &dip, &bp, 0, 0); - if (!xchk_btree_xref_process_error(bs->sc, bs->cur, 0, - &error)) - continue; - - /* Which inodes are free? */ - for (clusterino = 0; clusterino < nr_inodes; clusterino++) { - error = xchk_iallocbt_check_cluster_freemask(bs, - fsino, chunkino, clusterino, irec, bp); - if (error) { - xfs_trans_brelse(bs->cur->bc_tp, bp); - return error; - } + /* Are we done with the cluster? */ + if (iabt->next_startino >= iabt->next_cluster_ino) { + iabt->next_startino = NULLAGINO; + iabt->next_cluster_ino = NULLAGINO; } + return; + } + + /* inobt records must be aligned to cluster and inoalignmnt size. */ + if (irec->ir_startino & (mp->m_cluster_align_inodes - 1)) { + xchk_btree_set_corrupt(bs->sc, bs->cur, 0); + return; + } - xfs_trans_brelse(bs->cur->bc_tp, bp); + if (irec->ir_startino & (mp->m_inodes_per_cluster - 1)) { + xchk_btree_set_corrupt(bs->sc, bs->cur, 0); + return; } - return error; + if (mp->m_inodes_per_cluster <= XFS_INODES_PER_CHUNK) + return; + + /* + * If this is the start of an inode cluster that can be mapped by + * multiple inobt records, the next inobt record must follow exactly + * after this one. + */ + iabt->next_startino = irec->ir_startino + XFS_INODES_PER_CHUNK; + iabt->next_cluster_ino = irec->ir_startino + mp->m_inodes_per_cluster; } /* Scrub an inobt/finobt record. */ @@ -276,7 +433,6 @@ xchk_iallocbt_rec( uint64_t holes; xfs_agnumber_t agno = bs->cur->bc_private.a.agno; xfs_agino_t agino; - xfs_agblock_t agbno; xfs_extlen_t len; int holecount; int i; @@ -303,11 +459,9 @@ xchk_iallocbt_rec( goto out; } - /* Make sure this record is aligned to cluster and inoalignmnt size. */ - agbno = XFS_AGINO_TO_AGBNO(mp, irec.ir_startino); - if ((agbno & (mp->m_cluster_align - 1)) || - (agbno & (mp->m_blocks_per_cluster - 1))) - xchk_btree_set_corrupt(bs->sc, bs->cur, 0); + xchk_iallocbt_rec_alignment(bs, &irec); + if (bs->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + goto out; iabt->inodes += irec.ir_count; @@ -320,7 +474,7 @@ xchk_iallocbt_rec( if (!xchk_iallocbt_chunk(bs, &irec, agino, len)) goto out; - goto check_freemask; + goto check_clusters; } /* Check each chunk of a sparse inode cluster. */ @@ -346,8 +500,8 @@ xchk_iallocbt_rec( holecount + irec.ir_count != XFS_INODES_PER_CHUNK) xchk_btree_set_corrupt(bs->sc, bs->cur, 0); -check_freemask: - error = xchk_iallocbt_check_freemask(bs, &irec); +check_clusters: + error = xchk_iallocbt_check_clusters(bs, &irec); if (error) goto out; @@ -429,6 +583,8 @@ xchk_iallocbt( struct xfs_btree_cur *cur; struct xchk_iallocbt iabt = { .inodes = 0, + .next_startino = NULLAGINO, + .next_cluster_ino = NULLAGINO, }; int error; diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c @@ -743,7 +743,8 @@ xrep_findroot_block( /* Ensure the block magic matches the btree type we're looking for. */ btblock = XFS_BUF_TO_BLOCK(bp); - if (be32_to_cpu(btblock->bb_magic) != fab->magic) + ASSERT(fab->buf_ops->magic[1] != 0); + if (btblock->bb_magic != fab->buf_ops->magic[1]) goto out; /* diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h @@ -42,9 +42,6 @@ struct xrep_find_ag_btree { /* in: buffer ops */ const struct xfs_buf_ops *buf_ops; - /* in: magic number of the btree */ - uint32_t magic; - /* out: the highest btree block found and the tree height */ xfs_agblock_t root; unsigned int height; diff --git a/fs/xfs/scrub/rtbitmap.c b/fs/xfs/scrub/rtbitmap.c @@ -141,9 +141,8 @@ xchk_xref_is_used_rt_space( startext = fsbno; endext = fsbno + len - 1; do_div(startext, sc->mp->m_sb.sb_rextsize); - if (do_div(endext, sc->mp->m_sb.sb_rextsize)) - endext++; - extcount = endext - startext; + do_div(endext, sc->mp->m_sb.sb_rextsize); + extcount = endext - startext + 1; xfs_ilock(sc->mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP); error = xfs_rtalloc_extent_is_free(sc->mp, sc->tp, startext, extcount, &is_free); diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h @@ -545,6 +545,51 @@ TRACE_EVENT(xchk_xref_error, __entry->ret_ip) ); +TRACE_EVENT(xchk_iallocbt_check_cluster, + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, + xfs_agino_t startino, xfs_daddr_t map_daddr, + unsigned short map_len, unsigned int chunk_ino, + unsigned int nr_inodes, uint16_t cluster_mask, + uint16_t holemask, unsigned int cluster_ino), + TP_ARGS(mp, agno, startino, map_daddr, map_len, chunk_ino, nr_inodes, + cluster_mask, holemask, cluster_ino), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(xfs_agino_t, startino) + __field(xfs_daddr_t, map_daddr) + __field(unsigned short, map_len) + __field(unsigned int, chunk_ino) + __field(unsigned int, nr_inodes) + __field(unsigned int, cluster_ino) + __field(uint16_t, cluster_mask) + __field(uint16_t, holemask) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->agno = agno; + __entry->startino = startino; + __entry->map_daddr = map_daddr; + __entry->map_len = map_len; + __entry->chunk_ino = chunk_ino; + __entry->nr_inodes = nr_inodes; + __entry->cluster_mask = cluster_mask; + __entry->holemask = holemask; + __entry->cluster_ino = cluster_ino; + ), + TP_printk("dev %d:%d agno %d startino %u daddr 0x%llx len %d chunkino %u nr_inodes %u cluster_mask 0x%x holemask 0x%x cluster_ino %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->startino, + __entry->map_daddr, + __entry->map_len, + __entry->chunk_ino, + __entry->nr_inodes, + __entry->cluster_mask, + __entry->holemask, + __entry->cluster_ino) +) + /* repair tracepoints */ #if IS_ENABLED(CONFIG_XFS_ONLINE_REPAIR) diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c @@ -28,7 +28,8 @@ */ struct xfs_writepage_ctx { struct xfs_bmbt_irec imap; - unsigned int io_type; + int fork; + unsigned int data_seq; unsigned int cow_seq; struct xfs_ioend *ioend; }; @@ -255,30 +256,20 @@ xfs_end_io( */ error = blk_status_to_errno(ioend->io_bio->bi_status); if (unlikely(error)) { - switch (ioend->io_type) { - case XFS_IO_COW: + if (ioend->io_fork == XFS_COW_FORK) xfs_reflink_cancel_cow_range(ip, offset, size, true); - break; - } - goto done; } /* - * Success: commit the COW or unwritten blocks if needed. + * Success: commit the COW or unwritten blocks if needed. */ - switch (ioend->io_type) { - case XFS_IO_COW: + if (ioend->io_fork == XFS_COW_FORK) error = xfs_reflink_end_cow(ip, offset, size); - break; - case XFS_IO_UNWRITTEN: - /* writeback should never update isize */ + else if (ioend->io_state == XFS_EXT_UNWRITTEN) error = xfs_iomap_write_unwritten(ip, offset, size, false); - break; - default: + else ASSERT(!xfs_ioend_is_append(ioend) || ioend->io_append_trans); - break; - } done: if (ioend->io_append_trans) @@ -293,7 +284,8 @@ xfs_end_bio( struct xfs_ioend *ioend = bio->bi_private; struct xfs_mount *mp = XFS_I(ioend->io_inode)->i_mount; - if (ioend->io_type == XFS_IO_UNWRITTEN || ioend->io_type == XFS_IO_COW) + if (ioend->io_fork == XFS_COW_FORK || + ioend->io_state == XFS_EXT_UNWRITTEN) queue_work(mp->m_unwritten_workqueue, &ioend->io_work); else if (ioend->io_append_trans) queue_work(mp->m_data_workqueue, &ioend->io_work); @@ -301,6 +293,75 @@ xfs_end_bio( xfs_destroy_ioend(ioend, blk_status_to_errno(bio->bi_status)); } +/* + * Fast revalidation of the cached writeback mapping. Return true if the current + * mapping is valid, false otherwise. + */ +static bool +xfs_imap_valid( + struct xfs_writepage_ctx *wpc, + struct xfs_inode *ip, + xfs_fileoff_t offset_fsb) +{ + if (offset_fsb < wpc->imap.br_startoff || + offset_fsb >= wpc->imap.br_startoff + wpc->imap.br_blockcount) + return false; + /* + * If this is a COW mapping, it is sufficient to check that the mapping + * covers the offset. Be careful to check this first because the caller + * can revalidate a COW mapping without updating the data seqno. + */ + if (wpc->fork == XFS_COW_FORK) + return true; + + /* + * This is not a COW mapping. Check the sequence number of the data fork + * because concurrent changes could have invalidated the extent. Check + * the COW fork because concurrent changes since the last time we + * checked (and found nothing at this offset) could have added + * overlapping blocks. + */ + if (wpc->data_seq != READ_ONCE(ip->i_df.if_seq)) + return false; + if (xfs_inode_has_cow_data(ip) && + wpc->cow_seq != READ_ONCE(ip->i_cowfp->if_seq)) + return false; + return true; +} + +/* + * Pass in a dellalloc extent and convert it to real extents, return the real + * extent that maps offset_fsb in wpc->imap. + * + * The current page is held locked so nothing could have removed the block + * backing offset_fsb, although it could have moved from the COW to the data + * fork by another thread. + */ +static int +xfs_convert_blocks( + struct xfs_writepage_ctx *wpc, + struct xfs_inode *ip, + xfs_fileoff_t offset_fsb) +{ + int error; + + /* + * Attempt to allocate whatever delalloc extent currently backs + * offset_fsb and put the result into wpc->imap. Allocate in a loop + * because it may take several attempts to allocate real blocks for a + * contiguous delalloc extent if free space is sufficiently fragmented. + */ + do { + error = xfs_bmapi_convert_delalloc(ip, wpc->fork, offset_fsb, + &wpc->imap, wpc->fork == XFS_COW_FORK ? + &wpc->cow_seq : &wpc->data_seq); + if (error) + return error; + } while (wpc->imap.br_startoff + wpc->imap.br_blockcount <= offset_fsb); + + return 0; +} + STATIC int xfs_map_blocks( struct xfs_writepage_ctx *wpc, @@ -310,26 +371,16 @@ xfs_map_blocks( struct xfs_inode *ip = XFS_I(inode); struct xfs_mount *mp = ip->i_mount; ssize_t count = i_blocksize(inode); - xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset), end_fsb; + xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset); + xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, offset + count); xfs_fileoff_t cow_fsb = NULLFILEOFF; struct xfs_bmbt_irec imap; - int whichfork = XFS_DATA_FORK; struct xfs_iext_cursor icur; - bool imap_valid; + int retries = 0; int error = 0; - /* - * We have to make sure the cached mapping is within EOF to protect - * against eofblocks trimming on file release leaving us with a stale - * mapping. Otherwise, a page for a subsequent file extending buffered - * write could get picked up by this writeback cycle and written to the - * wrong blocks. - * - * Note that what we really want here is a generic mapping invalidation - * mechanism to protect us from arbitrary extent modifying contexts, not - * just eofblocks. - */ - xfs_trim_extent_eof(&wpc->imap, ip); + if (XFS_FORCED_SHUTDOWN(mp)) + return -EIO; /* * COW fork blocks can overlap data fork blocks even if the blocks @@ -346,31 +397,19 @@ xfs_map_blocks( * against concurrent updates and provides a memory barrier on the way * out that ensures that we always see the current value. */ - imap_valid = offset_fsb >= wpc->imap.br_startoff && - offset_fsb < wpc->imap.br_startoff + wpc->imap.br_blockcount; - if (imap_valid && - (!xfs_inode_has_cow_data(ip) || - wpc->io_type == XFS_IO_COW || - wpc->cow_seq == READ_ONCE(ip->i_cowfp->if_seq))) + if (xfs_imap_valid(wpc, ip, offset_fsb)) return 0; - if (XFS_FORCED_SHUTDOWN(mp)) - return -EIO; - /* * If we don't have a valid map, now it's time to get a new one for this * offset. This will convert delayed allocations (including COW ones) * into real extents. If we return without a valid map, it means we * landed in a hole and we skip the block. */ +retry: xfs_ilock(ip, XFS_ILOCK_SHARED); ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE || (ip->i_df.if_flags & XFS_IFEXTENTS)); - ASSERT(offset <= mp->m_super->s_maxbytes); - - if (offset > mp->m_super->s_maxbytes - count) - count = mp->m_super->s_maxbytes - offset; - end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count); /* * Check if this is offset is covered by a COW extents, and if yes use @@ -382,30 +421,16 @@ xfs_map_blocks( if (cow_fsb != NULLFILEOFF && cow_fsb <= offset_fsb) { wpc->cow_seq = READ_ONCE(ip->i_cowfp->if_seq); xfs_iunlock(ip, XFS_ILOCK_SHARED); - /* - * Truncate can race with writeback since writeback doesn't - * take the iolock and truncate decreases the file size before - * it starts truncating the pages between new_size and old_size. - * Therefore, we can end up in the situation where writeback - * gets a CoW fork mapping but the truncate makes the mapping - * invalid and we end up in here trying to get a new mapping. - * bail out here so that we simply never get a valid mapping - * and so we drop the write altogether. The page truncation - * will kill the contents anyway. - */ - if (offset > i_size_read(inode)) { - wpc->io_type = XFS_IO_HOLE; - return 0; - } - whichfork = XFS_COW_FORK; - wpc->io_type = XFS_IO_COW; + + wpc->fork = XFS_COW_FORK; goto allocate_blocks; } /* - * Map valid and no COW extent in the way? We're done. + * No COW extent overlap. Revalidate now that we may have updated + * ->cow_seq. If the data mapping is still valid, we're done. */ - if (imap_valid) { + if (xfs_imap_valid(wpc, ip, offset_fsb)) { xfs_iunlock(ip, XFS_ILOCK_SHARED); return 0; } @@ -417,51 +442,65 @@ xfs_map_blocks( */ if (!xfs_iext_lookup_extent(ip, &ip->i_df, offset_fsb, &icur, &imap)) imap.br_startoff = end_fsb; /* fake a hole past EOF */ + wpc->data_seq = READ_ONCE(ip->i_df.if_seq); xfs_iunlock(ip, XFS_ILOCK_SHARED); + wpc->fork = XFS_DATA_FORK; + + /* landed in a hole or beyond EOF? */ if (imap.br_startoff > offset_fsb) { - /* landed in a hole or beyond EOF */ imap.br_blockcount = imap.br_startoff - offset_fsb; imap.br_startoff = offset_fsb; imap.br_startblock = HOLESTARTBLOCK; - wpc->io_type = XFS_IO_HOLE; - } else { - /* - * Truncate to the next COW extent if there is one. This is the - * only opportunity to do this because we can skip COW fork - * lookups for the subsequent blocks in the mapping; however, - * the requirement to treat the COW range separately remains. - */ - if (cow_fsb != NULLFILEOFF && - cow_fsb < imap.br_startoff + imap.br_blockcount) - imap.br_blockcount = cow_fsb - imap.br_startoff; - - if (isnullstartblock(imap.br_startblock)) { - /* got a delalloc extent */ - wpc->io_type = XFS_IO_DELALLOC; - goto allocate_blocks; - } - - if (imap.br_state == XFS_EXT_UNWRITTEN) - wpc->io_type = XFS_IO_UNWRITTEN; - else - wpc->io_type = XFS_IO_OVERWRITE; + imap.br_state = XFS_EXT_NORM; } + /* + * Truncate to the next COW extent if there is one. This is the only + * opportunity to do this because we can skip COW fork lookups for the + * subsequent blocks in the mapping; however, the requirement to treat + * the COW range separately remains. + */ + if (cow_fsb != NULLFILEOFF && + cow_fsb < imap.br_startoff + imap.br_blockcount) + imap.br_blockcount = cow_fsb - imap.br_startoff; + + /* got a delalloc extent? */ + if (imap.br_startblock != HOLESTARTBLOCK && + isnullstartblock(imap.br_startblock)) + goto allocate_blocks; + wpc->imap = imap; - xfs_trim_extent_eof(&wpc->imap, ip); - trace_xfs_map_blocks_found(ip, offset, count, wpc->io_type, &imap); + trace_xfs_map_blocks_found(ip, offset, count, wpc->fork, &imap); return 0; allocate_blocks: - error = xfs_iomap_write_allocate(ip, whichfork, offset, &imap, - &wpc->cow_seq); - if (error) + error = xfs_convert_blocks(wpc, ip, offset_fsb); + if (error) { + /* + * If we failed to find the extent in the COW fork we might have + * raced with a COW to data fork conversion or truncate. + * Restart the lookup to catch the extent in the data fork for + * the former case, but prevent additional retries to avoid + * looping forever for the latter case. + */ + if (error == -EAGAIN && wpc->fork == XFS_COW_FORK && !retries++) + goto retry; + ASSERT(error != -EAGAIN); return error; - ASSERT(whichfork == XFS_COW_FORK || cow_fsb == NULLFILEOFF || - imap.br_startoff + imap.br_blockcount <= cow_fsb); - wpc->imap = imap; - xfs_trim_extent_eof(&wpc->imap, ip); - trace_xfs_map_blocks_alloc(ip, offset, count, wpc->io_type, &imap); + } + + /* + * Due to merging the return real extent might be larger than the + * original delalloc one. Trim the return extent to the next COW + * boundary again to force a re-lookup. + */ + if (wpc->fork != XFS_COW_FORK && cow_fsb != NULLFILEOFF && + cow_fsb < wpc->imap.br_startoff + wpc->imap.br_blockcount) + wpc->imap.br_blockcount = cow_fsb - wpc->imap.br_startoff; + + ASSERT(wpc->imap.br_startoff <= offset_fsb); + ASSERT(wpc->imap.br_startoff + wpc->imap.br_blockcount > offset_fsb); + trace_xfs_map_blocks_alloc(ip, offset, count, wpc->fork, &imap); return 0; } @@ -486,7 +525,7 @@ xfs_submit_ioend( int status) { /* Convert CoW extents to regular */ - if (!status && ioend->io_type == XFS_IO_COW) { + if (!status && ioend->io_fork == XFS_COW_FORK) { /* * Yuk. This can do memory allocation, but is not a * transactional operation so everything is done in GFP_KERNEL @@ -504,7 +543,8 @@ xfs_submit_ioend( /* Reserve log space if we might write beyond the on-disk inode size. */ if (!status && - ioend->io_type != XFS_IO_UNWRITTEN && + (ioend->io_fork == XFS_COW_FORK || + ioend->io_state != XFS_EXT_UNWRITTEN) && xfs_ioend_is_append(ioend) && !ioend->io_append_trans) status = xfs_setfilesize_trans_alloc(ioend); @@ -533,7 +573,8 @@ xfs_submit_ioend( static struct xfs_ioend * xfs_alloc_ioend( struct inode *inode, - unsigned int type, + int fork, + xfs_exntst_t state, xfs_off_t offset, struct block_device *bdev, sector_t sector) @@ -547,7 +588,8 @@ xfs_alloc_ioend( ioend = container_of(bio, struct xfs_ioend, io_inline_bio); INIT_LIST_HEAD(&ioend->io_list); - ioend->io_type = type; + ioend->io_fork = fork; + ioend->io_state = state; ioend->io_inode = inode; ioend->io_size = 0; ioend->io_offset = offset; @@ -608,13 +650,15 @@ xfs_add_to_ioend( sector = xfs_fsb_to_db(ip, wpc->imap.br_startblock) + ((offset - XFS_FSB_TO_B(mp, wpc->imap.br_startoff)) >> 9); - if (!wpc->ioend || wpc->io_type != wpc->ioend->io_type || + if (!wpc->ioend || + wpc->fork != wpc->ioend->io_fork || + wpc->imap.br_state != wpc->ioend->io_state || sector != bio_end_sector(wpc->ioend->io_bio) || offset != wpc->ioend->io_offset + wpc->ioend->io_size) { if (wpc->ioend) list_add(&wpc->ioend->io_list, iolist); - wpc->ioend = xfs_alloc_ioend(inode, wpc->io_type, offset, - bdev, sector); + wpc->ioend = xfs_alloc_ioend(inode, wpc->fork, + wpc->imap.br_state, offset, bdev, sector); } if (!__bio_try_merge_page(wpc->ioend->io_bio, page, len, poff)) { @@ -723,7 +767,7 @@ xfs_writepage_map( error = xfs_map_blocks(wpc, inode, file_offset); if (error) break; - if (wpc->io_type == XFS_IO_HOLE) + if (wpc->imap.br_startblock == HOLESTARTBLOCK) continue; xfs_add_to_ioend(inode, file_offset, page, iop, wpc, wbc, &submit_list); @@ -918,9 +962,7 @@ xfs_vm_writepage( struct page *page, struct writeback_control *wbc) { - struct xfs_writepage_ctx wpc = { - .io_type = XFS_IO_HOLE, - }; + struct xfs_writepage_ctx wpc = { }; int ret; ret = xfs_do_writepage(page, wbc, &wpc); @@ -934,9 +976,7 @@ xfs_vm_writepages( struct address_space *mapping, struct writeback_control *wbc) { - struct xfs_writepage_ctx wpc = { - .io_type = XFS_IO_HOLE, - }; + struct xfs_writepage_ctx wpc = { }; int ret; xfs_iflags_clear(XFS_I(mapping->host), XFS_ITRUNCATED); @@ -983,7 +1023,7 @@ xfs_vm_bmap( * Since we don't pass back blockdev info, we can't return bmap * information for rt files either. */ - if (xfs_is_reflink_inode(ip) || XFS_IS_REALTIME_INODE(ip)) + if (xfs_is_cow_inode(ip) || XFS_IS_REALTIME_INODE(ip)) return 0; return iomap_bmap(mapping, block, &xfs_iomap_ops); } diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h @@ -9,32 +9,12 @@ extern struct bio_set xfs_ioend_bioset; /* - * Types of I/O for bmap clustering and I/O completion tracking. - * - * This enum is used in string mapping in xfs_trace.h; please keep the - * TRACE_DEFINE_ENUMs for it up to date. - */ -enum { - XFS_IO_HOLE, /* covers region without any block allocation */ - XFS_IO_DELALLOC, /* covers delalloc region */ - XFS_IO_UNWRITTEN, /* covers allocated but uninitialized data */ - XFS_IO_OVERWRITE, /* covers already allocated extent */ - XFS_IO_COW, /* covers copy-on-write extent */ -}; - -#define XFS_IO_TYPES \ - { XFS_IO_HOLE, "hole" }, \ - { XFS_IO_DELALLOC, "delalloc" }, \ - { XFS_IO_UNWRITTEN, "unwritten" }, \ - { XFS_IO_OVERWRITE, "overwrite" }, \ - { XFS_IO_COW, "CoW" } - -/* * Structure for buffered I/O completions. */ struct xfs_ioend { struct list_head io_list; /* next ioend in chain */ - unsigned int io_type; /* delalloc / unwritten */ + int io_fork; /* inode fork written back */ + xfs_exntst_t io_state; /* extent state */ struct inode *io_inode; /* file being written to */ size_t io_size; /* size of the extent */ xfs_off_t io_offset; /* offset in the file */ diff --git a/fs/xfs/xfs_attr_list.c b/fs/xfs/xfs_attr_list.c @@ -555,6 +555,7 @@ xfs_attr_put_listent( attrlist_ent_t *aep; int arraytop; + ASSERT(!context->seen_enough); ASSERT(!(context->flags & ATTR_KERNOVAL)); ASSERT(context->count >= 0); ASSERT(context->count < (ATTR_MAX_VALUELEN/8)); diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c @@ -1162,16 +1162,13 @@ xfs_zero_file_space( * by virtue of the hole punch. */ error = xfs_free_file_space(ip, offset, len); - if (error) - goto out; + if (error || xfs_is_always_cow_inode(ip)) + return error; - error = xfs_alloc_file_space(ip, round_down(offset, blksize), + return xfs_alloc_file_space(ip, round_down(offset, blksize), round_up(offset + len, blksize) - round_down(offset, blksize), XFS_BMAPI_PREALLOC); -out: - return error; - } static int diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c @@ -776,29 +776,24 @@ _xfs_buf_read( } /* - * Set buffer ops on an unchecked buffer and validate it, if possible. + * Reverify a buffer found in cache without an attached ->b_ops. * - * If the caller passed in an ops structure and the buffer doesn't have ops - * assigned, set the ops and use them to verify the contents. If the contents - * cannot be verified, we'll clear XBF_DONE. We assume the buffer has no - * recorded errors and is already in XBF_DONE state. + * If the caller passed an ops structure and the buffer doesn't have ops + * assigned, set the ops and use it to verify the contents. If verification + * fails, clear XBF_DONE. We assume the buffer has no recorded errors and is + * already in XBF_DONE state on entry. * - * Under normal operations, every in-core buffer must have buffer ops assigned - * to them when the buffer is read in from disk so that we can validate the - * metadata. - * - * However, there are two scenarios where one can encounter in-core buffers - * that don't have buffer ops. The first is during log recovery of buffers on - * a V4 filesystem, though these buffers are purged at the end of recovery. - * - * The other is online repair, which tries to match arbitrary metadata blocks - * with btree types in order to find the root. If online repair doesn't match - * the buffer with /any/ btree type, the buffer remains in memory in DONE state - * with no ops, and a subsequent read_buf call from elsewhere will not set the - * ops. This function helps us fix this situation. + * Under normal operations, every in-core buffer is verified on read I/O + * completion. There are two scenarios that can lead to in-core buffers without + * an assigned ->b_ops. The first is during log recovery of buffers on a V4 + * filesystem, though these buffers are purged at the end of recovery. The + * other is online repair, which intentionally reads with a NULL buffer ops to + * run several verifiers across an in-core buffer in order to establish buffer + * type. If repair can't establish that, the buffer will be left in memory + * with NULL buffer ops. */ int -xfs_buf_ensure_ops( +xfs_buf_reverify( struct xfs_buf *bp, const struct xfs_buf_ops *ops) { @@ -840,7 +835,7 @@ xfs_buf_read_map( return bp; } - xfs_buf_ensure_ops(bp, ops); + xfs_buf_reverify(bp, ops); if (flags & XBF_ASYNC) { /* @@ -2209,3 +2204,40 @@ void xfs_buf_set_ref(struct xfs_buf *bp, int lru_ref) atomic_set(&bp->b_lru_ref, lru_ref); } + +/* + * Verify an on-disk magic value against the magic value specified in the + * verifier structure. The verifier magic is in disk byte order so the caller is + * expected to pass the value directly from disk. + */ +bool +xfs_verify_magic( + struct xfs_buf *bp, + __be32 dmagic) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + int idx; + + idx = xfs_sb_version_hascrc(&mp->m_sb); + if (unlikely(WARN_ON(!bp->b_ops || !bp->b_ops->magic[idx]))) + return false; + return dmagic == bp->b_ops->magic[idx]; +} +/* + * Verify an on-disk magic value against the magic value specified in the + * verifier structure. The verifier magic is in disk byte order so the caller is + * expected to pass the value directly from disk. + */ +bool +xfs_verify_magic16( + struct xfs_buf *bp, + __be16 dmagic) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + int idx; + + idx = xfs_sb_version_hascrc(&mp->m_sb); + if (unlikely(WARN_ON(!bp->b_ops || !bp->b_ops->magic16[idx]))) + return false; + return dmagic == bp->b_ops->magic16[idx]; +} diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h @@ -125,6 +125,10 @@ struct xfs_buf_map { struct xfs_buf_ops { char *name; + union { + __be32 magic[2]; /* v4 and v5 on disk magic values */ + __be16 magic16[2]; /* v4 and v5 on disk magic values */ + }; void (*verify_read)(struct xfs_buf *); void (*verify_write)(struct xfs_buf *); xfs_failaddr_t (*verify_struct)(struct xfs_buf *bp); @@ -385,6 +389,8 @@ extern int xfs_setsize_buftarg(xfs_buftarg_t *, unsigned int); #define xfs_getsize_buftarg(buftarg) block_size((buftarg)->bt_bdev) #define xfs_readonly_buftarg(buftarg) bdev_read_only((buftarg)->bt_bdev) -int xfs_buf_ensure_ops(struct xfs_buf *bp, const struct xfs_buf_ops *ops); +int xfs_buf_reverify(struct xfs_buf *bp, const struct xfs_buf_ops *ops); +bool xfs_verify_magic(struct xfs_buf *bp, __be32 dmagic); +bool xfs_verify_magic16(struct xfs_buf *bp, __be16 dmagic); #endif /* __XFS_BUF_H__ */ diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c @@ -51,6 +51,7 @@ static unsigned int xfs_errortag_random_default[] = { XFS_RANDOM_BUF_LRU_REF, XFS_RANDOM_FORCE_SCRUB_REPAIR, XFS_RANDOM_FORCE_SUMMARY_RECALC, + XFS_RANDOM_IUNLINK_FALLBACK, }; struct xfs_errortag_attr { @@ -159,6 +160,7 @@ XFS_ERRORTAG_ATTR_RW(log_item_pin, XFS_ERRTAG_LOG_ITEM_PIN); XFS_ERRORTAG_ATTR_RW(buf_lru_ref, XFS_ERRTAG_BUF_LRU_REF); XFS_ERRORTAG_ATTR_RW(force_repair, XFS_ERRTAG_FORCE_SCRUB_REPAIR); XFS_ERRORTAG_ATTR_RW(bad_summary, XFS_ERRTAG_FORCE_SUMMARY_RECALC); +XFS_ERRORTAG_ATTR_RW(iunlink_fallback, XFS_ERRTAG_IUNLINK_FALLBACK); static struct attribute *xfs_errortag_attrs[] = { XFS_ERRORTAG_ATTR_LIST(noerror), @@ -195,6 +197,7 @@ static struct attribute *xfs_errortag_attrs[] = { XFS_ERRORTAG_ATTR_LIST(buf_lru_ref), XFS_ERRORTAG_ATTR_LIST(force_repair), XFS_ERRORTAG_ATTR_LIST(bad_summary), + XFS_ERRORTAG_ATTR_LIST(iunlink_fallback), NULL, }; @@ -357,7 +360,8 @@ xfs_buf_verifier_error( fa = failaddr ? failaddr : __return_address; __xfs_buf_ioerror(bp, error, fa); - xfs_alert(mp, "Metadata %s detected at %pS, %s block 0x%llx %s", + xfs_alert_tag(mp, XFS_PTAG_VERIFIER_ERROR, + "Metadata %s detected at %pS, %s block 0x%llx %s", bp->b_error == -EFSBADCRC ? "CRC error" : "corruption", fa, bp->b_ops->name, bp->b_bn, name); diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h @@ -98,5 +98,6 @@ extern int xfs_errortag_clearall(struct xfs_mount *mp); #define XFS_PTAG_SHUTDOWN_IOERROR 0x00000020 #define XFS_PTAG_SHUTDOWN_LOGERROR 0x00000040 #define XFS_PTAG_FSBLOCK_ZERO 0x00000080 +#define XFS_PTAG_VERIFIER_ERROR 0x00000100 #endif /* __XFS_ERROR_H__ */ diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c @@ -507,7 +507,7 @@ xfs_file_dio_aio_write( * We can't properly handle unaligned direct I/O to reflink * files yet, as we can't unshare a partial block. */ - if (xfs_is_reflink_inode(ip)) { + if (xfs_is_cow_inode(ip)) { trace_xfs_reflink_bounce_dio_write(ip, iocb->ki_pos, count); return -EREMCHG; } @@ -872,14 +872,27 @@ xfs_file_fallocate( goto out_unlock; } - if (mode & FALLOC_FL_ZERO_RANGE) + if (mode & FALLOC_FL_ZERO_RANGE) { error = xfs_zero_file_space(ip, offset, len); - else { - if (mode & FALLOC_FL_UNSHARE_RANGE) { - error = xfs_reflink_unshare(ip, offset, len); - if (error) - goto out_unlock; + } else if (mode & FALLOC_FL_UNSHARE_RANGE) { + error = xfs_reflink_unshare(ip, offset, len); + if (error) + goto out_unlock; + + if (!xfs_is_always_cow_inode(ip)) { + error = xfs_alloc_file_space(ip, offset, len, + XFS_BMAPI_PREALLOC); } + } else { + /* + * If always_cow mode we can't use preallocations and + * thus should not create them. + */ + if (xfs_is_always_cow_inode(ip)) { + error = -EOPNOTSUPP; + goto out_unlock; + } + error = xfs_alloc_file_space(ip, offset, len, XFS_BMAPI_PREALLOC); } @@ -1068,10 +1081,10 @@ xfs_file_llseek( default: return generic_file_llseek(file, offset, whence); case SEEK_HOLE: - offset = iomap_seek_hole(inode, offset, &xfs_iomap_ops); + offset = iomap_seek_hole(inode, offset, &xfs_seek_iomap_ops); break; case SEEK_DATA: - offset = iomap_seek_data(inode, offset, &xfs_iomap_ops); + offset = iomap_seek_data(inode, offset, &xfs_seek_iomap_ops); break; } diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c @@ -533,6 +533,7 @@ xfs_fs_reserve_ag_blocks( int error = 0; int err2; + mp->m_finobt_nores = false; for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) { pag = xfs_perag_get(mp, agno); err2 = xfs_ag_resv_init(pag, NULL); diff --git a/fs/xfs/xfs_globals.c b/fs/xfs/xfs_globals.c @@ -16,7 +16,7 @@ xfs_param_t xfs_params = { /* MIN DFLT MAX */ .sgid_inherit = { 0, 0, 1 }, .symlink_mode = { 0, 0, 1 }, - .panic_mask = { 0, 0, 255 }, + .panic_mask = { 0, 0, 256 }, .error_level = { 0, 3, 11 }, .syncd_timer = { 1*100, 30*100, 7200*100}, .stats_clear = { 0, 0, 1 }, diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c @@ -1332,7 +1332,7 @@ xfs_create_tmpfile( if (error) goto out_trans_cancel; - error = xfs_dir_ialloc(&tp, dp, mode, 1, 0, prid, &ip); + error = xfs_dir_ialloc(&tp, dp, mode, 0, 0, prid, &ip); if (error) goto out_trans_cancel; @@ -1754,7 +1754,7 @@ xfs_inactive_ifree( * now remains allocated and sits on the unlinked list until the fs is * repaired. */ - if (unlikely(mp->m_inotbt_nores)) { + if (unlikely(mp->m_finobt_nores)) { error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ifree, XFS_IFREE_SPACE_RES(mp), 0, XFS_TRANS_RESERVE, &tp); @@ -1907,86 +1907,510 @@ xfs_inactive( } /* - * This is called when the inode's link count goes to 0 or we are creating a - * tmpfile via O_TMPFILE. In the case of a tmpfile, @ignore_linkcount will be - * set to true as the link count is dropped to zero by the VFS after we've - * created the file successfully, so we have to add it to the unlinked list - * while the link count is non-zero. + * In-Core Unlinked List Lookups + * ============================= + * + * Every inode is supposed to be reachable from some other piece of metadata + * with the exception of the root directory. Inodes with a connection to a + * file descriptor but not linked from anywhere in the on-disk directory tree + * are collectively known as unlinked inodes, though the filesystem itself + * maintains links to these inodes so that on-disk metadata are consistent. + * + * XFS implements a per-AG on-disk hash table of unlinked inodes. The AGI + * header contains a number of buckets that point to an inode, and each inode + * record has a pointer to the next inode in the hash chain. This + * singly-linked list causes scaling problems in the iunlink remove function + * because we must walk that list to find the inode that points to the inode + * being removed from the unlinked hash bucket list. + * + * What if we modelled the unlinked list as a collection of records capturing + * "X.next_unlinked = Y" relations? If we indexed those records on Y, we'd + * have a fast way to look up unlinked list predecessors, which avoids the + * slow list walk. That's exactly what we do here (in-core) with a per-AG + * rhashtable. + * + * Because this is a backref cache, we ignore operational failures since the + * iunlink code can fall back to the slow bucket walk. The only errors that + * should bubble out are for obviously incorrect situations. + * + * All users of the backref cache MUST hold the AGI buffer lock to serialize + * access or have otherwise provided for concurrency control. + */ + +/* Capture a "X.next_unlinked = Y" relationship. */ +struct xfs_iunlink { + struct rhash_head iu_rhash_head; + xfs_agino_t iu_agino; /* X */ + xfs_agino_t iu_next_unlinked; /* Y */ +}; + +/* Unlinked list predecessor lookup hashtable construction */ +static int +xfs_iunlink_obj_cmpfn( + struct rhashtable_compare_arg *arg, + const void *obj) +{ + const xfs_agino_t *key = arg->key; + const struct xfs_iunlink *iu = obj; + + if (iu->iu_next_unlinked != *key) + return 1; + return 0; +} + +static const struct rhashtable_params xfs_iunlink_hash_params = { + .min_size = XFS_AGI_UNLINKED_BUCKETS, + .key_len = sizeof(xfs_agino_t), + .key_offset = offsetof(struct xfs_iunlink, + iu_next_unlinked), + .head_offset = offsetof(struct xfs_iunlink, iu_rhash_head), + .automatic_shrinking = true, + .obj_cmpfn = xfs_iunlink_obj_cmpfn, +}; + +/* + * Return X, where X.next_unlinked == @agino. Returns NULLAGINO if no such + * relation is found. + */ +static xfs_agino_t +xfs_iunlink_lookup_backref( + struct xfs_perag *pag, + xfs_agino_t agino) +{ + struct xfs_iunlink *iu; + + iu = rhashtable_lookup_fast(&pag->pagi_unlinked_hash, &agino, + xfs_iunlink_hash_params); + return iu ? iu->iu_agino : NULLAGINO; +} + +/* + * Take ownership of an iunlink cache entry and insert it into the hash table. + * If successful, the entry will be owned by the cache; if not, it is freed. + * Either way, the caller does not own @iu after this call. + */ +static int +xfs_iunlink_insert_backref( + struct xfs_perag *pag, + struct xfs_iunlink *iu) +{ + int error; + + error = rhashtable_insert_fast(&pag->pagi_unlinked_hash, + &iu->iu_rhash_head, xfs_iunlink_hash_params); + /* + * Fail loudly if there already was an entry because that's a sign of + * corruption of in-memory data. Also fail loudly if we see an error + * code we didn't anticipate from the rhashtable code. Currently we + * only anticipate ENOMEM. + */ + if (error) { + WARN(error != -ENOMEM, "iunlink cache insert error %d", error); + kmem_free(iu); + } + /* + * Absorb any runtime errors that aren't a result of corruption because + * this is a cache and we can always fall back to bucket list scanning. + */ + if (error != 0 && error != -EEXIST) + error = 0; + return error; +} + +/* Remember that @prev_agino.next_unlinked = @this_agino. */ +static int +xfs_iunlink_add_backref( + struct xfs_perag *pag, + xfs_agino_t prev_agino, + xfs_agino_t this_agino) +{ + struct xfs_iunlink *iu; + + if (XFS_TEST_ERROR(false, pag->pag_mount, XFS_ERRTAG_IUNLINK_FALLBACK)) + return 0; + + iu = kmem_zalloc(sizeof(*iu), KM_SLEEP | KM_NOFS); + iu->iu_agino = prev_agino; + iu->iu_next_unlinked = this_agino; + + return xfs_iunlink_insert_backref(pag, iu); +} + +/* + * Replace X.next_unlinked = @agino with X.next_unlinked = @next_unlinked. + * If @next_unlinked is NULLAGINO, we drop the backref and exit. If there + * wasn't any such entry then we don't bother. + */ +static int +xfs_iunlink_change_backref( + struct xfs_perag *pag, + xfs_agino_t agino, + xfs_agino_t next_unlinked) +{ + struct xfs_iunlink *iu; + int error; + + /* Look up the old entry; if there wasn't one then exit. */ + iu = rhashtable_lookup_fast(&pag->pagi_unlinked_hash, &agino, + xfs_iunlink_hash_params); + if (!iu) + return 0; + + /* + * Remove the entry. This shouldn't ever return an error, but if we + * couldn't remove the old entry we don't want to add it again to the + * hash table, and if the entry disappeared on us then someone's + * violated the locking rules and we need to fail loudly. Either way + * we cannot remove the inode because internal state is or would have + * been corrupt. + */ + error = rhashtable_remove_fast(&pag->pagi_unlinked_hash, + &iu->iu_rhash_head, xfs_iunlink_hash_params); + if (error) + return error; + + /* If there is no new next entry just free our item and return. */ + if (next_unlinked == NULLAGINO) { + kmem_free(iu); + return 0; + } + + /* Update the entry and re-add it to the hash table. */ + iu->iu_next_unlinked = next_unlinked; + return xfs_iunlink_insert_backref(pag, iu); +} + +/* Set up the in-core predecessor structures. */ +int +xfs_iunlink_init( + struct xfs_perag *pag) +{ + return rhashtable_init(&pag->pagi_unlinked_hash, + &xfs_iunlink_hash_params); +} + +/* Free the in-core predecessor structures. */ +static void +xfs_iunlink_free_item( + void *ptr, + void *arg) +{ + struct xfs_iunlink *iu = ptr; + bool *freed_anything = arg; + + *freed_anything = true; + kmem_free(iu); +} + +void +xfs_iunlink_destroy( + struct xfs_perag *pag) +{ + bool freed_anything = false; + + rhashtable_free_and_destroy(&pag->pagi_unlinked_hash, + xfs_iunlink_free_item, &freed_anything); + + ASSERT(freed_anything == false || XFS_FORCED_SHUTDOWN(pag->pag_mount)); +} + +/* + * Point the AGI unlinked bucket at an inode and log the results. The caller + * is responsible for validating the old value. + */ +STATIC int +xfs_iunlink_update_bucket( + struct xfs_trans *tp, + xfs_agnumber_t agno, + struct xfs_buf *agibp, + unsigned int bucket_index, + xfs_agino_t new_agino) +{ + struct xfs_agi *agi = XFS_BUF_TO_AGI(agibp); + xfs_agino_t old_value; + int offset; + + ASSERT(xfs_verify_agino_or_null(tp->t_mountp, agno, new_agino)); + + old_value = be32_to_cpu(agi->agi_unlinked[bucket_index]); + trace_xfs_iunlink_update_bucket(tp->t_mountp, agno, bucket_index, + old_value, new_agino); + + /* + * We should never find the head of the list already set to the value + * passed in because either we're adding or removing ourselves from the + * head of the list. + */ + if (old_value == new_agino) + return -EFSCORRUPTED; + + agi->agi_unlinked[bucket_index] = cpu_to_be32(new_agino); + offset = offsetof(struct xfs_agi, agi_unlinked) + + (sizeof(xfs_agino_t) * bucket_index); + xfs_trans_log_buf(tp, agibp, offset, offset + sizeof(xfs_agino_t) - 1); + return 0; +} + +/* Set an on-disk inode's next_unlinked pointer. */ +STATIC void +xfs_iunlink_update_dinode( + struct xfs_trans *tp, + xfs_agnumber_t agno, + xfs_agino_t agino, + struct xfs_buf *ibp, + struct xfs_dinode *dip, + struct xfs_imap *imap, + xfs_agino_t next_agino) +{ + struct xfs_mount *mp = tp->t_mountp; + int offset; + + ASSERT(xfs_verify_agino_or_null(mp, agno, next_agino)); + + trace_xfs_iunlink_update_dinode(mp, agno, agino, + be32_to_cpu(dip->di_next_unlinked), next_agino); + + dip->di_next_unlinked = cpu_to_be32(next_agino); + offset = imap->im_boffset + + offsetof(struct xfs_dinode, di_next_unlinked); + + /* need to recalc the inode CRC if appropriate */ + xfs_dinode_calc_crc(mp, dip); + xfs_trans_inode_buf(tp, ibp); + xfs_trans_log_buf(tp, ibp, offset, offset + sizeof(xfs_agino_t) - 1); + xfs_inobp_check(mp, ibp); +} + +/* Set an in-core inode's unlinked pointer and return the old value. */ +STATIC int +xfs_iunlink_update_inode( + struct xfs_trans *tp, + struct xfs_inode *ip, + xfs_agnumber_t agno, + xfs_agino_t next_agino, + xfs_agino_t *old_next_agino) +{ + struct xfs_mount *mp = tp->t_mountp; + struct xfs_dinode *dip; + struct xfs_buf *ibp; + xfs_agino_t old_value; + int error; + + ASSERT(xfs_verify_agino_or_null(mp, agno, next_agino)); + + error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &ibp, 0, 0); + if (error) + return error; + + /* Make sure the old pointer isn't garbage. */ + old_value = be32_to_cpu(dip->di_next_unlinked); + if (!xfs_verify_agino_or_null(mp, agno, old_value)) { + error = -EFSCORRUPTED; + goto out; + } + + /* + * Since we're updating a linked list, we should never find that the + * current pointer is the same as the new value, unless we're + * terminating the list. + */ + *old_next_agino = old_value; + if (old_value == next_agino) { + if (next_agino != NULLAGINO) + error = -EFSCORRUPTED; + goto out; + } + + /* Ok, update the new pointer. */ + xfs_iunlink_update_dinode(tp, agno, XFS_INO_TO_AGINO(mp, ip->i_ino), + ibp, dip, &ip->i_imap, next_agino); + return 0; +out: + xfs_trans_brelse(tp, ibp); + return error; +} + +/* + * This is called when the inode's link count has gone to 0 or we are creating + * a tmpfile via O_TMPFILE. The inode @ip must have nlink == 0. * * We place the on-disk inode on a list in the AGI. It will be pulled from this * list when the inode is freed. */ STATIC int xfs_iunlink( - struct xfs_trans *tp, - struct xfs_inode *ip) + struct xfs_trans *tp, + struct xfs_inode *ip) { - xfs_mount_t *mp = tp->t_mountp; - xfs_agi_t *agi; - xfs_dinode_t *dip; - xfs_buf_t *agibp; - xfs_buf_t *ibp; - xfs_agino_t agino; - short bucket_index; - int offset; - int error; + struct xfs_mount *mp = tp->t_mountp; + struct xfs_agi *agi; + struct xfs_buf *agibp; + xfs_agino_t next_agino; + xfs_agnumber_t agno = XFS_INO_TO_AGNO(mp, ip->i_ino); + xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ip->i_ino); + short bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS; + int error; + ASSERT(VFS_I(ip)->i_nlink == 0); ASSERT(VFS_I(ip)->i_mode != 0); + trace_xfs_iunlink(ip); - /* - * Get the agi buffer first. It ensures lock ordering - * on the list. - */ - error = xfs_read_agi(mp, tp, XFS_INO_TO_AGNO(mp, ip->i_ino), &agibp); + /* Get the agi buffer first. It ensures lock ordering on the list. */ + error = xfs_read_agi(mp, tp, agno, &agibp); if (error) return error; agi = XFS_BUF_TO_AGI(agibp); /* - * Get the index into the agi hash table for the - * list this inode will go on. + * Get the index into the agi hash table for the list this inode will + * go on. Make sure the pointer isn't garbage and that this inode + * isn't already on the list. */ - agino = XFS_INO_TO_AGINO(mp, ip->i_ino); - ASSERT(agino != 0); - bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS; - ASSERT(agi->agi_unlinked[bucket_index]); - ASSERT(be32_to_cpu(agi->agi_unlinked[bucket_index]) != agino); + next_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]); + if (next_agino == agino || + !xfs_verify_agino_or_null(mp, agno, next_agino)) + return -EFSCORRUPTED; + + if (next_agino != NULLAGINO) { + struct xfs_perag *pag; + xfs_agino_t old_agino; + + /* + * There is already another inode in the bucket, so point this + * inode to the current head of the list. + */ + error = xfs_iunlink_update_inode(tp, ip, agno, next_agino, + &old_agino); + if (error) + return error; + ASSERT(old_agino == NULLAGINO); - if (agi->agi_unlinked[bucket_index] != cpu_to_be32(NULLAGINO)) { /* - * There is already another inode in the bucket we need - * to add ourselves to. Add us at the front of the list. - * Here we put the head pointer into our next pointer, - * and then we fall through to point the head at us. + * agino has been unlinked, add a backref from the next inode + * back to agino. */ - error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &ibp, - 0, 0); + pag = xfs_perag_get(mp, agno); + error = xfs_iunlink_add_backref(pag, agino, next_agino); + xfs_perag_put(pag); if (error) return error; + } + + /* Point the head of the list to point to this inode. */ + return xfs_iunlink_update_bucket(tp, agno, agibp, bucket_index, agino); +} - ASSERT(dip->di_next_unlinked == cpu_to_be32(NULLAGINO)); - dip->di_next_unlinked = agi->agi_unlinked[bucket_index]; - offset = ip->i_imap.im_boffset + - offsetof(xfs_dinode_t, di_next_unlinked); +/* Return the imap, dinode pointer, and buffer for an inode. */ +STATIC int +xfs_iunlink_map_ino( + struct xfs_trans *tp, + xfs_agnumber_t agno, + xfs_agino_t agino, + struct xfs_imap *imap, + struct xfs_dinode **dipp, + struct xfs_buf **bpp) +{ + struct xfs_mount *mp = tp->t_mountp; + int error; - /* need to recalc the inode CRC if appropriate */ - xfs_dinode_calc_crc(mp, dip); + imap->im_blkno = 0; + error = xfs_imap(mp, tp, XFS_AGINO_TO_INO(mp, agno, agino), imap, 0); + if (error) { + xfs_warn(mp, "%s: xfs_imap returned error %d.", + __func__, error); + return error; + } - xfs_trans_inode_buf(tp, ibp); - xfs_trans_log_buf(tp, ibp, offset, - (offset + sizeof(xfs_agino_t) - 1)); - xfs_inobp_check(mp, ibp); + error = xfs_imap_to_bp(mp, tp, imap, dipp, bpp, 0, 0); + if (error) { + xfs_warn(mp, "%s: xfs_imap_to_bp returned error %d.", + __func__, error); + return error; + } + + return 0; +} + +/* + * Walk the unlinked chain from @head_agino until we find the inode that + * points to @target_agino. Return the inode number, map, dinode pointer, + * and inode cluster buffer of that inode as @agino, @imap, @dipp, and @bpp. + * + * @tp, @pag, @head_agino, and @target_agino are input parameters. + * @agino, @imap, @dipp, and @bpp are all output parameters. + * + * Do not call this function if @target_agino is the head of the list. + */ +STATIC int +xfs_iunlink_map_prev( + struct xfs_trans *tp, + xfs_agnumber_t agno, + xfs_agino_t head_agino, + xfs_agino_t target_agino, + xfs_agino_t *agino, + struct xfs_imap *imap, + struct xfs_dinode **dipp, + struct xfs_buf **bpp, + struct xfs_perag *pag) +{ + struct xfs_mount *mp = tp->t_mountp; + xfs_agino_t next_agino; + int error; + + ASSERT(head_agino != target_agino); + *bpp = NULL; + + /* See if our backref cache can find it faster. */ + *agino = xfs_iunlink_lookup_backref(pag, target_agino); + if (*agino != NULLAGINO) { + error = xfs_iunlink_map_ino(tp, agno, *agino, imap, dipp, bpp); + if (error) + return error; + + if (be32_to_cpu((*dipp)->di_next_unlinked) == target_agino) + return 0; + + /* + * If we get here the cache contents were corrupt, so drop the + * buffer and fall back to walking the bucket list. + */ + xfs_trans_brelse(tp, *bpp); + *bpp = NULL; + WARN_ON_ONCE(1); + } + + trace_xfs_iunlink_map_prev_fallback(mp, agno); + + /* Otherwise, walk the entire bucket until we find it. */ + next_agino = head_agino; + while (next_agino != target_agino) { + xfs_agino_t unlinked_agino; + + if (*bpp) + xfs_trans_brelse(tp, *bpp); + + *agino = next_agino; + error = xfs_iunlink_map_ino(tp, agno, next_agino, imap, dipp, + bpp); + if (error) + return error; + + unlinked_agino = be32_to_cpu((*dipp)->di_next_unlinked); + /* + * Make sure this pointer is valid and isn't an obvious + * infinite loop. + */ + if (!xfs_verify_agino(mp, agno, unlinked_agino) || + next_agino == unlinked_agino) { + XFS_CORRUPTION_ERROR(__func__, + XFS_ERRLEVEL_LOW, mp, + *dipp, sizeof(**dipp)); + error = -EFSCORRUPTED; + return error; + } + next_agino = unlinked_agino; } - /* - * Point the bucket head pointer at the inode being inserted. - */ - ASSERT(agino != 0); - agi->agi_unlinked[bucket_index] = cpu_to_be32(agino); - offset = offsetof(xfs_agi_t, agi_unlinked) + - (sizeof(xfs_agino_t) * bucket_index); - xfs_trans_log_buf(tp, agibp, offset, - (offset + sizeof(xfs_agino_t) - 1)); return 0; } @@ -1995,181 +2419,106 @@ xfs_iunlink( */ STATIC int xfs_iunlink_remove( - xfs_trans_t *tp, - xfs_inode_t *ip) + struct xfs_trans *tp, + struct xfs_inode *ip) { - xfs_ino_t next_ino; - xfs_mount_t *mp; - xfs_agi_t *agi; - xfs_dinode_t *dip; - xfs_buf_t *agibp; - xfs_buf_t *ibp; - xfs_agnumber_t agno; - xfs_agino_t agino; - xfs_agino_t next_agino; - xfs_buf_t *last_ibp; - xfs_dinode_t *last_dip = NULL; - short bucket_index; - int offset, last_offset = 0; - int error; + struct xfs_mount *mp = tp->t_mountp; + struct xfs_agi *agi; + struct xfs_buf *agibp; + struct xfs_buf *last_ibp; + struct xfs_dinode *last_dip = NULL; + struct xfs_perag *pag = NULL; + xfs_agnumber_t agno = XFS_INO_TO_AGNO(mp, ip->i_ino); + xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ip->i_ino); + xfs_agino_t next_agino; + xfs_agino_t head_agino; + short bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS; + int error; - mp = tp->t_mountp; - agno = XFS_INO_TO_AGNO(mp, ip->i_ino); + trace_xfs_iunlink_remove(ip); - /* - * Get the agi buffer first. It ensures lock ordering - * on the list. - */ + /* Get the agi buffer first. It ensures lock ordering on the list. */ error = xfs_read_agi(mp, tp, agno, &agibp); if (error) return error; - agi = XFS_BUF_TO_AGI(agibp); /* - * Get the index into the agi hash table for the - * list this inode will go on. + * Get the index into the agi hash table for the list this inode will + * go on. Make sure the head pointer isn't garbage. */ - agino = XFS_INO_TO_AGINO(mp, ip->i_ino); - if (!xfs_verify_agino(mp, agno, agino)) - return -EFSCORRUPTED; - bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS; - if (!xfs_verify_agino(mp, agno, - be32_to_cpu(agi->agi_unlinked[bucket_index]))) { + head_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]); + if (!xfs_verify_agino(mp, agno, head_agino)) { XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, agi, sizeof(*agi)); return -EFSCORRUPTED; } - if (be32_to_cpu(agi->agi_unlinked[bucket_index]) == agino) { - /* - * We're at the head of the list. Get the inode's on-disk - * buffer to see if there is anyone after us on the list. - * Only modify our next pointer if it is not already NULLAGINO. - * This saves us the overhead of dealing with the buffer when - * there is no need to change it. - */ - error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &ibp, - 0, 0); - if (error) { - xfs_warn(mp, "%s: xfs_imap_to_bp returned error %d.", - __func__, error); - return error; - } - next_agino = be32_to_cpu(dip->di_next_unlinked); - ASSERT(next_agino != 0); - if (next_agino != NULLAGINO) { - dip->di_next_unlinked = cpu_to_be32(NULLAGINO); - offset = ip->i_imap.im_boffset + - offsetof(xfs_dinode_t, di_next_unlinked); - - /* need to recalc the inode CRC if appropriate */ - xfs_dinode_calc_crc(mp, dip); - - xfs_trans_inode_buf(tp, ibp); - xfs_trans_log_buf(tp, ibp, offset, - (offset + sizeof(xfs_agino_t) - 1)); - xfs_inobp_check(mp, ibp); - } else { - xfs_trans_brelse(tp, ibp); - } - /* - * Point the bucket head pointer at the next inode. - */ - ASSERT(next_agino != 0); - ASSERT(next_agino != agino); - agi->agi_unlinked[bucket_index] = cpu_to_be32(next_agino); - offset = offsetof(xfs_agi_t, agi_unlinked) + - (sizeof(xfs_agino_t) * bucket_index); - xfs_trans_log_buf(tp, agibp, offset, - (offset + sizeof(xfs_agino_t) - 1)); - } else { - /* - * We need to search the list for the inode being freed. - */ - next_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]); - last_ibp = NULL; - while (next_agino != agino) { - struct xfs_imap imap; + /* + * Set our inode's next_unlinked pointer to NULL and then return + * the old pointer value so that we can update whatever was previous + * to us in the list to point to whatever was next in the list. + */ + error = xfs_iunlink_update_inode(tp, ip, agno, NULLAGINO, &next_agino); + if (error) + return error; - if (last_ibp) - xfs_trans_brelse(tp, last_ibp); + /* + * If there was a backref pointing from the next inode back to this + * one, remove it because we've removed this inode from the list. + * + * Later, if this inode was in the middle of the list we'll update + * this inode's backref to point from the next inode. + */ + if (next_agino != NULLAGINO) { + pag = xfs_perag_get(mp, agno); + error = xfs_iunlink_change_backref(pag, next_agino, + NULLAGINO); + if (error) + goto out; + } - imap.im_blkno = 0; - next_ino = XFS_AGINO_TO_INO(mp, agno, next_agino); + if (head_agino == agino) { + /* Point the head of the list to the next unlinked inode. */ + error = xfs_iunlink_update_bucket(tp, agno, agibp, bucket_index, + next_agino); + if (error) + goto out; + } else { + struct xfs_imap imap; + xfs_agino_t prev_agino; - error = xfs_imap(mp, tp, next_ino, &imap, 0); - if (error) { - xfs_warn(mp, - "%s: xfs_imap returned error %d.", - __func__, error); - return error; - } + if (!pag) + pag = xfs_perag_get(mp, agno); - error = xfs_imap_to_bp(mp, tp, &imap, &last_dip, - &last_ibp, 0, 0); - if (error) { - xfs_warn(mp, - "%s: xfs_imap_to_bp returned error %d.", - __func__, error); - return error; - } + /* We need to search the list for the inode being freed. */ + error = xfs_iunlink_map_prev(tp, agno, head_agino, agino, + &prev_agino, &imap, &last_dip, &last_ibp, + pag); + if (error) + goto out; - last_offset = imap.im_boffset; - next_agino = be32_to_cpu(last_dip->di_next_unlinked); - if (!xfs_verify_agino(mp, agno, next_agino)) { - XFS_CORRUPTION_ERROR(__func__, - XFS_ERRLEVEL_LOW, mp, - last_dip, sizeof(*last_dip)); - return -EFSCORRUPTED; - } - } + /* Point the previous inode on the list to the next inode. */ + xfs_iunlink_update_dinode(tp, agno, prev_agino, last_ibp, + last_dip, &imap, next_agino); /* - * Now last_ibp points to the buffer previous to us on the - * unlinked list. Pull us from the list. + * Now we deal with the backref for this inode. If this inode + * pointed at a real inode, change the backref that pointed to + * us to point to our old next. If this inode was the end of + * the list, delete the backref that pointed to us. Note that + * change_backref takes care of deleting the backref if + * next_agino is NULLAGINO. */ - error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &ibp, - 0, 0); - if (error) { - xfs_warn(mp, "%s: xfs_imap_to_bp(2) returned error %d.", - __func__, error); - return error; - } - next_agino = be32_to_cpu(dip->di_next_unlinked); - ASSERT(next_agino != 0); - ASSERT(next_agino != agino); - if (next_agino != NULLAGINO) { - dip->di_next_unlinked = cpu_to_be32(NULLAGINO); - offset = ip->i_imap.im_boffset + - offsetof(xfs_dinode_t, di_next_unlinked); - - /* need to recalc the inode CRC if appropriate */ - xfs_dinode_calc_crc(mp, dip); - - xfs_trans_inode_buf(tp, ibp); - xfs_trans_log_buf(tp, ibp, offset, - (offset + sizeof(xfs_agino_t) - 1)); - xfs_inobp_check(mp, ibp); - } else { - xfs_trans_brelse(tp, ibp); - } - /* - * Point the previous inode on the list to the next inode. - */ - last_dip->di_next_unlinked = cpu_to_be32(next_agino); - ASSERT(next_agino != 0); - offset = last_offset + offsetof(xfs_dinode_t, di_next_unlinked); - - /* need to recalc the inode CRC if appropriate */ - xfs_dinode_calc_crc(mp, last_dip); - - xfs_trans_inode_buf(tp, last_ibp); - xfs_trans_log_buf(tp, last_ibp, offset, - (offset + sizeof(xfs_agino_t) - 1)); - xfs_inobp_check(mp, last_ibp); + error = xfs_iunlink_change_backref(pag, agino, next_agino); + if (error) + goto out; } - return 0; + +out: + if (pag) + xfs_perag_put(pag); + return error; } /* @@ -2833,11 +3182,9 @@ xfs_rename_alloc_whiteout( /* * Prepare the tmpfile inode as if it were created through the VFS. - * Otherwise, the link increment paths will complain about nlink 0->1. - * Drop the link count as done by d_tmpfile(), complete the inode setup - * and flag it as linkable. + * Complete the inode setup and flag it as linkable. nlink is already + * zero, so we can skip the drop_nlink. */ - drop_nlink(VFS_I(tmpfile)); xfs_setup_iops(tmpfile); xfs_finish_inode_setup(tmpfile); VFS_I(tmpfile)->i_state |= I_LINKABLE; diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h @@ -500,4 +500,7 @@ extern struct kmem_zone *xfs_inode_zone; bool xfs_inode_verify_forks(struct xfs_inode *ip); +int xfs_iunlink_init(struct xfs_perag *pag); +void xfs_iunlink_destroy(struct xfs_perag *pag); + #endif /* __XFS_INODE_H__ */ diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c @@ -35,18 +35,40 @@ #define XFS_WRITEIO_ALIGN(mp,off) (((off) >> mp->m_writeio_log) \ << mp->m_writeio_log) -void +static int +xfs_alert_fsblock_zero( + xfs_inode_t *ip, + xfs_bmbt_irec_t *imap) +{ + xfs_alert_tag(ip->i_mount, XFS_PTAG_FSBLOCK_ZERO, + "Access to block zero in inode %llu " + "start_block: %llx start_off: %llx " + "blkcnt: %llx extent-state: %x", + (unsigned long long)ip->i_ino, + (unsigned long long)imap->br_startblock, + (unsigned long long)imap->br_startoff, + (unsigned long long)imap->br_blockcount, + imap->br_state); + return -EFSCORRUPTED; +} + +int xfs_bmbt_to_iomap( struct xfs_inode *ip, struct iomap *iomap, - struct xfs_bmbt_irec *imap) + struct xfs_bmbt_irec *imap, + bool shared) { struct xfs_mount *mp = ip->i_mount; + if (unlikely(!imap->br_startblock && !XFS_IS_REALTIME_INODE(ip))) + return xfs_alert_fsblock_zero(ip, imap); + if (imap->br_startblock == HOLESTARTBLOCK) { iomap->addr = IOMAP_NULL_ADDR; iomap->type = IOMAP_HOLE; - } else if (imap->br_startblock == DELAYSTARTBLOCK) { + } else if (imap->br_startblock == DELAYSTARTBLOCK || + isnullstartblock(imap->br_startblock)) { iomap->addr = IOMAP_NULL_ADDR; iomap->type = IOMAP_DELALLOC; } else { @@ -60,6 +82,13 @@ xfs_bmbt_to_iomap( iomap->length = XFS_FSB_TO_B(mp, imap->br_blockcount); iomap->bdev = xfs_find_bdev_for_inode(VFS_I(ip)); iomap->dax_dev = xfs_find_daxdev_for_inode(VFS_I(ip)); + + if (xfs_ipincount(ip) && + (ip->i_itemp->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP)) + iomap->flags |= IOMAP_F_DIRTY; + if (shared) + iomap->flags |= IOMAP_F_SHARED; + return 0; } static void @@ -138,23 +167,6 @@ xfs_iomap_eof_align_last_fsb( return 0; } -STATIC int -xfs_alert_fsblock_zero( - xfs_inode_t *ip, - xfs_bmbt_irec_t *imap) -{ - xfs_alert_tag(ip->i_mount, XFS_PTAG_FSBLOCK_ZERO, - "Access to block zero in inode %llu " - "start_block: %llx start_off: %llx " - "blkcnt: %llx extent-state: %x", - (unsigned long long)ip->i_ino, - (unsigned long long)imap->br_startblock, - (unsigned long long)imap->br_startoff, - (unsigned long long)imap->br_blockcount, - imap->br_state); - return -EFSCORRUPTED; -} - int xfs_iomap_write_direct( xfs_inode_t *ip, @@ -383,12 +395,13 @@ xfs_quota_calc_throttle( STATIC xfs_fsblock_t xfs_iomap_prealloc_size( struct xfs_inode *ip, + int whichfork, loff_t offset, loff_t count, struct xfs_iext_cursor *icur) { struct xfs_mount *mp = ip->i_mount; - struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset); struct xfs_bmbt_irec prev; int shift = 0; @@ -522,15 +535,16 @@ xfs_file_iomap_begin_delay( { struct xfs_inode *ip = XFS_I(inode); struct xfs_mount *mp = ip->i_mount; - struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset); xfs_fileoff_t maxbytes_fsb = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes); xfs_fileoff_t end_fsb; - int error = 0, eof = 0; - struct xfs_bmbt_irec got; - struct xfs_iext_cursor icur; + struct xfs_bmbt_irec imap, cmap; + struct xfs_iext_cursor icur, ccur; xfs_fsblock_t prealloc_blocks = 0; + bool eof = false, cow_eof = false, shared = false; + int whichfork = XFS_DATA_FORK; + int error = 0; ASSERT(!XFS_IS_REALTIME_INODE(ip)); ASSERT(!xfs_get_extsz_hint(ip)); @@ -548,7 +562,7 @@ xfs_file_iomap_begin_delay( XFS_STATS_INC(mp, xs_blk_mapw); - if (!(ifp->if_flags & XFS_IFEXTENTS)) { + if (!(ip->i_df.if_flags & XFS_IFEXTENTS)) { error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK); if (error) goto out_unlock; @@ -556,53 +570,101 @@ xfs_file_iomap_begin_delay( end_fsb = min(XFS_B_TO_FSB(mp, offset + count), maxbytes_fsb); - eof = !xfs_iext_lookup_extent(ip, ifp, offset_fsb, &icur, &got); + /* + * Search the data fork fork first to look up our source mapping. We + * always need the data fork map, as we have to return it to the + * iomap code so that the higher level write code can read data in to + * perform read-modify-write cycles for unaligned writes. + */ + eof = !xfs_iext_lookup_extent(ip, &ip->i_df, offset_fsb, &icur, &imap); if (eof) - got.br_startoff = end_fsb; /* fake hole until the end */ + imap.br_startoff = end_fsb; /* fake hole until the end */ + + /* We never need to allocate blocks for zeroing a hole. */ + if ((flags & IOMAP_ZERO) && imap.br_startoff > offset_fsb) { + xfs_hole_to_iomap(ip, iomap, offset_fsb, imap.br_startoff); + goto out_unlock; + } - if (got.br_startoff <= offset_fsb) { + /* + * Search the COW fork extent list even if we did not find a data fork + * extent. This serves two purposes: first this implements the + * speculative preallocation using cowextsize, so that we also unshare + * block adjacent to shared blocks instead of just the shared blocks + * themselves. Second the lookup in the extent list is generally faster + * than going out to the shared extent tree. + */ + if (xfs_is_cow_inode(ip)) { + if (!ip->i_cowfp) { + ASSERT(!xfs_is_reflink_inode(ip)); + xfs_ifork_init_cow(ip); + } + cow_eof = !xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, + &ccur, &cmap); + if (!cow_eof && cmap.br_startoff <= offset_fsb) { + trace_xfs_reflink_cow_found(ip, &cmap); + whichfork = XFS_COW_FORK; + goto done; + } + } + + if (imap.br_startoff <= offset_fsb) { /* * For reflink files we may need a delalloc reservation when * overwriting shared extents. This includes zeroing of * existing extents that contain data. */ - if (xfs_is_reflink_inode(ip) && - ((flags & IOMAP_WRITE) || - got.br_state != XFS_EXT_UNWRITTEN)) { - xfs_trim_extent(&got, offset_fsb, end_fsb - offset_fsb); - error = xfs_reflink_reserve_cow(ip, &got); - if (error) - goto out_unlock; + if (!xfs_is_cow_inode(ip) || + ((flags & IOMAP_ZERO) && imap.br_state != XFS_EXT_NORM)) { + trace_xfs_iomap_found(ip, offset, count, XFS_DATA_FORK, + &imap); + goto done; } - trace_xfs_iomap_found(ip, offset, count, 0, &got); - goto done; - } + xfs_trim_extent(&imap, offset_fsb, end_fsb - offset_fsb); - if (flags & IOMAP_ZERO) { - xfs_hole_to_iomap(ip, iomap, offset_fsb, got.br_startoff); - goto out_unlock; + /* Trim the mapping to the nearest shared extent boundary. */ + error = xfs_inode_need_cow(ip, &imap, &shared); + if (error) + goto out_unlock; + + /* Not shared? Just report the (potentially capped) extent. */ + if (!shared) { + trace_xfs_iomap_found(ip, offset, count, XFS_DATA_FORK, + &imap); + goto done; + } + + /* + * Fork all the shared blocks from our write offset until the + * end of the extent. + */ + whichfork = XFS_COW_FORK; + end_fsb = imap.br_startoff + imap.br_blockcount; + } else { + /* + * We cap the maximum length we map here to MAX_WRITEBACK_PAGES + * pages to keep the chunks of work done where somewhat + * symmetric with the work writeback does. This is a completely + * arbitrary number pulled out of thin air. + * + * Note that the values needs to be less than 32-bits wide until + * the lower level functions are updated. + */ + count = min_t(loff_t, count, 1024 * PAGE_SIZE); + end_fsb = min(XFS_B_TO_FSB(mp, offset + count), maxbytes_fsb); + + if (xfs_is_always_cow_inode(ip)) + whichfork = XFS_COW_FORK; } error = xfs_qm_dqattach_locked(ip, false); if (error) goto out_unlock; - /* - * We cap the maximum length we map here to MAX_WRITEBACK_PAGES pages - * to keep the chunks of work done where somewhat symmetric with the - * work writeback does. This is a completely arbitrary number pulled - * out of thin air as a best guess for initial testing. - * - * Note that the values needs to be less than 32-bits wide until - * the lower level functions are updated. - */ - count = min_t(loff_t, count, 1024 * PAGE_SIZE); - end_fsb = min(XFS_B_TO_FSB(mp, offset + count), maxbytes_fsb); - if (eof) { - prealloc_blocks = xfs_iomap_prealloc_size(ip, offset, count, - &icur); + prealloc_blocks = xfs_iomap_prealloc_size(ip, whichfork, offset, + count, &icur); if (prealloc_blocks) { xfs_extlen_t align; xfs_off_t end_offset; @@ -623,9 +685,11 @@ xfs_file_iomap_begin_delay( } retry: - error = xfs_bmapi_reserve_delalloc(ip, XFS_DATA_FORK, offset_fsb, - end_fsb - offset_fsb, prealloc_blocks, &got, &icur, - eof); + error = xfs_bmapi_reserve_delalloc(ip, whichfork, offset_fsb, + end_fsb - offset_fsb, prealloc_blocks, + whichfork == XFS_DATA_FORK ? &imap : &cmap, + whichfork == XFS_DATA_FORK ? &icur : &ccur, + whichfork == XFS_DATA_FORK ? eof : cow_eof); switch (error) { case 0: break; @@ -647,186 +711,22 @@ retry: * them out if the write happens to fail. */ iomap->flags |= IOMAP_F_NEW; - trace_xfs_iomap_alloc(ip, offset, count, 0, &got); + trace_xfs_iomap_alloc(ip, offset, count, whichfork, + whichfork == XFS_DATA_FORK ? &imap : &cmap); done: - if (isnullstartblock(got.br_startblock)) - got.br_startblock = DELAYSTARTBLOCK; - - if (!got.br_startblock) { - error = xfs_alert_fsblock_zero(ip, &got); - if (error) + if (whichfork == XFS_COW_FORK) { + if (imap.br_startoff > offset_fsb) { + xfs_trim_extent(&cmap, offset_fsb, + imap.br_startoff - offset_fsb); + error = xfs_bmbt_to_iomap(ip, iomap, &cmap, true); goto out_unlock; - } - - xfs_bmbt_to_iomap(ip, iomap, &got); - -out_unlock: - xfs_iunlock(ip, XFS_ILOCK_EXCL); - return error; -} - -/* - * Pass in a delayed allocate extent, convert it to real extents; - * return to the caller the extent we create which maps on top of - * the originating callers request. - * - * Called without a lock on the inode. - * - * We no longer bother to look at the incoming map - all we have to - * guarantee is that whatever we allocate fills the required range. - */ -int -xfs_iomap_write_allocate( - xfs_inode_t *ip, - int whichfork, - xfs_off_t offset, - xfs_bmbt_irec_t *imap, - unsigned int *cow_seq) -{ - xfs_mount_t *mp = ip->i_mount; - struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); - xfs_fileoff_t offset_fsb, last_block; - xfs_fileoff_t end_fsb, map_start_fsb; - xfs_filblks_t count_fsb; - xfs_trans_t *tp; - int nimaps; - int error = 0; - int flags = XFS_BMAPI_DELALLOC; - int nres; - - if (whichfork == XFS_COW_FORK) - flags |= XFS_BMAPI_COWFORK | XFS_BMAPI_PREALLOC; - - /* - * Make sure that the dquots are there. - */ - error = xfs_qm_dqattach(ip); - if (error) - return error; - - offset_fsb = XFS_B_TO_FSBT(mp, offset); - count_fsb = imap->br_blockcount; - map_start_fsb = imap->br_startoff; - - XFS_STATS_ADD(mp, xs_xstrat_bytes, XFS_FSB_TO_B(mp, count_fsb)); - - while (count_fsb != 0) { - /* - * Set up a transaction with which to allocate the - * backing store for the file. Do allocations in a - * loop until we get some space in the range we are - * interested in. The other space that might be allocated - * is in the delayed allocation extent on which we sit - * but before our buffer starts. - */ - nimaps = 0; - while (nimaps == 0) { - nres = XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK); - /* - * We have already reserved space for the extent and any - * indirect blocks when creating the delalloc extent, - * there is no need to reserve space in this transaction - * again. - */ - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, 0, - 0, XFS_TRANS_RESERVE, &tp); - if (error) - return error; - - xfs_ilock(ip, XFS_ILOCK_EXCL); - xfs_trans_ijoin(tp, ip, 0); - - /* - * it is possible that the extents have changed since - * we did the read call as we dropped the ilock for a - * while. We have to be careful about truncates or hole - * punchs here - we are not allowed to allocate - * non-delalloc blocks here. - * - * The only protection against truncation is the pages - * for the range we are being asked to convert are - * locked and hence a truncate will block on them - * first. - * - * As a result, if we go beyond the range we really - * need and hit an delalloc extent boundary followed by - * a hole while we have excess blocks in the map, we - * will fill the hole incorrectly and overrun the - * transaction reservation. - * - * Using a single map prevents this as we are forced to - * check each map we look for overlap with the desired - * range and abort as soon as we find it. Also, given - * that we only return a single map, having one beyond - * what we can return is probably a bit silly. - * - * We also need to check that we don't go beyond EOF; - * this is a truncate optimisation as a truncate sets - * the new file size before block on the pages we - * currently have locked under writeback. Because they - * are about to be tossed, we don't need to write them - * back.... - */ - nimaps = 1; - end_fsb = XFS_B_TO_FSB(mp, XFS_ISIZE(ip)); - error = xfs_bmap_last_offset(ip, &last_block, - XFS_DATA_FORK); - if (error) - goto trans_cancel; - - last_block = XFS_FILEOFF_MAX(last_block, end_fsb); - if ((map_start_fsb + count_fsb) > last_block) { - count_fsb = last_block - map_start_fsb; - if (count_fsb == 0) { - error = -EAGAIN; - goto trans_cancel; - } - } - - /* - * From this point onwards we overwrite the imap - * pointer that the caller gave to us. - */ - error = xfs_bmapi_write(tp, ip, map_start_fsb, - count_fsb, flags, nres, imap, - &nimaps); - if (error) - goto trans_cancel; - - error = xfs_trans_commit(tp); - if (error) - goto error0; - - if (whichfork == XFS_COW_FORK) - *cow_seq = READ_ONCE(ifp->if_seq); - xfs_iunlock(ip, XFS_ILOCK_EXCL); - } - - /* - * See if we were able to allocate an extent that - * covers at least part of the callers request - */ - if (!(imap->br_startblock || XFS_IS_REALTIME_INODE(ip))) - return xfs_alert_fsblock_zero(ip, imap); - - if ((offset_fsb >= imap->br_startoff) && - (offset_fsb < (imap->br_startoff + - imap->br_blockcount))) { - XFS_STATS_INC(mp, xs_xstrat_quick); - return 0; } - - /* - * So far we have not mapped the requested part of the - * file, just surrounding data, try again. - */ - count_fsb -= imap->br_blockcount; - map_start_fsb = imap->br_startoff + imap->br_blockcount; + /* ensure we only report blocks we have a reservation for */ + xfs_trim_extent(&imap, cmap.br_startoff, cmap.br_blockcount); + shared = true; } - -trans_cancel: - xfs_trans_cancel(tp); -error0: + error = xfs_bmbt_to_iomap(ip, iomap, &imap, shared); +out_unlock: xfs_iunlock(ip, XFS_ILOCK_EXCL); return error; } @@ -975,7 +875,7 @@ xfs_ilock_for_iomap( * COW writes may allocate delalloc space or convert unwritten COW * extents, so we need to make sure to take the lock exclusively here. */ - if (xfs_is_reflink_inode(ip) && is_write) { + if (xfs_is_cow_inode(ip) && is_write) { /* * FIXME: It could still overwrite on unshared extents and not * need allocation. @@ -1009,7 +909,7 @@ relock: * check, so if we got ILOCK_SHARED for a write and but we're now a * reflink inode we have to switch to ILOCK_EXCL and relock. */ - if (mode == XFS_ILOCK_SHARED && is_write && xfs_is_reflink_inode(ip)) { + if (mode == XFS_ILOCK_SHARED && is_write && xfs_is_cow_inode(ip)) { xfs_iunlock(ip, mode); mode = XFS_ILOCK_EXCL; goto relock; @@ -1081,23 +981,33 @@ xfs_file_iomap_begin( * Break shared extents if necessary. Checks for non-blocking IO have * been done up front, so we don't need to do them here. */ - if (xfs_is_reflink_inode(ip)) { + if (xfs_is_cow_inode(ip)) { + struct xfs_bmbt_irec cmap; + bool directio = (flags & IOMAP_DIRECT); + /* if zeroing doesn't need COW allocation, then we are done. */ if ((flags & IOMAP_ZERO) && !needs_cow_for_zeroing(&imap, nimaps)) goto out_found; - if (flags & IOMAP_DIRECT) { - /* may drop and re-acquire the ilock */ - error = xfs_reflink_allocate_cow(ip, &imap, &shared, - &lockmode); - if (error) - goto out_unlock; - } else { - error = xfs_reflink_reserve_cow(ip, &imap); - if (error) - goto out_unlock; - } + /* may drop and re-acquire the ilock */ + cmap = imap; + error = xfs_reflink_allocate_cow(ip, &cmap, &shared, &lockmode, + directio); + if (error) + goto out_unlock; + + /* + * For buffered writes we need to report the address of the + * previous block (if there was any) so that the higher level + * write code can perform read-modify-write operations; we + * won't need the CoW fork mapping until writeback. For direct + * I/O, which must be block aligned, we need to report the + * newly allocated address. If the data fork has a hole, copy + * the COW fork mapping to avoid allocating to the data fork. + */ + if (directio || imap.br_startblock == HOLESTARTBLOCK) + imap = cmap; end_fsb = imap.br_startoff + imap.br_blockcount; length = XFS_FSB_TO_B(mp, end_fsb) - offset; @@ -1139,23 +1049,15 @@ xfs_file_iomap_begin( return error; iomap->flags |= IOMAP_F_NEW; - trace_xfs_iomap_alloc(ip, offset, length, 0, &imap); + trace_xfs_iomap_alloc(ip, offset, length, XFS_DATA_FORK, &imap); out_finish: - if (xfs_ipincount(ip) && (ip->i_itemp->ili_fsync_fields - & ~XFS_ILOG_TIMESTAMP)) - iomap->flags |= IOMAP_F_DIRTY; - - xfs_bmbt_to_iomap(ip, iomap, &imap); - - if (shared) - iomap->flags |= IOMAP_F_SHARED; - return 0; + return xfs_bmbt_to_iomap(ip, iomap, &imap, shared); out_found: ASSERT(nimaps); xfs_iunlock(ip, lockmode); - trace_xfs_iomap_found(ip, offset, length, 0, &imap); + trace_xfs_iomap_found(ip, offset, length, XFS_DATA_FORK, &imap); goto out_finish; out_unlock: @@ -1241,6 +1143,92 @@ const struct iomap_ops xfs_iomap_ops = { }; static int +xfs_seek_iomap_begin( + struct inode *inode, + loff_t offset, + loff_t length, + unsigned flags, + struct iomap *iomap) +{ + struct xfs_inode *ip = XFS_I(inode); + struct xfs_mount *mp = ip->i_mount; + xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset); + xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, offset + length); + xfs_fileoff_t cow_fsb = NULLFILEOFF, data_fsb = NULLFILEOFF; + struct xfs_iext_cursor icur; + struct xfs_bmbt_irec imap, cmap; + int error = 0; + unsigned lockmode; + + if (XFS_FORCED_SHUTDOWN(mp)) + return -EIO; + + lockmode = xfs_ilock_data_map_shared(ip); + if (!(ip->i_df.if_flags & XFS_IFEXTENTS)) { + error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK); + if (error) + goto out_unlock; + } + + if (xfs_iext_lookup_extent(ip, &ip->i_df, offset_fsb, &icur, &imap)) { + /* + * If we found a data extent we are done. + */ + if (imap.br_startoff <= offset_fsb) + goto done; + data_fsb = imap.br_startoff; + } else { + /* + * Fake a hole until the end of the file. + */ + data_fsb = min(XFS_B_TO_FSB(mp, offset + length), + XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes)); + } + + /* + * If a COW fork extent covers the hole, report it - capped to the next + * data fork extent: + */ + if (xfs_inode_has_cow_data(ip) && + xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &cmap)) + cow_fsb = cmap.br_startoff; + if (cow_fsb != NULLFILEOFF && cow_fsb <= offset_fsb) { + if (data_fsb < cow_fsb + cmap.br_blockcount) + end_fsb = min(end_fsb, data_fsb); + xfs_trim_extent(&cmap, offset_fsb, end_fsb); + error = xfs_bmbt_to_iomap(ip, iomap, &cmap, true); + /* + * This is a COW extent, so we must probe the page cache + * because there could be dirty page cache being backed + * by this extent. + */ + iomap->type = IOMAP_UNWRITTEN; + goto out_unlock; + } + + /* + * Else report a hole, capped to the next found data or COW extent. + */ + if (cow_fsb != NULLFILEOFF && cow_fsb < data_fsb) + imap.br_blockcount = cow_fsb - offset_fsb; + else + imap.br_blockcount = data_fsb - offset_fsb; + imap.br_startoff = offset_fsb; + imap.br_startblock = HOLESTARTBLOCK; + imap.br_state = XFS_EXT_NORM; +done: + xfs_trim_extent(&imap, offset_fsb, end_fsb); + error = xfs_bmbt_to_iomap(ip, iomap, &imap, false); +out_unlock: + xfs_iunlock(ip, lockmode); + return error; +} + +const struct iomap_ops xfs_seek_iomap_ops = { + .iomap_begin = xfs_seek_iomap_begin, +}; + +static int xfs_xattr_iomap_begin( struct inode *inode, loff_t offset, @@ -1273,12 +1261,10 @@ xfs_xattr_iomap_begin( out_unlock: xfs_iunlock(ip, lockmode); - if (!error) { - ASSERT(nimaps); - xfs_bmbt_to_iomap(ip, iomap, &imap); - } - - return error; + if (error) + return error; + ASSERT(nimaps); + return xfs_bmbt_to_iomap(ip, iomap, &imap, false); } const struct iomap_ops xfs_xattr_iomap_ops = { diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h @@ -13,12 +13,10 @@ struct xfs_bmbt_irec; int xfs_iomap_write_direct(struct xfs_inode *, xfs_off_t, size_t, struct xfs_bmbt_irec *, int); -int xfs_iomap_write_allocate(struct xfs_inode *, int, xfs_off_t, - struct xfs_bmbt_irec *, unsigned int *); int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, xfs_off_t, bool); -void xfs_bmbt_to_iomap(struct xfs_inode *, struct iomap *, - struct xfs_bmbt_irec *); +int xfs_bmbt_to_iomap(struct xfs_inode *, struct iomap *, + struct xfs_bmbt_irec *, bool shared); xfs_extlen_t xfs_eof_alignment(struct xfs_inode *ip, xfs_extlen_t extsize); static inline xfs_filblks_t @@ -42,6 +40,7 @@ xfs_aligned_fsb_count( } extern const struct iomap_ops xfs_iomap_ops; +extern const struct iomap_ops xfs_seek_iomap_ops; extern const struct iomap_ops xfs_xattr_iomap_ops; #endif /* __XFS_IOMAP_H__*/ diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c @@ -191,9 +191,18 @@ xfs_generic_create( xfs_setup_iops(ip); - if (tmpfile) + if (tmpfile) { + /* + * The VFS requires that any inode fed to d_tmpfile must have + * nlink == 1 so that it can decrement the nlink in d_tmpfile. + * However, we created the temp file with nlink == 0 because + * we're not allowed to put an inode with nlink > 0 on the + * unlinked list. Therefore we have to set nlink to 1 so that + * d_tmpfile can immediately set it back to zero. + */ + set_nlink(inode, 1); d_tmpfile(dentry, inode); - else + } else d_instantiate(dentry, inode); xfs_finish_inode_setup(ip); @@ -522,6 +531,10 @@ xfs_vn_getattr( } } + /* + * Note: If you add another clause to set an attribute flag, please + * update attributes_mask below. + */ if (ip->i_d.di_flags & XFS_DIFLAG_IMMUTABLE) stat->attributes |= STATX_ATTR_IMMUTABLE; if (ip->i_d.di_flags & XFS_DIFLAG_APPEND) @@ -529,6 +542,10 @@ xfs_vn_getattr( if (ip->i_d.di_flags & XFS_DIFLAG_NODUMP) stat->attributes |= STATX_ATTR_NODUMP; + stat->attributes_mask |= (STATX_ATTR_IMMUTABLE | + STATX_ATTR_APPEND | + STATX_ATTR_NODUMP); + switch (inode->i_mode & S_IFMT) { case S_IFBLK: case S_IFCHR: diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c @@ -2439,17 +2439,21 @@ xlog_recover_validate_buf_type( case XFS_BLFT_BTREE_BUF: switch (magic32) { case XFS_ABTB_CRC_MAGIC: - case XFS_ABTC_CRC_MAGIC: case XFS_ABTB_MAGIC: + bp->b_ops = &xfs_bnobt_buf_ops; + break; + case XFS_ABTC_CRC_MAGIC: case XFS_ABTC_MAGIC: - bp->b_ops = &xfs_allocbt_buf_ops; + bp->b_ops = &xfs_cntbt_buf_ops; break; case XFS_IBT_CRC_MAGIC: - case XFS_FIBT_CRC_MAGIC: case XFS_IBT_MAGIC: - case XFS_FIBT_MAGIC: bp->b_ops = &xfs_inobt_buf_ops; break; + case XFS_FIBT_CRC_MAGIC: + case XFS_FIBT_MAGIC: + bp->b_ops = &xfs_finobt_buf_ops; + break; case XFS_BMAP_CRC_MAGIC: case XFS_BMAP_MAGIC: bp->b_ops = &xfs_bmbt_buf_ops; @@ -3045,7 +3049,7 @@ xlog_recover_inode_pass2( * Make sure the place we're flushing out to really looks * like an inode! */ - if (unlikely(dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC))) { + if (unlikely(!xfs_verify_magic16(bp, dip->di_magic))) { xfs_alert(mp, "%s: Bad inode magic number, dip = "PTR_FMT", dino bp = "PTR_FMT", ino = %Ld", __func__, dip, bp, in_f->ilf_ino); diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c @@ -149,6 +149,7 @@ xfs_free_perag( spin_unlock(&mp->m_perag_lock); ASSERT(pag); ASSERT(atomic_read(&pag->pag_ref) == 0); + xfs_iunlink_destroy(pag); xfs_buf_hash_destroy(pag); mutex_destroy(&pag->pag_ici_reclaim_lock); call_rcu(&pag->rcu_head, __xfs_free_perag); @@ -227,6 +228,9 @@ xfs_initialize_perag( /* first new pag is fully initialized */ if (first_initialised == NULLAGNUMBER) first_initialised = index; + error = xfs_iunlink_init(pag); + if (error) + goto out_hash_destroy; } index = xfs_set_inode_alloc(mp, agcount); @@ -249,6 +253,7 @@ out_unwind_new_pags: if (!pag) break; xfs_buf_hash_destroy(pag); + xfs_iunlink_destroy(pag); mutex_destroy(&pag->pag_ici_reclaim_lock); kmem_free(pag); } diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h @@ -138,7 +138,7 @@ typedef struct xfs_mount { struct mutex m_growlock; /* growfs mutex */ int m_fixedfsid[2]; /* unchanged for life of FS */ uint64_t m_flags; /* global mount flags */ - bool m_inotbt_nores; /* no per-AG finobt resv. */ + bool m_finobt_nores; /* no per-AG finobt resv. */ int m_ialloc_inos; /* inodes in inode allocation */ int m_ialloc_blks; /* blocks in inode allocation */ int m_ialloc_min_blks;/* min blocks in sparse inode @@ -194,6 +194,7 @@ typedef struct xfs_mount { */ uint32_t m_generation; + bool m_always_cow; bool m_fail_unmount; #ifdef DEBUG /* @@ -396,6 +397,13 @@ typedef struct xfs_perag { /* reference count */ uint8_t pagf_refcount_level; + + /* + * Unlinked inode information. This incore information reflects + * data stored in the AGI, so callers must hold the AGI buffer lock + * or have some other means to control concurrency. + */ + struct rhashtable pagi_unlinked_hash; } xfs_perag_t; static inline struct xfs_ag_resv * diff --git a/fs/xfs/xfs_ondisk.h b/fs/xfs/xfs_ondisk.h @@ -125,6 +125,27 @@ xfs_check_ondisk_structs(void) XFS_CHECK_STRUCT_SIZE(struct xfs_inode_log_format, 56); XFS_CHECK_STRUCT_SIZE(struct xfs_qoff_logformat, 20); XFS_CHECK_STRUCT_SIZE(struct xfs_trans_header, 16); + + /* + * The v5 superblock format extended several v4 header structures with + * additional data. While new fields are only accessible on v5 + * superblocks, it's important that the v5 structures place original v4 + * fields/headers in the correct location on-disk. For example, we must + * be able to find magic values at the same location in certain blocks + * regardless of superblock version. + * + * The following checks ensure that various v5 data structures place the + * subset of v4 metadata associated with the same type of block at the + * start of the on-disk block. If there is no data structure definition + * for certain types of v4 blocks, traverse down to the first field of + * common metadata (e.g., magic value) and make sure it is at offset + * zero. + */ + XFS_CHECK_OFFSET(struct xfs_dir3_leaf, hdr.info.hdr, 0); + XFS_CHECK_OFFSET(struct xfs_da3_intnode, hdr.info.hdr, 0); + XFS_CHECK_OFFSET(struct xfs_dir3_data_hdr, hdr.magic, 0); + XFS_CHECK_OFFSET(struct xfs_dir3_free, hdr.hdr.magic, 0); + XFS_CHECK_OFFSET(struct xfs_attr3_leafblock, hdr.info.hdr, 0); } #endif /* __XFS_ONDISK_H */ diff --git a/fs/xfs/xfs_pnfs.c b/fs/xfs/xfs_pnfs.c @@ -185,7 +185,7 @@ xfs_fs_map_blocks( } xfs_iunlock(ip, XFS_IOLOCK_EXCL); - xfs_bmbt_to_iomap(ip, iomap, &imap); + error = xfs_bmbt_to_iomap(ip, iomap, &imap, false); *device_generation = mp->m_generation; return error; out_unlock: diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c @@ -192,7 +192,7 @@ xfs_reflink_trim_around_shared( int error = 0; /* Holes, unwritten, and delalloc extents cannot be shared */ - if (!xfs_is_reflink_inode(ip) || !xfs_bmap_is_real_extent(irec)) { + if (!xfs_is_cow_inode(ip) || !xfs_bmap_is_real_extent(irec)) { *shared = false; return 0; } @@ -234,93 +234,59 @@ xfs_reflink_trim_around_shared( } } -/* - * Trim the passed in imap to the next shared/unshared extent boundary, and - * if imap->br_startoff points to a shared extent reserve space for it in the - * COW fork. - * - * Note that imap will always contain the block numbers for the existing blocks - * in the data fork, as the upper layers need them for read-modify-write - * operations. - */ -int -xfs_reflink_reserve_cow( +bool +xfs_inode_need_cow( struct xfs_inode *ip, - struct xfs_bmbt_irec *imap) + struct xfs_bmbt_irec *imap, + bool *shared) { - struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK); - struct xfs_bmbt_irec got; - int error = 0; - bool eof = false; - struct xfs_iext_cursor icur; - bool shared; - - /* - * Search the COW fork extent list first. This serves two purposes: - * first this implement the speculative preallocation using cowextisze, - * so that we also unshared block adjacent to shared blocks instead - * of just the shared blocks themselves. Second the lookup in the - * extent list is generally faster than going out to the shared extent - * tree. - */ - - if (!xfs_iext_lookup_extent(ip, ifp, imap->br_startoff, &icur, &got)) - eof = true; - if (!eof && got.br_startoff <= imap->br_startoff) { - trace_xfs_reflink_cow_found(ip, imap); - xfs_trim_extent(imap, got.br_startoff, got.br_blockcount); + /* We can't update any real extents in always COW mode. */ + if (xfs_is_always_cow_inode(ip) && + !isnullstartblock(imap->br_startblock)) { + *shared = true; return 0; } /* Trim the mapping to the nearest shared extent boundary. */ - error = xfs_reflink_trim_around_shared(ip, imap, &shared); - if (error) - return error; - - /* Not shared? Just report the (potentially capped) extent. */ - if (!shared) - return 0; - - /* - * Fork all the shared blocks from our write offset until the end of - * the extent. - */ - error = xfs_qm_dqattach_locked(ip, false); - if (error) - return error; - - error = xfs_bmapi_reserve_delalloc(ip, XFS_COW_FORK, imap->br_startoff, - imap->br_blockcount, 0, &got, &icur, eof); - if (error == -ENOSPC || error == -EDQUOT) - trace_xfs_reflink_cow_enospc(ip, imap); - if (error) - return error; - - xfs_trim_extent(imap, got.br_startoff, got.br_blockcount); - trace_xfs_reflink_cow_alloc(ip, &got); - return 0; + return xfs_reflink_trim_around_shared(ip, imap, shared); } -/* Convert part of an unwritten CoW extent to a real one. */ -STATIC int -xfs_reflink_convert_cow_extent( - struct xfs_inode *ip, - struct xfs_bmbt_irec *imap, - xfs_fileoff_t offset_fsb, - xfs_filblks_t count_fsb) +static int +xfs_reflink_convert_cow_locked( + struct xfs_inode *ip, + xfs_fileoff_t offset_fsb, + xfs_filblks_t count_fsb) { - int nimaps = 1; + struct xfs_iext_cursor icur; + struct xfs_bmbt_irec got; + struct xfs_btree_cur *dummy_cur = NULL; + int dummy_logflags; + int error = 0; - if (imap->br_state == XFS_EXT_NORM) + if (!xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &got)) return 0; - xfs_trim_extent(imap, offset_fsb, count_fsb); - trace_xfs_reflink_convert_cow(ip, imap); - if (imap->br_blockcount == 0) - return 0; - return xfs_bmapi_write(NULL, ip, imap->br_startoff, imap->br_blockcount, - XFS_BMAPI_COWFORK | XFS_BMAPI_CONVERT, 0, imap, - &nimaps); + do { + if (got.br_startoff >= offset_fsb + count_fsb) + break; + if (got.br_state == XFS_EXT_NORM) + continue; + if (WARN_ON_ONCE(isnullstartblock(got.br_startblock))) + return -EIO; + + xfs_trim_extent(&got, offset_fsb, count_fsb); + if (!got.br_blockcount) + continue; + + got.br_state = XFS_EXT_NORM; + error = xfs_bmap_add_extent_unwritten_real(NULL, ip, + XFS_COW_FORK, &icur, &dummy_cur, &got, + &dummy_logflags); + if (error) + return error; + } while (xfs_iext_next_extent(ip->i_cowfp, &icur, &got)); + + return error; } /* Convert all of the unwritten CoW extents in a file's range to real ones. */ @@ -334,15 +300,12 @@ xfs_reflink_convert_cow( xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset); xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, offset + count); xfs_filblks_t count_fsb = end_fsb - offset_fsb; - struct xfs_bmbt_irec imap; - int nimaps = 1, error = 0; + int error; ASSERT(count != 0); xfs_ilock(ip, XFS_ILOCK_EXCL); - error = xfs_bmapi_write(NULL, ip, offset_fsb, count_fsb, - XFS_BMAPI_COWFORK | XFS_BMAPI_CONVERT | - XFS_BMAPI_CONVERT_ONLY, 0, &imap, &nimaps); + error = xfs_reflink_convert_cow_locked(ip, offset_fsb, count_fsb); xfs_iunlock(ip, XFS_ILOCK_EXCL); return error; } @@ -375,7 +338,7 @@ xfs_find_trim_cow_extent( if (got.br_startoff > offset_fsb) { xfs_trim_extent(imap, imap->br_startoff, got.br_startoff - imap->br_startoff); - return xfs_reflink_trim_around_shared(ip, imap, shared); + return xfs_inode_need_cow(ip, imap, shared); } *shared = true; @@ -397,7 +360,8 @@ xfs_reflink_allocate_cow( struct xfs_inode *ip, struct xfs_bmbt_irec *imap, bool *shared, - uint *lockmode) + uint *lockmode, + bool convert_now) { struct xfs_mount *mp = ip->i_mount; xfs_fileoff_t offset_fsb = imap->br_startoff; @@ -409,7 +373,10 @@ xfs_reflink_allocate_cow( xfs_extlen_t resblks = 0; ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); - ASSERT(xfs_is_reflink_inode(ip)); + if (!ip->i_cowfp) { + ASSERT(!xfs_is_reflink_inode(ip)); + xfs_ifork_init_cow(ip); + } error = xfs_find_trim_cow_extent(ip, imap, shared, &found); if (error || !*shared) @@ -471,7 +438,16 @@ xfs_reflink_allocate_cow( if (nimaps == 0) return -ENOSPC; convert: - return xfs_reflink_convert_cow_extent(ip, imap, offset_fsb, count_fsb); + xfs_trim_extent(imap, offset_fsb, count_fsb); + /* + * COW fork extents are supposed to remain unwritten until we're ready + * to initiate a disk write. For direct I/O we are going to write the + * data and need the conversion, but for buffered writes we're done. + */ + if (!convert_now || imap->br_state == XFS_EXT_NORM) + return 0; + trace_xfs_reflink_convert_cow(ip, imap); + return xfs_reflink_convert_cow_locked(ip, offset_fsb, count_fsb); out_unreserve: xfs_trans_unreserve_quota_nblks(tp, ip, (long)resblks, 0, @@ -586,7 +562,7 @@ xfs_reflink_cancel_cow_range( int error; trace_xfs_reflink_cancel_cow_range(ip, offset, count); - ASSERT(xfs_is_reflink_inode(ip)); + ASSERT(ip->i_cowfp); offset_fsb = XFS_B_TO_FSBT(ip->i_mount, offset); if (count == NULLFILEOFF) @@ -1192,7 +1168,7 @@ xfs_reflink_remap_blocks( break; ASSERT(nimaps == 1); - trace_xfs_reflink_remap_imap(src, srcoff, len, XFS_IO_OVERWRITE, + trace_xfs_reflink_remap_imap(src, srcoff, len, XFS_DATA_FORK, &imap); /* Translate imap into the destination file. */ diff --git a/fs/xfs/xfs_reflink.h b/fs/xfs/xfs_reflink.h @@ -6,16 +6,28 @@ #ifndef __XFS_REFLINK_H #define __XFS_REFLINK_H 1 +static inline bool xfs_is_always_cow_inode(struct xfs_inode *ip) +{ + return ip->i_mount->m_always_cow && + xfs_sb_version_hasreflink(&ip->i_mount->m_sb); +} + +static inline bool xfs_is_cow_inode(struct xfs_inode *ip) +{ + return xfs_is_reflink_inode(ip) || xfs_is_always_cow_inode(ip); +} + extern int xfs_reflink_find_shared(struct xfs_mount *mp, struct xfs_trans *tp, xfs_agnumber_t agno, xfs_agblock_t agbno, xfs_extlen_t aglen, xfs_agblock_t *fbno, xfs_extlen_t *flen, bool find_maximal); extern int xfs_reflink_trim_around_shared(struct xfs_inode *ip, struct xfs_bmbt_irec *irec, bool *shared); +bool xfs_inode_need_cow(struct xfs_inode *ip, struct xfs_bmbt_irec *imap, + bool *shared); -extern int xfs_reflink_reserve_cow(struct xfs_inode *ip, - struct xfs_bmbt_irec *imap); extern int xfs_reflink_allocate_cow(struct xfs_inode *ip, - struct xfs_bmbt_irec *imap, bool *shared, uint *lockmode); + struct xfs_bmbt_irec *imap, bool *shared, uint *lockmode, + bool convert_now); extern int xfs_reflink_convert_cow(struct xfs_inode *ip, xfs_off_t offset, xfs_off_t count); diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c @@ -1594,6 +1594,13 @@ xfs_mount_alloc( INIT_DELAYED_WORK(&mp->m_eofblocks_work, xfs_eofblocks_worker); INIT_DELAYED_WORK(&mp->m_cowblocks_work, xfs_cowblocks_worker); mp->m_kobj.kobject.kset = xfs_kset; + /* + * We don't create the finobt per-ag space reservation until after log + * recovery, so we must set this to true so that an ifree transaction + * started during log recovery will not depend on space reservations + * for finobt expansion. + */ + mp->m_finobt_nores = true; return mp; } @@ -1729,11 +1736,18 @@ xfs_fs_fill_super( } } - if (xfs_sb_version_hasreflink(&mp->m_sb) && mp->m_sb.sb_rblocks) { - xfs_alert(mp, + if (xfs_sb_version_hasreflink(&mp->m_sb)) { + if (mp->m_sb.sb_rblocks) { + xfs_alert(mp, "reflink not compatible with realtime device!"); - error = -EINVAL; - goto out_filestream_unmount; + error = -EINVAL; + goto out_filestream_unmount; + } + + if (xfs_globals.always_cow) { + xfs_info(mp, "using DEBUG-only always_cow mode."); + mp->m_always_cow = true; + } } if (xfs_sb_version_hasrmapbt(&mp->m_sb) && mp->m_sb.sb_rblocks) { diff --git a/fs/xfs/xfs_sysctl.h b/fs/xfs/xfs_sysctl.h @@ -85,6 +85,7 @@ struct xfs_globals { int log_recovery_delay; /* log recovery delay (secs) */ int mount_delay; /* mount setup delay (secs) */ bool bug_on_assert; /* BUG() the kernel on assert failure */ + bool always_cow; /* use COW fork for all overwrites */ }; extern struct xfs_globals xfs_globals; diff --git a/fs/xfs/xfs_sysfs.c b/fs/xfs/xfs_sysfs.c @@ -183,10 +183,34 @@ mount_delay_show( } XFS_SYSFS_ATTR_RW(mount_delay); +static ssize_t +always_cow_store( + struct kobject *kobject, + const char *buf, + size_t count) +{ + ssize_t ret; + + ret = kstrtobool(buf, &xfs_globals.always_cow); + if (ret < 0) + return ret; + return count; +} + +static ssize_t +always_cow_show( + struct kobject *kobject, + char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%d\n", xfs_globals.always_cow); +} +XFS_SYSFS_ATTR_RW(always_cow); + static struct attribute *xfs_dbg_attrs[] = { ATTR_LIST(bug_on_assert), ATTR_LIST(log_recovery_delay), ATTR_LIST(mount_delay), + ATTR_LIST(always_cow), NULL, }; diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h @@ -1218,23 +1218,17 @@ DEFINE_EVENT(xfs_readpage_class, name, \ DEFINE_READPAGE_EVENT(xfs_vm_readpage); DEFINE_READPAGE_EVENT(xfs_vm_readpages); -TRACE_DEFINE_ENUM(XFS_IO_HOLE); -TRACE_DEFINE_ENUM(XFS_IO_DELALLOC); -TRACE_DEFINE_ENUM(XFS_IO_UNWRITTEN); -TRACE_DEFINE_ENUM(XFS_IO_OVERWRITE); -TRACE_DEFINE_ENUM(XFS_IO_COW); - DECLARE_EVENT_CLASS(xfs_imap_class, TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count, - int type, struct xfs_bmbt_irec *irec), - TP_ARGS(ip, offset, count, type, irec), + int whichfork, struct xfs_bmbt_irec *irec), + TP_ARGS(ip, offset, count, whichfork, irec), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_ino_t, ino) __field(loff_t, size) __field(loff_t, offset) __field(size_t, count) - __field(int, type) + __field(int, whichfork) __field(xfs_fileoff_t, startoff) __field(xfs_fsblock_t, startblock) __field(xfs_filblks_t, blockcount) @@ -1245,33 +1239,33 @@ DECLARE_EVENT_CLASS(xfs_imap_class, __entry->size = ip->i_d.di_size; __entry->offset = offset; __entry->count = count; - __entry->type = type; + __entry->whichfork = whichfork; __entry->startoff = irec ? irec->br_startoff : 0; __entry->startblock = irec ? irec->br_startblock : 0; __entry->blockcount = irec ? irec->br_blockcount : 0; ), TP_printk("dev %d:%d ino 0x%llx size 0x%llx offset 0x%llx count %zd " - "type %s startoff 0x%llx startblock %lld blockcount 0x%llx", + "fork %s startoff 0x%llx startblock %lld blockcount 0x%llx", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->size, __entry->offset, __entry->count, - __print_symbolic(__entry->type, XFS_IO_TYPES), + __entry->whichfork == XFS_COW_FORK ? "cow" : "data", __entry->startoff, (int64_t)__entry->startblock, __entry->blockcount) ) -#define DEFINE_IOMAP_EVENT(name) \ +#define DEFINE_IMAP_EVENT(name) \ DEFINE_EVENT(xfs_imap_class, name, \ TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count, \ - int type, struct xfs_bmbt_irec *irec), \ - TP_ARGS(ip, offset, count, type, irec)) -DEFINE_IOMAP_EVENT(xfs_map_blocks_found); -DEFINE_IOMAP_EVENT(xfs_map_blocks_alloc); -DEFINE_IOMAP_EVENT(xfs_iomap_alloc); -DEFINE_IOMAP_EVENT(xfs_iomap_found); + int whichfork, struct xfs_bmbt_irec *irec), \ + TP_ARGS(ip, offset, count, whichfork, irec)) +DEFINE_IMAP_EVENT(xfs_map_blocks_found); +DEFINE_IMAP_EVENT(xfs_map_blocks_alloc); +DEFINE_IMAP_EVENT(xfs_iomap_alloc); +DEFINE_IMAP_EVENT(xfs_iomap_found); DECLARE_EVENT_CLASS(xfs_simple_io_class, TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count), @@ -3078,7 +3072,7 @@ DEFINE_EVENT(xfs_inode_irec_class, name, \ DEFINE_INODE_EVENT(xfs_reflink_set_inode_flag); DEFINE_INODE_EVENT(xfs_reflink_unset_inode_flag); DEFINE_ITRUNC_EVENT(xfs_reflink_update_inode_size); -DEFINE_IOMAP_EVENT(xfs_reflink_remap_imap); +DEFINE_IMAP_EVENT(xfs_reflink_remap_imap); TRACE_EVENT(xfs_reflink_remap_blocks_loop, TP_PROTO(struct xfs_inode *src, xfs_fileoff_t soffset, xfs_filblks_t len, struct xfs_inode *dest, @@ -3202,13 +3196,10 @@ DEFINE_INODE_ERROR_EVENT(xfs_reflink_unshare_error); /* copy on write */ DEFINE_INODE_IREC_EVENT(xfs_reflink_trim_around_shared); -DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_alloc); DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_found); DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_enospc); DEFINE_INODE_IREC_EVENT(xfs_reflink_convert_cow); -DEFINE_RW_EVENT(xfs_reflink_reserve_cow); - DEFINE_SIMPLE_IO_EVENT(xfs_reflink_bounce_dio_write); DEFINE_SIMPLE_IO_EVENT(xfs_reflink_cancel_cow_range); @@ -3371,6 +3362,84 @@ DEFINE_TRANS_EVENT(xfs_trans_roll); DEFINE_TRANS_EVENT(xfs_trans_add_item); DEFINE_TRANS_EVENT(xfs_trans_free_items); +TRACE_EVENT(xfs_iunlink_update_bucket, + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, unsigned int bucket, + xfs_agino_t old_ptr, xfs_agino_t new_ptr), + TP_ARGS(mp, agno, bucket, old_ptr, new_ptr), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(unsigned int, bucket) + __field(xfs_agino_t, old_ptr) + __field(xfs_agino_t, new_ptr) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->agno = agno; + __entry->bucket = bucket; + __entry->old_ptr = old_ptr; + __entry->new_ptr = new_ptr; + ), + TP_printk("dev %d:%d agno %u bucket %u old 0x%x new 0x%x", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->bucket, + __entry->old_ptr, + __entry->new_ptr) +); + +TRACE_EVENT(xfs_iunlink_update_dinode, + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agino_t agino, + xfs_agino_t old_ptr, xfs_agino_t new_ptr), + TP_ARGS(mp, agno, agino, old_ptr, new_ptr), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(xfs_agino_t, agino) + __field(xfs_agino_t, old_ptr) + __field(xfs_agino_t, new_ptr) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->agno = agno; + __entry->agino = agino; + __entry->old_ptr = old_ptr; + __entry->new_ptr = new_ptr; + ), + TP_printk("dev %d:%d agno %u agino 0x%x old 0x%x new 0x%x", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->agino, + __entry->old_ptr, + __entry->new_ptr) +); + +DECLARE_EVENT_CLASS(xfs_ag_inode_class, + TP_PROTO(struct xfs_inode *ip), + TP_ARGS(ip), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(xfs_agino_t, agino) + ), + TP_fast_assign( + __entry->dev = VFS_I(ip)->i_sb->s_dev; + __entry->agno = XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino); + __entry->agino = XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino); + ), + TP_printk("dev %d:%d agno %u agino %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, __entry->agino) +) + +#define DEFINE_AGINODE_EVENT(name) \ +DEFINE_EVENT(xfs_ag_inode_class, name, \ + TP_PROTO(struct xfs_inode *ip), \ + TP_ARGS(ip)) +DEFINE_AGINODE_EVENT(xfs_iunlink); +DEFINE_AGINODE_EVENT(xfs_iunlink_remove); +DEFINE_AG_EVENT(xfs_iunlink_map_prev_fallback); + #endif /* _TRACE_XFS_H */ #undef TRACE_INCLUDE_PATH diff --git a/fs/xfs/xfs_trans_bmap.c b/fs/xfs/xfs_trans_bmap.c @@ -17,7 +17,6 @@ #include "xfs_alloc.h" #include "xfs_bmap.h" #include "xfs_inode.h" -#include "xfs_defer.h" /* * This routine is called to allocate a "bmap update done" diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c @@ -277,7 +277,7 @@ xfs_trans_read_buf_map( * release this buffer when it kills the tranaction. */ ASSERT(bp->b_ops != NULL); - error = xfs_buf_ensure_ops(bp, ops); + error = xfs_buf_reverify(bp, ops); if (error) { xfs_buf_ioerror_alert(bp, __func__); diff --git a/fs/xfs/xfs_trans_extfree.c b/fs/xfs/xfs_trans_extfree.c @@ -18,7 +18,6 @@ #include "xfs_alloc.h" #include "xfs_bmap.h" #include "xfs_trace.h" -#include "xfs_defer.h" /* * This routine is called to allocate an "extent free done" diff --git a/fs/xfs/xfs_trans_refcount.c b/fs/xfs/xfs_trans_refcount.c @@ -16,7 +16,6 @@ #include "xfs_refcount_item.h" #include "xfs_alloc.h" #include "xfs_refcount.h" -#include "xfs_defer.h" /* * This routine is called to allocate a "refcount update done" diff --git a/fs/xfs/xfs_trans_rmap.c b/fs/xfs/xfs_trans_rmap.c @@ -16,7 +16,6 @@ #include "xfs_rmap_item.h" #include "xfs_alloc.h" #include "xfs_rmap.h" -#include "xfs_defer.h" /* Set the map extent flags for this reverse mapping. */ static void diff --git a/fs/xfs/xfs_xattr.c b/fs/xfs/xfs_xattr.c @@ -129,6 +129,9 @@ __xfs_xattr_put_listent( char *offset; int arraytop; + if (context->count < 0 || context->seen_enough) + return; + if (!context->alist) goto compute_size;