whiterose

linux unikernel
Log | Files | Refs | README | LICENSE | git clone https://git.ne02ptzero.me/git/whiterose

commit fe0142df648f5478f410c41e01771b90b9793215
parent bfd93a87eadb03499a5ff02dfebfaf515310d27c
Author: Linus Torvalds <torvalds@linux-foundation.org>
Date:   Wed, 24 Oct 2018 17:36:12 +0100

Merge tag 'xfs-4.20-merge-1' of git://git.kernel.org/pub/scm/fs/xfs/xfs-linux

Pul xfs updates from Dave Chinner:
 "There's not a huge amount of change in this cycle - Darrick has been
  out of action for a couple of months (hence me sending the last few
  pull requests), so we decided a quiet cycle mainly focussed on bug
  fixes was a good idea. Darrick will take the helm again at the end of
  this merge window.

  FYI, I may be sending another update later in the cycle - there's a
  pending rework of the clone/dedupe_file_range code that fixes numerous
  bugs that is spread amongst the VFS, XFS and ocfs2 code. It has been
  reviewed and tested, Al and I just need to work out the details of the
  merge, so it may come from him rather than me.

  Summary:

   - only support filesystems with unwritten extents

   - add definition for statfs XFS magic number

   - remove unused parameters around reflink code

   - more debug for dangling delalloc extents

   - cancel COW extents on extent swap targets

   - fix quota stats output and clean up the code

   - refactor some of the attribute code in preparation for parent
     pointers

   - fix several buffer handling bugs"

* tag 'xfs-4.20-merge-1' of git://git.kernel.org/pub/scm/fs/xfs/xfs-linux: (21 commits)
  xfs: cancel COW blocks before swapext
  xfs: clear ail delwri queued bufs on unmount of shutdown fs
  xfs: use offsetof() in place of offset macros for __xfsstats
  xfs: Fix xqmstats offsets in /proc/fs/xfs/xqmstat
  xfs: fix use-after-free race in xfs_buf_rele
  xfs: Add attibute remove and helper functions
  xfs: Add attibute set and helper functions
  xfs: Add helper function xfs_attr_try_sf_addname
  xfs: Move fs/xfs/xfs_attr.h to fs/xfs/libxfs/xfs_attr.h
  xfs: issue log message on user force shutdown
  xfs: fix buffer state management in xrep_findroot_block
  xfs: always assign buffer verifiers when one is provided
  xfs: xrep_findroot_block should reject root blocks with siblings
  xfs: add a define for statfs magic to uapi
  xfs: print dangling delalloc extents
  xfs: fix fork selection in xfs_find_trim_cow_extent
  xfs: remove the unused trimmed argument from xfs_reflink_trim_around_shared
  xfs: remove the unused shared argument to xfs_reflink_reserve_cow
  xfs: handle zeroing in xfs_file_iomap_begin_delay
  xfs: remove suport for filesystems without unwritten extent flag
  ...

Diffstat:
Mfs/xfs/libxfs/xfs_attr.c | 236+++++++++++++++++++++++++++++++++++++++++++++----------------------------------
Afs/xfs/libxfs/xfs_attr.h | 150+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Mfs/xfs/libxfs/xfs_bmap.c | 70+++++++++++++++++++++++++++++++++++++---------------------------------
Mfs/xfs/libxfs/xfs_bmap.h | 1+
Mfs/xfs/libxfs/xfs_format.h | 8++------
Mfs/xfs/libxfs/xfs_sb.c | 5++---
Mfs/xfs/scrub/repair.c | 128+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----------------
Mfs/xfs/scrub/scrub.c | 13-------------
Mfs/xfs/xfs_aops.c | 4++--
Mfs/xfs/xfs_aops.h | 14++++++--------
Dfs/xfs/xfs_attr.h | 148-------------------------------------------------------------------------------
Mfs/xfs/xfs_bmap_util.c | 61+++++++++----------------------------------------------------
Mfs/xfs/xfs_buf.c | 109++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-------------
Mfs/xfs/xfs_buf.h | 2++
Mfs/xfs/xfs_fsops.c | 50+++++++++++++++++++++++++-------------------------
Mfs/xfs/xfs_ioctl.c | 8--------
Mfs/xfs/xfs_iomap.c | 53+++++++++++++++++++++++++++++++++++++++++------------
Mfs/xfs/xfs_reflink.c | 33+++++++++++++++------------------
Mfs/xfs/xfs_reflink.h | 4++--
Mfs/xfs/xfs_stats.c | 52++++++++++++++++++++++++++++------------------------
Mfs/xfs/xfs_stats.h | 28+++-------------------------
Mfs/xfs/xfs_super.c | 38+++++++++++++++++++++++++++++++++++---
Mfs/xfs/xfs_trans.h | 1+
Mfs/xfs/xfs_trans_ail.c | 28++++++++++++++++++++++------
Mfs/xfs/xfs_trans_buf.c | 42++++++++++++++++++++++++++++++++++++++++++
Minclude/uapi/linux/magic.h | 1+
26 files changed, 756 insertions(+), 531 deletions(-)

diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c @@ -191,6 +191,128 @@ xfs_attr_calc_size( return nblks; } +STATIC int +xfs_attr_try_sf_addname( + struct xfs_inode *dp, + struct xfs_da_args *args) +{ + + struct xfs_mount *mp = dp->i_mount; + int error, error2; + + error = xfs_attr_shortform_addname(args); + if (error == -ENOSPC) + return error; + + /* + * Commit the shortform mods, and we're done. + * NOTE: this is also the error path (EEXIST, etc). + */ + if (!error && (args->flags & ATTR_KERNOTIME) == 0) + xfs_trans_ichgtime(args->trans, dp, XFS_ICHGTIME_CHG); + + if (mp->m_flags & XFS_MOUNT_WSYNC) + xfs_trans_set_sync(args->trans); + + error2 = xfs_trans_commit(args->trans); + args->trans = NULL; + return error ? error : error2; +} + +/* + * Set the attribute specified in @args. + */ +int +xfs_attr_set_args( + struct xfs_da_args *args, + struct xfs_buf **leaf_bp) +{ + struct xfs_inode *dp = args->dp; + int error; + + /* + * If the attribute list is non-existent or a shortform list, + * upgrade it to a single-leaf-block attribute list. + */ + if (dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL || + (dp->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS && + dp->i_d.di_anextents == 0)) { + + /* + * Build initial attribute list (if required). + */ + if (dp->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS) + xfs_attr_shortform_create(args); + + /* + * Try to add the attr to the attribute list in the inode. + */ + error = xfs_attr_try_sf_addname(dp, args); + if (error != -ENOSPC) + return error; + + /* + * It won't fit in the shortform, transform to a leaf block. + * GROT: another possible req'mt for a double-split btree op. + */ + error = xfs_attr_shortform_to_leaf(args, leaf_bp); + if (error) + return error; + + /* + * Prevent the leaf buffer from being unlocked so that a + * concurrent AIL push cannot grab the half-baked leaf + * buffer and run into problems with the write verifier. + */ + xfs_trans_bhold(args->trans, *leaf_bp); + + error = xfs_defer_finish(&args->trans); + if (error) + return error; + + /* + * Commit the leaf transformation. We'll need another + * (linked) transaction to add the new attribute to the + * leaf. + */ + error = xfs_trans_roll_inode(&args->trans, dp); + if (error) + return error; + xfs_trans_bjoin(args->trans, *leaf_bp); + *leaf_bp = NULL; + } + + if (xfs_bmap_one_block(dp, XFS_ATTR_FORK)) + error = xfs_attr_leaf_addname(args); + else + error = xfs_attr_node_addname(args); + return error; +} + +/* + * Remove the attribute specified in @args. + */ +int +xfs_attr_remove_args( + struct xfs_da_args *args) +{ + struct xfs_inode *dp = args->dp; + int error; + + if (!xfs_inode_hasattr(dp)) { + error = -ENOATTR; + } else if (dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) { + ASSERT(dp->i_afp->if_flags & XFS_IFINLINE); + error = xfs_attr_shortform_remove(args); + } else if (xfs_bmap_one_block(dp, XFS_ATTR_FORK)) { + error = xfs_attr_leaf_removename(args); + } else { + error = xfs_attr_node_removename(args); + } + + return error; +} + int xfs_attr_set( struct xfs_inode *dp, @@ -204,7 +326,7 @@ xfs_attr_set( struct xfs_da_args args; struct xfs_trans_res tres; int rsvd = (flags & ATTR_ROOT) != 0; - int error, err2, local; + int error, local; XFS_STATS_INC(mp, xs_attr_set); @@ -255,93 +377,17 @@ xfs_attr_set( error = xfs_trans_reserve_quota_nblks(args.trans, dp, args.total, 0, rsvd ? XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_FORCE_RES : XFS_QMOPT_RES_REGBLKS); - if (error) { - xfs_iunlock(dp, XFS_ILOCK_EXCL); - xfs_trans_cancel(args.trans); - return error; - } + if (error) + goto out_trans_cancel; xfs_trans_ijoin(args.trans, dp, 0); - - /* - * If the attribute list is non-existent or a shortform list, - * upgrade it to a single-leaf-block attribute list. - */ - if (dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL || - (dp->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS && - dp->i_d.di_anextents == 0)) { - - /* - * Build initial attribute list (if required). - */ - if (dp->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS) - xfs_attr_shortform_create(&args); - - /* - * Try to add the attr to the attribute list in - * the inode. - */ - error = xfs_attr_shortform_addname(&args); - if (error != -ENOSPC) { - /* - * Commit the shortform mods, and we're done. - * NOTE: this is also the error path (EEXIST, etc). - */ - ASSERT(args.trans != NULL); - - /* - * If this is a synchronous mount, make sure that - * the transaction goes to disk before returning - * to the user. - */ - if (mp->m_flags & XFS_MOUNT_WSYNC) - xfs_trans_set_sync(args.trans); - - if (!error && (flags & ATTR_KERNOTIME) == 0) { - xfs_trans_ichgtime(args.trans, dp, - XFS_ICHGTIME_CHG); - } - err2 = xfs_trans_commit(args.trans); - xfs_iunlock(dp, XFS_ILOCK_EXCL); - - return error ? error : err2; - } - - /* - * It won't fit in the shortform, transform to a leaf block. - * GROT: another possible req'mt for a double-split btree op. - */ - error = xfs_attr_shortform_to_leaf(&args, &leaf_bp); - if (error) - goto out; - /* - * Prevent the leaf buffer from being unlocked so that a - * concurrent AIL push cannot grab the half-baked leaf - * buffer and run into problems with the write verifier. - */ - xfs_trans_bhold(args.trans, leaf_bp); - error = xfs_defer_finish(&args.trans); - if (error) - goto out; - - /* - * Commit the leaf transformation. We'll need another (linked) - * transaction to add the new attribute to the leaf, which - * means that we have to hold & join the leaf buffer here too. - */ - error = xfs_trans_roll_inode(&args.trans, dp); - if (error) - goto out; - xfs_trans_bjoin(args.trans, leaf_bp); - leaf_bp = NULL; - } - - if (xfs_bmap_one_block(dp, XFS_ATTR_FORK)) - error = xfs_attr_leaf_addname(&args); - else - error = xfs_attr_node_addname(&args); + error = xfs_attr_set_args(&args, &leaf_bp); if (error) - goto out; + goto out_release_leaf; + if (!args.trans) { + /* shortform attribute has already been committed */ + goto out_unlock; + } /* * If this is a synchronous mount, make sure that the @@ -358,17 +404,17 @@ xfs_attr_set( */ xfs_trans_log_inode(args.trans, dp, XFS_ILOG_CORE); error = xfs_trans_commit(args.trans); +out_unlock: xfs_iunlock(dp, XFS_ILOCK_EXCL); - return error; -out: +out_release_leaf: if (leaf_bp) xfs_trans_brelse(args.trans, leaf_bp); +out_trans_cancel: if (args.trans) xfs_trans_cancel(args.trans); - xfs_iunlock(dp, XFS_ILOCK_EXCL); - return error; + goto out_unlock; } /* @@ -423,17 +469,7 @@ xfs_attr_remove( */ xfs_trans_ijoin(args.trans, dp, 0); - if (!xfs_inode_hasattr(dp)) { - error = -ENOATTR; - } else if (dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) { - ASSERT(dp->i_afp->if_flags & XFS_IFINLINE); - error = xfs_attr_shortform_remove(&args); - } else if (xfs_bmap_one_block(dp, XFS_ATTR_FORK)) { - error = xfs_attr_leaf_removename(&args); - } else { - error = xfs_attr_node_removename(&args); - } - + error = xfs_attr_remove_args(&args); if (error) goto out; diff --git a/fs/xfs/libxfs/xfs_attr.h b/fs/xfs/libxfs/xfs_attr.h @@ -0,0 +1,150 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2000,2002-2003,2005 Silicon Graphics, Inc. + * All Rights Reserved. + */ +#ifndef __XFS_ATTR_H__ +#define __XFS_ATTR_H__ + +struct xfs_inode; +struct xfs_da_args; +struct xfs_attr_list_context; + +/* + * Large attribute lists are structured around Btrees where all the data + * elements are in the leaf nodes. Attribute names are hashed into an int, + * then that int is used as the index into the Btree. Since the hashval + * of an attribute name may not be unique, we may have duplicate keys. + * The internal links in the Btree are logical block offsets into the file. + * + * Small attribute lists use a different format and are packed as tightly + * as possible so as to fit into the literal area of the inode. + */ + +/*======================================================================== + * External interfaces + *========================================================================*/ + + +#define ATTR_DONTFOLLOW 0x0001 /* -- unused, from IRIX -- */ +#define ATTR_ROOT 0x0002 /* use attrs in root (trusted) namespace */ +#define ATTR_TRUST 0x0004 /* -- unused, from IRIX -- */ +#define ATTR_SECURE 0x0008 /* use attrs in security namespace */ +#define ATTR_CREATE 0x0010 /* pure create: fail if attr already exists */ +#define ATTR_REPLACE 0x0020 /* pure set: fail if attr does not exist */ + +#define ATTR_KERNOTIME 0x1000 /* [kernel] don't update inode timestamps */ +#define ATTR_KERNOVAL 0x2000 /* [kernel] get attr size only, not value */ + +#define ATTR_INCOMPLETE 0x4000 /* [kernel] return INCOMPLETE attr keys */ + +#define XFS_ATTR_FLAGS \ + { ATTR_DONTFOLLOW, "DONTFOLLOW" }, \ + { ATTR_ROOT, "ROOT" }, \ + { ATTR_TRUST, "TRUST" }, \ + { ATTR_SECURE, "SECURE" }, \ + { ATTR_CREATE, "CREATE" }, \ + { ATTR_REPLACE, "REPLACE" }, \ + { ATTR_KERNOTIME, "KERNOTIME" }, \ + { ATTR_KERNOVAL, "KERNOVAL" }, \ + { ATTR_INCOMPLETE, "INCOMPLETE" } + +/* + * The maximum size (into the kernel or returned from the kernel) of an + * attribute value or the buffer used for an attr_list() call. Larger + * sizes will result in an ERANGE return code. + */ +#define ATTR_MAX_VALUELEN (64*1024) /* max length of a value */ + +/* + * Define how lists of attribute names are returned to the user from + * the attr_list() call. A large, 32bit aligned, buffer is passed in + * along with its size. We put an array of offsets at the top that each + * reference an attrlist_ent_t and pack the attrlist_ent_t's at the bottom. + */ +typedef struct attrlist { + __s32 al_count; /* number of entries in attrlist */ + __s32 al_more; /* T/F: more attrs (do call again) */ + __s32 al_offset[1]; /* byte offsets of attrs [var-sized] */ +} attrlist_t; + +/* + * Show the interesting info about one attribute. This is what the + * al_offset[i] entry points to. + */ +typedef struct attrlist_ent { /* data from attr_list() */ + __u32 a_valuelen; /* number bytes in value of attr */ + char a_name[1]; /* attr name (NULL terminated) */ +} attrlist_ent_t; + +/* + * Given a pointer to the (char*) buffer containing the attr_list() result, + * and an index, return a pointer to the indicated attribute in the buffer. + */ +#define ATTR_ENTRY(buffer, index) \ + ((attrlist_ent_t *) \ + &((char *)buffer)[ ((attrlist_t *)(buffer))->al_offset[index] ]) + +/* + * Kernel-internal version of the attrlist cursor. + */ +typedef struct attrlist_cursor_kern { + __u32 hashval; /* hash value of next entry to add */ + __u32 blkno; /* block containing entry (suggestion) */ + __u32 offset; /* offset in list of equal-hashvals */ + __u16 pad1; /* padding to match user-level */ + __u8 pad2; /* padding to match user-level */ + __u8 initted; /* T/F: cursor has been initialized */ +} attrlist_cursor_kern_t; + + +/*======================================================================== + * Structure used to pass context around among the routines. + *========================================================================*/ + + +/* void; state communicated via *context */ +typedef void (*put_listent_func_t)(struct xfs_attr_list_context *, int, + unsigned char *, int, int); + +typedef struct xfs_attr_list_context { + struct xfs_trans *tp; + struct xfs_inode *dp; /* inode */ + struct attrlist_cursor_kern *cursor; /* position in list */ + char *alist; /* output buffer */ + int seen_enough; /* T/F: seen enough of list? */ + ssize_t count; /* num used entries */ + int dupcnt; /* count dup hashvals seen */ + int bufsize; /* total buffer size */ + int firstu; /* first used byte in buffer */ + int flags; /* from VOP call */ + int resynch; /* T/F: resynch with cursor */ + put_listent_func_t put_listent; /* list output fmt function */ + int index; /* index into output buffer */ +} xfs_attr_list_context_t; + + +/*======================================================================== + * Function prototypes for the kernel. + *========================================================================*/ + +/* + * Overall external interface routines. + */ +int xfs_attr_inactive(struct xfs_inode *dp); +int xfs_attr_list_int_ilocked(struct xfs_attr_list_context *); +int xfs_attr_list_int(struct xfs_attr_list_context *); +int xfs_inode_hasattr(struct xfs_inode *ip); +int xfs_attr_get_ilocked(struct xfs_inode *ip, struct xfs_da_args *args); +int xfs_attr_get(struct xfs_inode *ip, const unsigned char *name, + unsigned char *value, int *valuelenp, int flags); +int xfs_attr_set(struct xfs_inode *dp, const unsigned char *name, + unsigned char *value, int valuelen, int flags); +int xfs_attr_set_args(struct xfs_da_args *args, struct xfs_buf **leaf_bp); +int xfs_attr_remove(struct xfs_inode *dp, const unsigned char *name, int flags); +int xfs_attr_remove_args(struct xfs_da_args *args); +int xfs_attr_list(struct xfs_inode *dp, char *buffer, int bufsize, + int flags, struct attrlist_cursor_kern *cursor); + + +#endif /* __XFS_ATTR_H__ */ diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c @@ -1019,6 +1019,34 @@ xfs_bmap_add_attrfork_local( return -EFSCORRUPTED; } +/* Set an inode attr fork off based on the format */ +int +xfs_bmap_set_attrforkoff( + struct xfs_inode *ip, + int size, + int *version) +{ + switch (ip->i_d.di_format) { + case XFS_DINODE_FMT_DEV: + ip->i_d.di_forkoff = roundup(sizeof(xfs_dev_t), 8) >> 3; + break; + case XFS_DINODE_FMT_LOCAL: + case XFS_DINODE_FMT_EXTENTS: + case XFS_DINODE_FMT_BTREE: + ip->i_d.di_forkoff = xfs_attr_shortform_bytesfit(ip, size); + if (!ip->i_d.di_forkoff) + ip->i_d.di_forkoff = xfs_default_attroffset(ip) >> 3; + else if ((ip->i_mount->m_flags & XFS_MOUNT_ATTR2) && version) + *version = 2; + break; + default: + ASSERT(0); + return -EINVAL; + } + + return 0; +} + /* * Convert inode from non-attributed to attributed. * Must not be in a transaction, ip must not be locked. @@ -1070,26 +1098,9 @@ xfs_bmap_add_attrfork( xfs_trans_ijoin(tp, ip, 0); xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); - - switch (ip->i_d.di_format) { - case XFS_DINODE_FMT_DEV: - ip->i_d.di_forkoff = roundup(sizeof(xfs_dev_t), 8) >> 3; - break; - case XFS_DINODE_FMT_LOCAL: - case XFS_DINODE_FMT_EXTENTS: - case XFS_DINODE_FMT_BTREE: - ip->i_d.di_forkoff = xfs_attr_shortform_bytesfit(ip, size); - if (!ip->i_d.di_forkoff) - ip->i_d.di_forkoff = xfs_default_attroffset(ip) >> 3; - else if (mp->m_flags & XFS_MOUNT_ATTR2) - version = 2; - break; - default: - ASSERT(0); - error = -EINVAL; + error = xfs_bmap_set_attrforkoff(ip, size, &version); + if (error) goto trans_cancel; - } - ASSERT(ip->i_afp == NULL); ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_SLEEP); ip->i_afp->if_flags = XFS_IFEXTENTS; @@ -4081,8 +4092,7 @@ xfs_bmapi_allocate( * extents to real extents when we're about to write the data. */ if ((!bma->wasdel || (bma->flags & XFS_BMAPI_COWFORK)) && - (bma->flags & XFS_BMAPI_PREALLOC) && - xfs_sb_version_hasextflgbit(&mp->m_sb)) + (bma->flags & XFS_BMAPI_PREALLOC)) bma->got.br_state = XFS_EXT_UNWRITTEN; if (bma->wasdel) @@ -5245,8 +5255,7 @@ __xfs_bunmapi( * unmapping part of it. But we can't really * get rid of part of a realtime extent. */ - if (del.br_state == XFS_EXT_UNWRITTEN || - !xfs_sb_version_hasextflgbit(&mp->m_sb)) { + if (del.br_state == XFS_EXT_UNWRITTEN) { /* * This piece is unwritten, or we're not * using unwritten extents. Skip over it. @@ -5296,10 +5305,9 @@ __xfs_bunmapi( del.br_blockcount -= mod; del.br_startoff += mod; del.br_startblock += mod; - } else if ((del.br_startoff == start && - (del.br_state == XFS_EXT_UNWRITTEN || - tp->t_blk_res == 0)) || - !xfs_sb_version_hasextflgbit(&mp->m_sb)) { + } else if (del.br_startoff == start && + (del.br_state == XFS_EXT_UNWRITTEN || + tp->t_blk_res == 0)) { /* * Can't make it unwritten. There isn't * a full extent here so just skip it. @@ -6114,11 +6122,7 @@ xfs_bmap_validate_extent( XFS_FSB_TO_AGNO(mp, endfsb)) return __this_address; } - if (irec->br_state != XFS_EXT_NORM) { - if (whichfork != XFS_DATA_FORK) - return __this_address; - if (!xfs_sb_version_hasextflgbit(&mp->m_sb)) - return __this_address; - } + if (irec->br_state != XFS_EXT_NORM && whichfork != XFS_DATA_FORK) + return __this_address; return NULL; } diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h @@ -183,6 +183,7 @@ void xfs_trim_extent(struct xfs_bmbt_irec *irec, xfs_fileoff_t bno, xfs_filblks_t len); void xfs_trim_extent_eof(struct xfs_bmbt_irec *, struct xfs_inode *); int xfs_bmap_add_attrfork(struct xfs_inode *ip, int size, int rsvd); +int xfs_bmap_set_attrforkoff(struct xfs_inode *ip, int size, int *version); void xfs_bmap_local_to_extents_empty(struct xfs_inode *ip, int whichfork); void __xfs_bmap_add_free(struct xfs_trans *tp, xfs_fsblock_t bno, xfs_filblks_t len, struct xfs_owner_info *oinfo, diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h @@ -287,6 +287,8 @@ static inline bool xfs_sb_good_v4_features(struct xfs_sb *sbp) { if (!(sbp->sb_versionnum & XFS_SB_VERSION_DIRV2BIT)) return false; + if (!(sbp->sb_versionnum & XFS_SB_VERSION_EXTFLGBIT)) + return false; /* check for unknown features in the fs */ if ((sbp->sb_versionnum & ~XFS_SB_VERSION_OKBITS) || @@ -357,12 +359,6 @@ static inline bool xfs_sb_version_haslogv2(struct xfs_sb *sbp) (sbp->sb_versionnum & XFS_SB_VERSION_LOGV2BIT); } -static inline bool xfs_sb_version_hasextflgbit(struct xfs_sb *sbp) -{ - return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 || - (sbp->sb_versionnum & XFS_SB_VERSION_EXTFLGBIT); -} - static inline bool xfs_sb_version_hassector(struct xfs_sb *sbp) { return (sbp->sb_versionnum & XFS_SB_VERSION_SECTORBIT); diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c @@ -1115,7 +1115,8 @@ xfs_fs_geometry( geo->version = XFS_FSOP_GEOM_VERSION; geo->flags = XFS_FSOP_GEOM_FLAGS_NLINK | - XFS_FSOP_GEOM_FLAGS_DIRV2; + XFS_FSOP_GEOM_FLAGS_DIRV2 | + XFS_FSOP_GEOM_FLAGS_EXTFLG; if (xfs_sb_version_hasattr(sbp)) geo->flags |= XFS_FSOP_GEOM_FLAGS_ATTR; if (xfs_sb_version_hasquota(sbp)) @@ -1124,8 +1125,6 @@ xfs_fs_geometry( geo->flags |= XFS_FSOP_GEOM_FLAGS_IALIGN; if (xfs_sb_version_hasdalign(sbp)) geo->flags |= XFS_FSOP_GEOM_FLAGS_DALIGN; - if (xfs_sb_version_hasextflgbit(sbp)) - geo->flags |= XFS_FSOP_GEOM_FLAGS_EXTFLG; if (xfs_sb_version_hassector(sbp)) geo->flags |= XFS_FSOP_GEOM_FLAGS_SECTOR; if (xfs_sb_version_hasasciici(sbp)) diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c @@ -29,6 +29,8 @@ #include "xfs_ag_resv.h" #include "xfs_trans_space.h" #include "xfs_quota.h" +#include "xfs_attr.h" +#include "xfs_reflink.h" #include "scrub/xfs_scrub.h" #include "scrub/scrub.h" #include "scrub/common.h" @@ -692,13 +694,14 @@ xrep_findroot_block( struct xrep_find_ag_btree *fab, uint64_t owner, xfs_agblock_t agbno, - bool *found_it) + bool *done_with_block) { struct xfs_mount *mp = ri->sc->mp; struct xfs_buf *bp; struct xfs_btree_block *btblock; xfs_daddr_t daddr; - int error; + int block_level; + int error = 0; daddr = XFS_AGB_TO_DADDR(mp, ri->sc->sa.agno, agbno); @@ -717,36 +720,111 @@ xrep_findroot_block( return error; } + /* + * Read the buffer into memory so that we can see if it's a match for + * our btree type. We have no clue if it is beforehand, and we want to + * avoid xfs_trans_read_buf's behavior of dumping the DONE state (which + * will cause needless disk reads in subsequent calls to this function) + * and logging metadata verifier failures. + * + * Therefore, pass in NULL buffer ops. If the buffer was already in + * memory from some other caller it will already have b_ops assigned. + * If it was in memory from a previous unsuccessful findroot_block + * call, the buffer won't have b_ops but it should be clean and ready + * for us to try to verify if the read call succeeds. The same applies + * if the buffer wasn't in memory at all. + * + * Note: If we never match a btree type with this buffer, it will be + * left in memory with NULL b_ops. This shouldn't be a problem unless + * the buffer gets written. + */ error = xfs_trans_read_buf(mp, ri->sc->tp, mp->m_ddev_targp, daddr, mp->m_bsize, 0, &bp, NULL); if (error) return error; - /* - * Does this look like a block matching our fs and higher than any - * other block we've found so far? If so, reattach buffer verifiers - * so the AIL won't complain if the buffer is also dirty. - */ + /* Ensure the block magic matches the btree type we're looking for. */ btblock = XFS_BUF_TO_BLOCK(bp); if (be32_to_cpu(btblock->bb_magic) != fab->magic) goto out; - if (xfs_sb_version_hascrc(&mp->m_sb) && - !uuid_equal(&btblock->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid)) - goto out; - bp->b_ops = fab->buf_ops; - /* Ignore this block if it's lower in the tree than we've seen. */ - if (fab->root != NULLAGBLOCK && - xfs_btree_get_level(btblock) < fab->height) - goto out; + /* + * If the buffer already has ops applied and they're not the ones for + * this btree type, we know this block doesn't match the btree and we + * can bail out. + * + * If the buffer ops match ours, someone else has already validated + * the block for us, so we can move on to checking if this is a root + * block candidate. + * + * If the buffer does not have ops, nobody has successfully validated + * the contents and the buffer cannot be dirty. If the magic, uuid, + * and structure match this btree type then we'll move on to checking + * if it's a root block candidate. If there is no match, bail out. + */ + if (bp->b_ops) { + if (bp->b_ops != fab->buf_ops) + goto out; + } else { + ASSERT(!xfs_trans_buf_is_dirty(bp)); + if (!uuid_equal(&btblock->bb_u.s.bb_uuid, + &mp->m_sb.sb_meta_uuid)) + goto out; + fab->buf_ops->verify_read(bp); + if (bp->b_error) { + bp->b_error = 0; + goto out; + } - /* Make sure we pass the verifiers. */ - bp->b_ops->verify_read(bp); - if (bp->b_error) + /* + * Some read verifiers will (re)set b_ops, so we must be + * careful not to blow away any such assignment. + */ + if (!bp->b_ops) + bp->b_ops = fab->buf_ops; + } + + /* + * This block passes the magic/uuid and verifier tests for this btree + * type. We don't need the caller to try the other tree types. + */ + *done_with_block = true; + + /* + * Compare this btree block's level to the height of the current + * candidate root block. + * + * If the level matches the root we found previously, throw away both + * blocks because there can't be two candidate roots. + * + * If level is lower in the tree than the root we found previously, + * ignore this block. + */ + block_level = xfs_btree_get_level(btblock); + if (block_level + 1 == fab->height) { + fab->root = NULLAGBLOCK; goto out; - fab->root = agbno; - fab->height = xfs_btree_get_level(btblock) + 1; - *found_it = true; + } else if (block_level < fab->height) { + goto out; + } + + /* + * This is the highest block in the tree that we've found so far. + * Update the btree height to reflect what we've learned from this + * block. + */ + fab->height = block_level + 1; + + /* + * If this block doesn't have sibling pointers, then it's the new root + * block candidate. Otherwise, the root will be found farther up the + * tree. + */ + if (btblock->bb_u.s.bb_leftsib == cpu_to_be32(NULLAGBLOCK) && + btblock->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK)) + fab->root = agbno; + else + fab->root = NULLAGBLOCK; trace_xrep_findroot_block(mp, ri->sc->sa.agno, agbno, be32_to_cpu(btblock->bb_magic), fab->height - 1); @@ -768,7 +846,7 @@ xrep_findroot_rmap( struct xrep_findroot *ri = priv; struct xrep_find_ag_btree *fab; xfs_agblock_t b; - bool found_it; + bool done; int error = 0; /* Ignore anything that isn't AG metadata. */ @@ -777,16 +855,16 @@ xrep_findroot_rmap( /* Otherwise scan each block + btree type. */ for (b = 0; b < rec->rm_blockcount; b++) { - found_it = false; + done = false; for (fab = ri->btree_info; fab->buf_ops; fab++) { if (rec->rm_owner != fab->rmap_owner) continue; error = xrep_findroot_block(ri, fab, rec->rm_owner, rec->rm_startblock + b, - &found_it); + &done); if (error) return error; - if (found_it) + if (done) break; } } diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c @@ -412,19 +412,6 @@ xchk_validate_inputs( goto out; } - error = -EOPNOTSUPP; - /* - * We won't scrub any filesystem that doesn't have the ability - * to record unwritten extents. The option was made default in - * 2003, removed from mkfs in 2007, and cannot be disabled in - * v5, so if we find a filesystem without this flag it's either - * really old or totally unsupported. Avoid it either way. - * We also don't support v1-v3 filesystems, which aren't - * mountable. - */ - if (!xfs_sb_version_hasextflgbit(&mp->m_sb)) - goto out; - /* * We only want to repair read-write v5+ filesystems. Defer the check * for ops->repair until after our scrub confirms that we need to diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c @@ -917,7 +917,7 @@ xfs_vm_writepage( struct writeback_control *wbc) { struct xfs_writepage_ctx wpc = { - .io_type = XFS_IO_INVALID, + .io_type = XFS_IO_HOLE, }; int ret; @@ -933,7 +933,7 @@ xfs_vm_writepages( struct writeback_control *wbc) { struct xfs_writepage_ctx wpc = { - .io_type = XFS_IO_INVALID, + .io_type = XFS_IO_HOLE, }; int ret; diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h @@ -12,21 +12,19 @@ extern struct bio_set xfs_ioend_bioset; * Types of I/O for bmap clustering and I/O completion tracking. */ enum { - XFS_IO_INVALID, /* initial state */ + XFS_IO_HOLE, /* covers region without any block allocation */ XFS_IO_DELALLOC, /* covers delalloc region */ XFS_IO_UNWRITTEN, /* covers allocated but uninitialized data */ XFS_IO_OVERWRITE, /* covers already allocated extent */ XFS_IO_COW, /* covers copy-on-write extent */ - XFS_IO_HOLE, /* covers region without any block allocation */ }; #define XFS_IO_TYPES \ - { XFS_IO_INVALID, "invalid" }, \ - { XFS_IO_DELALLOC, "delalloc" }, \ - { XFS_IO_UNWRITTEN, "unwritten" }, \ - { XFS_IO_OVERWRITE, "overwrite" }, \ - { XFS_IO_COW, "CoW" }, \ - { XFS_IO_HOLE, "hole" } + { XFS_IO_HOLE, "hole" }, \ + { XFS_IO_DELALLOC, "delalloc" }, \ + { XFS_IO_UNWRITTEN, "unwritten" }, \ + { XFS_IO_OVERWRITE, "overwrite" }, \ + { XFS_IO_COW, "CoW" } /* * Structure for buffered I/O completions. diff --git a/fs/xfs/xfs_attr.h b/fs/xfs/xfs_attr.h @@ -1,148 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright (c) 2000,2002-2003,2005 Silicon Graphics, Inc. - * All Rights Reserved. - */ -#ifndef __XFS_ATTR_H__ -#define __XFS_ATTR_H__ - -struct xfs_inode; -struct xfs_da_args; -struct xfs_attr_list_context; - -/* - * Large attribute lists are structured around Btrees where all the data - * elements are in the leaf nodes. Attribute names are hashed into an int, - * then that int is used as the index into the Btree. Since the hashval - * of an attribute name may not be unique, we may have duplicate keys. - * The internal links in the Btree are logical block offsets into the file. - * - * Small attribute lists use a different format and are packed as tightly - * as possible so as to fit into the literal area of the inode. - */ - -/*======================================================================== - * External interfaces - *========================================================================*/ - - -#define ATTR_DONTFOLLOW 0x0001 /* -- unused, from IRIX -- */ -#define ATTR_ROOT 0x0002 /* use attrs in root (trusted) namespace */ -#define ATTR_TRUST 0x0004 /* -- unused, from IRIX -- */ -#define ATTR_SECURE 0x0008 /* use attrs in security namespace */ -#define ATTR_CREATE 0x0010 /* pure create: fail if attr already exists */ -#define ATTR_REPLACE 0x0020 /* pure set: fail if attr does not exist */ - -#define ATTR_KERNOTIME 0x1000 /* [kernel] don't update inode timestamps */ -#define ATTR_KERNOVAL 0x2000 /* [kernel] get attr size only, not value */ - -#define ATTR_INCOMPLETE 0x4000 /* [kernel] return INCOMPLETE attr keys */ - -#define XFS_ATTR_FLAGS \ - { ATTR_DONTFOLLOW, "DONTFOLLOW" }, \ - { ATTR_ROOT, "ROOT" }, \ - { ATTR_TRUST, "TRUST" }, \ - { ATTR_SECURE, "SECURE" }, \ - { ATTR_CREATE, "CREATE" }, \ - { ATTR_REPLACE, "REPLACE" }, \ - { ATTR_KERNOTIME, "KERNOTIME" }, \ - { ATTR_KERNOVAL, "KERNOVAL" }, \ - { ATTR_INCOMPLETE, "INCOMPLETE" } - -/* - * The maximum size (into the kernel or returned from the kernel) of an - * attribute value or the buffer used for an attr_list() call. Larger - * sizes will result in an ERANGE return code. - */ -#define ATTR_MAX_VALUELEN (64*1024) /* max length of a value */ - -/* - * Define how lists of attribute names are returned to the user from - * the attr_list() call. A large, 32bit aligned, buffer is passed in - * along with its size. We put an array of offsets at the top that each - * reference an attrlist_ent_t and pack the attrlist_ent_t's at the bottom. - */ -typedef struct attrlist { - __s32 al_count; /* number of entries in attrlist */ - __s32 al_more; /* T/F: more attrs (do call again) */ - __s32 al_offset[1]; /* byte offsets of attrs [var-sized] */ -} attrlist_t; - -/* - * Show the interesting info about one attribute. This is what the - * al_offset[i] entry points to. - */ -typedef struct attrlist_ent { /* data from attr_list() */ - __u32 a_valuelen; /* number bytes in value of attr */ - char a_name[1]; /* attr name (NULL terminated) */ -} attrlist_ent_t; - -/* - * Given a pointer to the (char*) buffer containing the attr_list() result, - * and an index, return a pointer to the indicated attribute in the buffer. - */ -#define ATTR_ENTRY(buffer, index) \ - ((attrlist_ent_t *) \ - &((char *)buffer)[ ((attrlist_t *)(buffer))->al_offset[index] ]) - -/* - * Kernel-internal version of the attrlist cursor. - */ -typedef struct attrlist_cursor_kern { - __u32 hashval; /* hash value of next entry to add */ - __u32 blkno; /* block containing entry (suggestion) */ - __u32 offset; /* offset in list of equal-hashvals */ - __u16 pad1; /* padding to match user-level */ - __u8 pad2; /* padding to match user-level */ - __u8 initted; /* T/F: cursor has been initialized */ -} attrlist_cursor_kern_t; - - -/*======================================================================== - * Structure used to pass context around among the routines. - *========================================================================*/ - - -/* void; state communicated via *context */ -typedef void (*put_listent_func_t)(struct xfs_attr_list_context *, int, - unsigned char *, int, int); - -typedef struct xfs_attr_list_context { - struct xfs_trans *tp; - struct xfs_inode *dp; /* inode */ - struct attrlist_cursor_kern *cursor; /* position in list */ - char *alist; /* output buffer */ - int seen_enough; /* T/F: seen enough of list? */ - ssize_t count; /* num used entries */ - int dupcnt; /* count dup hashvals seen */ - int bufsize; /* total buffer size */ - int firstu; /* first used byte in buffer */ - int flags; /* from VOP call */ - int resynch; /* T/F: resynch with cursor */ - put_listent_func_t put_listent; /* list output fmt function */ - int index; /* index into output buffer */ -} xfs_attr_list_context_t; - - -/*======================================================================== - * Function prototypes for the kernel. - *========================================================================*/ - -/* - * Overall external interface routines. - */ -int xfs_attr_inactive(struct xfs_inode *dp); -int xfs_attr_list_int_ilocked(struct xfs_attr_list_context *); -int xfs_attr_list_int(struct xfs_attr_list_context *); -int xfs_inode_hasattr(struct xfs_inode *ip); -int xfs_attr_get_ilocked(struct xfs_inode *ip, struct xfs_da_args *args); -int xfs_attr_get(struct xfs_inode *ip, const unsigned char *name, - unsigned char *value, int *valuelenp, int flags); -int xfs_attr_set(struct xfs_inode *dp, const unsigned char *name, - unsigned char *value, int valuelen, int flags); -int xfs_attr_remove(struct xfs_inode *dp, const unsigned char *name, int flags); -int xfs_attr_list(struct xfs_inode *dp, char *buffer, int bufsize, - int flags, struct attrlist_cursor_kern *cursor); - - -#endif /* __XFS_ATTR_H__ */ diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c @@ -406,10 +406,10 @@ xfs_getbmap_report_one( struct xfs_bmbt_irec *got) { struct kgetbmap *p = out + bmv->bmv_entries; - bool shared = false, trimmed = false; + bool shared = false; int error; - error = xfs_reflink_trim_around_shared(ip, got, &shared, &trimmed); + error = xfs_reflink_trim_around_shared(ip, got, &shared); if (error) return error; @@ -1043,44 +1043,6 @@ out_trans_cancel: } static int -xfs_adjust_extent_unmap_boundaries( - struct xfs_inode *ip, - xfs_fileoff_t *startoffset_fsb, - xfs_fileoff_t *endoffset_fsb) -{ - struct xfs_mount *mp = ip->i_mount; - struct xfs_bmbt_irec imap; - int nimap, error; - xfs_extlen_t mod = 0; - - nimap = 1; - error = xfs_bmapi_read(ip, *startoffset_fsb, 1, &imap, &nimap, 0); - if (error) - return error; - - if (nimap && imap.br_startblock != HOLESTARTBLOCK) { - ASSERT(imap.br_startblock != DELAYSTARTBLOCK); - div_u64_rem(imap.br_startblock, mp->m_sb.sb_rextsize, &mod); - if (mod) - *startoffset_fsb += mp->m_sb.sb_rextsize - mod; - } - - nimap = 1; - error = xfs_bmapi_read(ip, *endoffset_fsb - 1, 1, &imap, &nimap, 0); - if (error) - return error; - - if (nimap && imap.br_startblock != HOLESTARTBLOCK) { - ASSERT(imap.br_startblock != DELAYSTARTBLOCK); - mod++; - if (mod && mod != mp->m_sb.sb_rextsize) - *endoffset_fsb -= mod; - } - - return 0; -} - -static int xfs_flush_unmap_range( struct xfs_inode *ip, xfs_off_t offset, @@ -1133,19 +1095,8 @@ xfs_free_file_space( endoffset_fsb = XFS_B_TO_FSBT(mp, offset + len); /* - * Need to zero the stuff we're not freeing, on disk. If it's a RT file - * and we can't use unwritten extents then we actually need to ensure - * to zero the whole extent, otherwise we just need to take of block - * boundaries, and xfs_bunmapi will handle the rest. + * Need to zero the stuff we're not freeing, on disk. */ - if (XFS_IS_REALTIME_INODE(ip) && - !xfs_sb_version_hasextflgbit(&mp->m_sb)) { - error = xfs_adjust_extent_unmap_boundaries(ip, &startoffset_fsb, - &endoffset_fsb); - if (error) - return error; - } - if (endoffset_fsb > startoffset_fsb) { while (!done) { error = xfs_unmap_extent(ip, startoffset_fsb, @@ -1824,6 +1775,12 @@ xfs_swap_extents( if (error) goto out_unlock; + if (xfs_inode_has_cow_data(tip)) { + error = xfs_reflink_cancel_cow_range(tip, 0, NULLFILEOFF, true); + if (error) + return error; + } + /* * Extent "swapping" with rmap requires a permanent reservation and * a block reservation because it's really just a remap operation diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c @@ -37,6 +37,32 @@ static kmem_zone_t *xfs_buf_zone; #define xb_to_gfp(flags) \ ((((flags) & XBF_READ_AHEAD) ? __GFP_NORETRY : GFP_NOFS) | __GFP_NOWARN) +/* + * Locking orders + * + * xfs_buf_ioacct_inc: + * xfs_buf_ioacct_dec: + * b_sema (caller holds) + * b_lock + * + * xfs_buf_stale: + * b_sema (caller holds) + * b_lock + * lru_lock + * + * xfs_buf_rele: + * b_lock + * pag_buf_lock + * lru_lock + * + * xfs_buftarg_wait_rele + * lru_lock + * b_lock (trylock due to inversion) + * + * xfs_buftarg_isolate + * lru_lock + * b_lock (trylock due to inversion) + */ static inline int xfs_buf_is_vmapped( @@ -749,6 +775,30 @@ _xfs_buf_read( return xfs_buf_submit(bp); } +/* + * If the caller passed in an ops structure and the buffer doesn't have ops + * assigned, set the ops and use them to verify the contents. If the contents + * cannot be verified, we'll clear XBF_DONE. We assume the buffer has no + * recorded errors and is already in XBF_DONE state. + */ +int +xfs_buf_ensure_ops( + struct xfs_buf *bp, + const struct xfs_buf_ops *ops) +{ + ASSERT(bp->b_flags & XBF_DONE); + ASSERT(bp->b_error == 0); + + if (!ops || bp->b_ops) + return 0; + + bp->b_ops = ops; + bp->b_ops->verify_read(bp); + if (bp->b_error) + bp->b_flags &= ~XBF_DONE; + return bp->b_error; +} + xfs_buf_t * xfs_buf_read_map( struct xfs_buftarg *target, @@ -762,26 +812,32 @@ xfs_buf_read_map( flags |= XBF_READ; bp = xfs_buf_get_map(target, map, nmaps, flags); - if (bp) { - trace_xfs_buf_read(bp, flags, _RET_IP_); + if (!bp) + return NULL; - if (!(bp->b_flags & XBF_DONE)) { - XFS_STATS_INC(target->bt_mount, xb_get_read); - bp->b_ops = ops; - _xfs_buf_read(bp, flags); - } else if (flags & XBF_ASYNC) { - /* - * Read ahead call which is already satisfied, - * drop the buffer - */ - xfs_buf_relse(bp); - return NULL; - } else { - /* We do not want read in the flags */ - bp->b_flags &= ~XBF_READ; - } + trace_xfs_buf_read(bp, flags, _RET_IP_); + + if (!(bp->b_flags & XBF_DONE)) { + XFS_STATS_INC(target->bt_mount, xb_get_read); + bp->b_ops = ops; + _xfs_buf_read(bp, flags); + return bp; } + xfs_buf_ensure_ops(bp, ops); + + if (flags & XBF_ASYNC) { + /* + * Read ahead call which is already satisfied, + * drop the buffer + */ + xfs_buf_relse(bp); + return NULL; + } + + /* We do not want read in the flags */ + bp->b_flags &= ~XBF_READ; + ASSERT(bp->b_ops != NULL || ops == NULL); return bp; } @@ -1006,8 +1062,18 @@ xfs_buf_rele( ASSERT(atomic_read(&bp->b_hold) > 0); - release = atomic_dec_and_lock(&bp->b_hold, &pag->pag_buf_lock); + /* + * We grab the b_lock here first to serialise racing xfs_buf_rele() + * calls. The pag_buf_lock being taken on the last reference only + * serialises against racing lookups in xfs_buf_find(). IOWs, the second + * to last reference we drop here is not serialised against the last + * reference until we take bp->b_lock. Hence if we don't grab b_lock + * first, the last "release" reference can win the race to the lock and + * free the buffer before the second-to-last reference is processed, + * leading to a use-after-free scenario. + */ spin_lock(&bp->b_lock); + release = atomic_dec_and_lock(&bp->b_hold, &pag->pag_buf_lock); if (!release) { /* * Drop the in-flight state if the buffer is already on the LRU @@ -1989,6 +2055,13 @@ xfs_buf_delwri_submit_buffers( * is only safely useable for callers that can track I/O completion by higher * level means, e.g. AIL pushing as the @buffer_list is consumed in this * function. + * + * Note: this function will skip buffers it would block on, and in doing so + * leaves them on @buffer_list so they can be retried on a later pass. As such, + * it is up to the caller to ensure that the buffer list is fully submitted or + * cancelled appropriately when they are finished with the list. Failure to + * cancel or resubmit the list until it is empty will result in leaked buffers + * at unmount time. */ int xfs_buf_delwri_submit_nowait( diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h @@ -385,4 +385,6 @@ extern int xfs_setsize_buftarg(xfs_buftarg_t *, unsigned int); #define xfs_getsize_buftarg(buftarg) block_size((buftarg)->bt_bdev) #define xfs_readonly_buftarg(buftarg) bdev_read_only((buftarg)->bt_bdev) +int xfs_buf_ensure_ops(struct xfs_buf *bp, const struct xfs_buf_ops *ops); + #endif /* __XFS_BUF_H__ */ diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c @@ -470,20 +470,13 @@ xfs_fs_goingdown( */ void xfs_do_force_shutdown( - xfs_mount_t *mp, + struct xfs_mount *mp, int flags, char *fname, int lnnum) { - int logerror; - - logerror = flags & SHUTDOWN_LOG_IO_ERROR; + bool logerror = flags & SHUTDOWN_LOG_IO_ERROR; - if (!(flags & SHUTDOWN_FORCE_UMOUNT)) { - xfs_notice(mp, - "%s(0x%x) called from line %d of file %s. Return address = "PTR_FMT, - __func__, flags, lnnum, fname, __return_address); - } /* * No need to duplicate efforts. */ @@ -499,27 +492,34 @@ xfs_do_force_shutdown( if (xfs_log_force_umount(mp, logerror)) return; + if (flags & SHUTDOWN_FORCE_UMOUNT) { + xfs_alert(mp, +"User initiated shutdown received. Shutting down filesystem"); + return; + } + + xfs_notice(mp, +"%s(0x%x) called from line %d of file %s. Return address = "PTR_FMT, + __func__, flags, lnnum, fname, __return_address); + if (flags & SHUTDOWN_CORRUPT_INCORE) { xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_CORRUPT, - "Corruption of in-memory data detected. Shutting down filesystem"); +"Corruption of in-memory data detected. Shutting down filesystem"); if (XFS_ERRLEVEL_HIGH <= xfs_error_level) xfs_stack_trace(); - } else if (!(flags & SHUTDOWN_FORCE_UMOUNT)) { - if (logerror) { - xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_LOGERROR, - "Log I/O Error Detected. Shutting down filesystem"); - } else if (flags & SHUTDOWN_DEVICE_REQ) { - xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_IOERROR, - "All device paths lost. Shutting down filesystem"); - } else if (!(flags & SHUTDOWN_REMOTE_REQ)) { - xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_IOERROR, - "I/O Error Detected. Shutting down filesystem"); - } - } - if (!(flags & SHUTDOWN_FORCE_UMOUNT)) { - xfs_alert(mp, - "Please umount the filesystem and rectify the problem(s)"); + } else if (logerror) { + xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_LOGERROR, + "Log I/O Error Detected. Shutting down filesystem"); + } else if (flags & SHUTDOWN_DEVICE_REQ) { + xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_IOERROR, + "All device paths lost. Shutting down filesystem"); + } else if (!(flags & SHUTDOWN_REMOTE_REQ)) { + xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_IOERROR, + "I/O Error Detected. Shutting down filesystem"); } + + xfs_alert(mp, + "Please unmount the filesystem and rectify the problem(s)"); } /* diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c @@ -604,14 +604,6 @@ xfs_ioc_space( uint iolock = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL; int error; - /* - * Only allow the sys admin to reserve space unless - * unwritten extents are enabled. - */ - if (!xfs_sb_version_hasextflgbit(&ip->i_mount->m_sb) && - !capable(CAP_SYS_ADMIN)) - return -EPERM; - if (inode->i_flags & (S_IMMUTABLE|S_APPEND)) return -EPERM; diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c @@ -62,6 +62,21 @@ xfs_bmbt_to_iomap( iomap->dax_dev = xfs_find_daxdev_for_inode(VFS_I(ip)); } +static void +xfs_hole_to_iomap( + struct xfs_inode *ip, + struct iomap *iomap, + xfs_fileoff_t offset_fsb, + xfs_fileoff_t end_fsb) +{ + iomap->addr = IOMAP_NULL_ADDR; + iomap->type = IOMAP_HOLE; + iomap->offset = XFS_FSB_TO_B(ip->i_mount, offset_fsb); + iomap->length = XFS_FSB_TO_B(ip->i_mount, end_fsb - offset_fsb); + iomap->bdev = xfs_find_bdev_for_inode(VFS_I(ip)); + iomap->dax_dev = xfs_find_daxdev_for_inode(VFS_I(ip)); +} + xfs_extlen_t xfs_eof_alignment( struct xfs_inode *ip, @@ -502,6 +517,7 @@ xfs_file_iomap_begin_delay( struct inode *inode, loff_t offset, loff_t count, + unsigned flags, struct iomap *iomap) { struct xfs_inode *ip = XFS_I(inode); @@ -538,15 +554,23 @@ xfs_file_iomap_begin_delay( goto out_unlock; } + end_fsb = min(XFS_B_TO_FSB(mp, offset + count), maxbytes_fsb); + eof = !xfs_iext_lookup_extent(ip, ifp, offset_fsb, &icur, &got); - if (!eof && got.br_startoff <= offset_fsb) { - if (xfs_is_reflink_inode(ip)) { - bool shared; + if (eof) + got.br_startoff = end_fsb; /* fake hole until the end */ - end_fsb = min(XFS_B_TO_FSB(mp, offset + count), - maxbytes_fsb); + if (got.br_startoff <= offset_fsb) { + /* + * For reflink files we may need a delalloc reservation when + * overwriting shared extents. This includes zeroing of + * existing extents that contain data. + */ + if (xfs_is_reflink_inode(ip) && + ((flags & IOMAP_WRITE) || + got.br_state != XFS_EXT_UNWRITTEN)) { xfs_trim_extent(&got, offset_fsb, end_fsb - offset_fsb); - error = xfs_reflink_reserve_cow(ip, &got, &shared); + error = xfs_reflink_reserve_cow(ip, &got); if (error) goto out_unlock; } @@ -555,6 +579,11 @@ xfs_file_iomap_begin_delay( goto done; } + if (flags & IOMAP_ZERO) { + xfs_hole_to_iomap(ip, iomap, offset_fsb, got.br_startoff); + goto out_unlock; + } + error = xfs_qm_dqattach_locked(ip, false); if (error) goto out_unlock; @@ -1003,16 +1032,17 @@ xfs_file_iomap_begin( struct xfs_bmbt_irec imap; xfs_fileoff_t offset_fsb, end_fsb; int nimaps = 1, error = 0; - bool shared = false, trimmed = false; + bool shared = false; unsigned lockmode; if (XFS_FORCED_SHUTDOWN(mp)) return -EIO; - if (((flags & (IOMAP_WRITE | IOMAP_DIRECT)) == IOMAP_WRITE) && + if ((flags & (IOMAP_WRITE | IOMAP_ZERO)) && !(flags & IOMAP_DIRECT) && !IS_DAX(inode) && !xfs_get_extsz_hint(ip)) { /* Reserve delalloc blocks for regular writeback. */ - return xfs_file_iomap_begin_delay(inode, offset, length, iomap); + return xfs_file_iomap_begin_delay(inode, offset, length, flags, + iomap); } /* @@ -1038,8 +1068,7 @@ xfs_file_iomap_begin( if (flags & IOMAP_REPORT) { /* Trim the mapping to the nearest shared extent boundary. */ - error = xfs_reflink_trim_around_shared(ip, &imap, &shared, - &trimmed); + error = xfs_reflink_trim_around_shared(ip, &imap, &shared); if (error) goto out_unlock; } @@ -1065,7 +1094,7 @@ xfs_file_iomap_begin( if (error) goto out_unlock; } else { - error = xfs_reflink_reserve_cow(ip, &imap, &shared); + error = xfs_reflink_reserve_cow(ip, &imap); if (error) goto out_unlock; } diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c @@ -182,8 +182,7 @@ int xfs_reflink_trim_around_shared( struct xfs_inode *ip, struct xfs_bmbt_irec *irec, - bool *shared, - bool *trimmed) + bool *shared) { xfs_agnumber_t agno; xfs_agblock_t agbno; @@ -209,7 +208,7 @@ xfs_reflink_trim_around_shared( if (error) return error; - *shared = *trimmed = false; + *shared = false; if (fbno == NULLAGBLOCK) { /* No shared blocks at all. */ return 0; @@ -222,8 +221,6 @@ xfs_reflink_trim_around_shared( */ irec->br_blockcount = flen; *shared = true; - if (flen != aglen) - *trimmed = true; return 0; } else { /* @@ -233,7 +230,6 @@ xfs_reflink_trim_around_shared( * start of the shared region. */ irec->br_blockcount = fbno - agbno; - *trimmed = true; return 0; } } @@ -241,7 +237,7 @@ xfs_reflink_trim_around_shared( /* * Trim the passed in imap to the next shared/unshared extent boundary, and * if imap->br_startoff points to a shared extent reserve space for it in the - * COW fork. In this case *shared is set to true, else to false. + * COW fork. * * Note that imap will always contain the block numbers for the existing blocks * in the data fork, as the upper layers need them for read-modify-write @@ -250,14 +246,14 @@ xfs_reflink_trim_around_shared( int xfs_reflink_reserve_cow( struct xfs_inode *ip, - struct xfs_bmbt_irec *imap, - bool *shared) + struct xfs_bmbt_irec *imap) { struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK); struct xfs_bmbt_irec got; int error = 0; - bool eof = false, trimmed; + bool eof = false; struct xfs_iext_cursor icur; + bool shared; /* * Search the COW fork extent list first. This serves two purposes: @@ -273,18 +269,16 @@ xfs_reflink_reserve_cow( if (!eof && got.br_startoff <= imap->br_startoff) { trace_xfs_reflink_cow_found(ip, imap); xfs_trim_extent(imap, got.br_startoff, got.br_blockcount); - - *shared = true; return 0; } /* Trim the mapping to the nearest shared extent boundary. */ - error = xfs_reflink_trim_around_shared(ip, imap, shared, &trimmed); + error = xfs_reflink_trim_around_shared(ip, imap, &shared); if (error) return error; /* Not shared? Just report the (potentially capped) extent. */ - if (!*shared) + if (!shared) return 0; /* @@ -368,7 +362,6 @@ xfs_find_trim_cow_extent( xfs_filblks_t count_fsb = imap->br_blockcount; struct xfs_iext_cursor icur; struct xfs_bmbt_irec got; - bool trimmed; *found = false; @@ -376,9 +369,13 @@ xfs_find_trim_cow_extent( * If we don't find an overlapping extent, trim the range we need to * allocate to fit the hole we found. */ - if (!xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &got) || - got.br_startoff > offset_fsb) - return xfs_reflink_trim_around_shared(ip, imap, shared, &trimmed); + if (!xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &got)) + got.br_startoff = offset_fsb + count_fsb; + if (got.br_startoff > offset_fsb) { + xfs_trim_extent(imap, imap->br_startoff, + got.br_startoff - imap->br_startoff); + return xfs_reflink_trim_around_shared(ip, imap, shared); + } *shared = true; if (isnullstartblock(got.br_startblock)) { diff --git a/fs/xfs/xfs_reflink.h b/fs/xfs/xfs_reflink.h @@ -10,10 +10,10 @@ extern int xfs_reflink_find_shared(struct xfs_mount *mp, struct xfs_trans *tp, xfs_agnumber_t agno, xfs_agblock_t agbno, xfs_extlen_t aglen, xfs_agblock_t *fbno, xfs_extlen_t *flen, bool find_maximal); extern int xfs_reflink_trim_around_shared(struct xfs_inode *ip, - struct xfs_bmbt_irec *irec, bool *shared, bool *trimmed); + struct xfs_bmbt_irec *irec, bool *shared); extern int xfs_reflink_reserve_cow(struct xfs_inode *ip, - struct xfs_bmbt_irec *imap, bool *shared); + struct xfs_bmbt_irec *imap); extern int xfs_reflink_allocate_cow(struct xfs_inode *ip, struct xfs_bmbt_irec *imap, bool *shared, uint *lockmode); extern int xfs_reflink_convert_cow(struct xfs_inode *ip, xfs_off_t offset, diff --git a/fs/xfs/xfs_stats.c b/fs/xfs/xfs_stats.c @@ -29,30 +29,30 @@ int xfs_stats_format(struct xfsstats __percpu *stats, char *buf) char *desc; int endpoint; } xstats[] = { - { "extent_alloc", XFSSTAT_END_EXTENT_ALLOC }, - { "abt", XFSSTAT_END_ALLOC_BTREE }, - { "blk_map", XFSSTAT_END_BLOCK_MAPPING }, - { "bmbt", XFSSTAT_END_BLOCK_MAP_BTREE }, - { "dir", XFSSTAT_END_DIRECTORY_OPS }, - { "trans", XFSSTAT_END_TRANSACTIONS }, - { "ig", XFSSTAT_END_INODE_OPS }, - { "log", XFSSTAT_END_LOG_OPS }, - { "push_ail", XFSSTAT_END_TAIL_PUSHING }, - { "xstrat", XFSSTAT_END_WRITE_CONVERT }, - { "rw", XFSSTAT_END_READ_WRITE_OPS }, - { "attr", XFSSTAT_END_ATTRIBUTE_OPS }, - { "icluster", XFSSTAT_END_INODE_CLUSTER }, - { "vnodes", XFSSTAT_END_VNODE_OPS }, - { "buf", XFSSTAT_END_BUF }, - { "abtb2", XFSSTAT_END_ABTB_V2 }, - { "abtc2", XFSSTAT_END_ABTC_V2 }, - { "bmbt2", XFSSTAT_END_BMBT_V2 }, - { "ibt2", XFSSTAT_END_IBT_V2 }, - { "fibt2", XFSSTAT_END_FIBT_V2 }, - { "rmapbt", XFSSTAT_END_RMAP_V2 }, - { "refcntbt", XFSSTAT_END_REFCOUNT }, + { "extent_alloc", xfsstats_offset(xs_abt_lookup) }, + { "abt", xfsstats_offset(xs_blk_mapr) }, + { "blk_map", xfsstats_offset(xs_bmbt_lookup) }, + { "bmbt", xfsstats_offset(xs_dir_lookup) }, + { "dir", xfsstats_offset(xs_trans_sync) }, + { "trans", xfsstats_offset(xs_ig_attempts) }, + { "ig", xfsstats_offset(xs_log_writes) }, + { "log", xfsstats_offset(xs_try_logspace)}, + { "push_ail", xfsstats_offset(xs_xstrat_quick)}, + { "xstrat", xfsstats_offset(xs_write_calls) }, + { "rw", xfsstats_offset(xs_attr_get) }, + { "attr", xfsstats_offset(xs_iflush_count)}, + { "icluster", xfsstats_offset(vn_active) }, + { "vnodes", xfsstats_offset(xb_get) }, + { "buf", xfsstats_offset(xs_abtb_2) }, + { "abtb2", xfsstats_offset(xs_abtc_2) }, + { "abtc2", xfsstats_offset(xs_bmbt_2) }, + { "bmbt2", xfsstats_offset(xs_ibt_2) }, + { "ibt2", xfsstats_offset(xs_fibt_2) }, + { "fibt2", xfsstats_offset(xs_rmap_2) }, + { "rmapbt", xfsstats_offset(xs_refcbt_2) }, + { "refcntbt", xfsstats_offset(xs_qm_dqreclaims)}, /* we print both series of quota information together */ - { "qm", XFSSTAT_END_QM }, + { "qm", xfsstats_offset(xs_xstrat_bytes)}, }; /* Loop over all stats groups */ @@ -104,6 +104,10 @@ void xfs_stats_clearall(struct xfsstats __percpu *stats) #ifdef CONFIG_PROC_FS /* legacy quota interfaces */ #ifdef CONFIG_XFS_QUOTA + +#define XFSSTAT_START_XQMSTAT xfsstats_offset(xs_qm_dqreclaims) +#define XFSSTAT_END_XQMSTAT xfsstats_offset(xs_qm_dquot) + static int xqm_proc_show(struct seq_file *m, void *v) { /* maximum; incore; ratio free to inuse; freelist */ @@ -119,7 +123,7 @@ static int xqmstat_proc_show(struct seq_file *m, void *v) int j; seq_printf(m, "qm"); - for (j = XFSSTAT_END_IBT_V2; j < XFSSTAT_END_XQMSTAT; j++) + for (j = XFSSTAT_START_XQMSTAT; j < XFSSTAT_END_XQMSTAT; j++) seq_printf(m, " %u", counter_val(xfsstats.xs_stats, j)); seq_putc(m, '\n'); return 0; diff --git a/fs/xfs/xfs_stats.h b/fs/xfs/xfs_stats.h @@ -41,17 +41,14 @@ enum { * XFS global statistics */ struct __xfsstats { -# define XFSSTAT_END_EXTENT_ALLOC 4 uint32_t xs_allocx; uint32_t xs_allocb; uint32_t xs_freex; uint32_t xs_freeb; -# define XFSSTAT_END_ALLOC_BTREE (XFSSTAT_END_EXTENT_ALLOC+4) uint32_t xs_abt_lookup; uint32_t xs_abt_compare; uint32_t xs_abt_insrec; uint32_t xs_abt_delrec; -# define XFSSTAT_END_BLOCK_MAPPING (XFSSTAT_END_ALLOC_BTREE+7) uint32_t xs_blk_mapr; uint32_t xs_blk_mapw; uint32_t xs_blk_unmap; @@ -59,21 +56,17 @@ struct __xfsstats { uint32_t xs_del_exlist; uint32_t xs_look_exlist; uint32_t xs_cmp_exlist; -# define XFSSTAT_END_BLOCK_MAP_BTREE (XFSSTAT_END_BLOCK_MAPPING+4) uint32_t xs_bmbt_lookup; uint32_t xs_bmbt_compare; uint32_t xs_bmbt_insrec; uint32_t xs_bmbt_delrec; -# define XFSSTAT_END_DIRECTORY_OPS (XFSSTAT_END_BLOCK_MAP_BTREE+4) uint32_t xs_dir_lookup; uint32_t xs_dir_create; uint32_t xs_dir_remove; uint32_t xs_dir_getdents; -# define XFSSTAT_END_TRANSACTIONS (XFSSTAT_END_DIRECTORY_OPS+3) uint32_t xs_trans_sync; uint32_t xs_trans_async; uint32_t xs_trans_empty; -# define XFSSTAT_END_INODE_OPS (XFSSTAT_END_TRANSACTIONS+7) uint32_t xs_ig_attempts; uint32_t xs_ig_found; uint32_t xs_ig_frecycle; @@ -81,13 +74,11 @@ struct __xfsstats { uint32_t xs_ig_dup; uint32_t xs_ig_reclaims; uint32_t xs_ig_attrchg; -# define XFSSTAT_END_LOG_OPS (XFSSTAT_END_INODE_OPS+5) uint32_t xs_log_writes; uint32_t xs_log_blocks; uint32_t xs_log_noiclogs; uint32_t xs_log_force; uint32_t xs_log_force_sleep; -# define XFSSTAT_END_TAIL_PUSHING (XFSSTAT_END_LOG_OPS+10) uint32_t xs_try_logspace; uint32_t xs_sleep_logspace; uint32_t xs_push_ail; @@ -98,22 +89,17 @@ struct __xfsstats { uint32_t xs_push_ail_flushing; uint32_t xs_push_ail_restarts; uint32_t xs_push_ail_flush; -# define XFSSTAT_END_WRITE_CONVERT (XFSSTAT_END_TAIL_PUSHING+2) uint32_t xs_xstrat_quick; uint32_t xs_xstrat_split; -# define XFSSTAT_END_READ_WRITE_OPS (XFSSTAT_END_WRITE_CONVERT+2) uint32_t xs_write_calls; uint32_t xs_read_calls; -# define XFSSTAT_END_ATTRIBUTE_OPS (XFSSTAT_END_READ_WRITE_OPS+4) uint32_t xs_attr_get; uint32_t xs_attr_set; uint32_t xs_attr_remove; uint32_t xs_attr_list; -# define XFSSTAT_END_INODE_CLUSTER (XFSSTAT_END_ATTRIBUTE_OPS+3) uint32_t xs_iflush_count; uint32_t xs_icluster_flushcnt; uint32_t xs_icluster_flushinode; -# define XFSSTAT_END_VNODE_OPS (XFSSTAT_END_INODE_CLUSTER+8) uint32_t vn_active; /* # vnodes not on free lists */ uint32_t vn_alloc; /* # times vn_alloc called */ uint32_t vn_get; /* # times vn_get called */ @@ -122,7 +108,6 @@ struct __xfsstats { uint32_t vn_reclaim; /* # times vn_reclaim called */ uint32_t vn_remove; /* # times vn_remove called */ uint32_t vn_free; /* # times vn_free called */ -#define XFSSTAT_END_BUF (XFSSTAT_END_VNODE_OPS+9) uint32_t xb_get; uint32_t xb_create; uint32_t xb_get_locked; @@ -133,28 +118,19 @@ struct __xfsstats { uint32_t xb_page_found; uint32_t xb_get_read; /* Version 2 btree counters */ -#define XFSSTAT_END_ABTB_V2 (XFSSTAT_END_BUF + __XBTS_MAX) uint32_t xs_abtb_2[__XBTS_MAX]; -#define XFSSTAT_END_ABTC_V2 (XFSSTAT_END_ABTB_V2 + __XBTS_MAX) uint32_t xs_abtc_2[__XBTS_MAX]; -#define XFSSTAT_END_BMBT_V2 (XFSSTAT_END_ABTC_V2 + __XBTS_MAX) uint32_t xs_bmbt_2[__XBTS_MAX]; -#define XFSSTAT_END_IBT_V2 (XFSSTAT_END_BMBT_V2 + __XBTS_MAX) uint32_t xs_ibt_2[__XBTS_MAX]; -#define XFSSTAT_END_FIBT_V2 (XFSSTAT_END_IBT_V2 + __XBTS_MAX) uint32_t xs_fibt_2[__XBTS_MAX]; -#define XFSSTAT_END_RMAP_V2 (XFSSTAT_END_FIBT_V2 + __XBTS_MAX) uint32_t xs_rmap_2[__XBTS_MAX]; -#define XFSSTAT_END_REFCOUNT (XFSSTAT_END_RMAP_V2 + __XBTS_MAX) uint32_t xs_refcbt_2[__XBTS_MAX]; -#define XFSSTAT_END_XQMSTAT (XFSSTAT_END_REFCOUNT + 6) uint32_t xs_qm_dqreclaims; uint32_t xs_qm_dqreclaim_misses; uint32_t xs_qm_dquot_dups; uint32_t xs_qm_dqcachemisses; uint32_t xs_qm_dqcachehits; uint32_t xs_qm_dqwants; -#define XFSSTAT_END_QM (XFSSTAT_END_XQMSTAT+2) uint32_t xs_qm_dquot; uint32_t xs_qm_dquot_unused; /* Extra precision counters */ @@ -163,10 +139,12 @@ struct __xfsstats { uint64_t xs_read_bytes; }; +#define xfsstats_offset(f) (offsetof(struct __xfsstats, f)/sizeof(uint32_t)) + struct xfsstats { union { struct __xfsstats s; - uint32_t a[XFSSTAT_END_XQMSTAT]; + uint32_t a[xfsstats_offset(xs_qm_dquot)]; }; }; diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c @@ -43,6 +43,7 @@ #include <linux/dax.h> #include <linux/init.h> #include <linux/slab.h> +#include <linux/magic.h> #include <linux/mount.h> #include <linux/mempool.h> #include <linux/writeback.h> @@ -933,6 +934,32 @@ xfs_fs_alloc_inode( return NULL; } +#ifdef DEBUG +static void +xfs_check_delalloc( + struct xfs_inode *ip, + int whichfork) +{ + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + struct xfs_bmbt_irec got; + struct xfs_iext_cursor icur; + + if (!ifp || !xfs_iext_lookup_extent(ip, ifp, 0, &icur, &got)) + return; + do { + if (isnullstartblock(got.br_startblock)) { + xfs_warn(ip->i_mount, + "ino %llx %s fork has delalloc extent at [0x%llx:0x%llx]", + ip->i_ino, + whichfork == XFS_DATA_FORK ? "data" : "cow", + got.br_startoff, got.br_blockcount); + } + } while (xfs_iext_next_extent(ifp, &icur, &got)); +} +#else +#define xfs_check_delalloc(ip, whichfork) do { } while (0) +#endif + /* * Now that the generic code is guaranteed not to be accessing * the linux inode, we can inactivate and reclaim the inode. @@ -951,7 +978,12 @@ xfs_fs_destroy_inode( xfs_inactive(ip); - ASSERT(XFS_FORCED_SHUTDOWN(ip->i_mount) || ip->i_delayed_blks == 0); + if (!XFS_FORCED_SHUTDOWN(ip->i_mount) && ip->i_delayed_blks) { + xfs_check_delalloc(ip, XFS_DATA_FORK); + xfs_check_delalloc(ip, XFS_COW_FORK); + ASSERT(0); + } + XFS_STATS_INC(ip->i_mount, vn_reclaim); /* @@ -1097,7 +1129,7 @@ xfs_fs_statfs( xfs_extlen_t lsize; int64_t ffree; - statp->f_type = XFS_SB_MAGIC; + statp->f_type = XFS_SUPER_MAGIC; statp->f_namelen = MAXNAMELEN - 1; id = huge_encode_dev(mp->m_ddev_targp->bt_dev); @@ -1650,7 +1682,7 @@ xfs_fs_fill_super( * we must configure the block size in the superblock before we run the * full mount process as the mount process can lookup and cache inodes. */ - sb->s_magic = XFS_SB_MAGIC; + sb->s_magic = XFS_SUPER_MAGIC; sb->s_blocksize = mp->m_sb.sb_blocksize; sb->s_blocksize_bits = ffs(sb->s_blocksize) - 1; sb->s_maxbytes = xfs_max_file_offset(sb->s_blocksize_bits); diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h @@ -220,6 +220,7 @@ void xfs_trans_ijoin(struct xfs_trans *, struct xfs_inode *, uint); void xfs_trans_log_buf(struct xfs_trans *, struct xfs_buf *, uint, uint); void xfs_trans_dirty_buf(struct xfs_trans *, struct xfs_buf *); +bool xfs_trans_buf_is_dirty(struct xfs_buf *bp); void xfs_trans_log_inode(xfs_trans_t *, struct xfs_inode *, uint); void xfs_extent_free_init_defer_op(void); diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c @@ -531,17 +531,33 @@ xfsaild( set_current_state(TASK_INTERRUPTIBLE); /* - * Check kthread_should_stop() after we set the task state - * to guarantee that we either see the stop bit and exit or - * the task state is reset to runnable such that it's not - * scheduled out indefinitely and detects the stop bit at - * next iteration. - * + * Check kthread_should_stop() after we set the task state to + * guarantee that we either see the stop bit and exit or the + * task state is reset to runnable such that it's not scheduled + * out indefinitely and detects the stop bit at next iteration. * A memory barrier is included in above task state set to * serialize again kthread_stop(). */ if (kthread_should_stop()) { __set_current_state(TASK_RUNNING); + + /* + * The caller forces out the AIL before stopping the + * thread in the common case, which means the delwri + * queue is drained. In the shutdown case, the queue may + * still hold relogged buffers that haven't been + * submitted because they were pinned since added to the + * queue. + * + * Log I/O error processing stales the underlying buffer + * and clears the delwri state, expecting the buf to be + * removed on the next submission attempt. That won't + * happen if we're shutting down, so this is the last + * opportunity to release such buffers from the queue. + */ + ASSERT(list_empty(&ailp->ail_buf_list) || + XFS_FORCED_SHUTDOWN(ailp->ail_mount)); + xfs_buf_delwri_cancel(&ailp->ail_buf_list); break; } diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c @@ -264,11 +264,39 @@ xfs_trans_read_buf_map( return -EIO; } + /* + * Check if the caller is trying to read a buffer that is + * already attached to the transaction yet has no buffer ops + * assigned. Ops are usually attached when the buffer is + * attached to the transaction, or by the read caller if + * special circumstances. That didn't happen, which is not + * how this is supposed to go. + * + * If the buffer passes verification we'll let this go, but if + * not we have to shut down. Let the transaction cleanup code + * release this buffer when it kills the tranaction. + */ + ASSERT(bp->b_ops != NULL); + error = xfs_buf_ensure_ops(bp, ops); + if (error) { + xfs_buf_ioerror_alert(bp, __func__); + + if (tp->t_flags & XFS_TRANS_DIRTY) + xfs_force_shutdown(tp->t_mountp, + SHUTDOWN_META_IO_ERROR); + + /* bad CRC means corrupted metadata */ + if (error == -EFSBADCRC) + error = -EFSCORRUPTED; + return error; + } + bip = bp->b_log_item; bip->bli_recur++; ASSERT(atomic_read(&bip->bli_refcount) > 0); trace_xfs_trans_read_buf_recur(bip); + ASSERT(bp->b_ops != NULL || ops == NULL); *bpp = bp; return 0; } @@ -316,11 +344,25 @@ xfs_trans_read_buf_map( _xfs_trans_bjoin(tp, bp, 1); trace_xfs_trans_read_buf(bp->b_log_item); } + ASSERT(bp->b_ops != NULL || ops == NULL); *bpp = bp; return 0; } +/* Has this buffer been dirtied by anyone? */ +bool +xfs_trans_buf_is_dirty( + struct xfs_buf *bp) +{ + struct xfs_buf_log_item *bip = bp->b_log_item; + + if (!bip) + return false; + ASSERT(bip->bli_item.li_type == XFS_LI_BUF); + return test_bit(XFS_LI_DIRTY, &bip->bli_item.li_flags); +} + /* * Release a buffer previously joined to the transaction. If the buffer is * modified within this transaction, decrement the recursion count but do not diff --git a/include/uapi/linux/magic.h b/include/uapi/linux/magic.h @@ -29,6 +29,7 @@ #define HPFS_SUPER_MAGIC 0xf995e849 #define ISOFS_SUPER_MAGIC 0x9660 #define JFFS2_SUPER_MAGIC 0x72b6 +#define XFS_SUPER_MAGIC 0x58465342 /* "XFSB" */ #define PSTOREFS_MAGIC 0x6165676C #define EFIVARFS_MAGIC 0xde5e81e4 #define HOSTFS_SUPER_MAGIC 0x00c0ffee