whiterose

linux unikernel
Log | Files | Refs | README | LICENSE | git clone https://git.ne02ptzero.me/git/whiterose

commit 1b0350c355b2f32bcbd453310bf6dfb50cbe965f
parent d2467adb66108f1b7fec8cadbdb8ca140e88effe
Author: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Date:   Thu,  4 Oct 2018 09:17:38 -0700

Merge tag 'xfs-fixes-for-4.19-rc6' of git://git.kernel.org/pub/scm/fs/xfs/xfs-linux

Dave writes:
  "XFS fixes for 4.19-rc6

   Accumlated regression and bug fixes for 4.19-rc6, including:

   o make iomap correctly mark dirty pages for sub-page block sizes
   o fix regression in handling extent-to-btree format conversion errors
   o fix torn log wrap detection for new logs
   o various corrupt inode detection fixes
   o various delalloc state fixes
   o cleanup all the missed transaction cancel cases missed from changes merged
     in 4.19-rc1
   o fix lockdep false positive on transaction allocation
   o fix locking and reference counting on buffer log items"

* tag 'xfs-fixes-for-4.19-rc6' of git://git.kernel.org/pub/scm/fs/xfs/xfs-linux:
  xfs: fix error handling in xfs_bmap_extents_to_btree
  iomap: set page dirty after partial delalloc on mkwrite
  xfs: remove invalid log recovery first/last cycle check
  xfs: validate inode di_forkoff
  xfs: skip delalloc COW blocks in xfs_reflink_end_cow
  xfs: don't treat unknown di_flags2 as corruption in scrub
  xfs: remove duplicated include from alloc.c
  xfs: don't bring in extents in xfs_bmap_punch_delalloc_range
  xfs: fix transaction leak in xfs_reflink_allocate_cow()
  xfs: avoid lockdep false positives in xfs_trans_alloc
  xfs: refactor xfs_buf_log_item reference count handling
  xfs: clean up xfs_trans_brelse()
  xfs: don't unlock invalidated buf on aborted tx commit
  xfs: remove last of unnecessary xfs_defer_cancel() callers
  xfs: don't crash the vfs on a garbage inline symlink

Diffstat:
Mfs/iomap.c | 2+-
Mfs/xfs/libxfs/xfs_attr.c | 28++++++++--------------------
Mfs/xfs/libxfs/xfs_attr_remote.c | 10++--------
Mfs/xfs/libxfs/xfs_bmap.c | 24+++++++++++++-----------
Mfs/xfs/libxfs/xfs_format.h | 2++
Mfs/xfs/libxfs/xfs_inode_buf.c | 30++++++++++++++++++++++++++++++
Mfs/xfs/scrub/alloc.c | 1-
Mfs/xfs/scrub/inode.c | 4+++-
Mfs/xfs/xfs_bmap_util.c | 20+++++++-------------
Mfs/xfs/xfs_buf_item.c | 119++++++++++++++++++++++++++++++++++++++++++-------------------------------------
Mfs/xfs/xfs_buf_item.h | 1+
Mfs/xfs/xfs_inode.c | 10+---------
Mfs/xfs/xfs_iops.c | 12+++++++++++-
Mfs/xfs/xfs_log_recover.c | 10----------
Mfs/xfs/xfs_reflink.c | 137+++++++++++++++++++++++++++++++++++++++++++++++--------------------------------
Mfs/xfs/xfs_trace.h | 1-
Mfs/xfs/xfs_trans.c | 10++++++++--
Mfs/xfs/xfs_trans_buf.c | 99+++++++++++++++++++------------------------------------------------------------
18 files changed, 256 insertions(+), 264 deletions(-)

diff --git a/fs/iomap.c b/fs/iomap.c @@ -1051,6 +1051,7 @@ iomap_page_mkwrite_actor(struct inode *inode, loff_t pos, loff_t length, } else { WARN_ON_ONCE(!PageUptodate(page)); iomap_page_create(inode, page); + set_page_dirty(page); } return length; @@ -1090,7 +1091,6 @@ int iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops) length -= ret; } - set_page_dirty(page); wait_for_stable_page(page); return VM_FAULT_LOCKED; out_unlock: diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c @@ -587,7 +587,7 @@ xfs_attr_leaf_addname( */ error = xfs_attr3_leaf_to_node(args); if (error) - goto out_defer_cancel; + return error; error = xfs_defer_finish(&args->trans); if (error) return error; @@ -675,7 +675,7 @@ xfs_attr_leaf_addname( error = xfs_attr3_leaf_to_shortform(bp, args, forkoff); /* bp is gone due to xfs_da_shrink_inode */ if (error) - goto out_defer_cancel; + return error; error = xfs_defer_finish(&args->trans); if (error) return error; @@ -693,9 +693,6 @@ xfs_attr_leaf_addname( error = xfs_attr3_leaf_clearflag(args); } return error; -out_defer_cancel: - xfs_defer_cancel(args->trans); - return error; } /* @@ -738,15 +735,12 @@ xfs_attr_leaf_removename( error = xfs_attr3_leaf_to_shortform(bp, args, forkoff); /* bp is gone due to xfs_da_shrink_inode */ if (error) - goto out_defer_cancel; + return error; error = xfs_defer_finish(&args->trans); if (error) return error; } return 0; -out_defer_cancel: - xfs_defer_cancel(args->trans); - return error; } /* @@ -864,7 +858,7 @@ restart: state = NULL; error = xfs_attr3_leaf_to_node(args); if (error) - goto out_defer_cancel; + goto out; error = xfs_defer_finish(&args->trans); if (error) goto out; @@ -888,7 +882,7 @@ restart: */ error = xfs_da3_split(state); if (error) - goto out_defer_cancel; + goto out; error = xfs_defer_finish(&args->trans); if (error) goto out; @@ -984,7 +978,7 @@ restart: if (retval && (state->path.active > 1)) { error = xfs_da3_join(state); if (error) - goto out_defer_cancel; + goto out; error = xfs_defer_finish(&args->trans); if (error) goto out; @@ -1013,9 +1007,6 @@ out: if (error) return error; return retval; -out_defer_cancel: - xfs_defer_cancel(args->trans); - goto out; } /* @@ -1107,7 +1098,7 @@ xfs_attr_node_removename( if (retval && (state->path.active > 1)) { error = xfs_da3_join(state); if (error) - goto out_defer_cancel; + goto out; error = xfs_defer_finish(&args->trans); if (error) goto out; @@ -1138,7 +1129,7 @@ xfs_attr_node_removename( error = xfs_attr3_leaf_to_shortform(bp, args, forkoff); /* bp is gone due to xfs_da_shrink_inode */ if (error) - goto out_defer_cancel; + goto out; error = xfs_defer_finish(&args->trans); if (error) goto out; @@ -1150,9 +1141,6 @@ xfs_attr_node_removename( out: xfs_da_state_free(state); return error; -out_defer_cancel: - xfs_defer_cancel(args->trans); - goto out; } /* diff --git a/fs/xfs/libxfs/xfs_attr_remote.c b/fs/xfs/libxfs/xfs_attr_remote.c @@ -485,7 +485,7 @@ xfs_attr_rmtval_set( blkcnt, XFS_BMAPI_ATTRFORK, args->total, &map, &nmap); if (error) - goto out_defer_cancel; + return error; error = xfs_defer_finish(&args->trans); if (error) return error; @@ -553,9 +553,6 @@ xfs_attr_rmtval_set( } ASSERT(valuelen == 0); return 0; -out_defer_cancel: - xfs_defer_cancel(args->trans); - return error; } /* @@ -625,7 +622,7 @@ xfs_attr_rmtval_remove( error = xfs_bunmapi(args->trans, args->dp, lblkno, blkcnt, XFS_BMAPI_ATTRFORK, 1, &done); if (error) - goto out_defer_cancel; + return error; error = xfs_defer_finish(&args->trans); if (error) return error; @@ -638,7 +635,4 @@ xfs_attr_rmtval_remove( return error; } return 0; -out_defer_cancel: - xfs_defer_cancel(args->trans); - return error; } diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c @@ -673,7 +673,8 @@ xfs_bmap_extents_to_btree( ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS); /* - * Make space in the inode incore. + * Make space in the inode incore. This needs to be undone if we fail + * to expand the root. */ xfs_iroot_realloc(ip, 1, whichfork); ifp->if_flags |= XFS_IFBROOT; @@ -711,16 +712,15 @@ xfs_bmap_extents_to_btree( args.minlen = args.maxlen = args.prod = 1; args.wasdel = wasdel; *logflagsp = 0; - if ((error = xfs_alloc_vextent(&args))) { - ASSERT(ifp->if_broot == NULL); - goto err1; - } + error = xfs_alloc_vextent(&args); + if (error) + goto out_root_realloc; if (WARN_ON_ONCE(args.fsbno == NULLFSBLOCK)) { - ASSERT(ifp->if_broot == NULL); error = -ENOSPC; - goto err1; + goto out_root_realloc; } + /* * Allocation can't fail, the space was reserved. */ @@ -732,9 +732,10 @@ xfs_bmap_extents_to_btree( xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, 1L); abp = xfs_btree_get_bufl(mp, tp, args.fsbno, 0); if (!abp) { - error = -ENOSPC; - goto err2; + error = -EFSCORRUPTED; + goto out_unreserve_dquot; } + /* * Fill in the child block. */ @@ -775,11 +776,12 @@ xfs_bmap_extents_to_btree( *logflagsp = XFS_ILOG_CORE | xfs_ilog_fbroot(whichfork); return 0; -err2: +out_unreserve_dquot: xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, -1L); -err1: +out_root_realloc: xfs_iroot_realloc(ip, -1, whichfork); XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_EXTENTS); + ASSERT(ifp->if_broot == NULL); xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); return error; diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h @@ -1016,6 +1016,8 @@ static inline void xfs_dinode_put_rdev(struct xfs_dinode *dip, xfs_dev_t rdev) #define XFS_DIFLAG_EXTSZINHERIT_BIT 12 /* inherit inode extent size */ #define XFS_DIFLAG_NODEFRAG_BIT 13 /* do not reorganize/defragment */ #define XFS_DIFLAG_FILESTREAM_BIT 14 /* use filestream allocator */ +/* Do not use bit 15, di_flags is legacy and unchanging now */ + #define XFS_DIFLAG_REALTIME (1 << XFS_DIFLAG_REALTIME_BIT) #define XFS_DIFLAG_PREALLOC (1 << XFS_DIFLAG_PREALLOC_BIT) #define XFS_DIFLAG_NEWRTBM (1 << XFS_DIFLAG_NEWRTBM_BIT) diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c @@ -415,6 +415,31 @@ xfs_dinode_verify_fork( return NULL; } +static xfs_failaddr_t +xfs_dinode_verify_forkoff( + struct xfs_dinode *dip, + struct xfs_mount *mp) +{ + if (!XFS_DFORK_Q(dip)) + return NULL; + + switch (dip->di_format) { + case XFS_DINODE_FMT_DEV: + if (dip->di_forkoff != (roundup(sizeof(xfs_dev_t), 8) >> 3)) + return __this_address; + break; + case XFS_DINODE_FMT_LOCAL: /* fall through ... */ + case XFS_DINODE_FMT_EXTENTS: /* fall through ... */ + case XFS_DINODE_FMT_BTREE: + if (dip->di_forkoff >= (XFS_LITINO(mp, dip->di_version) >> 3)) + return __this_address; + break; + default: + return __this_address; + } + return NULL; +} + xfs_failaddr_t xfs_dinode_verify( struct xfs_mount *mp, @@ -470,6 +495,11 @@ xfs_dinode_verify( if (mode && (flags & XFS_DIFLAG_REALTIME) && !mp->m_rtdev_targp) return __this_address; + /* check for illegal values of forkoff */ + fa = xfs_dinode_verify_forkoff(dip, mp); + if (fa) + return fa; + /* Do we have appropriate data fork formats for the mode? */ switch (mode & S_IFMT) { case S_IFIFO: diff --git a/fs/xfs/scrub/alloc.c b/fs/xfs/scrub/alloc.c @@ -17,7 +17,6 @@ #include "xfs_sb.h" #include "xfs_alloc.h" #include "xfs_rmap.h" -#include "xfs_alloc.h" #include "scrub/xfs_scrub.h" #include "scrub/scrub.h" #include "scrub/common.h" diff --git a/fs/xfs/scrub/inode.c b/fs/xfs/scrub/inode.c @@ -126,6 +126,7 @@ xchk_inode_flags( { struct xfs_mount *mp = sc->mp; + /* di_flags are all taken, last bit cannot be used */ if (flags & ~XFS_DIFLAG_ANY) goto bad; @@ -172,8 +173,9 @@ xchk_inode_flags2( { struct xfs_mount *mp = sc->mp; + /* Unknown di_flags2 could be from a future kernel */ if (flags2 & ~XFS_DIFLAG2_ANY) - goto bad; + xchk_ino_set_warning(sc, ino); /* reflink flag requires reflink feature */ if ((flags2 & XFS_DIFLAG2_REFLINK) && diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c @@ -702,13 +702,9 @@ xfs_bmap_punch_delalloc_range( struct xfs_iext_cursor icur; int error = 0; - xfs_ilock(ip, XFS_ILOCK_EXCL); - if (!(ifp->if_flags & XFS_IFEXTENTS)) { - error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK); - if (error) - goto out_unlock; - } + ASSERT(ifp->if_flags & XFS_IFEXTENTS); + xfs_ilock(ip, XFS_ILOCK_EXCL); if (!xfs_iext_lookup_extent_before(ip, ifp, &end_fsb, &icur, &got)) goto out_unlock; @@ -1584,7 +1580,7 @@ xfs_swap_extent_rmap( tirec.br_blockcount, &irec, &nimaps, 0); if (error) - goto out_defer; + goto out; ASSERT(nimaps == 1); ASSERT(tirec.br_startoff == irec.br_startoff); trace_xfs_swap_extent_rmap_remap_piece(ip, &irec); @@ -1599,22 +1595,22 @@ xfs_swap_extent_rmap( /* Remove the mapping from the donor file. */ error = xfs_bmap_unmap_extent(tp, tip, &uirec); if (error) - goto out_defer; + goto out; /* Remove the mapping from the source file. */ error = xfs_bmap_unmap_extent(tp, ip, &irec); if (error) - goto out_defer; + goto out; /* Map the donor file's blocks into the source file. */ error = xfs_bmap_map_extent(tp, ip, &uirec); if (error) - goto out_defer; + goto out; /* Map the source file's blocks into the donor file. */ error = xfs_bmap_map_extent(tp, tip, &irec); if (error) - goto out_defer; + goto out; error = xfs_defer_finish(tpp); tp = *tpp; @@ -1636,8 +1632,6 @@ xfs_swap_extent_rmap( tip->i_d.di_flags2 = tip_flags2; return 0; -out_defer: - xfs_defer_cancel(tp); out: trace_xfs_swap_extent_rmap_error(ip, error, _RET_IP_); tip->i_d.di_flags2 = tip_flags2; diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c @@ -532,6 +532,49 @@ xfs_buf_item_push( } /* + * Drop the buffer log item refcount and take appropriate action. This helper + * determines whether the bli must be freed or not, since a decrement to zero + * does not necessarily mean the bli is unused. + * + * Return true if the bli is freed, false otherwise. + */ +bool +xfs_buf_item_put( + struct xfs_buf_log_item *bip) +{ + struct xfs_log_item *lip = &bip->bli_item; + bool aborted; + bool dirty; + + /* drop the bli ref and return if it wasn't the last one */ + if (!atomic_dec_and_test(&bip->bli_refcount)) + return false; + + /* + * We dropped the last ref and must free the item if clean or aborted. + * If the bli is dirty and non-aborted, the buffer was clean in the + * transaction but still awaiting writeback from previous changes. In + * that case, the bli is freed on buffer writeback completion. + */ + aborted = test_bit(XFS_LI_ABORTED, &lip->li_flags) || + XFS_FORCED_SHUTDOWN(lip->li_mountp); + dirty = bip->bli_flags & XFS_BLI_DIRTY; + if (dirty && !aborted) + return false; + + /* + * The bli is aborted or clean. An aborted item may be in the AIL + * regardless of dirty state. For example, consider an aborted + * transaction that invalidated a dirty bli and cleared the dirty + * state. + */ + if (aborted) + xfs_trans_ail_remove(lip, SHUTDOWN_LOG_IO_ERROR); + xfs_buf_item_relse(bip->bli_buf); + return true; +} + +/* * Release the buffer associated with the buf log item. If there is no dirty * logged data associated with the buffer recorded in the buf log item, then * free the buf log item and remove the reference to it in the buffer. @@ -556,76 +599,42 @@ xfs_buf_item_unlock( { struct xfs_buf_log_item *bip = BUF_ITEM(lip); struct xfs_buf *bp = bip->bli_buf; - bool aborted; - bool hold = !!(bip->bli_flags & XFS_BLI_HOLD); - bool dirty = !!(bip->bli_flags & XFS_BLI_DIRTY); + bool released; + bool hold = bip->bli_flags & XFS_BLI_HOLD; + bool stale = bip->bli_flags & XFS_BLI_STALE; #if defined(DEBUG) || defined(XFS_WARN) - bool ordered = !!(bip->bli_flags & XFS_BLI_ORDERED); + bool ordered = bip->bli_flags & XFS_BLI_ORDERED; + bool dirty = bip->bli_flags & XFS_BLI_DIRTY; #endif - aborted = test_bit(XFS_LI_ABORTED, &lip->li_flags); - - /* Clear the buffer's association with this transaction. */ - bp->b_transp = NULL; - - /* - * The per-transaction state has been copied above so clear it from the - * bli. - */ - bip->bli_flags &= ~(XFS_BLI_LOGGED | XFS_BLI_HOLD | XFS_BLI_ORDERED); - - /* - * If the buf item is marked stale, then don't do anything. We'll - * unlock the buffer and free the buf item when the buffer is unpinned - * for the last time. - */ - if (bip->bli_flags & XFS_BLI_STALE) { - trace_xfs_buf_item_unlock_stale(bip); - ASSERT(bip->__bli_format.blf_flags & XFS_BLF_CANCEL); - if (!aborted) { - atomic_dec(&bip->bli_refcount); - return; - } - } - trace_xfs_buf_item_unlock(bip); /* - * If the buf item isn't tracking any data, free it, otherwise drop the - * reference we hold to it. If we are aborting the transaction, this may - * be the only reference to the buf item, so we free it anyway - * regardless of whether it is dirty or not. A dirty abort implies a - * shutdown, anyway. - * * The bli dirty state should match whether the blf has logged segments * except for ordered buffers, where only the bli should be dirty. */ ASSERT((!ordered && dirty == xfs_buf_item_dirty_format(bip)) || (ordered && dirty && !xfs_buf_item_dirty_format(bip))); + ASSERT(!stale || (bip->__bli_format.blf_flags & XFS_BLF_CANCEL)); /* - * Clean buffers, by definition, cannot be in the AIL. However, aborted - * buffers may be in the AIL regardless of dirty state. An aborted - * transaction that invalidates a buffer already in the AIL may have - * marked it stale and cleared the dirty state, for example. - * - * Therefore if we are aborting a buffer and we've just taken the last - * reference away, we have to check if it is in the AIL before freeing - * it. We need to free it in this case, because an aborted transaction - * has already shut the filesystem down and this is the last chance we - * will have to do so. + * Clear the buffer's association with this transaction and + * per-transaction state from the bli, which has been copied above. */ - if (atomic_dec_and_test(&bip->bli_refcount)) { - if (aborted) { - ASSERT(XFS_FORCED_SHUTDOWN(lip->li_mountp)); - xfs_trans_ail_remove(lip, SHUTDOWN_LOG_IO_ERROR); - xfs_buf_item_relse(bp); - } else if (!dirty) - xfs_buf_item_relse(bp); - } + bp->b_transp = NULL; + bip->bli_flags &= ~(XFS_BLI_LOGGED | XFS_BLI_HOLD | XFS_BLI_ORDERED); - if (!hold) - xfs_buf_relse(bp); + /* + * Unref the item and unlock the buffer unless held or stale. Stale + * buffers remain locked until final unpin unless the bli is freed by + * the unref call. The latter implies shutdown because buffer + * invalidation dirties the bli and transaction. + */ + released = xfs_buf_item_put(bip); + if (hold || (stale && !released)) + return; + ASSERT(!stale || test_bit(XFS_LI_ABORTED, &lip->li_flags)); + xfs_buf_relse(bp); } /* diff --git a/fs/xfs/xfs_buf_item.h b/fs/xfs/xfs_buf_item.h @@ -51,6 +51,7 @@ struct xfs_buf_log_item { int xfs_buf_item_init(struct xfs_buf *, struct xfs_mount *); void xfs_buf_item_relse(struct xfs_buf *); +bool xfs_buf_item_put(struct xfs_buf_log_item *); void xfs_buf_item_log(struct xfs_buf_log_item *, uint, uint); bool xfs_buf_item_dirty_format(struct xfs_buf_log_item *); void xfs_buf_attach_iodone(struct xfs_buf *, diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c @@ -1563,7 +1563,7 @@ xfs_itruncate_extents_flags( error = xfs_bunmapi(tp, ip, first_unmap_block, unmap_len, flags, XFS_ITRUNC_MAX_EXTENTS, &done); if (error) - goto out_bmap_cancel; + goto out; /* * Duplicate the transaction that has the permanent @@ -1599,14 +1599,6 @@ xfs_itruncate_extents_flags( out: *tpp = tp; return error; -out_bmap_cancel: - /* - * If the bunmapi call encounters an error, return to the caller where - * the transaction can be properly aborted. We just need to make sure - * we're not holding any resources that we were not when we came in. - */ - xfs_defer_cancel(tp); - goto out; } int diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c @@ -471,8 +471,18 @@ xfs_vn_get_link_inline( struct inode *inode, struct delayed_call *done) { + char *link; + ASSERT(XFS_I(inode)->i_df.if_flags & XFS_IFINLINE); - return XFS_I(inode)->i_df.if_u1.if_data; + + /* + * The VFS crashes on a NULL pointer, so return -EFSCORRUPTED if + * if_data is junk. + */ + link = XFS_I(inode)->i_df.if_u1.if_data; + if (!link) + return ERR_PTR(-EFSCORRUPTED); + return link; } STATIC int diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c @@ -1570,16 +1570,6 @@ xlog_find_zeroed( if (last_cycle != 0) { /* log completely written to */ xlog_put_bp(bp); return 0; - } else if (first_cycle != 1) { - /* - * If the cycle of the last block is zero, the cycle of - * the first block must be 1. If it's not, maybe we're - * not looking at a log... Bail out. - */ - xfs_warn(log->l_mp, - "Log inconsistent or not a log (last==0, first!=1)"); - error = -EINVAL; - goto bp_err; } /* we have a partially zeroed log */ diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c @@ -352,6 +352,47 @@ xfs_reflink_convert_cow( return error; } +/* + * Find the extent that maps the given range in the COW fork. Even if the extent + * is not shared we might have a preallocation for it in the COW fork. If so we + * use it that rather than trigger a new allocation. + */ +static int +xfs_find_trim_cow_extent( + struct xfs_inode *ip, + struct xfs_bmbt_irec *imap, + bool *shared, + bool *found) +{ + xfs_fileoff_t offset_fsb = imap->br_startoff; + xfs_filblks_t count_fsb = imap->br_blockcount; + struct xfs_iext_cursor icur; + struct xfs_bmbt_irec got; + bool trimmed; + + *found = false; + + /* + * If we don't find an overlapping extent, trim the range we need to + * allocate to fit the hole we found. + */ + if (!xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &got) || + got.br_startoff > offset_fsb) + return xfs_reflink_trim_around_shared(ip, imap, shared, &trimmed); + + *shared = true; + if (isnullstartblock(got.br_startblock)) { + xfs_trim_extent(imap, got.br_startoff, got.br_blockcount); + return 0; + } + + /* real extent found - no need to allocate */ + xfs_trim_extent(&got, offset_fsb, count_fsb); + *imap = got; + *found = true; + return 0; +} + /* Allocate all CoW reservations covering a range of blocks in a file. */ int xfs_reflink_allocate_cow( @@ -363,78 +404,64 @@ xfs_reflink_allocate_cow( struct xfs_mount *mp = ip->i_mount; xfs_fileoff_t offset_fsb = imap->br_startoff; xfs_filblks_t count_fsb = imap->br_blockcount; - struct xfs_bmbt_irec got; - struct xfs_trans *tp = NULL; + struct xfs_trans *tp; int nimaps, error = 0; - bool trimmed; + bool found; xfs_filblks_t resaligned; xfs_extlen_t resblks = 0; - struct xfs_iext_cursor icur; -retry: - ASSERT(xfs_is_reflink_inode(ip)); ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + ASSERT(xfs_is_reflink_inode(ip)); - /* - * Even if the extent is not shared we might have a preallocation for - * it in the COW fork. If so use it. - */ - if (xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &got) && - got.br_startoff <= offset_fsb) { - *shared = true; - - /* If we have a real allocation in the COW fork we're done. */ - if (!isnullstartblock(got.br_startblock)) { - xfs_trim_extent(&got, offset_fsb, count_fsb); - *imap = got; - goto convert; - } + error = xfs_find_trim_cow_extent(ip, imap, shared, &found); + if (error || !*shared) + return error; + if (found) + goto convert; - xfs_trim_extent(imap, got.br_startoff, got.br_blockcount); - } else { - error = xfs_reflink_trim_around_shared(ip, imap, shared, &trimmed); - if (error || !*shared) - goto out; - } + resaligned = xfs_aligned_fsb_count(imap->br_startoff, + imap->br_blockcount, xfs_get_cowextsz_hint(ip)); + resblks = XFS_DIOSTRAT_SPACE_RES(mp, resaligned); - if (!tp) { - resaligned = xfs_aligned_fsb_count(imap->br_startoff, - imap->br_blockcount, xfs_get_cowextsz_hint(ip)); - resblks = XFS_DIOSTRAT_SPACE_RES(mp, resaligned); + xfs_iunlock(ip, *lockmode); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, 0, &tp); + *lockmode = XFS_ILOCK_EXCL; + xfs_ilock(ip, *lockmode); - xfs_iunlock(ip, *lockmode); - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, 0, &tp); - *lockmode = XFS_ILOCK_EXCL; - xfs_ilock(ip, *lockmode); + if (error) + return error; - if (error) - return error; + error = xfs_qm_dqattach_locked(ip, false); + if (error) + goto out_trans_cancel; - error = xfs_qm_dqattach_locked(ip, false); - if (error) - goto out; - goto retry; + /* + * Check for an overlapping extent again now that we dropped the ilock. + */ + error = xfs_find_trim_cow_extent(ip, imap, shared, &found); + if (error || !*shared) + goto out_trans_cancel; + if (found) { + xfs_trans_cancel(tp); + goto convert; } error = xfs_trans_reserve_quota_nblks(tp, ip, resblks, 0, XFS_QMOPT_RES_REGBLKS); if (error) - goto out; + goto out_trans_cancel; xfs_trans_ijoin(tp, ip, 0); - nimaps = 1; - /* Allocate the entire reservation as unwritten blocks. */ + nimaps = 1; error = xfs_bmapi_write(tp, ip, imap->br_startoff, imap->br_blockcount, XFS_BMAPI_COWFORK | XFS_BMAPI_PREALLOC, resblks, imap, &nimaps); if (error) - goto out_trans_cancel; + goto out_unreserve; xfs_inode_set_cowblocks_tag(ip); - - /* Finish up. */ error = xfs_trans_commit(tp); if (error) return error; @@ -447,12 +474,12 @@ retry: return -ENOSPC; convert: return xfs_reflink_convert_cow_extent(ip, imap, offset_fsb, count_fsb); -out_trans_cancel: + +out_unreserve: xfs_trans_unreserve_quota_nblks(tp, ip, (long)resblks, 0, XFS_QMOPT_RES_REGBLKS); -out: - if (tp) - xfs_trans_cancel(tp); +out_trans_cancel: + xfs_trans_cancel(tp); return error; } @@ -666,14 +693,12 @@ xfs_reflink_end_cow( if (!del.br_blockcount) goto prev_extent; - ASSERT(!isnullstartblock(got.br_startblock)); - /* - * Don't remap unwritten extents; these are - * speculatively preallocated CoW extents that have been - * allocated but have not yet been involved in a write. + * Only remap real extent that contain data. With AIO + * speculatively preallocations can leak into the range we + * are called upon, and we need to skip them. */ - if (got.br_state == XFS_EXT_UNWRITTEN) + if (!xfs_bmap_is_real_extent(&got)) goto prev_extent; /* Unmap the old blocks in the data fork. */ diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h @@ -473,7 +473,6 @@ DEFINE_BUF_ITEM_EVENT(xfs_buf_item_pin); DEFINE_BUF_ITEM_EVENT(xfs_buf_item_unpin); DEFINE_BUF_ITEM_EVENT(xfs_buf_item_unpin_stale); DEFINE_BUF_ITEM_EVENT(xfs_buf_item_unlock); -DEFINE_BUF_ITEM_EVENT(xfs_buf_item_unlock_stale); DEFINE_BUF_ITEM_EVENT(xfs_buf_item_committed); DEFINE_BUF_ITEM_EVENT(xfs_buf_item_push); DEFINE_BUF_ITEM_EVENT(xfs_trans_get_buf); diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c @@ -259,6 +259,14 @@ xfs_trans_alloc( struct xfs_trans *tp; int error; + /* + * Allocate the handle before we do our freeze accounting and setting up + * GFP_NOFS allocation context so that we avoid lockdep false positives + * by doing GFP_KERNEL allocations inside sb_start_intwrite(). + */ + tp = kmem_zone_zalloc(xfs_trans_zone, + (flags & XFS_TRANS_NOFS) ? KM_NOFS : KM_SLEEP); + if (!(flags & XFS_TRANS_NO_WRITECOUNT)) sb_start_intwrite(mp->m_super); @@ -270,8 +278,6 @@ xfs_trans_alloc( mp->m_super->s_writers.frozen == SB_FREEZE_COMPLETE); atomic_inc(&mp->m_active_trans); - tp = kmem_zone_zalloc(xfs_trans_zone, - (flags & XFS_TRANS_NOFS) ? KM_NOFS : KM_SLEEP); tp->t_magic = XFS_TRANS_HEADER_MAGIC; tp->t_flags = flags; tp->t_mountp = mp; diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c @@ -322,49 +322,38 @@ xfs_trans_read_buf_map( } /* - * Release the buffer bp which was previously acquired with one of the - * xfs_trans_... buffer allocation routines if the buffer has not - * been modified within this transaction. If the buffer is modified - * within this transaction, do decrement the recursion count but do - * not release the buffer even if the count goes to 0. If the buffer is not - * modified within the transaction, decrement the recursion count and - * release the buffer if the recursion count goes to 0. + * Release a buffer previously joined to the transaction. If the buffer is + * modified within this transaction, decrement the recursion count but do not + * release the buffer even if the count goes to 0. If the buffer is not modified + * within the transaction, decrement the recursion count and release the buffer + * if the recursion count goes to 0. * - * If the buffer is to be released and it was not modified before - * this transaction began, then free the buf_log_item associated with it. + * If the buffer is to be released and it was not already dirty before this + * transaction began, then also free the buf_log_item associated with it. * - * If the transaction pointer is NULL, make this just a normal - * brelse() call. + * If the transaction pointer is NULL, this is a normal xfs_buf_relse() call. */ void xfs_trans_brelse( - xfs_trans_t *tp, - xfs_buf_t *bp) + struct xfs_trans *tp, + struct xfs_buf *bp) { - struct xfs_buf_log_item *bip; - int freed; + struct xfs_buf_log_item *bip = bp->b_log_item; - /* - * Default to a normal brelse() call if the tp is NULL. - */ - if (tp == NULL) { - ASSERT(bp->b_transp == NULL); + ASSERT(bp->b_transp == tp); + + if (!tp) { xfs_buf_relse(bp); return; } - ASSERT(bp->b_transp == tp); - bip = bp->b_log_item; + trace_xfs_trans_brelse(bip); ASSERT(bip->bli_item.li_type == XFS_LI_BUF); - ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); - ASSERT(!(bip->__bli_format.blf_flags & XFS_BLF_CANCEL)); ASSERT(atomic_read(&bip->bli_refcount) > 0); - trace_xfs_trans_brelse(bip); - /* - * If the release is just for a recursive lock, - * then decrement the count and return. + * If the release is for a recursive lookup, then decrement the count + * and return. */ if (bip->bli_recur > 0) { bip->bli_recur--; @@ -372,64 +361,24 @@ xfs_trans_brelse( } /* - * If the buffer is dirty within this transaction, we can't + * If the buffer is invalidated or dirty in this transaction, we can't * release it until we commit. */ if (test_bit(XFS_LI_DIRTY, &bip->bli_item.li_flags)) return; - - /* - * If the buffer has been invalidated, then we can't release - * it until the transaction commits to disk unless it is re-dirtied - * as part of this transaction. This prevents us from pulling - * the item from the AIL before we should. - */ if (bip->bli_flags & XFS_BLI_STALE) return; - ASSERT(!(bip->bli_flags & XFS_BLI_LOGGED)); - /* - * Free up the log item descriptor tracking the released item. + * Unlink the log item from the transaction and clear the hold flag, if + * set. We wouldn't want the next user of the buffer to get confused. */ + ASSERT(!(bip->bli_flags & XFS_BLI_LOGGED)); xfs_trans_del_item(&bip->bli_item); + bip->bli_flags &= ~XFS_BLI_HOLD; - /* - * Clear the hold flag in the buf log item if it is set. - * We wouldn't want the next user of the buffer to - * get confused. - */ - if (bip->bli_flags & XFS_BLI_HOLD) { - bip->bli_flags &= ~XFS_BLI_HOLD; - } - - /* - * Drop our reference to the buf log item. - */ - freed = atomic_dec_and_test(&bip->bli_refcount); - - /* - * If the buf item is not tracking data in the log, then we must free it - * before releasing the buffer back to the free pool. - * - * If the fs has shutdown and we dropped the last reference, it may fall - * on us to release a (possibly dirty) bli if it never made it to the - * AIL (e.g., the aborted unpin already happened and didn't release it - * due to our reference). Since we're already shutdown and need - * ail_lock, just force remove from the AIL and release the bli here. - */ - if (XFS_FORCED_SHUTDOWN(tp->t_mountp) && freed) { - xfs_trans_ail_remove(&bip->bli_item, SHUTDOWN_LOG_IO_ERROR); - xfs_buf_item_relse(bp); - } else if (!(bip->bli_flags & XFS_BLI_DIRTY)) { -/*** - ASSERT(bp->b_pincount == 0); -***/ - ASSERT(atomic_read(&bip->bli_refcount) == 0); - ASSERT(!test_bit(XFS_LI_IN_AIL, &bip->bli_item.li_flags)); - ASSERT(!(bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF)); - xfs_buf_item_relse(bp); - } + /* drop the reference to the bli */ + xfs_buf_item_put(bip); bp->b_transp = NULL; xfs_buf_relse(bp);