whiterose

Linux unikernel

commit abe72ff4134028ff2189d29629c40a40bee0a989
parent 7c98a42618271210c60b79128b220107d35938d9
Author: Linus Torvalds <torvalds@linux-foundation.org>
Date:   Sat, 24 Nov 2018 09:11:52 -0800

Merge tag 'xfs-4.20-fixes-2' of git://git.kernel.org/pub/scm/fs/xfs/xfs-linux

Pull xfs fixes from Darrick Wong:
 "Dave and I have continued our work fixing corruption problems that can
  be found when running long-term burn-in exercisers on xfs. Here are
  some patches fixing most of the problems, but there will likely be
  more. :/

   - Numerous corruption fixes for copy on write

   - Numerous corruption fixes for blocksize < pagesize writes

   - Don't miscalculate AG reservations for small final AGs

   - Fix page cache truncation to work properly for reflink and extent
     shifting

   - Fix use-after-free when retrying failed inode/dquot buffer logging

   - Fix corruptions seen when using copy_file_range in directio mode"
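
The use-after-free item above is purely an ordering problem: the failed log
items may hold the only references to the buffer, so clearing their failed
state before requeuing can drop the last reference while the buffer is still
needed. A minimal userspace model of that hazard (buf, delwri_queue() and
clear_failed_items() are hypothetical stand-ins for illustration, not the
kernel API):

    #include <assert.h>
    #include <stdio.h>
    #include <stdlib.h>

    /* Hypothetical model: each failed log item pins the buffer with one ref. */
    struct buf {
        int refcount;
        int nfailed;    /* failed log items still pointing at us */
    };

    static void buf_put(struct buf *bp)
    {
        if (--bp->refcount == 0) {
            printf("buffer freed\n");
            free(bp);
        }
    }

    /* Clearing an item's failed state drops that item's buffer reference. */
    static void clear_failed_items(struct buf *bp)
    {
        while (bp->nfailed-- > 0)
            buf_put(bp);
    }

    /* Queuing the buffer for delayed-write IO takes its own reference. */
    static void delwri_queue(struct buf *bp)
    {
        bp->refcount++;
    }

    static void resubmit_fixed(struct buf *bp)
    {
        /*
         * Queue first: the delwri list now holds a reference, so dropping
         * the failed-item references below can no longer free the buffer.
         * Doing these two steps in the opposite order is the use-after-free:
         * the refcount transiently hits zero before the queueing step runs.
         */
        delwri_queue(bp);
        clear_failed_items(bp);
    }

    int main(void)
    {
        struct buf *bp = malloc(sizeof(*bp));

        bp->nfailed = 2;
        bp->refcount = bp->nfailed;    /* only the failed items pin it */
        resubmit_fixed(bp);
        assert(bp->refcount == 1);     /* the delwri reference survives */
        buf_put(bp);                   /* IO completion drops the last ref */
        return 0;
    }

Queuing first keeps the reference count above zero throughout, which is
exactly the reordering applied to xfs_buf_resubmit_failed_buffers() in the
diff below.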

* tag 'xfs-4.20-fixes-2' of git://git.kernel.org/pub/scm/fs/xfs/xfs-linux:
  iomap: readpages doesn't zero page tail beyond EOF
  vfs: vfs_dedupe_file_range() doesn't return EOPNOTSUPP
  iomap: dio data corruption and spurious errors when pipes fill
  iomap: sub-block dio needs to zeroout beyond EOF
  iomap: FUA is wrong for DIO O_DSYNC writes into unwritten extents
  xfs: delalloc -> unwritten COW fork allocation can go wrong
  xfs: flush removing page cache in xfs_reflink_remap_prep
  xfs: extent shifting doesn't fully invalidate page cache
  xfs: finobt AG reserves don't consider last AG can be a runt
  xfs: fix transient reference count error in xfs_buf_resubmit_failed_buffers
  xfs: uncached buffer tracing needs to print bno
  xfs: make xfs_file_remap_range() static
  xfs: fix shared extent data corruption due to missing cow reservation
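
Several of the sub-block direct IO fixes above reduce to one invariant: a
write that ends partway into a filesystem block must have the rest of that
block zeroed, or stale on-disk data can become visible. A standalone sketch
of the arithmetic (tail_zero_len() is a hypothetical helper, assuming a
power-of-two block size, mirroring the pad computation in the iomap diff
below):

    #include <stdint.h>
    #include <stdio.h>

    /*
     * Hypothetical helper mirroring the tail-zeroing arithmetic in the
     * iomap hunks below: given a write that ends at byte offset "pos", how
     * many bytes remain to the next filesystem block boundary? Assumes a
     * power-of-two block size; returns 0 for a block-aligned end.
     */
    static uint64_t tail_zero_len(uint64_t pos, uint64_t fs_block_size)
    {
        uint64_t pad = pos & (fs_block_size - 1);

        return pad ? fs_block_size - pad : 0;
    }

    int main(void)
    {
        /* 4096-byte blocks: a write ending at 5000 leaves 3192 stale bytes. */
        printf("%llu\n", (unsigned long long)tail_zero_len(5000, 4096));
        /* Block-aligned end: nothing to zero. */
        printf("%llu\n", (unsigned long long)tail_zero_len(8192, 4096));
        return 0;
    }

The same computation has to run whether the IO completed or stopped short,
which is why the patch routes the bio_iov_iter_get_pages() failure path
through the zeroing label rather than returning early.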

Diffstat:
M fs/iomap.c                       | 53 +++++++++++++++++++++++++++++++++++++++++++++------------
M fs/read_write.c                  | 15 +++++++--------
M fs/xfs/libxfs/xfs_bmap.c         |  5 ++++-
M fs/xfs/libxfs/xfs_ialloc_btree.c | 11 +++++++----
M fs/xfs/xfs_bmap_util.c           | 10 ++--------
M fs/xfs/xfs_bmap_util.h           |  3 +++
M fs/xfs/xfs_buf_item.c            | 28 +++++++++++++++++++++-------
M fs/xfs/xfs_file.c                |  2 +-
M fs/xfs/xfs_reflink.c             | 18 ++++++++++++++----
M fs/xfs/xfs_trace.h               |  5 ++++-
10 files changed, 104 insertions(+), 46 deletions(-)
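
A note on the finobt change in fs/xfs/libxfs/xfs_ialloc_btree.c below:
reservations were sized from the superblock's nominal AG size, but the last
AG gets only whatever remains of the device and can be smaller. A sketch of
the per-AG sizing idea (ag_block_count() here is illustrative, not the
kernel helper):

    #include <stdint.h>
    #include <stdio.h>

    /*
     * Illustrative sketch, not the kernel helper: every AG spans agblocks
     * blocks except possibly the last, which gets whatever remains of the
     * device and can therefore be a "runt".
     */
    static uint64_t ag_block_count(uint64_t dblocks, uint64_t agblocks,
                                   uint32_t agcount, uint32_t agno)
    {
        if (agno == agcount - 1)
            return dblocks - (uint64_t)(agcount - 1) * agblocks;
        return agblocks;
    }

    int main(void)
    {
        /* 10000-block device, nominal AG size 3000: AG 3 is a 1000-block runt. */
        printf("%llu\n",
               (unsigned long long)ag_block_count(10000, 3000, 4, 3));
        return 0;
    }

Reserving space for a full-sized inode btree in that runt AG is the
overcommit the patch removes by passing the AG number down.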

diff --git a/fs/iomap.c b/fs/iomap.c
@@ -142,13 +142,14 @@ static void
 iomap_adjust_read_range(struct inode *inode, struct iomap_page *iop,
 		loff_t *pos, loff_t length, unsigned *offp, unsigned *lenp)
 {
+	loff_t orig_pos = *pos;
+	loff_t isize = i_size_read(inode);
 	unsigned block_bits = inode->i_blkbits;
 	unsigned block_size = (1 << block_bits);
 	unsigned poff = offset_in_page(*pos);
 	unsigned plen = min_t(loff_t, PAGE_SIZE - poff, length);
 	unsigned first = poff >> block_bits;
 	unsigned last = (poff + plen - 1) >> block_bits;
-	unsigned end = offset_in_page(i_size_read(inode)) >> block_bits;
 
 	/*
 	 * If the block size is smaller than the page size we need to check the
@@ -183,8 +184,12 @@ iomap_adjust_read_range(struct inode *inode, struct iomap_page *iop,
 	 * handle both halves separately so that we properly zero data in the
 	 * page cache for blocks that are entirely outside of i_size.
 	 */
-	if (first <= end && last > end)
-		plen -= (last - end) * block_size;
+	if (orig_pos <= isize && orig_pos + length > isize) {
+		unsigned end = offset_in_page(isize - 1) >> block_bits;
+
+		if (first <= end && last > end)
+			plen -= (last - end) * block_size;
+	}
 
 	*offp = poff;
 	*lenp = plen;
@@ -1580,7 +1585,7 @@ iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length,
 	struct bio *bio;
 	bool need_zeroout = false;
 	bool use_fua = false;
-	int nr_pages, ret;
+	int nr_pages, ret = 0;
 	size_t copied = 0;
 
 	if ((pos | length | align) & ((1 << blkbits) - 1))
@@ -1596,12 +1601,13 @@ iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length,
 
 	if (iomap->flags & IOMAP_F_NEW) {
 		need_zeroout = true;
-	} else {
+	} else if (iomap->type == IOMAP_MAPPED) {
 		/*
-		 * Use a FUA write if we need datasync semantics, this
-		 * is a pure data IO that doesn't require any metadata
-		 * updates and the underlying device supports FUA. This
-		 * allows us to avoid cache flushes on IO completion.
+		 * Use a FUA write if we need datasync semantics, this is a pure
+		 * data IO that doesn't require any metadata updates (including
+		 * after IO completion such as unwritten extent conversion) and
+		 * the underlying device supports FUA. This allows us to avoid
+		 * cache flushes on IO completion.
 		 */
 		if (!(iomap->flags & (IOMAP_F_SHARED|IOMAP_F_DIRTY)) &&
 		    (dio->flags & IOMAP_DIO_WRITE_FUA) &&
@@ -1644,8 +1650,14 @@ iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length,
 
 		ret = bio_iov_iter_get_pages(bio, &iter);
 		if (unlikely(ret)) {
+			/*
+			 * We have to stop part way through an IO. We must fall
+			 * through to the sub-block tail zeroing here, otherwise
+			 * this short IO may expose stale data in the tail of
+			 * the block we haven't written data to.
+			 */
 			bio_put(bio);
-			return copied ? copied : ret;
+			goto zero_tail;
 		}
 
 		n = bio->bi_iter.bi_size;
@@ -1676,13 +1688,21 @@ iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length,
 		dio->submit.cookie = submit_bio(bio);
 	} while (nr_pages);
 
-	if (need_zeroout) {
+	/*
+	 * We need to zeroout the tail of a sub-block write if the extent type
+	 * requires zeroing or the write extends beyond EOF. If we don't zero
+	 * the block tail in the latter case, we can expose stale data via mmap
+	 * reads of the EOF block.
+	 */
+zero_tail:
+	if (need_zeroout ||
+	    ((dio->flags & IOMAP_DIO_WRITE) && pos >= i_size_read(inode))) {
 		/* zero out from the end of the write to the end of the block */
 		pad = pos & (fs_block_size - 1);
 		if (pad)
 			iomap_dio_zero(dio, iomap, pos, fs_block_size - pad);
 	}
-	return copied;
+	return copied ? copied : ret;
 }
 
 static loff_t
@@ -1857,6 +1877,15 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
 				dio->wait_for_completion = true;
 				ret = 0;
 			}
+
+			/*
+			 * Splicing to pipes can fail on a full pipe. We have to
+			 * swallow this to make it look like a short IO
+			 * otherwise the higher splice layers will completely
+			 * mishandle the error and stop moving data.
+			 */
+			if (ret == -EFAULT)
+				ret = 0;
 			break;
 		}
 		pos += ret;
diff --git a/fs/read_write.c b/fs/read_write.c
@@ -2094,17 +2094,18 @@ int vfs_dedupe_file_range(struct file *file, struct file_dedupe_range *same)
 	off = same->src_offset;
 	len = same->src_length;
 
-	ret = -EISDIR;
 	if (S_ISDIR(src->i_mode))
-		goto out;
+		return -EISDIR;
 
-	ret = -EINVAL;
 	if (!S_ISREG(src->i_mode))
-		goto out;
+		return -EINVAL;
+
+	if (!file->f_op->remap_file_range)
+		return -EOPNOTSUPP;
 
 	ret = remap_verify_area(file, off, len, false);
 	if (ret < 0)
-		goto out;
+		return ret;
 	ret = 0;
 
 	if (off + len > i_size_read(src))
@@ -2147,10 +2148,8 @@ next_fdput:
 		fdput(dst_fd);
next_loop:
 		if (fatal_signal_pending(current))
-			goto out;
+			break;
 	}
-
-out:
 	return ret;
 }
 EXPORT_SYMBOL(vfs_dedupe_file_range);
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
@@ -1694,10 +1694,13 @@ xfs_bmap_add_extent_delay_real(
 	case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG:
 		/*
 		 * Filling in all of a previously delayed allocation extent.
-		 * The right neighbor is contiguous, the left is not.
+		 * The right neighbor is contiguous, the left is not. Take care
+		 * with delay -> unwritten extent allocation here because the
+		 * delalloc record we are overwriting is always written.
 		 */
 		PREV.br_startblock = new->br_startblock;
 		PREV.br_blockcount += RIGHT.br_blockcount;
+		PREV.br_state = new->br_state;
 
 		xfs_iext_next(ifp, &bma->icur);
 		xfs_iext_remove(bma->ip, &bma->icur, state);
diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c
@@ -538,15 +538,18 @@ xfs_inobt_rec_check_count(
 
 static xfs_extlen_t
 xfs_inobt_max_size(
-	struct xfs_mount	*mp)
+	struct xfs_mount	*mp,
+	xfs_agnumber_t		agno)
 {
+	xfs_agblock_t		agblocks = xfs_ag_block_count(mp, agno);
+
 	/* Bail out if we're uninitialized, which can happen in mkfs. */
 	if (mp->m_inobt_mxr[0] == 0)
 		return 0;
 
 	return xfs_btree_calc_size(mp->m_inobt_mnr,
-		(uint64_t)mp->m_sb.sb_agblocks * mp->m_sb.sb_inopblock /
-				XFS_INODES_PER_CHUNK);
+				(uint64_t)agblocks * mp->m_sb.sb_inopblock /
+					XFS_INODES_PER_CHUNK);
 }
 
 static int
@@ -594,7 +597,7 @@ xfs_finobt_calc_reserves(
 	if (error)
 		return error;
 
-	*ask += xfs_inobt_max_size(mp);
+	*ask += xfs_inobt_max_size(mp, agno);
 	*used += tree_len;
 	return 0;
 }
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
@@ -1042,7 +1042,7 @@ out_trans_cancel:
 	goto out_unlock;
 }
 
-static int
+int
 xfs_flush_unmap_range(
 	struct xfs_inode	*ip,
 	xfs_off_t		offset,
@@ -1195,13 +1195,7 @@ xfs_prepare_shift(
 	 * Writeback and invalidate cache for the remainder of the file as we're
 	 * about to shift down every extent from offset to EOF.
 	 */
-	error = filemap_write_and_wait_range(VFS_I(ip)->i_mapping, offset, -1);
-	if (error)
-		return error;
-	error = invalidate_inode_pages2_range(VFS_I(ip)->i_mapping,
-					offset >> PAGE_SHIFT, -1);
-	if (error)
-		return error;
+	error = xfs_flush_unmap_range(ip, offset, XFS_ISIZE(ip));
 
 	/*
 	 * Clean out anything hanging around in the cow fork now that
diff --git a/fs/xfs/xfs_bmap_util.h b/fs/xfs/xfs_bmap_util.h
@@ -80,4 +80,7 @@ int	xfs_bmap_count_blocks(struct xfs_trans *tp, struct xfs_inode *ip,
 			  int whichfork, xfs_extnum_t *nextents,
 			  xfs_filblks_t *count);
 
+int	xfs_flush_unmap_range(struct xfs_inode *ip, xfs_off_t offset,
+			      xfs_off_t len);
+
 #endif	/* __XFS_BMAP_UTIL_H__ */
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
@@ -1233,9 +1233,23 @@ xfs_buf_iodone(
 }
 
 /*
- * Requeue a failed buffer for writeback
+ * Requeue a failed buffer for writeback.
  *
- * Return true if the buffer has been re-queued properly, false otherwise
+ * We clear the log item failed state here as well, but we have to be careful
+ * about reference counts because the only active reference counts on the buffer
+ * may be the failed log items. Hence if we clear the log item failed state
+ * before queuing the buffer for IO we can release all active references to
+ * the buffer and free it, leading to use after free problems in
+ * xfs_buf_delwri_queue. It makes no difference to the buffer or log items which
+ * order we process them in - the buffer is locked, and we own the buffer list
+ * so nothing on them is going to change while we are performing this action.
+ *
+ * Hence we can safely queue the buffer for IO before we clear the failed log
+ * item state, therefore always having an active reference to the buffer and
+ * avoiding the transient zero-reference state that leads to use-after-free.
+ *
+ * Return true if the buffer was added to the buffer list, false if it was
+ * already on the buffer list.
  */
 bool
 xfs_buf_resubmit_failed_buffers(
@@ -1243,16 +1257,16 @@ xfs_buf_resubmit_failed_buffers(
 	struct list_head	*buffer_list)
 {
 	struct xfs_log_item	*lip;
+	bool			ret;
+
+	ret = xfs_buf_delwri_queue(bp, buffer_list);
 
 	/*
-	 * Clear XFS_LI_FAILED flag from all items before resubmit
-	 *
-	 * XFS_LI_FAILED set/clear is protected by ail_lock, caller this
+	 * XFS_LI_FAILED set/clear is protected by ail_lock, caller of this
 	 * function already have it acquired
 	 */
 	list_for_each_entry(lip, &bp->b_li_list, li_bio_list)
 		xfs_clear_li_failed(lip);
 
-	/* Add this buffer back to the delayed write list */
-	return xfs_buf_delwri_queue(bp, buffer_list);
+	return ret;
 }
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
@@ -920,7 +920,7 @@ out_unlock:
 	return error;
 }
 
-loff_t
+STATIC loff_t
 xfs_file_remap_range(
 	struct file		*file_in,
 	loff_t			pos_in,
diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
@@ -296,6 +296,7 @@ xfs_reflink_reserve_cow(
 	if (error)
 		return error;
 
+	xfs_trim_extent(imap, got.br_startoff, got.br_blockcount);
 	trace_xfs_reflink_cow_alloc(ip, &got);
 	return 0;
 }
@@ -1351,10 +1352,19 @@ xfs_reflink_remap_prep(
 	if (ret)
 		goto out_unlock;
 
-	/* Zap any page cache for the destination file's range. */
-	truncate_inode_pages_range(&inode_out->i_data,
-			round_down(pos_out, PAGE_SIZE),
-			round_up(pos_out + *len, PAGE_SIZE) - 1);
+	/*
+	 * If pos_out > EOF, we may have dirtied blocks between EOF and
+	 * pos_out. In that case, we need to extend the flush and unmap to cover
+	 * from EOF to the end of the copy length.
+	 */
+	if (pos_out > XFS_ISIZE(dest)) {
+		loff_t	flen = *len + (pos_out - XFS_ISIZE(dest));
+		ret = xfs_flush_unmap_range(dest, XFS_ISIZE(dest), flen);
+	} else {
+		ret = xfs_flush_unmap_range(dest, pos_out, *len);
+	}
+	if (ret)
+		goto out_unlock;
 
 	return 1;
 out_unlock:
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
@@ -280,7 +280,10 @@ DECLARE_EVENT_CLASS(xfs_buf_class,
 	),
 	TP_fast_assign(
 		__entry->dev = bp->b_target->bt_dev;
-		__entry->bno = bp->b_bn;
+		if (bp->b_bn == XFS_BUF_DADDR_NULL)
+			__entry->bno = bp->b_maps[0].bm_bn;
+		else
+			__entry->bno = bp->b_bn;
 		__entry->nblks = bp->b_length;
 		__entry->hold = atomic_read(&bp->b_hold);
 		__entry->pincount = atomic_read(&bp->b_pin_count);
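
The EOF trimming in the iomap_adjust_read_range() hunk above is subtle
enough to be worth restating standalone. A simplified single-page model
(hypothetical signature, assuming isize > 0; the kernel version also tracks
per-block uptodate state) of both the straddle check and the isize - 1
measurement:

    #include <assert.h>
    #include <stdint.h>

    #define PAGE_SIZE 4096u

    /*
     * Simplified single-page model of the EOF trimming in the
     * iomap_adjust_read_range() hunk above (hypothetical signature,
     * assumes isize > 0). Returns the number of bytes of the page that
     * should actually be read; the rest is left to the zeroing path.
     */
    static unsigned adjust_read_len(uint64_t pos, uint64_t length,
                                    uint64_t isize, unsigned block_size)
    {
        unsigned poff = pos & (PAGE_SIZE - 1);
        unsigned plen = length < PAGE_SIZE - poff ? length : PAGE_SIZE - poff;
        unsigned first = poff / block_size;
        unsigned last = (poff + plen - 1) / block_size;

        /* Trim only when the overall request actually straddles EOF. */
        if (pos <= isize && pos + length > isize) {
            /*
             * Measure the last valid block from isize - 1 so an EOF
             * sitting exactly on a block boundary is not treated as
             * one block further in.
             */
            unsigned end = ((isize - 1) & (PAGE_SIZE - 1)) / block_size;

            if (first <= end && last > end)
                plen -= (last - end) * block_size;
        }
        return plen;
    }

    int main(void)
    {
        /* i_size = 1536 with 1k blocks: a full-page read keeps two blocks. */
        assert(adjust_read_len(0, 4096, 1536, 1024) == 2048);
        /* A read entirely below EOF is left alone. */
        assert(adjust_read_len(0, 1024, 1536, 1024) == 1024);
        return 0;
    }

Both pieces appear to matter: the straddle check keeps pages entirely below
EOF from being trimmed, and measuring from isize - 1 keeps an EOF that lands
exactly on a block boundary from being treated as one block further in.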