summaryrefslogtreecommitdiff
path: root/fs
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2017-09-25 15:46:04 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2017-09-25 15:46:04 -0700
commit19240e6b2a6c456d1c2975400a04b6b01f957cf0 (patch)
tree989908a19a86e2cbfe70d71d18da121b907b8738 /fs
parent17763641ff8a189bc6fb1fcd5c2a9844ee0dd89c (diff)
parentfddc9923c6d41de9fe7b1f323a3cece53e046c88 (diff)
Merge branch 'for-linus' of git://git.kernel.dk/linux-block
Pull block fixes from Jens Axboe: - Two sets of NVMe pull requests from Christoph: - Fixes for the Fibre Channel host/target to fix spec compliance - Allow a zero keep alive timeout - Make the debug printk for broken SGLs work better - Fix queue zeroing during initialization - Set of RDMA and FC fixes - Target div-by-zero fix - bsg double-free fix. - ndb unknown ioctl fix from Josef. - Buffered vs O_DIRECT page cache inconsistency fix. Has been floating around for a long time, well reviewed. From Lukas. - brd overflow fix from Mikulas. - Fix for a loop regression in this merge window, where using a union for two members of the loop_cmd turned out to be a really bad idea. From Omar. - Fix for an iostat regression fix in this series, using the wrong API to get at the block queue. From Shaohua. - Fix for a potential blktrace delection deadlock. From Waiman. * 'for-linus' of git://git.kernel.dk/linux-block: (30 commits) nvme-fcloop: fix port deletes and callbacks nvmet-fc: sync header templates with comments nvmet-fc: ensure target queue id within range. nvmet-fc: on port remove call put outside lock nvme-rdma: don't fully stop the controller in error recovery nvme-rdma: give up reconnect if state change fails nvme-core: Use nvme_wq to queue async events and fw activation nvme: fix sqhd reference when admin queue connect fails block: fix a crash caused by wrong API fs: Fix page cache inconsistency when mixing buffered and AIO DIO nvmet: implement valid sqhd values in completions nvme-fabrics: Allow 0 as KATO value nvme: allow timed-out ios to retry nvme: stop aer posting if controller state not live nvme-pci: Print invalid SGL only once nvme-pci: initialize queue memory before interrupts nvmet-fc: fix failing max io queue connections nvme-fc: use transport-specific sgl format nvme: add transport SGL definitions nvme.h: remove FC transport-specific error values ...
Diffstat (limited to 'fs')
-rw-r--r--fs/direct-io.c49
-rw-r--r--fs/iomap.c29
2 files changed, 59 insertions, 19 deletions
diff --git a/fs/direct-io.c b/fs/direct-io.c
index 5fa2211e49ae..62cf812ed0e5 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -229,6 +229,7 @@ static ssize_t dio_complete(struct dio *dio, ssize_t ret, bool is_async)
{
loff_t offset = dio->iocb->ki_pos;
ssize_t transferred = 0;
+ int err;
/*
* AIO submission can race with bio completion to get here while
@@ -258,8 +259,22 @@ static ssize_t dio_complete(struct dio *dio, ssize_t ret, bool is_async)
if (ret == 0)
ret = transferred;
+ /*
+ * Try again to invalidate clean pages which might have been cached by
+ * non-direct readahead, or faulted in by get_user_pages() if the source
+ * of the write was an mmap'ed region of the file we're writing. Either
+ * one is a pretty crazy thing to do, so we don't support it 100%. If
+ * this invalidation fails, tough, the write still worked...
+ */
+ if (ret > 0 && dio->op == REQ_OP_WRITE &&
+ dio->inode->i_mapping->nrpages) {
+ err = invalidate_inode_pages2_range(dio->inode->i_mapping,
+ offset >> PAGE_SHIFT,
+ (offset + ret - 1) >> PAGE_SHIFT);
+ WARN_ON_ONCE(err);
+ }
+
if (dio->end_io) {
- int err;
// XXX: ki_pos??
err = dio->end_io(dio->iocb, offset, ret, dio->private);
@@ -304,6 +319,7 @@ static void dio_bio_end_aio(struct bio *bio)
struct dio *dio = bio->bi_private;
unsigned long remaining;
unsigned long flags;
+ bool defer_completion = false;
/* cleanup the bio */
dio_bio_complete(dio, bio);
@@ -315,7 +331,19 @@ static void dio_bio_end_aio(struct bio *bio)
spin_unlock_irqrestore(&dio->bio_lock, flags);
if (remaining == 0) {
- if (dio->result && dio->defer_completion) {
+ /*
+ * Defer completion when defer_completion is set or
+ * when the inode has pages mapped and this is AIO write.
+ * We need to invalidate those pages because there is a
+ * chance they contain stale data in the case buffered IO
+ * went in between AIO submission and completion into the
+ * same region.
+ */
+ if (dio->result)
+ defer_completion = dio->defer_completion ||
+ (dio->op == REQ_OP_WRITE &&
+ dio->inode->i_mapping->nrpages);
+ if (defer_completion) {
INIT_WORK(&dio->complete_work, dio_aio_complete_work);
queue_work(dio->inode->i_sb->s_dio_done_wq,
&dio->complete_work);
@@ -1210,10 +1238,19 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode,
* For AIO O_(D)SYNC writes we need to defer completions to a workqueue
* so that we can call ->fsync.
*/
- if (dio->is_async && iov_iter_rw(iter) == WRITE &&
- ((iocb->ki_filp->f_flags & O_DSYNC) ||
- IS_SYNC(iocb->ki_filp->f_mapping->host))) {
- retval = dio_set_defer_completion(dio);
+ if (dio->is_async && iov_iter_rw(iter) == WRITE) {
+ retval = 0;
+ if ((iocb->ki_filp->f_flags & O_DSYNC) ||
+ IS_SYNC(iocb->ki_filp->f_mapping->host))
+ retval = dio_set_defer_completion(dio);
+ else if (!dio->inode->i_sb->s_dio_done_wq) {
+ /*
+ * In case of AIO write racing with buffered read we
+ * need to defer completion. We can't decide this now,
+ * however the workqueue needs to be initialized here.
+ */
+ retval = sb_init_dio_done_wq(dio->inode->i_sb);
+ }
if (retval) {
/*
* We grab i_mutex only for reads so we don't have
diff --git a/fs/iomap.c b/fs/iomap.c
index 269b24a01f32..8194d30bdca0 100644
--- a/fs/iomap.c
+++ b/fs/iomap.c
@@ -713,8 +713,24 @@ struct iomap_dio {
static ssize_t iomap_dio_complete(struct iomap_dio *dio)
{
struct kiocb *iocb = dio->iocb;
+ struct inode *inode = file_inode(iocb->ki_filp);
ssize_t ret;
+ /*
+ * Try again to invalidate clean pages which might have been cached by
+ * non-direct readahead, or faulted in by get_user_pages() if the source
+ * of the write was an mmap'ed region of the file we're writing. Either
+ * one is a pretty crazy thing to do, so we don't support it 100%. If
+ * this invalidation fails, tough, the write still worked...
+ */
+ if (!dio->error &&
+ (dio->flags & IOMAP_DIO_WRITE) && inode->i_mapping->nrpages) {
+ ret = invalidate_inode_pages2_range(inode->i_mapping,
+ iocb->ki_pos >> PAGE_SHIFT,
+ (iocb->ki_pos + dio->size - 1) >> PAGE_SHIFT);
+ WARN_ON_ONCE(ret);
+ }
+
if (dio->end_io) {
ret = dio->end_io(iocb,
dio->error ? dio->error : dio->size,
@@ -1042,19 +1058,6 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
ret = iomap_dio_complete(dio);
- /*
- * Try again to invalidate clean pages which might have been cached by
- * non-direct readahead, or faulted in by get_user_pages() if the source
- * of the write was an mmap'ed region of the file we're writing. Either
- * one is a pretty crazy thing to do, so we don't support it 100%. If
- * this invalidation fails, tough, the write still worked...
- */
- if (iov_iter_rw(iter) == WRITE) {
- int err = invalidate_inode_pages2_range(mapping,
- start >> PAGE_SHIFT, end >> PAGE_SHIFT);
- WARN_ON_ONCE(err);
- }
-
return ret;
out_free_dio: