From 1805f2ca3f27f25c8e637edc4b40bab237f9d07c Mon Sep 17 00:00:00 2001
From: Liu Bo <bo.li.liu@oracle.com>
Date: Fri, 20 Oct 2017 17:53:41 -0600
Subject: Btrfs: remove redundant btrfs_balance_delayed_items

In functions like btrfs_create(), we run both
btrfs_balance_delayed_items() and btrfs_btree_balance_dirty() after
the operation, but btrfs_btree_balance_dirty() is surely going to run
btrfs_balance_delayed_items().

This keeps only btrfs_btree_balance_dirty().

Signed-off-by: Liu Bo <bo.li.liu@oracle.com>
Reviewed-by: Lu Fengqi <lufq.fnst@cn.fujitsu.com>
Reviewed-by: Nikolay Borisov <nborisov@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/inode.c | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index e1a7f3cb5be9..2bab93f20328 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -6560,7 +6560,6 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
 
 out_unlock:
 	btrfs_end_transaction(trans);
-	btrfs_balance_delayed_items(fs_info);
 	btrfs_btree_balance_dirty(fs_info);
 	if (drop_inode) {
 		inode_dec_link_count(inode);
@@ -6641,7 +6640,6 @@ out_unlock:
 		inode_dec_link_count(inode);
 		iput(inode);
 	}
-	btrfs_balance_delayed_items(fs_info);
 	btrfs_btree_balance_dirty(fs_info);
 	return err;
 
@@ -6716,7 +6714,6 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
 		btrfs_log_new_name(trans, BTRFS_I(inode), NULL, parent);
 	}
 
-	btrfs_balance_delayed_items(fs_info);
 fail:
 	if (trans)
 		btrfs_end_transaction(trans);
@@ -6794,7 +6791,6 @@ out_fail:
 		inode_dec_link_count(inode);
 		iput(inode);
 	}
-	btrfs_balance_delayed_items(fs_info);
 	btrfs_btree_balance_dirty(fs_info);
 	return err;
 
@@ -10688,7 +10684,6 @@ out:
 	btrfs_end_transaction(trans);
 	if (ret)
 		iput(inode);
-	btrfs_balance_delayed_items(fs_info);
 	btrfs_btree_balance_dirty(fs_info);
 	return ret;
 
-- 
cgit v1.2.3


From 9f5316c17b0833c606b94d91beb6518c2e8a8012 Mon Sep 17 00:00:00 2001
From: Liu Bo <bo.li.liu@oracle.com>
Date: Mon, 23 Oct 2017 23:02:54 -0600
Subject: Btrfs: free btrfs_device in place

It's pointless to defer it to a kthread helper as we're not under a
special context.

For reference, commit 1f78160ce1b1 ("Btrfs: using rcu lock in the reader
side of devices list") introduced RCU freeing for device structures.

Originally the blkdev_put was called from free_device and rcu_barrier had
to be called. This is no longer required, bdev and our device structures
are now freed separately.

Signed-off-by: Liu Bo <bo.li.liu@oracle.com>
Reviewed-by: Anand Jain <anand.jain@oracle.com>
Reviewed-by: David Sterba <dsterba@suse.com>
[ enhance changelog ]
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/volumes.c | 14 ++------------
 fs/btrfs/volumes.h |  1 -
 2 files changed, 2 insertions(+), 13 deletions(-)

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index a25684287501..f3c4ad2ddfe6 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -824,26 +824,16 @@ again:
 	mutex_unlock(&uuid_mutex);
 }
 
-static void __free_device(struct work_struct *work)
+static void free_device(struct rcu_head *head)
 {
 	struct btrfs_device *device;
 
-	device = container_of(work, struct btrfs_device, rcu_work);
+	device = container_of(head, struct btrfs_device, rcu);
 	rcu_string_free(device->name);
 	bio_put(device->flush_bio);
 	kfree(device);
 }
 
-static void free_device(struct rcu_head *head)
-{
-	struct btrfs_device *device;
-
-	device = container_of(head, struct btrfs_device, rcu);
-
-	INIT_WORK(&device->rcu_work, __free_device);
-	schedule_work(&device->rcu_work);
-}
-
 static void btrfs_close_bdev(struct btrfs_device *device)
 {
 	if (device->bdev && device->writeable) {
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index ff15208344a7..bf0d53a96b4a 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -133,7 +133,6 @@ struct btrfs_device {
 
 	struct btrfs_work work;
 	struct rcu_head rcu;
-	struct work_struct rcu_work;
 
 	/* readahead state */
 	spinlock_t reada_lock;
-- 
cgit v1.2.3


From 1cb34c8ecd4623909c66bccb91ee7f6483e04441 Mon Sep 17 00:00:00 2001
From: Anand Jain <anand.jain@oracle.com>
Date: Sat, 21 Oct 2017 01:45:33 +0800
Subject: btrfs: clean up btrfs_dev_stat_inc usage

btrfs_end_bio() is using btrfs_dev_stat_inc() and then
btrfs_dev_stat_print_on_error() separately instead use
btrfs_dev_stat_inc_and_print() directly.

As of now there isn't any bio in btrfs which is - a non-empty write and
also the REQ_PREFLUSH flag is set. So in actual the condition

   if (bio->bi_opf & REQ_PREFLUSH)

is never true in btrfs_end_bio(), and so there won't be any redundant
error log by using btrfs_dev_stat_inc_and_print() separately one for
write and another for flush.

This consolidation will help to add the device critical error handles in
the function btrfs_dev_stat_inc_and_print() and which can be renamed as
needed.

Signed-off-by: Anand Jain <anand.jain@oracle.com>
Reviewed-by: Nikolay Borisov <nborisov@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/volumes.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index f3c4ad2ddfe6..baf047b8cf99 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -5993,15 +5993,14 @@ static void btrfs_end_bio(struct bio *bio)
 			dev = bbio->stripes[stripe_index].dev;
 			if (dev->bdev) {
 				if (bio_op(bio) == REQ_OP_WRITE)
-					btrfs_dev_stat_inc(dev,
+					btrfs_dev_stat_inc_and_print(dev,
 						BTRFS_DEV_STAT_WRITE_ERRS);
 				else
-					btrfs_dev_stat_inc(dev,
+					btrfs_dev_stat_inc_and_print(dev,
 						BTRFS_DEV_STAT_READ_ERRS);
 				if (bio->bi_opf & REQ_PREFLUSH)
-					btrfs_dev_stat_inc(dev,
+					btrfs_dev_stat_inc_and_print(dev,
 						BTRFS_DEV_STAT_FLUSH_ERRS);
-				btrfs_dev_stat_print_on_error(dev);
 			}
 		}
 	}
-- 
cgit v1.2.3


From 9deae9689231964972a94bb56a79b669f9d47ac1 Mon Sep 17 00:00:00 2001
From: Nikolay Borisov <nborisov@suse.com>
Date: Tue, 24 Oct 2017 13:47:37 +0300
Subject: btrfs: Fix memory barriers usage with device stats counters

Commit addc3fa74e5b ("Btrfs: Fix the problem that the dirty flag of dev
stats is cleared") reworked the way device stats changes are tracked. A
new atomic dev_stats_ccnt counter was introduced which is incremented
every time any of the device stats counters are changed. This serves as
a flag whether there are any pending stats changes. However, this patch
only partially implemented the correct memory barriers necessary:

- It only ordered the stores to the counters but not the reads e.g.
  btrfs_run_dev_stats
- It completely omitted any comments documenting the intended design and
  how the memory barriers pair with each-other

This patch provides the necessary comments as well as adds a missing
smp_rmb in btrfs_run_dev_stats. Furthermore since dev_stats_cnt is only
a snapshot at best there was no point in reading the counter twice -
once in btrfs_dev_stats_dirty and then again when assigning stats_cnt.
Just collapse both reads into 1.

Fixes: addc3fa74e5b ("Btrfs: Fix the problem that the dirty flag of dev stats is cleared")
Signed-off-by: Nikolay Borisov <nborisov@suse.com>
Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/volumes.c | 18 ++++++++++++++++--
 fs/btrfs/volumes.h | 12 ++++++++++++
 2 files changed, 28 insertions(+), 2 deletions(-)

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index baf047b8cf99..c96635a26803 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -7080,10 +7080,24 @@ int btrfs_run_dev_stats(struct btrfs_trans_handle *trans,
 
 	mutex_lock(&fs_devices->device_list_mutex);
 	list_for_each_entry(device, &fs_devices->devices, dev_list) {
-		if (!device->dev_stats_valid || !btrfs_dev_stats_dirty(device))
+		stats_cnt = atomic_read(&device->dev_stats_ccnt);
+		if (!device->dev_stats_valid || stats_cnt == 0)
 			continue;
 
-		stats_cnt = atomic_read(&device->dev_stats_ccnt);
+
+		/*
+		 * There is a LOAD-LOAD control dependency between the value of
+		 * dev_stats_ccnt and updating the on-disk values which requires
+		 * reading the in-memory counters. Such control dependencies
+		 * require explicit read memory barriers.
+		 *
+		 * This memory barriers pairs with smp_mb__before_atomic in
+		 * btrfs_dev_stat_inc/btrfs_dev_stat_set and with the full
+		 * barrier implied by atomic_xchg in
+		 * btrfs_dev_stats_read_and_reset
+		 */
+		smp_rmb();
+
 		ret = update_dev_stat_item(trans, fs_info, device);
 		if (!ret)
 			atomic_sub(stats_cnt, &device->dev_stats_ccnt);
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index bf0d53a96b4a..38fd368ddfe7 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -497,6 +497,12 @@ static inline void btrfs_dev_stat_inc(struct btrfs_device *dev,
 				      int index)
 {
 	atomic_inc(dev->dev_stat_values + index);
+	/*
+	 * This memory barrier orders stores updating statistics before stores
+	 * updating dev_stats_ccnt.
+	 *
+	 * It pairs with smp_rmb() in btrfs_run_dev_stats().
+	 */
 	smp_mb__before_atomic();
 	atomic_inc(&dev->dev_stats_ccnt);
 }
@@ -522,6 +528,12 @@ static inline void btrfs_dev_stat_set(struct btrfs_device *dev,
 				      int index, unsigned long val)
 {
 	atomic_set(dev->dev_stat_values + index, val);
+	/*
+	 * This memory barrier orders stores updating statistics before stores
+	 * updating dev_stats_ccnt.
+	 *
+	 * It pairs with smp_rmb() in btrfs_run_dev_stats().
+	 */
 	smp_mb__before_atomic();
 	atomic_inc(&dev->dev_stats_ccnt);
 }
-- 
cgit v1.2.3


From 4660c49f9b4ad50eab410de09eeacc2e3aebf1f5 Mon Sep 17 00:00:00 2001
From: Nikolay Borisov <nborisov@suse.com>
Date: Fri, 20 Oct 2017 18:10:58 +0300
Subject: btrfs: Remove redundant memory barrier in dev stats

As per atomic_t.txt documentation :
 - RMW operations that have a return value are fully ordered;

atomic_xchg is one such operation so it already includes everything it
needs w.r.t memory ordering and add a comment to be more explicit about
that.

Signed-off-by: Nikolay Borisov <nborisov@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/volumes.h | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 38fd368ddfe7..177814a7c758 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -519,7 +519,13 @@ static inline int btrfs_dev_stat_read_and_reset(struct btrfs_device *dev,
 	int ret;
 
 	ret = atomic_xchg(dev->dev_stat_values + index, 0);
-	smp_mb__before_atomic();
+	/*
+	 * atomic_xchg implies a full memory barriers as per atomic_t.txt:
+	 * - RMW operations that have a return value are fully ordered;
+	 *
+	 * This implicit memory barriers is paired with the smp_rmb in
+	 * btrfs_run_dev_stats
+	 */
 	atomic_inc(&dev->dev_stats_ccnt);
 	return ret;
 }
-- 
cgit v1.2.3


From ac244ef1da76383842a9a4c3404e35d0669f2e53 Mon Sep 17 00:00:00 2001
From: Nikolay Borisov <nborisov@suse.com>
Date: Fri, 20 Oct 2017 18:10:59 +0300
Subject: btrfs: Remove unused function

It's sole callsite was removed in a previous patch so just nuke it for good.

Signed-off-by: Nikolay Borisov <nborisov@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/volumes.h | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 177814a7c758..294c4eb6a272 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -488,11 +488,6 @@ int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,
 int btrfs_remove_chunk(struct btrfs_trans_handle *trans,
 		       struct btrfs_fs_info *fs_info, u64 chunk_offset);
 
-static inline int btrfs_dev_stats_dirty(struct btrfs_device *dev)
-{
-	return atomic_read(&dev->dev_stats_ccnt);
-}
-
 static inline void btrfs_dev_stat_inc(struct btrfs_device *dev,
 				      int index)
 {
-- 
cgit v1.2.3


From d3fac6ba7dccc54f52087bb9d7b9be27aa6fa622 Mon Sep 17 00:00:00 2001
From: Nikolay Borisov <nborisov@suse.com>
Date: Tue, 24 Oct 2017 11:50:39 +0300
Subject: btrfs: Remove redundant mirror_num arg

The following callpath is always invoked with mirror_num set to 0, so
let's remove it as an argument and directly pass 0 to __do_redpage. No
functional change.

extent_readpages
  __extent_readpages
    __do_contiguous_readpages
      __do_readpage

Signed-off-by: Nikolay Borisov <nborisov@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/extent_io.c | 16 +++++++---------
 1 file changed, 7 insertions(+), 9 deletions(-)

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 012d63870b99..c6e82a3e706d 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -3094,7 +3094,7 @@ static inline void __do_contiguous_readpages(struct extent_io_tree *tree,
 					     u64 start, u64 end,
 					     get_extent_t *get_extent,
 					     struct extent_map **em_cached,
-					     struct bio **bio, int mirror_num,
+					     struct bio **bio,
 					     unsigned long *bio_flags,
 					     u64 *prev_em_start)
 {
@@ -3116,7 +3116,7 @@ static inline void __do_contiguous_readpages(struct extent_io_tree *tree,
 
 	for (index = 0; index < nr_pages; index++) {
 		__do_readpage(tree, pages[index], get_extent, em_cached, bio,
-			      mirror_num, bio_flags, 0, prev_em_start);
+			      0, bio_flags, 0, prev_em_start);
 		put_page(pages[index]);
 	}
 }
@@ -3125,8 +3125,7 @@ static void __extent_readpages(struct extent_io_tree *tree,
 			       struct page *pages[],
 			       int nr_pages, get_extent_t *get_extent,
 			       struct extent_map **em_cached,
-			       struct bio **bio, int mirror_num,
-			       unsigned long *bio_flags,
+			       struct bio **bio, unsigned long *bio_flags,
 			       u64 *prev_em_start)
 {
 	u64 start = 0;
@@ -3147,7 +3146,7 @@ static void __extent_readpages(struct extent_io_tree *tree,
 			__do_contiguous_readpages(tree, &pages[first_index],
 						  index - first_index, start,
 						  end, get_extent, em_cached,
-						  bio, mirror_num, bio_flags,
+						  bio, bio_flags,
 						  prev_em_start);
 			start = page_start;
 			end = start + PAGE_SIZE - 1;
@@ -3159,8 +3158,7 @@ static void __extent_readpages(struct extent_io_tree *tree,
 		__do_contiguous_readpages(tree, &pages[first_index],
 					  index - first_index, start,
 					  end, get_extent, em_cached, bio,
-					  mirror_num, bio_flags,
-					  prev_em_start);
+					  bio_flags, prev_em_start);
 }
 
 static int __extent_read_full_page(struct extent_io_tree *tree,
@@ -4171,12 +4169,12 @@ int extent_readpages(struct extent_io_tree *tree,
 		if (nr < ARRAY_SIZE(pagepool))
 			continue;
 		__extent_readpages(tree, pagepool, nr, get_extent, &em_cached,
-				   &bio, 0, &bio_flags, &prev_em_start);
+				   &bio, &bio_flags, &prev_em_start);
 		nr = 0;
 	}
 	if (nr)
 		__extent_readpages(tree, pagepool, nr, get_extent, &em_cached,
-				   &bio, 0, &bio_flags, &prev_em_start);
+				   &bio, &bio_flags, &prev_em_start);
 
 	if (em_cached)
 		free_extent_map(em_cached);
-- 
cgit v1.2.3


From 617c54a88eaba611bfaa917918c773a91c53692a Mon Sep 17 00:00:00 2001
From: Nikolay Borisov <nborisov@suse.com>
Date: Mon, 23 Oct 2017 13:51:48 +0300
Subject: btrfs: Make btrfs_async_run_delayed_root use a loop rather than
 multiple labels

Currently btrfs_async_run_delayed_root's implementation uses 3 goto
labels to mimic the functionality of a simple do {} while loop. Refactor
the function to use a do {} while construct, making intention clear and
code easier to follow. No functional changes.

Signed-off-by: Nikolay Borisov <nborisov@suse.com>
Reviewed-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/delayed-inode.c | 52 +++++++++++++++++++++++++-----------------------
 1 file changed, 27 insertions(+), 25 deletions(-)

diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index 056276101c63..cf0a6a25156c 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -1302,40 +1302,42 @@ static void btrfs_async_run_delayed_root(struct btrfs_work *work)
 	if (!path)
 		goto out;
 
-again:
-	if (atomic_read(&delayed_root->items) < BTRFS_DELAYED_BACKGROUND / 2)
-		goto free_path;
+	do {
+		if (atomic_read(&delayed_root->items) <
+		    BTRFS_DELAYED_BACKGROUND / 2)
+			break;
 
-	delayed_node = btrfs_first_prepared_delayed_node(delayed_root);
-	if (!delayed_node)
-		goto free_path;
+		delayed_node = btrfs_first_prepared_delayed_node(delayed_root);
+		if (!delayed_node)
+			break;
 
-	path->leave_spinning = 1;
-	root = delayed_node->root;
+		path->leave_spinning = 1;
+		root = delayed_node->root;
 
-	trans = btrfs_join_transaction(root);
-	if (IS_ERR(trans))
-		goto release_path;
+		trans = btrfs_join_transaction(root);
+		if (IS_ERR(trans)) {
+			btrfs_release_path(path);
+			btrfs_release_prepared_delayed_node(delayed_node);
+			total_done++;
+			continue;
+		}
 
-	block_rsv = trans->block_rsv;
-	trans->block_rsv = &root->fs_info->delayed_block_rsv;
+		block_rsv = trans->block_rsv;
+		trans->block_rsv = &root->fs_info->delayed_block_rsv;
 
-	__btrfs_commit_inode_delayed_items(trans, path, delayed_node);
+		__btrfs_commit_inode_delayed_items(trans, path, delayed_node);
 
-	trans->block_rsv = block_rsv;
-	btrfs_end_transaction(trans);
-	btrfs_btree_balance_dirty_nodelay(root->fs_info);
+		trans->block_rsv = block_rsv;
+		btrfs_end_transaction(trans);
+		btrfs_btree_balance_dirty_nodelay(root->fs_info);
 
-release_path:
-	btrfs_release_path(path);
-	total_done++;
+		btrfs_release_path(path);
+		btrfs_release_prepared_delayed_node(delayed_node);
+		total_done++;
 
-	btrfs_release_prepared_delayed_node(delayed_node);
-	if ((async_work->nr == 0 && total_done < BTRFS_DELAYED_WRITEBACK) ||
-	    total_done < async_work->nr)
-		goto again;
+	} while ((async_work->nr == 0 && total_done < BTRFS_DELAYED_WRITEBACK)
+		 || total_done < async_work->nr);
 
-free_path:
 	btrfs_free_path(path);
 out:
 	wake_up(&delayed_root->wait);
-- 
cgit v1.2.3


From 8577787fac828a9873064ccec986f4be6156615d Mon Sep 17 00:00:00 2001
From: Nikolay Borisov <nborisov@suse.com>
Date: Mon, 23 Oct 2017 13:51:49 +0300
Subject: btrfs: Move checks from btrfs_wq_run_delayed_node to
 btrfs_balance_delayed_items

btrfs_balance_delayed_items is the sole caller of
btrfs_wq_run_delayed_node and already includes one of the checks whether
the delayed inodes should be run. On the other hand
btrfs_wq_run_delayed_node duplicates that check and performs an
additional one for wq congestion.

Let's remove the duplicate check and move the congestion one in
btrfs_balance_delayed_items, leaving btrfs_wq_run_delayed_node to only
care about setting up the wq run. No functional changes.

Signed-off-by: Nikolay Borisov <nborisov@suse.com>
Reviewed-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/delayed-inode.c | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index cf0a6a25156c..1c0bab4080a0 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -1350,10 +1350,6 @@ static int btrfs_wq_run_delayed_node(struct btrfs_delayed_root *delayed_root,
 {
 	struct btrfs_async_delayed_work *async_work;
 
-	if (atomic_read(&delayed_root->items) < BTRFS_DELAYED_BACKGROUND ||
-	    btrfs_workqueue_normal_congested(fs_info->delayed_workers))
-		return 0;
-
 	async_work = kmalloc(sizeof(*async_work), GFP_NOFS);
 	if (!async_work)
 		return -ENOMEM;
@@ -1389,7 +1385,8 @@ void btrfs_balance_delayed_items(struct btrfs_fs_info *fs_info)
 {
 	struct btrfs_delayed_root *delayed_root = fs_info->delayed_root;
 
-	if (atomic_read(&delayed_root->items) < BTRFS_DELAYED_BACKGROUND)
+	if ((atomic_read(&delayed_root->items) < BTRFS_DELAYED_BACKGROUND) ||
+		btrfs_workqueue_normal_congested(fs_info->delayed_workers))
 		return;
 
 	if (atomic_read(&delayed_root->items) >= BTRFS_DELAYED_WRITEBACK) {
-- 
cgit v1.2.3


From 47dba17171a76ea2a2a7153f9aa9f28579bfd350 Mon Sep 17 00:00:00 2001
From: Liu Bo <bo.li.liu@oracle.com>
Date: Tue, 10 Oct 2017 15:51:02 -0600
Subject: Btrfs: remove rcu_barrier in btrfs_close_devices

It was introduced because btrfs used to do blkdev_put in a deferred
work, now that btrfs has blkdev_put in place, this rcu_barrier can be
removed.

modprobe -r btrfs will do btrfs_cleanup_fs_uuids(), where it cleanup
every %fs_devices on the list, but when we do btrfs_close_devices(), we
have replaced the devices on the list with dummy ones which only have
the same name and uuid, so modprobe -r btrfs will free those instead of
what we were using, this change won't cause a problem for it.

Signed-off-by: Liu Bo <bo.li.liu@oracle.com>
Reviewed-by: Anand Jain <anand.jain@oracle.com>
Reviewed-by: David Sterba <dsterba@suse.com>
[ copied 2nd paragraph from mailinglist discussion ]
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/volumes.c | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index c96635a26803..7ea81e1b361f 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -936,12 +936,6 @@ int btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
 		__btrfs_close_devices(fs_devices);
 		free_fs_devices(fs_devices);
 	}
-	/*
-	 * Wait for rcu kworkers under __btrfs_close_devices
-	 * to finish all blkdev_puts so device is really
-	 * free when umount is done.
-	 */
-	rcu_barrier();
 	return ret;
 }
 
-- 
cgit v1.2.3


From 6b7d6e933433a43062ce9355a4126b59bd2519e3 Mon Sep 17 00:00:00 2001
From: Nikolay Borisov <nborisov@suse.com>
Date: Wed, 1 Nov 2017 11:32:18 +0200
Subject: btrfs: Move loop termination condition in while()

Fallocating a file in btrfs goes through several stages. The one before
actually inserting the fallocated extents is to create a qgroup
reservation, covering the desired range. To this end there is a loop in
btrfs_fallocate which checks to see if there are holes in the fallocated
range or !PREALLOC extents past EOF and if so create qgroup reservations
for them. Unfortunately, the main condition of the loop is burried right
at the end of its body rather than in the actual while statement which
makes it non-obvious. Fix this by moving the condition in the while
statement where it belongs. No functional changes.

Signed-off-by: Nikolay Borisov <nborisov@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/file.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index eb1bac7c8553..89fb9eff714f 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -2922,7 +2922,7 @@ static long btrfs_fallocate(struct file *file, int mode,
 
 	/* First, check if we exceed the qgroup limit */
 	INIT_LIST_HEAD(&reserve_list);
-	while (1) {
+	while (cur_offset < alloc_end) {
 		em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, cur_offset,
 				      alloc_end - cur_offset, 0);
 		if (IS_ERR(em)) {
@@ -2958,8 +2958,6 @@ static long btrfs_fallocate(struct file *file, int mode,
 		}
 		free_extent_map(em);
 		cur_offset = last_byte;
-		if (cur_offset >= alloc_end)
-			break;
 	}
 
 	/*
-- 
cgit v1.2.3


From 96b09dde92515956c992c1b330f00399487f47b3 Mon Sep 17 00:00:00 2001
From: Nikolay Borisov <nborisov@suse.com>
Date: Wed, 1 Nov 2017 11:36:05 +0200
Subject: btrfs: Use locked_end rather than open coding it

Right before we go into this loop locked_end is set to alloc_end - 1 and
is being used in nearby functions, no need to have exceptions. This just
makes the code consistent, no functional changes.

Signed-off-by: Nikolay Borisov <nborisov@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/file.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 89fb9eff714f..559d716221df 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -2896,8 +2896,8 @@ static long btrfs_fallocate(struct file *file, int mode,
 		 */
 		lock_extent_bits(&BTRFS_I(inode)->io_tree, alloc_start,
 				 locked_end, &cached_state);
-		ordered = btrfs_lookup_first_ordered_extent(inode,
-							    alloc_end - 1);
+		ordered = btrfs_lookup_first_ordered_extent(inode, locked_end);
+
 		if (ordered &&
 		    ordered->file_offset + ordered->len > alloc_start &&
 		    ordered->file_offset < alloc_end) {
-- 
cgit v1.2.3


From 2c9973847fa0230ea82935a9c486cb38ef599893 Mon Sep 17 00:00:00 2001
From: Anand Jain <anand.jain@oracle.com>
Date: Mon, 6 Nov 2017 10:28:00 +0800
Subject: btrfs: move volume_mutex into the btrfs_rm_device()

A cleanup patch no functional change, we hold volume_mutex before
calling btrfs_rm_device, so move it into the function itself.

Signed-off-by: Anand Jain <anand.jain@oracle.com>
Reviewed-by: Nikolay Borisov <nborisov@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/ioctl.c   | 4 ----
 fs/btrfs/volumes.c | 2 ++
 2 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 2ef8acaac688..64b947487ca6 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -2675,14 +2675,12 @@ static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg)
 		goto out;
 	}
 
-	mutex_lock(&fs_info->volume_mutex);
 	if (vol_args->flags & BTRFS_DEVICE_SPEC_BY_ID) {
 		ret = btrfs_rm_device(fs_info, NULL, vol_args->devid);
 	} else {
 		vol_args->name[BTRFS_SUBVOL_NAME_MAX] = '\0';
 		ret = btrfs_rm_device(fs_info, vol_args->name, 0);
 	}
-	mutex_unlock(&fs_info->volume_mutex);
 	clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
 
 	if (!ret) {
@@ -2726,9 +2724,7 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg)
 	}
 
 	vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
-	mutex_lock(&fs_info->volume_mutex);
 	ret = btrfs_rm_device(fs_info, vol_args->name, 0);
-	mutex_unlock(&fs_info->volume_mutex);
 
 	if (!ret)
 		btrfs_info(fs_info, "disk deleted %s", vol_args->name);
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 7ea81e1b361f..02b2c9ec6fcd 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -1842,6 +1842,7 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path,
 	u64 num_devices;
 	int ret = 0;
 
+	mutex_lock(&fs_info->volume_mutex);
 	mutex_lock(&uuid_mutex);
 
 	num_devices = fs_info->fs_devices->num_devices;
@@ -1956,6 +1957,7 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path,
 
 out:
 	mutex_unlock(&uuid_mutex);
+	mutex_unlock(&fs_info->volume_mutex);
 	return ret;
 
 error_undo:
-- 
cgit v1.2.3


From 33d85fda13ea2712bb9c9e687176798583d2359c Mon Sep 17 00:00:00 2001
From: Qu Wenruo <wqu@suse.com>
Date: Tue, 31 Oct 2017 14:08:16 +0800
Subject: btrfs: Don't generate UUID for non-fs tree

btrfs_create_tree() will unconditionally generate UUID for any root.
So for quota tree and data reloc tree created by kernel, they will have
unique UUIDs.

However UUID in root item is only referred by UUID tree, which only
records UUID for fs trees.  This makes unique UUIDs for quota/data reloc
tree meaningless.

Leave the UUID as zero for non-fs tree, making btrfs-debug-tree output
less confusing.

Reported-by: Misono Tomohiro <misono.tomohiro@jp.fujitsu.com>
Signed-off-by: Qu Wenruo <wqu@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/disk-io.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index a8ecccfc36de..ec9aa34bb6eb 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1243,7 +1243,7 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
 	struct btrfs_root *root;
 	struct btrfs_key key;
 	int ret = 0;
-	uuid_le uuid;
+	uuid_le uuid = NULL_UUID_LE;
 
 	root = btrfs_alloc_root(fs_info, GFP_KERNEL);
 	if (!root)
@@ -1284,7 +1284,8 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
 	btrfs_set_root_used(&root->root_item, leaf->len);
 	btrfs_set_root_last_snapshot(&root->root_item, 0);
 	btrfs_set_root_dirid(&root->root_item, 0);
-	uuid_le_gen(&uuid);
+	if (is_fstree(objectid))
+		uuid_le_gen(&uuid);
 	memcpy(root->root_item.uuid, uuid.b, BTRFS_UUID_SIZE);
 	root->root_item.drop_level = 0;
 
-- 
cgit v1.2.3


From c74a0b0237c47806e43c1dc6f875c88fb9ee2525 Mon Sep 17 00:00:00 2001
From: Anand Jain <anand.jain@oracle.com>
Date: Mon, 6 Nov 2017 16:36:15 +0800
Subject: btrfs: rename btrfs_add_device to btrfs_add_dev_item

Function btrfs_add_device() is adding the device item so rename to
reflect that in the function. Similarly we have btrfs_rm_dev_item().

Signed-off-by: Anand Jain <anand.jain@oracle.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/volumes.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 02b2c9ec6fcd..9d2af9f53a65 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -1645,7 +1645,7 @@ error:
  * the device information is stored in the chunk root
  * the btrfs_device struct should be fully filled in
  */
-static int btrfs_add_device(struct btrfs_trans_handle *trans,
+static int btrfs_add_dev_item(struct btrfs_trans_handle *trans,
 			    struct btrfs_fs_info *fs_info,
 			    struct btrfs_device *device)
 {
@@ -2435,7 +2435,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
 		}
 	}
 
-	ret = btrfs_add_device(trans, fs_info, device);
+	ret = btrfs_add_dev_item(trans, fs_info, device);
 	if (ret) {
 		btrfs_abort_transaction(trans, ret);
 		goto error_sysfs;
-- 
cgit v1.2.3


From f5c29bd9dbd3e90e03ab7697ecc373b49394e62e Mon Sep 17 00:00:00 2001
From: Liu Bo <bo.li.liu@oracle.com>
Date: Thu, 2 Nov 2017 17:21:50 -0600
Subject: Btrfs: add __init macro to btrfs init functions

Adding __init macro gives kernel a hint that this function is only used
during the initialization phase and its memory resources can be freed up
after.

Signed-off-by: Liu Bo <bo.li.liu@oracle.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/compression.h | 2 +-
 fs/btrfs/ctree.h       | 6 +++---
 fs/btrfs/delayed-ref.c | 2 +-
 fs/btrfs/delayed-ref.h | 2 +-
 fs/btrfs/file.c        | 2 +-
 fs/btrfs/inode.c       | 2 +-
 fs/btrfs/super.c       | 4 ++--
 fs/btrfs/sysfs.c       | 2 +-
 8 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h
index 0868cc554f14..6b692903a23c 100644
--- a/fs/btrfs/compression.h
+++ b/fs/btrfs/compression.h
@@ -75,7 +75,7 @@ struct compressed_bio {
 	u32 sums;
 };
 
-void btrfs_init_compress(void);
+void __init btrfs_init_compress(void);
 void btrfs_exit_compress(void);
 
 int btrfs_compress_pages(unsigned int type_level, struct address_space *mapping,
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 13c260b525a1..a56d00311578 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -3197,7 +3197,7 @@ int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc);
 struct inode *btrfs_alloc_inode(struct super_block *sb);
 void btrfs_destroy_inode(struct inode *inode);
 int btrfs_drop_inode(struct inode *inode);
-int btrfs_init_cachep(void);
+int __init btrfs_init_cachep(void);
 void btrfs_destroy_cachep(void);
 long btrfs_ioctl_trans_end(struct file *file);
 struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
@@ -3248,7 +3248,7 @@ ssize_t btrfs_dedupe_file_range(struct file *src_file, u64 loff, u64 olen,
 			   struct file *dst_file, u64 dst_loff);
 
 /* file.c */
-int btrfs_auto_defrag_init(void);
+int __init btrfs_auto_defrag_init(void);
 void btrfs_auto_defrag_exit(void);
 int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
 			   struct btrfs_inode *inode);
@@ -3283,7 +3283,7 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
 			struct btrfs_root *root);
 
 /* sysfs.c */
-int btrfs_init_sysfs(void);
+int __init btrfs_init_sysfs(void);
 void btrfs_exit_sysfs(void);
 int btrfs_sysfs_add_mounted(struct btrfs_fs_info *fs_info);
 void btrfs_sysfs_remove_mounted(struct btrfs_fs_info *fs_info);
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index 83be8f9fd906..a1a40cf382e3 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -937,7 +937,7 @@ void btrfs_delayed_ref_exit(void)
 	kmem_cache_destroy(btrfs_delayed_extent_op_cachep);
 }
 
-int btrfs_delayed_ref_init(void)
+int __init btrfs_delayed_ref_init(void)
 {
 	btrfs_delayed_ref_head_cachep = kmem_cache_create(
 				"btrfs_delayed_ref_head",
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
index a43af432f859..c4f625e5a691 100644
--- a/fs/btrfs/delayed-ref.h
+++ b/fs/btrfs/delayed-ref.h
@@ -203,7 +203,7 @@ extern struct kmem_cache *btrfs_delayed_tree_ref_cachep;
 extern struct kmem_cache *btrfs_delayed_data_ref_cachep;
 extern struct kmem_cache *btrfs_delayed_extent_op_cachep;
 
-int btrfs_delayed_ref_init(void);
+int __init btrfs_delayed_ref_init(void);
 void btrfs_delayed_ref_exit(void);
 
 static inline struct btrfs_delayed_extent_op *
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 559d716221df..d1eba3394660 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -3143,7 +3143,7 @@ void btrfs_auto_defrag_exit(void)
 	kmem_cache_destroy(btrfs_inode_defrag_cachep);
 }
 
-int btrfs_auto_defrag_init(void)
+int __init btrfs_auto_defrag_init(void)
 {
 	btrfs_inode_defrag_cachep = kmem_cache_create("btrfs_inode_defrag",
 					sizeof(struct inode_defrag), 0,
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 2bab93f20328..c8a3e84d7114 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -9569,7 +9569,7 @@ void btrfs_destroy_cachep(void)
 	kmem_cache_destroy(btrfs_free_space_cachep);
 }
 
-int btrfs_init_cachep(void)
+int __init btrfs_init_cachep(void)
 {
 	btrfs_inode_cachep = kmem_cache_create("btrfs_inode",
 			sizeof(struct btrfs_inode), 0,
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 3a4dce153645..7a7abe827ac4 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -2324,7 +2324,7 @@ static struct miscdevice btrfs_misc = {
 MODULE_ALIAS_MISCDEV(BTRFS_MINOR);
 MODULE_ALIAS("devname:btrfs-control");
 
-static int btrfs_interface_init(void)
+static int __init btrfs_interface_init(void)
 {
 	return misc_register(&btrfs_misc);
 }
@@ -2334,7 +2334,7 @@ static void btrfs_interface_exit(void)
 	misc_deregister(&btrfs_misc);
 }
 
-static void btrfs_print_mod_info(void)
+static void __init btrfs_print_mod_info(void)
 {
 	pr_info("Btrfs loaded, crc32c=%s"
 #ifdef CONFIG_BTRFS_DEBUG
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index a28bba801264..a8bafed931f4 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -897,7 +897,7 @@ static int btrfs_init_debugfs(void)
 	return 0;
 }
 
-int btrfs_init_sysfs(void)
+int __init btrfs_init_sysfs(void)
 {
 	int ret;
 
-- 
cgit v1.2.3


From 057aac3e628f38df984b419f7458455bf698eb7e Mon Sep 17 00:00:00 2001
From: Nikolay Borisov <nborisov@suse.com>
Date: Tue, 7 Nov 2017 11:22:54 +0200
Subject: btrfs: Reduce scope of delayed_rsv->lock in may_commit_trans

After commit 996478ca9c460886ac1 ("btrfs: change how we decide to commit
transactions during flushing") there is no need to hold the delayed_rsv
during the percpu_counter_compare call since we get the byte's snapshot
earlier. So hold the lock only while reading delayed_rsv.

Signed-off-by: Nikolay Borisov <nborisov@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/extent-tree.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 2f4328511ac8..41770ee1313e 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -4945,12 +4945,12 @@ static int may_commit_transaction(struct btrfs_fs_info *fs_info,
 		bytes = 0;
 	else
 		bytes -= delayed_rsv->size;
+	spin_unlock(&delayed_rsv->lock);
+
 	if (percpu_counter_compare(&space_info->total_bytes_pinned,
 				   bytes) < 0) {
-		spin_unlock(&delayed_rsv->lock);
 		return -ENOSPC;
 	}
-	spin_unlock(&delayed_rsv->lock);
 
 commit:
 	trans = btrfs_join_transaction(fs_info->extent_root);
-- 
cgit v1.2.3


From 4c274bc67b8104c6aff89b4176395e368e8c231c Mon Sep 17 00:00:00 2001
From: Liu Bo <bo.li.liu@oracle.com>
Date: Wed, 1 Nov 2017 17:19:27 -0600
Subject: Btrfs: document rules about bio async submit

These rules have been hidden in several if-else and are not
straightforward to follow, for example, dio submit hook's nocsum case
has a bug , i.e. doing async submit instead of sync submit, which has
been fixed recently.

This is documenting the rules for reference.

Signed-off-by: Liu Bo <bo.li.liu@oracle.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/inode.c | 17 ++++++++++++++++-
 1 file changed, 16 insertions(+), 1 deletion(-)

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index c8a3e84d7114..f259d89804b2 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1951,7 +1951,21 @@ static blk_status_t __btrfs_submit_bio_done(void *private_data, struct bio *bio,
 
 /*
  * extent_io.c submission hook. This does the right thing for csum calculation
- * on write, or reading the csums from the tree before a read
+ * on write, or reading the csums from the tree before a read.
+ *
+ * Rules about async/sync submit,
+ * a) read:				sync submit
+ *
+ * b) write without checksum:		sync submit
+ *
+ * c) write with checksum:
+ *    c-1) if bio is issued by fsync:	sync submit
+ *         (sync_writers != 0)
+ *
+ *    c-2) if root is reloc root:	sync submit
+ *         (only in case of buffered IO)
+ *
+ *    c-3) otherwise:			async submit
  */
 static blk_status_t btrfs_submit_bio_hook(void *private_data, struct bio *bio,
 				 int mirror_num, unsigned long bio_flags,
@@ -8456,6 +8470,7 @@ __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode, u64 file_offset,
 	bool write = bio_op(bio) == REQ_OP_WRITE;
 	blk_status_t ret;
 
+	/* Check btrfs_submit_bio_hook() for rules about async submit. */
 	if (async_submit)
 		async_submit = !atomic_read(&BTRFS_I(inode)->sync_writers);
 
-- 
cgit v1.2.3


From f06c5965abb79573a69874e2731581c7c336819f Mon Sep 17 00:00:00 2001
From: David Sterba <dsterba@suse.com>
Date: Tue, 6 Jun 2017 17:08:23 +0200
Subject: btrfs: rename device free rcu helper to free_device_rcu

Make it clear that it is an RCU helper, we want to use the name
free_device for a wrapper freeing all device members.

Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/volumes.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 9d2af9f53a65..869048830df9 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -824,7 +824,7 @@ again:
 	mutex_unlock(&uuid_mutex);
 }
 
-static void free_device(struct rcu_head *head)
+static void free_device_rcu(struct rcu_head *head)
 {
 	struct btrfs_device *device;
 
@@ -906,7 +906,7 @@ static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
 				struct btrfs_device, dev_list);
 		list_del(&device->dev_list);
 		btrfs_close_bdev(device);
-		call_rcu(&device->rcu, free_device);
+		call_rcu(&device->rcu, free_device_rcu);
 	}
 
 	WARN_ON(fs_devices->open_devices);
@@ -1938,7 +1938,7 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path,
 		btrfs_scratch_superblocks(device->bdev, device->name->str);
 
 	btrfs_close_bdev(device);
-	call_rcu(&device->rcu, free_device);
+	call_rcu(&device->rcu, free_device_rcu);
 
 	if (cur_devices->open_devices == 0) {
 		struct btrfs_fs_devices *fs_devices;
@@ -2010,7 +2010,7 @@ void btrfs_rm_dev_replace_free_srcdev(struct btrfs_fs_info *fs_info,
 	}
 
 	btrfs_close_bdev(srcdev);
-	call_rcu(&srcdev->rcu, free_device);
+	call_rcu(&srcdev->rcu, free_device_rcu);
 
 	/* if this is no devs we rather delete the fs_devices */
 	if (!fs_devices->num_devices) {
@@ -2069,7 +2069,7 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
 	btrfs_scratch_superblocks(tgtdev->bdev, tgtdev->name->str);
 
 	btrfs_close_bdev(tgtdev);
-	call_rcu(&tgtdev->rcu, free_device);
+	call_rcu(&tgtdev->rcu, free_device_rcu);
 }
 
 static int btrfs_find_device_by_path(struct btrfs_fs_info *fs_info,
-- 
cgit v1.2.3


From 48dae9cf3f95acc4bd51b2324e10f8fd5c302be8 Mon Sep 17 00:00:00 2001
From: David Sterba <dsterba@suse.com>
Date: Mon, 30 Oct 2017 18:10:25 +0100
Subject: btrfs: introduce free_device helper

A helper to free a device and all it's dynamically allocated members,
like the rcu_string name or flush_bio. This is going to replace all
open coded places.

Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/volumes.c | 16 ++++++++++++++--
 1 file changed, 14 insertions(+), 2 deletions(-)

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 869048830df9..6a1186e4ea57 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -180,6 +180,13 @@ static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid)
 	return fs_devs;
 }
 
+static void free_device(struct btrfs_device *device)
+{
+	rcu_string_free(device->name);
+	bio_put(device->flush_bio);
+	kfree(device);
+}
+
 static void free_fs_devices(struct btrfs_fs_devices *fs_devices)
 {
 	struct btrfs_device *device;
@@ -220,6 +227,11 @@ void btrfs_cleanup_fs_uuids(void)
 	}
 }
 
+/*
+ * Returns a pointer to a new btrfs_device on success; ERR_PTR() on error.
+ * Returned struct is not linked onto any lists and must be destroyed using
+ * free_device.
+ */
 static struct btrfs_device *__alloc_device(void)
 {
 	struct btrfs_device *dev;
@@ -6257,8 +6269,8 @@ static struct btrfs_device *add_missing_dev(struct btrfs_fs_devices *fs_devices,
  *		is generated.
  *
  * Return: a pointer to a new &struct btrfs_device on success; ERR_PTR()
- * on error.  Returned struct is not linked onto any lists and can be
- * destroyed with kfree() right away.
+ * on error.  Returned struct is not linked onto any lists and must be
+ * destroyed with free_device.
  */
 struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info,
 					const u64 *devid,
-- 
cgit v1.2.3


From 55de480346cccff9c436f802de499a44b25968a2 Mon Sep 17 00:00:00 2001
From: David Sterba <dsterba@suse.com>
Date: Mon, 30 Oct 2017 18:55:47 +0100
Subject: btrfs: use free_device where opencoded

Reviewed-by: Anand Jain <anand.jain@oracle.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/volumes.c | 39 +++++++++++----------------------------
 1 file changed, 11 insertions(+), 28 deletions(-)

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 6a1186e4ea57..f93994a49321 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -195,9 +195,7 @@ static void free_fs_devices(struct btrfs_fs_devices *fs_devices)
 		device = list_entry(fs_devices->devices.next,
 				    struct btrfs_device, dev_list);
 		list_del(&device->dev_list);
-		rcu_string_free(device->name);
-		bio_put(device->flush_bio);
-		kfree(device);
+		free_device(device);
 	}
 	kfree(fs_devices);
 }
@@ -589,9 +587,7 @@ static void btrfs_free_stale_device(struct btrfs_device *cur_dev)
 			} else {
 				fs_devs->num_devices--;
 				list_del(&dev->dev_list);
-				rcu_string_free(dev->name);
-				bio_put(dev->flush_bio);
-				kfree(dev);
+				free_device(dev);
 			}
 			break;
 		}
@@ -643,8 +639,7 @@ static noinline int device_list_add(const char *path,
 
 		name = rcu_string_strdup(path, GFP_NOFS);
 		if (!name) {
-			bio_put(device->flush_bio);
-			kfree(device);
+			free_device(device);
 			return -ENOMEM;
 		}
 		rcu_assign_pointer(device->name, name);
@@ -756,8 +751,7 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
 			name = rcu_string_strdup(orig_dev->name->str,
 					GFP_KERNEL);
 			if (!name) {
-				bio_put(device->flush_bio);
-				kfree(device);
+				free_device(device);
 				goto error;
 			}
 			rcu_assign_pointer(device->name, name);
@@ -821,9 +815,7 @@ again:
 		}
 		list_del_init(&device->dev_list);
 		fs_devices->num_devices--;
-		rcu_string_free(device->name);
-		bio_put(device->flush_bio);
-		kfree(device);
+		free_device(device);
 	}
 
 	if (fs_devices->seed) {
@@ -841,9 +833,7 @@ static void free_device_rcu(struct rcu_head *head)
 	struct btrfs_device *device;
 
 	device = container_of(head, struct btrfs_device, rcu);
-	rcu_string_free(device->name);
-	bio_put(device->flush_bio);
-	kfree(device);
+	free_device(device);
 }
 
 static void btrfs_close_bdev(struct btrfs_device *device)
@@ -2355,8 +2345,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
 
 	name = rcu_string_strdup(device_path, GFP_KERNEL);
 	if (!name) {
-		bio_put(device->flush_bio);
-		kfree(device);
+		free_device(device);
 		ret = -ENOMEM;
 		goto error;
 	}
@@ -2364,9 +2353,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
 
 	trans = btrfs_start_transaction(root, 0);
 	if (IS_ERR(trans)) {
-		rcu_string_free(device->name);
-		bio_put(device->flush_bio);
-		kfree(device);
+		free_device(device);
 		ret = PTR_ERR(trans);
 		goto error;
 	}
@@ -2508,9 +2495,7 @@ error_trans:
 		sb->s_flags |= SB_RDONLY;
 	if (trans)
 		btrfs_end_transaction(trans);
-	rcu_string_free(device->name);
-	bio_put(device->flush_bio);
-	kfree(device);
+	free_device(device);
 error:
 	blkdev_put(bdev, FMODE_EXCL);
 	if (seeding_dev && !unlocked) {
@@ -2576,8 +2561,7 @@ int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
 
 	name = rcu_string_strdup(device_path, GFP_KERNEL);
 	if (!name) {
-		bio_put(device->flush_bio);
-		kfree(device);
+		free_device(device);
 		ret = -ENOMEM;
 		goto error;
 	}
@@ -6293,8 +6277,7 @@ struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info,
 
 		ret = find_next_devid(fs_info, &tmp);
 		if (ret) {
-			bio_put(dev->flush_bio);
-			kfree(dev);
+			free_device(dev);
 			return ERR_PTR(ret);
 		}
 	}
-- 
cgit v1.2.3


From 5c4cf6c91d717b76008b0c8fef7b2947b5da2a4f Mon Sep 17 00:00:00 2001
From: David Sterba <dsterba@suse.com>
Date: Mon, 30 Oct 2017 19:29:46 +0100
Subject: btrfs: simplify exit paths in btrfs_init_new_device

Reviewed-by: Anand Jain <anand.jain@oracle.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/volumes.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index f93994a49321..3fa83ff9b30f 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -2345,17 +2345,15 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
 
 	name = rcu_string_strdup(device_path, GFP_KERNEL);
 	if (!name) {
-		free_device(device);
 		ret = -ENOMEM;
-		goto error;
+		goto error_free_device;
 	}
 	rcu_assign_pointer(device->name, name);
 
 	trans = btrfs_start_transaction(root, 0);
 	if (IS_ERR(trans)) {
-		free_device(device);
 		ret = PTR_ERR(trans);
-		goto error;
+		goto error_free_device;
 	}
 
 	q = bdev_get_queue(bdev);
@@ -2495,6 +2493,7 @@ error_trans:
 		sb->s_flags |= SB_RDONLY;
 	if (trans)
 		btrfs_end_transaction(trans);
+error_free_device:
 	free_device(device);
 error:
 	blkdev_put(bdev, FMODE_EXCL);
-- 
cgit v1.2.3


From 9c6b1c4de1c64cd35d029ed6e0428d0a57fa632b Mon Sep 17 00:00:00 2001
From: David Sterba <dsterba@suse.com>
Date: Fri, 16 Jun 2017 22:30:00 +0200
Subject: btrfs: document device locking

Overview of the main locks protecting various device-related structures.

Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/volumes.c | 65 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 65 insertions(+)

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 3fa83ff9b30f..372f515808d1 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -145,6 +145,71 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
 			     struct btrfs_bio **bbio_ret,
 			     int mirror_num, int need_raid_map);
 
+/*
+ * Device locking
+ * ==============
+ *
+ * There are several mutexes that protect manipulation of devices and low-level
+ * structures like chunks but not block groups, extents or files
+ *
+ * uuid_mutex (global lock)
+ * ------------------------
+ * protects the fs_uuids list that tracks all per-fs fs_devices, resulting from
+ * the SCAN_DEV ioctl registration or from mount either implicitly (the first
+ * device) or requested by the device= mount option
+ *
+ * the mutex can be very coarse and can cover long-running operations
+ *
+ * protects: updates to fs_devices counters like missing devices, rw devices,
+ * seeding, structure cloning, openning/closing devices at mount/umount time
+ *
+ * global::fs_devs - add, remove, updates to the global list
+ *
+ * does not protect: manipulation of the fs_devices::devices list!
+ *
+ * btrfs_device::name - renames (write side), read is RCU
+ *
+ * fs_devices::device_list_mutex (per-fs, with RCU)
+ * ------------------------------------------------
+ * protects updates to fs_devices::devices, ie. adding and deleting
+ *
+ * simple list traversal with read-only actions can be done with RCU protection
+ *
+ * may be used to exclude some operations from running concurrently without any
+ * modifications to the list (see write_all_supers)
+ *
+ * volume_mutex
+ * ------------
+ * coarse lock owned by a mounted filesystem; used to exclude some operations
+ * that cannot run in parallel and affect the higher-level properties of the
+ * filesystem like: device add/deleting/resize/replace, or balance
+ *
+ * balance_mutex
+ * -------------
+ * protects balance structures (status, state) and context accessed from
+ * several places (internally, ioctl)
+ *
+ * chunk_mutex
+ * -----------
+ * protects chunks, adding or removing during allocation, trim or when a new
+ * device is added/removed
+ *
+ * cleaner_mutex
+ * -------------
+ * a big lock that is held by the cleaner thread and prevents running subvolume
+ * cleaning together with relocation or delayed iputs
+ *
+ *
+ * Lock nesting
+ * ============
+ *
+ * uuid_mutex
+ *   volume_mutex
+ *     device_list_mutex
+ *       chunk_mutex
+ *     balance_mutex
+ */
+
 DEFINE_MUTEX(uuid_mutex);
 static LIST_HEAD(fs_uuids);
 struct list_head *btrfs_get_fs_uuids(void)
-- 
cgit v1.2.3


From 08ffcae8c92e7154ba5fb78c926463a21cba6501 Mon Sep 17 00:00:00 2001
From: David Sterba <dsterba@suse.com>
Date: Mon, 19 Jun 2017 16:55:35 +0200
Subject: btrfs: simplify btrfs_close_bdev

Split the conditions a bit.

Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/volumes.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 372f515808d1..96c6c0fa1531 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -903,13 +903,15 @@ static void free_device_rcu(struct rcu_head *head)
 
 static void btrfs_close_bdev(struct btrfs_device *device)
 {
-	if (device->bdev && device->writeable) {
+	if (!device->bdev)
+		return;
+
+	if (device->writeable) {
 		sync_blockdev(device->bdev);
 		invalidate_bdev(device->bdev);
 	}
 
-	if (device->bdev)
-		blkdev_put(device->bdev, device->mode);
+	blkdev_put(device->bdev, device->mode);
 }
 
 static void btrfs_prepare_close_one_device(struct btrfs_device *device)
-- 
cgit v1.2.3


From c5593ca3c8a82eb397804e22a34d2c7061b55f56 Mon Sep 17 00:00:00 2001
From: David Sterba <dsterba@suse.com>
Date: Fri, 16 Jun 2017 00:09:21 +0200
Subject: btrfs: switch to RCU for device traversal in btrfs_ioctl_dev_info

We don't need to use the mutex as we do not modify the devices nor the
list itself and just read some information:

does not change during device lifetime:
- devid
- uuid
- name (ie. the path)

may change in parallel to the ioctl call, but can lead only to reporting
inacurracy:
- bytes_used
- total_bytes

Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/ioctl.c | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 64b947487ca6..f1884c075cd3 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -2775,7 +2775,6 @@ static long btrfs_ioctl_dev_info(struct btrfs_fs_info *fs_info,
 {
 	struct btrfs_ioctl_dev_info_args *di_args;
 	struct btrfs_device *dev;
-	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
 	int ret = 0;
 	char *s_uuid = NULL;
 
@@ -2786,7 +2785,7 @@ static long btrfs_ioctl_dev_info(struct btrfs_fs_info *fs_info,
 	if (!btrfs_is_empty_uuid(di_args->uuid))
 		s_uuid = di_args->uuid;
 
-	mutex_lock(&fs_devices->device_list_mutex);
+	rcu_read_lock();
 	dev = btrfs_find_device(fs_info, di_args->devid, s_uuid, NULL);
 
 	if (!dev) {
@@ -2801,17 +2800,15 @@ static long btrfs_ioctl_dev_info(struct btrfs_fs_info *fs_info,
 	if (dev->name) {
 		struct rcu_string *name;
 
-		rcu_read_lock();
 		name = rcu_dereference(dev->name);
 		strncpy(di_args->path, name->str, sizeof(di_args->path));
-		rcu_read_unlock();
 		di_args->path[sizeof(di_args->path) - 1] = 0;
 	} else {
 		di_args->path[0] = '\0';
 	}
 
 out:
-	mutex_unlock(&fs_devices->device_list_mutex);
+	rcu_read_unlock();
 	if (ret == 0 && copy_to_user(arg, di_args, sizeof(*di_args)))
 		ret = -EFAULT;
 
-- 
cgit v1.2.3


From d03262c75da97503d026516456da41f38aa82067 Mon Sep 17 00:00:00 2001
From: David Sterba <dsterba@suse.com>
Date: Fri, 16 Jun 2017 00:09:21 +0200
Subject: btrfs: switch to RCU for device traversal in btrfs_ioctl_fs_info

We don't need to use the mutex as we do not modify the devices nor the
list itself and just read information about device counts.
Move copying fsid out of the protected section, not applicable to RCU
same as the rest of the retrieved information.

Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/ioctl.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index f1884c075cd3..ab7ff3efcba6 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -2749,16 +2749,16 @@ static long btrfs_ioctl_fs_info(struct btrfs_fs_info *fs_info,
 	if (!fi_args)
 		return -ENOMEM;
 
-	mutex_lock(&fs_devices->device_list_mutex);
+	rcu_read_lock();
 	fi_args->num_devices = fs_devices->num_devices;
-	memcpy(&fi_args->fsid, fs_info->fsid, sizeof(fi_args->fsid));
 
-	list_for_each_entry(device, &fs_devices->devices, dev_list) {
+	list_for_each_entry_rcu(device, &fs_devices->devices, dev_list) {
 		if (device->devid > fi_args->max_id)
 			fi_args->max_id = device->devid;
 	}
-	mutex_unlock(&fs_devices->device_list_mutex);
+	rcu_read_unlock();
 
+	memcpy(&fi_args->fsid, fs_info->fsid, sizeof(fi_args->fsid));
 	fi_args->nodesize = fs_info->nodesize;
 	fi_args->sectorsize = fs_info->sectorsize;
 	fi_args->clone_alignment = fs_info->sectorsize;
-- 
cgit v1.2.3


From 1538e6c52e19175a5f4da48eb88e059d5d12d7a0 Mon Sep 17 00:00:00 2001
From: David Sterba <dsterba@suse.com>
Date: Fri, 16 Jun 2017 00:28:47 +0200
Subject: btrfs: use non-RCU list traversal in write_all_supers callees

We take the fs_devices::device_list_mutex mutex in write_all_supers
which will prevent any add/del changes to the device list. Therefore we
don't need to use the RCU variant list_for_each_entry_rcu in any of the
called functions.

Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/disk-io.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index ec9aa34bb6eb..06c593775b82 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -3395,9 +3395,10 @@ static int barrier_all_devices(struct btrfs_fs_info *info)
 	int errors_wait = 0;
 	blk_status_t ret;
 
+	lockdep_assert_held(&info->fs_devices->device_list_mutex);
 	/* send down all the barriers */
 	head = &info->fs_devices->devices;
-	list_for_each_entry_rcu(dev, head, dev_list) {
+	list_for_each_entry(dev, head, dev_list) {
 		if (dev->missing)
 			continue;
 		if (!dev->bdev)
@@ -3410,7 +3411,7 @@ static int barrier_all_devices(struct btrfs_fs_info *info)
 	}
 
 	/* wait for all the barriers */
-	list_for_each_entry_rcu(dev, head, dev_list) {
+	list_for_each_entry(dev, head, dev_list) {
 		if (dev->missing)
 			continue;
 		if (!dev->bdev) {
@@ -3509,7 +3510,7 @@ int write_all_supers(struct btrfs_fs_info *fs_info, int max_mirrors)
 		}
 	}
 
-	list_for_each_entry_rcu(dev, head, dev_list) {
+	list_for_each_entry(dev, head, dev_list) {
 		if (!dev->bdev) {
 			total_errors++;
 			continue;
@@ -3550,7 +3551,7 @@ int write_all_supers(struct btrfs_fs_info *fs_info, int max_mirrors)
 	}
 
 	total_errors = 0;
-	list_for_each_entry_rcu(dev, head, dev_list) {
+	list_for_each_entry(dev, head, dev_list) {
 		if (!dev->bdev)
 			continue;
 		if (!dev->in_fs_metadata || !dev->writeable)
-- 
cgit v1.2.3


From 66b0c887bbf61555fde648587644485388dddb78 Mon Sep 17 00:00:00 2001
From: David Sterba <dsterba@suse.com>
Date: Tue, 31 Oct 2017 16:30:47 +0100
Subject: btrfs: prepare to drop gfp mask parameter from clear_extent_bit

Use __clear_extent_bit directly in case we want to pass unknown
gfp flags. Otherwise all clear_extent_bit callers use GFP_NOFS, so we
can sink them to the function and reduce argument count, at the cost
that __clear_extent_bit has to be exported.

Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/extent_io.c |  6 +++---
 fs/btrfs/extent_io.h | 12 ++++++++----
 2 files changed, 11 insertions(+), 7 deletions(-)

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index c6e82a3e706d..78f7787454f7 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -581,7 +581,7 @@ static void extent_io_tree_panic(struct extent_io_tree *tree, int err)
  *
  * This takes the tree lock, and returns 0 on success and < 0 on error.
  */
-static int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
+int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
 			      unsigned bits, int wake, int delete,
 			      struct extent_state **cached_state,
 			      gfp_t mask, struct extent_changeset *changeset)
@@ -4232,9 +4232,9 @@ static int try_release_extent_state(struct extent_map_tree *map,
 		 * at this point we can safely clear everything except the
 		 * locked bit and the nodatasum bit
 		 */
-		ret = clear_extent_bit(tree, start, end,
+		ret = __clear_extent_bit(tree, start, end,
 				 ~(EXTENT_LOCKED | EXTENT_NODATASUM),
-				 0, 0, NULL, mask);
+				 0, 0, NULL, mask, NULL);
 
 		/* if clear_extent_bit failed for enomem reasons,
 		 * we can't allow the release to continue.
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 93dcae0c3183..2cdfc64a1356 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -301,6 +301,10 @@ int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
 int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
 		     unsigned bits, int wake, int delete,
 		     struct extent_state **cached, gfp_t mask);
+int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
+		     unsigned bits, int wake, int delete,
+		     struct extent_state **cached, gfp_t mask,
+		     struct extent_changeset *changeset);
 
 static inline int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end)
 {
@@ -311,8 +315,8 @@ static inline int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end)
 static inline int unlock_extent_cached(struct extent_io_tree *tree, u64 start,
 		u64 end, struct extent_state **cached, gfp_t mask)
 {
-	return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, cached,
-				mask);
+	return __clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, cached,
+				mask, NULL);
 }
 
 static inline int clear_extent_bits(struct extent_io_tree *tree, u64 start,
@@ -342,8 +346,8 @@ static inline int set_extent_bits(struct extent_io_tree *tree, u64 start,
 static inline int clear_extent_uptodate(struct extent_io_tree *tree, u64 start,
 		u64 end, struct extent_state **cached_state, gfp_t mask)
 {
-	return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0,
-				cached_state, mask);
+	return __clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0,
+				cached_state, mask, NULL);
 }
 
 static inline int set_extent_dirty(struct extent_io_tree *tree, u64 start,
-- 
cgit v1.2.3


From ae0f162534e98afccc7d055cfaa3d3e920a928f0 Mon Sep 17 00:00:00 2001
From: David Sterba <dsterba@suse.com>
Date: Tue, 31 Oct 2017 16:37:52 +0100
Subject: btrfs: sink gfp parameter to clear_extent_bit

All callers use GFP_NOFS, we don't have to pass it as an argument. The
built-in tests pass GFP_KERNEL, but they run only at module load time
and NOFS works there as well.

Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/extent_io.c         | 10 +++++-----
 fs/btrfs/extent_io.h         | 10 ++++------
 fs/btrfs/file.c              |  2 +-
 fs/btrfs/free-space-cache.c  |  8 +++-----
 fs/btrfs/inode.c             | 22 ++++++++++------------
 fs/btrfs/ioctl.c             |  2 +-
 fs/btrfs/qgroup.c            |  3 +--
 fs/btrfs/tests/inode-tests.c | 12 ++++--------
 fs/btrfs/transaction.c       |  3 +--
 9 files changed, 30 insertions(+), 42 deletions(-)

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 78f7787454f7..88131863d827 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -1295,10 +1295,10 @@ int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
 
 int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
 		     unsigned bits, int wake, int delete,
-		     struct extent_state **cached, gfp_t mask)
+		     struct extent_state **cached)
 {
 	return __clear_extent_bit(tree, start, end, bits, wake, delete,
-				  cached, mask, NULL);
+				  cached, GFP_NOFS, NULL);
 }
 
 int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
@@ -1348,7 +1348,7 @@ int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end)
 	if (err == -EEXIST) {
 		if (failed_start > start)
 			clear_extent_bit(tree, start, failed_start - 1,
-					 EXTENT_LOCKED, 1, 0, NULL, GFP_NOFS);
+					 EXTENT_LOCKED, 1, 0, NULL);
 		return 0;
 	}
 	return 1;
@@ -1744,7 +1744,7 @@ void extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end,
 				 unsigned long page_ops)
 {
 	clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, clear_bits, 1, 0,
-			 NULL, GFP_NOFS);
+			 NULL);
 
 	__process_pages_contig(inode->i_mapping, locked_page,
 			       start >> PAGE_SHIFT, end >> PAGE_SHIFT,
@@ -4207,7 +4207,7 @@ int extent_invalidatepage(struct extent_io_tree *tree,
 	clear_extent_bit(tree, start, end,
 			 EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC |
 			 EXTENT_DO_ACCOUNTING,
-			 1, 1, &cached_state, GFP_NOFS);
+			 1, 1, &cached_state);
 	return 0;
 }
 
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 2cdfc64a1356..b679309545d8 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -300,7 +300,7 @@ int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
 		unsigned bits, struct extent_changeset *changeset);
 int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
 		     unsigned bits, int wake, int delete,
-		     struct extent_state **cached, gfp_t mask);
+		     struct extent_state **cached);
 int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
 		     unsigned bits, int wake, int delete,
 		     struct extent_state **cached, gfp_t mask,
@@ -308,8 +308,7 @@ int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
 
 static inline int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end)
 {
-	return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, NULL,
-				GFP_NOFS);
+	return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, NULL);
 }
 
 static inline int unlock_extent_cached(struct extent_io_tree *tree, u64 start,
@@ -327,8 +326,7 @@ static inline int clear_extent_bits(struct extent_io_tree *tree, u64 start,
 	if (bits & EXTENT_LOCKED)
 		wake = 1;
 
-	return clear_extent_bit(tree, start, end, bits, wake, 0, NULL,
-			GFP_NOFS);
+	return clear_extent_bit(tree, start, end, bits, wake, 0, NULL);
 }
 
 int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
@@ -362,7 +360,7 @@ static inline int clear_extent_dirty(struct extent_io_tree *tree, u64 start,
 {
 	return clear_extent_bit(tree, start, end,
 				EXTENT_DIRTY | EXTENT_DELALLOC |
-				EXTENT_DO_ACCOUNTING, 0, 0, NULL, GFP_NOFS);
+				EXTENT_DO_ACCOUNTING, 0, 0, NULL);
 }
 
 int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index d1eba3394660..b85b6d7d0ccd 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1519,7 +1519,7 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages,
 		clear_extent_bit(&inode->io_tree, start_pos, last_pos,
 				 EXTENT_DIRTY | EXTENT_DELALLOC |
 				 EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
-				 0, 0, cached_state, GFP_NOFS);
+				 0, 0, cached_state);
 		*lockstart = start_pos;
 		*lockend = last_pos;
 		ret = 1;
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 4426d1c73e50..b8ab90c9a9fb 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -993,8 +993,7 @@ update_cache_item(struct btrfs_trans_handle *trans,
 	ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
 	if (ret < 0) {
 		clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1,
-				 EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, NULL,
-				 GFP_NOFS);
+				 EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, NULL);
 		goto fail;
 	}
 	leaf = path->nodes[0];
@@ -1008,7 +1007,7 @@ update_cache_item(struct btrfs_trans_handle *trans,
 			clear_extent_bit(&BTRFS_I(inode)->io_tree, 0,
 					 inode->i_size - 1,
 					 EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0,
-					 NULL, GFP_NOFS);
+					 NULL);
 			btrfs_release_path(path);
 			goto fail;
 		}
@@ -1105,8 +1104,7 @@ static int flush_dirty_cache(struct inode *inode)
 	ret = btrfs_wait_ordered_range(inode, 0, (u64)-1);
 	if (ret)
 		clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1,
-				 EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, NULL,
-				 GFP_NOFS);
+				 EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, NULL);
 
 	return ret;
 }
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index f259d89804b2..ad7c6ddb4cbd 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1203,7 +1203,7 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page,
 	u64 cur_end;
 
 	clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, EXTENT_LOCKED,
-			 1, 0, NULL, GFP_NOFS);
+			 1, 0, NULL);
 	while (start < end) {
 		async_cow = kmalloc(sizeof(*async_cow), GFP_NOFS);
 		BUG_ON(!async_cow); /* -ENOMEM */
@@ -3000,7 +3000,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
 
 		clear_extent_bit(io_tree, ordered_extent->file_offset,
 			ordered_extent->file_offset + ordered_extent->len - 1,
-			EXTENT_DEFRAG, 0, 0, &cached_state, GFP_NOFS);
+			EXTENT_DEFRAG, 0, 0, &cached_state);
 	}
 
 	if (nolock)
@@ -3070,7 +3070,7 @@ out:
 				 ordered_extent->len - 1,
 				 clear_bits,
 				 (clear_bits & EXTENT_LOCKED) ? 1 : 0,
-				 0, &cached_state, GFP_NOFS);
+				 0, &cached_state);
 	}
 
 	if (trans)
@@ -4812,7 +4812,7 @@ again:
 	clear_extent_bit(&BTRFS_I(inode)->io_tree, block_start, block_end,
 			  EXTENT_DIRTY | EXTENT_DELALLOC |
 			  EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
-			  0, 0, &cached_state, GFP_NOFS);
+			  0, 0, &cached_state);
 
 	ret = btrfs_set_extent_delalloc(inode, block_start, block_end, 0,
 					&cached_state, 0);
@@ -5248,8 +5248,7 @@ static void evict_inode_truncate_pages(struct inode *inode)
 		clear_extent_bit(io_tree, start, end,
 				 EXTENT_LOCKED | EXTENT_DIRTY |
 				 EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
-				 EXTENT_DEFRAG, 1, 1,
-				 &cached_state, GFP_NOFS);
+				 EXTENT_DEFRAG, 1, 1, &cached_state);
 
 		cond_resched();
 		spin_lock(&io_tree->lock);
@@ -7936,7 +7935,7 @@ unlock:
 	if (lockstart < lockend) {
 		clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
 				 lockend, unlock_bits, 1, 0,
-				 &cached_state, GFP_NOFS);
+				 &cached_state);
 	} else {
 		free_extent_state(cached_state);
 	}
@@ -7947,7 +7946,7 @@ unlock:
 
 unlock_err:
 	clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend,
-			 unlock_bits, 1, 0, &cached_state, GFP_NOFS);
+			 unlock_bits, 1, 0, &cached_state);
 err:
 	if (dio_data)
 		current->journal_info = dio_data;
@@ -8989,8 +8988,7 @@ again:
 					 EXTENT_DIRTY | EXTENT_DELALLOC |
 					 EXTENT_DELALLOC_NEW |
 					 EXTENT_LOCKED | EXTENT_DO_ACCOUNTING |
-					 EXTENT_DEFRAG, 1, 0, &cached_state,
-					 GFP_NOFS);
+					 EXTENT_DEFRAG, 1, 0, &cached_state);
 		/*
 		 * whoever cleared the private bit is responsible
 		 * for the finish_ordered_io
@@ -9047,7 +9045,7 @@ again:
 				 EXTENT_LOCKED | EXTENT_DIRTY |
 				 EXTENT_DELALLOC | EXTENT_DELALLOC_NEW |
 				 EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 1, 1,
-				 &cached_state, GFP_NOFS);
+				 &cached_state);
 
 		__btrfs_releasepage(page, GFP_NOFS);
 	}
@@ -9175,7 +9173,7 @@ again:
 	clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, end,
 			  EXTENT_DIRTY | EXTENT_DELALLOC |
 			  EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
-			  0, 0, &cached_state, GFP_NOFS);
+			  0, 0, &cached_state);
 
 	ret = btrfs_set_extent_delalloc(inode, page_start, end, 0,
 					&cached_state, 0);
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index ab7ff3efcba6..e7f37c46d6a9 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -1190,7 +1190,7 @@ again:
 	clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start,
 			  page_end - 1, EXTENT_DIRTY | EXTENT_DELALLOC |
 			  EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 0, 0,
-			  &cached_state, GFP_NOFS);
+			  &cached_state);
 
 	if (i_done != page_cnt) {
 		spin_lock(&BTRFS_I(inode)->lock);
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 168fd03ca3ac..9e61dd624f7b 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -2883,8 +2883,7 @@ cleanup:
 	ULIST_ITER_INIT(&uiter);
 	while ((unode = ulist_next(&reserved->range_changed, &uiter)))
 		clear_extent_bit(&BTRFS_I(inode)->io_tree, unode->val,
-				 unode->aux, EXTENT_QGROUP_RESERVED, 0, 0, NULL,
-				 GFP_NOFS);
+				 unode->aux, EXTENT_QGROUP_RESERVED, 0, 0, NULL);
 	extent_changeset_release(reserved);
 	return ret;
 }
diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c
index 30affb60da51..4a0a60d3275d 100644
--- a/fs/btrfs/tests/inode-tests.c
+++ b/fs/btrfs/tests/inode-tests.c
@@ -1001,8 +1001,7 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize)
 			       BTRFS_MAX_EXTENT_SIZE >> 1,
 			       (BTRFS_MAX_EXTENT_SIZE >> 1) + sectorsize - 1,
 			       EXTENT_DELALLOC | EXTENT_DIRTY |
-			       EXTENT_UPTODATE, 0, 0,
-			       NULL, GFP_KERNEL);
+			       EXTENT_UPTODATE, 0, 0, NULL);
 	if (ret) {
 		test_msg("clear_extent_bit returned %d\n", ret);
 		goto out;
@@ -1070,8 +1069,7 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize)
 			       BTRFS_MAX_EXTENT_SIZE + sectorsize,
 			       BTRFS_MAX_EXTENT_SIZE + 2 * sectorsize - 1,
 			       EXTENT_DIRTY | EXTENT_DELALLOC |
-			       EXTENT_UPTODATE, 0, 0,
-			       NULL, GFP_KERNEL);
+			       EXTENT_UPTODATE, 0, 0, NULL);
 	if (ret) {
 		test_msg("clear_extent_bit returned %d\n", ret);
 		goto out;
@@ -1104,8 +1102,7 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize)
 	/* Empty */
 	ret = clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, (u64)-1,
 			       EXTENT_DIRTY | EXTENT_DELALLOC |
-			       EXTENT_UPTODATE, 0, 0,
-			       NULL, GFP_KERNEL);
+			       EXTENT_UPTODATE, 0, 0, NULL);
 	if (ret) {
 		test_msg("clear_extent_bit returned %d\n", ret);
 		goto out;
@@ -1121,8 +1118,7 @@ out:
 	if (ret)
 		clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, (u64)-1,
 				 EXTENT_DIRTY | EXTENT_DELALLOC |
-				 EXTENT_UPTODATE, 0, 0,
-				 NULL, GFP_KERNEL);
+				 EXTENT_UPTODATE, 0, 0, NULL);
 	iput(inode);
 	btrfs_free_dummy_root(root);
 	btrfs_free_dummy_fs_info(fs_info);
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 5a8c2649af2f..dac688c696c3 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -1016,8 +1016,7 @@ static int __btrfs_wait_marked_extents(struct btrfs_fs_info *fs_info,
 		 * it's safe to do it (through clear_btree_io_tree()).
 		 */
 		err = clear_extent_bit(dirty_pages, start, end,
-				       EXTENT_NEED_WAIT,
-				       0, 0, &cached_state, GFP_NOFS);
+				       EXTENT_NEED_WAIT, 0, 0, &cached_state);
 		if (err == -ENOMEM)
 			err = 0;
 		if (!err)
-- 
cgit v1.2.3


From f08dc36f781af622be5398ac3ab2ec9c3749889d Mon Sep 17 00:00:00 2001
From: David Sterba <dsterba@suse.com>
Date: Tue, 31 Oct 2017 17:02:39 +0100
Subject: btrfs: sink gfp parameter to clear_extent_uptodate

There's only one callsite with GFP_NOFS.

Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/extent_io.h | 4 ++--
 fs/btrfs/inode.c     | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index b679309545d8..9f6a5133ce8a 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -342,10 +342,10 @@ static inline int set_extent_bits(struct extent_io_tree *tree, u64 start,
 }
 
 static inline int clear_extent_uptodate(struct extent_io_tree *tree, u64 start,
-		u64 end, struct extent_state **cached_state, gfp_t mask)
+		u64 end, struct extent_state **cached_state)
 {
 	return __clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0,
-				cached_state, mask, NULL);
+				cached_state, GFP_NOFS, NULL);
 }
 
 static inline int set_extent_dirty(struct extent_io_tree *tree, u64 start,
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index ad7c6ddb4cbd..f9efaecfc9d1 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -3084,7 +3084,7 @@ out:
 		else
 			start = ordered_extent->file_offset;
 		end = ordered_extent->file_offset + ordered_extent->len - 1;
-		clear_extent_uptodate(io_tree, start, end, NULL, GFP_NOFS);
+		clear_extent_uptodate(io_tree, start, end, NULL);
 
 		/* Drop the cache for the part of the extent we didn't write. */
 		btrfs_drop_extent_cache(BTRFS_I(inode), start, end, 0);
-- 
cgit v1.2.3


From 712e36c5f2a7fa561aa3876b0e466df072aa6a1e Mon Sep 17 00:00:00 2001
From: David Sterba <dsterba@suse.com>
Date: Tue, 31 Oct 2017 17:08:27 +0100
Subject: btrfs: use GFP_KERNEL in btrfs_alloc_inode

This callback is called directly from VFS, no locks are held at the
allocation time.

Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/inode.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index f9efaecfc9d1..2e92c582cda0 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -9430,7 +9430,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
 	struct btrfs_inode *ei;
 	struct inode *inode;
 
-	ei = kmem_cache_alloc(btrfs_inode_cachep, GFP_NOFS);
+	ei = kmem_cache_alloc(btrfs_inode_cachep, GFP_KERNEL);
 	if (!ei)
 		return NULL;
 
-- 
cgit v1.2.3


From ad7b0368f33cffe67fecd302028915926e50ef7e Mon Sep 17 00:00:00 2001
From: Qu Wenruo <wqu@suse.com>
Date: Wed, 8 Nov 2017 08:54:25 +0800
Subject: btrfs: tree-checker: Add checker for dir item

Add checker for dir item, for key types DIR_ITEM, DIR_INDEX and
XATTR_ITEM.

This checker does comprehensive checks for:

1) dir_item header and its data size
   Against item boundary and maximum name/xattr length.
   This part is mostly the same as old verify_dir_item().

2) dir_type
   Against maximum file types, and against key type.
   Since XATTR key should only have FT_XATTR dir item, and normal dir
   item type should not have XATTR key.

   The check between key->type and dir_type is newly introduced by this
   patch.

3) name hash
   For XATTR and DIR_ITEM key, key->offset is name hash (crc32c).
   Check the hash of the name against the key to ensure it's correct.

   The name hash check is only found in btrfs-progs before this patch.

Signed-off-by: Qu Wenruo <wqu@suse.com>
Reviewed-by: Nikolay Borisov <nborisov@suse.com>
Reviewed-by: Su Yue <suy.fnst@cn.fujitsu.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/tree-checker.c | 141 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 141 insertions(+)

diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c
index ce4ed6ec8f39..66dac0a4b01f 100644
--- a/fs/btrfs/tree-checker.c
+++ b/fs/btrfs/tree-checker.c
@@ -30,6 +30,7 @@
 #include "tree-checker.h"
 #include "disk-io.h"
 #include "compression.h"
+#include "hash.h"
 
 /*
  * Error message should follow the following format:
@@ -222,6 +223,141 @@ static int check_csum_item(struct btrfs_root *root, struct extent_buffer *leaf,
 	return 0;
 }
 
+/*
+ * Customized reported for dir_item, only important new info is key->objectid,
+ * which represents inode number
+ */
+__printf(4, 5)
+static void dir_item_err(const struct btrfs_root *root,
+			 const struct extent_buffer *eb, int slot,
+			 const char *fmt, ...)
+{
+	struct btrfs_key key;
+	struct va_format vaf;
+	va_list args;
+
+	btrfs_item_key_to_cpu(eb, &key, slot);
+	va_start(args, fmt);
+
+	vaf.fmt = fmt;
+	vaf.va = &args;
+
+	btrfs_crit(root->fs_info,
+	"corrupt %s: root=%llu block=%llu slot=%d ino=%llu, %pV",
+		btrfs_header_level(eb) == 0 ? "leaf" : "node", root->objectid,
+		btrfs_header_bytenr(eb), slot, key.objectid, &vaf);
+	va_end(args);
+}
+
+static int check_dir_item(struct btrfs_root *root,
+			  struct extent_buffer *leaf,
+			  struct btrfs_key *key, int slot)
+{
+	struct btrfs_dir_item *di;
+	u32 item_size = btrfs_item_size_nr(leaf, slot);
+	u32 cur = 0;
+
+	di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item);
+	while (cur < item_size) {
+		char namebuf[max(BTRFS_NAME_LEN, XATTR_NAME_MAX)];
+		u32 name_len;
+		u32 data_len;
+		u32 max_name_len;
+		u32 total_size;
+		u32 name_hash;
+		u8 dir_type;
+
+		/* header itself should not cross item boundary */
+		if (cur + sizeof(*di) > item_size) {
+			dir_item_err(root, leaf, slot,
+		"dir item header crosses item boundary, have %lu boundary %u",
+				cur + sizeof(*di), item_size);
+			return -EUCLEAN;
+		}
+
+		/* dir type check */
+		dir_type = btrfs_dir_type(leaf, di);
+		if (dir_type >= BTRFS_FT_MAX) {
+			dir_item_err(root, leaf, slot,
+			"invalid dir item type, have %u expect [0, %u)",
+				dir_type, BTRFS_FT_MAX);
+			return -EUCLEAN;
+		}
+
+		if (key->type == BTRFS_XATTR_ITEM_KEY &&
+		    dir_type != BTRFS_FT_XATTR) {
+			dir_item_err(root, leaf, slot,
+		"invalid dir item type for XATTR key, have %u expect %u",
+				dir_type, BTRFS_FT_XATTR);
+			return -EUCLEAN;
+		}
+		if (dir_type == BTRFS_FT_XATTR &&
+		    key->type != BTRFS_XATTR_ITEM_KEY) {
+			dir_item_err(root, leaf, slot,
+			"xattr dir type found for non-XATTR key");
+			return -EUCLEAN;
+		}
+		if (dir_type == BTRFS_FT_XATTR)
+			max_name_len = XATTR_NAME_MAX;
+		else
+			max_name_len = BTRFS_NAME_LEN;
+
+		/* Name/data length check */
+		name_len = btrfs_dir_name_len(leaf, di);
+		data_len = btrfs_dir_data_len(leaf, di);
+		if (name_len > max_name_len) {
+			dir_item_err(root, leaf, slot,
+			"dir item name len too long, have %u max %u",
+				name_len, max_name_len);
+			return -EUCLEAN;
+		}
+		if (name_len + data_len > BTRFS_MAX_XATTR_SIZE(root->fs_info)) {
+			dir_item_err(root, leaf, slot,
+			"dir item name and data len too long, have %u max %u",
+				name_len + data_len,
+				BTRFS_MAX_XATTR_SIZE(root->fs_info));
+			return -EUCLEAN;
+		}
+
+		if (data_len && dir_type != BTRFS_FT_XATTR) {
+			dir_item_err(root, leaf, slot,
+			"dir item with invalid data len, have %u expect 0",
+				data_len);
+			return -EUCLEAN;
+		}
+
+		total_size = sizeof(*di) + name_len + data_len;
+
+		/* header and name/data should not cross item boundary */
+		if (cur + total_size > item_size) {
+			dir_item_err(root, leaf, slot,
+		"dir item data crosses item boundary, have %u boundary %u",
+				cur + total_size, item_size);
+			return -EUCLEAN;
+		}
+
+		/*
+		 * Special check for XATTR/DIR_ITEM, as key->offset is name
+		 * hash, should match its name
+		 */
+		if (key->type == BTRFS_DIR_ITEM_KEY ||
+		    key->type == BTRFS_XATTR_ITEM_KEY) {
+			read_extent_buffer(leaf, namebuf,
+					(unsigned long)(di + 1), name_len);
+			name_hash = btrfs_name_hash(namebuf, name_len);
+			if (key->offset != name_hash) {
+				dir_item_err(root, leaf, slot,
+		"name hash mismatch with key, have 0x%016x expect 0x%016llx",
+					name_hash, key->offset);
+				return -EUCLEAN;
+			}
+		}
+		cur += total_size;
+		di = (struct btrfs_dir_item *)((void *)di + total_size);
+	}
+	return 0;
+}
+
 /*
  * Common point to switch the item-specific validation.
  */
@@ -238,6 +374,11 @@ static int check_leaf_item(struct btrfs_root *root,
 	case BTRFS_EXTENT_CSUM_KEY:
 		ret = check_csum_item(root, leaf, key, slot);
 		break;
+	case BTRFS_DIR_ITEM_KEY:
+	case BTRFS_DIR_INDEX_KEY:
+	case BTRFS_XATTR_ITEM_KEY:
+		ret = check_dir_item(root, leaf, key, slot);
+		break;
 	}
 	return ret;
 }
-- 
cgit v1.2.3


From bae15d95e247f94ceb32caaf13d1d71ecbfc8735 Mon Sep 17 00:00:00 2001
From: Qu Wenruo <wqu@suse.com>
Date: Wed, 8 Nov 2017 08:54:26 +0800
Subject: btrfs: Cleanup existing name_len checks

Since tree-checker has verified leaf when reading from disk, we don't
need the existing verify_dir_item() or btrfs_is_name_len_valid() checks.

Signed-off-by: Qu Wenruo <wqu@suse.com>
Reviewed-by: Nikolay Borisov <nborisov@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/ctree.h     |   5 ---
 fs/btrfs/dir-item.c  | 108 ---------------------------------------------------
 fs/btrfs/export.c    |   5 ---
 fs/btrfs/inode.c     |   4 --
 fs/btrfs/props.c     |   7 ----
 fs/btrfs/root-tree.c |   7 ----
 fs/btrfs/send.c      |   6 ---
 fs/btrfs/tree-log.c  |  47 +++++-----------------
 fs/btrfs/xattr.c     |   6 ---
 9 files changed, 9 insertions(+), 186 deletions(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index a56d00311578..09b72b6996ce 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -3060,15 +3060,10 @@ struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans,
 					  struct btrfs_path *path, u64 dir,
 					  const char *name, u16 name_len,
 					  int mod);
-int verify_dir_item(struct btrfs_fs_info *fs_info,
-		    struct extent_buffer *leaf, int slot,
-		    struct btrfs_dir_item *dir_item);
 struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_fs_info *fs_info,
 						 struct btrfs_path *path,
 						 const char *name,
 						 int name_len);
-bool btrfs_is_name_len_valid(struct extent_buffer *leaf, int slot,
-			     unsigned long start, u16 name_len);
 
 /* orphan.c */
 int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
index 41cb9196eaa8..cbe421605cd5 100644
--- a/fs/btrfs/dir-item.c
+++ b/fs/btrfs/dir-item.c
@@ -403,8 +403,6 @@ struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_fs_info *fs_info,
 			btrfs_dir_data_len(leaf, dir_item);
 		name_ptr = (unsigned long)(dir_item + 1);
 
-		if (verify_dir_item(fs_info, leaf, path->slots[0], dir_item))
-			return NULL;
 		if (btrfs_dir_name_len(leaf, dir_item) == name_len &&
 		    memcmp_extent_buffer(leaf, name, name_ptr, name_len) == 0)
 			return dir_item;
@@ -450,109 +448,3 @@ int btrfs_delete_one_dir_name(struct btrfs_trans_handle *trans,
 	}
 	return ret;
 }
-
-int verify_dir_item(struct btrfs_fs_info *fs_info,
-		    struct extent_buffer *leaf,
-		    int slot,
-		    struct btrfs_dir_item *dir_item)
-{
-	u16 namelen = BTRFS_NAME_LEN;
-	int ret;
-	u8 type = btrfs_dir_type(leaf, dir_item);
-
-	if (type >= BTRFS_FT_MAX) {
-		btrfs_crit(fs_info, "invalid dir item type: %d", (int)type);
-		return 1;
-	}
-
-	if (type == BTRFS_FT_XATTR)
-		namelen = XATTR_NAME_MAX;
-
-	if (btrfs_dir_name_len(leaf, dir_item) > namelen) {
-		btrfs_crit(fs_info, "invalid dir item name len: %u",
-		       (unsigned)btrfs_dir_name_len(leaf, dir_item));
-		return 1;
-	}
-
-	namelen = btrfs_dir_name_len(leaf, dir_item);
-	ret = btrfs_is_name_len_valid(leaf, slot,
-				      (unsigned long)(dir_item + 1), namelen);
-	if (!ret)
-		return 1;
-
-	/* BTRFS_MAX_XATTR_SIZE is the same for all dir items */
-	if ((btrfs_dir_data_len(leaf, dir_item) +
-	     btrfs_dir_name_len(leaf, dir_item)) >
-					BTRFS_MAX_XATTR_SIZE(fs_info)) {
-		btrfs_crit(fs_info, "invalid dir item name + data len: %u + %u",
-			   (unsigned)btrfs_dir_name_len(leaf, dir_item),
-			   (unsigned)btrfs_dir_data_len(leaf, dir_item));
-		return 1;
-	}
-
-	return 0;
-}
-
-bool btrfs_is_name_len_valid(struct extent_buffer *leaf, int slot,
-			     unsigned long start, u16 name_len)
-{
-	struct btrfs_fs_info *fs_info = leaf->fs_info;
-	struct btrfs_key key;
-	u32 read_start;
-	u32 read_end;
-	u32 item_start;
-	u32 item_end;
-	u32 size;
-	bool ret = true;
-
-	ASSERT(start > BTRFS_LEAF_DATA_OFFSET);
-
-	read_start = start - BTRFS_LEAF_DATA_OFFSET;
-	read_end = read_start + name_len;
-	item_start = btrfs_item_offset_nr(leaf, slot);
-	item_end = btrfs_item_end_nr(leaf, slot);
-
-	btrfs_item_key_to_cpu(leaf, &key, slot);
-
-	switch (key.type) {
-	case BTRFS_DIR_ITEM_KEY:
-	case BTRFS_XATTR_ITEM_KEY:
-	case BTRFS_DIR_INDEX_KEY:
-		size = sizeof(struct btrfs_dir_item);
-		break;
-	case BTRFS_INODE_REF_KEY:
-		size = sizeof(struct btrfs_inode_ref);
-		break;
-	case BTRFS_INODE_EXTREF_KEY:
-		size = sizeof(struct btrfs_inode_extref);
-		break;
-	case BTRFS_ROOT_REF_KEY:
-	case BTRFS_ROOT_BACKREF_KEY:
-		size = sizeof(struct btrfs_root_ref);
-		break;
-	default:
-		ret = false;
-		goto out;
-	}
-
-	if (read_start < item_start) {
-		ret = false;
-		goto out;
-	}
-	if (read_end > item_end) {
-		ret = false;
-		goto out;
-	}
-
-	/* there shall be item(s) before name */
-	if (read_start - item_start < size) {
-		ret = false;
-		goto out;
-	}
-
-out:
-	if (!ret)
-		btrfs_crit(fs_info, "invalid dir item name len: %u",
-			   (unsigned int)name_len);
-	return ret;
-}
diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c
index 3aeb5770f896..ddaccad469f8 100644
--- a/fs/btrfs/export.c
+++ b/fs/btrfs/export.c
@@ -283,11 +283,6 @@ static int btrfs_get_name(struct dentry *parent, char *name,
 		name_len = btrfs_inode_ref_name_len(leaf, iref);
 	}
 
-	ret = btrfs_is_name_len_valid(leaf, path->slots[0], name_ptr, name_len);
-	if (!ret) {
-		btrfs_free_path(path);
-		return -EIO;
-	}
 	read_extent_buffer(leaf, name, name_ptr, name_len);
 	btrfs_free_path(path);
 
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 2e92c582cda0..ff91b2e3979a 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -5907,7 +5907,6 @@ static int btrfs_filldir(void *addr, int entries, struct dir_context *ctx)
 static int btrfs_real_readdir(struct file *file, struct dir_context *ctx)
 {
 	struct inode *inode = file_inode(file);
-	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct btrfs_file_private *private = file->private_data;
 	struct btrfs_dir_item *di;
@@ -5975,9 +5974,6 @@ again:
 		if (btrfs_should_delete_dir_index(&del_list, found_key.offset))
 			goto next;
 		di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item);
-		if (verify_dir_item(fs_info, leaf, slot, di))
-			goto next;
-
 		name_len = btrfs_dir_name_len(leaf, di);
 		if ((total_len + sizeof(struct dir_entry) + name_len) >=
 		    PAGE_SIZE) {
diff --git a/fs/btrfs/props.c b/fs/btrfs/props.c
index f6a05f836629..c39a940d0c75 100644
--- a/fs/btrfs/props.c
+++ b/fs/btrfs/props.c
@@ -164,7 +164,6 @@ static int iterate_object_props(struct btrfs_root *root,
 						 size_t),
 				void *ctx)
 {
-	struct btrfs_fs_info *fs_info = root->fs_info;
 	int ret;
 	char *name_buf = NULL;
 	char *value_buf = NULL;
@@ -215,12 +214,6 @@ static int iterate_object_props(struct btrfs_root *root,
 			name_ptr = (unsigned long)(di + 1);
 			data_ptr = name_ptr + name_len;
 
-			if (verify_dir_item(fs_info, leaf,
-					    path->slots[0], di)) {
-				ret = -EIO;
-				goto out;
-			}
-
 			if (name_len <= XATTR_BTRFS_PREFIX_LEN ||
 			    memcmp_extent_buffer(leaf, XATTR_BTRFS_PREFIX,
 						 name_ptr,
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index 3338407ef0f0..aab0194efe46 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -387,13 +387,6 @@ again:
 		WARN_ON(btrfs_root_ref_dirid(leaf, ref) != dirid);
 		WARN_ON(btrfs_root_ref_name_len(leaf, ref) != name_len);
 		ptr = (unsigned long)(ref + 1);
-		ret = btrfs_is_name_len_valid(leaf, path->slots[0], ptr,
-					      name_len);
-		if (!ret) {
-			err = -EIO;
-			goto out;
-		}
-
 		WARN_ON(memcmp_extent_buffer(leaf, name, ptr, name_len));
 		*sequence = btrfs_root_ref_sequence(leaf, ref);
 
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index 20d3300bd268..f306c608dc28 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -1059,12 +1059,6 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path,
 			}
 		}
 
-		ret = btrfs_is_name_len_valid(eb, path->slots[0],
-			  (unsigned long)(di + 1), name_len + data_len);
-		if (!ret) {
-			ret = -EIO;
-			goto out;
-		}
 		if (name_len + data_len > buf_len) {
 			buf_len = name_len + data_len;
 			if (is_vmalloc_addr(buf)) {
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 7bf9b31561db..a806182dfea6 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -1173,19 +1173,15 @@ next:
 	return 0;
 }
 
-static int extref_get_fields(struct extent_buffer *eb, int slot,
-			     unsigned long ref_ptr, u32 *namelen, char **name,
-			     u64 *index, u64 *parent_objectid)
+static int extref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr,
+			     u32 *namelen, char **name, u64 *index,
+			     u64 *parent_objectid)
 {
 	struct btrfs_inode_extref *extref;
 
 	extref = (struct btrfs_inode_extref *)ref_ptr;
 
 	*namelen = btrfs_inode_extref_name_len(eb, extref);
-	if (!btrfs_is_name_len_valid(eb, slot, (unsigned long)&extref->name,
-				     *namelen))
-		return -EIO;
-
 	*name = kmalloc(*namelen, GFP_NOFS);
 	if (*name == NULL)
 		return -ENOMEM;
@@ -1200,19 +1196,14 @@ static int extref_get_fields(struct extent_buffer *eb, int slot,
 	return 0;
 }
 
-static int ref_get_fields(struct extent_buffer *eb, int slot,
-			  unsigned long ref_ptr, u32 *namelen, char **name,
-			  u64 *index)
+static int ref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr,
+			  u32 *namelen, char **name, u64 *index)
 {
 	struct btrfs_inode_ref *ref;
 
 	ref = (struct btrfs_inode_ref *)ref_ptr;
 
 	*namelen = btrfs_inode_ref_name_len(eb, ref);
-	if (!btrfs_is_name_len_valid(eb, slot, (unsigned long)(ref + 1),
-				     *namelen))
-		return -EIO;
-
 	*name = kmalloc(*namelen, GFP_NOFS);
 	if (*name == NULL)
 		return -ENOMEM;
@@ -1287,8 +1278,8 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
 
 	while (ref_ptr < ref_end) {
 		if (log_ref_ver) {
-			ret = extref_get_fields(eb, slot, ref_ptr, &namelen,
-					  &name, &ref_index, &parent_objectid);
+			ret = extref_get_fields(eb, ref_ptr, &namelen, &name,
+						&ref_index, &parent_objectid);
 			/*
 			 * parent object can change from one array
 			 * item to another.
@@ -1300,8 +1291,8 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
 				goto out;
 			}
 		} else {
-			ret = ref_get_fields(eb, slot, ref_ptr, &namelen,
-					     &name, &ref_index);
+			ret = ref_get_fields(eb, ref_ptr, &namelen, &name,
+					     &ref_index);
 		}
 		if (ret)
 			goto out;
@@ -1835,7 +1826,6 @@ static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans,
 					struct extent_buffer *eb, int slot,
 					struct btrfs_key *key)
 {
-	struct btrfs_fs_info *fs_info = root->fs_info;
 	int ret = 0;
 	u32 item_size = btrfs_item_size_nr(eb, slot);
 	struct btrfs_dir_item *di;
@@ -1848,8 +1838,6 @@ static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans,
 	ptr_end = ptr + item_size;
 	while (ptr < ptr_end) {
 		di = (struct btrfs_dir_item *)ptr;
-		if (verify_dir_item(fs_info, eb, slot, di))
-			return -EIO;
 		name_len = btrfs_dir_name_len(eb, di);
 		ret = replay_one_name(trans, root, path, eb, di, key);
 		if (ret < 0)
@@ -2024,11 +2012,6 @@ again:
 	ptr_end = ptr + item_size;
 	while (ptr < ptr_end) {
 		di = (struct btrfs_dir_item *)ptr;
-		if (verify_dir_item(fs_info, eb, slot, di)) {
-			ret = -EIO;
-			goto out;
-		}
-
 		name_len = btrfs_dir_name_len(eb, di);
 		name = kmalloc(name_len, GFP_NOFS);
 		if (!name) {
@@ -2109,7 +2092,6 @@ static int replay_xattr_deletes(struct btrfs_trans_handle *trans,
 			      struct btrfs_path *path,
 			      const u64 ino)
 {
-	struct btrfs_fs_info *fs_info = root->fs_info;
 	struct btrfs_key search_key;
 	struct btrfs_path *log_path;
 	int i;
@@ -2151,11 +2133,6 @@ process_leaf:
 			u32 this_len = sizeof(*di) + name_len + data_len;
 			char *name;
 
-			ret = verify_dir_item(fs_info, path->nodes[0], i, di);
-			if (ret) {
-				ret = -EIO;
-				goto out;
-			}
 			name = kmalloc(name_len, GFP_NOFS);
 			if (!name) {
 				ret = -ENOMEM;
@@ -4572,12 +4549,6 @@ static int btrfs_check_ref_name_override(struct extent_buffer *eb,
 			this_len = sizeof(*extref) + this_name_len;
 		}
 
-		ret = btrfs_is_name_len_valid(eb, slot, name_ptr,
-					      this_name_len);
-		if (!ret) {
-			ret = -EIO;
-			goto out;
-		}
 		if (this_name_len > name_len) {
 			char *new_name;
 
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index 2c7e53f9ff1b..ad298c248da4 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -267,7 +267,6 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size)
 {
 	struct btrfs_key key;
 	struct inode *inode = d_inode(dentry);
-	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct btrfs_path *path;
 	int ret = 0;
@@ -336,11 +335,6 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size)
 			u32 this_len = sizeof(*di) + name_len + data_len;
 			unsigned long name_ptr = (unsigned long)(di + 1);
 
-			if (verify_dir_item(fs_info, leaf, slot, di)) {
-				ret = -EIO;
-				goto err;
-			}
-
 			total_size += name_len + 1;
 			/*
 			 * We are just looking for how big our buffer needs to
-- 
cgit v1.2.3


From 433175992c1775db6cbc7c92294345408a333bee Mon Sep 17 00:00:00 2001
From: David Sterba <dsterba@suse.com>
Date: Fri, 23 Jun 2017 03:46:07 +0200
Subject: btrfs: sink get_extent parameter to extent_writepages

There's only one caller.

Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/extent_io.c | 3 +--
 fs/btrfs/extent_io.h | 1 -
 fs/btrfs/inode.c     | 2 +-
 3 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 88131863d827..1d7aebd020ad 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -4121,14 +4121,13 @@ int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode,
 
 int extent_writepages(struct extent_io_tree *tree,
 		      struct address_space *mapping,
-		      get_extent_t *get_extent,
 		      struct writeback_control *wbc)
 {
 	int ret = 0;
 	struct extent_page_data epd = {
 		.bio = NULL,
 		.tree = tree,
-		.get_extent = get_extent,
+		.get_extent = btrfs_get_extent,
 		.extent_locked = 0,
 		.sync_io = wbc->sync_mode == WB_SYNC_ALL,
 	};
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 9f6a5133ce8a..e07f9e1c34e8 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -411,7 +411,6 @@ int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode,
 			      int mode);
 int extent_writepages(struct extent_io_tree *tree,
 		      struct address_space *mapping,
-		      get_extent_t *get_extent,
 		      struct writeback_control *wbc);
 int btree_write_cache_pages(struct address_space *mapping,
 			    struct writeback_control *wbc);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index ff91b2e3979a..a49dd803fb03 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -8903,7 +8903,7 @@ static int btrfs_writepages(struct address_space *mapping,
 	struct extent_io_tree *tree;
 
 	tree = &BTRFS_I(mapping->host)->io_tree;
-	return extent_writepages(tree, mapping, btrfs_get_extent, wbc);
+	return extent_writepages(tree, mapping, wbc);
 }
 
 static int
-- 
cgit v1.2.3


From 916b929831a92a2a432274cd264311893f22a46d Mon Sep 17 00:00:00 2001
From: David Sterba <dsterba@suse.com>
Date: Fri, 23 Jun 2017 03:47:28 +0200
Subject: btrfs: sink get_extent parameter to extent_write_locked_range

There's only one caller.

Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/extent_io.c | 5 ++---
 fs/btrfs/extent_io.h | 3 +--
 fs/btrfs/inode.c     | 1 -
 3 files changed, 3 insertions(+), 6 deletions(-)

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 1d7aebd020ad..d9a76e33ac3c 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -4077,8 +4077,7 @@ int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
 }
 
 int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode,
-			      u64 start, u64 end, get_extent_t *get_extent,
-			      int mode)
+			      u64 start, u64 end, int mode)
 {
 	int ret = 0;
 	struct address_space *mapping = inode->i_mapping;
@@ -4089,7 +4088,7 @@ int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode,
 	struct extent_page_data epd = {
 		.bio = NULL,
 		.tree = tree,
-		.get_extent = get_extent,
+		.get_extent = btrfs_get_extent,
 		.extent_locked = 1,
 		.sync_io = mode == WB_SYNC_ALL,
 	};
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index e07f9e1c34e8..b67fc0153b73 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -407,8 +407,7 @@ int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
 			  get_extent_t *get_extent,
 			  struct writeback_control *wbc);
 int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode,
-			      u64 start, u64 end, get_extent_t *get_extent,
-			      int mode);
+			      u64 start, u64 end, int mode);
 int extent_writepages(struct extent_io_tree *tree,
 		      struct address_space *mapping,
 		      struct writeback_control *wbc);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index a49dd803fb03..ae62d59edd4b 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -769,7 +769,6 @@ retry:
 						  inode, async_extent->start,
 						  async_extent->start +
 						  async_extent->ram_size - 1,
-						  btrfs_get_extent,
 						  WB_SYNC_ALL);
 			else if (ret)
 				unlock_page(async_cow->locked_page);
-- 
cgit v1.2.3


From deac642d7e0fd83efd3372c4093fe60ac7436db6 Mon Sep 17 00:00:00 2001
From: David Sterba <dsterba@suse.com>
Date: Fri, 23 Jun 2017 03:47:28 +0200
Subject: btrfs: sink get_extent parameter to extent_write_full_page

There's only one caller.

Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/extent_io.c | 3 +--
 fs/btrfs/extent_io.h | 1 -
 fs/btrfs/inode.c     | 2 +-
 3 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index d9a76e33ac3c..7754124b07b8 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -4058,14 +4058,13 @@ static noinline void flush_write_bio(void *data)
 }
 
 int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
-			  get_extent_t *get_extent,
 			  struct writeback_control *wbc)
 {
 	int ret;
 	struct extent_page_data epd = {
 		.bio = NULL,
 		.tree = tree,
-		.get_extent = get_extent,
+		.get_extent = btrfs_get_extent,
 		.extent_locked = 0,
 		.sync_io = wbc->sync_mode == WB_SYNC_ALL,
 	};
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index b67fc0153b73..abe4feea1539 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -404,7 +404,6 @@ int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
 int extent_invalidatepage(struct extent_io_tree *tree,
 			  struct page *page, unsigned long offset);
 int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
-			  get_extent_t *get_extent,
 			  struct writeback_control *wbc);
 int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode,
 			      u64 start, u64 end, int mode);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index ae62d59edd4b..7403ef8856bd 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -8891,7 +8891,7 @@ static int btrfs_writepage(struct page *page, struct writeback_control *wbc)
 		return AOP_WRITEPAGE_ACTIVATE;
 	}
 	tree = &BTRFS_I(page->mapping->host)->io_tree;
-	ret = extent_write_full_page(tree, page, btrfs_get_extent, wbc);
+	ret = extent_write_full_page(tree, page, wbc);
 	btrfs_add_delayed_iput(inode);
 	return ret;
 }
-- 
cgit v1.2.3


From 3c98c62f7a761ab132950cf36795c8001be72b47 Mon Sep 17 00:00:00 2001
From: David Sterba <dsterba@suse.com>
Date: Fri, 23 Jun 2017 04:01:08 +0200
Subject: btrfs: drop get_extent from extent_page_data

Previous patches cleaned up all places where
extent_page_data::get_extent was set and it was btrfs_get_extent all the
time, so we can simply call that instead.

This also reduces size of extent_page_data by 8 bytes which has positive
effect on stack consumption on various functions on the write out path.

Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/extent_io.c | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 7754124b07b8..3ecd3ab73981 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -109,8 +109,6 @@ struct tree_entry {
 struct extent_page_data {
 	struct bio *bio;
 	struct extent_io_tree *tree;
-	get_extent_t *get_extent;
-
 	/* tells writepage not to lock the state bits for this range
 	 * it still does the unlocking
 	 */
@@ -3373,7 +3371,7 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode,
 							 page_end, NULL, 1);
 			break;
 		}
-		em = epd->get_extent(BTRFS_I(inode), page, pg_offset, cur,
+		em = btrfs_get_extent(BTRFS_I(inode), page, pg_offset, cur,
 				     end - cur + 1, 1);
 		if (IS_ERR_OR_NULL(em)) {
 			SetPageError(page);
@@ -4064,7 +4062,6 @@ int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
 	struct extent_page_data epd = {
 		.bio = NULL,
 		.tree = tree,
-		.get_extent = btrfs_get_extent,
 		.extent_locked = 0,
 		.sync_io = wbc->sync_mode == WB_SYNC_ALL,
 	};
@@ -4087,7 +4084,6 @@ int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode,
 	struct extent_page_data epd = {
 		.bio = NULL,
 		.tree = tree,
-		.get_extent = btrfs_get_extent,
 		.extent_locked = 1,
 		.sync_io = mode == WB_SYNC_ALL,
 	};
@@ -4125,7 +4121,6 @@ int extent_writepages(struct extent_io_tree *tree,
 	struct extent_page_data epd = {
 		.bio = NULL,
 		.tree = tree,
-		.get_extent = btrfs_get_extent,
 		.extent_locked = 0,
 		.sync_io = wbc->sync_mode == WB_SYNC_ALL,
 	};
-- 
cgit v1.2.3


From 2135fb9bb4b8d05d288d994c4f9f8077ce90d890 Mon Sep 17 00:00:00 2001
From: David Sterba <dsterba@suse.com>
Date: Fri, 23 Jun 2017 04:09:57 +0200
Subject: btrfs: sink get_extent parameter to extent_fiemap

All callers pass btrfs_get_extent_fiemap and we don't expect anything
else in the context of extent_fiemap.

Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/extent_io.c | 6 +++---
 fs/btrfs/extent_io.h | 2 +-
 fs/btrfs/inode.c     | 2 +-
 3 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 3ecd3ab73981..d62e0194fc3b 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -4441,7 +4441,7 @@ static int emit_last_fiemap_cache(struct btrfs_fs_info *fs_info,
 }
 
 int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
-		__u64 start, __u64 len, get_extent_t *get_extent)
+		__u64 start, __u64 len)
 {
 	int ret = 0;
 	u64 off = start;
@@ -4524,7 +4524,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 			 &cached_state);
 
 	em = get_extent_skip_holes(inode, start, last_for_get_extent,
-				   get_extent);
+				   btrfs_get_extent_fiemap);
 	if (!em)
 		goto out;
 	if (IS_ERR(em)) {
@@ -4613,7 +4613,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 
 		/* now scan forward to see if this is really the last extent. */
 		em = get_extent_skip_holes(inode, off, last_for_get_extent,
-					   get_extent);
+					   btrfs_get_extent_fiemap);
 		if (IS_ERR(em)) {
 			ret = PTR_ERR(em);
 			goto out;
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index abe4feea1539..afc169d04b22 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -417,7 +417,7 @@ int extent_readpages(struct extent_io_tree *tree,
 		     struct list_head *pages, unsigned nr_pages,
 		     get_extent_t get_extent);
 int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
-		__u64 start, __u64 len, get_extent_t *get_extent);
+		__u64 start, __u64 len);
 void set_page_extent_mapped(struct page *page);
 
 struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 7403ef8856bd..63cf6cd174d1 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -8859,7 +8859,7 @@ static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 	if (ret)
 		return ret;
 
-	return extent_fiemap(inode, fieinfo, start, len, btrfs_get_extent_fiemap);
+	return extent_fiemap(inode, fieinfo, start, len);
 }
 
 int btrfs_readpage(struct file *file, struct page *page)
-- 
cgit v1.2.3


From e3350e16eaa61422d37e541e1468ce7476b44a4e Mon Sep 17 00:00:00 2001
From: David Sterba <dsterba@suse.com>
Date: Fri, 23 Jun 2017 04:09:57 +0200
Subject: btrfs: sink get_extent parameter to get_extent_skip_holes

All callers pass btrfs_get_extent_fiemap and get_extent_skip_holes
itself is used only as a fiemap helper.

Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/extent_io.c | 13 +++++--------
 1 file changed, 5 insertions(+), 8 deletions(-)

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index d62e0194fc3b..3f5debc5e56f 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -4292,9 +4292,7 @@ int try_release_extent_mapping(struct extent_map_tree *map,
  * This maps until we find something past 'last'
  */
 static struct extent_map *get_extent_skip_holes(struct inode *inode,
-						u64 offset,
-						u64 last,
-						get_extent_t *get_extent)
+						u64 offset, u64 last)
 {
 	u64 sectorsize = btrfs_inode_sectorsize(inode);
 	struct extent_map *em;
@@ -4308,7 +4306,8 @@ static struct extent_map *get_extent_skip_holes(struct inode *inode,
 		if (len == 0)
 			break;
 		len = ALIGN(len, sectorsize);
-		em = get_extent(BTRFS_I(inode), NULL, 0, offset, len, 0);
+		em = btrfs_get_extent_fiemap(BTRFS_I(inode), NULL, 0, offset,
+				len, 0);
 		if (IS_ERR_OR_NULL(em))
 			return em;
 
@@ -4523,8 +4522,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 	lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len - 1,
 			 &cached_state);
 
-	em = get_extent_skip_holes(inode, start, last_for_get_extent,
-				   btrfs_get_extent_fiemap);
+	em = get_extent_skip_holes(inode, start, last_for_get_extent);
 	if (!em)
 		goto out;
 	if (IS_ERR(em)) {
@@ -4612,8 +4610,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 		}
 
 		/* now scan forward to see if this is really the last extent. */
-		em = get_extent_skip_holes(inode, off, last_for_get_extent,
-					   btrfs_get_extent_fiemap);
+		em = get_extent_skip_holes(inode, off, last_for_get_extent);
 		if (IS_ERR(em)) {
 			ret = PTR_ERR(em);
 			goto out;
-- 
cgit v1.2.3


From 0932584b66e97aea91eb8c0b610e1d1083951b32 Mon Sep 17 00:00:00 2001
From: David Sterba <dsterba@suse.com>
Date: Fri, 23 Jun 2017 04:09:57 +0200
Subject: btrfs: sink get_extent parameter to extent_readpages

There's only one caller that passes btrfs_get_extent.

Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/extent_io.c | 11 +++++------
 fs/btrfs/extent_io.h |  3 +--
 fs/btrfs/inode.c     |  3 +--
 3 files changed, 7 insertions(+), 10 deletions(-)

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 3f5debc5e56f..f86a6aae545a 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -4133,8 +4133,7 @@ int extent_writepages(struct extent_io_tree *tree,
 
 int extent_readpages(struct extent_io_tree *tree,
 		     struct address_space *mapping,
-		     struct list_head *pages, unsigned nr_pages,
-		     get_extent_t get_extent)
+		     struct list_head *pages, unsigned nr_pages)
 {
 	struct bio *bio = NULL;
 	unsigned page_idx;
@@ -4160,13 +4159,13 @@ int extent_readpages(struct extent_io_tree *tree,
 		pagepool[nr++] = page;
 		if (nr < ARRAY_SIZE(pagepool))
 			continue;
-		__extent_readpages(tree, pagepool, nr, get_extent, &em_cached,
-				   &bio, &bio_flags, &prev_em_start);
+		__extent_readpages(tree, pagepool, nr, btrfs_get_extent,
+				&em_cached, &bio, &bio_flags, &prev_em_start);
 		nr = 0;
 	}
 	if (nr)
-		__extent_readpages(tree, pagepool, nr, get_extent, &em_cached,
-				   &bio, &bio_flags, &prev_em_start);
+		__extent_readpages(tree, pagepool, nr, btrfs_get_extent,
+				&em_cached, &bio, &bio_flags, &prev_em_start);
 
 	if (em_cached)
 		free_extent_map(em_cached);
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index afc169d04b22..6cf4a0e5b0ea 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -414,8 +414,7 @@ int btree_write_cache_pages(struct address_space *mapping,
 			    struct writeback_control *wbc);
 int extent_readpages(struct extent_io_tree *tree,
 		     struct address_space *mapping,
-		     struct list_head *pages, unsigned nr_pages,
-		     get_extent_t get_extent);
+		     struct list_head *pages, unsigned nr_pages);
 int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 		__u64 start, __u64 len);
 void set_page_extent_mapped(struct page *page);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 63cf6cd174d1..72c7b38fb17e 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -8911,8 +8911,7 @@ btrfs_readpages(struct file *file, struct address_space *mapping,
 {
 	struct extent_io_tree *tree;
 	tree = &BTRFS_I(mapping->host)->io_tree;
-	return extent_readpages(tree, mapping, pages, nr_pages,
-				btrfs_get_extent);
+	return extent_readpages(tree, mapping, pages, nr_pages);
 }
 static int __btrfs_releasepage(struct page *page, gfp_t gfp_flags)
 {
-- 
cgit v1.2.3


From e4d17ef50719dc38ada5b4166cb670e0651ee4de Mon Sep 17 00:00:00 2001
From: David Sterba <dsterba@suse.com>
Date: Fri, 23 Jun 2017 04:09:57 +0200
Subject: btrfs: sink get_extent parameter to __extent_readpages

All callers pass btrfs_get_extent.

Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/extent_io.c | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index f86a6aae545a..4e7c5d572667 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -3121,7 +3121,7 @@ static inline void __do_contiguous_readpages(struct extent_io_tree *tree,
 
 static void __extent_readpages(struct extent_io_tree *tree,
 			       struct page *pages[],
-			       int nr_pages, get_extent_t *get_extent,
+			       int nr_pages,
 			       struct extent_map **em_cached,
 			       struct bio **bio, unsigned long *bio_flags,
 			       u64 *prev_em_start)
@@ -3143,7 +3143,8 @@ static void __extent_readpages(struct extent_io_tree *tree,
 		} else {
 			__do_contiguous_readpages(tree, &pages[first_index],
 						  index - first_index, start,
-						  end, get_extent, em_cached,
+						  end, btrfs_get_extent,
+						  em_cached,
 						  bio, bio_flags,
 						  prev_em_start);
 			start = page_start;
@@ -3155,7 +3156,7 @@ static void __extent_readpages(struct extent_io_tree *tree,
 	if (end)
 		__do_contiguous_readpages(tree, &pages[first_index],
 					  index - first_index, start,
-					  end, get_extent, em_cached, bio,
+					  end, btrfs_get_extent, em_cached, bio,
 					  bio_flags, prev_em_start);
 }
 
@@ -4159,13 +4160,13 @@ int extent_readpages(struct extent_io_tree *tree,
 		pagepool[nr++] = page;
 		if (nr < ARRAY_SIZE(pagepool))
 			continue;
-		__extent_readpages(tree, pagepool, nr, btrfs_get_extent,
-				&em_cached, &bio, &bio_flags, &prev_em_start);
+		__extent_readpages(tree, pagepool, nr, &em_cached, &bio,
+				&bio_flags, &prev_em_start);
 		nr = 0;
 	}
 	if (nr)
-		__extent_readpages(tree, pagepool, nr, btrfs_get_extent,
-				&em_cached, &bio, &bio_flags, &prev_em_start);
+		__extent_readpages(tree, pagepool, nr, &em_cached, &bio,
+				&bio_flags, &prev_em_start);
 
 	if (em_cached)
 		free_extent_map(em_cached);
-- 
cgit v1.2.3


From 4ef77695a0f28a42a67a027473d87f5cafa35674 Mon Sep 17 00:00:00 2001
From: David Sterba <dsterba@suse.com>
Date: Fri, 23 Jun 2017 04:09:57 +0200
Subject: btrfs: sink get_extent parameter to __do_contiguous_readpages

All callers pass btrfs_get_extent.

Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/extent_io.c | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 4e7c5d572667..eda8fe363132 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -3090,7 +3090,6 @@ out:
 static inline void __do_contiguous_readpages(struct extent_io_tree *tree,
 					     struct page *pages[], int nr_pages,
 					     u64 start, u64 end,
-					     get_extent_t *get_extent,
 					     struct extent_map **em_cached,
 					     struct bio **bio,
 					     unsigned long *bio_flags,
@@ -3113,8 +3112,8 @@ static inline void __do_contiguous_readpages(struct extent_io_tree *tree,
 	}
 
 	for (index = 0; index < nr_pages; index++) {
-		__do_readpage(tree, pages[index], get_extent, em_cached, bio,
-			      0, bio_flags, 0, prev_em_start);
+		__do_readpage(tree, pages[index], btrfs_get_extent, em_cached,
+				bio, 0, bio_flags, 0, prev_em_start);
 		put_page(pages[index]);
 	}
 }
@@ -3143,8 +3142,7 @@ static void __extent_readpages(struct extent_io_tree *tree,
 		} else {
 			__do_contiguous_readpages(tree, &pages[first_index],
 						  index - first_index, start,
-						  end, btrfs_get_extent,
-						  em_cached,
+						  end, em_cached,
 						  bio, bio_flags,
 						  prev_em_start);
 			start = page_start;
@@ -3156,7 +3154,7 @@ static void __extent_readpages(struct extent_io_tree *tree,
 	if (end)
 		__do_contiguous_readpages(tree, &pages[first_index],
 					  index - first_index, start,
-					  end, btrfs_get_extent, em_cached, bio,
+					  end, em_cached, bio,
 					  bio_flags, prev_em_start);
 }
 
-- 
cgit v1.2.3


From 6af49dbde9532c95f53d2c45fe9cc0012226c5e7 Mon Sep 17 00:00:00 2001
From: David Sterba <dsterba@suse.com>
Date: Fri, 23 Jun 2017 04:09:57 +0200
Subject: btrfs: sink get_extent parameter to read_extent_buffer_pages

All callers pass btree_get_extent, which needs to be exported.

Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/disk-io.c   | 8 ++++----
 fs/btrfs/disk-io.h   | 3 +++
 fs/btrfs/extent_io.c | 6 +++---
 fs/btrfs/extent_io.h | 2 +-
 4 files changed, 11 insertions(+), 8 deletions(-)

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 06c593775b82..cbf37df05a88 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -220,7 +220,7 @@ void btrfs_set_buffer_lockdep_class(u64 objectid, struct extent_buffer *eb,
  * extents on the btree inode are pretty simple, there's one extent
  * that covers the entire device
  */
-static struct extent_map *btree_get_extent(struct btrfs_inode *inode,
+struct extent_map *btree_get_extent(struct btrfs_inode *inode,
 		struct page *page, size_t pg_offset, u64 start, u64 len,
 		int create)
 {
@@ -455,7 +455,7 @@ static int btree_read_extent_buffer_pages(struct btrfs_fs_info *fs_info,
 	io_tree = &BTRFS_I(fs_info->btree_inode)->io_tree;
 	while (1) {
 		ret = read_extent_buffer_pages(io_tree, eb, WAIT_COMPLETE,
-					       btree_get_extent, mirror_num);
+					       mirror_num);
 		if (!ret) {
 			if (!verify_parent_transid(io_tree, eb,
 						   parent_transid, 0))
@@ -1012,7 +1012,7 @@ void readahead_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr)
 	if (IS_ERR(buf))
 		return;
 	read_extent_buffer_pages(&BTRFS_I(btree_inode)->io_tree,
-				 buf, WAIT_NONE, btree_get_extent, 0);
+				 buf, WAIT_NONE, 0);
 	free_extent_buffer(buf);
 }
 
@@ -1031,7 +1031,7 @@ int reada_tree_block_flagged(struct btrfs_fs_info *fs_info, u64 bytenr,
 	set_bit(EXTENT_BUFFER_READAHEAD, &buf->bflags);
 
 	ret = read_extent_buffer_pages(io_tree, buf, WAIT_PAGE_LOCK,
-				       btree_get_extent, mirror_num);
+				       mirror_num);
 	if (ret) {
 		free_extent_buffer(buf);
 		return ret;
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 7f7c35d6347a..301151a50ac1 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -149,6 +149,9 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
 				     u64 objectid);
 int btree_lock_page_hook(struct page *page, void *data,
 				void (*flush_fn)(void *));
+struct extent_map *btree_get_extent(struct btrfs_inode *inode,
+		struct page *page, size_t pg_offset, u64 start, u64 len,
+		int create);
 int btrfs_get_num_tolerated_disk_barrier_failures(u64 flags);
 int __init btrfs_end_io_wq_init(void);
 void btrfs_end_io_wq_exit(void);
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index eda8fe363132..932d805a81e3 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -21,6 +21,7 @@
 #include "locking.h"
 #include "rcu-string.h"
 #include "backref.h"
+#include "disk-io.h"
 
 static struct kmem_cache *extent_state_cache;
 static struct kmem_cache *extent_buffer_cache;
@@ -5248,8 +5249,7 @@ int extent_buffer_uptodate(struct extent_buffer *eb)
 }
 
 int read_extent_buffer_pages(struct extent_io_tree *tree,
-			     struct extent_buffer *eb, int wait,
-			     get_extent_t *get_extent, int mirror_num)
+			     struct extent_buffer *eb, int wait, int mirror_num)
 {
 	unsigned long i;
 	struct page *page;
@@ -5309,7 +5309,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
 
 			ClearPageError(page);
 			err = __extent_read_full_page(tree, page,
-						      get_extent, &bio,
+						      btree_get_extent, &bio,
 						      mirror_num, &bio_flags,
 						      REQ_META);
 			if (err) {
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 6cf4a0e5b0ea..c28f5ef88f42 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -435,7 +435,7 @@ void free_extent_buffer_stale(struct extent_buffer *eb);
 #define WAIT_PAGE_LOCK	2
 int read_extent_buffer_pages(struct extent_io_tree *tree,
 			     struct extent_buffer *eb, int wait,
-			     get_extent_t *get_extent, int mirror_num);
+			     int mirror_num);
 void wait_on_extent_buffer_writeback(struct extent_buffer *eb);
 
 static inline unsigned long num_extent_pages(u64 start, u64 len)
-- 
cgit v1.2.3


From 02cfe779ccaec421cd4ed53a574c8a6354efa7fe Mon Sep 17 00:00:00 2001
From: Geert Uytterhoeven <geert@linux-m68k.org>
Date: Wed, 15 Nov 2017 16:04:40 +0100
Subject: btrfs: ref-verify: Remove unused parameter from walk_up_tree() to
 kill warning
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

With gcc-4.1.2:

    fs/btrfs/ref-verify.c: In function ‘btrfs_build_ref_tree’:
    fs/btrfs/ref-verify.c:1017: warning: ‘root’ is used uninitialized in this function

The variable is indeed passed uninitialized, but it is never used by the
callee.  However, not all versions of gcc are smart enough to notice.

Hence remove the unused parameter from walk_up_tree() to silence the
compiler warning.

Signed-off-by: Geert Uytterhoeven <geert@linux-m68k.org>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/ref-verify.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/fs/btrfs/ref-verify.c b/fs/btrfs/ref-verify.c
index 34878699d363..171f3cce30e6 100644
--- a/fs/btrfs/ref-verify.c
+++ b/fs/btrfs/ref-verify.c
@@ -606,8 +606,7 @@ static int walk_down_tree(struct btrfs_root *root, struct btrfs_path *path,
 }
 
 /* Walk up to the next node that needs to be processed */
-static int walk_up_tree(struct btrfs_root *root, struct btrfs_path *path,
-			int *level)
+static int walk_up_tree(struct btrfs_path *path, int *level)
 {
 	int l;
 
@@ -984,7 +983,6 @@ void btrfs_free_ref_tree_range(struct btrfs_fs_info *fs_info, u64 start,
 int btrfs_build_ref_tree(struct btrfs_fs_info *fs_info)
 {
 	struct btrfs_path *path;
-	struct btrfs_root *root;
 	struct extent_buffer *eb;
 	u64 bytenr = 0, num_bytes = 0;
 	int ret, level;
@@ -1014,7 +1012,7 @@ int btrfs_build_ref_tree(struct btrfs_fs_info *fs_info)
 				     &bytenr, &num_bytes);
 		if (ret)
 			break;
-		ret = walk_up_tree(root, path, &level);
+		ret = walk_up_tree(path, &level);
 		if (ret < 0)
 			break;
 		if (ret > 0) {
-- 
cgit v1.2.3


From 71f8a8d2c1ee65589b3f0ec5f2306723ea76c1ee Mon Sep 17 00:00:00 2001
From: Anand Jain <anand.jain@oracle.com>
Date: Thu, 9 Nov 2017 23:45:23 +0800
Subject: btrfs: set fs_devices->seed directly

This is in preparation to move a section of code in __btrfs_open_devices()
into a new function so that it can be reused. As we set seeding if any of
the device is having SB flag BTRFS_SUPER_FLAG_SEEDING, so do it in the
device list loop itself. No functional changes.

Signed-off-by: Anand Jain <anand.jain@oracle.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/volumes.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 96c6c0fa1531..51445cc496c0 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -1019,7 +1019,6 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
 	struct buffer_head *bh;
 	struct btrfs_super_block *disk_super;
 	u64 devid;
-	int seeding = 1;
 	int ret = 0;
 
 	flags |= FMODE_EXCL;
@@ -1051,9 +1050,9 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
 
 		if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) {
 			device->writeable = 0;
+			fs_devices->seeding = 1;
 		} else {
 			device->writeable = !bdev_read_only(bdev);
-			seeding = 0;
 		}
 
 		q = bdev_get_queue(bdev);
@@ -1085,7 +1084,6 @@ error_brelse:
 		ret = -EINVAL;
 		goto out;
 	}
-	fs_devices->seeding = seeding;
 	fs_devices->opened = 1;
 	fs_devices->latest_bdev = latest_dev->bdev;
 	fs_devices->total_rw_bytes = 0;
-- 
cgit v1.2.3


From 9f050db43e0f671ff558f34479b06aafd5103eab Mon Sep 17 00:00:00 2001
From: Anand Jain <anand.jain@oracle.com>
Date: Thu, 9 Nov 2017 23:45:25 +0800
Subject: btrfs: move check for device generation to the last

No functional changes. This helps to move the entire section into
a new function.

Signed-off-by: Anand Jain <anand.jain@oracle.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/volumes.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 51445cc496c0..04d205226656 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -1044,9 +1044,6 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
 			goto error_brelse;
 
 		device->generation = btrfs_super_generation(disk_super);
-		if (!latest_dev ||
-		    device->generation > latest_dev->generation)
-			latest_dev = device;
 
 		if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) {
 			device->writeable = 0;
@@ -1073,6 +1070,11 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
 				 &fs_devices->alloc_list);
 		}
 		brelse(bh);
+
+		if (!latest_dev ||
+		    device->generation > latest_dev->generation)
+			latest_dev = device;
+
 		continue;
 
 error_brelse:
-- 
cgit v1.2.3


From 0fb08bccbce2c1900d18f7ecc01ff8b8f677ce3e Mon Sep 17 00:00:00 2001
From: Anand Jain <anand.jain@oracle.com>
Date: Thu, 9 Nov 2017 23:45:24 +0800
Subject: btrfs: factor __btrfs_open_devices() to create
 btrfs_open_one_device()

No functional changes, create btrfs_open_one_device() from
__btrfs_open_devices(). This is a preparatory work to add dynamic
device scan.

Signed-off-by: Anand Jain <anand.jain@oracle.com>
[ minor whitespace fixes ]
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/volumes.c | 121 ++++++++++++++++++++++++++++-------------------------
 1 file changed, 65 insertions(+), 56 deletions(-)

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 04d205226656..20039d625210 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -659,6 +659,70 @@ static void btrfs_free_stale_device(struct btrfs_device *cur_dev)
 	}
 }
 
+static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices,
+			struct btrfs_device *device, fmode_t flags,
+			void *holder)
+{
+	struct request_queue *q;
+	struct block_device *bdev;
+	struct buffer_head *bh;
+	struct btrfs_super_block *disk_super;
+	u64 devid;
+	int ret;
+
+	if (device->bdev)
+		return -EINVAL;
+	if (!device->name)
+		return -EINVAL;
+
+	ret = btrfs_get_bdev_and_sb(device->name->str, flags, holder, 1,
+				    &bdev, &bh);
+	if (ret)
+		return ret;
+
+	disk_super = (struct btrfs_super_block *)bh->b_data;
+	devid = btrfs_stack_device_id(&disk_super->dev_item);
+	if (devid != device->devid)
+		goto error_brelse;
+
+	if (memcmp(device->uuid, disk_super->dev_item.uuid, BTRFS_UUID_SIZE))
+		goto error_brelse;
+
+	device->generation = btrfs_super_generation(disk_super);
+
+	if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) {
+		device->writeable = 0;
+		fs_devices->seeding = 1;
+	} else {
+		device->writeable = !bdev_read_only(bdev);
+	}
+
+	q = bdev_get_queue(bdev);
+	if (blk_queue_discard(q))
+		device->can_discard = 1;
+	if (!blk_queue_nonrot(q))
+		fs_devices->rotating = 1;
+
+	device->bdev = bdev;
+	device->in_fs_metadata = 0;
+	device->mode = flags;
+
+	fs_devices->open_devices++;
+	if (device->writeable && device->devid != BTRFS_DEV_REPLACE_DEVID) {
+		fs_devices->rw_devices++;
+		list_add(&device->dev_alloc_list, &fs_devices->alloc_list);
+	}
+	brelse(bh);
+
+	return 0;
+
+error_brelse:
+	brelse(bh);
+	blkdev_put(bdev, flags);
+
+	return -EINVAL;
+}
+
 /*
  * Add new device to list of registered devices
  *
@@ -1011,76 +1075,21 @@ int btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
 static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
 				fmode_t flags, void *holder)
 {
-	struct request_queue *q;
-	struct block_device *bdev;
 	struct list_head *head = &fs_devices->devices;
 	struct btrfs_device *device;
 	struct btrfs_device *latest_dev = NULL;
-	struct buffer_head *bh;
-	struct btrfs_super_block *disk_super;
-	u64 devid;
 	int ret = 0;
 
 	flags |= FMODE_EXCL;
 
 	list_for_each_entry(device, head, dev_list) {
-		if (device->bdev)
-			continue;
-		if (!device->name)
-			continue;
-
 		/* Just open everything we can; ignore failures here */
-		if (btrfs_get_bdev_and_sb(device->name->str, flags, holder, 1,
-					    &bdev, &bh))
+		if (btrfs_open_one_device(fs_devices, device, flags, holder))
 			continue;
 
-		disk_super = (struct btrfs_super_block *)bh->b_data;
-		devid = btrfs_stack_device_id(&disk_super->dev_item);
-		if (devid != device->devid)
-			goto error_brelse;
-
-		if (memcmp(device->uuid, disk_super->dev_item.uuid,
-			   BTRFS_UUID_SIZE))
-			goto error_brelse;
-
-		device->generation = btrfs_super_generation(disk_super);
-
-		if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) {
-			device->writeable = 0;
-			fs_devices->seeding = 1;
-		} else {
-			device->writeable = !bdev_read_only(bdev);
-		}
-
-		q = bdev_get_queue(bdev);
-		if (blk_queue_discard(q))
-			device->can_discard = 1;
-		if (!blk_queue_nonrot(q))
-			fs_devices->rotating = 1;
-
-		device->bdev = bdev;
-		device->in_fs_metadata = 0;
-		device->mode = flags;
-
-		fs_devices->open_devices++;
-		if (device->writeable &&
-		    device->devid != BTRFS_DEV_REPLACE_DEVID) {
-			fs_devices->rw_devices++;
-			list_add(&device->dev_alloc_list,
-				 &fs_devices->alloc_list);
-		}
-		brelse(bh);
-
 		if (!latest_dev ||
 		    device->generation > latest_dev->generation)
 			latest_dev = device;
-
-		continue;
-
-error_brelse:
-		brelse(bh);
-		blkdev_put(bdev, flags);
-		continue;
 	}
 	if (fs_devices->open_devices == 0) {
 		ret = -EINVAL;
-- 
cgit v1.2.3


From 343e4fc1c60971b0734de26dbbd475d433950982 Mon Sep 17 00:00:00 2001
From: Liu Bo <bo.li.liu@oracle.com>
Date: Wed, 15 Nov 2017 16:10:28 -0700
Subject: Btrfs: set plug for fsync

Setting plug can merge adjacent IOs before dispatching IOs to the disk
driver.

Without plug, it'd not be a problem for single disk usecases, but for
multiple disks using raid profile, a large IO can be split to several
IOs of stripe length, and plug can be helpful to bring them together
for each disk so that we can save several disk access.

Moreover, fsync issues synchronous writes, so plug can really take
effect.

Signed-off-by: Liu Bo <bo.li.liu@oracle.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/file.c | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index b85b6d7d0ccd..1096398e1351 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -2019,10 +2019,19 @@ int btrfs_release_file(struct inode *inode, struct file *filp)
 static int start_ordered_ops(struct inode *inode, loff_t start, loff_t end)
 {
 	int ret;
+	struct blk_plug plug;
 
+	/*
+	 * This is only called in fsync, which would do synchronous writes, so
+	 * a plug can merge adjacent IOs as much as possible.  Esp. in case of
+	 * multiple disks using raid profile, a large IO can be split to
+	 * several segments of stripe length (currently 64K).
+	 */
+	blk_start_plug(&plug);
 	atomic_inc(&BTRFS_I(inode)->sync_writers);
 	ret = btrfs_fdatawrite_range(inode, start, end);
 	atomic_dec(&BTRFS_I(inode)->sync_writers);
+	blk_finish_plug(&plug);
 
 	return ret;
 }
-- 
cgit v1.2.3


From 71a635516ca521931be418827150f782b0a03ae7 Mon Sep 17 00:00:00 2001
From: David Sterba <dsterba@suse.com>
Date: Mon, 6 Nov 2017 19:23:00 +0100
Subject: btrfs: switch to on-stack csum buffer in csum_tree_block

The maximum size of a checksum buffer is known, BTRFS_CSUM_SIZE, and we
don't have to allocate it dynamically. This code path is not used at all
as we have only the crc32c and use an on-stack buffer already.

Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/disk-io.c | 16 +++-------------
 1 file changed, 3 insertions(+), 13 deletions(-)

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index cbf37df05a88..5f9430063c50 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -285,7 +285,7 @@ static int csum_tree_block(struct btrfs_fs_info *fs_info,
 			   int verify)
 {
 	u16 csum_size = btrfs_super_csum_size(fs_info->super_copy);
-	char *result = NULL;
+	char result[BTRFS_CSUM_SIZE];
 	unsigned long len;
 	unsigned long cur_len;
 	unsigned long offset = BTRFS_CSUM_SIZE;
@@ -294,7 +294,6 @@ static int csum_tree_block(struct btrfs_fs_info *fs_info,
 	unsigned long map_len;
 	int err;
 	u32 crc = ~(u32)0;
-	unsigned long inline_result;
 
 	len = buf->len - offset;
 	while (len > 0) {
@@ -308,13 +307,7 @@ static int csum_tree_block(struct btrfs_fs_info *fs_info,
 		len -= cur_len;
 		offset += cur_len;
 	}
-	if (csum_size > sizeof(inline_result)) {
-		result = kzalloc(csum_size, GFP_NOFS);
-		if (!result)
-			return -ENOMEM;
-	} else {
-		result = (char *)&inline_result;
-	}
+	memset(result, 0, BTRFS_CSUM_SIZE);
 
 	btrfs_csum_final(crc, result);
 
@@ -329,15 +322,12 @@ static int csum_tree_block(struct btrfs_fs_info *fs_info,
 				"%s checksum verify failed on %llu wanted %X found %X level %d",
 				fs_info->sb->s_id, buf->start,
 				val, found, btrfs_header_level(buf));
-			if (result != (char *)&inline_result)
-				kfree(result);
 			return -EUCLEAN;
 		}
 	} else {
 		write_extent_buffer(buf, result, 0, csum_size);
 	}
-	if (result != (char *)&inline_result)
-		kfree(result);
+
 	return 0;
 }
 
-- 
cgit v1.2.3


From 41a1eadad719168ce5f6cfcfda6ea45134f2424a Mon Sep 17 00:00:00 2001
From: Edmund Nadolski <enadolski@suse.com>
Date: Mon, 20 Nov 2017 13:24:47 -0700
Subject: btrfs: btrfs_inode_log_parent should use defined inode_only values.

Replace hardcoded numeric argument values for inode_only with the
constants defined for that use.

Signed-off-by: Edmund Nadolski <enadolski@suse.com>
Reviewed-by: Nikolay Borisov <nborisov@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/tree-log.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index a806182dfea6..ee1aaed1330e 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -5403,11 +5403,10 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
 				  struct dentry *parent,
 				  const loff_t start,
 				  const loff_t end,
-				  int exists_only,
+				  int inode_only,
 				  struct btrfs_log_ctx *ctx)
 {
 	struct btrfs_fs_info *fs_info = root->fs_info;
-	int inode_only = exists_only ? LOG_INODE_EXISTS : LOG_INODE_ALL;
 	struct super_block *sb;
 	struct dentry *old_parent = NULL;
 	int ret = 0;
@@ -5573,7 +5572,7 @@ int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
 	int ret;
 
 	ret = btrfs_log_inode_parent(trans, root, BTRFS_I(d_inode(dentry)),
-			parent, start, end, 0, ctx);
+			parent, start, end, LOG_INODE_ALL, ctx);
 	dput(parent);
 
 	return ret;
@@ -5836,6 +5835,6 @@ int btrfs_log_new_name(struct btrfs_trans_handle *trans,
 		return 0;
 
 	return btrfs_log_inode_parent(trans, root, inode, parent, 0,
-				      LLONG_MAX, 1, NULL);
+				      LLONG_MAX, LOG_INODE_EXISTS, NULL);
 }
 
-- 
cgit v1.2.3


From bf46f52db96795e0c10292d55888640ab57f42c7 Mon Sep 17 00:00:00 2001
From: Edmund Nadolski <enadolski@suse.com>
Date: Mon, 20 Nov 2017 13:24:49 -0700
Subject: btrfs: remove dead code from btrfs_get_extent

Due to new_inline logic, the create == 0 is always true at this
point in the code, so the create != 0 branch can be removed.

Signed-off-by: Edmund Nadolski <enadolski@suse.com>
Reviewed-by: Nikolay Borisov <nborisov@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/inode.c | 22 +---------------------
 1 file changed, 1 insertion(+), 21 deletions(-)

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 72c7b38fb17e..70a75b5f3046 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -6946,7 +6946,6 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
 	struct btrfs_trans_handle *trans = NULL;
 	const bool new_inline = !page || create;
 
-again:
 	read_lock(&em_tree->lock);
 	em = lookup_extent_mapping(em_tree, start, len);
 	if (em)
@@ -7087,7 +7086,7 @@ next:
 		em->orig_block_len = em->len;
 		em->orig_start = em->start;
 		ptr = btrfs_file_extent_inline_start(item) + extent_offset;
-		if (create == 0 && !PageUptodate(page)) {
+		if (!PageUptodate(page)) {
 			if (btrfs_file_extent_compression(leaf, item) !=
 			    BTRFS_COMPRESS_NONE) {
 				ret = uncompress_inline(path, page, pg_offset,
@@ -7108,25 +7107,6 @@ next:
 				kunmap(page);
 			}
 			flush_dcache_page(page);
-		} else if (create && PageUptodate(page)) {
-			BUG();
-			if (!trans) {
-				kunmap(page);
-				free_extent_map(em);
-				em = NULL;
-
-				btrfs_release_path(path);
-				trans = btrfs_join_transaction(root);
-
-				if (IS_ERR(trans))
-					return ERR_CAST(trans);
-				goto again;
-			}
-			map = kmap(page);
-			write_extent_buffer(leaf, map + pg_offset, ptr,
-					    copy_size);
-			kunmap(page);
-			btrfs_mark_buffer_dirty(leaf);
 		}
 		set_extent_uptodate(io_tree, em->start,
 				    extent_map_end(em) - 1, NULL, GFP_NOFS);
-- 
cgit v1.2.3


From 7c2871a2f4695fadc35ea463552b23180c7a7e34 Mon Sep 17 00:00:00 2001
From: David Sterba <dsterba@suse.com>
Date: Wed, 8 Nov 2017 01:07:43 +0100
Subject: btrfs: switch btrfs_trans_handle::adding_csums to bool

The semantics of adding_csums matches bool, 'short' was most likely used
to save space in a698d0755adb6f2 ("Btrfs: add a type field for the
transaction handle").

Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/inode.c       | 4 ++--
 fs/btrfs/transaction.h | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 70a75b5f3046..1c704eb82487 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -2036,10 +2036,10 @@ static noinline int add_pending_csums(struct btrfs_trans_handle *trans,
 	struct btrfs_ordered_sum *sum;
 
 	list_for_each_entry(sum, list, list) {
-		trans->adding_csums = 1;
+		trans->adding_csums = true;
 		btrfs_csum_file_blocks(trans,
 		       BTRFS_I(inode)->root->fs_info->csum_root, sum);
-		trans->adding_csums = 0;
+		trans->adding_csums = false;
 	}
 	return 0;
 }
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index c55e44560103..a673142c003e 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -118,7 +118,7 @@ struct btrfs_trans_handle {
 	struct btrfs_block_rsv *block_rsv;
 	struct btrfs_block_rsv *orig_rsv;
 	short aborted;
-	short adding_csums;
+	bool adding_csums;
 	bool allocating_chunk;
 	bool can_flush_pending_bgs;
 	bool reloc_reserved;
-- 
cgit v1.2.3


From 2dbda74ed9e5497697b913c780c928e25e70d832 Mon Sep 17 00:00:00 2001
From: David Sterba <dsterba@suse.com>
Date: Wed, 8 Nov 2017 01:32:48 +0100
Subject: btrfs: remove unused member of btrfs_trans_handle

Last user was removed in a monster commit a22285a6a32390195235171
("Btrfs: Integrate metadata reservation with start_transaction") in
2010.

Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/transaction.h | 1 -
 1 file changed, 1 deletion(-)

diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index a673142c003e..c48a4a03f1b4 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -112,7 +112,6 @@ struct btrfs_trans_handle {
 	u64 bytes_reserved;
 	u64 chunk_bytes_reserved;
 	unsigned long use_count;
-	unsigned long blocks_reserved;
 	unsigned long delayed_ref_updates;
 	struct btrfs_transaction *transaction;
 	struct btrfs_block_rsv *block_rsv;
-- 
cgit v1.2.3


From b50fff816cbd670ea545ce98ae374356f08f2d75 Mon Sep 17 00:00:00 2001
From: David Sterba <dsterba@suse.com>
Date: Wed, 8 Nov 2017 01:39:58 +0100
Subject: btrfs: switch to refcount_t type for btrfs_trans_handle::use_count

The use_count is a reference counter, we can use the refcount_t type,
though we don't use the atomicity. This is not a performance critical
code and we could catch the underflows. The type is changed from long,
but the number of references will fit an int.

Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/transaction.c | 12 ++++++------
 fs/btrfs/transaction.h |  2 +-
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index dac688c696c3..6348573e26a7 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -495,8 +495,8 @@ start_transaction(struct btrfs_root *root, unsigned int num_items,
 	if (current->journal_info) {
 		WARN_ON(type & TRANS_EXTWRITERS);
 		h = current->journal_info;
-		h->use_count++;
-		WARN_ON(h->use_count > 2);
+		refcount_inc(&h->use_count);
+		WARN_ON(refcount_read(&h->use_count) > 2);
 		h->orig_rsv = h->block_rsv;
 		h->block_rsv = NULL;
 		goto got_it;
@@ -567,7 +567,7 @@ again:
 	h->transid = cur_trans->transid;
 	h->transaction = cur_trans;
 	h->root = root;
-	h->use_count = 1;
+	refcount_set(&h->use_count, 1);
 	h->fs_info = root->fs_info;
 
 	h->type = type;
@@ -837,8 +837,8 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
 	int err = 0;
 	int must_run_delayed_refs = 0;
 
-	if (trans->use_count > 1) {
-		trans->use_count--;
+	if (refcount_read(&trans->use_count) > 1) {
+		refcount_dec(&trans->use_count);
 		trans->block_rsv = trans->orig_rsv;
 		return 0;
 	}
@@ -1868,7 +1868,7 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans,
 	struct btrfs_transaction *cur_trans = trans->transaction;
 	DEFINE_WAIT(wait);
 
-	WARN_ON(trans->use_count > 1);
+	WARN_ON(refcount_read(&trans->use_count) > 1);
 
 	btrfs_abort_transaction(trans, err);
 
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index c48a4a03f1b4..afa88f035654 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -111,7 +111,7 @@ struct btrfs_trans_handle {
 	u64 transid;
 	u64 bytes_reserved;
 	u64 chunk_bytes_reserved;
-	unsigned long use_count;
+	refcount_t use_count;
 	unsigned long delayed_ref_updates;
 	struct btrfs_transaction *transaction;
 	struct btrfs_block_rsv *block_rsv;
-- 
cgit v1.2.3


From 1ca4bb63f6bcc0b4fa3cc6d5aea0a503186a3e20 Mon Sep 17 00:00:00 2001
From: David Sterba <dsterba@suse.com>
Date: Wed, 8 Nov 2017 01:54:33 +0100
Subject: btrfs: reorder btrfs_trans_handle members for better packing

Recent updates to the structure left some holes, reorder the types so
the packing is tight. The size goes from 112 to 104 on 64bit.

Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/transaction.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index afa88f035654..edf53112a6f2 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -111,11 +111,12 @@ struct btrfs_trans_handle {
 	u64 transid;
 	u64 bytes_reserved;
 	u64 chunk_bytes_reserved;
-	refcount_t use_count;
 	unsigned long delayed_ref_updates;
 	struct btrfs_transaction *transaction;
 	struct btrfs_block_rsv *block_rsv;
 	struct btrfs_block_rsv *orig_rsv;
+	refcount_t use_count;
+	unsigned int type;
 	short aborted;
 	bool adding_csums;
 	bool allocating_chunk;
@@ -123,7 +124,6 @@ struct btrfs_trans_handle {
 	bool reloc_reserved;
 	bool sync;
 	bool dirty;
-	unsigned int type;
 	struct btrfs_root *root;
 	struct btrfs_fs_info *fs_info;
 	struct list_head new_bgs;
-- 
cgit v1.2.3


From 165c8b022c492f7eb33f7c936ac063a6fd4e90a3 Mon Sep 17 00:00:00 2001
From: David Sterba <dsterba@suse.com>
Date: Wed, 8 Nov 2017 02:12:57 +0100
Subject: btrfs: use narrower type for btrfs_transaction::num_dirty_bgs

The u64 is an overkill here, we could not possibly create that many
blockgroups in one transaction.

Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/extent-tree.c | 2 +-
 fs/btrfs/transaction.h | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 41770ee1313e..c3ccd9d2e946 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -2894,7 +2894,7 @@ int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans,
 	struct btrfs_block_rsv *global_rsv;
 	u64 num_heads = trans->transaction->delayed_refs.num_heads_ready;
 	u64 csum_bytes = trans->transaction->delayed_refs.pending_csums;
-	u64 num_dirty_bgs = trans->transaction->num_dirty_bgs;
+	unsigned int num_dirty_bgs = trans->transaction->num_dirty_bgs;
 	u64 num_bytes, num_dirty_bgs_bytes;
 	int ret = 0;
 
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index edf53112a6f2..1805fd101767 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -70,7 +70,7 @@ struct btrfs_transaction {
 	struct list_head dirty_bgs;
 	struct list_head io_bgs;
 	struct list_head dropped_roots;
-	u64 num_dirty_bgs;
+	unsigned int num_dirty_bgs;
 
 	/*
 	 * we need to make sure block group deletion doesn't race with
-- 
cgit v1.2.3


From 5302e0896445ac3a9c707bd42c39c58a49959980 Mon Sep 17 00:00:00 2001
From: David Sterba <dsterba@suse.com>
Date: Wed, 8 Nov 2017 01:54:33 +0100
Subject: btrfs: reorder btrfs_transaction members for better packing

There are now 20 bytes of holes, we can reduce that to 4 by minor
changes. Moving 'aborted' to the status and flags is also more logical,
similar for num_dirty_bgs. The size goes from 432 to 416.

Reviewed-by: Liu Bo <bo.li.liu@oracle.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/transaction.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 1805fd101767..6beee072b1bd 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -58,6 +58,7 @@ struct btrfs_transaction {
 
 	/* Be protected by fs_info->trans_lock when we want to change it. */
 	enum btrfs_trans_state state;
+	int aborted;
 	struct list_head list;
 	struct extent_io_tree dirty_pages;
 	unsigned long start_time;
@@ -70,7 +71,6 @@ struct btrfs_transaction {
 	struct list_head dirty_bgs;
 	struct list_head io_bgs;
 	struct list_head dropped_roots;
-	unsigned int num_dirty_bgs;
 
 	/*
 	 * we need to make sure block group deletion doesn't race with
@@ -79,11 +79,11 @@ struct btrfs_transaction {
 	 */
 	struct mutex cache_write_mutex;
 	spinlock_t dirty_bgs_lock;
+	unsigned int num_dirty_bgs;
 	/* Protected by spin lock fs_info->unused_bgs_lock. */
 	struct list_head deleted_bgs;
 	spinlock_t dropped_roots_lock;
 	struct btrfs_delayed_ref_root delayed_refs;
-	int aborted;
 	struct btrfs_fs_info *fs_info;
 };
 
-- 
cgit v1.2.3


From 431e98226cbcaab40b77aeef2e6ba20a47649962 Mon Sep 17 00:00:00 2001
From: David Sterba <dsterba@suse.com>
Date: Wed, 15 Nov 2017 18:27:39 +0100
Subject: btrfs: move some zstd work data from stack to workspace

* ZSTD_inBuffer in_buf
* ZSTD_outBuffer out_buf

are used in all functions to pass the compression parameters and the
local variables consume some space. We can move them to the workspace
and reduce the stack consumption:

zstd.c:zstd_decompress                        -24 (136 -> 112)
zstd.c:zstd_decompress_bio                    -24 (144 -> 120)
zstd.c:zstd_compress_pages                    -24 (264 -> 240)

Signed-off-by: David Sterba <dsterba@suse.com>
Reviewed-by: Nick Terrell <terrelln@fb.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/zstd.c | 132 ++++++++++++++++++++++++++++----------------------------
 1 file changed, 67 insertions(+), 65 deletions(-)

diff --git a/fs/btrfs/zstd.c b/fs/btrfs/zstd.c
index 17f2dd8fddb8..01a4eab602a3 100644
--- a/fs/btrfs/zstd.c
+++ b/fs/btrfs/zstd.c
@@ -43,6 +43,8 @@ struct workspace {
 	size_t size;
 	char *buf;
 	struct list_head list;
+	ZSTD_inBuffer in_buf;
+	ZSTD_outBuffer out_buf;
 };
 
 static void zstd_free_workspace(struct list_head *ws)
@@ -94,8 +96,6 @@ static int zstd_compress_pages(struct list_head *ws,
 	int nr_pages = 0;
 	struct page *in_page = NULL;  /* The current page to read */
 	struct page *out_page = NULL; /* The current page to write to */
-	ZSTD_inBuffer in_buf = { NULL, 0, 0 };
-	ZSTD_outBuffer out_buf = { NULL, 0, 0 };
 	unsigned long tot_in = 0;
 	unsigned long tot_out = 0;
 	unsigned long len = *total_out;
@@ -118,9 +118,9 @@ static int zstd_compress_pages(struct list_head *ws,
 
 	/* map in the first page of input data */
 	in_page = find_get_page(mapping, start >> PAGE_SHIFT);
-	in_buf.src = kmap(in_page);
-	in_buf.pos = 0;
-	in_buf.size = min_t(size_t, len, PAGE_SIZE);
+	workspace->in_buf.src = kmap(in_page);
+	workspace->in_buf.pos = 0;
+	workspace->in_buf.size = min_t(size_t, len, PAGE_SIZE);
 
 
 	/* Allocate and map in the output buffer */
@@ -130,14 +130,15 @@ static int zstd_compress_pages(struct list_head *ws,
 		goto out;
 	}
 	pages[nr_pages++] = out_page;
-	out_buf.dst = kmap(out_page);
-	out_buf.pos = 0;
-	out_buf.size = min_t(size_t, max_out, PAGE_SIZE);
+	workspace->out_buf.dst = kmap(out_page);
+	workspace->out_buf.pos = 0;
+	workspace->out_buf.size = min_t(size_t, max_out, PAGE_SIZE);
 
 	while (1) {
 		size_t ret2;
 
-		ret2 = ZSTD_compressStream(stream, &out_buf, &in_buf);
+		ret2 = ZSTD_compressStream(stream, &workspace->out_buf,
+				&workspace->in_buf);
 		if (ZSTD_isError(ret2)) {
 			pr_debug("BTRFS: ZSTD_compressStream returned %d\n",
 					ZSTD_getErrorCode(ret2));
@@ -146,22 +147,22 @@ static int zstd_compress_pages(struct list_head *ws,
 		}
 
 		/* Check to see if we are making it bigger */
-		if (tot_in + in_buf.pos > 8192 &&
-				tot_in + in_buf.pos <
-				tot_out + out_buf.pos) {
+		if (tot_in + workspace->in_buf.pos > 8192 &&
+				tot_in + workspace->in_buf.pos <
+				tot_out + workspace->out_buf.pos) {
 			ret = -E2BIG;
 			goto out;
 		}
 
 		/* We've reached the end of our output range */
-		if (out_buf.pos >= max_out) {
-			tot_out += out_buf.pos;
+		if (workspace->out_buf.pos >= max_out) {
+			tot_out += workspace->out_buf.pos;
 			ret = -E2BIG;
 			goto out;
 		}
 
 		/* Check if we need more output space */
-		if (out_buf.pos == out_buf.size) {
+		if (workspace->out_buf.pos == workspace->out_buf.size) {
 			tot_out += PAGE_SIZE;
 			max_out -= PAGE_SIZE;
 			kunmap(out_page);
@@ -176,19 +177,20 @@ static int zstd_compress_pages(struct list_head *ws,
 				goto out;
 			}
 			pages[nr_pages++] = out_page;
-			out_buf.dst = kmap(out_page);
-			out_buf.pos = 0;
-			out_buf.size = min_t(size_t, max_out, PAGE_SIZE);
+			workspace->out_buf.dst = kmap(out_page);
+			workspace->out_buf.pos = 0;
+			workspace->out_buf.size = min_t(size_t, max_out,
+							PAGE_SIZE);
 		}
 
 		/* We've reached the end of the input */
-		if (in_buf.pos >= len) {
-			tot_in += in_buf.pos;
+		if (workspace->in_buf.pos >= len) {
+			tot_in += workspace->in_buf.pos;
 			break;
 		}
 
 		/* Check if we need more input */
-		if (in_buf.pos == in_buf.size) {
+		if (workspace->in_buf.pos == workspace->in_buf.size) {
 			tot_in += PAGE_SIZE;
 			kunmap(in_page);
 			put_page(in_page);
@@ -196,15 +198,15 @@ static int zstd_compress_pages(struct list_head *ws,
 			start += PAGE_SIZE;
 			len -= PAGE_SIZE;
 			in_page = find_get_page(mapping, start >> PAGE_SHIFT);
-			in_buf.src = kmap(in_page);
-			in_buf.pos = 0;
-			in_buf.size = min_t(size_t, len, PAGE_SIZE);
+			workspace->in_buf.src = kmap(in_page);
+			workspace->in_buf.pos = 0;
+			workspace->in_buf.size = min_t(size_t, len, PAGE_SIZE);
 		}
 	}
 	while (1) {
 		size_t ret2;
 
-		ret2 = ZSTD_endStream(stream, &out_buf);
+		ret2 = ZSTD_endStream(stream, &workspace->out_buf);
 		if (ZSTD_isError(ret2)) {
 			pr_debug("BTRFS: ZSTD_endStream returned %d\n",
 					ZSTD_getErrorCode(ret2));
@@ -212,11 +214,11 @@ static int zstd_compress_pages(struct list_head *ws,
 			goto out;
 		}
 		if (ret2 == 0) {
-			tot_out += out_buf.pos;
+			tot_out += workspace->out_buf.pos;
 			break;
 		}
-		if (out_buf.pos >= max_out) {
-			tot_out += out_buf.pos;
+		if (workspace->out_buf.pos >= max_out) {
+			tot_out += workspace->out_buf.pos;
 			ret = -E2BIG;
 			goto out;
 		}
@@ -235,9 +237,9 @@ static int zstd_compress_pages(struct list_head *ws,
 			goto out;
 		}
 		pages[nr_pages++] = out_page;
-		out_buf.dst = kmap(out_page);
-		out_buf.pos = 0;
-		out_buf.size = min_t(size_t, max_out, PAGE_SIZE);
+		workspace->out_buf.dst = kmap(out_page);
+		workspace->out_buf.pos = 0;
+		workspace->out_buf.size = min_t(size_t, max_out, PAGE_SIZE);
 	}
 
 	if (tot_out >= tot_in) {
@@ -273,8 +275,6 @@ static int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
 	unsigned long total_pages_in = DIV_ROUND_UP(srclen, PAGE_SIZE);
 	unsigned long buf_start;
 	unsigned long total_out = 0;
-	ZSTD_inBuffer in_buf = { NULL, 0, 0 };
-	ZSTD_outBuffer out_buf = { NULL, 0, 0 };
 
 	stream = ZSTD_initDStream(
 			ZSTD_BTRFS_MAX_INPUT, workspace->mem, workspace->size);
@@ -284,18 +284,19 @@ static int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
 		goto done;
 	}
 
-	in_buf.src = kmap(pages_in[page_in_index]);
-	in_buf.pos = 0;
-	in_buf.size = min_t(size_t, srclen, PAGE_SIZE);
+	workspace->in_buf.src = kmap(pages_in[page_in_index]);
+	workspace->in_buf.pos = 0;
+	workspace->in_buf.size = min_t(size_t, srclen, PAGE_SIZE);
 
-	out_buf.dst = workspace->buf;
-	out_buf.pos = 0;
-	out_buf.size = PAGE_SIZE;
+	workspace->out_buf.dst = workspace->buf;
+	workspace->out_buf.pos = 0;
+	workspace->out_buf.size = PAGE_SIZE;
 
 	while (1) {
 		size_t ret2;
 
-		ret2 = ZSTD_decompressStream(stream, &out_buf, &in_buf);
+		ret2 = ZSTD_decompressStream(stream, &workspace->out_buf,
+				&workspace->in_buf);
 		if (ZSTD_isError(ret2)) {
 			pr_debug("BTRFS: ZSTD_decompressStream returned %d\n",
 					ZSTD_getErrorCode(ret2));
@@ -303,38 +304,38 @@ static int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
 			goto done;
 		}
 		buf_start = total_out;
-		total_out += out_buf.pos;
-		out_buf.pos = 0;
+		total_out += workspace->out_buf.pos;
+		workspace->out_buf.pos = 0;
 
-		ret = btrfs_decompress_buf2page(out_buf.dst, buf_start,
-				total_out, disk_start, orig_bio);
+		ret = btrfs_decompress_buf2page(workspace->out_buf.dst,
+				buf_start, total_out, disk_start, orig_bio);
 		if (ret == 0)
 			break;
 
-		if (in_buf.pos >= srclen)
+		if (workspace->in_buf.pos >= srclen)
 			break;
 
 		/* Check if we've hit the end of a frame */
 		if (ret2 == 0)
 			break;
 
-		if (in_buf.pos == in_buf.size) {
+		if (workspace->in_buf.pos == workspace->in_buf.size) {
 			kunmap(pages_in[page_in_index++]);
 			if (page_in_index >= total_pages_in) {
-				in_buf.src = NULL;
+				workspace->in_buf.src = NULL;
 				ret = -EIO;
 				goto done;
 			}
 			srclen -= PAGE_SIZE;
-			in_buf.src = kmap(pages_in[page_in_index]);
-			in_buf.pos = 0;
-			in_buf.size = min_t(size_t, srclen, PAGE_SIZE);
+			workspace->in_buf.src = kmap(pages_in[page_in_index]);
+			workspace->in_buf.pos = 0;
+			workspace->in_buf.size = min_t(size_t, srclen, PAGE_SIZE);
 		}
 	}
 	ret = 0;
 	zero_fill_bio(orig_bio);
 done:
-	if (in_buf.src)
+	if (workspace->in_buf.src)
 		kunmap(pages_in[page_in_index]);
 	return ret;
 }
@@ -348,8 +349,6 @@ static int zstd_decompress(struct list_head *ws, unsigned char *data_in,
 	ZSTD_DStream *stream;
 	int ret = 0;
 	size_t ret2;
-	ZSTD_inBuffer in_buf = { NULL, 0, 0 };
-	ZSTD_outBuffer out_buf = { NULL, 0, 0 };
 	unsigned long total_out = 0;
 	unsigned long pg_offset = 0;
 	char *kaddr;
@@ -364,16 +363,17 @@ static int zstd_decompress(struct list_head *ws, unsigned char *data_in,
 
 	destlen = min_t(size_t, destlen, PAGE_SIZE);
 
-	in_buf.src = data_in;
-	in_buf.pos = 0;
-	in_buf.size = srclen;
+	workspace->in_buf.src = data_in;
+	workspace->in_buf.pos = 0;
+	workspace->in_buf.size = srclen;
 
-	out_buf.dst = workspace->buf;
-	out_buf.pos = 0;
-	out_buf.size = PAGE_SIZE;
+	workspace->out_buf.dst = workspace->buf;
+	workspace->out_buf.pos = 0;
+	workspace->out_buf.size = PAGE_SIZE;
 
 	ret2 = 1;
-	while (pg_offset < destlen && in_buf.pos < in_buf.size) {
+	while (pg_offset < destlen
+	       && workspace->in_buf.pos < workspace->in_buf.size) {
 		unsigned long buf_start;
 		unsigned long buf_offset;
 		unsigned long bytes;
@@ -384,7 +384,8 @@ static int zstd_decompress(struct list_head *ws, unsigned char *data_in,
 			ret = -EIO;
 			goto finish;
 		}
-		ret2 = ZSTD_decompressStream(stream, &out_buf, &in_buf);
+		ret2 = ZSTD_decompressStream(stream, &workspace->out_buf,
+				&workspace->in_buf);
 		if (ZSTD_isError(ret2)) {
 			pr_debug("BTRFS: ZSTD_decompressStream returned %d\n",
 					ZSTD_getErrorCode(ret2));
@@ -393,8 +394,8 @@ static int zstd_decompress(struct list_head *ws, unsigned char *data_in,
 		}
 
 		buf_start = total_out;
-		total_out += out_buf.pos;
-		out_buf.pos = 0;
+		total_out += workspace->out_buf.pos;
+		workspace->out_buf.pos = 0;
 
 		if (total_out <= start_byte)
 			continue;
@@ -405,10 +406,11 @@ static int zstd_decompress(struct list_head *ws, unsigned char *data_in,
 			buf_offset = 0;
 
 		bytes = min_t(unsigned long, destlen - pg_offset,
-				out_buf.size - buf_offset);
+				workspace->out_buf.size - buf_offset);
 
 		kaddr = kmap_atomic(dest_page);
-		memcpy(kaddr + pg_offset, out_buf.dst + buf_offset, bytes);
+		memcpy(kaddr + pg_offset, workspace->out_buf.dst + buf_offset,
+				bytes);
 		kunmap_atomic(kaddr);
 
 		pg_offset += bytes;
-- 
cgit v1.2.3


From 3f2dd7a0cef4d7ff964bb5e35beb500cbf3b7bcf Mon Sep 17 00:00:00 2001
From: Qu Wenruo <wqu@suse.com>
Date: Fri, 17 Nov 2017 15:14:19 +0800
Subject: btrfs: extent-tree: Make btrfs_inode_rsv_refill function static

This function is no longer used outside of extent-tree.c.
Make it static.

Signed-off-by: Qu Wenruo <wqu@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/extent-tree.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index c3ccd9d2e946..696275b7b66a 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -5738,8 +5738,8 @@ int btrfs_block_rsv_refill(struct btrfs_root *root,
  * or return if we already have enough space.  This will also handle the resreve
  * tracepoint for the reserved amount.
  */
-int btrfs_inode_rsv_refill(struct btrfs_inode *inode,
-			   enum btrfs_reserve_flush_enum flush)
+static int btrfs_inode_rsv_refill(struct btrfs_inode *inode,
+				  enum btrfs_reserve_flush_enum flush)
 {
 	struct btrfs_root *root = inode->root;
 	struct btrfs_block_rsv *block_rsv = &inode->block_rsv;
-- 
cgit v1.2.3


From 4a2d25cd93cbd2e5ad6c9aabd90da362b2dd9984 Mon Sep 17 00:00:00 2001
From: Nikolay Borisov <nborisov@suse.com>
Date: Thu, 23 Nov 2017 10:51:43 +0200
Subject: btrfs: Remove redundant FLAG_VACANCY

Commit 9036c10208e1 ("Btrfs: update hole handling v2") added the
FLAG_VACANCY to denote holes, however there was already a consistent way
of flagging extents which represent hole - ->block_start =
EXTENT_MAP_HOLE. And also the only place where this flag is checked is
in the fiemap code, but the block_start value is also checked and every
other place in the filesystem detects holes by using block_start
value's. So remove the extra flag. This survived a full xfstest run.

Signed-off-by: Nikolay Borisov <nborisov@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/extent_io.c         | 4 +---
 fs/btrfs/extent_map.h        | 1 -
 fs/btrfs/inode.c             | 1 -
 fs/btrfs/tests/inode-tests.c | 5 -----
 include/trace/events/btrfs.h | 1 -
 5 files changed, 1 insertion(+), 11 deletions(-)

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 932d805a81e3..ef72efef8b39 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -4311,10 +4311,8 @@ static struct extent_map *get_extent_skip_holes(struct inode *inode,
 			return em;
 
 		/* if this isn't a hole return it */
-		if (!test_bit(EXTENT_FLAG_VACANCY, &em->flags) &&
-		    em->block_start != EXTENT_MAP_HOLE) {
+		if (em->block_start != EXTENT_MAP_HOLE)
 			return em;
-		}
 
 		/* this is a hole, advance to the next extent */
 		offset = extent_map_end(em);
diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
index 64365bbc9b16..e9e285d45c7e 100644
--- a/fs/btrfs/extent_map.h
+++ b/fs/btrfs/extent_map.h
@@ -13,7 +13,6 @@
 /* bits for the flags field */
 #define EXTENT_FLAG_PINNED 0 /* this entry not yet on disk, don't free it */
 #define EXTENT_FLAG_COMPRESSED 1
-#define EXTENT_FLAG_VACANCY 2 /* no file extent item found */
 #define EXTENT_FLAG_PREALLOC 3 /* pre-allocated extent */
 #define EXTENT_FLAG_LOGGING 4 /* Logging this extent */
 #define EXTENT_FLAG_FILLING 5 /* Filling in a preallocated extent */
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 1c704eb82487..57785eadb95c 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -7118,7 +7118,6 @@ not_found:
 	em->len = len;
 not_found_em:
 	em->block_start = EXTENT_MAP_HOLE;
-	set_bit(EXTENT_FLAG_VACANCY, &em->flags);
 insert:
 	btrfs_release_path(path);
 	if (em->start > start || extent_map_end(em) <= start) {
diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c
index 4a0a60d3275d..13420cd19ef0 100644
--- a/fs/btrfs/tests/inode-tests.c
+++ b/fs/btrfs/tests/inode-tests.c
@@ -288,10 +288,6 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
 		test_msg("Expected a hole, got %llu\n", em->block_start);
 		goto out;
 	}
-	if (!test_bit(EXTENT_FLAG_VACANCY, &em->flags)) {
-		test_msg("Vacancy flag wasn't set properly\n");
-		goto out;
-	}
 	free_extent_map(em);
 	btrfs_drop_extent_cache(BTRFS_I(inode), 0, (u64)-1, 0);
 
@@ -1130,7 +1126,6 @@ int btrfs_test_inodes(u32 sectorsize, u32 nodesize)
 	int ret;
 
 	set_bit(EXTENT_FLAG_COMPRESSED, &compressed_only);
-	set_bit(EXTENT_FLAG_VACANCY, &vacancy_only);
 	set_bit(EXTENT_FLAG_PREALLOC, &prealloc_only);
 
 	test_msg("Running btrfs_get_extent tests\n");
diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h
index 4342a329821f..c3ac5ec86519 100644
--- a/include/trace/events/btrfs.h
+++ b/include/trace/events/btrfs.h
@@ -193,7 +193,6 @@ DEFINE_EVENT(btrfs__inode, btrfs_inode_evict,
 	__print_flags(flag, "|",					\
 		{ (1 << EXTENT_FLAG_PINNED), 		"PINNED" 	},\
 		{ (1 << EXTENT_FLAG_COMPRESSED), 	"COMPRESSED" 	},\
-		{ (1 << EXTENT_FLAG_VACANCY), 		"VACANCY" 	},\
 		{ (1 << EXTENT_FLAG_PREALLOC), 		"PREALLOC" 	},\
 		{ (1 << EXTENT_FLAG_LOGGING),	 	"LOGGING" 	},\
 		{ (1 << EXTENT_FLAG_FILLING),	 	"FILLING" 	},\
-- 
cgit v1.2.3


From ccc8dc758da0deeddaec2d8a37524401654e99de Mon Sep 17 00:00:00 2001
From: Colin Ian King <colin.king@canonical.com>
Date: Thu, 30 Nov 2017 12:14:47 +0000
Subject: btrfs: make function update_share_count static

The function update_share_count is local to the source and does
not need to be in global scope, so make it static.

Cleans up sparse warning:
fs/btrfs/backref.c:219:6: warning: symbol 'update_share_count' was not
declared. Should it be static?

Signed-off-by: Colin Ian King <colin.king@canonical.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/backref.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 7d0dc100a09a..e4054e533f6d 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -216,7 +216,8 @@ static int prelim_ref_compare(struct prelim_ref *ref1,
 	return 0;
 }
 
-void update_share_count(struct share_check *sc, int oldcount, int newcount)
+static void update_share_count(struct share_check *sc, int oldcount,
+			       int newcount)
 {
 	if ((!sc) || (oldcount == 0 && newcount < 1))
 		return;
-- 
cgit v1.2.3


From 38b5f68e98117daa221c7df813608f48c374aef4 Mon Sep 17 00:00:00 2001
From: Anand Jain <anand.jain@oracle.com>
Date: Wed, 29 Nov 2017 18:53:43 +0800
Subject: btrfs: drop btrfs_device::can_discard to query directly

We can query the bdev directly when needed at btrfs_discard_extent()
so drop btrfs_device::can_discard.

Signed-off-by: Anand Jain <anand.jain@oracle.com>
Suggested-by: Nikolay Borisov <nborisov@suse.com>
Reviewed-by: Nikolay Borisov <nborisov@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/extent-tree.c | 5 ++++-
 fs/btrfs/volumes.c     | 8 --------
 fs/btrfs/volumes.h     | 1 -
 3 files changed, 4 insertions(+), 10 deletions(-)

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 696275b7b66a..7411f65099d1 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -2145,7 +2145,10 @@ int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr,
 
 		for (i = 0; i < bbio->num_stripes; i++, stripe++) {
 			u64 bytes;
-			if (!stripe->dev->can_discard)
+			struct request_queue *req_q;
+
+			req_q = bdev_get_queue(stripe->dev->bdev);
+			if (!blk_queue_discard(req_q))
 				continue;
 
 			ret = btrfs_issue_discard(stripe->dev->bdev,
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 20039d625210..3e348743271a 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -698,8 +698,6 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices,
 	}
 
 	q = bdev_get_queue(bdev);
-	if (blk_queue_discard(q))
-		device->can_discard = 1;
 	if (!blk_queue_nonrot(q))
 		fs_devices->rotating = 1;
 
@@ -2433,8 +2431,6 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
 	}
 
 	q = bdev_get_queue(bdev);
-	if (blk_queue_discard(q))
-		device->can_discard = 1;
 	device->writeable = 1;
 	device->generation = trans->transid;
 	device->io_width = fs_info->sectorsize;
@@ -2585,7 +2581,6 @@ int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
 				  struct btrfs_device *srcdev,
 				  struct btrfs_device **device_out)
 {
-	struct request_queue *q;
 	struct btrfs_device *device;
 	struct block_device *bdev;
 	struct list_head *devices;
@@ -2642,9 +2637,6 @@ int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
 	}
 	rcu_assign_pointer(device->name, name);
 
-	q = bdev_get_queue(bdev);
-	if (blk_queue_discard(q))
-		device->can_discard = 1;
 	mutex_lock(&fs_info->fs_devices->device_list_mutex);
 	device->writeable = 1;
 	device->generation = 0;
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 294c4eb6a272..60588c259665 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -72,7 +72,6 @@ struct btrfs_device {
 	int writeable;
 	int in_fs_metadata;
 	int missing;
-	int can_discard;
 	int is_tgtdev_for_dev_replace;
 	blk_status_t last_flush_error;
 	int flush_bio_sent;
-- 
cgit v1.2.3


From 3c958bd23b60c22947b857d2cb13196e2cc58c58 Mon Sep 17 00:00:00 2001
From: Anand Jain <anand.jain@oracle.com>
Date: Tue, 28 Nov 2017 10:43:10 +0800
Subject: btrfs: add helper for device path or missing

This patch creates a helper function to get either the rcu device path
or missing.

Signed-off-by: Anand Jain <anand.jain@oracle.com>
[ rename to btrfs_dev_name, switch to if/else ]
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/dev-replace.c | 25 ++++++++++++++-----------
 1 file changed, 14 insertions(+), 11 deletions(-)

diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index 7c655f9a7a50..483eb62b9b27 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -304,6 +304,14 @@ void btrfs_after_dev_replace_commit(struct btrfs_fs_info *fs_info)
 		dev_replace->cursor_left_last_write_of_item;
 }
 
+static char* btrfs_dev_name(struct btrfs_device *device)
+{
+	if (device->missing)
+		return "<missing disk>";
+	else
+		return rcu_str_deref(device->name);
+}
+
 int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info,
 		const char *tgtdev_name, u64 srcdevid, const char *srcdev_name,
 		int read_src)
@@ -363,8 +371,7 @@ int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info,
 
 	btrfs_info_in_rcu(fs_info,
 		      "dev_replace from %s (devid %llu) to %s started",
-		      src_device->missing ? "<missing disk>" :
-		        rcu_str_deref(src_device->name),
+		      btrfs_dev_name(src_device),
 		      src_device->devid,
 		      rcu_str_deref(tgt_device->name));
 
@@ -538,8 +545,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
 	} else {
 		btrfs_err_in_rcu(fs_info,
 				 "btrfs_scrub_dev(%s, %llu, %s) failed %d",
-				 src_device->missing ? "<missing disk>" :
-				 rcu_str_deref(src_device->name),
+				 btrfs_dev_name(src_device),
 				 src_device->devid,
 				 rcu_str_deref(tgt_device->name), scrub_ret);
 		btrfs_dev_replace_unlock(dev_replace, 1);
@@ -557,8 +563,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
 
 	btrfs_info_in_rcu(fs_info,
 			  "dev_replace from %s (devid %llu) to %s finished",
-			  src_device->missing ? "<missing disk>" :
-			  rcu_str_deref(src_device->name),
+			  btrfs_dev_name(src_device),
 			  src_device->devid,
 			  rcu_str_deref(tgt_device->name));
 	tgt_device->is_tgtdev_for_dev_replace = 0;
@@ -814,12 +819,10 @@ static int btrfs_dev_replace_kthread(void *data)
 	progress = btrfs_dev_replace_progress(fs_info);
 	progress = div_u64(progress, 10);
 	btrfs_info_in_rcu(fs_info,
-		"continuing dev_replace from %s (devid %llu) to %s @%u%%",
-		dev_replace->srcdev->missing ? "<missing disk>"
-			: rcu_str_deref(dev_replace->srcdev->name),
+		"continuing dev_replace from %s (devid %llu) to target %s @%u%%",
+		btrfs_dev_name(dev_replace->srcdev),
 		dev_replace->srcdev->devid,
-		dev_replace->tgtdev ? rcu_str_deref(dev_replace->tgtdev->name)
-			: "<missing target disk>",
+		btrfs_dev_name(dev_replace->tgtdev),
 		(unsigned int)progress);
 
 	btrfs_dev_replace_continue_on_mount(fs_info);
-- 
cgit v1.2.3


From ebbede42d47dc77d1c20e7468418826e5efa6b29 Mon Sep 17 00:00:00 2001
From: Anand Jain <anand.jain@oracle.com>
Date: Mon, 4 Dec 2017 12:54:52 +0800
Subject: btrfs: cleanup device states define BTRFS_DEV_STATE_WRITEABLE

Currently device state is being managed by each individual int
variable such as struct btrfs_device::writeable. Instead of that
declare device state BTRFS_DEV_STATE_WRITEABLE and use the
bit operations.

Signed-off-by: Anand Jain <anand.jain@oracle.com>
[ whitespace adjustments ]
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/disk-io.c     | 12 +++++++----
 fs/btrfs/extent-tree.c |  2 +-
 fs/btrfs/extent_io.c   |  3 ++-
 fs/btrfs/ioctl.c       |  2 +-
 fs/btrfs/scrub.c       |  3 ++-
 fs/btrfs/volumes.c     | 57 ++++++++++++++++++++++++++++----------------------
 fs/btrfs/volumes.h     |  4 +++-
 7 files changed, 49 insertions(+), 34 deletions(-)

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 5f9430063c50..396f8183ee86 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -3393,7 +3393,8 @@ static int barrier_all_devices(struct btrfs_fs_info *info)
 			continue;
 		if (!dev->bdev)
 			continue;
-		if (!dev->in_fs_metadata || !dev->writeable)
+		if (!dev->in_fs_metadata ||
+		    !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))
 			continue;
 
 		write_dev_flush(dev);
@@ -3408,7 +3409,8 @@ static int barrier_all_devices(struct btrfs_fs_info *info)
 			errors_wait++;
 			continue;
 		}
-		if (!dev->in_fs_metadata || !dev->writeable)
+		if (!dev->in_fs_metadata ||
+		    !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))
 			continue;
 
 		ret = wait_dev_flush(dev);
@@ -3505,7 +3507,8 @@ int write_all_supers(struct btrfs_fs_info *fs_info, int max_mirrors)
 			total_errors++;
 			continue;
 		}
-		if (!dev->in_fs_metadata || !dev->writeable)
+		if (!dev->in_fs_metadata ||
+		    !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))
 			continue;
 
 		btrfs_set_stack_device_generation(dev_item, 0);
@@ -3544,7 +3547,8 @@ int write_all_supers(struct btrfs_fs_info *fs_info, int max_mirrors)
 	list_for_each_entry(dev, head, dev_list) {
 		if (!dev->bdev)
 			continue;
-		if (!dev->in_fs_metadata || !dev->writeable)
+		if (!dev->in_fs_metadata ||
+		    !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))
 			continue;
 
 		ret = wait_dev_supers(dev, max_mirrors);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 7411f65099d1..53cb2a1d025f 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -10878,7 +10878,7 @@ static int btrfs_trim_free_extents(struct btrfs_device *device,
 	*trimmed = 0;
 
 	/* Not writeable = nothing to do. */
-	if (!device->writeable)
+	if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
 		return 0;
 
 	/* No free space = nothing to do. */
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index ef72efef8b39..16ae832bdb5d 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -2026,7 +2026,8 @@ int repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start,
 	bio->bi_iter.bi_sector = sector;
 	dev = bbio->stripes[bbio->mirror_num - 1].dev;
 	btrfs_put_bbio(bbio);
-	if (!dev || !dev->bdev || !dev->writeable) {
+	if (!dev || !dev->bdev ||
+	    !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) {
 		btrfs_bio_counter_dec(fs_info);
 		bio_put(bio);
 		return -EIO;
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index e7f37c46d6a9..8c75e555e3f8 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -1503,7 +1503,7 @@ static noinline int btrfs_ioctl_resize(struct file *file,
 		goto out_free;
 	}
 
-	if (!device->writeable) {
+	if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
 		btrfs_info(fs_info,
 			   "resizer unable to apply on readonly device %llu",
 		       devid);
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index b2f871d80982..7927307652e9 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -4117,7 +4117,8 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
 		return -ENODEV;
 	}
 
-	if (!is_dev_replace && !readonly && !dev->writeable) {
+	if (!is_dev_replace && !readonly &&
+	    !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) {
 		mutex_unlock(&fs_info->fs_devices->device_list_mutex);
 		rcu_read_lock();
 		name = rcu_dereference(dev->name);
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 3e348743271a..c6db1ae4ac56 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -691,10 +691,13 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices,
 	device->generation = btrfs_super_generation(disk_super);
 
 	if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) {
-		device->writeable = 0;
+		clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
 		fs_devices->seeding = 1;
 	} else {
-		device->writeable = !bdev_read_only(bdev);
+		if (bdev_read_only(bdev))
+			clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
+		else
+			set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
 	}
 
 	q = bdev_get_queue(bdev);
@@ -706,7 +709,8 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices,
 	device->mode = flags;
 
 	fs_devices->open_devices++;
-	if (device->writeable && device->devid != BTRFS_DEV_REPLACE_DEVID) {
+	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
+	    device->devid != BTRFS_DEV_REPLACE_DEVID) {
 		fs_devices->rw_devices++;
 		list_add(&device->dev_alloc_list, &fs_devices->alloc_list);
 	}
@@ -934,9 +938,9 @@ again:
 			device->bdev = NULL;
 			fs_devices->open_devices--;
 		}
-		if (device->writeable) {
+		if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
 			list_del_init(&device->dev_alloc_list);
-			device->writeable = 0;
+			clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
 			if (!device->is_tgtdev_for_dev_replace)
 				fs_devices->rw_devices--;
 		}
@@ -968,7 +972,7 @@ static void btrfs_close_bdev(struct btrfs_device *device)
 	if (!device->bdev)
 		return;
 
-	if (device->writeable) {
+	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
 		sync_blockdev(device->bdev);
 		invalidate_bdev(device->bdev);
 	}
@@ -985,7 +989,7 @@ static void btrfs_prepare_close_one_device(struct btrfs_device *device)
 	if (device->bdev)
 		fs_devices->open_devices--;
 
-	if (device->writeable &&
+	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
 	    device->devid != BTRFS_DEV_REPLACE_DEVID) {
 		list_del_init(&device->dev_alloc_list);
 		fs_devices->rw_devices--;
@@ -1943,12 +1947,13 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path,
 		goto out;
 	}
 
-	if (device->writeable && fs_info->fs_devices->rw_devices == 1) {
+	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
+	    fs_info->fs_devices->rw_devices == 1) {
 		ret = BTRFS_ERROR_DEV_ONLY_WRITABLE;
 		goto out;
 	}
 
-	if (device->writeable) {
+	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
 		mutex_lock(&fs_info->chunk_mutex);
 		list_del_init(&device->dev_alloc_list);
 		device->fs_devices->rw_devices--;
@@ -2010,7 +2015,7 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path,
 	 * the devices list.  All that's left is to zero out the old
 	 * supers and free the device.
 	 */
-	if (device->writeable)
+	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
 		btrfs_scratch_superblocks(device->bdev, device->name->str);
 
 	btrfs_close_bdev(device);
@@ -2037,7 +2042,7 @@ out:
 	return ret;
 
 error_undo:
-	if (device->writeable) {
+	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
 		mutex_lock(&fs_info->chunk_mutex);
 		list_add(&device->dev_alloc_list,
 			 &fs_info->fs_devices->alloc_list);
@@ -2068,7 +2073,7 @@ void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_fs_info *fs_info,
 	if (srcdev->missing)
 		fs_devices->missing_devices--;
 
-	if (srcdev->writeable)
+	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &srcdev->dev_state))
 		fs_devices->rw_devices--;
 
 	if (srcdev->bdev)
@@ -2080,7 +2085,7 @@ void btrfs_rm_dev_replace_free_srcdev(struct btrfs_fs_info *fs_info,
 {
 	struct btrfs_fs_devices *fs_devices = srcdev->fs_devices;
 
-	if (srcdev->writeable) {
+	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &srcdev->dev_state)) {
 		/* zero out the old super if it is writable */
 		btrfs_scratch_superblocks(srcdev->bdev, srcdev->name->str);
 	}
@@ -2431,7 +2436,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
 	}
 
 	q = bdev_get_queue(bdev);
-	device->writeable = 1;
+	set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
 	device->generation = trans->transid;
 	device->io_width = fs_info->sectorsize;
 	device->io_align = fs_info->sectorsize;
@@ -2638,7 +2643,7 @@ int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
 	rcu_assign_pointer(device->name, name);
 
 	mutex_lock(&fs_info->fs_devices->device_list_mutex);
-	device->writeable = 1;
+	set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
 	device->generation = 0;
 	device->io_width = fs_info->sectorsize;
 	device->io_align = fs_info->sectorsize;
@@ -2738,7 +2743,7 @@ int btrfs_grow_device(struct btrfs_trans_handle *trans,
 	u64 old_total;
 	u64 diff;
 
-	if (!device->writeable)
+	if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
 		return -EACCES;
 
 	new_size = round_down(new_size, fs_info->sectorsize);
@@ -3558,7 +3563,7 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info)
 		old_size = btrfs_device_get_total_bytes(device);
 		size_to_free = div_factor(old_size, 1);
 		size_to_free = min_t(u64, size_to_free, SZ_1M);
-		if (!device->writeable ||
+		if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) ||
 		    btrfs_device_get_total_bytes(device) -
 		    btrfs_device_get_bytes_used(device) > size_to_free ||
 		    device->is_tgtdev_for_dev_replace)
@@ -4441,7 +4446,7 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
 	mutex_lock(&fs_info->chunk_mutex);
 
 	btrfs_device_set_total_bytes(device, new_size);
-	if (device->writeable) {
+	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
 		device->fs_devices->total_rw_bytes -= diff;
 		atomic64_sub(diff, &fs_info->free_chunk_space);
 	}
@@ -4566,7 +4571,7 @@ done:
 	if (ret) {
 		mutex_lock(&fs_info->chunk_mutex);
 		btrfs_device_set_total_bytes(device, old_size);
-		if (device->writeable)
+		if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
 			device->fs_devices->total_rw_bytes += diff;
 		atomic64_add(diff, &fs_info->free_chunk_space);
 		mutex_unlock(&fs_info->chunk_mutex);
@@ -4726,7 +4731,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 		u64 max_avail;
 		u64 dev_offset;
 
-		if (!device->writeable) {
+		if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
 			WARN(1, KERN_ERR
 			       "BTRFS: read-only device in alloc_list\n");
 			continue;
@@ -5085,8 +5090,8 @@ int btrfs_chunk_readonly(struct btrfs_fs_info *fs_info, u64 chunk_offset)
 			miss_ndevs++;
 			continue;
 		}
-
-		if (!map->stripes[i].dev->writeable) {
+		if (!test_bit(BTRFS_DEV_STATE_WRITEABLE,
+					&map->stripes[i].dev->dev_state)) {
 			readonly = 1;
 			goto end;
 		}
@@ -6255,7 +6260,8 @@ blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
 	for (dev_nr = 0; dev_nr < total_devs; dev_nr++) {
 		dev = bbio->stripes[dev_nr].dev;
 		if (!dev || !dev->bdev ||
-		    (bio_op(first_bio) == REQ_OP_WRITE && !dev->writeable)) {
+		    (bio_op(first_bio) == REQ_OP_WRITE &&
+		    !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))) {
 			bbio_error(bbio, first_bio, logical);
 			continue;
 		}
@@ -6691,7 +6697,7 @@ static int read_one_dev(struct btrfs_fs_info *fs_info,
 	}
 
 	if (device->fs_devices != fs_info->fs_devices) {
-		BUG_ON(device->writeable);
+		BUG_ON(test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state));
 		if (device->generation !=
 		    btrfs_device_generation(leaf, dev_item))
 			return -EINVAL;
@@ -6699,7 +6705,8 @@ static int read_one_dev(struct btrfs_fs_info *fs_info,
 
 	fill_device_from_item(leaf, dev_item, device);
 	device->in_fs_metadata = 1;
-	if (device->writeable && !device->is_tgtdev_for_dev_replace) {
+	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
+	    !device->is_tgtdev_for_dev_replace) {
 		device->fs_devices->total_rw_bytes += device->total_bytes;
 		atomic64_add(device->total_bytes - device->bytes_used,
 				&fs_info->free_chunk_space);
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 60588c259665..893e283c7f15 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -47,6 +47,8 @@ struct btrfs_pending_bios {
 #define btrfs_device_data_ordered_init(device) do { } while (0)
 #endif
 
+#define BTRFS_DEV_STATE_WRITEABLE	(0)
+
 struct btrfs_device {
 	struct list_head dev_list;
 	struct list_head dev_alloc_list;
@@ -69,7 +71,7 @@ struct btrfs_device {
 	/* the mode sent to blkdev_get */
 	fmode_t mode;
 
-	int writeable;
+	unsigned long dev_state;
 	int in_fs_metadata;
 	int missing;
 	int is_tgtdev_for_dev_replace;
-- 
cgit v1.2.3


From e12c96214d28f9211b4035cf20e76d677ff5611f Mon Sep 17 00:00:00 2001
From: Anand Jain <anand.jain@oracle.com>
Date: Mon, 4 Dec 2017 12:54:53 +0800
Subject: btrfs: cleanup device states define BTRFS_DEV_STATE_IN_FS_METADATA

Currently device state is being managed by each individual int
variable such as struct btrfs_device::in_fs_metadata. Instead of
that declare device state BTRFS_DEV_STATE_IN_FS_METADATA and use
the bit operations.

Signed-off-by: Anand Jain <anand.jain@oracle.com>
Reviewed-by: Nikolay Borisov <nborisov@suse.com>
[ whitespace adjustments ]
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/disk-io.c |  8 ++++----
 fs/btrfs/scrub.c   |  3 ++-
 fs/btrfs/super.c   |  5 +++--
 fs/btrfs/volumes.c | 29 +++++++++++++++++------------
 fs/btrfs/volumes.h |  2 +-
 5 files changed, 27 insertions(+), 20 deletions(-)

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 396f8183ee86..44573cdda1fd 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -3393,7 +3393,7 @@ static int barrier_all_devices(struct btrfs_fs_info *info)
 			continue;
 		if (!dev->bdev)
 			continue;
-		if (!dev->in_fs_metadata ||
+		if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &dev->dev_state) ||
 		    !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))
 			continue;
 
@@ -3409,7 +3409,7 @@ static int barrier_all_devices(struct btrfs_fs_info *info)
 			errors_wait++;
 			continue;
 		}
-		if (!dev->in_fs_metadata ||
+		if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &dev->dev_state) ||
 		    !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))
 			continue;
 
@@ -3507,7 +3507,7 @@ int write_all_supers(struct btrfs_fs_info *fs_info, int max_mirrors)
 			total_errors++;
 			continue;
 		}
-		if (!dev->in_fs_metadata ||
+		if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &dev->dev_state) ||
 		    !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))
 			continue;
 
@@ -3547,7 +3547,7 @@ int write_all_supers(struct btrfs_fs_info *fs_info, int max_mirrors)
 	list_for_each_entry(dev, head, dev_list) {
 		if (!dev->bdev)
 			continue;
-		if (!dev->in_fs_metadata ||
+		if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &dev->dev_state) ||
 		    !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))
 			continue;
 
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 7927307652e9..dc58005cb9f6 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -4129,7 +4129,8 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
 	}
 
 	mutex_lock(&fs_info->scrub_lock);
-	if (!dev->in_fs_metadata || dev->is_tgtdev_for_dev_replace) {
+	if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &dev->dev_state) ||
+	    dev->is_tgtdev_for_dev_replace) {
 		mutex_unlock(&fs_info->scrub_lock);
 		mutex_unlock(&fs_info->fs_devices->device_list_mutex);
 		return -EIO;
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 7a7abe827ac4..bc55c5e6badd 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -1972,8 +1972,9 @@ static int btrfs_calc_avail_data_space(struct btrfs_fs_info *fs_info,
 
 	rcu_read_lock();
 	list_for_each_entry_rcu(device, &fs_devices->devices, dev_list) {
-		if (!device->in_fs_metadata || !device->bdev ||
-		    device->is_tgtdev_for_dev_replace)
+		if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
+						&device->dev_state) ||
+		    !device->bdev || device->is_tgtdev_for_dev_replace)
 			continue;
 
 		if (i >= nr_devices)
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index c6db1ae4ac56..e782f6853757 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -705,7 +705,7 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices,
 		fs_devices->rotating = 1;
 
 	device->bdev = bdev;
-	device->in_fs_metadata = 0;
+	clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
 	device->mode = flags;
 
 	fs_devices->open_devices++;
@@ -909,7 +909,8 @@ void btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices, int step)
 again:
 	/* This is the initialized path, it is safe to release the devices. */
 	list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) {
-		if (device->in_fs_metadata) {
+		if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
+							&device->dev_state)) {
 			if (!device->is_tgtdev_for_dev_replace &&
 			    (!latest_dev ||
 			     device->generation > latest_dev->generation)) {
@@ -1634,7 +1635,7 @@ static int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
 	struct extent_buffer *leaf;
 	struct btrfs_key key;
 
-	WARN_ON(!device->in_fs_metadata);
+	WARN_ON(!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state));
 	WARN_ON(device->is_tgtdev_for_dev_replace);
 	path = btrfs_alloc_path();
 	if (!path)
@@ -1975,7 +1976,7 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path,
 	if (ret)
 		goto error_undo;
 
-	device->in_fs_metadata = 0;
+	clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
 	btrfs_scrub_cancel_dev(fs_info, device);
 
 	/*
@@ -2195,7 +2196,8 @@ int btrfs_find_device_missing_or_by_path(struct btrfs_fs_info *fs_info,
 		 * is held by the caller.
 		 */
 		list_for_each_entry(tmp, devices, dev_list) {
-			if (tmp->in_fs_metadata && !tmp->bdev) {
+			if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
+					&tmp->dev_state) && !tmp->bdev) {
 				*device = tmp;
 				break;
 			}
@@ -2447,7 +2449,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
 	device->commit_total_bytes = device->total_bytes;
 	device->fs_info = fs_info;
 	device->bdev = bdev;
-	device->in_fs_metadata = 1;
+	set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
 	device->is_tgtdev_for_dev_replace = 0;
 	device->mode = FMODE_EXCL;
 	device->dev_stats_valid = 1;
@@ -2656,7 +2658,7 @@ int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
 	device->commit_bytes_used = device->bytes_used;
 	device->fs_info = fs_info;
 	device->bdev = bdev;
-	device->in_fs_metadata = 1;
+	set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
 	device->is_tgtdev_for_dev_replace = 1;
 	device->mode = FMODE_EXCL;
 	device->dev_stats_valid = 1;
@@ -2685,7 +2687,7 @@ void btrfs_init_dev_replace_tgtdev_for_resume(struct btrfs_fs_info *fs_info,
 	tgtdev->io_align = sectorsize;
 	tgtdev->sector_size = sectorsize;
 	tgtdev->fs_info = fs_info;
-	tgtdev->in_fs_metadata = 1;
+	set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &tgtdev->dev_state);
 }
 
 static noinline int btrfs_update_device(struct btrfs_trans_handle *trans,
@@ -4737,8 +4739,9 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 			continue;
 		}
 
-		if (!device->in_fs_metadata ||
-		    device->is_tgtdev_for_dev_replace)
+		if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
+					&device->dev_state) ||
+		     device->is_tgtdev_for_dev_replace)
 			continue;
 
 		if (device->total_bytes > device->bytes_used)
@@ -6529,7 +6532,9 @@ static int read_one_chunk(struct btrfs_fs_info *fs_info, struct btrfs_key *key,
 			}
 			btrfs_report_missing_device(fs_info, devid, uuid, false);
 		}
-		map->stripes[i].dev->in_fs_metadata = 1;
+		set_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
+				&(map->stripes[i].dev->dev_state));
+
 	}
 
 	write_lock(&map_tree->map_tree.lock);
@@ -6704,7 +6709,7 @@ static int read_one_dev(struct btrfs_fs_info *fs_info,
 	}
 
 	fill_device_from_item(leaf, dev_item, device);
-	device->in_fs_metadata = 1;
+	set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
 	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
 	    !device->is_tgtdev_for_dev_replace) {
 		device->fs_devices->total_rw_bytes += device->total_bytes;
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 893e283c7f15..d290641658cc 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -48,6 +48,7 @@ struct btrfs_pending_bios {
 #endif
 
 #define BTRFS_DEV_STATE_WRITEABLE	(0)
+#define BTRFS_DEV_STATE_IN_FS_METADATA	(1)
 
 struct btrfs_device {
 	struct list_head dev_list;
@@ -72,7 +73,6 @@ struct btrfs_device {
 	fmode_t mode;
 
 	unsigned long dev_state;
-	int in_fs_metadata;
 	int missing;
 	int is_tgtdev_for_dev_replace;
 	blk_status_t last_flush_error;
-- 
cgit v1.2.3


From e6e674bd4d54fe8d47a06914f3b90752785b4882 Mon Sep 17 00:00:00 2001
From: Anand Jain <anand.jain@oracle.com>
Date: Mon, 4 Dec 2017 12:54:54 +0800
Subject: btrfs: cleanup device states define BTRFS_DEV_STATE_MISSING

Currently device state is being managed by each individual int
variable such as struct btrfs_device::missing. Instead of that
declare btrfs_device::dev_state BTRFS_DEV_STATE_MISSING and use
the bit operations.

Signed-off-by: Anand Jain <anand.jain@oracle.com>
Reviewed-by : Nikolay Borisov <nborisov@suse.com>
[ whitespace adjustments ]
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/dev-replace.c |  2 +-
 fs/btrfs/disk-io.c     |  4 ++--
 fs/btrfs/scrub.c       |  7 ++++---
 fs/btrfs/super.c       |  2 +-
 fs/btrfs/volumes.c     | 32 +++++++++++++++++++-------------
 fs/btrfs/volumes.h     |  2 +-
 6 files changed, 28 insertions(+), 21 deletions(-)

diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index 483eb62b9b27..d2c820c6f91e 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -306,7 +306,7 @@ void btrfs_after_dev_replace_commit(struct btrfs_fs_info *fs_info)
 
 static char* btrfs_dev_name(struct btrfs_device *device)
 {
-	if (device->missing)
+	if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
 		return "<missing disk>";
 	else
 		return rcu_str_deref(device->name);
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 44573cdda1fd..4f7d2a38865c 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -3389,7 +3389,7 @@ static int barrier_all_devices(struct btrfs_fs_info *info)
 	/* send down all the barriers */
 	head = &info->fs_devices->devices;
 	list_for_each_entry(dev, head, dev_list) {
-		if (dev->missing)
+		if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state))
 			continue;
 		if (!dev->bdev)
 			continue;
@@ -3403,7 +3403,7 @@ static int barrier_all_devices(struct btrfs_fs_info *info)
 
 	/* wait for all the barriers */
 	list_for_each_entry(dev, head, dev_list) {
-		if (dev->missing)
+		if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state))
 			continue;
 		if (!dev->bdev) {
 			errors_wait++;
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index dc58005cb9f6..f8a073ce490b 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -2535,7 +2535,7 @@ leave_nomem:
 	}
 
 	WARN_ON(sblock->page_count == 0);
-	if (dev->missing) {
+	if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state)) {
 		/*
 		 * This case should only be hit for RAID 5/6 device replace. See
 		 * the comment in scrub_missing_raid56_pages() for details.
@@ -2870,7 +2870,7 @@ static int scrub_extent_for_parity(struct scrub_parity *sparity,
 	u8 csum[BTRFS_CSUM_SIZE];
 	u32 blocksize;
 
-	if (dev->missing) {
+	if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state)) {
 		scrub_parity_mark_sectors_error(sparity, logical, len);
 		return 0;
 	}
@@ -4112,7 +4112,8 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
 
 	mutex_lock(&fs_info->fs_devices->device_list_mutex);
 	dev = btrfs_find_device(fs_info, devid, NULL, NULL);
-	if (!dev || (dev->missing && !is_dev_replace)) {
+	if (!dev || (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) &&
+		     !is_dev_replace)) {
 		mutex_unlock(&fs_info->fs_devices->device_list_mutex);
 		return -ENODEV;
 	}
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index bc55c5e6badd..ba5324523995 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -2270,7 +2270,7 @@ static int btrfs_show_devname(struct seq_file *m, struct dentry *root)
 	while (cur_devices) {
 		head = &cur_devices->devices;
 		list_for_each_entry(dev, head, dev_list) {
-			if (dev->missing)
+			if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state))
 				continue;
 			if (!dev->name)
 				continue;
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index e782f6853757..a102e77b35b0 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -825,9 +825,9 @@ static noinline int device_list_add(const char *path,
 			return -ENOMEM;
 		rcu_string_free(device->name);
 		rcu_assign_pointer(device->name, name);
-		if (device->missing) {
+		if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) {
 			fs_devices->missing_devices--;
-			device->missing = 0;
+			clear_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
 		}
 	}
 
@@ -996,7 +996,7 @@ static void btrfs_prepare_close_one_device(struct btrfs_device *device)
 		fs_devices->rw_devices--;
 	}
 
-	if (device->missing)
+	if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
 		fs_devices->missing_devices--;
 
 	new_device = btrfs_alloc_device(NULL, &device->devid,
@@ -1882,7 +1882,8 @@ static struct btrfs_device * btrfs_find_next_active_device(
 
 	list_for_each_entry(next_device, &fs_devs->devices, dev_list) {
 		if (next_device != device &&
-			!next_device->missing && next_device->bdev)
+		    !test_bit(BTRFS_DEV_STATE_MISSING, &next_device->dev_state)
+		    && next_device->bdev)
 			return next_device;
 	}
 
@@ -1996,7 +1997,7 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path,
 	device->fs_devices->num_devices--;
 	device->fs_devices->total_devices--;
 
-	if (device->missing)
+	if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
 		device->fs_devices->missing_devices--;
 
 	btrfs_assign_next_active_device(fs_info, device, NULL);
@@ -2071,7 +2072,7 @@ void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_fs_info *fs_info,
 	list_del_rcu(&srcdev->dev_list);
 	list_del(&srcdev->dev_alloc_list);
 	fs_devices->num_devices--;
-	if (srcdev->missing)
+	if (test_bit(BTRFS_DEV_STATE_MISSING, &srcdev->dev_state))
 		fs_devices->missing_devices--;
 
 	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &srcdev->dev_state))
@@ -5089,7 +5090,8 @@ int btrfs_chunk_readonly(struct btrfs_fs_info *fs_info, u64 chunk_offset)
 
 	map = em->map_lookup;
 	for (i = 0; i < map->num_stripes; i++) {
-		if (map->stripes[i].dev->missing) {
+		if (test_bit(BTRFS_DEV_STATE_MISSING,
+					&map->stripes[i].dev->dev_state)) {
 			miss_ndevs++;
 			continue;
 		}
@@ -6117,7 +6119,8 @@ static noinline void btrfs_schedule_bio(struct btrfs_device *device,
 	int should_queue = 1;
 	struct btrfs_pending_bios *pending_bios;
 
-	if (device->missing || !device->bdev) {
+	if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state) ||
+	    !device->bdev) {
 		bio_io_error(bio);
 		return;
 	}
@@ -6313,7 +6316,7 @@ static struct btrfs_device *add_missing_dev(struct btrfs_fs_devices *fs_devices,
 	device->fs_devices = fs_devices;
 	fs_devices->num_devices++;
 
-	device->missing = 1;
+	set_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
 	fs_devices->missing_devices++;
 
 	return device;
@@ -6675,7 +6678,8 @@ static int read_one_dev(struct btrfs_fs_info *fs_info,
 							dev_uuid, false);
 		}
 
-		if(!device->bdev && !device->missing) {
+		if (!device->bdev &&
+		    !test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) {
 			/*
 			 * this happens when a device that was properly setup
 			 * in the device info lists suddenly goes bad.
@@ -6683,12 +6687,13 @@ static int read_one_dev(struct btrfs_fs_info *fs_info,
 			 * device->missing to one here
 			 */
 			device->fs_devices->missing_devices++;
-			device->missing = 1;
+			set_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
 		}
 
 		/* Move the device to its own fs_devices */
 		if (device->fs_devices != fs_devices) {
-			ASSERT(device->missing);
+			ASSERT(test_bit(BTRFS_DEV_STATE_MISSING,
+							&device->dev_state));
 
 			list_move(&device->dev_list, &fs_devices->devices);
 			device->fs_devices->num_devices--;
@@ -6874,7 +6879,8 @@ bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info)
 		for (i = 0; i < map->num_stripes; i++) {
 			struct btrfs_device *dev = map->stripes[i].dev;
 
-			if (!dev || !dev->bdev || dev->missing ||
+			if (!dev || !dev->bdev ||
+			    test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) ||
 			    dev->last_flush_error)
 				missing++;
 		}
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index d290641658cc..68affb06502a 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -49,6 +49,7 @@ struct btrfs_pending_bios {
 
 #define BTRFS_DEV_STATE_WRITEABLE	(0)
 #define BTRFS_DEV_STATE_IN_FS_METADATA	(1)
+#define BTRFS_DEV_STATE_MISSING		(2)
 
 struct btrfs_device {
 	struct list_head dev_list;
@@ -73,7 +74,6 @@ struct btrfs_device {
 	fmode_t mode;
 
 	unsigned long dev_state;
-	int missing;
 	int is_tgtdev_for_dev_replace;
 	blk_status_t last_flush_error;
 	int flush_bio_sent;
-- 
cgit v1.2.3


From 401e29c124eac2b9373eb72f36f202561da339d2 Mon Sep 17 00:00:00 2001
From: Anand Jain <anand.jain@oracle.com>
Date: Mon, 4 Dec 2017 12:54:55 +0800
Subject: btrfs: cleanup device states define BTRFS_DEV_STATE_REPLACE_TGT

Currently device state is being managed by each individual int
variable such as struct btrfs_device::is_tgtdev_for_dev_replace.
Instead of that declare btrfs_device::dev_state
BTRFS_DEV_STATE_MISSING and use the bit operations.

Signed-off-by: Anand Jain <anand.jain@oracle.com>
[ whitespace adjustments ]
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/dev-replace.c |  5 +++--
 fs/btrfs/extent-tree.c |  2 +-
 fs/btrfs/ioctl.c       |  2 +-
 fs/btrfs/scrub.c       |  2 +-
 fs/btrfs/super.c       |  3 ++-
 fs/btrfs/volumes.c     | 39 ++++++++++++++++++++++-----------------
 fs/btrfs/volumes.h     |  2 +-
 7 files changed, 31 insertions(+), 24 deletions(-)

diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index d2c820c6f91e..7efbc4d1128b 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -172,7 +172,8 @@ no_valid_dev_replace_entry_found:
 				dev_replace->tgtdev->commit_bytes_used =
 					dev_replace->srcdev->commit_bytes_used;
 			}
-			dev_replace->tgtdev->is_tgtdev_for_dev_replace = 1;
+			set_bit(BTRFS_DEV_STATE_REPLACE_TGT,
+				&dev_replace->tgtdev->dev_state);
 			btrfs_init_dev_replace_tgtdev_for_resume(fs_info,
 				dev_replace->tgtdev);
 		}
@@ -566,7 +567,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
 			  btrfs_dev_name(src_device),
 			  src_device->devid,
 			  rcu_str_deref(tgt_device->name));
-	tgt_device->is_tgtdev_for_dev_replace = 0;
+	clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &tgt_device->dev_state);
 	tgt_device->devid = src_device->devid;
 	src_device->devid = BTRFS_DEV_REPLACE_DEVID;
 	memcpy(uuid_tmp, tgt_device->uuid, sizeof(uuid_tmp));
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 53cb2a1d025f..8d51e4bb67c1 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -9693,7 +9693,7 @@ int btrfs_can_relocate(struct btrfs_fs_info *fs_info, u64 bytenr)
 		 * space to fit our block group in.
 		 */
 		if (device->total_bytes > device->bytes_used + min_free &&
-		    !device->is_tgtdev_for_dev_replace) {
+		    !test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
 			ret = find_free_dev_extent(trans, device, min_free,
 						   &dev_offset, NULL);
 			if (!ret)
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 8c75e555e3f8..a1fd5f7f8298 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -1528,7 +1528,7 @@ static noinline int btrfs_ioctl_resize(struct file *file,
 		}
 	}
 
-	if (device->is_tgtdev_for_dev_replace) {
+	if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
 		ret = -EPERM;
 		goto out_free;
 	}
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index f8a073ce490b..03da807c43cf 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -4131,7 +4131,7 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
 
 	mutex_lock(&fs_info->scrub_lock);
 	if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &dev->dev_state) ||
-	    dev->is_tgtdev_for_dev_replace) {
+	    test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &dev->dev_state)) {
 		mutex_unlock(&fs_info->scrub_lock);
 		mutex_unlock(&fs_info->fs_devices->device_list_mutex);
 		return -EIO;
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index ba5324523995..84707e77c051 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -1974,7 +1974,8 @@ static int btrfs_calc_avail_data_space(struct btrfs_fs_info *fs_info,
 	list_for_each_entry_rcu(device, &fs_devices->devices, dev_list) {
 		if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
 						&device->dev_state) ||
-		    !device->bdev || device->is_tgtdev_for_dev_replace)
+		    !device->bdev ||
+		    test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state))
 			continue;
 
 		if (i >= nr_devices)
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index a102e77b35b0..3690822d469a 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -911,9 +911,10 @@ again:
 	list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) {
 		if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
 							&device->dev_state)) {
-			if (!device->is_tgtdev_for_dev_replace &&
-			    (!latest_dev ||
-			     device->generation > latest_dev->generation)) {
+			if (!test_bit(BTRFS_DEV_STATE_REPLACE_TGT,
+			     &device->dev_state) &&
+			     (!latest_dev ||
+			      device->generation > latest_dev->generation)) {
 				latest_dev = device;
 			}
 			continue;
@@ -930,7 +931,8 @@ again:
 			 * not, which means whether this device is
 			 * used or whether it should be removed.
 			 */
-			if (step == 0 || device->is_tgtdev_for_dev_replace) {
+			if (step == 0 || test_bit(BTRFS_DEV_STATE_REPLACE_TGT,
+						  &device->dev_state)) {
 				continue;
 			}
 		}
@@ -942,7 +944,8 @@ again:
 		if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
 			list_del_init(&device->dev_alloc_list);
 			clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
-			if (!device->is_tgtdev_for_dev_replace)
+			if (!test_bit(BTRFS_DEV_STATE_REPLACE_TGT,
+				      &device->dev_state))
 				fs_devices->rw_devices--;
 		}
 		list_del_init(&device->dev_list);
@@ -1250,7 +1253,8 @@ int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start,
 
 	*length = 0;
 
-	if (start >= device->total_bytes || device->is_tgtdev_for_dev_replace)
+	if (start >= device->total_bytes ||
+		test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state))
 		return 0;
 
 	path = btrfs_alloc_path();
@@ -1428,7 +1432,8 @@ int find_free_dev_extent_start(struct btrfs_transaction *transaction,
 	max_hole_size = 0;
 
 again:
-	if (search_start >= search_end || device->is_tgtdev_for_dev_replace) {
+	if (search_start >= search_end ||
+		test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
 		ret = -ENOSPC;
 		goto out;
 	}
@@ -1636,7 +1641,7 @@ static int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
 	struct btrfs_key key;
 
 	WARN_ON(!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state));
-	WARN_ON(device->is_tgtdev_for_dev_replace);
+	WARN_ON(test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state));
 	path = btrfs_alloc_path();
 	if (!path)
 		return -ENOMEM;
@@ -1944,7 +1949,7 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path,
 	if (ret)
 		goto out;
 
-	if (device->is_tgtdev_for_dev_replace) {
+	if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
 		ret = BTRFS_ERROR_DEV_TGT_REPLACE;
 		goto out;
 	}
@@ -2451,7 +2456,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
 	device->fs_info = fs_info;
 	device->bdev = bdev;
 	set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
-	device->is_tgtdev_for_dev_replace = 0;
+	clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);
 	device->mode = FMODE_EXCL;
 	device->dev_stats_valid = 1;
 	set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE);
@@ -2660,7 +2665,7 @@ int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
 	device->fs_info = fs_info;
 	device->bdev = bdev;
 	set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
-	device->is_tgtdev_for_dev_replace = 1;
+	set_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);
 	device->mode = FMODE_EXCL;
 	device->dev_stats_valid = 1;
 	set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE);
@@ -2756,7 +2761,7 @@ int btrfs_grow_device(struct btrfs_trans_handle *trans,
 	diff = round_down(new_size - device->total_bytes, fs_info->sectorsize);
 
 	if (new_size <= device->total_bytes ||
-	    device->is_tgtdev_for_dev_replace) {
+	    test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
 		mutex_unlock(&fs_info->chunk_mutex);
 		return -EINVAL;
 	}
@@ -3569,7 +3574,7 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info)
 		if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) ||
 		    btrfs_device_get_total_bytes(device) -
 		    btrfs_device_get_bytes_used(device) > size_to_free ||
-		    device->is_tgtdev_for_dev_replace)
+		    test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state))
 			continue;
 
 		ret = btrfs_shrink_device(device, old_size - size_to_free);
@@ -4437,7 +4442,7 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
 	new_size = round_down(new_size, fs_info->sectorsize);
 	diff = round_down(old_size - new_size, fs_info->sectorsize);
 
-	if (device->is_tgtdev_for_dev_replace)
+	if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state))
 		return -EINVAL;
 
 	path = btrfs_alloc_path();
@@ -4742,7 +4747,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 
 		if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
 					&device->dev_state) ||
-		     device->is_tgtdev_for_dev_replace)
+		    test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state))
 			continue;
 
 		if (device->total_bytes > device->bytes_used)
@@ -6566,7 +6571,7 @@ static void fill_device_from_item(struct extent_buffer *leaf,
 	device->io_width = btrfs_device_io_width(leaf, dev_item);
 	device->sector_size = btrfs_device_sector_size(leaf, dev_item);
 	WARN_ON(device->devid == BTRFS_DEV_REPLACE_DEVID);
-	device->is_tgtdev_for_dev_replace = 0;
+	clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);
 
 	ptr = btrfs_device_uuid(dev_item);
 	read_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
@@ -6716,7 +6721,7 @@ static int read_one_dev(struct btrfs_fs_info *fs_info,
 	fill_device_from_item(leaf, dev_item, device);
 	set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
 	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
-	    !device->is_tgtdev_for_dev_replace) {
+	   !test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
 		device->fs_devices->total_rw_bytes += device->total_bytes;
 		atomic64_add(device->total_bytes - device->bytes_used,
 				&fs_info->free_chunk_space);
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 68affb06502a..2ac123154c8d 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -50,6 +50,7 @@ struct btrfs_pending_bios {
 #define BTRFS_DEV_STATE_WRITEABLE	(0)
 #define BTRFS_DEV_STATE_IN_FS_METADATA	(1)
 #define BTRFS_DEV_STATE_MISSING		(2)
+#define BTRFS_DEV_STATE_REPLACE_TGT	(3)
 
 struct btrfs_device {
 	struct list_head dev_list;
@@ -74,7 +75,6 @@ struct btrfs_device {
 	fmode_t mode;
 
 	unsigned long dev_state;
-	int is_tgtdev_for_dev_replace;
 	blk_status_t last_flush_error;
 	int flush_bio_sent;
 
-- 
cgit v1.2.3


From 1c3063b6dbfa03e469a53371fae149a022a41bfd Mon Sep 17 00:00:00 2001
From: Anand Jain <anand.jain@oracle.com>
Date: Mon, 4 Dec 2017 12:54:56 +0800
Subject: btrfs: cleanup device states define BTRFS_DEV_STATE_FLUSH_SENT

Currently device state is being managed by each individual int
variable such as struct btrfs_device::is_tgtdev_for_dev_replace.
Instead of that declare btrfs_device::dev_state
BTRFS_DEV_STATE_FLUSH_SENT and use the bit operations.

Signed-off-by: Anand Jain <anand.jain@oracle.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/disk-io.c | 6 +++---
 fs/btrfs/volumes.h | 1 +
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 4f7d2a38865c..392d6cde4308 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -3348,7 +3348,7 @@ static void write_dev_flush(struct btrfs_device *device)
 	bio->bi_private = &device->flush_wait;
 
 	btrfsic_submit_bio(bio);
-	device->flush_bio_sent = 1;
+	set_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state);
 }
 
 /*
@@ -3358,10 +3358,10 @@ static blk_status_t wait_dev_flush(struct btrfs_device *device)
 {
 	struct bio *bio = device->flush_bio;
 
-	if (!device->flush_bio_sent)
+	if (!test_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state))
 		return BLK_STS_OK;
 
-	device->flush_bio_sent = 0;
+	clear_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state);
 	wait_for_completion_io(&device->flush_wait);
 
 	return bio->bi_status;
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 2ac123154c8d..3e3ae44b6ccc 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -51,6 +51,7 @@ struct btrfs_pending_bios {
 #define BTRFS_DEV_STATE_IN_FS_METADATA	(1)
 #define BTRFS_DEV_STATE_MISSING		(2)
 #define BTRFS_DEV_STATE_REPLACE_TGT	(3)
+#define BTRFS_DEV_STATE_FLUSH_SENT	(4)
 
 struct btrfs_device {
 	struct list_head dev_list;
-- 
cgit v1.2.3


From 440c840cb49f7de91e68a4cc7bca79a75cd298ae Mon Sep 17 00:00:00 2001
From: Timofey Titovets <nefelim4ag@gmail.com>
Date: Mon, 4 Dec 2017 00:30:33 +0300
Subject: Btrfs: compression heuristic: replace heap sort with radix sort

Slowest part of heuristic for now is kernel heap sort()
It's can take up to 55% of runtime on sorting bucket items.

As sorting will always call on most data sets to get correctly
byte_core_set_size, the only way to speed up heuristic, is to
speed up sort on bucket.

Add a general radix_sort function.
Radix sort require 2 buffers, one full size of input array
and one for store counters (jump addresses).

That increase usage per heuristic workspace +1KiB
8KiB + 1KiB -> 8KiB + 2KiB

That is LSD Radix, i use 4 bit as a base for calculating,
to make counters array acceptable small (16 elements * 8 byte).

That Radix sort implementation have several points to adjust,
I added him to make radix sort general usable in kernel,
like heap sort, if needed.

Performance tested in userspace copy of heuristic code,
throughput:
    - average <-> random data: ~3500 MiB/s - heap  sort
    - average <-> random data: ~6000 MiB/s - radix sort

Signed-off-by: Timofey Titovets <nefelim4ag@gmail.com>
[ coding style fixes ]
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/compression.c | 130 ++++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 123 insertions(+), 7 deletions(-)

diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 5982c8a71f02..8cd48d7c3f76 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -33,7 +33,6 @@
 #include <linux/bit_spinlock.h>
 #include <linux/slab.h>
 #include <linux/sched/mm.h>
-#include <linux/sort.h>
 #include <linux/log2.h>
 #include "ctree.h"
 #include "disk-io.h"
@@ -752,6 +751,8 @@ struct heuristic_ws {
 	u32 sample_size;
 	/* Buckets store counters for each byte value */
 	struct bucket_item *bucket;
+	/* Sorting buffer */
+	struct bucket_item *bucket_b;
 	struct list_head list;
 };
 
@@ -763,6 +764,7 @@ static void free_heuristic_ws(struct list_head *ws)
 
 	kvfree(workspace->sample);
 	kfree(workspace->bucket);
+	kfree(workspace->bucket_b);
 	kfree(workspace);
 }
 
@@ -782,6 +784,10 @@ static struct list_head *alloc_heuristic_ws(void)
 	if (!ws->bucket)
 		goto fail;
 
+	ws->bucket_b = kcalloc(BUCKET_SIZE, sizeof(*ws->bucket_b), GFP_KERNEL);
+	if (!ws->bucket_b)
+		goto fail;
+
 	INIT_LIST_HEAD(&ws->list);
 	return &ws->list;
 fail:
@@ -1278,13 +1284,122 @@ static u32 shannon_entropy(struct heuristic_ws *ws)
 	return entropy_sum * 100 / entropy_max;
 }
 
-/* Compare buckets by size, ascending */
-static int bucket_comp_rev(const void *lv, const void *rv)
+#define RADIX_BASE		4U
+#define COUNTERS_SIZE		(1U << RADIX_BASE)
+
+static u8 get4bits(u64 num, int shift) {
+	u8 low4bits;
+
+	num >>= shift;
+	/* Reverse order */
+	low4bits = (COUNTERS_SIZE - 1) - (num % COUNTERS_SIZE);
+	return low4bits;
+}
+
+static void copy_cell(void *dst, int dest_i, void *src, int src_i)
 {
-	const struct bucket_item *l = (const struct bucket_item *)lv;
-	const struct bucket_item *r = (const struct bucket_item *)rv;
+	struct bucket_item *dstv = (struct bucket_item *)dst;
+	struct bucket_item *srcv = (struct bucket_item *)src;
+	dstv[dest_i] = srcv[src_i];
+}
+
+static u64 get_num(const void *a, int i)
+{
+	struct bucket_item *av = (struct bucket_item *)a;
+	return av[i].count;
+}
 
-	return r->count - l->count;
+/*
+ * Use 4 bits as radix base
+ * Use 16 u32 counters for calculating new possition in buf array
+ *
+ * @array     - array that will be sorted
+ * @array_buf - buffer array to store sorting results
+ *              must be equal in size to @array
+ * @num       - array size
+ * @get_num   - function to extract number from array
+ * @copy_cell - function to copy data from array to array_buf and vice versa
+ * @get4bits  - function to get 4 bits from number at specified offset
+ */
+static void radix_sort(void *array, void *array_buf, int num,
+		       u64 (*get_num)(const void *, int i),
+		       void (*copy_cell)(void *dest, int dest_i,
+					 void* src, int src_i),
+		       u8 (*get4bits)(u64 num, int shift))
+{
+	u64 max_num;
+	u64 buf_num;
+	u32 counters[COUNTERS_SIZE];
+	u32 new_addr;
+	u32 addr;
+	int bitlen;
+	int shift;
+	int i;
+
+	/*
+	 * Try avoid useless loop iterations for small numbers stored in big
+	 * counters.  Example: 48 33 4 ... in 64bit array
+	 */
+	max_num = get_num(array, 0);
+	for (i = 1; i < num; i++) {
+		buf_num = get_num(array, i);
+		if (buf_num > max_num)
+			max_num = buf_num;
+	}
+
+	buf_num = ilog2(max_num);
+	bitlen = ALIGN(buf_num, RADIX_BASE * 2);
+
+	shift = 0;
+	while (shift < bitlen) {
+		memset(counters, 0, sizeof(counters));
+
+		for (i = 0; i < num; i++) {
+			buf_num = get_num(array, i);
+			addr = get4bits(buf_num, shift);
+			counters[addr]++;
+		}
+
+		for (i = 1; i < COUNTERS_SIZE; i++)
+			counters[i] += counters[i - 1];
+
+		for (i = num - 1; i >= 0; i--) {
+			buf_num = get_num(array, i);
+			addr = get4bits(buf_num, shift);
+			counters[addr]--;
+			new_addr = counters[addr];
+			copy_cell(array_buf, new_addr, array, i);
+		}
+
+		shift += RADIX_BASE;
+
+		/*
+		 * Normal radix expects to move data from a temporary array, to
+		 * the main one.  But that requires some CPU time. Avoid that
+		 * by doing another sort iteration to original array instead of
+		 * memcpy()
+		 */
+		memset(counters, 0, sizeof(counters));
+
+		for (i = 0; i < num; i ++) {
+			buf_num = get_num(array_buf, i);
+			addr = get4bits(buf_num, shift);
+			counters[addr]++;
+		}
+
+		for (i = 1; i < COUNTERS_SIZE; i++)
+			counters[i] += counters[i - 1];
+
+		for (i = num - 1; i >= 0; i--) {
+			buf_num = get_num(array_buf, i);
+			addr = get4bits(buf_num, shift);
+			counters[addr]--;
+			new_addr = counters[addr];
+			copy_cell(array, new_addr, array_buf, i);
+		}
+
+		shift += RADIX_BASE;
+	}
 }
 
 /*
@@ -1314,7 +1429,8 @@ static int byte_core_set_size(struct heuristic_ws *ws)
 	struct bucket_item *bucket = ws->bucket;
 
 	/* Sort in reverse order */
-	sort(bucket, BUCKET_SIZE, sizeof(*bucket), &bucket_comp_rev, NULL);
+	radix_sort(ws->bucket, ws->bucket_b, BUCKET_SIZE, get_num, copy_cell,
+			get4bits);
 
 	for (i = 0; i < BYTE_CORE_SET_LOW; i++)
 		coreset_sum += bucket[i].count;
-- 
cgit v1.2.3


From e9679de3fdcb11a13cfe3873c9923b761c8067ab Mon Sep 17 00:00:00 2001
From: Timofey Titovets <nefelim4ag@gmail.com>
Date: Tue, 24 Oct 2017 01:29:48 +0300
Subject: Btrfs: compress_file_range() change page dirty status once

We need to call extent_range_clear_dirty_for_io()
on compression range to prevent application from changing
page content, while pages compressing.

extent_range_clear_dirty_for_io() runs on each loop iteration,
"(end - start)" can be much (up to 1024 times) bigger
then compression range (BTRFS_MAX_UNCOMPRESSED).

The start pointer is advanced each time we manage to compress part of
the range. The end pointer does not change so we could redirty the
remaining parts repeatedly.

Fix that behaviour by call extent_range_clear_dirty_for_io()
only once, the first time it happens.

This is the safest but probably not the best behaviour. Previous
iterations of the patch tried to redirty only the range that we were not
able to compress. This has been refused by David for safety reasons, the
writeout callchain is complex and there could be some path that relies
on redirtying the entire unwritten range.

Signed-off-by: Timofey Titovets <nefelim4ag@gmail.com>
Reviewed-by: David Sterba <dsterba@suse.com>
[ enhance changelog, the history and safety concerns, add comment ]
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/inode.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 57785eadb95c..e87ec11c0986 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -536,9 +536,14 @@ again:
 		 *
 		 * If the compression fails for any reason, we set the pages
 		 * dirty again later on.
+		 *
+		 * Note that the remaining part is redirtied, the start pointer
+		 * has moved, the end is the original one.
 		 */
-		extent_range_clear_dirty_for_io(inode, start, end);
-		redirty = 1;
+		if (!redirty) {
+			extent_range_clear_dirty_for_io(inode, start, end);
+			redirty = 1;
+		}
 
 		/* Compression level is applied here and only here */
 		ret = btrfs_compress_pages(
-- 
cgit v1.2.3


From c9f540fa6f4cab5a506b157e7bf2e5373c0241c2 Mon Sep 17 00:00:00 2001
From: Liu Bo <bo.li.liu@oracle.com>
Date: Mon, 4 Dec 2017 18:09:42 -0700
Subject: Btrfs: remove unused variable wait in lock_stripe_add

The defined wait is not used anywhere.

Signed-off-by: Liu Bo <bo.li.liu@oracle.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/raid56.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index a7f79254ecca..3940906533a2 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -670,7 +670,6 @@ static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio)
 	struct btrfs_raid_bio *cur;
 	struct btrfs_raid_bio *pending;
 	unsigned long flags;
-	DEFINE_WAIT(wait);
 	struct btrfs_raid_bio *freeit = NULL;
 	struct btrfs_raid_bio *cache_drop = NULL;
 	int ret = 0;
-- 
cgit v1.2.3


From b4ff5ad72e9331aabfa92456403946d17c4959d5 Mon Sep 17 00:00:00 2001
From: Liu Bo <bo.li.liu@oracle.com>
Date: Thu, 30 Nov 2017 17:26:39 -0700
Subject: Btrfs: use struct completion in scrub_submit_raid56_bio_wait

This changes to use struct completion directly and removes 'struct
scrub_bio_ret' along with the code using it.

This struct is used to get the return value from bio, but the caller can
access bio to get the return value directly and is holding a reference
on it so it won't go away underneath us and can be removed safely.

Signed-off-by: Liu Bo <bo.li.liu@oracle.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/scrub.c | 21 ++++-----------------
 1 file changed, 4 insertions(+), 17 deletions(-)

diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 03da807c43cf..d766c73eb29a 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -1666,17 +1666,9 @@ leave_nomem:
 	return 0;
 }
 
-struct scrub_bio_ret {
-	struct completion event;
-	blk_status_t status;
-};
-
 static void scrub_bio_wait_endio(struct bio *bio)
 {
-	struct scrub_bio_ret *ret = bio->bi_private;
-
-	ret->status = bio->bi_status;
-	complete(&ret->event);
+	complete(bio->bi_private);
 }
 
 static inline int scrub_is_page_on_raid56(struct scrub_page *page)
@@ -1689,11 +1681,9 @@ static int scrub_submit_raid56_bio_wait(struct btrfs_fs_info *fs_info,
 					struct bio *bio,
 					struct scrub_page *page)
 {
-	struct scrub_bio_ret done;
+	DECLARE_COMPLETION_ONSTACK(done);
 	int ret;
 
-	init_completion(&done.event);
-	done.status = 0;
 	bio->bi_iter.bi_sector = page->logical >> 9;
 	bio->bi_private = &done;
 	bio->bi_end_io = scrub_bio_wait_endio;
@@ -1704,11 +1694,8 @@ static int scrub_submit_raid56_bio_wait(struct btrfs_fs_info *fs_info,
 	if (ret)
 		return ret;
 
-	wait_for_completion_io(&done.event);
-	if (done.status)
-		return -EIO;
-
-	return 0;
+	wait_for_completion_io(&done);
+	return blk_status_to_errno(bio->bi_status);
 }
 
 /*
-- 
cgit v1.2.3


From 7cfad65297bfe0aa2996cd72d21c898aa84436d9 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Wed, 6 Dec 2017 15:18:14 +0100
Subject: btrfs: tree-checker: use %zu format string for size_t

The return value of sizeof() is of type size_t, so we must print it
using the %z format modifier rather than %l to avoid this warning
on some architectures:

fs/btrfs/tree-checker.c: In function 'check_dir_item':
fs/btrfs/tree-checker.c:273:50: error: format '%lu' expects argument of type 'long unsigned int', but argument 5 has type 'u32' {aka 'unsigned int'} [-Werror=format=]

Fixes: 005887f2e3e0 ("btrfs: tree-checker: Add checker for dir item")
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/tree-checker.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c
index 66dac0a4b01f..7c55e3ba5a6c 100644
--- a/fs/btrfs/tree-checker.c
+++ b/fs/btrfs/tree-checker.c
@@ -270,7 +270,7 @@ static int check_dir_item(struct btrfs_root *root,
 		/* header itself should not cross item boundary */
 		if (cur + sizeof(*di) > item_size) {
 			dir_item_err(root, leaf, slot,
-		"dir item header crosses item boundary, have %lu boundary %u",
+		"dir item header crosses item boundary, have %zu boundary %u",
 				cur + sizeof(*di), item_size);
 			return -EUCLEAN;
 		}
-- 
cgit v1.2.3


From 5c9a702ed1724f1c856c7299fe1faf9b3b5b4eb0 Mon Sep 17 00:00:00 2001
From: Nikolay Borisov <nborisov@suse.com>
Date: Fri, 1 Dec 2017 11:19:40 +0200
Subject: btrfs: Remove unused variable in btrfs_get_extent

trans was statically assigned to NULL and this never changed over the
course of btrfs_get_extent. So remove any code which checks whether
trans != NULL and just hardcode the fact trans is always NULL.

Resolves-coverity-id: 112806
Signed-off-by: Nikolay Borisov <nborisov@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/inode.c | 9 +--------
 1 file changed, 1 insertion(+), 8 deletions(-)

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index e87ec11c0986..8b6c59a068d2 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -6948,7 +6948,6 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
 	struct extent_map *em = NULL;
 	struct extent_map_tree *em_tree = &inode->extent_tree;
 	struct extent_io_tree *io_tree = &inode->io_tree;
-	struct btrfs_trans_handle *trans = NULL;
 	const bool new_inline = !page || create;
 
 	read_lock(&em_tree->lock);
@@ -6989,8 +6988,7 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
 		path->reada = READA_FORWARD;
 	}
 
-	ret = btrfs_lookup_file_extent(trans, root, path,
-				       objectid, start, trans != NULL);
+	ret = btrfs_lookup_file_extent(NULL, root, path, objectid, start, 0);
 	if (ret < 0) {
 		err = ret;
 		goto out;
@@ -7186,11 +7184,6 @@ out:
 	trace_btrfs_get_extent(root, inode, em);
 
 	btrfs_free_path(path);
-	if (trans) {
-		ret = btrfs_end_transaction(trans);
-		if (!err)
-			err = ret;
-	}
 	if (err) {
 		free_extent_map(em);
 		return ERR_PTR(err);
-- 
cgit v1.2.3


From bf8d32b9b3c5fb183aa4ac5cb7bae9580dcb0dfb Mon Sep 17 00:00:00 2001
From: Nikolay Borisov <nborisov@suse.com>
Date: Fri, 1 Dec 2017 11:19:43 +0200
Subject: btrfs: remove redundant check in btrfs_get_extent_fiemap

Before returning hole_em in btrfs_get_fiemap_extent we check if it's different
than null. However, by the time this null check is triggered we already know
hole_em is not null because it means it points to the em we found and it
has already been dereferenced.

Signed-off-by: Nikolay Borisov <nborisov@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/inode.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 8b6c59a068d2..034d7333b14d 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -7305,7 +7305,7 @@ struct extent_map *btrfs_get_extent_fiemap(struct btrfs_inode *inode,
 			em->block_start = EXTENT_MAP_DELALLOC;
 			em->block_len = found;
 		}
-	} else if (hole_em) {
+	} else {
 		return hole_em;
 	}
 out:
-- 
cgit v1.2.3


From e128f9c3f7242318e1c76d204c7ae32bc878b8c7 Mon Sep 17 00:00:00 2001
From: David Sterba <dsterba@suse.com>
Date: Tue, 31 Oct 2017 17:24:26 +0100
Subject: btrfs: compression: add helper for type to string conversion

There are several places opencoding this conversion, add a helper now
that we have 3 compression algorithms.

Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/compression.c | 15 +++++++++++++++
 fs/btrfs/compression.h |  2 ++
 2 files changed, 17 insertions(+)

diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 8cd48d7c3f76..28c3940062b7 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -44,6 +44,21 @@
 #include "extent_io.h"
 #include "extent_map.h"
 
+static const char* const btrfs_compress_types[] = { "", "zlib", "lzo", "zstd" };
+
+const char* btrfs_compress_type2str(enum btrfs_compression_type type)
+{
+	switch (type) {
+	case BTRFS_COMPRESS_ZLIB:
+	case BTRFS_COMPRESS_LZO:
+	case BTRFS_COMPRESS_ZSTD:
+	case BTRFS_COMPRESS_NONE:
+		return btrfs_compress_types[type];
+	}
+
+	return NULL;
+}
+
 static int btrfs_decompress_bio(struct compressed_bio *cb);
 
 static inline int compressed_bio_size(struct btrfs_fs_info *fs_info,
diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h
index 6b692903a23c..677fa4aa0bd7 100644
--- a/fs/btrfs/compression.h
+++ b/fs/btrfs/compression.h
@@ -137,6 +137,8 @@ extern const struct btrfs_compress_op btrfs_zlib_compress;
 extern const struct btrfs_compress_op btrfs_lzo_compress;
 extern const struct btrfs_compress_op btrfs_zstd_compress;
 
+const char* btrfs_compress_type2str(enum btrfs_compression_type type);
+
 int btrfs_compress_heuristic(struct inode *inode, u64 start, u64 end);
 
 #endif
-- 
cgit v1.2.3


From 93370509c24cc41f994d467d145811ba9c9a48f7 Mon Sep 17 00:00:00 2001
From: David Sterba <dsterba@suse.com>
Date: Tue, 31 Oct 2017 17:32:41 +0100
Subject: btrfs: SETFLAGS ioctl: use helper for compression type conversion

Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/ioctl.c | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index a1fd5f7f8298..be5bd81b3669 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -307,12 +307,10 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
 		ip->flags |= BTRFS_INODE_COMPRESS;
 		ip->flags &= ~BTRFS_INODE_NOCOMPRESS;
 
-		if (fs_info->compress_type == BTRFS_COMPRESS_LZO)
-			comp = "lzo";
-		else if (fs_info->compress_type == BTRFS_COMPRESS_ZLIB)
-			comp = "zlib";
-		else
-			comp = "zstd";
+		comp = btrfs_compress_type2str(fs_info->compress_type);
+		if (!comp || comp[0] == 0)
+			comp = btrfs_compress_type2str(BTRFS_COMPRESS_ZLIB);
+
 		ret = btrfs_set_prop(inode, "btrfs.compression",
 				     comp, strlen(comp), 0);
 		if (ret)
-- 
cgit v1.2.3


From 802a5c69584a0e48ab9797e743fb087c7b56a264 Mon Sep 17 00:00:00 2001
From: David Sterba <dsterba@suse.com>
Date: Tue, 31 Oct 2017 17:55:14 +0100
Subject: btrfs: prop: use common helper for type to string conversion

Use the helper for conversion, keep the semantics.

Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/props.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/fs/btrfs/props.c b/fs/btrfs/props.c
index c39a940d0c75..b30a056963ab 100644
--- a/fs/btrfs/props.c
+++ b/fs/btrfs/props.c
@@ -423,11 +423,11 @@ static const char *prop_compression_extract(struct inode *inode)
 {
 	switch (BTRFS_I(inode)->prop_compress) {
 	case BTRFS_COMPRESS_ZLIB:
-		return "zlib";
 	case BTRFS_COMPRESS_LZO:
-		return "lzo";
 	case BTRFS_COMPRESS_ZSTD:
-		return "zstd";
+		return btrfs_compress_type2str(BTRFS_I(inode)->prop_compress);
+	default:
+		break;
 	}
 
 	return NULL;
-- 
cgit v1.2.3


From 0f628c632d4009527aef597dac3a4a09e628b051 Mon Sep 17 00:00:00 2001
From: David Sterba <dsterba@suse.com>
Date: Tue, 31 Oct 2017 18:06:34 +0100
Subject: btrfs: show options: use helper to convert compression type string

Use the helper, if the COMPRESS option is set, the result is always
defined and not empty.

Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/super.c | 9 ++-------
 1 file changed, 2 insertions(+), 7 deletions(-)

diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 84707e77c051..baa8add64681 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -1243,7 +1243,7 @@ int btrfs_sync_fs(struct super_block *sb, int wait)
 static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry)
 {
 	struct btrfs_fs_info *info = btrfs_sb(dentry->d_sb);
-	char *compress_type;
+	const char *compress_type;
 
 	if (btrfs_test_opt(info, DEGRADED))
 		seq_puts(seq, ",degraded");
@@ -1259,12 +1259,7 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry)
 					     num_online_cpus() + 2, 8))
 		seq_printf(seq, ",thread_pool=%d", info->thread_pool_size);
 	if (btrfs_test_opt(info, COMPRESS)) {
-		if (info->compress_type == BTRFS_COMPRESS_ZLIB)
-			compress_type = "zlib";
-		else if (info->compress_type == BTRFS_COMPRESS_LZO)
-			compress_type = "lzo";
-		else
-			compress_type = "zstd";
+		compress_type = btrfs_compress_type2str(info->compress_type);
 		if (btrfs_test_opt(info, FORCE_COMPRESS))
 			seq_printf(seq, ",compress-force=%s", compress_type);
 		else
-- 
cgit v1.2.3


From ad8bc4d005576e3f380ba2dab24c183519f4e9fa Mon Sep 17 00:00:00 2001
From: Anand Jain <anand.jain@oracle.com>
Date: Wed, 6 Dec 2017 11:40:10 +0800
Subject: btrfs: put btrfs_ioctl_vol_args_v2 related defines together

Just a code spatial rearrangement, no functional change.

Signed-off-by: Anand Jain <anand.jain@oracle.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 include/uapi/linux/btrfs.h | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/include/uapi/linux/btrfs.h b/include/uapi/linux/btrfs.h
index ce615b75e855..c8d99b9ca550 100644
--- a/include/uapi/linux/btrfs.h
+++ b/include/uapi/linux/btrfs.h
@@ -33,7 +33,12 @@ struct btrfs_ioctl_vol_args {
 	char name[BTRFS_PATH_NAME_MAX + 1];
 };
 
-#define BTRFS_DEVICE_PATH_NAME_MAX 1024
+#define BTRFS_DEVICE_PATH_NAME_MAX	1024
+#define BTRFS_SUBVOL_NAME_MAX 		4039
+
+#define BTRFS_SUBVOL_CREATE_ASYNC	(1ULL << 0)
+#define BTRFS_SUBVOL_RDONLY		(1ULL << 1)
+#define BTRFS_SUBVOL_QGROUP_INHERIT	(1ULL << 2)
 
 #define BTRFS_DEVICE_SPEC_BY_ID		(1ULL << 3)
 
@@ -101,11 +106,7 @@ struct btrfs_ioctl_qgroup_limit_args {
  * - BTRFS_IOC_SUBVOL_GETFLAGS
  * - BTRFS_IOC_SUBVOL_SETFLAGS
  */
-#define BTRFS_SUBVOL_CREATE_ASYNC	(1ULL << 0)
-#define BTRFS_SUBVOL_RDONLY		(1ULL << 1)
-#define BTRFS_SUBVOL_QGROUP_INHERIT	(1ULL << 2)
 
-#define BTRFS_SUBVOL_NAME_MAX 4039
 struct btrfs_ioctl_vol_args_v2 {
 	__s64 fd;
 	__u64 transid;
-- 
cgit v1.2.3


From f3038ee3a3f1017a1cbe9907e31fa12d366c5dcb Mon Sep 17 00:00:00 2001
From: Nikolay Borisov <nborisov@suse.com>
Date: Tue, 5 Dec 2017 09:29:19 +0200
Subject: btrfs: Handle btrfs_set_extent_delalloc failure in fixup worker

This function was introduced by 247e743cbe6e ("Btrfs: Use async helpers
to deal with pages that have been improperly dirtied") and it didn't do
any error handling then. This function might very well fail in ENOMEM
situation, yet it's not handled, this could lead to inconsistent state.
So let's handle the failure by setting the mapping error bit.

Cc: stable@vger.kernel.org
Signed-off-by: Nikolay Borisov <nborisov@suse.com>
Reviewed-by: Qu Wenruo <wqu@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/inode.c | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 034d7333b14d..9ad8c9321c8f 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -2116,8 +2116,15 @@ again:
 		goto out;
 	 }
 
-	btrfs_set_extent_delalloc(inode, page_start, page_end, 0, &cached_state,
-				  0);
+	ret = btrfs_set_extent_delalloc(inode, page_start, page_end, 0,
+					&cached_state, 0);
+	if (ret) {
+		mapping_set_error(page->mapping, ret);
+		end_extent_writepage(page, ret, page_start, page_end);
+		ClearPageChecked(page);
+		goto out;
+	}
+
 	ClearPageChecked(page);
 	set_page_dirty(page);
 	btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE);
-- 
cgit v1.2.3


From 87c46ec70068cd28442954ec3fd0784eb90ca19b Mon Sep 17 00:00:00 2001
From: Pravin Shedge <pravin.shedge4linux@gmail.com>
Date: Wed, 6 Dec 2017 22:14:31 +0530
Subject: btrfs: remove duplicate includes

These duplicate includes have been found with scripts/checkincludes.pl but
they have been removed manually to avoid removing false positives.

Signed-off-by: Pravin Shedge <pravin.shedge4linux@gmail.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/super.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index baa8add64681..428841a17acb 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -61,7 +61,6 @@
 #include "tests/btrfs-tests.h"
 
 #include "qgroup.h"
-#include "backref.h"
 #define CREATE_TRACE_POINTS
 #include <trace/events/btrfs.h>
 
-- 
cgit v1.2.3


From 9ea2c7c9da13c9073e371c046cbbc45481ecb459 Mon Sep 17 00:00:00 2001
From: Nikolay Borisov <nborisov@suse.com>
Date: Tue, 12 Dec 2017 11:14:49 +0200
Subject: btrfs: Fix out of bounds access in btrfs_search_slot

When modifying a tree where the root is at BTRFS_MAX_LEVEL - 1 then
the level variable is going to be 7 (this is the max height of the
tree). On the other hand btrfs_cow_block is always called with
"level + 1" as an index into the nodes and slots arrays. This leads to
an out of bounds access. Admittdely this will be benign since an OOB
access of the nodes array will likely read the 0th element from the
slots array, which in this case is going to be 0 (since we start CoW at
the top of the tree). The OOB access into the slots array in turn will
read the 0th and 1st values of the locks array, which would both be 0
at the time. However, this benign behavior relies on the fact that the
path being passed hasn't been initialised, if it has already been used to
query a btree then it could potentially have populated the nodes/slots arrays.

Fix it by explicitly checking if we are at level 7 (the maximum allowed
index in nodes/slots arrays) and explicitly call the CoW routine with
NULL for parent's node/slot.

Signed-off-by: Nikolay Borisov <nborisov@suse.com>
Fixes-coverity-id: 711515
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/ctree.c | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 1e74cf826532..5361f69433a3 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -2774,6 +2774,8 @@ again:
 		 * contention with the cow code
 		 */
 		if (cow) {
+			bool last_level = (level == (BTRFS_MAX_LEVEL - 1));
+
 			/*
 			 * if we don't really need to cow this block
 			 * then we don't want to set the path blocking,
@@ -2798,9 +2800,13 @@ again:
 			}
 
 			btrfs_set_path_blocking(p);
-			err = btrfs_cow_block(trans, root, b,
-					      p->nodes[level + 1],
-					      p->slots[level + 1], &b);
+			if (last_level)
+				err = btrfs_cow_block(trans, root, b, NULL, 0,
+						      &b);
+			else
+				err = btrfs_cow_block(trans, root, b,
+						      p->nodes[level + 1],
+						      p->slots[level + 1], &b);
 			if (err) {
 				ret = err;
 				goto done;
-- 
cgit v1.2.3


From 3e798068a8ef400049cc3bb4f01f9701a29e1f86 Mon Sep 17 00:00:00 2001
From: Nikolay Borisov <nborisov@suse.com>
Date: Mon, 11 Dec 2017 16:38:48 +0200
Subject: btrfs: Remove pair of bio_get/put in btrfs_schedule_bio

This code was added in 492bb6deee34 ("Btrfs: Hold a reference on bios
during submit_bio, add some extra bio checks"). However, holding a
reference on a bio is necessary only if it's going to be referenced
after the submit_bio returns and the bio is completed. In this
particular instance this is not the case so there is no need to hold
an extra reference since we directly return.

Signed-off-by: Nikolay Borisov <nborisov@suse.com>
Reviewed-by: Liu Bo <bo.li.liu@oracle.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/volumes.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 3690822d469a..9a04245003ab 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -6132,9 +6132,7 @@ static noinline void btrfs_schedule_bio(struct btrfs_device *device,
 
 	/* don't bother with additional async steps for reads, right now */
 	if (bio_op(bio) == REQ_OP_READ) {
-		bio_get(bio);
 		btrfsic_submit_bio(bio);
-		bio_put(bio);
 		return;
 	}
 
-- 
cgit v1.2.3


From 5e3ee23648a20dfaf72eeb88f884aae25ea7d8fb Mon Sep 17 00:00:00 2001
From: Nikolay Borisov <nborisov@suse.com>
Date: Fri, 8 Dec 2017 15:55:58 +0200
Subject: btrfs: sink extent_write_locked_range tree parameter

This function is called only from submit_compressed_extents and the
io tree being passed is always that of the inode. But we are also
passing the inode, so just move getting the io tree pointer in
extent_write_locked_range to simplify the signature.

Signed-off-by: Nikolay Borisov <nborisov@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/extent_io.c | 5 +++--
 fs/btrfs/extent_io.h | 4 ++--
 fs/btrfs/inode.c     | 4 ++--
 3 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 16ae832bdb5d..c0b2bf65d6b0 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -4073,11 +4073,12 @@ int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
 	return ret;
 }
 
-int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode,
-			      u64 start, u64 end, int mode)
+int extent_write_locked_range(struct inode *inode, u64 start, u64 end,
+			      int mode)
 {
 	int ret = 0;
 	struct address_space *mapping = inode->i_mapping;
+	struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
 	struct page *page;
 	unsigned long nr_pages = (end - start + PAGE_SIZE) >>
 		PAGE_SHIFT;
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index c28f5ef88f42..f2cbabb2306a 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -405,8 +405,8 @@ int extent_invalidatepage(struct extent_io_tree *tree,
 			  struct page *page, unsigned long offset);
 int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
 			  struct writeback_control *wbc);
-int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode,
-			      u64 start, u64 end, int mode);
+int extent_write_locked_range(struct inode *inode, u64 start, u64 end,
+			      int mode);
 int extent_writepages(struct extent_io_tree *tree,
 		      struct address_space *mapping,
 		      struct writeback_control *wbc);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 9ad8c9321c8f..8a7da59292b7 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -770,8 +770,8 @@ retry:
 			 * all those pages down to the drive.
 			 */
 			if (!page_started && !ret)
-				extent_write_locked_range(io_tree,
-						  inode, async_extent->start,
+				extent_write_locked_range(inode,
+						  async_extent->start,
 						  async_extent->start +
 						  async_extent->ram_size - 1,
 						  WB_SYNC_ALL);
-- 
cgit v1.2.3


From 0a9b0e5351818d43ac013c00a1474cc3601fc5bb Mon Sep 17 00:00:00 2001
From: Nikolay Borisov <nborisov@suse.com>
Date: Fri, 8 Dec 2017 15:55:59 +0200
Subject: btrfs: sink extent_write_full_page tree argument

The tree argument passed to extent_write_full_page is referenced from
the page being passed to the same function. Since we already have
enough information to get the reference, remove the function parameter.

Signed-off-by: Nikolay Borisov <nborisov@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/extent_io.c | 5 ++---
 fs/btrfs/extent_io.h | 3 +--
 fs/btrfs/inode.c     | 4 +---
 3 files changed, 4 insertions(+), 8 deletions(-)

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index c0b2bf65d6b0..6cd3da16f114 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -4056,13 +4056,12 @@ static noinline void flush_write_bio(void *data)
 	flush_epd_write_bio(epd);
 }
 
-int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
-			  struct writeback_control *wbc)
+int extent_write_full_page(struct page *page, struct writeback_control *wbc)
 {
 	int ret;
 	struct extent_page_data epd = {
 		.bio = NULL,
-		.tree = tree,
+		.tree = &BTRFS_I(page->mapping->host)->io_tree,
 		.extent_locked = 0,
 		.sync_io = wbc->sync_mode == WB_SYNC_ALL,
 	};
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index f2cbabb2306a..db2558b0cad4 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -403,8 +403,7 @@ int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
 			  struct extent_state **cached_state);
 int extent_invalidatepage(struct extent_io_tree *tree,
 			  struct page *page, unsigned long offset);
-int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
-			  struct writeback_control *wbc);
+int extent_write_full_page(struct page *page, struct writeback_control *wbc);
 int extent_write_locked_range(struct inode *inode, u64 start, u64 end,
 			      int mode);
 int extent_writepages(struct extent_io_tree *tree,
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 8a7da59292b7..46df5e2a64e7 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -8855,7 +8855,6 @@ int btrfs_readpage(struct file *file, struct page *page)
 
 static int btrfs_writepage(struct page *page, struct writeback_control *wbc)
 {
-	struct extent_io_tree *tree;
 	struct inode *inode = page->mapping->host;
 	int ret;
 
@@ -8874,8 +8873,7 @@ static int btrfs_writepage(struct page *page, struct writeback_control *wbc)
 		redirty_page_for_writepage(wbc, page);
 		return AOP_WRITEPAGE_ACTIVATE;
 	}
-	tree = &BTRFS_I(page->mapping->host)->io_tree;
-	ret = extent_write_full_page(tree, page, wbc);
+	ret = extent_write_full_page(page, wbc);
 	btrfs_add_delayed_iput(inode);
 	return ret;
 }
-- 
cgit v1.2.3


From a74b35ec876df1c2d11b980bb5e4f697ea4fba94 Mon Sep 17 00:00:00 2001
From: Nikolay Borisov <nborisov@suse.com>
Date: Fri, 8 Dec 2017 16:27:43 +0200
Subject: btrfs: Rename bin_search -> btrfs_bin_search

Currently there are 2 function doing binary search on btrfs nodes:
bin_search and btrfs_bin_search. The latter being a simple wrapper for
the former. So eliminate the wrapper and just rename bin_search to
btrfs_bin_search. No functional changes

Signed-off-by: Nikolay Borisov <nborisov@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/ctree.c | 14 ++++----------
 1 file changed, 4 insertions(+), 10 deletions(-)

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 5361f69433a3..2a09577580b8 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -1807,8 +1807,8 @@ static noinline int generic_bin_search(struct extent_buffer *eb,
  * simple bin_search frontend that does the right thing for
  * leaves vs nodes
  */
-static int bin_search(struct extent_buffer *eb, const struct btrfs_key *key,
-		      int level, int *slot)
+int btrfs_bin_search(struct extent_buffer *eb, const struct btrfs_key *key,
+		     int level, int *slot)
 {
 	if (level == 0)
 		return generic_bin_search(eb,
@@ -1824,12 +1824,6 @@ static int bin_search(struct extent_buffer *eb, const struct btrfs_key *key,
 					  slot);
 }
 
-int btrfs_bin_search(struct extent_buffer *eb, const struct btrfs_key *key,
-		     int level, int *slot)
-{
-	return bin_search(eb, key, level, slot);
-}
-
 static void root_add_used(struct btrfs_root *root, u32 size)
 {
 	spin_lock(&root->accounting_lock);
@@ -2614,7 +2608,7 @@ static int key_search(struct extent_buffer *b, const struct btrfs_key *key,
 		      int level, int *prev_cmp, int *slot)
 {
 	if (*prev_cmp != 0) {
-		*prev_cmp = bin_search(b, key, level, slot);
+		*prev_cmp = btrfs_bin_search(b, key, level, slot);
 		return *prev_cmp;
 	}
 
@@ -5181,7 +5175,7 @@ again:
 	while (1) {
 		nritems = btrfs_header_nritems(cur);
 		level = btrfs_header_level(cur);
-		sret = bin_search(cur, min_key, level, &slot);
+		sret = btrfs_bin_search(cur, min_key, level, &slot);
 
 		/* at the lowest level, we're done, setup the path and exit */
 		if (level == path->lowest_level) {
-- 
cgit v1.2.3


From e2932ee08e46629b0d39eda920e1d795e6d83946 Mon Sep 17 00:00:00 2001
From: David Sterba <dsterba@suse.com>
Date: Fri, 23 Jun 2017 04:16:17 +0200
Subject: btrfs: merge two flush_write_bio helpers

flush_epd_write_bio is same as flush_write_bio, no point having two such
functions. Merge them to flush_write_bio. The 'noinline' attribute is
removed as it does not have any meaning.

Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/extent_io.c | 19 ++++++++-----------
 1 file changed, 8 insertions(+), 11 deletions(-)

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 6cd3da16f114..f1842d59fffe 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -138,7 +138,8 @@ static void add_extent_changeset(struct extent_state *state, unsigned bits,
 	BUG_ON(ret < 0);
 }
 
-static noinline void flush_write_bio(void *data);
+static void flush_write_bio(void *data);
+
 static inline struct btrfs_fs_info *
 tree_fs_info(struct extent_io_tree *tree)
 {
@@ -4039,8 +4040,10 @@ retry:
 	return ret;
 }
 
-static void flush_epd_write_bio(struct extent_page_data *epd)
+static void flush_write_bio(void *data)
 {
+	struct extent_page_data *epd = data;
+
 	if (epd->bio) {
 		int ret;
 
@@ -4050,12 +4053,6 @@ static void flush_epd_write_bio(struct extent_page_data *epd)
 	}
 }
 
-static noinline void flush_write_bio(void *data)
-{
-	struct extent_page_data *epd = data;
-	flush_epd_write_bio(epd);
-}
-
 int extent_write_full_page(struct page *page, struct writeback_control *wbc)
 {
 	int ret;
@@ -4068,7 +4065,7 @@ int extent_write_full_page(struct page *page, struct writeback_control *wbc)
 
 	ret = __extent_writepage(page, wbc, &epd);
 
-	flush_epd_write_bio(&epd);
+	flush_write_bio(&epd);
 	return ret;
 }
 
@@ -4110,7 +4107,7 @@ int extent_write_locked_range(struct inode *inode, u64 start, u64 end,
 		start += PAGE_SIZE;
 	}
 
-	flush_epd_write_bio(&epd);
+	flush_write_bio(&epd);
 	return ret;
 }
 
@@ -4128,7 +4125,7 @@ int extent_writepages(struct extent_io_tree *tree,
 
 	ret = extent_write_cache_pages(mapping, wbc, __extent_writepage, &epd,
 				       flush_write_bio);
-	flush_epd_write_bio(&epd);
+	flush_write_bio(&epd);
 	return ret;
 }
 
-- 
cgit v1.2.3


From 25b860e038915ffb6c3d8d3fe8c46272a8d471fe Mon Sep 17 00:00:00 2001
From: David Sterba <dsterba@suse.com>
Date: Fri, 23 Jun 2017 04:30:28 +0200
Subject: btrfs: sink flush_fn to extent_write_cache_pages

All callers pass the same value flush_write_bio.

Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/extent_io.c | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index f1842d59fffe..cf106d46a2fd 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -3906,8 +3906,7 @@ retry:
  */
 static int extent_write_cache_pages(struct address_space *mapping,
 			     struct writeback_control *wbc,
-			     writepage_t writepage, void *data,
-			     void (*flush_fn)(void *))
+			     writepage_t writepage, void *data)
 {
 	struct inode *inode = mapping->host;
 	int ret = 0;
@@ -3971,7 +3970,7 @@ retry:
 			 * mapping
 			 */
 			if (!trylock_page(page)) {
-				flush_fn(data);
+				flush_write_bio(data);
 				lock_page(page);
 			}
 
@@ -3982,7 +3981,7 @@ retry:
 
 			if (wbc->sync_mode != WB_SYNC_NONE) {
 				if (PageWriteback(page))
-					flush_fn(data);
+					flush_write_bio(data);
 				wait_on_page_writeback(page);
 			}
 
@@ -4123,8 +4122,7 @@ int extent_writepages(struct extent_io_tree *tree,
 		.sync_io = wbc->sync_mode == WB_SYNC_ALL,
 	};
 
-	ret = extent_write_cache_pages(mapping, wbc, __extent_writepage, &epd,
-				       flush_write_bio);
+	ret = extent_write_cache_pages(mapping, wbc, __extent_writepage, &epd);
 	flush_write_bio(&epd);
 	return ret;
 }
-- 
cgit v1.2.3


From 935db8531fa4d6cc951729a2b5063b6406b71432 Mon Sep 17 00:00:00 2001
From: David Sterba <dsterba@suse.com>
Date: Fri, 23 Jun 2017 04:30:28 +0200
Subject: btrfs: sink writepage parameter to extent_write_cache_pages

The function extent_write_cache_pages is modelled after
write_cache_pages which is a generic interface and the writepage
parameter makes sense there. In btrfs we know exactly which callback
we're going to use, so we can pass it directly.

Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/extent_io.c | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index cf106d46a2fd..0ceb427a8e63 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -3893,8 +3893,7 @@ retry:
  * write_cache_pages - walk the list of dirty pages of the given address space and write all of them.
  * @mapping: address space structure to write
  * @wbc: subtract the number of written pages from *@wbc->nr_to_write
- * @writepage: function called for each page
- * @data: data passed to writepage function
+ * @data: data passed to __extent_writepage function
  *
  * If a page is already under I/O, write_cache_pages() skips it, even
  * if it's dirty.  This is desirable behaviour for memory-cleaning writeback,
@@ -3906,7 +3905,7 @@ retry:
  */
 static int extent_write_cache_pages(struct address_space *mapping,
 			     struct writeback_control *wbc,
-			     writepage_t writepage, void *data)
+			     void *data)
 {
 	struct inode *inode = mapping->host;
 	int ret = 0;
@@ -3991,7 +3990,7 @@ retry:
 				continue;
 			}
 
-			ret = (*writepage)(page, wbc, data);
+			ret = __extent_writepage(page, wbc, data);
 
 			if (unlikely(ret == AOP_WRITEPAGE_ACTIVATE)) {
 				unlock_page(page);
@@ -4122,7 +4121,7 @@ int extent_writepages(struct extent_io_tree *tree,
 		.sync_io = wbc->sync_mode == WB_SYNC_ALL,
 	};
 
-	ret = extent_write_cache_pages(mapping, wbc, __extent_writepage, &epd);
+	ret = extent_write_cache_pages(mapping, wbc, &epd);
 	flush_write_bio(&epd);
 	return ret;
 }
-- 
cgit v1.2.3


From aab6e9edf07f2f4747fbc4aa04e14683fbe0c8ac Mon Sep 17 00:00:00 2001
From: David Sterba <dsterba@suse.com>
Date: Thu, 30 Nov 2017 18:00:02 +0100
Subject: btrfs: unify extent_page_data type passed as void

Functions called from extent_write_cache_pages used void* as generic
callback data, but all of them convert it to extent_page_data, or use it
directly.

Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/extent_io.c | 17 +++++++----------
 1 file changed, 7 insertions(+), 10 deletions(-)

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 0ceb427a8e63..eee9cc5db9ff 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -138,7 +138,7 @@ static void add_extent_changeset(struct extent_state *state, unsigned bits,
 	BUG_ON(ret < 0);
 }
 
-static void flush_write_bio(void *data);
+static void flush_write_bio(struct extent_page_data *epd);
 
 static inline struct btrfs_fs_info *
 tree_fs_info(struct extent_io_tree *tree)
@@ -3456,10 +3456,9 @@ done:
  * and the end_io handler clears the writeback ranges
  */
 static int __extent_writepage(struct page *page, struct writeback_control *wbc,
-			      void *data)
+			      struct extent_page_data *epd)
 {
 	struct inode *inode = page->mapping->host;
-	struct extent_page_data *epd = data;
 	u64 start = page_offset(page);
 	u64 page_end = start + PAGE_SIZE - 1;
 	int ret;
@@ -3905,7 +3904,7 @@ retry:
  */
 static int extent_write_cache_pages(struct address_space *mapping,
 			     struct writeback_control *wbc,
-			     void *data)
+			     struct extent_page_data *epd)
 {
 	struct inode *inode = mapping->host;
 	int ret = 0;
@@ -3969,7 +3968,7 @@ retry:
 			 * mapping
 			 */
 			if (!trylock_page(page)) {
-				flush_write_bio(data);
+				flush_write_bio(epd);
 				lock_page(page);
 			}
 
@@ -3980,7 +3979,7 @@ retry:
 
 			if (wbc->sync_mode != WB_SYNC_NONE) {
 				if (PageWriteback(page))
-					flush_write_bio(data);
+					flush_write_bio(epd);
 				wait_on_page_writeback(page);
 			}
 
@@ -3990,7 +3989,7 @@ retry:
 				continue;
 			}
 
-			ret = __extent_writepage(page, wbc, data);
+			ret = __extent_writepage(page, wbc, epd);
 
 			if (unlikely(ret == AOP_WRITEPAGE_ACTIVATE)) {
 				unlock_page(page);
@@ -4038,10 +4037,8 @@ retry:
 	return ret;
 }
 
-static void flush_write_bio(void *data)
+static void flush_write_bio(struct extent_page_data *epd)
 {
-	struct extent_page_data *epd = data;
-
 	if (epd->bio) {
 		int ret;
 
-- 
cgit v1.2.3


From 72fa39f5c7a1c9d95c24ddf6605581ea05d6081c Mon Sep 17 00:00:00 2001
From: "Misono, Tomohiro" <misono.tomohiro@jp.fujitsu.com>
Date: Thu, 14 Dec 2017 17:24:30 +0900
Subject: btrfs: add btrfs_mount_root() and new file_system_type

Add btrfs_mount_root() and new file_system_type for preparation of cleanup
of btrfs_mount(). Code path is not changed yet.

btrfs_mount_root() is almost the same as current btrfs_mount(), but doesn't
have subvolume related part.

Signed-off-by: Tomohiro Misono <misono.tomohiro@jp.fujitsu.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/super.c | 123 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 123 insertions(+)

diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 428841a17acb..ce78906bc7d7 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -65,7 +65,15 @@
 #include <trace/events/btrfs.h>
 
 static const struct super_operations btrfs_super_ops;
+
+/*
+ * Types for mounting the default subvolume and a subvolume explicitly
+ * requested by subvol=/path. That way the callchain is straightforward and we
+ * don't have to play tricks with the mount options and recursive calls to
+ * btrfs_mount.
+ */
 static struct file_system_type btrfs_fs_type;
+static struct file_system_type btrfs_root_fs_type;
 
 static int btrfs_remount(struct super_block *sb, int *flags, char *data);
 
@@ -1549,6 +1557,112 @@ static int setup_security_options(struct btrfs_fs_info *fs_info,
 	return ret;
 }
 
+static struct dentry *btrfs_mount_root(struct file_system_type *fs_type,
+		int flags, const char *device_name, void *data)
+{
+	struct block_device *bdev = NULL;
+	struct super_block *s;
+	struct btrfs_fs_devices *fs_devices = NULL;
+	struct btrfs_fs_info *fs_info = NULL;
+	struct security_mnt_opts new_sec_opts;
+	fmode_t mode = FMODE_READ;
+	char *subvol_name = NULL;
+	u64 subvol_objectid = 0;
+	int error = 0;
+
+	if (!(flags & SB_RDONLY))
+		mode |= FMODE_WRITE;
+
+	error = btrfs_parse_early_options(data, mode, fs_type,
+					  &subvol_name, &subvol_objectid,
+					  &fs_devices);
+	if (error) {
+		kfree(subvol_name);
+		return ERR_PTR(error);
+	}
+
+	security_init_mnt_opts(&new_sec_opts);
+	if (data) {
+		error = parse_security_options(data, &new_sec_opts);
+		if (error)
+			return ERR_PTR(error);
+	}
+
+	error = btrfs_scan_one_device(device_name, mode, fs_type, &fs_devices);
+	if (error)
+		goto error_sec_opts;
+
+	/*
+	 * Setup a dummy root and fs_info for test/set super.  This is because
+	 * we don't actually fill this stuff out until open_ctree, but we need
+	 * it for searching for existing supers, so this lets us do that and
+	 * then open_ctree will properly initialize everything later.
+	 */
+	fs_info = kzalloc(sizeof(struct btrfs_fs_info), GFP_KERNEL);
+	if (!fs_info) {
+		error = -ENOMEM;
+		goto error_sec_opts;
+	}
+
+	fs_info->fs_devices = fs_devices;
+
+	fs_info->super_copy = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_KERNEL);
+	fs_info->super_for_commit = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_KERNEL);
+	security_init_mnt_opts(&fs_info->security_opts);
+	if (!fs_info->super_copy || !fs_info->super_for_commit) {
+		error = -ENOMEM;
+		goto error_fs_info;
+	}
+
+	error = btrfs_open_devices(fs_devices, mode, fs_type);
+	if (error)
+		goto error_fs_info;
+
+	if (!(flags & SB_RDONLY) && fs_devices->rw_devices == 0) {
+		error = -EACCES;
+		goto error_close_devices;
+	}
+
+	bdev = fs_devices->latest_bdev;
+	s = sget(fs_type, btrfs_test_super, btrfs_set_super, flags | SB_NOSEC,
+		 fs_info);
+	if (IS_ERR(s)) {
+		error = PTR_ERR(s);
+		goto error_close_devices;
+	}
+
+	if (s->s_root) {
+		btrfs_close_devices(fs_devices);
+		free_fs_info(fs_info);
+		if ((flags ^ s->s_flags) & SB_RDONLY)
+			error = -EBUSY;
+	} else {
+		snprintf(s->s_id, sizeof(s->s_id), "%pg", bdev);
+		btrfs_sb(s)->bdev_holder = fs_type;
+		error = btrfs_fill_super(s, fs_devices, data);
+	}
+	if (error) {
+		deactivate_locked_super(s);
+		goto error_sec_opts;
+	}
+
+	fs_info = btrfs_sb(s);
+	error = setup_security_options(fs_info, s, &new_sec_opts);
+	if (error) {
+		deactivate_locked_super(s);
+		goto error_sec_opts;
+	}
+
+	return dget(s->s_root);
+
+error_close_devices:
+	btrfs_close_devices(fs_devices);
+error_fs_info:
+	free_fs_info(fs_info);
+error_sec_opts:
+	security_free_mnt_opts(&new_sec_opts);
+	return ERR_PTR(error);
+}
 /*
  * Find a superblock for the given device / mount point.
  *
@@ -2170,6 +2284,15 @@ static struct file_system_type btrfs_fs_type = {
 	.kill_sb	= btrfs_kill_super,
 	.fs_flags	= FS_REQUIRES_DEV | FS_BINARY_MOUNTDATA,
 };
+
+static struct file_system_type btrfs_root_fs_type = {
+	.owner		= THIS_MODULE,
+	.name		= "btrfs",
+	.mount		= btrfs_mount_root,
+	.kill_sb	= btrfs_kill_super,
+	.fs_flags	= FS_REQUIRES_DEV | FS_BINARY_MOUNTDATA,
+};
+
 MODULE_ALIAS_FS("btrfs");
 
 static int btrfs_control_open(struct inode *inode, struct file *file)
-- 
cgit v1.2.3


From 312c89fbca06896cb25a0daf4fa5f44c29bbb1b1 Mon Sep 17 00:00:00 2001
From: "Misono, Tomohiro" <misono.tomohiro@jp.fujitsu.com>
Date: Thu, 14 Dec 2017 17:25:01 +0900
Subject: btrfs: cleanup btrfs_mount() using btrfs_mount_root()

Cleanup btrfs_mount() by using btrfs_mount_root(). This avoids getting
btrfs_mount() called twice in mount path.

Old btrfs_mount() will do:
0. VFS layer calls vfs_kern_mount() with registered file_system_type
   (for btrfs, btrfs_fs_type). btrfs_mount() is called on the way.
1. btrfs_parse_early_options() parses "subvolid=" mount option and set the
   value to subvol_objectid. Otherwise, subvol_objectid has the initial
   value of 0
2. check subvol_objectid is 5 or not. Assume this time id is not 5, then
   btrfs_mount() returns by calling mount_subvol()
3. In mount_subvol(), original mount options are modified to contain
   "subvolid=0" in setup_root_args(). Then, vfs_kern_mount() is called with
   btrfs_fs_type and new options
4. btrfs_mount() is called again
5. btrfs_parse_early_options() parses "subvolid=0" and set 5 (instead of 0)
   to subvol_objectid
6. check subvol_objectid is 5 or not. This time id is 5 and mount_subvol()
   is not called. btrfs_mount() finishes mounting a root
7. (in mount_subvol()) with using a return vale of vfs_kern_mount(), it
   calls mount_subtree()
8. return subvolume's dentry

Reusing the same file_system_type (and btrfs_mount()) for vfs_kern_mount()
is the cause of complication.

Instead, new btrfs_mount() will do:
1. parse subvol id related options for later use in mount_subvol()
2. mount device's root by calling vfs_kern_mount() with
   btrfs_root_fs_type, which is not registered to VFS by
   register_filesystem(). As a result, btrfs_mount_root() is called
3. return by calling mount_subvol()

The code of 2. is moved from the first part of mount_subvol().

The semantics of device holder changes from btrfs_fs_type to
btrfs_root_fs_type and has to be used in all contexts. Otherwise we'd
get wrong results when mount and dev scan would not check the same
thing. (this has been found indendently and the fix is folded into this
patch)

Signed-off-by: Tomohiro Misono <misono.tomohiro@jp.fujitsu.com>
Reviewed-by: David Sterba <dsterba@suse.com>
[ fold the btrfs_control_ioctl fixup, extend the comment ]
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/super.c | 193 ++++++++++++++++++-------------------------------------
 1 file changed, 63 insertions(+), 130 deletions(-)

diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index ce78906bc7d7..adf0f8b8829c 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -71,6 +71,8 @@ static const struct super_operations btrfs_super_ops;
  * requested by subvol=/path. That way the callchain is straightforward and we
  * don't have to play tricks with the mount options and recursive calls to
  * btrfs_mount.
+ *
+ * The new btrfs_root_fs_type also servers as a tag for the bdev_holder.
  */
 static struct file_system_type btrfs_fs_type;
 static struct file_system_type btrfs_root_fs_type;
@@ -1405,48 +1407,11 @@ static char *setup_root_args(char *args)
 
 static struct dentry *mount_subvol(const char *subvol_name, u64 subvol_objectid,
 				   int flags, const char *device_name,
-				   char *data)
+				   char *data, struct vfsmount *mnt)
 {
 	struct dentry *root;
-	struct vfsmount *mnt = NULL;
-	char *newargs;
 	int ret;
 
-	newargs = setup_root_args(data);
-	if (!newargs) {
-		root = ERR_PTR(-ENOMEM);
-		goto out;
-	}
-
-	mnt = vfs_kern_mount(&btrfs_fs_type, flags, device_name, newargs);
-	if (PTR_ERR_OR_ZERO(mnt) == -EBUSY) {
-		if (flags & SB_RDONLY) {
-			mnt = vfs_kern_mount(&btrfs_fs_type, flags & ~SB_RDONLY,
-					     device_name, newargs);
-		} else {
-			mnt = vfs_kern_mount(&btrfs_fs_type, flags | SB_RDONLY,
-					     device_name, newargs);
-			if (IS_ERR(mnt)) {
-				root = ERR_CAST(mnt);
-				mnt = NULL;
-				goto out;
-			}
-
-			down_write(&mnt->mnt_sb->s_umount);
-			ret = btrfs_remount(mnt->mnt_sb, &flags, NULL);
-			up_write(&mnt->mnt_sb->s_umount);
-			if (ret < 0) {
-				root = ERR_PTR(ret);
-				goto out;
-			}
-		}
-	}
-	if (IS_ERR(mnt)) {
-		root = ERR_CAST(mnt);
-		mnt = NULL;
-		goto out;
-	}
-
 	if (!subvol_name) {
 		if (!subvol_objectid) {
 			ret = get_default_subvol_objectid(btrfs_sb(mnt->mnt_sb),
@@ -1502,7 +1467,6 @@ static struct dentry *mount_subvol(const char *subvol_name, u64 subvol_objectid,
 
 out:
 	mntput(mnt);
-	kfree(newargs);
 	kfree(subvol_name);
 	return root;
 }
@@ -1557,6 +1521,12 @@ static int setup_security_options(struct btrfs_fs_info *fs_info,
 	return ret;
 }
 
+/*
+ * Find a superblock for the given device / mount point.
+ *
+ * Note: This is based on mount_bdev from fs/super.c with a few additions
+ *       for multiple device setup.  Make sure to keep it in sync.
+ */
 static struct dentry *btrfs_mount_root(struct file_system_type *fs_type,
 		int flags, const char *device_name, void *data)
 {
@@ -1663,20 +1633,35 @@ error_sec_opts:
 	security_free_mnt_opts(&new_sec_opts);
 	return ERR_PTR(error);
 }
+
 /*
- * Find a superblock for the given device / mount point.
+ * Mount function which is called by VFS layer.
  *
- * Note:  This is based on get_sb_bdev from fs/super.c with a few additions
- *	  for multiple device setup.  Make sure to keep it in sync.
+ * In order to allow mounting a subvolume directly, btrfs uses mount_subtree()
+ * which needs vfsmount* of device's root (/).  This means device's root has to
+ * be mounted internally in any case.
+ *
+ * Operation flow:
+ *   1. Parse subvol id related options for later use in mount_subvol().
+ *
+ *   2. Mount device's root (/) by calling vfs_kern_mount().
+ *
+ *      NOTE: vfs_kern_mount() is used by VFS to call btrfs_mount() in the
+ *      first place. In order to avoid calling btrfs_mount() again, we use
+ *      different file_system_type which is not registered to VFS by
+ *      register_filesystem() (btrfs_root_fs_type). As a result,
+ *      btrfs_mount_root() is called. The return value will be used by
+ *      mount_subtree() in mount_subvol().
+ *
+ *   3. Call mount_subvol() to get the dentry of subvolume. Since there is
+ *      "btrfs subvolume set-default", mount_subvol() is called always.
  */
 static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
 		const char *device_name, void *data)
 {
-	struct block_device *bdev = NULL;
-	struct super_block *s;
 	struct btrfs_fs_devices *fs_devices = NULL;
-	struct btrfs_fs_info *fs_info = NULL;
-	struct security_mnt_opts new_sec_opts;
+	struct vfsmount *mnt_root;
+	struct dentry *root;
 	fmode_t mode = FMODE_READ;
 	char *subvol_name = NULL;
 	u64 subvol_objectid = 0;
@@ -1693,93 +1678,41 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
 		return ERR_PTR(error);
 	}
 
-	if (subvol_name || subvol_objectid != BTRFS_FS_TREE_OBJECTID) {
-		/* mount_subvol() will free subvol_name. */
-		return mount_subvol(subvol_name, subvol_objectid, flags,
-				    device_name, data);
-	}
-
-	security_init_mnt_opts(&new_sec_opts);
-	if (data) {
-		error = parse_security_options(data, &new_sec_opts);
-		if (error)
-			return ERR_PTR(error);
-	}
-
-	error = btrfs_scan_one_device(device_name, mode, fs_type, &fs_devices);
-	if (error)
-		goto error_sec_opts;
-
-	/*
-	 * Setup a dummy root and fs_info for test/set super.  This is because
-	 * we don't actually fill this stuff out until open_ctree, but we need
-	 * it for searching for existing supers, so this lets us do that and
-	 * then open_ctree will properly initialize everything later.
-	 */
-	fs_info = kzalloc(sizeof(struct btrfs_fs_info), GFP_KERNEL);
-	if (!fs_info) {
-		error = -ENOMEM;
-		goto error_sec_opts;
-	}
-
-	fs_info->fs_devices = fs_devices;
-
-	fs_info->super_copy = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_KERNEL);
-	fs_info->super_for_commit = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_KERNEL);
-	security_init_mnt_opts(&fs_info->security_opts);
-	if (!fs_info->super_copy || !fs_info->super_for_commit) {
-		error = -ENOMEM;
-		goto error_fs_info;
-	}
-
-	error = btrfs_open_devices(fs_devices, mode, fs_type);
-	if (error)
-		goto error_fs_info;
-
-	if (!(flags & SB_RDONLY) && fs_devices->rw_devices == 0) {
-		error = -EACCES;
-		goto error_close_devices;
-	}
-
-	bdev = fs_devices->latest_bdev;
-	s = sget(fs_type, btrfs_test_super, btrfs_set_super, flags | SB_NOSEC,
-		 fs_info);
-	if (IS_ERR(s)) {
-		error = PTR_ERR(s);
-		goto error_close_devices;
-	}
+	/* mount device's root (/) */
+	mnt_root = vfs_kern_mount(&btrfs_root_fs_type, flags, device_name, data);
+	if (PTR_ERR_OR_ZERO(mnt_root) == -EBUSY) {
+		if (flags & SB_RDONLY) {
+			mnt_root = vfs_kern_mount(&btrfs_root_fs_type,
+				flags & ~SB_RDONLY, device_name, data);
+		} else {
+			mnt_root = vfs_kern_mount(&btrfs_root_fs_type,
+				flags | SB_RDONLY, device_name, data);
+			if (IS_ERR(mnt_root)) {
+				root = ERR_CAST(mnt_root);
+				goto out;
+			}
 
-	if (s->s_root) {
-		btrfs_close_devices(fs_devices);
-		free_fs_info(fs_info);
-		if ((flags ^ s->s_flags) & SB_RDONLY)
-			error = -EBUSY;
-	} else {
-		snprintf(s->s_id, sizeof(s->s_id), "%pg", bdev);
-		btrfs_sb(s)->bdev_holder = fs_type;
-		error = btrfs_fill_super(s, fs_devices, data);
-	}
-	if (error) {
-		deactivate_locked_super(s);
-		goto error_sec_opts;
+			down_write(&mnt_root->mnt_sb->s_umount);
+			error = btrfs_remount(mnt_root->mnt_sb, &flags, NULL);
+			up_write(&mnt_root->mnt_sb->s_umount);
+			if (error < 0) {
+				root = ERR_PTR(error);
+				mntput(mnt_root);
+				goto out;
+			}
+		}
 	}
-
-	fs_info = btrfs_sb(s);
-	error = setup_security_options(fs_info, s, &new_sec_opts);
-	if (error) {
-		deactivate_locked_super(s);
-		goto error_sec_opts;
+	if (IS_ERR(mnt_root)) {
+		root = ERR_CAST(mnt_root);
+		goto out;
 	}
 
-	return dget(s->s_root);
+	/* mount_subvol() will free subvol_name and mnt_root */
+	root = mount_subvol(subvol_name, subvol_objectid, flags, device_name,
+			data, mnt_root);
 
-error_close_devices:
-	btrfs_close_devices(fs_devices);
-error_fs_info:
-	free_fs_info(fs_info);
-error_sec_opts:
-	security_free_mnt_opts(&new_sec_opts);
-	return ERR_PTR(error);
+out:
+	return root;
 }
 
 static void btrfs_resize_thread_pool(struct btrfs_fs_info *fs_info,
@@ -2326,11 +2259,11 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd,
 	switch (cmd) {
 	case BTRFS_IOC_SCAN_DEV:
 		ret = btrfs_scan_one_device(vol->name, FMODE_READ,
-					    &btrfs_fs_type, &fs_devices);
+					    &btrfs_root_fs_type, &fs_devices);
 		break;
 	case BTRFS_IOC_DEVICES_READY:
 		ret = btrfs_scan_one_device(vol->name, FMODE_READ,
-					    &btrfs_fs_type, &fs_devices);
+					    &btrfs_root_fs_type, &fs_devices);
 		if (ret)
 			break;
 		ret = !(fs_devices->num_devices == fs_devices->total_devices);
-- 
cgit v1.2.3


From d7407606564c595e4b9775cc6b14913e92217782 Mon Sep 17 00:00:00 2001
From: "Misono, Tomohiro" <misono.tomohiro@jp.fujitsu.com>
Date: Thu, 14 Dec 2017 17:25:28 +0900
Subject: btrfs: split parse_early_options() in two

Now parse_early_options() is used by both btrfs_mount() and
btrfs_mount_root(). However, the former only needs subvol related part
and the latter needs the others.

Therefore extract the subvol related parts from parse_early_options() and
move it to new parse function (parse_subvol_options()).

Signed-off-by: Tomohiro Misono <misono.tomohiro@jp.fujitsu.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/super.c | 82 +++++++++++++++++++++++++++++++++++++++-----------------
 1 file changed, 57 insertions(+), 25 deletions(-)

diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index adf0f8b8829c..0d615d02eb4a 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -463,7 +463,8 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
 		case Opt_subvolrootid:
 		case Opt_device:
 			/*
-			 * These are parsed by btrfs_parse_early_options
+			 * These are parsed by btrfs_parse_subvol_options
+			 * and btrfs_parse_early_options
 			 * and can be happily ignored here.
 			 */
 			break;
@@ -897,11 +898,60 @@ out:
  * only when we need to allocate a new super block.
  */
 static int btrfs_parse_early_options(const char *options, fmode_t flags,
-		void *holder, char **subvol_name, u64 *subvol_objectid,
-		struct btrfs_fs_devices **fs_devices)
+		void *holder, struct btrfs_fs_devices **fs_devices)
 {
 	substring_t args[MAX_OPT_ARGS];
 	char *device_name, *opts, *orig, *p;
+	int error = 0;
+
+	if (!options)
+		return 0;
+
+	/*
+	 * strsep changes the string, duplicate it because btrfs_parse_options
+	 * gets called later
+	 */
+	opts = kstrdup(options, GFP_KERNEL);
+	if (!opts)
+		return -ENOMEM;
+	orig = opts;
+
+	while ((p = strsep(&opts, ",")) != NULL) {
+		int token;
+
+		if (!*p)
+			continue;
+
+		token = match_token(p, tokens, args);
+		if (token == Opt_device) {
+			device_name = match_strdup(&args[0]);
+			if (!device_name) {
+				error = -ENOMEM;
+				goto out;
+			}
+			error = btrfs_scan_one_device(device_name,
+					flags, holder, fs_devices);
+			kfree(device_name);
+			if (error)
+				goto out;
+		}
+	}
+
+out:
+	kfree(orig);
+	return error;
+}
+
+/*
+ * Parse mount options that are related to subvolume id
+ *
+ * The value is later passed to mount_subvol()
+ */
+static int btrfs_parse_subvol_options(const char *options, fmode_t flags,
+		void *holder, char **subvol_name, u64 *subvol_objectid)
+{
+	substring_t args[MAX_OPT_ARGS];
+	char *opts, *orig, *p;
 	char *num = NULL;
 	int error = 0;
 
@@ -909,8 +959,8 @@ static int btrfs_parse_early_options(const char *options, fmode_t flags,
 		return 0;
 
 	/*
-	 * strsep changes the string, duplicate it because parse_options
-	 * gets called twice
+	 * strsep changes the string, duplicate it because
+	 * btrfs_parse_early_options gets called later
 	 */
 	opts = kstrdup(options, GFP_KERNEL);
 	if (!opts)
@@ -949,18 +999,6 @@ static int btrfs_parse_early_options(const char *options, fmode_t flags,
 		case Opt_subvolrootid:
 			pr_warn("BTRFS: 'subvolrootid' mount option is deprecated and has no effect\n");
 			break;
-		case Opt_device:
-			device_name = match_strdup(&args[0]);
-			if (!device_name) {
-				error = -ENOMEM;
-				goto out;
-			}
-			error = btrfs_scan_one_device(device_name,
-					flags, holder, fs_devices);
-			kfree(device_name);
-			if (error)
-				goto out;
-			break;
 		default:
 			break;
 		}
@@ -1536,18 +1574,14 @@ static struct dentry *btrfs_mount_root(struct file_system_type *fs_type,
 	struct btrfs_fs_info *fs_info = NULL;
 	struct security_mnt_opts new_sec_opts;
 	fmode_t mode = FMODE_READ;
-	char *subvol_name = NULL;
-	u64 subvol_objectid = 0;
 	int error = 0;
 
 	if (!(flags & SB_RDONLY))
 		mode |= FMODE_WRITE;
 
 	error = btrfs_parse_early_options(data, mode, fs_type,
-					  &subvol_name, &subvol_objectid,
 					  &fs_devices);
 	if (error) {
-		kfree(subvol_name);
 		return ERR_PTR(error);
 	}
 
@@ -1659,7 +1693,6 @@ error_sec_opts:
 static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
 		const char *device_name, void *data)
 {
-	struct btrfs_fs_devices *fs_devices = NULL;
 	struct vfsmount *mnt_root;
 	struct dentry *root;
 	fmode_t mode = FMODE_READ;
@@ -1670,9 +1703,8 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
 	if (!(flags & SB_RDONLY))
 		mode |= FMODE_WRITE;
 
-	error = btrfs_parse_early_options(data, mode, fs_type,
-					  &subvol_name, &subvol_objectid,
-					  &fs_devices);
+	error = btrfs_parse_subvol_options(data, mode, fs_type,
+					  &subvol_name, &subvol_objectid);
 	if (error) {
 		kfree(subvol_name);
 		return ERR_PTR(error);
-- 
cgit v1.2.3


From 83085935cc38b9752215556c02b2f080c96bf1be Mon Sep 17 00:00:00 2001
From: "Misono, Tomohiro" <misono.tomohiro@jp.fujitsu.com>
Date: Thu, 14 Dec 2017 17:25:54 +0900
Subject: btrfs: remove unused setup_root_args()

Since setup_root_args() is not used anymore, just remove it.

Signed-off-by: Tomohiro Misono <misono.tomohiro@jp.fujitsu.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/super.c | 36 ------------------------------------
 1 file changed, 36 deletions(-)

diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 0d615d02eb4a..89333e118c7a 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -1407,42 +1407,6 @@ static inline int is_subvolume_inode(struct inode *inode)
 	return 0;
 }
 
-/*
- * This will add subvolid=0 to the argument string while removing any subvol=
- * and subvolid= arguments to make sure we get the top-level root for path
- * walking to the subvol we want.
- */
-static char *setup_root_args(char *args)
-{
-	char *buf, *dst, *sep;
-
-	if (!args)
-		return kstrdup("subvolid=0", GFP_KERNEL);
-
-	/* The worst case is that we add ",subvolid=0" to the end. */
-	buf = dst = kmalloc(strlen(args) + strlen(",subvolid=0") + 1,
-			GFP_KERNEL);
-	if (!buf)
-		return NULL;
-
-	while (1) {
-		sep = strchrnul(args, ',');
-		if (!strstarts(args, "subvol=") &&
-		    !strstarts(args, "subvolid=")) {
-			memcpy(dst, args, sep - args);
-			dst += sep - args;
-			*dst++ = ',';
-		}
-		if (*sep)
-			args = sep + 1;
-		else
-			break;
-	}
-	strcpy(dst, "subvolid=0");
-
-	return buf;
-}
-
 static struct dentry *mount_subvol(const char *subvol_name, u64 subvol_objectid,
 				   int flags, const char *device_name,
 				   char *data, struct vfsmount *mnt)
-- 
cgit v1.2.3


From 78f6beacd024e3ab8091a2a5c12ee7031f9ccc38 Mon Sep 17 00:00:00 2001
From: "Misono, Tomohiro" <misono.tomohiro@jp.fujitsu.com>
Date: Wed, 17 Jan 2018 17:38:31 +0900
Subject: btrfs: remove unused arg from parse_subvol_options()

Remove unused arg 'holder' from parse_subvol_options(), which has been
forgotten to be cleaned in the commit b99beb110e2d ("btrfs: split
parse_early_options() in two").

Signed-off-by: Tomohiro Misono <misono.tomohiro@jp.fujitsu.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/super.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 89333e118c7a..03367a64b5c1 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -948,7 +948,7 @@ out:
  * The value is later passed to mount_subvol()
  */
 static int btrfs_parse_subvol_options(const char *options, fmode_t flags,
-		void *holder, char **subvol_name, u64 *subvol_objectid)
+		char **subvol_name, u64 *subvol_objectid)
 {
 	substring_t args[MAX_OPT_ARGS];
 	char *opts, *orig, *p;
@@ -1667,7 +1667,7 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
 	if (!(flags & SB_RDONLY))
 		mode |= FMODE_WRITE;
 
-	error = btrfs_parse_subvol_options(data, mode, fs_type,
+	error = btrfs_parse_subvol_options(data, mode,
 					  &subvol_name, &subvol_objectid);
 	if (error) {
 		kfree(subvol_name);
-- 
cgit v1.2.3


From 23ae8c63aaf82967536cba8893e5166b80b6d99a Mon Sep 17 00:00:00 2001
From: David Sterba <dsterba@suse.com>
Date: Tue, 12 Dec 2017 20:35:02 +0100
Subject: btrfs: heuristic: open code get_num callback of radix sort

The callback is trivial and we don't need the abstraction for our
purposes. Let's open code it and also make the array types explicit.

Reviewed-by: Timofey Titovets <nefelim4ag@gmail.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/compression.c | 25 +++++++++----------------
 1 file changed, 9 insertions(+), 16 deletions(-)

diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 28c3940062b7..37a69d4b04ce 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -1318,12 +1318,6 @@ static void copy_cell(void *dst, int dest_i, void *src, int src_i)
 	dstv[dest_i] = srcv[src_i];
 }
 
-static u64 get_num(const void *a, int i)
-{
-	struct bucket_item *av = (struct bucket_item *)a;
-	return av[i].count;
-}
-
 /*
  * Use 4 bits as radix base
  * Use 16 u32 counters for calculating new possition in buf array
@@ -1332,12 +1326,11 @@ static u64 get_num(const void *a, int i)
  * @array_buf - buffer array to store sorting results
  *              must be equal in size to @array
  * @num       - array size
- * @get_num   - function to extract number from array
  * @copy_cell - function to copy data from array to array_buf and vice versa
  * @get4bits  - function to get 4 bits from number at specified offset
  */
-static void radix_sort(void *array, void *array_buf, int num,
-		       u64 (*get_num)(const void *, int i),
+static void radix_sort(struct bucket_item *array, struct bucket_item *array_buf,
+		       int num,
 		       void (*copy_cell)(void *dest, int dest_i,
 					 void* src, int src_i),
 		       u8 (*get4bits)(u64 num, int shift))
@@ -1355,9 +1348,9 @@ static void radix_sort(void *array, void *array_buf, int num,
 	 * Try avoid useless loop iterations for small numbers stored in big
 	 * counters.  Example: 48 33 4 ... in 64bit array
 	 */
-	max_num = get_num(array, 0);
+	max_num = array[0].count;
 	for (i = 1; i < num; i++) {
-		buf_num = get_num(array, i);
+		buf_num = array[i].count;
 		if (buf_num > max_num)
 			max_num = buf_num;
 	}
@@ -1370,7 +1363,7 @@ static void radix_sort(void *array, void *array_buf, int num,
 		memset(counters, 0, sizeof(counters));
 
 		for (i = 0; i < num; i++) {
-			buf_num = get_num(array, i);
+			buf_num = array[i].count;
 			addr = get4bits(buf_num, shift);
 			counters[addr]++;
 		}
@@ -1379,7 +1372,7 @@ static void radix_sort(void *array, void *array_buf, int num,
 			counters[i] += counters[i - 1];
 
 		for (i = num - 1; i >= 0; i--) {
-			buf_num = get_num(array, i);
+			buf_num = array[i].count;
 			addr = get4bits(buf_num, shift);
 			counters[addr]--;
 			new_addr = counters[addr];
@@ -1397,7 +1390,7 @@ static void radix_sort(void *array, void *array_buf, int num,
 		memset(counters, 0, sizeof(counters));
 
 		for (i = 0; i < num; i ++) {
-			buf_num = get_num(array_buf, i);
+			buf_num = array_buf[i].count;
 			addr = get4bits(buf_num, shift);
 			counters[addr]++;
 		}
@@ -1406,7 +1399,7 @@ static void radix_sort(void *array, void *array_buf, int num,
 			counters[i] += counters[i - 1];
 
 		for (i = num - 1; i >= 0; i--) {
-			buf_num = get_num(array_buf, i);
+			buf_num = array_buf[i].count;
 			addr = get4bits(buf_num, shift);
 			counters[addr]--;
 			new_addr = counters[addr];
@@ -1444,7 +1437,7 @@ static int byte_core_set_size(struct heuristic_ws *ws)
 	struct bucket_item *bucket = ws->bucket;
 
 	/* Sort in reverse order */
-	radix_sort(ws->bucket, ws->bucket_b, BUCKET_SIZE, get_num, copy_cell,
+	radix_sort(ws->bucket, ws->bucket_b, BUCKET_SIZE, copy_cell,
 			get4bits);
 
 	for (i = 0; i < BYTE_CORE_SET_LOW; i++)
-- 
cgit v1.2.3


From 7add17befcfc0811b583e4c3c70849a3095f0080 Mon Sep 17 00:00:00 2001
From: David Sterba <dsterba@suse.com>
Date: Tue, 12 Dec 2017 20:35:02 +0100
Subject: btrfs: heuristic: open code copy_call callback of radix sort

The callback is trivial and we don't need the abstraction for our
purposes. Let's open code it.

Reviewed-by: Timofey Titovets <nefelim4ag@gmail.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/compression.c | 17 +++--------------
 1 file changed, 3 insertions(+), 14 deletions(-)

diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 37a69d4b04ce..935acabc0ea7 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -1311,13 +1311,6 @@ static u8 get4bits(u64 num, int shift) {
 	return low4bits;
 }
 
-static void copy_cell(void *dst, int dest_i, void *src, int src_i)
-{
-	struct bucket_item *dstv = (struct bucket_item *)dst;
-	struct bucket_item *srcv = (struct bucket_item *)src;
-	dstv[dest_i] = srcv[src_i];
-}
-
 /*
  * Use 4 bits as radix base
  * Use 16 u32 counters for calculating new possition in buf array
@@ -1326,13 +1319,10 @@ static void copy_cell(void *dst, int dest_i, void *src, int src_i)
  * @array_buf - buffer array to store sorting results
  *              must be equal in size to @array
  * @num       - array size
- * @copy_cell - function to copy data from array to array_buf and vice versa
  * @get4bits  - function to get 4 bits from number at specified offset
  */
 static void radix_sort(struct bucket_item *array, struct bucket_item *array_buf,
 		       int num,
-		       void (*copy_cell)(void *dest, int dest_i,
-					 void* src, int src_i),
 		       u8 (*get4bits)(u64 num, int shift))
 {
 	u64 max_num;
@@ -1376,7 +1366,7 @@ static void radix_sort(struct bucket_item *array, struct bucket_item *array_buf,
 			addr = get4bits(buf_num, shift);
 			counters[addr]--;
 			new_addr = counters[addr];
-			copy_cell(array_buf, new_addr, array, i);
+			array_buf[new_addr] = array[i];
 		}
 
 		shift += RADIX_BASE;
@@ -1403,7 +1393,7 @@ static void radix_sort(struct bucket_item *array, struct bucket_item *array_buf,
 			addr = get4bits(buf_num, shift);
 			counters[addr]--;
 			new_addr = counters[addr];
-			copy_cell(array, new_addr, array_buf, i);
+			array[new_addr] = array_buf[i];
 		}
 
 		shift += RADIX_BASE;
@@ -1437,8 +1427,7 @@ static int byte_core_set_size(struct heuristic_ws *ws)
 	struct bucket_item *bucket = ws->bucket;
 
 	/* Sort in reverse order */
-	radix_sort(ws->bucket, ws->bucket_b, BUCKET_SIZE, copy_cell,
-			get4bits);
+	radix_sort(ws->bucket, ws->bucket_b, BUCKET_SIZE, get4bits);
 
 	for (i = 0; i < BYTE_CORE_SET_LOW; i++)
 		coreset_sum += bucket[i].count;
-- 
cgit v1.2.3


From 36243c9199d6df63a0fbebd4fc49a1af21f3d8a8 Mon Sep 17 00:00:00 2001
From: David Sterba <dsterba@suse.com>
Date: Tue, 12 Dec 2017 20:35:02 +0100
Subject: btrfs: heuristic: call get4bits directly

As it's a single instance and local to the file, we don't need to pass
it as an argument.

Reviewed-by: Timofey Titovets <nefelim4ag@gmail.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/compression.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 935acabc0ea7..208334aa6c6e 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -1319,11 +1319,9 @@ static u8 get4bits(u64 num, int shift) {
  * @array_buf - buffer array to store sorting results
  *              must be equal in size to @array
  * @num       - array size
- * @get4bits  - function to get 4 bits from number at specified offset
  */
 static void radix_sort(struct bucket_item *array, struct bucket_item *array_buf,
-		       int num,
-		       u8 (*get4bits)(u64 num, int shift))
+		       int num)
 {
 	u64 max_num;
 	u64 buf_num;
@@ -1427,7 +1425,7 @@ static int byte_core_set_size(struct heuristic_ws *ws)
 	struct bucket_item *bucket = ws->bucket;
 
 	/* Sort in reverse order */
-	radix_sort(ws->bucket, ws->bucket_b, BUCKET_SIZE, get4bits);
+	radix_sort(ws->bucket, ws->bucket_b, BUCKET_SIZE);
 
 	for (i = 0; i < BYTE_CORE_SET_LOW; i++)
 		coreset_sum += bucket[i].count;
-- 
cgit v1.2.3


From 4271ecea64f40683d28d83ad433ddc43e5ca2ee9 Mon Sep 17 00:00:00 2001
From: Nikolay Borisov <nborisov@suse.com>
Date: Wed, 13 Dec 2017 09:38:14 +0200
Subject: btrfs: Improve btrfs_search_slot description

Signed-off-by: Nikolay Borisov <nborisov@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/ctree.c | 30 +++++++++++++++++++++---------
 1 file changed, 21 insertions(+), 9 deletions(-)

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 2a09577580b8..b88a79e69ddf 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -2654,17 +2654,29 @@ int btrfs_find_item(struct btrfs_root *fs_root, struct btrfs_path *path,
 }
 
 /*
- * look for key in the tree.  path is filled in with nodes along the way
- * if key is found, we return zero and you can find the item in the leaf
- * level of the path (level 0)
+ * btrfs_search_slot - look for a key in a tree and perform necessary
+ * modifications to preserve tree invariants.
  *
- * If the key isn't found, the path points to the slot where it should
- * be inserted, and 1 is returned.  If there are other errors during the
- * search a negative error number is returned.
+ * @trans:	Handle of transaction, used when modifying the tree
+ * @p:		Holds all btree nodes along the search path
+ * @root:	The root node of the tree
+ * @key:	The key we are looking for
+ * @ins_len:	Indicates purpose of search, for inserts it is 1, for
+ *		deletions it's -1. 0 for plain searches
+ * @cow:	boolean should CoW operations be performed. Must always be 1
+ *		when modifying the tree.
  *
- * if ins_len > 0, nodes and leaves will be split as we walk down the
- * tree.  if ins_len < 0, nodes will be merged as we walk down the tree (if
- * possible)
+ * If @ins_len > 0, nodes and leaves will be split as we walk down the tree.
+ * If @ins_len < 0, nodes will be merged as we walk down the tree (if possible)
+ *
+ * If @key is found, 0 is returned and you can find the item in the leaf level
+ * of the path (level 0)
+ *
+ * If @key isn't found, 1 is returned and the leaf level of the path (level 0)
+ * points to the slot where it should be inserted
+ *
+ * If an error is encountered while searching the tree a negative error number
+ * is returned
  */
 int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 		      const struct btrfs_key *key, struct btrfs_path *p,
-- 
cgit v1.2.3


From 32506af595dcaa8b71b7858bb54ccaa310d274fb Mon Sep 17 00:00:00 2001
From: Nikolay Borisov <nborisov@suse.com>
Date: Wed, 13 Dec 2017 10:25:37 +0200
Subject: btrfs: Remove redundant bio_get/set calls in compressed read/write
 paths

bio_get/set is necessary only if the bio is going to be referenced
following submissions. In the code paths where such calls are made
we don't really need them since the bio is referenced only if
btrfs_map_bio returns an error. And this function can return an error
prior to submission only. So referencing the bio is safe. Furthermore
we do call bio_endio which will consume the last reference. So let's
remove the redundant calls.

Signed-off-by: Nikolay Borisov <nborisov@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/compression.c | 12 ------------
 1 file changed, 12 deletions(-)

diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 208334aa6c6e..5abcc0461ee1 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -362,8 +362,6 @@ blk_status_t btrfs_submit_compressed_write(struct inode *inode, u64 start,
 		page->mapping = NULL;
 		if (submit || bio_add_page(bio, page, PAGE_SIZE, 0) <
 		    PAGE_SIZE) {
-			bio_get(bio);
-
 			/*
 			 * inc the count before we submit the bio so
 			 * we know the end IO handler won't happen before
@@ -386,8 +384,6 @@ blk_status_t btrfs_submit_compressed_write(struct inode *inode, u64 start,
 				bio_endio(bio);
 			}
 
-			bio_put(bio);
-
 			bio = btrfs_bio_alloc(bdev, first_byte);
 			bio->bi_opf = REQ_OP_WRITE | write_flags;
 			bio->bi_private = cb;
@@ -403,7 +399,6 @@ blk_status_t btrfs_submit_compressed_write(struct inode *inode, u64 start,
 		first_byte += PAGE_SIZE;
 		cond_resched();
 	}
-	bio_get(bio);
 
 	ret = btrfs_bio_wq_end_io(fs_info, bio, BTRFS_WQ_ENDIO_DATA);
 	BUG_ON(ret); /* -ENOMEM */
@@ -419,7 +414,6 @@ blk_status_t btrfs_submit_compressed_write(struct inode *inode, u64 start,
 		bio_endio(bio);
 	}
 
-	bio_put(bio);
 	return 0;
 }
 
@@ -652,8 +646,6 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
 		page->mapping = NULL;
 		if (submit || bio_add_page(comp_bio, page, PAGE_SIZE, 0) <
 		    PAGE_SIZE) {
-			bio_get(comp_bio);
-
 			ret = btrfs_bio_wq_end_io(fs_info, comp_bio,
 						  BTRFS_WQ_ENDIO_DATA);
 			BUG_ON(ret); /* -ENOMEM */
@@ -680,8 +672,6 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
 				bio_endio(comp_bio);
 			}
 
-			bio_put(comp_bio);
-
 			comp_bio = btrfs_bio_alloc(bdev, cur_disk_byte);
 			bio_set_op_attrs(comp_bio, REQ_OP_READ, 0);
 			comp_bio->bi_private = cb;
@@ -691,7 +681,6 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
 		}
 		cur_disk_byte += PAGE_SIZE;
 	}
-	bio_get(comp_bio);
 
 	ret = btrfs_bio_wq_end_io(fs_info, comp_bio, BTRFS_WQ_ENDIO_DATA);
 	BUG_ON(ret); /* -ENOMEM */
@@ -707,7 +696,6 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
 		bio_endio(comp_bio);
 	}
 
-	bio_put(comp_bio);
 	return 0;
 
 fail2:
-- 
cgit v1.2.3


From ea057f6dafcb6d21dd2c8be0e45ba3979b98e2b3 Mon Sep 17 00:00:00 2001
From: Nikolay Borisov <nborisov@suse.com>
Date: Wed, 13 Dec 2017 10:25:38 +0200
Subject: btrfs: Remove redundant bio_get/set from submit_dio_repair_bio

The bio that is passsed is the newly created repair bio which already
has a reference count of 1, which is going to be consumed by the
endio routine on successful submission. On error the handler also
calls bio_put.

Signed-off-by: Nikolay Borisov <nborisov@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/inode.c | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 46df5e2a64e7..bacf71ccf0f2 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -7941,15 +7941,12 @@ static inline blk_status_t submit_dio_repair_bio(struct inode *inode,
 
 	BUG_ON(bio_op(bio) == REQ_OP_WRITE);
 
-	bio_get(bio);
-
 	ret = btrfs_bio_wq_end_io(fs_info, bio, BTRFS_WQ_ENDIO_DIO_REPAIR);
 	if (ret)
-		goto err;
+		return ret;
 
 	ret = btrfs_map_bio(fs_info, bio, mirror_num, 0);
-err:
-	bio_put(bio);
+
 	return ret;
 }
 
-- 
cgit v1.2.3


From ffc9c8dd7d5c902b31a3caae3e55f019a85c90ea Mon Sep 17 00:00:00 2001
From: Nikolay Borisov <nborisov@suse.com>
Date: Wed, 13 Dec 2017 10:25:39 +0200
Subject: btrfs: Remove redundant bio_get/bio_set pair from submit_one_bio

The bio is never referenced after it has been submitted so there is no
point in getting an extra reference.

Signed-off-by: Nikolay Borisov <nborisov@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/extent_io.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index eee9cc5db9ff..c40591486f23 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -2733,7 +2733,6 @@ static int __must_check submit_one_bio(struct bio *bio, int mirror_num,
 	start = page_offset(page) + bvec->bv_offset;
 
 	bio->bi_private = NULL;
-	bio_get(bio);
 
 	if (tree->ops)
 		ret = tree->ops->submit_bio_hook(tree->private_data, bio,
@@ -2741,7 +2740,6 @@ static int __must_check submit_one_bio(struct bio *bio, int mirror_num,
 	else
 		btrfsic_submit_bio(bio);
 
-	bio_put(bio);
 	return blk_status_to_errno(ret);
 }
 
-- 
cgit v1.2.3


From 36f7894f66f286e454bf00d308b989c3423ba0da Mon Sep 17 00:00:00 2001
From: Nikolay Borisov <nborisov@suse.com>
Date: Wed, 13 Dec 2017 10:25:40 +0200
Subject: btrfs: Remove redundant pair of bio_get/set in __btrfs_submit_dio_bio

The bio is not referenced after it has been submitted and the endio is
going to consume the sole reference on successful submission. On error,
the callers of __btrfs_submit_dio_bio do invoke bio_put so we don't
leak it either.

Signed-off-by: Nikolay Borisov <nborisov@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/inode.c | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index bacf71ccf0f2..0c0e61c6b927 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -8449,8 +8449,6 @@ __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode, u64 file_offset,
 	if (async_submit)
 		async_submit = !atomic_read(&BTRFS_I(inode)->sync_writers);
 
-	bio_get(bio);
-
 	if (!write) {
 		ret = btrfs_bio_wq_end_io(fs_info, bio, BTRFS_WQ_ENDIO_DATA);
 		if (ret)
@@ -8483,7 +8481,6 @@ __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode, u64 file_offset,
 map:
 	ret = btrfs_map_bio(fs_info, bio, 0, 0);
 err:
-	bio_put(bio);
 	return ret;
 }
 
-- 
cgit v1.2.3


From 203e02d934ed0570551b87c8d0a0a9cf917487cb Mon Sep 17 00:00:00 2001
From: Liu Bo <bo.li.liu@oracle.com>
Date: Fri, 22 Dec 2017 16:23:01 -0700
Subject: Btrfs: remove unused wait in btrfs_stripe_hash

In fact nobody is waiting on @wait's waitqueue, it can be safely
removed.

Signed-off-by: Liu Bo <bo.li.liu@oracle.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/ctree.h  |  1 -
 fs/btrfs/raid56.c | 10 ----------
 2 files changed, 11 deletions(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 09b72b6996ce..1a462ab85c49 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -679,7 +679,6 @@ enum btrfs_orphan_cleanup_state {
 /* used by the raid56 code to lock stripes for read/modify/write */
 struct btrfs_stripe_hash {
 	struct list_head hash_list;
-	wait_queue_head_t wait;
 	spinlock_t lock;
 };
 
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index 3940906533a2..9fa45e061953 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -231,7 +231,6 @@ int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info)
 		cur = h + i;
 		INIT_LIST_HEAD(&cur->hash_list);
 		spin_lock_init(&cur->lock);
-		init_waitqueue_head(&cur->wait);
 	}
 
 	x = cmpxchg(&info->stripe_hash_table, NULL, table);
@@ -815,15 +814,6 @@ static noinline void unlock_stripe(struct btrfs_raid_bio *rbio)
 			}
 
 			goto done_nolock;
-			/*
-			 * The barrier for this waitqueue_active is not needed,
-			 * we're protected by h->lock and can't miss a wakeup.
-			 */
-		} else if (waitqueue_active(&h->wait)) {
-			spin_unlock(&rbio->bio_list_lock);
-			spin_unlock_irqrestore(&h->lock, flags);
-			wake_up(&h->wait);
-			goto done_nolock;
 		}
 	}
 done:
-- 
cgit v1.2.3


From e215772cd2abcf33adad8857c0f5a8214a6e7d22 Mon Sep 17 00:00:00 2001
From: "Misono, Tomohiro" <misono.tomohiro@jp.fujitsu.com>
Date: Thu, 14 Dec 2017 17:28:00 +0900
Subject: btrfs: cleanup unnecessary string dup in btrfs_parse_options()

Long ago, commit edf24abe51493 ("btrfs: sanity mount option parsing and
early mount code") split the btrfs_parse_options() into two parts
(btrfs_parse_early_options() and btrfs_parse_options()). As a result,
btrfs_parse_optins no longer gets called twice and is the last one to
parse mount option string. Therefore there is no need to dup it.

Signed-off-by: Tomohiro Misono <misono.tomohiro@jp.fujitsu.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/super.c | 13 +------------
 1 file changed, 1 insertion(+), 12 deletions(-)

diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 03367a64b5c1..1d33744a9326 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -414,7 +414,7 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
 			unsigned long new_flags)
 {
 	substring_t args[MAX_OPT_ARGS];
-	char *p, *num, *orig = NULL;
+	char *p, *num;
 	u64 cache_gen;
 	int intarg;
 	int ret = 0;
@@ -437,16 +437,6 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
 	if (!options)
 		goto check;
 
-	/*
-	 * strsep changes the string, duplicate it because parse_options
-	 * gets called twice
-	 */
-	options = kstrdup(options, GFP_KERNEL);
-	if (!options)
-		return -ENOMEM;
-
-	orig = options;
-
 	while ((p = strsep(&options, ",")) != NULL) {
 		int token;
 		if (!*p)
@@ -887,7 +877,6 @@ out:
 		btrfs_info(info, "disk space caching is enabled");
 	if (!ret && btrfs_test_opt(info, FREE_SPACE_TREE))
 		btrfs_info(info, "using free space tree");
-	kfree(orig);
 	return ret;
 }
 
-- 
cgit v1.2.3


From 5bedc48a8f9e1d62fb693c4171ffddb990d034f6 Mon Sep 17 00:00:00 2001
From: David Sterba <dsterba@suse.com>
Date: Tue, 2 Jan 2018 18:19:50 +0100
Subject: btrfs: drop unused parameters from mount_subvol

Recent patches reworking the mount path left some unused parameters. We
pass a vfsmount to mount_subvol, the flags and data (ie. mount options)
have been already applied and we will not need them.

Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/super.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 1d33744a9326..f40352843c0b 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -1397,8 +1397,7 @@ static inline int is_subvolume_inode(struct inode *inode)
 }
 
 static struct dentry *mount_subvol(const char *subvol_name, u64 subvol_objectid,
-				   int flags, const char *device_name,
-				   char *data, struct vfsmount *mnt)
+				   const char *device_name, struct vfsmount *mnt)
 {
 	struct dentry *root;
 	int ret;
@@ -1693,8 +1692,7 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
 	}
 
 	/* mount_subvol() will free subvol_name and mnt_root */
-	root = mount_subvol(subvol_name, subvol_objectid, flags, device_name,
-			data, mnt_root);
+	root = mount_subvol(subvol_name, subvol_objectid, device_name, mnt_root);
 
 out:
 	return root;
-- 
cgit v1.2.3


From d810a4be1a625aafb2602c56c1256047f1e27380 Mon Sep 17 00:00:00 2001
From: David Sterba <dsterba@suse.com>
Date: Thu, 7 Dec 2017 18:52:54 +0100
Subject: btrfs: add separate helper for unlock_extent_cached with GFP_ATOMIC

There's only one instance where we pass different gfp mask to
unlock_extent_cached. Add a separate helper for that and then we can
drop the gfp parameter from unlock_extent_cached.

Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/extent_io.c | 2 +-
 fs/btrfs/extent_io.h | 7 +++++++
 2 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index c40591486f23..87809458b3ac 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -2493,7 +2493,7 @@ endio_readpage_release_extent(struct extent_io_tree *tree, u64 start, u64 len,
 
 	if (uptodate && tree->track_uptodate)
 		set_extent_uptodate(tree, start, end, &cached, GFP_ATOMIC);
-	unlock_extent_cached(tree, start, end, &cached, GFP_ATOMIC);
+	unlock_extent_cached_atomic(tree, start, end, &cached);
 }
 
 /*
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index db2558b0cad4..978351e8e8dc 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -318,6 +318,13 @@ static inline int unlock_extent_cached(struct extent_io_tree *tree, u64 start,
 				mask, NULL);
 }
 
+static inline int unlock_extent_cached_atomic(struct extent_io_tree *tree,
+		u64 start, u64 end, struct extent_state **cached)
+{
+	return __clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, cached,
+				GFP_ATOMIC, NULL);
+}
+
 static inline int clear_extent_bits(struct extent_io_tree *tree, u64 start,
 		u64 end, unsigned bits)
 {
-- 
cgit v1.2.3


From e43bbe5e16d87b40f3b382b3a43b0142d6d1193d Mon Sep 17 00:00:00 2001
From: David Sterba <dsterba@suse.com>
Date: Tue, 12 Dec 2017 21:43:52 +0100
Subject: btrfs: sink unlock_extent parameter gfp_flags

All callers pass either GFP_NOFS or GFP_KERNEL now, so we can sink the
parameter to the function, though we lose some of the slightly better
semantics of GFP_KERNEL in some places, it's worth cleaning up the
callchains.

Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/disk-io.c          |  2 +-
 fs/btrfs/extent_io.c        | 10 ++++------
 fs/btrfs/extent_io.h        |  4 ++--
 fs/btrfs/file.c             | 15 +++++++--------
 fs/btrfs/free-space-cache.c |  5 ++---
 fs/btrfs/inode.c            | 26 ++++++++++++--------------
 fs/btrfs/ioctl.c            |  7 +++----
 fs/btrfs/scrub.c            |  3 +--
 8 files changed, 32 insertions(+), 40 deletions(-)

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 392d6cde4308..41d151b9ffd2 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -381,7 +381,7 @@ static int verify_parent_transid(struct extent_io_tree *io_tree,
 		clear_extent_buffer_uptodate(eb);
 out:
 	unlock_extent_cached(io_tree, eb->start, eb->start + eb->len - 1,
-			     &cached_state, GFP_NOFS);
+			     &cached_state);
 	if (need_lock)
 		btrfs_tree_read_unlock_blocking(eb);
 	return ret;
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 87809458b3ac..930c1ea77e91 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -1648,7 +1648,7 @@ again:
 			     EXTENT_DELALLOC, 1, cached_state);
 	if (!ret) {
 		unlock_extent_cached(tree, delalloc_start, delalloc_end,
-				     &cached_state, GFP_NOFS);
+				     &cached_state);
 		__unlock_for_delalloc(inode, locked_page,
 			      delalloc_start, delalloc_end);
 		cond_resched();
@@ -2941,8 +2941,7 @@ static int __do_readpage(struct extent_io_tree *tree,
 			set_extent_uptodate(tree, cur, cur + iosize - 1,
 					    &cached, GFP_NOFS);
 			unlock_extent_cached(tree, cur,
-					     cur + iosize - 1,
-					     &cached, GFP_NOFS);
+					     cur + iosize - 1, &cached);
 			break;
 		}
 		em = __get_extent_map(inode, page, pg_offset, cur,
@@ -3035,8 +3034,7 @@ static int __do_readpage(struct extent_io_tree *tree,
 			set_extent_uptodate(tree, cur, cur + iosize - 1,
 					    &cached, GFP_NOFS);
 			unlock_extent_cached(tree, cur,
-					     cur + iosize - 1,
-					     &cached, GFP_NOFS);
+					     cur + iosize - 1, &cached);
 			cur = cur + iosize;
 			pg_offset += iosize;
 			continue;
@@ -4621,7 +4619,7 @@ out_free:
 out:
 	btrfs_free_path(path);
 	unlock_extent_cached(&BTRFS_I(inode)->io_tree, start, start + len - 1,
-			     &cached_state, GFP_NOFS);
+			     &cached_state);
 	return ret;
 }
 
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 978351e8e8dc..72e5af2965a8 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -312,10 +312,10 @@ static inline int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end)
 }
 
 static inline int unlock_extent_cached(struct extent_io_tree *tree, u64 start,
-		u64 end, struct extent_state **cached, gfp_t mask)
+		u64 end, struct extent_state **cached)
 {
 	return __clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, cached,
-				mask, NULL);
+				GFP_NOFS, NULL);
 }
 
 static inline int unlock_extent_cached_atomic(struct extent_io_tree *tree,
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 1096398e1351..1ed2e6e9e204 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1504,7 +1504,7 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages,
 		    ordered->file_offset + ordered->len > start_pos &&
 		    ordered->file_offset <= last_pos) {
 			unlock_extent_cached(&inode->io_tree, start_pos,
-					last_pos, cached_state, GFP_NOFS);
+					last_pos, cached_state);
 			for (i = 0; i < num_pages; i++) {
 				unlock_page(pages[i]);
 				put_page(pages[i]);
@@ -1758,8 +1758,7 @@ again:
 						pos, copied, NULL);
 		if (extents_locked)
 			unlock_extent_cached(&BTRFS_I(inode)->io_tree,
-					     lockstart, lockend, &cached_state,
-					     GFP_NOFS);
+					     lockstart, lockend, &cached_state);
 		btrfs_delalloc_release_extents(BTRFS_I(inode), reserve_bytes);
 		if (ret) {
 			btrfs_drop_pages(pages, num_pages);
@@ -2600,7 +2599,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
 		if (ordered)
 			btrfs_put_ordered_extent(ordered);
 		unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart,
-				     lockend, &cached_state, GFP_NOFS);
+				     lockend, &cached_state);
 		ret = btrfs_wait_ordered_range(inode, lockstart,
 					       lockend - lockstart + 1);
 		if (ret) {
@@ -2751,7 +2750,7 @@ out_free:
 	btrfs_free_block_rsv(fs_info, rsv);
 out:
 	unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
-			     &cached_state, GFP_NOFS);
+			     &cached_state);
 out_only_mutex:
 	if (!updated_inode && truncated_block && !ret && !err) {
 		/*
@@ -2913,7 +2912,7 @@ static long btrfs_fallocate(struct file *file, int mode,
 			btrfs_put_ordered_extent(ordered);
 			unlock_extent_cached(&BTRFS_I(inode)->io_tree,
 					     alloc_start, locked_end,
-					     &cached_state, GFP_KERNEL);
+					     &cached_state);
 			/*
 			 * we can't wait on the range with the transaction
 			 * running or with the extent lock held
@@ -3015,7 +3014,7 @@ static long btrfs_fallocate(struct file *file, int mode,
 	}
 out_unlock:
 	unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
-			     &cached_state, GFP_KERNEL);
+			     &cached_state);
 out:
 	inode_unlock(inode);
 	/* Let go of our reservation. */
@@ -3088,7 +3087,7 @@ static int find_desired_extent(struct inode *inode, loff_t *offset, int whence)
 			*offset = min_t(loff_t, start, inode->i_size);
 	}
 	unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
-			     &cached_state, GFP_NOFS);
+			     &cached_state);
 	return ret;
 }
 
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index b8ab90c9a9fb..014f3c090231 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -1125,8 +1125,7 @@ cleanup_write_cache_enospc(struct inode *inode,
 {
 	io_ctl_drop_pages(io_ctl);
 	unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0,
-			     i_size_read(inode) - 1, cached_state,
-			     GFP_NOFS);
+			     i_size_read(inode) - 1, cached_state);
 }
 
 static int __btrfs_wait_cache_io(struct btrfs_root *root,
@@ -1320,7 +1319,7 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
 	io_ctl_drop_pages(io_ctl);
 
 	unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0,
-			     i_size_read(inode) - 1, &cached_state, GFP_NOFS);
+			     i_size_read(inode) - 1, &cached_state);
 
 	/*
 	 * at this point the pages are under IO and we're happy,
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 0c0e61c6b927..eebfe2615428 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -2100,7 +2100,7 @@ again:
 					PAGE_SIZE);
 	if (ordered) {
 		unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start,
-				     page_end, &cached_state, GFP_NOFS);
+				     page_end, &cached_state);
 		unlock_page(page);
 		btrfs_start_ordered_extent(inode, ordered, 1);
 		btrfs_put_ordered_extent(ordered);
@@ -2130,7 +2130,7 @@ again:
 	btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE);
 out:
 	unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start, page_end,
-			     &cached_state, GFP_NOFS);
+			     &cached_state);
 out_page:
 	unlock_page(page);
 	put_page(page);
@@ -2722,7 +2722,7 @@ out_free_path:
 	btrfs_end_transaction(trans);
 out_unlock:
 	unlock_extent_cached(&BTRFS_I(inode)->io_tree, lock_start, lock_end,
-			     &cached, GFP_NOFS);
+			     &cached);
 	iput(inode);
 	return ret;
 }
@@ -4812,7 +4812,7 @@ again:
 	ordered = btrfs_lookup_ordered_extent(inode, block_start);
 	if (ordered) {
 		unlock_extent_cached(io_tree, block_start, block_end,
-				     &cached_state, GFP_NOFS);
+				     &cached_state);
 		unlock_page(page);
 		put_page(page);
 		btrfs_start_ordered_extent(inode, ordered, 1);
@@ -4829,7 +4829,7 @@ again:
 					&cached_state, 0);
 	if (ret) {
 		unlock_extent_cached(io_tree, block_start, block_end,
-				     &cached_state, GFP_NOFS);
+				     &cached_state);
 		goto out_unlock;
 	}
 
@@ -4848,8 +4848,7 @@ again:
 	}
 	ClearPageChecked(page);
 	set_page_dirty(page);
-	unlock_extent_cached(io_tree, block_start, block_end, &cached_state,
-			     GFP_NOFS);
+	unlock_extent_cached(io_tree, block_start, block_end, &cached_state);
 
 out_unlock:
 	if (ret)
@@ -4950,7 +4949,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
 		if (!ordered)
 			break;
 		unlock_extent_cached(io_tree, hole_start, block_end - 1,
-				     &cached_state, GFP_NOFS);
+				     &cached_state);
 		btrfs_start_ordered_extent(inode, ordered, 1);
 		btrfs_put_ordered_extent(ordered);
 	}
@@ -5015,8 +5014,7 @@ next:
 			break;
 	}
 	free_extent_map(em);
-	unlock_extent_cached(io_tree, hole_start, block_end - 1, &cached_state,
-			     GFP_NOFS);
+	unlock_extent_cached(io_tree, hole_start, block_end - 1, &cached_state);
 	return err;
 }
 
@@ -7629,7 +7627,7 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
 			break;
 
 		unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
-				     cached_state, GFP_NOFS);
+				     cached_state);
 
 		if (ordered) {
 			/*
@@ -9116,7 +9114,7 @@ again:
 			PAGE_SIZE);
 	if (ordered) {
 		unlock_extent_cached(io_tree, page_start, page_end,
-				     &cached_state, GFP_NOFS);
+				     &cached_state);
 		unlock_page(page);
 		btrfs_start_ordered_extent(inode, ordered, 1);
 		btrfs_put_ordered_extent(ordered);
@@ -9149,7 +9147,7 @@ again:
 					&cached_state, 0);
 	if (ret) {
 		unlock_extent_cached(io_tree, page_start, page_end,
-				     &cached_state, GFP_NOFS);
+				     &cached_state);
 		ret = VM_FAULT_SIGBUS;
 		goto out_unlock;
 	}
@@ -9175,7 +9173,7 @@ again:
 	BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid;
 	BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->root->last_log_commit;
 
-	unlock_extent_cached(io_tree, page_start, page_end, &cached_state, GFP_NOFS);
+	unlock_extent_cached(io_tree, page_start, page_end, &cached_state);
 
 out_unlock:
 	if (!ret) {
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index be5bd81b3669..f6d4d5810cc1 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -977,7 +977,7 @@ static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start)
 		/* get the big lock and read metadata off disk */
 		lock_extent_bits(io_tree, start, end, &cached);
 		em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, start, len, 0);
-		unlock_extent_cached(io_tree, start, end, &cached, GFP_NOFS);
+		unlock_extent_cached(io_tree, start, end, &cached);
 
 		if (IS_ERR(em))
 			return NULL;
@@ -1128,7 +1128,7 @@ again:
 			ordered = btrfs_lookup_ordered_extent(inode,
 							      page_start);
 			unlock_extent_cached(tree, page_start, page_end,
-					     &cached_state, GFP_NOFS);
+					     &cached_state);
 			if (!ordered)
 				break;
 
@@ -1204,8 +1204,7 @@ again:
 			  &cached_state);
 
 	unlock_extent_cached(&BTRFS_I(inode)->io_tree,
-			     page_start, page_end - 1, &cached_state,
-			     GFP_NOFS);
+			     page_start, page_end - 1, &cached_state);
 
 	for (i = 0; i < i_done; i++) {
 		clear_page_dirty_for_io(pages[i]);
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index d766c73eb29a..ecfe3118d9dd 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -4468,8 +4468,7 @@ static int check_extent_to_block(struct btrfs_inode *inode, u64 start, u64 len,
 	free_extent_map(em);
 
 out_unlock:
-	unlock_extent_cached(io_tree, lockstart, lockend, &cached_state,
-			     GFP_NOFS);
+	unlock_extent_cached(io_tree, lockstart, lockend, &cached_state);
 	return ret;
 }
 
-- 
cgit v1.2.3


From 6528b99d3d20795ff947d9b3fd736affe901acef Mon Sep 17 00:00:00 2001
From: Anand Jain <Anand.Jain@oracle.com>
Date: Mon, 18 Dec 2017 17:08:59 +0800
Subject: btrfs: factor btrfs_check_rw_degradable() to check given device

Update btrfs_check_rw_degradable() to check against the given device if
its lost.

We can use this function to know if the volume is going to be in
degraded mode OR failed state, when the given device fails.  Which is
needed when we are handling the device failed state.

A preparatory patch does not affect the flow as such.

Signed-off-by: Anand Jain <anand.jain@oracle.com>
Reviewed-by: Qu Wenruo <wqu@suse.com>
[ enhance comment ]
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/disk-io.c |  4 ++--
 fs/btrfs/super.c   |  2 +-
 fs/btrfs/volumes.c | 10 ++++++++--
 fs/btrfs/volumes.h |  4 ++--
 4 files changed, 13 insertions(+), 7 deletions(-)

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 41d151b9ffd2..73ab44159d82 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -2866,7 +2866,7 @@ retry_root_backup:
 		goto fail_sysfs;
 	}
 
-	if (!sb_rdonly(sb) && !btrfs_check_rw_degradable(fs_info)) {
+	if (!sb_rdonly(sb) && !btrfs_check_rw_degradable(fs_info, NULL)) {
 		btrfs_warn(fs_info,
 		"writeable mount is not allowed due to too many missing devices");
 		goto fail_sysfs;
@@ -3369,7 +3369,7 @@ static blk_status_t wait_dev_flush(struct btrfs_device *device)
 
 static int check_barrier_error(struct btrfs_fs_info *fs_info)
 {
-	if (!btrfs_check_rw_degradable(fs_info))
+	if (!btrfs_check_rw_degradable(fs_info, NULL))
 		return -EIO;
 	return 0;
 }
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index f40352843c0b..8af7590a5638 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -1844,7 +1844,7 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
 			goto restore;
 		}
 
-		if (!btrfs_check_rw_degradable(fs_info)) {
+		if (!btrfs_check_rw_degradable(fs_info, NULL)) {
 			btrfs_warn(fs_info,
 				"too many missing devices, writeable remount is not allowed");
 			ret = -EACCES;
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 9a04245003ab..f85991a2585c 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -6851,10 +6851,13 @@ out_short_read:
 /*
  * Check if all chunks in the fs are OK for read-write degraded mount
  *
+ * If the @failing_dev is specified, it's accounted as missing.
+ *
  * Return true if all chunks meet the minimal RW mount requirements.
  * Return false if any chunk doesn't meet the minimal RW mount requirements.
  */
-bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info)
+bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info,
+					struct btrfs_device *failing_dev)
 {
 	struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
 	struct extent_map *em;
@@ -6886,9 +6889,12 @@ bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info)
 			    test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) ||
 			    dev->last_flush_error)
 				missing++;
+			else if (failing_dev && failing_dev == dev)
+				missing++;
 		}
 		if (missing > max_tolerated) {
-			btrfs_warn(fs_info,
+			if (!failing_dev)
+				btrfs_warn(fs_info,
 	"chunk %llu missing %d devices, max tolerance is %d for writeable mount",
 				   em->start, missing, max_tolerated);
 			free_extent_map(em);
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 3e3ae44b6ccc..a339bcf72feb 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -554,7 +554,7 @@ void btrfs_update_commit_device_bytes_used(struct btrfs_fs_info *fs_info,
 struct list_head *btrfs_get_fs_uuids(void);
 void btrfs_set_fs_info_ptr(struct btrfs_fs_info *fs_info);
 void btrfs_reset_fs_info_ptr(struct btrfs_fs_info *fs_info);
-
-bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info);
+bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info,
+					struct btrfs_device *failing_dev);
 
 #endif
-- 
cgit v1.2.3


From 762221f095e3932669093466aaf4b85ed9ad2ac1 Mon Sep 17 00:00:00 2001
From: Liu Bo <bo.li.liu@oracle.com>
Date: Tue, 2 Jan 2018 13:36:42 -0700
Subject: Btrfs: fix scrub to repair raid6 corruption

The raid6 corruption is that,
suppose that all disks can be read without problems and if the content
that was read out doesn't match its checksum, currently for raid6
btrfs at most retries twice,

- the 1st retry is to rebuild with all other stripes, it'll eventually
  be a raid5 xor rebuild,
- if the 1st fails, the 2nd retry will deliberately fail parity p so
  that it will do raid6 style rebuild,

however, the chances are that another non-parity stripe content also
has something corrupted, so that the above retries are not able to
return correct content.

We've fixed normal reads to rebuild raid6 correctly with more retries
in Patch "Btrfs: make raid6 rebuild retry more"[1], this is to fix
scrub to do the exactly same rebuild process.

[1]: https://patchwork.kernel.org/patch/10091755/

Signed-off-by: Liu Bo <bo.li.liu@oracle.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/scrub.c | 44 ++++++++++++++++++++++++++++++++------------
 1 file changed, 32 insertions(+), 12 deletions(-)

diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index ecfe3118d9dd..b5b299b7113f 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -301,6 +301,11 @@ static void __scrub_blocked_if_needed(struct btrfs_fs_info *fs_info);
 static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info);
 static void scrub_put_ctx(struct scrub_ctx *sctx);
 
+static inline int scrub_is_page_on_raid56(struct scrub_page *page)
+{
+	return page->recover &&
+	       (page->recover->bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK);
+}
 
 static void scrub_pending_bio_inc(struct scrub_ctx *sctx)
 {
@@ -1323,15 +1328,34 @@ nodatasum_case:
 	 * could happen otherwise that a correct page would be
 	 * overwritten by a bad one).
 	 */
-	for (mirror_index = 0;
-	     mirror_index < BTRFS_MAX_MIRRORS &&
-	     sblocks_for_recheck[mirror_index].page_count > 0;
-	     mirror_index++) {
+	for (mirror_index = 0; ;mirror_index++) {
 		struct scrub_block *sblock_other;
 
 		if (mirror_index == failed_mirror_index)
 			continue;
-		sblock_other = sblocks_for_recheck + mirror_index;
+
+		/* raid56's mirror can be more than BTRFS_MAX_MIRRORS */
+		if (!scrub_is_page_on_raid56(sblock_bad->pagev[0])) {
+			if (mirror_index >= BTRFS_MAX_MIRRORS)
+				break;
+			if (!sblocks_for_recheck[mirror_index].page_count)
+				break;
+
+			sblock_other = sblocks_for_recheck + mirror_index;
+		} else {
+			struct scrub_recover *r = sblock_bad->pagev[0]->recover;
+			int max_allowed = r->bbio->num_stripes -
+						r->bbio->num_tgtdevs;
+
+			if (mirror_index >= max_allowed)
+				break;
+			if (!sblocks_for_recheck[1].page_count)
+				break;
+
+			ASSERT(failed_mirror_index == 0);
+			sblock_other = sblocks_for_recheck + 1;
+			sblock_other->pagev[0]->mirror_num = 1 + mirror_index;
+		}
 
 		/* build and submit the bios, check checksums */
 		scrub_recheck_block(fs_info, sblock_other, 0);
@@ -1671,26 +1695,22 @@ static void scrub_bio_wait_endio(struct bio *bio)
 	complete(bio->bi_private);
 }
 
-static inline int scrub_is_page_on_raid56(struct scrub_page *page)
-{
-	return page->recover &&
-	       (page->recover->bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK);
-}
-
 static int scrub_submit_raid56_bio_wait(struct btrfs_fs_info *fs_info,
 					struct bio *bio,
 					struct scrub_page *page)
 {
 	DECLARE_COMPLETION_ONSTACK(done);
 	int ret;
+	int mirror_num;
 
 	bio->bi_iter.bi_sector = page->logical >> 9;
 	bio->bi_private = &done;
 	bio->bi_end_io = scrub_bio_wait_endio;
 
+	mirror_num = page->sblock->pagev[0]->mirror_num;
 	ret = raid56_parity_recover(fs_info, bio, page->recover->bbio,
 				    page->recover->map_length,
-				    page->mirror_num, 0);
+				    mirror_num, 0);
 	if (ret)
 		return ret;
 
-- 
cgit v1.2.3


From 8810f7517a3bc4ca2d41d022446d3f5fd6b77c09 Mon Sep 17 00:00:00 2001
From: Liu Bo <bo.li.liu@oracle.com>
Date: Tue, 2 Jan 2018 13:36:41 -0700
Subject: Btrfs: make raid6 rebuild retry more

There is a scenario that can end up with rebuild process failing to
return good content, i.e.
suppose that all disks can be read without problems and if the content
that was read out doesn't match its checksum, currently for raid6
btrfs at most retries twice,

- the 1st retry is to rebuild with all other stripes, it'll eventually
  be a raid5 xor rebuild,
- if the 1st fails, the 2nd retry will deliberately fail parity p so
  that it will do raid6 style rebuild,

however, the chances are that another non-parity stripe content also
has something corrupted, so that the above retries are not able to
return correct content, and users will think of this as data loss.
More seriouly, if the loss happens on some important internal btree
roots, it could refuse to mount.

This extends btrfs to do more retries and each retry fails only one
stripe.  Since raid6 can tolerate 2 disk failures, if there is one
more failure besides the failure on which we're recovering, this can
always work.

The worst case is to retry as many times as the number of raid6 disks,
but given the fact that such a scenario is really rare in practice,
it's still acceptable.

Signed-off-by: Liu Bo <bo.li.liu@oracle.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/raid56.c  | 18 ++++++++++++++----
 fs/btrfs/volumes.c |  9 ++++++++-
 2 files changed, 22 insertions(+), 5 deletions(-)

diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index 9fa45e061953..a1dfdd35e7ab 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -2159,11 +2159,21 @@ int raid56_parity_recover(struct btrfs_fs_info *fs_info, struct bio *bio,
 	}
 
 	/*
-	 * reconstruct from the q stripe if they are
-	 * asking for mirror 3
+	 * Loop retry:
+	 * for 'mirror == 2', reconstruct from all other stripes.
+	 * for 'mirror_num > 2', select a stripe to fail on every retry.
 	 */
-	if (mirror_num == 3)
-		rbio->failb = rbio->real_stripes - 2;
+	if (mirror_num > 2) {
+		/*
+		 * 'mirror == 3' is to fail the p stripe and
+		 * reconstruct from the q stripe.  'mirror > 3' is to
+		 * fail a data stripe and reconstruct from p+q stripe.
+		 */
+		rbio->failb = rbio->real_stripes - (mirror_num - 1);
+		ASSERT(rbio->failb > 0);
+		if (rbio->failb <= rbio->faila)
+			rbio->failb--;
+	}
 
 	ret = lock_stripe_add(rbio);
 
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index f85991a2585c..7ebf53cb4269 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -5167,7 +5167,14 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
 	else if (map->type & BTRFS_BLOCK_GROUP_RAID5)
 		ret = 2;
 	else if (map->type & BTRFS_BLOCK_GROUP_RAID6)
-		ret = 3;
+		/*
+		 * There could be two corrupted data stripes, we need
+		 * to loop retry in order to rebuild the correct data.
+		 * 
+		 * Fail a stripe at a time on every retry except the
+		 * stripe under reconstruction.
+		 */
+		ret = map->num_stripes;
 	else
 		ret = 1;
 	free_extent_map(em);
-- 
cgit v1.2.3


From 61ecda68652591c3a7131e6bdb51639612a1244c Mon Sep 17 00:00:00 2001
From: Anand Jain <Anand.Jain@oracle.com>
Date: Thu, 4 Jan 2018 18:01:54 +0800
Subject: btrfs: remove check for BTRFS_FS_STATE_ERROR which we just set

__btrfs_handle_fs_error() sets BTRFS_FS_STATE_ERROR, and calls
btrfs_handle_error() so no need to check if the BTRFS_FS_STATE_ERROR
is set in btrfs_handle_error(). And there is no other user of
btrfs_handle_error() as well.

Signed-off-by: Anand Jain <anand.jain@oracle.com>
Reviewed-by: Nikolay Borisov <nborisov@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/super.c | 26 ++++++++++++--------------
 1 file changed, 12 insertions(+), 14 deletions(-)

diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 8af7590a5638..786e8bc04f9c 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -115,20 +115,18 @@ static void btrfs_handle_error(struct btrfs_fs_info *fs_info)
 	if (sb_rdonly(sb))
 		return;
 
-	if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) {
-		sb->s_flags |= SB_RDONLY;
-		btrfs_info(fs_info, "forced readonly");
-		/*
-		 * Note that a running device replace operation is not
-		 * canceled here although there is no way to update
-		 * the progress. It would add the risk of a deadlock,
-		 * therefore the canceling is omitted. The only penalty
-		 * is that some I/O remains active until the procedure
-		 * completes. The next time when the filesystem is
-		 * mounted writeable again, the device replace
-		 * operation continues.
-		 */
-	}
+	sb->s_flags |= SB_RDONLY;
+	btrfs_info(fs_info, "forced readonly");
+	/*
+	 * Note that a running device replace operation is not
+	 * canceled here although there is no way to update
+	 * the progress. It would add the risk of a deadlock,
+	 * therefore the canceling is omitted. The only penalty
+	 * is that some I/O remains active until the procedure
+	 * completes. The next time when the filesystem is
+	 * mounted writeable again, the device replace
+	 * operation continues.
+	 */
 }
 
 /*
-- 
cgit v1.2.3


From 922ea8994a39ae56d1babbdd98228aef0543bf52 Mon Sep 17 00:00:00 2001
From: Anand Jain <Anand.Jain@oracle.com>
Date: Thu, 4 Jan 2018 18:01:55 +0800
Subject: btrfS: collapse btrfs_handle_error() into __btrfs_handle_fs_error()

There is no other consumer for btrfs_handle_error() other than
__btrfs_handle_fs_error(), further this function quite small.
Merge it into its parent.

Signed-off-by: Anand Jain <anand.jain@oracle.com>
Reviewed-by: Nikolay Borisov <nborisov@suse.com>
[ reformat comment ]
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/super.c | 41 +++++++++++++++++------------------------
 1 file changed, 17 insertions(+), 24 deletions(-)

diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 786e8bc04f9c..6e71a2a78363 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -107,28 +107,6 @@ const char *btrfs_decode_error(int errno)
 	return errstr;
 }
 
-/* btrfs handle error by forcing the filesystem readonly */
-static void btrfs_handle_error(struct btrfs_fs_info *fs_info)
-{
-	struct super_block *sb = fs_info->sb;
-
-	if (sb_rdonly(sb))
-		return;
-
-	sb->s_flags |= SB_RDONLY;
-	btrfs_info(fs_info, "forced readonly");
-	/*
-	 * Note that a running device replace operation is not
-	 * canceled here although there is no way to update
-	 * the progress. It would add the risk of a deadlock,
-	 * therefore the canceling is omitted. The only penalty
-	 * is that some I/O remains active until the procedure
-	 * completes. The next time when the filesystem is
-	 * mounted writeable again, the device replace
-	 * operation continues.
-	 */
-}
-
 /*
  * __btrfs_handle_fs_error decodes expected errors from the caller and
  * invokes the approciate error response.
@@ -175,8 +153,23 @@ void __btrfs_handle_fs_error(struct btrfs_fs_info *fs_info, const char *function
 	set_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state);
 
 	/* Don't go through full error handling during mount */
-	if (sb->s_flags & SB_BORN)
-		btrfs_handle_error(fs_info);
+	if (!(sb->s_flags & SB_BORN))
+		return;
+
+	if (sb_rdonly(sb))
+		return;
+
+	/* btrfs handle error by forcing the filesystem readonly */
+	sb->s_flags |= SB_RDONLY;
+	btrfs_info(fs_info, "forced readonly");
+	/*
+	 * Note that a running device replace operation is not canceled here
+	 * although there is no way to update the progress. It would add the
+	 * risk of a deadlock, therefore the canceling is omitted. The only
+	 * penalty is that some I/O remains active until the procedure
+	 * completes. The next time when the filesystem is mounted writeable
+	 * again, the device replace operation continues.
+	 */
 }
 
 #ifdef CONFIG_PRINTK
-- 
cgit v1.2.3


From cadbc0a0670e32e42282b388ac8b7539100ac0f2 Mon Sep 17 00:00:00 2001
From: Anand Jain <anand.jain@oracle.com>
Date: Wed, 3 Jan 2018 16:08:30 +0800
Subject: btrfs: rename btrfs_device::scrub_device to scrub_ctx

btrfs_device::scrub_device is not a device which is being scrubbed,
but it holds the scrub context, so rename to reflect the same. No
functional changes here.

Signed-off-by: Anand Jain <anand.jain@oracle.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/scrub.c   | 14 +++++++-------
 fs/btrfs/volumes.h |  2 +-
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index b5b299b7113f..ec56f33feea9 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -4145,7 +4145,7 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
 	}
 
 	btrfs_dev_replace_lock(&fs_info->dev_replace, 0);
-	if (dev->scrub_device ||
+	if (dev->scrub_ctx ||
 	    (!is_dev_replace &&
 	     btrfs_dev_replace_is_ongoing(&fs_info->dev_replace))) {
 		btrfs_dev_replace_unlock(&fs_info->dev_replace, 0);
@@ -4170,7 +4170,7 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
 		return PTR_ERR(sctx);
 	}
 	sctx->readonly = readonly;
-	dev->scrub_device = sctx;
+	dev->scrub_ctx = sctx;
 	mutex_unlock(&fs_info->fs_devices->device_list_mutex);
 
 	/*
@@ -4205,7 +4205,7 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
 		memcpy(progress, &sctx->stat, sizeof(*progress));
 
 	mutex_lock(&fs_info->scrub_lock);
-	dev->scrub_device = NULL;
+	dev->scrub_ctx = NULL;
 	scrub_workers_put(fs_info);
 	mutex_unlock(&fs_info->scrub_lock);
 
@@ -4262,16 +4262,16 @@ int btrfs_scrub_cancel_dev(struct btrfs_fs_info *fs_info,
 	struct scrub_ctx *sctx;
 
 	mutex_lock(&fs_info->scrub_lock);
-	sctx = dev->scrub_device;
+	sctx = dev->scrub_ctx;
 	if (!sctx) {
 		mutex_unlock(&fs_info->scrub_lock);
 		return -ENOTCONN;
 	}
 	atomic_inc(&sctx->cancel_req);
-	while (dev->scrub_device) {
+	while (dev->scrub_ctx) {
 		mutex_unlock(&fs_info->scrub_lock);
 		wait_event(fs_info->scrub_pause_wait,
-			   dev->scrub_device == NULL);
+			   dev->scrub_ctx == NULL);
 		mutex_lock(&fs_info->scrub_lock);
 	}
 	mutex_unlock(&fs_info->scrub_lock);
@@ -4288,7 +4288,7 @@ int btrfs_scrub_progress(struct btrfs_fs_info *fs_info, u64 devid,
 	mutex_lock(&fs_info->fs_devices->device_list_mutex);
 	dev = btrfs_find_device(fs_info, devid, NULL, NULL);
 	if (dev)
-		sctx = dev->scrub_device;
+		sctx = dev->scrub_ctx;
 	if (sctx)
 		memcpy(progress, &sctx->stat, sizeof(*progress));
 	mutex_unlock(&fs_info->fs_devices->device_list_mutex);
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index a339bcf72feb..15216fed918b 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -131,7 +131,7 @@ struct btrfs_device {
 	struct completion flush_wait;
 
 	/* per-device scrub information */
-	struct scrub_ctx *scrub_device;
+	struct scrub_ctx *scrub_ctx;
 
 	struct btrfs_work work;
 	struct rcu_head rcu;
-- 
cgit v1.2.3


From c1f32b7c1f3be98386f3d268b786660030a44437 Mon Sep 17 00:00:00 2001
From: Anand Jain <Anand.Jain@oracle.com>
Date: Wed, 20 Dec 2017 14:42:26 +0800
Subject: btrfs: simplify mutex unlocking code in btrfs_commit_transaction

No functional change rearrange the mutex_unlock.

Signed-off-by: Anand Jain <anand.jain@oracle.com>
Reviewed-by: Nikolay Borisov <nborisov@suse.com>
[ edit subject ]
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/transaction.c | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 6348573e26a7..04f07144b45c 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -2265,16 +2265,13 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
 	}
 
 	ret = write_all_supers(fs_info, 0);
-	if (ret) {
-		mutex_unlock(&fs_info->tree_log_mutex);
-		goto scrub_continue;
-	}
-
 	/*
 	 * the super is written, we can safely allow the tree-loggers
 	 * to go about their business
 	 */
 	mutex_unlock(&fs_info->tree_log_mutex);
+	if (ret)
+		goto scrub_continue;
 
 	btrfs_finish_extent_commit(trans, fs_info);
 
-- 
cgit v1.2.3


From 05a5c55dfc308544200752f2f0860c6bdd9b7281 Mon Sep 17 00:00:00 2001
From: Anand Jain <anand.jain@oracle.com>
Date: Fri, 15 Dec 2017 15:40:16 +0800
Subject: btrfs: minor style cleanups in btrfs_scan_one_device

Assign ret = -EINVAL where it is actually required.
Remove { } around single line if else code.

Signed-off-by: Anand Jain <anand.jain@oracle.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/volumes.c | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 7ebf53cb4269..2333d48ca1d7 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -1186,7 +1186,7 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
 	struct btrfs_super_block *disk_super;
 	struct block_device *bdev;
 	struct page *page;
-	int ret = -EINVAL;
+	int ret;
 	u64 devid;
 	u64 transid;
 	u64 total_devices;
@@ -1208,8 +1208,10 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
 		goto error;
 	}
 
-	if (btrfs_read_disk_super(bdev, bytenr, &page, &disk_super))
+	if (btrfs_read_disk_super(bdev, bytenr, &page, &disk_super)) {
+		ret = -EINVAL;
 		goto error_bdev_put;
+	}
 
 	devid = btrfs_stack_device_id(&disk_super->dev_item);
 	transid = btrfs_super_generation(disk_super);
@@ -1217,11 +1219,10 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
 
 	ret = device_list_add(path, disk_super, devid, fs_devices_ret);
 	if (ret > 0) {
-		if (disk_super->label[0]) {
+		if (disk_super->label[0])
 			pr_info("BTRFS: device label %s ", disk_super->label);
-		} else {
+		else
 			pr_info("BTRFS: device fsid %pU ", disk_super->fsid);
-		}
 
 		pr_cont("devid %llu transid %llu %s\n", devid, transid, path);
 		ret = 0;
-- 
cgit v1.2.3


From db34be19c4f0f5512a4adf63713e4f5494618ead Mon Sep 17 00:00:00 2001
From: Liu Bo <bo.li.liu@oracle.com>
Date: Mon, 4 Dec 2017 15:40:35 -0700
Subject: Btrfs: remove redundant check in rbio_can_merge

Given the above
'
if (last->operation != cur->operation)
	return 0;
',
it's guaranteed that two operations are same.

Signed-off-by: Liu Bo <bo.li.liu@oracle.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/raid56.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index a1dfdd35e7ab..945e940d2368 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -594,12 +594,10 @@ static int rbio_can_merge(struct btrfs_raid_bio *last,
 	 * bio list here, anyone else that wants to
 	 * change this stripe needs to do their own rmw.
 	 */
-	if (last->operation == BTRFS_RBIO_PARITY_SCRUB ||
-	    cur->operation == BTRFS_RBIO_PARITY_SCRUB)
+	if (last->operation == BTRFS_RBIO_PARITY_SCRUB)
 		return 0;
 
-	if (last->operation == BTRFS_RBIO_REBUILD_MISSING ||
-	    cur->operation == BTRFS_RBIO_REBUILD_MISSING)
+	if (last->operation == BTRFS_RBIO_REBUILD_MISSING)
 		return 0;
 
 	return 1;
-- 
cgit v1.2.3


From cc54ff626a56fdd7c22febd55ed24676fda8ed83 Mon Sep 17 00:00:00 2001
From: Liu Bo <bo.li.liu@oracle.com>
Date: Mon, 11 Dec 2017 14:56:31 -0700
Subject: Btrfs: do not merge rbios if their fail stripe index are not
 identical

Since fail stripe index in rbio would be used to decide which
algorithm reconstruction would be run, we cannot merge rbios if
their's fail striped indexes are different, otherwise, one of the two
reconstructions would fail.

Signed-off-by: Liu Bo <bo.li.liu@oracle.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/raid56.c | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index 945e940d2368..7b382eeada72 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -600,6 +600,25 @@ static int rbio_can_merge(struct btrfs_raid_bio *last,
 	if (last->operation == BTRFS_RBIO_REBUILD_MISSING)
 		return 0;
 
+	if (last->operation == BTRFS_RBIO_READ_REBUILD) {
+		int fa = last->faila;
+		int fb = last->failb;
+		int cur_fa = cur->faila;
+		int cur_fb = cur->failb;
+
+		if (last->faila >= last->failb) {
+			fa = last->failb;
+			fb = last->faila;
+		}
+
+		if (cur->faila >= cur->failb) {
+			cur_fa = cur->failb;
+			cur_fb = cur->faila;
+		}
+
+		if (fa != cur_fa || fb != cur_fb)
+			return 0;
+	}
 	return 1;
 }
 
-- 
cgit v1.2.3


From f27451f229966874a8793995b8e6b74326d125df Mon Sep 17 00:00:00 2001
From: Filipe Manana <fdmanana@suse.com>
Date: Wed, 25 Oct 2017 11:55:28 +0100
Subject: Btrfs: add support for fallocate's zero range operation

This implements support the zero range operation of fallocate. For now
at least it's as simple as possible while reusing most of the existing
fallocate and hole punching infrastructure.

Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/file.c | 338 +++++++++++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 276 insertions(+), 62 deletions(-)

diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 1ed2e6e9e204..16c8031db645 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -2458,6 +2458,46 @@ static int find_first_non_hole(struct inode *inode, u64 *start, u64 *len)
 	return ret;
 }
 
+static int btrfs_punch_hole_lock_range(struct inode *inode,
+				       const u64 lockstart,
+				       const u64 lockend,
+				       struct extent_state **cached_state)
+{
+	while (1) {
+		struct btrfs_ordered_extent *ordered;
+		int ret;
+
+		truncate_pagecache_range(inode, lockstart, lockend);
+
+		lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
+				 cached_state);
+		ordered = btrfs_lookup_first_ordered_extent(inode, lockend);
+
+		/*
+		 * We need to make sure we have no ordered extents in this range
+		 * and nobody raced in and read a page in this range, if we did
+		 * we need to try again.
+		 */
+		if ((!ordered ||
+		    (ordered->file_offset + ordered->len <= lockstart ||
+		     ordered->file_offset > lockend)) &&
+		     !btrfs_page_exists_in_range(inode, lockstart, lockend)) {
+			if (ordered)
+				btrfs_put_ordered_extent(ordered);
+			break;
+		}
+		if (ordered)
+			btrfs_put_ordered_extent(ordered);
+		unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart,
+				     lockend, cached_state);
+		ret = btrfs_wait_ordered_range(inode, lockstart,
+					       lockend - lockstart + 1);
+		if (ret)
+			return ret;
+	}
+	return 0;
+}
+
 static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
 {
 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
@@ -2574,38 +2614,11 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
 		goto out_only_mutex;
 	}
 
-	while (1) {
-		struct btrfs_ordered_extent *ordered;
-
-		truncate_pagecache_range(inode, lockstart, lockend);
-
-		lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
-				 &cached_state);
-		ordered = btrfs_lookup_first_ordered_extent(inode, lockend);
-
-		/*
-		 * We need to make sure we have no ordered extents in this range
-		 * and nobody raced in and read a page in this range, if we did
-		 * we need to try again.
-		 */
-		if ((!ordered ||
-		    (ordered->file_offset + ordered->len <= lockstart ||
-		     ordered->file_offset > lockend)) &&
-		     !btrfs_page_exists_in_range(inode, lockstart, lockend)) {
-			if (ordered)
-				btrfs_put_ordered_extent(ordered);
-			break;
-		}
-		if (ordered)
-			btrfs_put_ordered_extent(ordered);
-		unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart,
-				     lockend, &cached_state);
-		ret = btrfs_wait_ordered_range(inode, lockstart,
-					       lockend - lockstart + 1);
-		if (ret) {
-			inode_unlock(inode);
-			return ret;
-		}
+	ret = btrfs_punch_hole_lock_range(inode, lockstart, lockend,
+					  &cached_state);
+	if (ret) {
+		inode_unlock(inode);
+		goto out_only_mutex;
 	}
 
 	path = btrfs_alloc_path();
@@ -2814,6 +2827,217 @@ insert:
 	return 0;
 }
 
+static int btrfs_fallocate_update_isize(struct inode *inode,
+					const u64 end,
+					const int mode)
+{
+	struct btrfs_trans_handle *trans;
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	int ret;
+	int ret2;
+
+	if (mode & FALLOC_FL_KEEP_SIZE || end <= i_size_read(inode))
+		return 0;
+
+	trans = btrfs_start_transaction(root, 1);
+	if (IS_ERR(trans))
+		return PTR_ERR(trans);
+
+	inode->i_ctime = current_time(inode);
+	i_size_write(inode, end);
+	btrfs_ordered_update_i_size(inode, end, NULL);
+	ret = btrfs_update_inode(trans, root, inode);
+	ret2 = btrfs_end_transaction(trans);
+
+	return ret ? ret : ret2;
+}
+
+static int btrfs_zero_range_check_range_boundary(struct inode *inode,
+						 u64 offset)
+{
+	const u64 sectorsize = btrfs_inode_sectorsize(inode);
+	struct extent_map *em;
+	int ret = 0;
+
+	offset = round_down(offset, sectorsize);
+	em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0);
+	if (IS_ERR(em))
+		return PTR_ERR(em);
+
+	if (em->block_start == EXTENT_MAP_HOLE)
+		ret = 1;
+
+	free_extent_map(em);
+	return ret;
+}
+
+static int btrfs_zero_range(struct inode *inode,
+			    loff_t offset,
+			    loff_t len,
+			    const int mode)
+{
+	struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
+	struct extent_map *em;
+	struct extent_changeset *data_reserved = NULL;
+	int ret;
+	u64 alloc_hint = 0;
+	const u64 sectorsize = btrfs_inode_sectorsize(inode);
+	u64 alloc_start = round_down(offset, sectorsize);
+	u64 alloc_end = round_up(offset + len, sectorsize);
+	u64 bytes_to_reserve = 0;
+	bool space_reserved = false;
+
+	inode_dio_wait(inode);
+
+	em = btrfs_get_extent(BTRFS_I(inode), NULL, 0,
+			      alloc_start, alloc_end - alloc_start, 0);
+	if (IS_ERR(em)) {
+		ret = PTR_ERR(em);
+		goto out;
+	}
+
+	/*
+	 * Avoid hole punching and extent allocation for some cases. More cases
+	 * could be considered, but these are unlikely common and we keep things
+	 * as simple as possible for now. Also, intentionally, if the target
+	 * range contains one or more prealloc extents together with regular
+	 * extents and holes, we drop all the existing extents and allocate a
+	 * new prealloc extent, so that we get a larger contiguous disk extent.
+	 */
+	if (em->start <= alloc_start &&
+	    test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
+		const u64 em_end = em->start + em->len;
+
+		if (em_end >= offset + len) {
+			/*
+			 * The whole range is already a prealloc extent,
+			 * do nothing except updating the inode's i_size if
+			 * needed.
+			 */
+			free_extent_map(em);
+			ret = btrfs_fallocate_update_isize(inode, offset + len,
+							   mode);
+			goto out;
+		}
+		/*
+		 * Part of the range is already a prealloc extent, so operate
+		 * only on the remaining part of the range.
+		 */
+		alloc_start = em_end;
+		ASSERT(IS_ALIGNED(alloc_start, sectorsize));
+		len = offset + len - alloc_start;
+		offset = alloc_start;
+		alloc_hint = em->block_start + em->len;
+	}
+	free_extent_map(em);
+
+	if (BTRFS_BYTES_TO_BLKS(fs_info, offset) ==
+	    BTRFS_BYTES_TO_BLKS(fs_info, offset + len - 1)) {
+		em = btrfs_get_extent(BTRFS_I(inode), NULL, 0,
+				      alloc_start, sectorsize, 0);
+		if (IS_ERR(em)) {
+			ret = PTR_ERR(em);
+			goto out;
+		}
+
+		if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
+			free_extent_map(em);
+			ret = btrfs_fallocate_update_isize(inode, offset + len,
+							   mode);
+			goto out;
+		}
+		if (len < sectorsize && em->block_start != EXTENT_MAP_HOLE) {
+			free_extent_map(em);
+			ret = btrfs_truncate_block(inode, offset, len, 0);
+			if (!ret)
+				ret = btrfs_fallocate_update_isize(inode,
+								   offset + len,
+								   mode);
+			return ret;
+		}
+		free_extent_map(em);
+		alloc_start = round_down(offset, sectorsize);
+		alloc_end = alloc_start + sectorsize;
+		goto reserve_space;
+	}
+
+	alloc_start = round_up(offset, sectorsize);
+	alloc_end = round_down(offset + len, sectorsize);
+
+	/*
+	 * For unaligned ranges, check the pages at the boundaries, they might
+	 * map to an extent, in which case we need to partially zero them, or
+	 * they might map to a hole, in which case we need our allocation range
+	 * to cover them.
+	 */
+	if (!IS_ALIGNED(offset, sectorsize)) {
+		ret = btrfs_zero_range_check_range_boundary(inode, offset);
+		if (ret < 0)
+			goto out;
+		if (ret) {
+			alloc_start = round_down(offset, sectorsize);
+			ret = 0;
+		} else {
+			ret = btrfs_truncate_block(inode, offset, 0, 0);
+			if (ret)
+				goto out;
+		}
+	}
+
+	if (!IS_ALIGNED(offset + len, sectorsize)) {
+		ret = btrfs_zero_range_check_range_boundary(inode,
+							    offset + len);
+		if (ret < 0)
+			goto out;
+		if (ret) {
+			alloc_end = round_up(offset + len, sectorsize);
+			ret = 0;
+		} else {
+			ret = btrfs_truncate_block(inode, offset + len, 0, 1);
+			if (ret)
+				goto out;
+		}
+	}
+
+reserve_space:
+	if (alloc_start < alloc_end) {
+		struct extent_state *cached_state = NULL;
+		const u64 lockstart = alloc_start;
+		const u64 lockend = alloc_end - 1;
+
+		bytes_to_reserve = alloc_end - alloc_start;
+		ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode),
+						      bytes_to_reserve);
+		if (ret < 0)
+			goto out;
+		space_reserved = true;
+		ret = btrfs_qgroup_reserve_data(inode, &data_reserved,
+						alloc_start, bytes_to_reserve);
+		if (ret)
+			goto out;
+		ret = btrfs_punch_hole_lock_range(inode, lockstart, lockend,
+						  &cached_state);
+		if (ret)
+			goto out;
+		ret = btrfs_prealloc_file_range(inode, mode, alloc_start,
+						alloc_end - alloc_start,
+						i_blocksize(inode),
+						offset + len, &alloc_hint);
+		unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart,
+				     lockend, &cached_state);
+		/* btrfs_prealloc_file_range releases reserved space on error */
+		if (ret)
+			space_reserved = false;
+	}
+ out:
+	if (ret && space_reserved)
+		btrfs_free_reserved_data_space(inode, data_reserved,
+					       alloc_start, bytes_to_reserve);
+	extent_changeset_free(data_reserved);
+
+	return ret;
+}
+
 static long btrfs_fallocate(struct file *file, int mode,
 			    loff_t offset, loff_t len)
 {
@@ -2839,7 +3063,8 @@ static long btrfs_fallocate(struct file *file, int mode,
 	cur_offset = alloc_start;
 
 	/* Make sure we aren't being give some crap mode */
-	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
+	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |
+		     FALLOC_FL_ZERO_RANGE))
 		return -EOPNOTSUPP;
 
 	if (mode & FALLOC_FL_PUNCH_HOLE)
@@ -2850,10 +3075,12 @@ static long btrfs_fallocate(struct file *file, int mode,
 	 *
 	 * For qgroup space, it will be checked later.
 	 */
-	ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode),
-			alloc_end - alloc_start);
-	if (ret < 0)
-		return ret;
+	if (!(mode & FALLOC_FL_ZERO_RANGE)) {
+		ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode),
+						      alloc_end - alloc_start);
+		if (ret < 0)
+			return ret;
+	}
 
 	inode_lock(inode);
 
@@ -2895,6 +3122,12 @@ static long btrfs_fallocate(struct file *file, int mode,
 	if (ret)
 		goto out;
 
+	if (mode & FALLOC_FL_ZERO_RANGE) {
+		ret = btrfs_zero_range(inode, offset, len, mode);
+		inode_unlock(inode);
+		return ret;
+	}
+
 	locked_end = alloc_end - 1;
 	while (1) {
 		struct btrfs_ordered_extent *ordered;
@@ -2988,37 +3221,18 @@ static long btrfs_fallocate(struct file *file, int mode,
 	if (ret < 0)
 		goto out_unlock;
 
-	if (actual_end > inode->i_size &&
-	    !(mode & FALLOC_FL_KEEP_SIZE)) {
-		struct btrfs_trans_handle *trans;
-		struct btrfs_root *root = BTRFS_I(inode)->root;
-
-		/*
-		 * We didn't need to allocate any more space, but we
-		 * still extended the size of the file so we need to
-		 * update i_size and the inode item.
-		 */
-		trans = btrfs_start_transaction(root, 1);
-		if (IS_ERR(trans)) {
-			ret = PTR_ERR(trans);
-		} else {
-			inode->i_ctime = current_time(inode);
-			i_size_write(inode, actual_end);
-			btrfs_ordered_update_i_size(inode, actual_end, NULL);
-			ret = btrfs_update_inode(trans, root, inode);
-			if (ret)
-				btrfs_end_transaction(trans);
-			else
-				ret = btrfs_end_transaction(trans);
-		}
-	}
+	/*
+	 * We didn't need to allocate any more space, but we still extended the
+	 * size of the file so we need to update i_size and the inode item.
+	 */
+	ret = btrfs_fallocate_update_isize(inode, actual_end, mode);
 out_unlock:
 	unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
 			     &cached_state);
 out:
 	inode_unlock(inode);
 	/* Let go of our reservation. */
-	if (ret != 0)
+	if (ret != 0 && !(mode & FALLOC_FL_ZERO_RANGE))
 		btrfs_free_reserved_data_space(inode, data_reserved,
 				alloc_start, alloc_end - cur_offset);
 	extent_changeset_free(data_reserved);
-- 
cgit v1.2.3


From 94f450712ac9cb4e165b5115e5eb0ab10055a64b Mon Sep 17 00:00:00 2001
From: Filipe Manana <fdmanana@suse.com>
Date: Tue, 31 Oct 2017 17:59:54 +0000
Subject: Btrfs: use cached state when dirtying pages during buffered write

During a buffered IO write, we can have an extent state that we got when
we locked the range (if the range starts at an offset lower than eof), so
always pass it to btrfs_dirty_pages() so that setting the delalloc bit
in the range does not need to do a full search in the inode's io tree,
saving time and reducing the amount of time we hold the io tree's lock.

Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/file.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 16c8031db645..cba2ac371ce0 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1755,7 +1755,7 @@ again:
 
 		if (copied > 0)
 			ret = btrfs_dirty_pages(inode, pages, dirty_pages,
-						pos, copied, NULL);
+						pos, copied, &cached_state);
 		if (extents_locked)
 			unlock_extent_cached(&BTRFS_I(inode)->io_tree,
 					     lockstart, lockend, &cached_state);
-- 
cgit v1.2.3


From 9f13ce743b1bd4e764193980e6311bfcdf424bb2 Mon Sep 17 00:00:00 2001
From: Filipe Manana <fdmanana@suse.com>
Date: Thu, 18 Jan 2018 11:34:20 +0000
Subject: Btrfs: fix missing inode i_size update after zero range operation

For a fallocate's zero range operation that targets a range with an end
that is not aligned to the sector size, we can end up not updating the
inode's i_size. This happens when the last page of the range maps to an
unwritten (prealloc) extent and before that last page we have either a
hole or a written extent. This is because in this scenario we relied
on a call to btrfs_prealloc_file_range() to update the inode's i_size,
however it can only update the i_size to the "down aligned" end of the
range.

Example:

 $ mkfs.btrfs -f /dev/sdc
 $ mount /dev/sdc /mnt
 $ xfs_io -f -c "pwrite -S 0xff 0 428K" /mnt/foobar
 $ xfs_io -c "falloc -k 428K 4K" /mnt/foobar
 $ xfs_io -c "fzero 0 430K" /mnt/foobar
 $ du --bytes /mnt/foobar
 438272	/mnt/foobar

The inode's i_size was left as 428Kb (438272 bytes) when it should have
been updated to 430Kb (440320 bytes).
Fix this by always updating the inode's i_size explicitly after zeroing
the range.

Fixes: ba6d5887946ff86d93dc ("Btrfs: add support for fallocate's zero range operation")
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/file.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index cba2ac371ce0..baad81c1f9a3 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -3026,9 +3026,12 @@ reserve_space:
 		unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart,
 				     lockend, &cached_state);
 		/* btrfs_prealloc_file_range releases reserved space on error */
-		if (ret)
+		if (ret) {
 			space_reserved = false;
+			goto out;
+		}
 	}
+	ret = btrfs_fallocate_update_isize(inode, offset + len, mode);
  out:
 	if (ret && space_reserved)
 		btrfs_free_reserved_data_space(inode, data_reserved,
-- 
cgit v1.2.3


From 81fdf6382b3b92f6fc5f34f9c8cd9074b25f6c0e Mon Sep 17 00:00:00 2001
From: Filipe Manana <fdmanana@suse.com>
Date: Thu, 18 Jan 2018 11:34:31 +0000
Subject: Btrfs: fix space leak after fallocate and zero range operations

If we do a buffered write after a zero range operation that has an
unaligned (with the filesystem's sector size) end which also falls within
an unwritten (prealloc) extent that is currently beyond the inode's
i_size, and the zero range operation has the flag FALLOC_FL_KEEP_SIZE,
we end up leaking data and metadata space. This happens because when
zeroing a range we call btrfs_truncate_block(), which does delalloc
(loads the page and partially zeroes its content), and in the buffered
write path we only clear existing delalloc space reservation for the
range we are writing into if that range starts at an offset smaller then
the inode's i_size, which makes sense since we can not have delalloc
extents beyond the i_size, only unwritten extents are allowed.

Example reproducer:

 $ mkfs.btrfs -f /dev/sdb
 $ mount /dev/sdb /mnt
 $ xfs_io -f -c "falloc -k 428K 4K" /mnt/foobar
 $ xfs_io -c "fzero -k 0 430K" /mnt/foobar
 $ xfs_io -c "pwrite -S 0xaa 428K 4K" /mnt/foobar
 $ umount /mnt

After the unmount we get the metadata and data space leaks reported in
dmesg/syslog:

 [95794.602253] ------------[ cut here ]------------
 [95794.603322] WARNING: CPU: 0 PID: 31496 at fs/btrfs/inode.c:9561 btrfs_destroy_inode+0x4e/0x206 [btrfs]
 [95794.605167] Modules linked in: btrfs xfs ppdev ghash_clmulni_intel pcbc aesni_intel aes_x86_64 crypto_simd cryptd glue_helper parport_pc psmouse sg i2c_piix4 parport i2c_core evdev pcspkr button serio_raw sunrpc loop autofs4 ext4 crc16 mbcache jbd2 zstd_decompress zstd_compress xxhash raid10 raid456 async_raid6_recov async_memcpy async_pq async_xor async_tx xor raid6_pq libcrc32c crc32c_generic raid1 raid0 multipath linear md_mod sd_mod virtio_scsi ata_generic crc32c_intel ata_piix floppy virtio_pci virtio_ring virtio libata scsi_mod e1000 [last unloaded: btrfs]
 [95794.613000] CPU: 0 PID: 31496 Comm: umount Tainted: G        W       4.14.0-rc6-btrfs-next-54+ #1
 [95794.614448] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.10.2-0-g5f4c7b1-prebuilt.qemu-project.org 04/01/2014
 [95794.615972] task: ffff880075aa0240 task.stack: ffffc90001734000
 [95794.617114] RIP: 0010:btrfs_destroy_inode+0x4e/0x206 [btrfs]
 [95794.618001] RSP: 0018:ffffc90001737d00 EFLAGS: 00010202
 [95794.618721] RAX: 0000000000000000 RBX: ffff880070fa1418 RCX: ffffc90001737c7c
 [95794.619645] RDX: 0000000175aa0240 RSI: 0000000000000001 RDI: ffff880070fa1418
 [95794.620711] RBP: ffffc90001737d38 R08: 0000000000000000 R09: 0000000000000000
 [95794.621932] R10: ffffc90001737c48 R11: ffff88007123e158 R12: ffff880075b6a000
 [95794.623124] R13: ffff88006145c000 R14: ffff880070fa1418 R15: ffff880070c3b4a0
 [95794.624188] FS:  00007fa6793c92c0(0000) GS:ffff88023fc00000(0000) knlGS:0000000000000000
 [95794.625578] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
 [95794.626522] CR2: 000056338670d048 CR3: 00000000610dc005 CR4: 00000000001606f0
 [95794.627647] Call Trace:
 [95794.628128]  destroy_inode+0x3d/0x55
 [95794.628573]  evict+0x177/0x17e
 [95794.629010]  dispose_list+0x50/0x71
 [95794.629478]  evict_inodes+0x132/0x141
 [95794.630289]  generic_shutdown_super+0x3f/0x10b
 [95794.630864]  kill_anon_super+0x12/0x1c
 [95794.631383]  btrfs_kill_super+0x16/0x21 [btrfs]
 [95794.631930]  deactivate_locked_super+0x30/0x68
 [95794.632539]  deactivate_super+0x36/0x39
 [95794.633200]  cleanup_mnt+0x49/0x67
 [95794.633818]  __cleanup_mnt+0x12/0x14
 [95794.634416]  task_work_run+0x82/0xa6
 [95794.634902]  prepare_exit_to_usermode+0xe1/0x10c
 [95794.635525]  syscall_return_slowpath+0x18c/0x1af
 [95794.636122]  entry_SYSCALL_64_fastpath+0xab/0xad
 [95794.636834] RIP: 0033:0x7fa678cb99a7
 [95794.637370] RSP: 002b:00007ffccf0aaed8 EFLAGS: 00000246 ORIG_RAX: 00000000000000a6
 [95794.638672] RAX: 0000000000000000 RBX: 0000563386706030 RCX: 00007fa678cb99a7
 [95794.639596] RDX: 0000000000000001 RSI: 0000000000000000 RDI: 000056338670ca90
 [95794.640703] RBP: 000056338670ca90 R08: 000056338670c740 R09: 0000000000000015
 [95794.641773] R10: 00000000000006b4 R11: 0000000000000246 R12: 00007fa6791bae64
 [95794.643150] R13: 0000000000000000 R14: 0000563386706210 R15: 00007ffccf0ab160
 [95794.644249] Code: ff 4c 8b a8 80 06 00 00 48 8b 87 c0 01 00 00 48 85 c0 74 02 0f ff 48 83 bb e0 02 00 00 00 74 02 0f ff 83 bb 3c ff ff ff 00 74 02 <0f> ff 83 bb 40 ff ff ff 00 74 02 0f ff 48 83 bb f8 fe ff ff 00
 [95794.646929] ---[ end trace e95877675c6ec007 ]---
 [95794.647751] ------------[ cut here ]------------
 [95794.648509] WARNING: CPU: 0 PID: 31496 at fs/btrfs/inode.c:9562 btrfs_destroy_inode+0x59/0x206 [btrfs]
 [95794.649842] Modules linked in: btrfs xfs ppdev ghash_clmulni_intel pcbc aesni_intel aes_x86_64 crypto_simd cryptd glue_helper parport_pc psmouse sg i2c_piix4 parport i2c_core evdev pcspkr button serio_raw sunrpc loop autofs4 ext4 crc16 mbcache jbd2 zstd_decompress zstd_compress xxhash raid10 raid456 async_raid6_recov async_memcpy async_pq async_xor async_tx xor raid6_pq libcrc32c crc32c_generic raid1 raid0 multipath linear md_mod sd_mod virtio_scsi ata_generic crc32c_intel ata_piix floppy virtio_pci virtio_ring virtio libata scsi_mod e1000 [last unloaded: btrfs]
 [95794.654659] CPU: 0 PID: 31496 Comm: umount Tainted: G        W       4.14.0-rc6-btrfs-next-54+ #1
 [95794.655894] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.10.2-0-g5f4c7b1-prebuilt.qemu-project.org 04/01/2014
 [95794.657546] task: ffff880075aa0240 task.stack: ffffc90001734000
 [95794.658433] RIP: 0010:btrfs_destroy_inode+0x59/0x206 [btrfs]
 [95794.659279] RSP: 0018:ffffc90001737d00 EFLAGS: 00010202
 [95794.660054] RAX: 0000000000000000 RBX: ffff880070fa1418 RCX: ffffc90001737c7c
 [95794.660753] RDX: 0000000175aa0240 RSI: 0000000000000001 RDI: ffff880070fa1418
 [95794.661513] RBP: ffffc90001737d38 R08: 0000000000000000 R09: 0000000000000000
 [95794.662289] R10: ffffc90001737c48 R11: ffff88007123e158 R12: ffff880075b6a000
 [95794.663393] R13: ffff88006145c000 R14: ffff880070fa1418 R15: ffff880070c3b4a0
 [95794.664342] FS:  00007fa6793c92c0(0000) GS:ffff88023fc00000(0000) knlGS:0000000000000000
 [95794.665673] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
 [95794.666593] CR2: 000056338670d048 CR3: 00000000610dc005 CR4: 00000000001606f0
 [95794.667629] Call Trace:
 [95794.668065]  destroy_inode+0x3d/0x55
 [95794.668637]  evict+0x177/0x17e
 [95794.669179]  dispose_list+0x50/0x71
 [95794.669830]  evict_inodes+0x132/0x141
 [95794.670416]  generic_shutdown_super+0x3f/0x10b
 [95794.671103]  kill_anon_super+0x12/0x1c
 [95794.671786]  btrfs_kill_super+0x16/0x21 [btrfs]
 [95794.672552]  deactivate_locked_super+0x30/0x68
 [95794.673393]  deactivate_super+0x36/0x39
 [95794.674107]  cleanup_mnt+0x49/0x67
 [95794.674706]  __cleanup_mnt+0x12/0x14
 [95794.675279]  task_work_run+0x82/0xa6
 [95794.675795]  prepare_exit_to_usermode+0xe1/0x10c
 [95794.676507]  syscall_return_slowpath+0x18c/0x1af
 [95794.677275]  entry_SYSCALL_64_fastpath+0xab/0xad
 [95794.678006] RIP: 0033:0x7fa678cb99a7
 [95794.678600] RSP: 002b:00007ffccf0aaed8 EFLAGS: 00000246 ORIG_RAX: 00000000000000a6
 [95794.679739] RAX: 0000000000000000 RBX: 0000563386706030 RCX: 00007fa678cb99a7
 [95794.680779] RDX: 0000000000000001 RSI: 0000000000000000 RDI: 000056338670ca90
 [95794.681837] RBP: 000056338670ca90 R08: 000056338670c740 R09: 0000000000000015
 [95794.682867] R10: 00000000000006b4 R11: 0000000000000246 R12: 00007fa6791bae64
 [95794.683891] R13: 0000000000000000 R14: 0000563386706210 R15: 00007ffccf0ab160
 [95794.684843] Code: c0 01 00 00 48 85 c0 74 02 0f ff 48 83 bb e0 02 00 00 00 74 02 0f ff 83 bb 3c ff ff ff 00 74 02 0f ff 83 bb 40 ff ff ff 00 74 02 <0f> ff 48 83 bb f8 fe ff ff 00 74 02 0f ff 48 83 bb 00 ff ff ff
 [95794.687156] ---[ end trace e95877675c6ec008 ]---
 [95794.687876] ------------[ cut here ]------------
 [95794.688579] WARNING: CPU: 0 PID: 31496 at fs/btrfs/inode.c:9565 btrfs_destroy_inode+0x7d/0x206 [btrfs]
 [95794.689735] Modules linked in: btrfs xfs ppdev ghash_clmulni_intel pcbc aesni_intel aes_x86_64 crypto_simd cryptd glue_helper parport_pc psmouse sg i2c_piix4 parport i2c_core evdev pcspkr button serio_raw sunrpc loop autofs4 ext4 crc16 mbcache jbd2 zstd_decompress zstd_compress xxhash raid10 raid456 async_raid6_recov async_memcpy async_pq async_xor async_tx xor raid6_pq libcrc32c crc32c_generic raid1 raid0 multipath linear md_mod sd_mod virtio_scsi ata_generic crc32c_intel ata_piix floppy virtio_pci virtio_ring virtio libata scsi_mod e1000 [last unloaded: btrfs]
 [95794.695015] CPU: 0 PID: 31496 Comm: umount Tainted: G        W       4.14.0-rc6-btrfs-next-54+ #1
 [95794.696396] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.10.2-0-g5f4c7b1-prebuilt.qemu-project.org 04/01/2014
 [95794.697956] task: ffff880075aa0240 task.stack: ffffc90001734000
 [95794.698925] RIP: 0010:btrfs_destroy_inode+0x7d/0x206 [btrfs]
 [95794.699763] RSP: 0018:ffffc90001737d00 EFLAGS: 00010206
 [95794.700434] RAX: 0000000000000000 RBX: ffff880070fa1418 RCX: ffffc90001737c7c
 [95794.701445] RDX: 0000000175aa0240 RSI: 0000000000000001 RDI: ffff880070fa1418
 [95794.702448] RBP: ffffc90001737d38 R08: 0000000000000000 R09: 0000000000000000
 [95794.703557] R10: ffffc90001737c48 R11: ffff88007123e158 R12: ffff880075b6a000
 [95794.704441] R13: ffff88006145c000 R14: ffff880070fa1418 R15: ffff880070c3b4a0
 [95794.705270] FS:  00007fa6793c92c0(0000) GS:ffff88023fc00000(0000) knlGS:0000000000000000
 [95794.706341] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
 [95794.707001] CR2: 000056338670d048 CR3: 00000000610dc005 CR4: 00000000001606f0
 [95794.708030] Call Trace:
 [95794.708466]  destroy_inode+0x3d/0x55
 [95794.709071]  evict+0x177/0x17e
 [95794.709497]  dispose_list+0x50/0x71
 [95794.709973]  evict_inodes+0x132/0x141
 [95794.710564]  generic_shutdown_super+0x3f/0x10b
 [95794.711200]  kill_anon_super+0x12/0x1c
 [95794.711633]  btrfs_kill_super+0x16/0x21 [btrfs]
 [95794.712139]  deactivate_locked_super+0x30/0x68
 [95794.712608]  deactivate_super+0x36/0x39
 [95794.713093]  cleanup_mnt+0x49/0x67
 [95794.713514]  __cleanup_mnt+0x12/0x14
 [95794.713933]  task_work_run+0x82/0xa6
 [95794.714543]  prepare_exit_to_usermode+0xe1/0x10c
 [95794.715247]  syscall_return_slowpath+0x18c/0x1af
 [95794.715952]  entry_SYSCALL_64_fastpath+0xab/0xad
 [95794.716653] RIP: 0033:0x7fa678cb99a7
 [95794.721100] RSP: 002b:00007ffccf0aaed8 EFLAGS: 00000246 ORIG_RAX: 00000000000000a6
 [95794.722052] RAX: 0000000000000000 RBX: 0000563386706030 RCX: 00007fa678cb99a7
 [95794.722856] RDX: 0000000000000001 RSI: 0000000000000000 RDI: 000056338670ca90
 [95794.723698] RBP: 000056338670ca90 R08: 000056338670c740 R09: 0000000000000015
 [95794.724736] R10: 00000000000006b4 R11: 0000000000000246 R12: 00007fa6791bae64
 [95794.725928] R13: 0000000000000000 R14: 0000563386706210 R15: 00007ffccf0ab160
 [95794.726728] Code: 40 ff ff ff 00 74 02 0f ff 48 83 bb f8 fe ff ff 00 74 02 0f ff 48 83 bb 00 ff ff ff 00 74 02 0f ff 48 83 bb 30 ff ff ff 00 74 02 <0f> ff 48 83 bb 08 ff ff ff 00 74 02 0f ff 4d 85 e4 0f 84 52 01
 [95794.729203] ---[ end trace e95877675c6ec009 ]---
 [95794.841054] ------------[ cut here ]------------
 [95794.841829] WARNING: CPU: 0 PID: 31496 at fs/btrfs/extent-tree.c:5831 btrfs_free_block_groups+0x235/0x36a [btrfs]
 [95794.843425] Modules linked in: btrfs xfs ppdev ghash_clmulni_intel pcbc aesni_intel aes_x86_64 crypto_simd cryptd glue_helper parport_pc psmouse sg i2c_piix4 parport i2c_core evdev pcspkr button serio_raw sunrpc loop autofs4 ext4 crc16 mbcache jbd2 zstd_decompress zstd_compress xxhash raid10 raid456 async_raid6_recov async_memcpy async_pq async_xor async_tx xor raid6_pq libcrc32c crc32c_generic raid1 raid0 multipath linear md_mod sd_mod virtio_scsi ata_generic crc32c_intel ata_piix floppy virtio_pci virtio_ring virtio libata scsi_mod e1000 [last unloaded: btrfs]
 [95794.850658] CPU: 0 PID: 31496 Comm: umount Tainted: G        W       4.14.0-rc6-btrfs-next-54+ #1
 [95794.852590] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.10.2-0-g5f4c7b1-prebuilt.qemu-project.org 04/01/2014
 [95794.854752] task: ffff880075aa0240 task.stack: ffffc90001734000
 [95794.855812] RIP: 0010:btrfs_free_block_groups+0x235/0x36a [btrfs]
 [95794.856811] RSP: 0018:ffffc90001737d70 EFLAGS: 00010206
 [95794.857805] RAX: 0000000080000000 RBX: ffff88006145c000 RCX: 0000000000000001
 [95794.859014] RDX: 00000001810af668 RSI: 0000000000000002 RDI: 00000000ffffffff
 [95794.860270] RBP: ffffc90001737d98 R08: 0000000000000000 R09: ffffffff817e22b9
 [95794.861525] R10: ffffc90001737c80 R11: 00000000000337fd R12: 0000000000000000
 [95794.862700] R13: ffff88006145c0c0 R14: ffff88021b61a800 R15: ffff88006145c100
 [95794.863810] FS:  00007fa6793c92c0(0000) GS:ffff88023fc00000(0000) knlGS:0000000000000000
 [95794.865149] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
 [95794.866099] CR2: 000056338670d048 CR3: 00000000610dc005 CR4: 00000000001606f0
 [95794.867198] Call Trace:
 [95794.867626]  close_ctree+0x1db/0x2b8 [btrfs]
 [95794.868188]  ? evict_inodes+0x132/0x141
 [95794.869037]  btrfs_put_super+0x15/0x17 [btrfs]
 [95794.870400]  generic_shutdown_super+0x6a/0x10b
 [95794.871262]  kill_anon_super+0x12/0x1c
 [95794.872046]  btrfs_kill_super+0x16/0x21 [btrfs]
 [95794.872746]  deactivate_locked_super+0x30/0x68
 [95794.873687]  deactivate_super+0x36/0x39
 [95794.874639]  cleanup_mnt+0x49/0x67
 [95794.875504]  __cleanup_mnt+0x12/0x14
 [95794.876126]  task_work_run+0x82/0xa6
 [95794.876788]  prepare_exit_to_usermode+0xe1/0x10c
 [95794.877777]  syscall_return_slowpath+0x18c/0x1af
 [95794.878381]  entry_SYSCALL_64_fastpath+0xab/0xad
 [95794.878888] RIP: 0033:0x7fa678cb99a7
 [95794.879307] RSP: 002b:00007ffccf0aaed8 EFLAGS: 00000246 ORIG_RAX: 00000000000000a6
 [95794.880204] RAX: 0000000000000000 RBX: 0000563386706030 RCX: 00007fa678cb99a7
 [95794.881640] RDX: 0000000000000001 RSI: 0000000000000000 RDI: 000056338670ca90
 [95794.882690] RBP: 000056338670ca90 R08: 000056338670c740 R09: 0000000000000015
 [95794.883538] R10: 00000000000006b4 R11: 0000000000000246 R12: 00007fa6791bae64
 [95794.884562] R13: 0000000000000000 R14: 0000563386706210 R15: 00007ffccf0ab160
 [95794.885664] Code: 89 ef e8 07 ec 32 e1 e8 9d c0 ea e0 48 8d b3 28 02 00 00 48 83 c9 ff 31 d2 48 89 df e8 29 c5 ff ff 48 83 bb 80 02 00 00 00 74 02 <0f> ff 48 83 bb 88 02 00 00 00 74 02 0f ff 48 83 bb d8 02 00 00
 [95794.887980] ---[ end trace e95877675c6ec00a ]---
 [95794.888739] ------------[ cut here ]------------
 [95794.889405] WARNING: CPU: 0 PID: 31496 at fs/btrfs/extent-tree.c:5832 btrfs_free_block_groups+0x241/0x36a [btrfs]
 [95794.891020] Modules linked in: btrfs xfs ppdev ghash_clmulni_intel pcbc aesni_intel aes_x86_64 crypto_simd cryptd glue_helper parport_pc psmouse sg i2c_piix4 parport i2c_core evdev pcspkr button serio_raw sunrpc loop autofs4 ext4 crc16 mbcache jbd2 zstd_decompress zstd_compress xxhash raid10 raid456 async_raid6_recov async_memcpy async_pq async_xor async_tx xor raid6_pq libcrc32c crc32c_generic raid1 raid0 multipath linear md_mod sd_mod virtio_scsi ata_generic crc32c_intel ata_piix floppy virtio_pci virtio_ring virtio libata scsi_mod e1000 [last unloaded: btrfs]
 [95794.897551] CPU: 0 PID: 31496 Comm: umount Tainted: G        W       4.14.0-rc6-btrfs-next-54+ #1
 [95794.898509] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.10.2-0-g5f4c7b1-prebuilt.qemu-project.org 04/01/2014
 [95794.899685] task: ffff880075aa0240 task.stack: ffffc90001734000
 [95794.900592] RIP: 0010:btrfs_free_block_groups+0x241/0x36a [btrfs]
 [95794.901387] RSP: 0018:ffffc90001737d70 EFLAGS: 00010206
 [95794.902300] RAX: 0000000080000000 RBX: ffff88006145c000 RCX: 0000000000000001
 [95794.903260] RDX: 00000001810af668 RSI: 0000000000000002 RDI: 00000000ffffffff
 [95794.904332] RBP: ffffc90001737d98 R08: 0000000000000000 R09: ffffffff817e22b9
 [95794.905300] R10: ffffc90001737c80 R11: 00000000000337fd R12: 0000000000000000
 [95794.906439] R13: ffff88006145c0c0 R14: ffff88021b61a800 R15: ffff88006145c100
 [95794.907459] FS:  00007fa6793c92c0(0000) GS:ffff88023fc00000(0000) knlGS:0000000000000000
 [95794.908625] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
 [95794.909511] CR2: 000056338670d048 CR3: 00000000610dc005 CR4: 00000000001606f0
 [95794.910630] Call Trace:
 [95794.911153]  close_ctree+0x1db/0x2b8 [btrfs]
 [95794.911837]  ? evict_inodes+0x132/0x141
 [95794.912344]  btrfs_put_super+0x15/0x17 [btrfs]
 [95794.912975]  generic_shutdown_super+0x6a/0x10b
 [95794.913788]  kill_anon_super+0x12/0x1c
 [95794.914424]  btrfs_kill_super+0x16/0x21 [btrfs]
 [95794.915142]  deactivate_locked_super+0x30/0x68
 [95794.915831]  deactivate_super+0x36/0x39
 [95794.916433]  cleanup_mnt+0x49/0x67
 [95794.917045]  __cleanup_mnt+0x12/0x14
 [95794.917665]  task_work_run+0x82/0xa6
 [95794.918309]  prepare_exit_to_usermode+0xe1/0x10c
 [95794.919021]  syscall_return_slowpath+0x18c/0x1af
 [95794.919722]  entry_SYSCALL_64_fastpath+0xab/0xad
 [95794.920426] RIP: 0033:0x7fa678cb99a7
 [95794.921039] RSP: 002b:00007ffccf0aaed8 EFLAGS: 00000246 ORIG_RAX: 00000000000000a6
 [95794.922303] RAX: 0000000000000000 RBX: 0000563386706030 RCX: 00007fa678cb99a7
 [95794.923335] RDX: 0000000000000001 RSI: 0000000000000000 RDI: 000056338670ca90
 [95794.924364] RBP: 000056338670ca90 R08: 000056338670c740 R09: 0000000000000015
 [95794.925435] R10: 00000000000006b4 R11: 0000000000000246 R12: 00007fa6791bae64
 [95794.926533] R13: 0000000000000000 R14: 0000563386706210 R15: 00007ffccf0ab160
 [95794.927557] Code: 48 8d b3 28 02 00 00 48 83 c9 ff 31 d2 48 89 df e8 29 c5 ff ff 48 83 bb 80 02 00 00 00 74 02 0f ff 48 83 bb 88 02 00 00 00 74 02 <0f> ff 48 83 bb d8 02 00 00 00 74 02 0f ff 48 83 bb e0 02 00 00
 [95794.930166] ---[ end trace e95877675c6ec00b ]---
 [95794.930961] ------------[ cut here ]------------
 [95794.931727] WARNING: CPU: 0 PID: 31496 at fs/btrfs/extent-tree.c:9953 btrfs_free_block_groups+0x2bc/0x36a [btrfs]
 [95794.932729] Modules linked in: btrfs xfs ppdev ghash_clmulni_intel pcbc aesni_intel aes_x86_64 crypto_simd cryptd glue_helper parport_pc psmouse sg i2c_piix4 parport i2c_core evdev pcspkr button serio_raw sunrpc loop autofs4 ext4 crc16 mbcache jbd2 zstd_decompress zstd_compress xxhash raid10 raid456 async_raid6_recov async_memcpy async_pq async_xor async_tx xor raid6_pq libcrc32c crc32c_generic raid1 raid0 multipath linear md_mod sd_mod virtio_scsi ata_generic crc32c_intel ata_piix floppy virtio_pci virtio_ring virtio libata scsi_mod e1000 [last unloaded: btrfs]
 [95794.938394] CPU: 0 PID: 31496 Comm: umount Tainted: G        W       4.14.0-rc6-btrfs-next-54+ #1
 [95794.939842] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.10.2-0-g5f4c7b1-prebuilt.qemu-project.org 04/01/2014
 [95794.941455] task: ffff880075aa0240 task.stack: ffffc90001734000
 [95794.942336] RIP: 0010:btrfs_free_block_groups+0x2bc/0x36a [btrfs]
 [95794.943268] RSP: 0018:ffffc90001737d70 EFLAGS: 00010206
 [95794.944127] RAX: ffff8802004fd0e8 RBX: ffff88006145c000 RCX: 0000000000000001
 [95794.945211] RDX: 00000001810af668 RSI: 0000000000000002 RDI: 00000000ffffffff
 [95794.946316] RBP: ffffc90001737d98 R08: 0000000000000000 R09: ffffffff817e22b9
 [95794.947271] R10: ffffc90001737c80 R11: 00000000000337fd R12: ffff8802004fd0e8
 [95794.948219] R13: ffff88006145c0c0 R14: ffff88006145e598 R15: ffff88006145c100
 [95794.949193] FS:  00007fa6793c92c0(0000) GS:ffff88023fc00000(0000) knlGS:0000000000000000
 [95794.950495] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
 [95794.951338] CR2: 000056338670d048 CR3: 00000000610dc005 CR4: 00000000001606f0
 [95794.952361] Call Trace:
 [95794.952811]  close_ctree+0x1db/0x2b8 [btrfs]
 [95794.953522]  ? evict_inodes+0x132/0x141
 [95794.954543]  btrfs_put_super+0x15/0x17 [btrfs]
 [95794.955231]  generic_shutdown_super+0x6a/0x10b
 [95794.955916]  kill_anon_super+0x12/0x1c
 [95794.956414]  btrfs_kill_super+0x16/0x21 [btrfs]
 [95794.956953]  deactivate_locked_super+0x30/0x68
 [95794.957635]  deactivate_super+0x36/0x39
 [95794.958256]  cleanup_mnt+0x49/0x67
 [95794.958701]  __cleanup_mnt+0x12/0x14
 [95794.959181]  task_work_run+0x82/0xa6
 [95794.959635]  prepare_exit_to_usermode+0xe1/0x10c
 [95794.960182]  syscall_return_slowpath+0x18c/0x1af
 [95794.960731]  entry_SYSCALL_64_fastpath+0xab/0xad
 [95794.961438] RIP: 0033:0x7fa678cb99a7
 [95794.961990] RSP: 002b:00007ffccf0aaed8 EFLAGS: 00000246 ORIG_RAX: 00000000000000a6
 [95794.963111] RAX: 0000000000000000 RBX: 0000563386706030 RCX: 00007fa678cb99a7
 [95794.963975] RDX: 0000000000000001 RSI: 0000000000000000 RDI: 000056338670ca90
 [95794.964680] RBP: 000056338670ca90 R08: 000056338670c740 R09: 0000000000000015
 [95794.965763] R10: 00000000000006b4 R11: 0000000000000246 R12: 00007fa6791bae64
 [95794.966868] R13: 0000000000000000 R14: 0000563386706210 R15: 00007ffccf0ab160
 [95794.967800] Code: 00 00 00 4c 8b a3 98 25 00 00 49 83 bc 24 60 ff ff ff 00 75 16 49 83 bc 24 68 ff ff ff 00 75 0b 49 83 bc 24 70 ff ff ff 00 74 16 <0f> ff 49 8d b4 24 18 ff ff ff 31 c9 31 d2 48 89 df e8 93 7a ff
 [95794.970629] ---[ end trace e95877675c6ec00c ]---
 [95794.971451] BTRFS info (device sdi): space_info 1 has 7680000 free, is not full
 [95794.972351] BTRFS info (device sdi): space_info total=8388608, used=704512, pinned=0, reserved=0, may_use=4096, readonly=0
 [95794.973595] ------------[ cut here ]------------
 [95794.974353] WARNING: CPU: 0 PID: 31496 at fs/btrfs/extent-tree.c:9953 btrfs_free_block_groups+0x2bc/0x36a [btrfs]
 [95794.980163] Modules linked in: btrfs xfs ppdev ghash_clmulni_intel pcbc aesni_intel aes_x86_64 crypto_simd cryptd glue_helper parport_pc psmouse sg i2c_piix4 parport i2c_core evdev pcspkr button serio_raw sunrpc loop autofs4 ext4 crc16 mbcache jbd2 zstd_decompress zstd_compress xxhash raid10 raid456 async_raid6_recov async_memcpy async_pq async_xor async_tx xor raid6_pq libcrc32c crc32c_generic raid1 raid0 multipath linear md_mod sd_mod virtio_scsi ata_generic crc32c_intel ata_piix floppy virtio_pci virtio_ring virtio libata scsi_mod e1000 [last unloaded: btrfs]
 [95794.986461] CPU: 0 PID: 31496 Comm: umount Tainted: G        W       4.14.0-rc6-btrfs-next-54+ #1
 [95794.987591] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.10.2-0-g5f4c7b1-prebuilt.qemu-project.org 04/01/2014
 [95794.988929] task: ffff880075aa0240 task.stack: ffffc90001734000
 [95794.989922] RIP: 0010:btrfs_free_block_groups+0x2bc/0x36a [btrfs]
 [95794.990715] RSP: 0018:ffffc90001737d70 EFLAGS: 00010206
 [95794.991431] RAX: ffff88020f6e70e8 RBX: ffff88006145c000 RCX: ffffffff8115a906
 [95794.992455] RDX: ffffffff8115a902 RSI: ffff880075aa0b40 RDI: ffff880075aa0b40
 [95794.993535] RBP: ffffc90001737d98 R08: 0000000000000020 R09: fffffffffffffff7
 [95794.994573] R10: 00000000ffffffc4 R11: ffff8800633b1bc0 R12: ffff88020f6e70e8
 [95794.996250] R13: 0000000000000038 R14: ffff88006145e598 R15: 0000000000000000
 [95794.997233] FS:  00007fa6793c92c0(0000) GS:ffff88023fc00000(0000) knlGS:0000000000000000
 [95794.998592] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
 [95794.999484] CR2: 000056338670d048 CR3: 00000000610dc005 CR4: 00000000001606f0
 [95795.000542] Call Trace:
 [95795.001138]  close_ctree+0x1db/0x2b8 [btrfs]
 [95795.001885]  ? evict_inodes+0x132/0x141
 [95795.002407]  btrfs_put_super+0x15/0x17 [btrfs]
 [95795.003093]  generic_shutdown_super+0x6a/0x10b
 [95795.003720]  kill_anon_super+0x12/0x1c
 [95795.004353]  btrfs_kill_super+0x16/0x21 [btrfs]
 [95795.005095]  deactivate_locked_super+0x30/0x68
 [95795.005716]  deactivate_super+0x36/0x39
 [95795.006388]  cleanup_mnt+0x49/0x67
 [95795.006939]  __cleanup_mnt+0x12/0x14
 [95795.007512]  task_work_run+0x82/0xa6
 [95795.008124]  prepare_exit_to_usermode+0xe1/0x10c
 [95795.008994]  syscall_return_slowpath+0x18c/0x1af
 [95795.009831]  entry_SYSCALL_64_fastpath+0xab/0xad
 [95795.010610] RIP: 0033:0x7fa678cb99a7
 [95795.011193] RSP: 002b:00007ffccf0aaed8 EFLAGS: 00000246 ORIG_RAX: 00000000000000a6
 [95795.012327] RAX: 0000000000000000 RBX: 0000563386706030 RCX: 00007fa678cb99a7
 [95795.013432] RDX: 0000000000000001 RSI: 0000000000000000 RDI: 000056338670ca90
 [95795.014558] RBP: 000056338670ca90 R08: 000056338670c740 R09: 0000000000000015
 [95795.015577] R10: 00000000000006b4 R11: 0000000000000246 R12: 00007fa6791bae64
 [95795.016569] R13: 0000000000000000 R14: 0000563386706210 R15: 00007ffccf0ab160
 [95795.017662] Code: 00 00 00 4c 8b a3 98 25 00 00 49 83 bc 24 60 ff ff ff 00 75 16 49 83 bc 24 68 ff ff ff 00 75 0b 49 83 bc 24 70 ff ff ff 00 74 16 <0f> ff 49 8d b4 24 18 ff ff ff 31 c9 31 d2 48 89 df e8 93 7a ff
 [95795.020538] ---[ end trace e95877675c6ec00d ]---
 [95795.021259] BTRFS info (device sdi): space_info 4 has 1072775168 free, is not full
 [95795.022390] BTRFS info (device sdi): space_info total=1073741824, used=114688, pinned=0, reserved=0, may_use=786432, readonly=65536

Fix this by ensuring the zero range operation does not call
btrfs_truncate_block() if the corresponding extent is an unwritten one
(it's pointless anyway, since reading from an unwritten extent yields
zeroes).

Signed-off-by: Filipe Manana <fdmanana@suse.com>
Tested-by: Nikolay Borisov <nborisov@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/file.c | 26 ++++++++++++++++++++------
 1 file changed, 20 insertions(+), 6 deletions(-)

diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index baad81c1f9a3..06a631f89b1e 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -2852,12 +2852,18 @@ static int btrfs_fallocate_update_isize(struct inode *inode,
 	return ret ? ret : ret2;
 }
 
+enum {
+	RANGE_BOUNDARY_WRITTEN_EXTENT = 0,
+	RANGE_BOUNDARY_PREALLOC_EXTENT = 1,
+	RANGE_BOUNDARY_HOLE = 2,
+};
+
 static int btrfs_zero_range_check_range_boundary(struct inode *inode,
 						 u64 offset)
 {
 	const u64 sectorsize = btrfs_inode_sectorsize(inode);
 	struct extent_map *em;
-	int ret = 0;
+	int ret;
 
 	offset = round_down(offset, sectorsize);
 	em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0);
@@ -2865,7 +2871,11 @@ static int btrfs_zero_range_check_range_boundary(struct inode *inode,
 		return PTR_ERR(em);
 
 	if (em->block_start == EXTENT_MAP_HOLE)
-		ret = 1;
+		ret = RANGE_BOUNDARY_HOLE;
+	else if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
+		ret = RANGE_BOUNDARY_PREALLOC_EXTENT;
+	else
+		ret = RANGE_BOUNDARY_WRITTEN_EXTENT;
 
 	free_extent_map(em);
 	return ret;
@@ -2974,13 +2984,15 @@ static int btrfs_zero_range(struct inode *inode,
 		ret = btrfs_zero_range_check_range_boundary(inode, offset);
 		if (ret < 0)
 			goto out;
-		if (ret) {
+		if (ret == RANGE_BOUNDARY_HOLE) {
 			alloc_start = round_down(offset, sectorsize);
 			ret = 0;
-		} else {
+		} else if (ret == RANGE_BOUNDARY_WRITTEN_EXTENT) {
 			ret = btrfs_truncate_block(inode, offset, 0, 0);
 			if (ret)
 				goto out;
+		} else {
+			ret = 0;
 		}
 	}
 
@@ -2989,13 +3001,15 @@ static int btrfs_zero_range(struct inode *inode,
 							    offset + len);
 		if (ret < 0)
 			goto out;
-		if (ret) {
+		if (ret == RANGE_BOUNDARY_HOLE) {
 			alloc_end = round_up(offset + len, sectorsize);
 			ret = 0;
-		} else {
+		} else if (ret == RANGE_BOUNDARY_WRITTEN_EXTENT) {
 			ret = btrfs_truncate_block(inode, offset + len, 0, 1);
 			if (ret)
 				goto out;
+		} else {
+			ret = 0;
 		}
 	}
 
-- 
cgit v1.2.3


From a6f93c71d412ba8ed743152c3a54ad0b78dcd9c7 Mon Sep 17 00:00:00 2001
From: Liu Bo <bo.li.liu@oracle.com>
Date: Wed, 15 Nov 2017 16:28:11 -0700
Subject: Btrfs: avoid losing data raid profile when deleting a device

We've avoided data losing raid profile when doing balance, but it
turns out that deleting a device could also result in the same
problem.

Say we have 3 disks, and they're created with '-d raid1' profile.

- We have chunk P (the only data chunk on the empty btrfs).

- Suppose that chunk P's two raid1 copies reside in disk A and disk B.

- Now, 'btrfs device remove disk B'
         btrfs_rm_device()
	   -> btrfs_shrink_device()
	      -> btrfs_relocate_chunk() #relocate any chunk on disk B
	      	 			 to other places.

- Chunk P will be removed and a new chunk will be created to hold
  those data, but as chunk P is the only one holding raid1 profile,
  after it goes away, the new chunk will be created as single profile
  which is our default profile.

This fixes the problem by creating an empty data chunk before
relocating the data chunk.

Metadata/System chunk are supposed to have non-zero bytes all the time
so their raid profile is preserved.

Reported-by: James Alandt <James.Alandt@wdc.com>
Signed-off-by: Liu Bo <bo.li.liu@oracle.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/volumes.c | 84 ++++++++++++++++++++++++++++++++++++++++++------------
 1 file changed, 65 insertions(+), 19 deletions(-)

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 2333d48ca1d7..c831a089471f 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -3106,6 +3106,48 @@ error:
 	return ret;
 }
 
+/*
+ * return 1 : allocate a data chunk successfully,
+ * return <0: errors during allocating a data chunk,
+ * return 0 : no need to allocate a data chunk.
+ */
+static int btrfs_may_alloc_data_chunk(struct btrfs_fs_info *fs_info,
+				      u64 chunk_offset)
+{
+	struct btrfs_block_group_cache *cache;
+	u64 bytes_used;
+	u64 chunk_type;
+
+	cache = btrfs_lookup_block_group(fs_info, chunk_offset);
+	ASSERT(cache);
+	chunk_type = cache->flags;
+	btrfs_put_block_group(cache);
+
+	if (chunk_type & BTRFS_BLOCK_GROUP_DATA) {
+		spin_lock(&fs_info->data_sinfo->lock);
+		bytes_used = fs_info->data_sinfo->bytes_used;
+		spin_unlock(&fs_info->data_sinfo->lock);
+
+		if (!bytes_used) {
+			struct btrfs_trans_handle *trans;
+			int ret;
+
+			trans =	btrfs_join_transaction(fs_info->tree_root);
+			if (IS_ERR(trans))
+				return PTR_ERR(trans);
+
+			ret = btrfs_force_chunk_alloc(trans, fs_info,
+						      BTRFS_BLOCK_GROUP_DATA);
+			btrfs_end_transaction(trans);
+			if (ret < 0)
+				return ret;
+
+			return 1;
+		}
+	}
+	return 0;
+}
+
 static int insert_balance_item(struct btrfs_fs_info *fs_info,
 			       struct btrfs_balance_control *bctl)
 {
@@ -3564,7 +3606,6 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info)
 	u32 count_meta = 0;
 	u32 count_sys = 0;
 	int chunk_reserved = 0;
-	u64 bytes_used = 0;
 
 	/* step one make some room on all the devices */
 	devices = &fs_info->fs_devices->devices;
@@ -3723,28 +3764,21 @@ again:
 			goto loop;
 		}
 
-		ASSERT(fs_info->data_sinfo);
-		spin_lock(&fs_info->data_sinfo->lock);
-		bytes_used = fs_info->data_sinfo->bytes_used;
-		spin_unlock(&fs_info->data_sinfo->lock);
-
-		if ((chunk_type & BTRFS_BLOCK_GROUP_DATA) &&
-		    !chunk_reserved && !bytes_used) {
-			trans = btrfs_start_transaction(chunk_root, 0);
-			if (IS_ERR(trans)) {
-				mutex_unlock(&fs_info->delete_unused_bgs_mutex);
-				ret = PTR_ERR(trans);
-				goto error;
-			}
-
-			ret = btrfs_force_chunk_alloc(trans, fs_info,
-						      BTRFS_BLOCK_GROUP_DATA);
-			btrfs_end_transaction(trans);
+		if (!chunk_reserved) {
+			/*
+			 * We may be relocating the only data chunk we have,
+			 * which could potentially end up with losing data's
+			 * raid profile, so lets allocate an empty one in
+			 * advance.
+			 */
+			ret = btrfs_may_alloc_data_chunk(fs_info,
+							 found_key.offset);
 			if (ret < 0) {
 				mutex_unlock(&fs_info->delete_unused_bgs_mutex);
 				goto error;
+			} else if (ret == 1) {
+				chunk_reserved = 1;
 			}
-			chunk_reserved = 1;
 		}
 
 		ret = btrfs_relocate_chunk(fs_info, found_key.offset);
@@ -4507,6 +4541,18 @@ again:
 		chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);
 		btrfs_release_path(path);
 
+		/*
+		 * We may be relocating the only data chunk we have,
+		 * which could potentially end up with losing data's
+		 * raid profile, so lets allocate an empty one in
+		 * advance.
+		 */
+		ret = btrfs_may_alloc_data_chunk(fs_info, chunk_offset);
+		if (ret < 0) {
+			mutex_unlock(&fs_info->delete_unused_bgs_mutex);
+			goto done;
+		}
+
 		ret = btrfs_relocate_chunk(fs_info, chunk_offset);
 		mutex_unlock(&fs_info->delete_unused_bgs_mutex);
 		if (ret && ret != -ENOSPC)
-- 
cgit v1.2.3


From e2731e55884f2138a252b0a3d7b24d57e49c3c59 Mon Sep 17 00:00:00 2001
From: Anand Jain <Anand.Jain@oracle.com>
Date: Tue, 9 Jan 2018 09:05:41 +0800
Subject: btrfs: define SUPER_FLAG_METADUMP_V2

btrfs-progs uses super flag bit BTRFS_SUPER_FLAG_METADUMP_V2 (1ULL << 34).
So just define that in kernel so that we know its been used.

Signed-off-by: Anand Jain <anand.jain@oracle.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/disk-io.c              | 3 ++-
 include/uapi/linux/btrfs_tree.h | 1 +
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 73ab44159d82..1916016e9fbb 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -61,7 +61,8 @@
 				 BTRFS_HEADER_FLAG_RELOC |\
 				 BTRFS_SUPER_FLAG_ERROR |\
 				 BTRFS_SUPER_FLAG_SEEDING |\
-				 BTRFS_SUPER_FLAG_METADUMP)
+				 BTRFS_SUPER_FLAG_METADUMP |\
+				 BTRFS_SUPER_FLAG_METADUMP_V2)
 
 static const struct extent_io_ops btree_extent_io_ops;
 static void end_workqueue_fn(struct btrfs_work *work);
diff --git a/include/uapi/linux/btrfs_tree.h b/include/uapi/linux/btrfs_tree.h
index 6d6e5da51527..38ab0e06259a 100644
--- a/include/uapi/linux/btrfs_tree.h
+++ b/include/uapi/linux/btrfs_tree.h
@@ -456,6 +456,7 @@ struct btrfs_free_space_header {
 
 #define BTRFS_SUPER_FLAG_SEEDING	(1ULL << 32)
 #define BTRFS_SUPER_FLAG_METADUMP	(1ULL << 33)
+#define BTRFS_SUPER_FLAG_METADUMP_V2	(1ULL << 34)
 
 
 /*
-- 
cgit v1.2.3


From 98820a7e244b17b8a4d9e9d1ff9d3b4e5bfca58b Mon Sep 17 00:00:00 2001
From: Anand Jain <Anand.Jain@oracle.com>
Date: Tue, 9 Jan 2018 09:05:42 +0800
Subject: btrfs: add support for SUPER_FLAG_CHANGING_FSID

The UUID change by btrfstune sets SUPER_FLAG_CHANGING_FSID and resets it
only when changing fsid is complete. Its not a good idea to mount the
device anything in between, reading metadata blocks would fail with UUID
mismatch.

This patch doesn't add SUPER_FLAG_CHANGING_FSID into
BTRFS_SUPER_FLAG_SUPP list, so mount will fail (along with the fix in
the next patch) when SUPER_FLAG_CHANGING_FSID is set.

Signed-off-by: Anand Jain <anand.jain@oracle.com>
Reviewed-by: Qu Wenruo <wqu@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
[ update changelog ]
Signed-off-by: David Sterba <dsterba@suse.com>
---
 include/uapi/linux/btrfs_tree.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/include/uapi/linux/btrfs_tree.h b/include/uapi/linux/btrfs_tree.h
index 38ab0e06259a..aff1356c2bb8 100644
--- a/include/uapi/linux/btrfs_tree.h
+++ b/include/uapi/linux/btrfs_tree.h
@@ -457,6 +457,7 @@ struct btrfs_free_space_header {
 #define BTRFS_SUPER_FLAG_SEEDING	(1ULL << 32)
 #define BTRFS_SUPER_FLAG_METADUMP	(1ULL << 33)
 #define BTRFS_SUPER_FLAG_METADUMP_V2	(1ULL << 34)
+#define BTRFS_SUPER_FLAG_CHANGING_FSID	(1ULL << 35)
 
 
 /*
-- 
cgit v1.2.3


From 6f794e3c5c8f8fdd3b5bb20d9ded894e685b5bbe Mon Sep 17 00:00:00 2001
From: Anand Jain <Anand.Jain@oracle.com>
Date: Tue, 9 Jan 2018 09:05:43 +0800
Subject: btrfs: fail mount when sb flag is not in BTRFS_SUPER_FLAG_SUPP

It appears from the original commit [1] that there isn't any design
specific reason not to fail the mount instead of just warning. This
patch will change it to fail.

[1]
 commit 319e4d0661e5323c9f9945f0f8fb5905e5fe74c3
    btrfs: Enhance super validation check

Fixes: 319e4d0661e5323 ("btrfs: Enhance super validation check")
Signed-off-by: Anand Jain <anand.jain@oracle.com>
Reviewed-by: Qu Wenruo <wqu@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/disk-io.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 1916016e9fbb..ed095202942f 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -3907,9 +3907,11 @@ static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info)
 		btrfs_err(fs_info, "no valid FS found");
 		ret = -EINVAL;
 	}
-	if (btrfs_super_flags(sb) & ~BTRFS_SUPER_FLAG_SUPP)
-		btrfs_warn(fs_info, "unrecognized super flag: %llu",
+	if (btrfs_super_flags(sb) & ~BTRFS_SUPER_FLAG_SUPP) {
+		btrfs_err(fs_info, "unrecognized or unsupported super flag: %llu",
 				btrfs_super_flags(sb) & ~BTRFS_SUPER_FLAG_SUPP);
+		ret = -EINVAL;
+	}
 	if (btrfs_super_root_level(sb) >= BTRFS_MAX_LEVEL) {
 		btrfs_err(fs_info, "tree_root level too big: %d >= %d",
 				btrfs_super_root_level(sb), BTRFS_MAX_LEVEL);
-- 
cgit v1.2.3


From 6670d4c2d9b7d352cbf90cd18c516bb6444acf34 Mon Sep 17 00:00:00 2001
From: Xiongfeng Wang <xiongfeng.wang@linaro.org>
Date: Mon, 8 Jan 2018 19:51:22 +0800
Subject: btrfs: use correct string length in DEV_INFO ioctl

gcc-8 reports:

fs/btrfs/ioctl.c: In function 'btrfs_ioctl':
./include/linux/string.h:245:9: warning: '__builtin_strncpy' specified
bound 1024 equals destination size [-Wstringop-truncation]

We need one less byte or call strlcpy() to make it a nul-terminated
string. This is done on the next line anyway, but we want to avoid the
warning.

Signed-off-by: Xiongfeng Wang <xiongfeng.wang@linaro.org>
Reviewed-by: David Sterba <dsterba@suse.com>
[ update changelog ]
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/ioctl.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index f6d4d5810cc1..f573cad72b7e 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -2798,7 +2798,7 @@ static long btrfs_ioctl_dev_info(struct btrfs_fs_info *fs_info,
 		struct rcu_string *name;
 
 		name = rcu_dereference(dev->name);
-		strncpy(di_args->path, name->str, sizeof(di_args->path));
+		strncpy(di_args->path, name->str, sizeof(di_args->path) - 1);
 		di_args->path[sizeof(di_args->path) - 1] = 0;
 	} else {
 		di_args->path[0] = '\0';
-- 
cgit v1.2.3


From e2683fc9d219430f5b78889b50cde7f40efeba7b Mon Sep 17 00:00:00 2001
From: David Sterba <dsterba@suse.com>
Date: Wed, 10 Jan 2018 15:13:07 +0100
Subject: btrfs: tree-check: reduce stack consumption in check_dir_item

I've noticed that the updated item checker stack consumption increased
dramatically in 542f5385e20cf97447 ("btrfs: tree-checker: Add checker
for dir item")

tree-checker.c:check_leaf                    +552 (176 -> 728)

The array is 255 bytes long, dynamic allocation would slow down the
sanity checks so it's more reasonable to keep it on-stack. Moving the
variable to the scope of use reduces the stack usage again

tree-checker.c:check_leaf                    -264 (728 -> 464)

Reviewed-by: Josef Bacik <jbacik@fb.com>
Reviewed-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/tree-checker.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c
index 7c55e3ba5a6c..c3c8d48f6618 100644
--- a/fs/btrfs/tree-checker.c
+++ b/fs/btrfs/tree-checker.c
@@ -259,7 +259,6 @@ static int check_dir_item(struct btrfs_root *root,
 
 	di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item);
 	while (cur < item_size) {
-		char namebuf[max(BTRFS_NAME_LEN, XATTR_NAME_MAX)];
 		u32 name_len;
 		u32 data_len;
 		u32 max_name_len;
@@ -342,6 +341,8 @@ static int check_dir_item(struct btrfs_root *root,
 		 */
 		if (key->type == BTRFS_DIR_ITEM_KEY ||
 		    key->type == BTRFS_XATTR_ITEM_KEY) {
+			char namebuf[max(BTRFS_NAME_LEN, XATTR_NAME_MAX)];
+
 			read_extent_buffer(leaf, namebuf,
 					(unsigned long)(di + 1), name_len);
 			name_hash = btrfs_name_hash(namebuf, name_len);
-- 
cgit v1.2.3


From 1c94da9dd913e69c67fbc77603d56ffa61e454c2 Mon Sep 17 00:00:00 2001
From: Anand Jain <anand.jain@oracle.com>
Date: Wed, 10 Jan 2018 13:15:18 +0800
Subject: btrfs: cleanup btrfs_free_stale_device() usage

We call btrfs_free_stale_device() only when we alloc a new struct
btrfs_device (ret=1), so move it closer to where we alloc the new
device. Also drop the comments.

Signed-off-by: Anand Jain <anand.jain@oracle.com>
Reviewed-by: Josef Bacik <jbacik@fb.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/volumes.c | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index c831a089471f..11fccf2fcb1f 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -782,6 +782,7 @@ static noinline int device_list_add(const char *path,
 
 		ret = 1;
 		device->fs_devices = fs_devices;
+		btrfs_free_stale_device(device);
 	} else if (!device->name || strcmp(device->name->str, path)) {
 		/*
 		 * When FS is already mounted.
@@ -840,13 +841,6 @@ static noinline int device_list_add(const char *path,
 	if (!fs_devices->opened)
 		device->generation = found_transid;
 
-	/*
-	 * if there is new btrfs on an already registered device,
-	 * then remove the stale device entry.
-	 */
-	if (ret > 0)
-		btrfs_free_stale_device(device);
-
 	*fs_devices_ret = fs_devices;
 
 	return ret;
-- 
cgit v1.2.3


From d2560ebd231e1f8f6f125d1721cf14bc8d5422b3 Mon Sep 17 00:00:00 2001
From: Nikolay Borisov <nborisov@suse.com>
Date: Thu, 11 Jan 2018 15:46:44 +0200
Subject: btrfs: Make btrfs_inode_rsv_release static

It's not used outside of extent-tree so there is no reason to not be
static.

Signed-off-by: Nikolay Borisov <nborisov@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/extent-tree.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 8d51e4bb67c1..05751a677da4 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -5773,7 +5773,7 @@ static int btrfs_inode_rsv_refill(struct btrfs_inode *inode,
  * This is the same as btrfs_block_rsv_release, except that it handles the
  * tracepoint for the reservation.
  */
-void btrfs_inode_rsv_release(struct btrfs_inode *inode)
+static void btrfs_inode_rsv_release(struct btrfs_inode *inode)
 {
 	struct btrfs_fs_info *fs_info = inode->root->fs_info;
 	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
-- 
cgit v1.2.3


From df6703e15c493fcc7b7f7576dff17d82e460945b Mon Sep 17 00:00:00 2001
From: Su Yue <suy.fnst@cn.fujitsu.com>
Date: Fri, 12 Jan 2018 11:08:02 +0800
Subject: btrfs: correct wrong comment about magic number of index_cnt

There is no function named btrfs_get_inode_index_count.
Explanation for magic number index_cnt=2 in btrfs_new_inode() is
actually located in btrfs_set_inode_index_count().

So replace 'btrfs_get_inode_index_count' in the comment by
'btrfs_set_inode_index_count'.

Signed-off-by: Su Yue <suy.fnst@cn.fujitsu.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/inode.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index eebfe2615428..633b83b709a1 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -6315,7 +6315,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
 	}
 	/*
 	 * index_cnt is ignored for everything but a dir,
-	 * btrfs_get_inode_index_count has an explanation for the magic
+	 * btrfs_set_inode_index_count has an explanation for the magic
 	 * number
 	 */
 	BTRFS_I(inode)->index_cnt = 2;
-- 
cgit v1.2.3


From 0198e5b707cfeb5defbd1b71b1ec6e71580d7db9 Mon Sep 17 00:00:00 2001
From: Liu Bo <bo.li.liu@oracle.com>
Date: Fri, 12 Jan 2018 18:07:01 -0700
Subject: Btrfs: raid56: iterate raid56 internal bio with
 bio_for_each_segment_all

Bio iterated by set_bio_pages_uptodate() is raid56 internal one, so it
will never be a BIO_CLONED bio, and since this is called by end_io
functions, bio->bi_iter.bi_size is zero, we mustn't use
bio_for_each_segment() as that is a no-op if bi_size is zero.

Fixes: 6592e58c6b68e61f003a01ba29a3716e7e2e9484 ("Btrfs: fix write corruption due to bio cloning on raid5/6")
Cc: <stable@vger.kernel.org> # v4.12-rc6+
Signed-off-by: Liu Bo <bo.li.liu@oracle.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/raid56.c | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index 7b382eeada72..f86ba6a319fe 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -1441,14 +1441,13 @@ static int fail_bio_stripe(struct btrfs_raid_bio *rbio,
  */
 static void set_bio_pages_uptodate(struct bio *bio)
 {
-	struct bio_vec bvec;
-	struct bvec_iter iter;
+	struct bio_vec *bvec;
+	int i;
 
-	if (bio_flagged(bio, BIO_CLONED))
-		bio->bi_iter = btrfs_io_bio(bio)->iter;
+	ASSERT(!bio_flagged(bio, BIO_CLONED));
 
-	bio_for_each_segment(bvec, bio, iter)
-		SetPageUptodate(bvec.bv_page);
+	bio_for_each_segment_all(bvec, bio, i)
+		SetPageUptodate(bvec->bv_page);
 }
 
 /*
-- 
cgit v1.2.3


From 44ac474def527a9c595a88400f4717dacae03b8a Mon Sep 17 00:00:00 2001
From: Liu Bo <bo.li.liu@oracle.com>
Date: Fri, 12 Jan 2018 18:07:02 -0700
Subject: Btrfs: do not cache rbio pages if using raid6 recover

Since raid6 recover tries all possible combinations of failed stripes,

- when raid6 rebuild algorithm is used, i.e. raid6_datap_recov() and
  raid6_2data_recov(), it may change the in-memory content of failed
  stripes, if such a raid bio is cached, a later raid write rmw or recover
  can steal @stripe_pages from it instead of reading from disks, such that
  it carries the wrong content to do write rmw or recovery and ends up
  with corruption or recovery failures.

- when raid5 rebuild algorithm is used, i.e. xor, raid bio can be cached
  because the only failed stripe which contains @rbio->bio_pages gets
  modified, others remain the same so that their in-memory content is
  consistent with their on-disk content.

This adds a check to skip caching rbio if using raid6 recover.

Signed-off-by: Liu Bo <bo.li.liu@oracle.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/raid56.c | 17 ++++++++++++++++-
 1 file changed, 16 insertions(+), 1 deletion(-)

diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index f86ba6a319fe..2f1ff7007280 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -1974,7 +1974,22 @@ cleanup:
 
 cleanup_io:
 	if (rbio->operation == BTRFS_RBIO_READ_REBUILD) {
-		if (err == BLK_STS_OK)
+		/*
+		 * - In case of two failures, where rbio->failb != -1:
+		 *
+		 *   Do not cache this rbio since the above read reconstruction
+		 *   (raid6_datap_recov() or raid6_2data_recov()) may have
+		 *   changed some content of stripes which are not identical to
+		 *   on-disk content any more, otherwise, a later write/recover
+		 *   may steal stripe_pages from this rbio and end up with
+		 *   corruptions or rebuild failures.
+		 *
+		 * - In case of single failure, where rbio->failb == -1:
+		 *
+		 *   Cache this rbio iff the above read reconstruction is
+		 *   excuted without problems.
+		 */
+		if (err == BLK_STS_OK && rbio->failb < 0)
 			cache_rbio_pages(rbio);
 		else
 			clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
-- 
cgit v1.2.3


From 7583d8d088ff2c323b1d4f15b191ca2c23d32558 Mon Sep 17 00:00:00 2001
From: Liu Bo <bo.li.liu@oracle.com>
Date: Tue, 9 Jan 2018 18:36:25 -0700
Subject: Btrfs: raid56: fix race between merge_bio and rbio_orig_end_io
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Before rbio_orig_end_io() goes to free rbio, rbio may get merged with
more bios from other rbios and rbio->bio_list becomes non-empty,
in that case, these newly merged bios don't end properly.

Once unlock_stripe() is done, rbio->bio_list will not be updated any
more and we can call bio_endio() on all queued bios.

It should only happen in error-out cases, the normal path of recover
and full stripe write have already set RBIO_RMW_LOCKED_BIT to disable
merge before doing IO, so rbio_orig_end_io() called by them doesn't
have the above issue.

Reported-by: Jérôme Carretero <cJ-ko@zougloub.eu>
Signed-off-by: Liu Bo <bo.li.liu@oracle.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/raid56.c | 37 +++++++++++++++++++++++++------------
 1 file changed, 25 insertions(+), 12 deletions(-)

diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index 2f1ff7007280..dec0907dfb8a 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -864,10 +864,17 @@ static void __free_raid_bio(struct btrfs_raid_bio *rbio)
 	kfree(rbio);
 }
 
-static void free_raid_bio(struct btrfs_raid_bio *rbio)
+static void rbio_endio_bio_list(struct bio *cur, blk_status_t err)
 {
-	unlock_stripe(rbio);
-	__free_raid_bio(rbio);
+	struct bio *next;
+
+	while (cur) {
+		next = cur->bi_next;
+		cur->bi_next = NULL;
+		cur->bi_status = err;
+		bio_endio(cur);
+		cur = next;
+	}
 }
 
 /*
@@ -877,20 +884,26 @@ static void free_raid_bio(struct btrfs_raid_bio *rbio)
 static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, blk_status_t err)
 {
 	struct bio *cur = bio_list_get(&rbio->bio_list);
-	struct bio *next;
+	struct bio *extra;
 
 	if (rbio->generic_bio_cnt)
 		btrfs_bio_counter_sub(rbio->fs_info, rbio->generic_bio_cnt);
 
-	free_raid_bio(rbio);
+	/*
+	 * At this moment, rbio->bio_list is empty, however since rbio does not
+	 * always have RBIO_RMW_LOCKED_BIT set and rbio is still linked on the
+	 * hash list, rbio may be merged with others so that rbio->bio_list
+	 * becomes non-empty.
+	 * Once unlock_stripe() is done, rbio->bio_list will not be updated any
+	 * more and we can call bio_endio() on all queued bios.
+	 */
+	unlock_stripe(rbio);
+	extra = bio_list_get(&rbio->bio_list);
+	__free_raid_bio(rbio);
 
-	while (cur) {
-		next = cur->bi_next;
-		cur->bi_next = NULL;
-		cur->bi_status = err;
-		bio_endio(cur);
-		cur = next;
-	}
+	rbio_endio_bio_list(cur, err);
+	if (extra)
+		rbio_endio_bio_list(extra, err);
 }
 
 /*
-- 
cgit v1.2.3


From 3cbf26da5ec85f1d3177eee534e377c707ed89e1 Mon Sep 17 00:00:00 2001
From: Matthew Wilcox <mawilcox@microsoft.com>
Date: Wed, 17 Jan 2018 12:21:49 -0800
Subject: btrfs: Remove unused readahead spinlock

The reada_lock in struct btrfs_device was only initialised, and not
actually used.  That's good because there's another lock also called
reada_lock in the btrfs_fs_info that was quite heavily used.  Remove
this one.

Signed-off-by: Matthew Wilcox <mawilcox@microsoft.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/volumes.c | 1 -
 fs/btrfs/volumes.h | 1 -
 2 files changed, 2 deletions(-)

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 11fccf2fcb1f..5750cd9df417 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -319,7 +319,6 @@ static struct btrfs_device *__alloc_device(void)
 
 	spin_lock_init(&dev->io_lock);
 
-	spin_lock_init(&dev->reada_lock);
 	atomic_set(&dev->reada_in_flight, 0);
 	atomic_set(&dev->dev_stats_ccnt, 0);
 	btrfs_device_data_ordered_init(dev);
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 15216fed918b..28c28eeadff3 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -137,7 +137,6 @@ struct btrfs_device {
 	struct rcu_head rcu;
 
 	/* readahead state */
-	spinlock_t reada_lock;
 	atomic_t reada_in_flight;
 	u64 reada_next;
 	struct reada_zone *reada_curr_zone;
-- 
cgit v1.2.3


From a520a7e0b548690ea8eb77bb45800459351a7cf5 Mon Sep 17 00:00:00 2001
From: Liu Bo <bo.li.liu@oracle.com>
Date: Fri, 5 Jan 2018 12:51:08 -0700
Subject: Btrfs: fix incorrect block_len in merge_extent_mapping

%block_len could be checked on deciding if two em are mergeable.

merge_extent_mapping() has only added the front pad if the front part
of em gets truncated, but it's possible that the end part gets
truncated.

For both compressed extent and inline extent, em->block_len is not
adjusted accordingly, and for regular extent, em->block_len always
equals to em->len, hence this sets em->block_len with em->len.

Signed-off-by: Liu Bo <bo.li.liu@oracle.com>
Reviewed-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/inode.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 633b83b709a1..f9fd8b14ef08 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -6874,7 +6874,7 @@ static int merge_extent_mapping(struct extent_map_tree *em_tree,
 	if (em->block_start < EXTENT_MAP_LAST_BYTE &&
 	    !test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
 		em->block_start += start_diff;
-		em->block_len -= start_diff;
+		em->block_len = em->len;
 	}
 	return add_extent_mapping(em_tree, em, 0);
 }
-- 
cgit v1.2.3


From 18e83ac75bfe67009c4ddcdd581bba8eb16f4030 Mon Sep 17 00:00:00 2001
From: Liu Bo <bo.li.liu@oracle.com>
Date: Fri, 5 Jan 2018 12:51:09 -0700
Subject: Btrfs: fix unexpected EEXIST from btrfs_get_extent

This fixes a corner case that is caused by a race of dio write vs dio
read/write.

Here is how the race could happen.

Suppose that no extent map has been loaded into memory yet.
There is a file extent [0, 32K), two jobs are running concurrently
against it, t1 is doing dio write to [8K, 32K) and t2 is doing dio
read from [0, 4K) or [4K, 8K).

t1 goes ahead of t2 and splits em [0, 32K) to em [0K, 8K) and [8K 32K).

------------------------------------------------------
             t1                                t2
      btrfs_get_blocks_direct()         btrfs_get_blocks_direct()
       -> btrfs_get_extent()              -> btrfs_get_extent()
           -> lookup_extent_mapping()
           -> add_extent_mapping()            -> lookup_extent_mapping()
              # load [0, 32K)
       -> btrfs_new_extent_direct()
           -> btrfs_drop_extent_cache()
              # split [0, 32K) and
	      # drop [8K, 32K)
           -> add_extent_mapping()
              # add [8K, 32K)
                                              -> add_extent_mapping()
                                                 # handle -EEXIST when adding
                                                 # [0, 32K)
------------------------------------------------------
About how t2(dio read/write) runs into -EEXIST:

a) add_extent_mapping() gets -EEXIST for adding em [0, 32k),

b) search_extent_mapping() then returns [0, 8k) as the existing em,
   even though start == existing->start, em is [0, 32k) so that
   extent_map_end(em) > extent_map_end(existing), i.e. 32k > 8k,

c) then it goes thru merge_extent_mapping() which tries to add a [8k, 8k)
   (with a length 0) and returns -EEXIST as [8k, 32k) is already in tree,

d) so btrfs_get_extent() ends up returning -EEXIST to dio read/write,
   which is confusing applications.

Here I conclude all the possible situations,
1) start < existing->start

            +-----------+em+-----------+
+--prev---+ |     +-------------+      |
|         | |     |             |      |
+---------+ +     +---+existing++      ++
                +
                |
                +
             start

2) start == existing->start

      +------------em------------+
      |     +-------------+      |
      |     |             |      |
      +     +----existing-+      +
            |
            |
            +
         start

3) start > existing->start && start < (existing->start + existing->len)

      +------------em------------+
      |     +-------------+      |
      |     |             |      |
      +     +----existing-+      +
               |
               |
               +
             start

4) start >= (existing->start + existing->len)

+-----------+em+-----------+
|     +-------------+      | +--next---+
|     |             |      | |         |
+     +---+existing++      + +---------+
                      +
                      |
                      +
                   start

As we can see, it turns out that if start is within existing em (front
inclusive), then the existing em should be returned as is, otherwise,
we try our best to merge candidate em with sibling ems to form a
larger em (in order to reduce the total number of em).

Reported-by: David Vallender <david.vallender@landmark.co.uk>
Signed-off-by: Liu Bo <bo.li.liu@oracle.com>
Reviewed-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/inode.c | 17 +++--------------
 1 file changed, 3 insertions(+), 14 deletions(-)

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index f9fd8b14ef08..4ab713bd4139 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -7153,19 +7153,12 @@ insert:
 		 * existing will always be non-NULL, since there must be
 		 * extent causing the -EEXIST.
 		 */
-		if (existing->start == em->start &&
-		    extent_map_end(existing) >= extent_map_end(em) &&
-		    em->block_start == existing->block_start) {
-			/*
-			 * The existing extent map already encompasses the
-			 * entire extent map we tried to add.
-			 */
+		if (start >= existing->start &&
+		    start < extent_map_end(existing)) {
 			free_extent_map(em);
 			em = existing;
 			err = 0;
-
-		} else if (start >= extent_map_end(existing) ||
-		    start <= existing->start) {
+		} else {
 			/*
 			 * The existing extent map is the one nearest to
 			 * the [start, start + len) range which overlaps
@@ -7177,10 +7170,6 @@ insert:
 				free_extent_map(em);
 				em = NULL;
 			}
-		} else {
-			free_extent_map(em);
-			em = existing;
-			err = 0;
 		}
 	}
 	write_unlock(&em_tree->lock);
-- 
cgit v1.2.3


From 7b4df058b051fb67db61ea371f7d278131cb6e7b Mon Sep 17 00:00:00 2001
From: Liu Bo <bo.li.liu@oracle.com>
Date: Fri, 5 Jan 2018 12:51:10 -0700
Subject: Btrfs: add helper for em merge logic

This is a prepare work for the following extent map selftest, which
runs tests against em merge logic.

Signed-off-by: Liu Bo <bo.li.liu@oracle.com>
Reviewed-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/ctree.h |  2 ++
 fs/btrfs/inode.c | 80 ++++++++++++++++++++++++++++++++------------------------
 2 files changed, 48 insertions(+), 34 deletions(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 1a462ab85c49..1e05fc7e0e35 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -3143,6 +3143,8 @@ struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode,
 						    int delay_iput);
 void btrfs_wait_and_free_delalloc_work(struct btrfs_delalloc_work *work);
 
+int btrfs_add_extent_mapping(struct extent_map_tree *em_tree,
+			     struct extent_map **em_in, u64 start, u64 len);
 struct extent_map *btrfs_get_extent_fiemap(struct btrfs_inode *inode,
 		struct page *page, size_t pg_offset, u64 start,
 		u64 len, int create);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 4ab713bd4139..c6a05ee3d74b 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -6925,6 +6925,51 @@ static noinline int uncompress_inline(struct btrfs_path *path,
 	return ret;
 }
 
+int btrfs_add_extent_mapping(struct extent_map_tree *em_tree,
+			     struct extent_map **em_in, u64 start, u64 len)
+{
+	int ret;
+	struct extent_map *em = *em_in;
+
+	ret = add_extent_mapping(em_tree, em, 0);
+	/* it is possible that someone inserted the extent into the tree
+	 * while we had the lock dropped.  It is also possible that
+	 * an overlapping map exists in the tree
+	 */
+	if (ret == -EEXIST) {
+		struct extent_map *existing;
+
+		ret = 0;
+
+		existing = search_extent_mapping(em_tree, start, len);
+		/*
+		 * existing will always be non-NULL, since there must be
+		 * extent causing the -EEXIST.
+		 */
+		if (start >= existing->start &&
+		    start < extent_map_end(existing)) {
+			free_extent_map(em);
+			*em_in = existing;
+			ret = 0;
+		} else {
+			/*
+			 * The existing extent map is the one nearest to
+			 * the [start, start + len) range which overlaps
+			 */
+			ret = merge_extent_mapping(em_tree, existing,
+						   em, start);
+			free_extent_map(existing);
+			if (ret) {
+				free_extent_map(em);
+				*em_in = NULL;
+			}
+		}
+	}
+
+	ASSERT(ret == 0 || ret == -EEXIST);
+	return ret;
+}
+
 /*
  * a bit scary, this does extent mapping from logical file offset to the disk.
  * the ugly parts come from merging extents from the disk with the in-ram
@@ -7138,40 +7183,7 @@ insert:
 
 	err = 0;
 	write_lock(&em_tree->lock);
-	ret = add_extent_mapping(em_tree, em, 0);
-	/* it is possible that someone inserted the extent into the tree
-	 * while we had the lock dropped.  It is also possible that
-	 * an overlapping map exists in the tree
-	 */
-	if (ret == -EEXIST) {
-		struct extent_map *existing;
-
-		ret = 0;
-
-		existing = search_extent_mapping(em_tree, start, len);
-		/*
-		 * existing will always be non-NULL, since there must be
-		 * extent causing the -EEXIST.
-		 */
-		if (start >= existing->start &&
-		    start < extent_map_end(existing)) {
-			free_extent_map(em);
-			em = existing;
-			err = 0;
-		} else {
-			/*
-			 * The existing extent map is the one nearest to
-			 * the [start, start + len) range which overlaps
-			 */
-			err = merge_extent_mapping(em_tree, existing,
-						   em, start);
-			free_extent_map(existing);
-			if (err) {
-				free_extent_map(em);
-				em = NULL;
-			}
-		}
-	}
+	err = btrfs_add_extent_mapping(em_tree, &em, start, len);
 	write_unlock(&em_tree->lock);
 out:
 
-- 
cgit v1.2.3


From c04e61b5e41b0e8ace4aa4b67685fbe68ac37a46 Mon Sep 17 00:00:00 2001
From: Liu Bo <bo.li.liu@oracle.com>
Date: Fri, 5 Jan 2018 12:51:11 -0700
Subject: Btrfs: move extent map specific code to extent_map.c

These helpers are extent map specific, move them to extent_map.c.

Signed-off-by: Liu Bo <bo.li.liu@oracle.com>
Reviewed-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/ctree.h      |   2 -
 fs/btrfs/extent_map.c | 125 ++++++++++++++++++++++++++++++++++++++++++++++++++
 fs/btrfs/extent_map.h |   2 +
 fs/btrfs/inode.c      | 107 ------------------------------------------
 4 files changed, 127 insertions(+), 109 deletions(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 1e05fc7e0e35..1a462ab85c49 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -3143,8 +3143,6 @@ struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode,
 						    int delay_iput);
 void btrfs_wait_and_free_delalloc_work(struct btrfs_delalloc_work *work);
 
-int btrfs_add_extent_mapping(struct extent_map_tree *em_tree,
-			     struct extent_map **em_in, u64 start, u64 len);
 struct extent_map *btrfs_get_extent_fiemap(struct btrfs_inode *inode,
 		struct page *page, size_t pg_offset, u64 start,
 		u64 len, int create);
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 2e348fb0b280..6fe8b14e11cf 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -454,3 +454,128 @@ void replace_extent_mapping(struct extent_map_tree *tree,
 
 	setup_extent_mapping(tree, new, modified);
 }
+
+static struct extent_map *next_extent_map(struct extent_map *em)
+{
+	struct rb_node *next;
+
+	next = rb_next(&em->rb_node);
+	if (!next)
+		return NULL;
+	return container_of(next, struct extent_map, rb_node);
+}
+
+static struct extent_map *prev_extent_map(struct extent_map *em)
+{
+	struct rb_node *prev;
+
+	prev = rb_prev(&em->rb_node);
+	if (!prev)
+		return NULL;
+	return container_of(prev, struct extent_map, rb_node);
+}
+
+/* helper for btfs_get_extent.  Given an existing extent in the tree,
+ * the existing extent is the nearest extent to map_start,
+ * and an extent that you want to insert, deal with overlap and insert
+ * the best fitted new extent into the tree.
+ */
+static int merge_extent_mapping(struct extent_map_tree *em_tree,
+				struct extent_map *existing,
+				struct extent_map *em,
+				u64 map_start)
+{
+	struct extent_map *prev;
+	struct extent_map *next;
+	u64 start;
+	u64 end;
+	u64 start_diff;
+
+	BUG_ON(map_start < em->start || map_start >= extent_map_end(em));
+
+	if (existing->start > map_start) {
+		next = existing;
+		prev = prev_extent_map(next);
+	} else {
+		prev = existing;
+		next = next_extent_map(prev);
+	}
+
+	start = prev ? extent_map_end(prev) : em->start;
+	start = max_t(u64, start, em->start);
+	end = next ? next->start : extent_map_end(em);
+	end = min_t(u64, end, extent_map_end(em));
+	start_diff = start - em->start;
+	em->start = start;
+	em->len = end - start;
+	if (em->block_start < EXTENT_MAP_LAST_BYTE &&
+	    !test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
+		em->block_start += start_diff;
+		em->block_len = em->len;
+	}
+	return add_extent_mapping(em_tree, em, 0);
+}
+
+/**
+ * btrfs_add_extent_mapping - add extent mapping into em_tree
+ * @em_tree - the extent tree into which we want to insert the extent mapping
+ * @em_in   - extent we are inserting
+ * @start   - start of the logical range btrfs_get_extent() is requesting
+ * @len     - length of the logical range btrfs_get_extent() is requesting
+ *
+ * Note that @em_in's range may be different from [start, start+len),
+ * but they must be overlapped.
+ *
+ * Insert @em_in into @em_tree. In case there is an overlapping range, handle
+ * the -EEXIST by either:
+ * a) Returning the existing extent in @em_in if @start is within the
+ *    existing em.
+ * b) Merge the existing extent with @em_in passed in.
+ *
+ * Return 0 on success, otherwise -EEXIST.
+ *
+ */
+int btrfs_add_extent_mapping(struct extent_map_tree *em_tree,
+			     struct extent_map **em_in, u64 start, u64 len)
+{
+	int ret;
+	struct extent_map *em = *em_in;
+
+	ret = add_extent_mapping(em_tree, em, 0);
+	/* it is possible that someone inserted the extent into the tree
+	 * while we had the lock dropped.  It is also possible that
+	 * an overlapping map exists in the tree
+	 */
+	if (ret == -EEXIST) {
+		struct extent_map *existing;
+
+		ret = 0;
+
+		existing = search_extent_mapping(em_tree, start, len);
+		/*
+		 * existing will always be non-NULL, since there must be
+		 * extent causing the -EEXIST.
+		 */
+		if (start >= existing->start &&
+		    start < extent_map_end(existing)) {
+			free_extent_map(em);
+			*em_in = existing;
+			ret = 0;
+		} else {
+			/*
+			 * The existing extent map is the one nearest to
+			 * the [start, start + len) range which overlaps
+			 */
+			ret = merge_extent_mapping(em_tree, existing,
+						   em, start);
+			free_extent_map(existing);
+			if (ret) {
+				free_extent_map(em);
+				*em_in = NULL;
+			}
+		}
+	}
+
+	ASSERT(ret == 0 || ret == -EEXIST);
+	return ret;
+}
diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
index e9e285d45c7e..b29f77bc0732 100644
--- a/fs/btrfs/extent_map.h
+++ b/fs/btrfs/extent_map.h
@@ -91,4 +91,6 @@ int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len, u64 gen
 void clear_em_logging(struct extent_map_tree *tree, struct extent_map *em);
 struct extent_map *search_extent_mapping(struct extent_map_tree *tree,
 					 u64 start, u64 len);
+int btrfs_add_extent_mapping(struct extent_map_tree *em_tree,
+			     struct extent_map **em_in, u64 start, u64 len);
 #endif
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index c6a05ee3d74b..d55aef46e8ae 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -6817,68 +6817,6 @@ out_fail_inode:
 	goto out_fail;
 }
 
-/* Find next extent map of a given extent map, caller needs to ensure locks */
-static struct extent_map *next_extent_map(struct extent_map *em)
-{
-	struct rb_node *next;
-
-	next = rb_next(&em->rb_node);
-	if (!next)
-		return NULL;
-	return container_of(next, struct extent_map, rb_node);
-}
-
-static struct extent_map *prev_extent_map(struct extent_map *em)
-{
-	struct rb_node *prev;
-
-	prev = rb_prev(&em->rb_node);
-	if (!prev)
-		return NULL;
-	return container_of(prev, struct extent_map, rb_node);
-}
-
-/* helper for btfs_get_extent.  Given an existing extent in the tree,
- * the existing extent is the nearest extent to map_start,
- * and an extent that you want to insert, deal with overlap and insert
- * the best fitted new extent into the tree.
- */
-static int merge_extent_mapping(struct extent_map_tree *em_tree,
-				struct extent_map *existing,
-				struct extent_map *em,
-				u64 map_start)
-{
-	struct extent_map *prev;
-	struct extent_map *next;
-	u64 start;
-	u64 end;
-	u64 start_diff;
-
-	BUG_ON(map_start < em->start || map_start >= extent_map_end(em));
-
-	if (existing->start > map_start) {
-		next = existing;
-		prev = prev_extent_map(next);
-	} else {
-		prev = existing;
-		next = next_extent_map(prev);
-	}
-
-	start = prev ? extent_map_end(prev) : em->start;
-	start = max_t(u64, start, em->start);
-	end = next ? next->start : extent_map_end(em);
-	end = min_t(u64, end, extent_map_end(em));
-	start_diff = start - em->start;
-	em->start = start;
-	em->len = end - start;
-	if (em->block_start < EXTENT_MAP_LAST_BYTE &&
-	    !test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
-		em->block_start += start_diff;
-		em->block_len = em->len;
-	}
-	return add_extent_mapping(em_tree, em, 0);
-}
-
 static noinline int uncompress_inline(struct btrfs_path *path,
 				      struct page *page,
 				      size_t pg_offset, u64 extent_offset,
@@ -6925,51 +6863,6 @@ static noinline int uncompress_inline(struct btrfs_path *path,
 	return ret;
 }
 
-int btrfs_add_extent_mapping(struct extent_map_tree *em_tree,
-			     struct extent_map **em_in, u64 start, u64 len)
-{
-	int ret;
-	struct extent_map *em = *em_in;
-
-	ret = add_extent_mapping(em_tree, em, 0);
-	/* it is possible that someone inserted the extent into the tree
-	 * while we had the lock dropped.  It is also possible that
-	 * an overlapping map exists in the tree
-	 */
-	if (ret == -EEXIST) {
-		struct extent_map *existing;
-
-		ret = 0;
-
-		existing = search_extent_mapping(em_tree, start, len);
-		/*
-		 * existing will always be non-NULL, since there must be
-		 * extent causing the -EEXIST.
-		 */
-		if (start >= existing->start &&
-		    start < extent_map_end(existing)) {
-			free_extent_map(em);
-			*em_in = existing;
-			ret = 0;
-		} else {
-			/*
-			 * The existing extent map is the one nearest to
-			 * the [start, start + len) range which overlaps
-			 */
-			ret = merge_extent_mapping(em_tree, existing,
-						   em, start);
-			free_extent_map(existing);
-			if (ret) {
-				free_extent_map(em);
-				*em_in = NULL;
-			}
-		}
-	}
-
-	ASSERT(ret == 0 || ret == -EEXIST);
-	return ret;
-}
-
 /*
  * a bit scary, this does extent mapping from logical file offset to the disk.
  * the ugly parts come from merging extents from the disk with the in-ram
-- 
cgit v1.2.3


From 72b28077a20a6a1f14494602466c219241f45d89 Mon Sep 17 00:00:00 2001
From: Liu Bo <bo.li.liu@oracle.com>
Date: Fri, 5 Jan 2018 12:51:12 -0700
Subject: Btrfs: add extent map selftests

We've observed that btrfs_get_extent() and merge_extent_mapping() could
return -EEXIST in several cases, and they are caused by some racy
condition, e.g dio read vs dio write, which makes the problem very tricky
to reproduce.

This adds extent map selftests in order to simulate those racy situations.

Signed-off-by: Liu Bo <bo.li.liu@oracle.com>
Reviewed-by: Josef Bacik <jbacik@fb.com>
[ minor string adjustments ]
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/Makefile                 |   2 +-
 fs/btrfs/tests/btrfs-tests.c      |   3 +
 fs/btrfs/tests/btrfs-tests.h      |   1 +
 fs/btrfs/tests/extent-map-tests.c | 203 ++++++++++++++++++++++++++++++++++++++
 4 files changed, 208 insertions(+), 1 deletion(-)
 create mode 100644 fs/btrfs/tests/extent-map-tests.c

diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index 6fe881d5cb38..0c4373628eb4 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -19,4 +19,4 @@ btrfs-$(CONFIG_BTRFS_FS_REF_VERIFY) += ref-verify.o
 btrfs-$(CONFIG_BTRFS_FS_RUN_SANITY_TESTS) += tests/free-space-tests.o \
 	tests/extent-buffer-tests.o tests/btrfs-tests.o \
 	tests/extent-io-tests.o tests/inode-tests.o tests/qgroup-tests.o \
-	tests/free-space-tree-tests.o
+	tests/free-space-tree-tests.o tests/extent-map-tests.o
diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c
index d3f25376a0f8..9786d8cd0aa6 100644
--- a/fs/btrfs/tests/btrfs-tests.c
+++ b/fs/btrfs/tests/btrfs-tests.c
@@ -277,6 +277,9 @@ int btrfs_run_sanity_tests(void)
 				goto out;
 		}
 	}
+	ret = btrfs_test_extent_map();
+	if (ret)
+		goto out;
 out:
 	btrfs_destroy_test_fs();
 	return ret;
diff --git a/fs/btrfs/tests/btrfs-tests.h b/fs/btrfs/tests/btrfs-tests.h
index 266f1e3d1784..bc0615bac3cc 100644
--- a/fs/btrfs/tests/btrfs-tests.h
+++ b/fs/btrfs/tests/btrfs-tests.h
@@ -33,6 +33,7 @@ int btrfs_test_extent_io(u32 sectorsize, u32 nodesize);
 int btrfs_test_inodes(u32 sectorsize, u32 nodesize);
 int btrfs_test_qgroups(u32 sectorsize, u32 nodesize);
 int btrfs_test_free_space_tree(u32 sectorsize, u32 nodesize);
+int btrfs_test_extent_map(void);
 struct inode *btrfs_new_test_inode(void);
 struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(u32 nodesize, u32 sectorsize);
 void btrfs_free_dummy_fs_info(struct btrfs_fs_info *fs_info);
diff --git a/fs/btrfs/tests/extent-map-tests.c b/fs/btrfs/tests/extent-map-tests.c
new file mode 100644
index 000000000000..e6f0dd18392e
--- /dev/null
+++ b/fs/btrfs/tests/extent-map-tests.c
@@ -0,0 +1,203 @@
+/*
+ * Copyright (C) 2017 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/types.h>
+#include "btrfs-tests.h"
+#include "../ctree.h"
+
+static void free_extent_map_tree(struct extent_map_tree *em_tree)
+{
+	struct extent_map *em;
+	struct rb_node *node;
+
+	while (!RB_EMPTY_ROOT(&em_tree->map)) {
+		node = rb_first(&em_tree->map);
+		em = rb_entry(node, struct extent_map, rb_node);
+		remove_extent_mapping(em_tree, em);
+
+#ifdef CONFIG_BTRFS_DEBUG
+		if (refcount_read(&em->refs) != 1) {
+			test_msg(
+"em leak: em (start 0x%llx len 0x%llx block_start 0x%llx block_len 0x%llx) refs %d\n",
+				 em->start, em->len, em->block_start,
+				 em->block_len, refcount_read(&em->refs));
+
+			refcount_set(&em->refs, 1);
+		}
+#endif
+		free_extent_map(em);
+	}
+}
+
+/*
+ * Test scenario:
+ *
+ * Suppose that no extent map has been loaded into memory yet, there is a file
+ * extent [0, 16K), followed by another file extent [16K, 20K), two dio reads
+ * are entering btrfs_get_extent() concurrently, t1 is reading [8K, 16K), t2 is
+ * reading [0, 8K)
+ *
+ *     t1                            t2
+ *  btrfs_get_extent()              btrfs_get_extent()
+ *    -> lookup_extent_mapping()      ->lookup_extent_mapping()
+ *    -> add_extent_mapping(0, 16K)
+ *    -> return em
+ *                                    ->add_extent_mapping(0, 16K)
+ *                                    -> #handle -EEXIST
+ */
+static void test_case_1(struct extent_map_tree *em_tree)
+{
+	struct extent_map *em;
+	u64 start = 0;
+	u64 len = SZ_8K;
+	int ret;
+
+	em = alloc_extent_map();
+	if (!em)
+		/* Skip the test on error. */
+		return;
+
+	/* Add [0, 16K) */
+	em->start = 0;
+	em->len = SZ_16K;
+	em->block_start = 0;
+	em->block_len = SZ_16K;
+	ret = add_extent_mapping(em_tree, em, 0);
+	ASSERT(ret == 0);
+	free_extent_map(em);
+
+	/* Add [16K, 20K) following [0, 16K)  */
+	em = alloc_extent_map();
+	if (!em)
+		goto out;
+
+	em->start = SZ_16K;
+	em->len = SZ_4K;
+	em->block_start = SZ_32K; /* avoid merging */
+	em->block_len = SZ_4K;
+	ret = add_extent_mapping(em_tree, em, 0);
+	ASSERT(ret == 0);
+	free_extent_map(em);
+
+	em = alloc_extent_map();
+	if (!em)
+		goto out;
+
+	/* Add [0, 8K), should return [0, 16K) instead. */
+	em->start = start;
+	em->len = len;
+	em->block_start = start;
+	em->block_len = len;
+	ret = btrfs_add_extent_mapping(em_tree, &em, em->start, em->len);
+	if (ret)
+		test_msg("case1 [%llu %llu]: ret %d\n", start, start + len, ret);
+	if (em &&
+	    (em->start != 0 || extent_map_end(em) != SZ_16K ||
+	     em->block_start != 0 || em->block_len != SZ_16K))
+		test_msg(
+"case1 [%llu %llu]: ret %d return a wrong em (start %llu len %llu block_start %llu block_len %llu\n",
+			 start, start + len, ret, em->start, em->len,
+			 em->block_start, em->block_len);
+	free_extent_map(em);
+out:
+	/* free memory */
+	free_extent_map_tree(em_tree);
+}
+
+/*
+ * Test scenario:
+ *
+ * Reading the inline ending up with EEXIST, ie. read an inline
+ * extent and discard page cache and read it again.
+ */
+static void test_case_2(struct extent_map_tree *em_tree)
+{
+	struct extent_map *em;
+	int ret;
+
+	em = alloc_extent_map();
+	if (!em)
+		/* Skip the test on error. */
+		return;
+
+	/* Add [0, 1K) */
+	em->start = 0;
+	em->len = SZ_1K;
+	em->block_start = EXTENT_MAP_INLINE;
+	em->block_len = (u64)-1;
+	ret = add_extent_mapping(em_tree, em, 0);
+	ASSERT(ret == 0);
+	free_extent_map(em);
+
+	/* Add [4K, 4K) following [0, 1K)  */
+	em = alloc_extent_map();
+	if (!em)
+		goto out;
+
+	em->start = SZ_4K;
+	em->len = SZ_4K;
+	em->block_start = SZ_4K;
+	em->block_len = SZ_4K;
+	ret = add_extent_mapping(em_tree, em, 0);
+	ASSERT(ret == 0);
+	free_extent_map(em);
+
+	em = alloc_extent_map();
+	if (!em)
+		goto out;
+
+	/* Add [0, 1K) */
+	em->start = 0;
+	em->len = SZ_1K;
+	em->block_start = EXTENT_MAP_INLINE;
+	em->block_len = (u64)-1;
+	ret = btrfs_add_extent_mapping(em_tree, &em, em->start, em->len);
+	if (ret)
+		test_msg("case2 [0 1K]: ret %d\n", ret);
+	if (em &&
+	    (em->start != 0 || extent_map_end(em) != SZ_1K ||
+	     em->block_start != EXTENT_MAP_INLINE || em->block_len != (u64)-1))
+		test_msg(
+"case2 [0 1K]: ret %d return a wrong em (start %llu len %llu block_start %llu block_len %llu\n",
+			 ret, em->start, em->len, em->block_start,
+			 em->block_len);
+	free_extent_map(em);
+out:
+	/* free memory */
+	free_extent_map_tree(em_tree);
+}
+
+int btrfs_test_extent_map()
+{
+	struct extent_map_tree *em_tree;
+
+	test_msg("Running extent_map tests\n");
+
+	em_tree = kzalloc(sizeof(*em_tree), GFP_KERNEL);
+	if (!em_tree)
+		/* Skip the test on error. */
+		return 0;
+
+	extent_map_tree_init(em_tree);
+
+	test_case_1(em_tree);
+	test_case_2(em_tree);
+
+	kfree(em_tree);
+	return 0;
+}
-- 
cgit v1.2.3


From fd87526fada701295656b3c695ae20cb037fdd95 Mon Sep 17 00:00:00 2001
From: Liu Bo <bo.li.liu@oracle.com>
Date: Fri, 5 Jan 2018 12:51:13 -0700
Subject: Btrfs: extent map selftest: buffered write vs dio read

This test case simulates the racy situation of buffered write vs dio
read, and see if btrfs_get_extent() would return -EEXIST.

Signed-off-by: Liu Bo <bo.li.liu@oracle.com>
Reviewed-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/tests/extent-map-tests.c | 74 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 74 insertions(+)

diff --git a/fs/btrfs/tests/extent-map-tests.c b/fs/btrfs/tests/extent-map-tests.c
index e6f0dd18392e..16830bf88f46 100644
--- a/fs/btrfs/tests/extent-map-tests.c
+++ b/fs/btrfs/tests/extent-map-tests.c
@@ -182,6 +182,79 @@ out:
 	free_extent_map_tree(em_tree);
 }
 
+static void __test_case_3(struct extent_map_tree *em_tree, u64 start)
+{
+	struct extent_map *em;
+	u64 len = SZ_4K;
+	int ret;
+
+	em = alloc_extent_map();
+	if (!em)
+		/* Skip this test on error. */
+		return;
+
+	/* Add [4K, 8K) */
+	em->start = SZ_4K;
+	em->len = SZ_4K;
+	em->block_start = SZ_4K;
+	em->block_len = SZ_4K;
+	ret = add_extent_mapping(em_tree, em, 0);
+	ASSERT(ret == 0);
+	free_extent_map(em);
+
+	em = alloc_extent_map();
+	if (!em)
+		goto out;
+
+	/* Add [0, 16K) */
+	em->start = 0;
+	em->len = SZ_16K;
+	em->block_start = 0;
+	em->block_len = SZ_16K;
+	ret = btrfs_add_extent_mapping(em_tree, &em, start, len);
+	if (ret)
+		test_msg("case3 [0x%llx 0x%llx): ret %d\n",
+			 start, start + len, ret);
+	/*
+	 * Since bytes within em are contiguous, em->block_start is identical to
+	 * em->start.
+	 */
+	if (em &&
+	    (start < em->start || start + len > extent_map_end(em) ||
+	     em->start != em->block_start || em->len != em->block_len))
+		test_msg(
+"case3 [0x%llx 0x%llx): ret %d em (start 0x%llx len 0x%llx block_start 0x%llx block_len 0x%llx)\n",
+			 start, start + len, ret, em->start, em->len,
+			 em->block_start, em->block_len);
+	free_extent_map(em);
+out:
+	/* free memory */
+	free_extent_map_tree(em_tree);
+}
+
+/*
+ * Test scenario:
+ *
+ * Suppose that no extent map has been loaded into memory yet.
+ * There is a file extent [0, 16K), two jobs are running concurrently
+ * against it, t1 is buffered writing to [4K, 8K) and t2 is doing dio
+ * read from [0, 4K) or [8K, 12K) or [12K, 16K).
+ *
+ * t1 goes ahead of t2 and adds em [4K, 8K) into tree.
+ *
+ *         t1                       t2
+ *  cow_file_range()	     btrfs_get_extent()
+ *                            -> lookup_extent_mapping()
+ *   -> add_extent_mapping()
+ *                            -> add_extent_mapping()
+ */
+static void test_case_3(struct extent_map_tree *em_tree)
+{
+	__test_case_3(em_tree, 0);
+	__test_case_3(em_tree, SZ_8K);
+	__test_case_3(em_tree, (12 * 1024ULL));
+}
+
 int btrfs_test_extent_map()
 {
 	struct extent_map_tree *em_tree;
@@ -197,6 +270,7 @@ int btrfs_test_extent_map()
 
 	test_case_1(em_tree);
 	test_case_2(em_tree);
+	test_case_3(em_tree);
 
 	kfree(em_tree);
 	return 0;
-- 
cgit v1.2.3


From cd77f4f8363602e5fbee481f38241110e65ff014 Mon Sep 17 00:00:00 2001
From: Liu Bo <bo.li.liu@oracle.com>
Date: Fri, 5 Jan 2018 12:51:14 -0700
Subject: Btrfs: extent map selftest: dio write vs dio read

This test case simulates the racy situation of dio write vs dio read,
and see if btrfs_get_extent() would return -EEXIST.

Signed-off-by: Liu Bo <bo.li.liu@oracle.com>
Reviewed-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/tests/extent-map-tests.c | 89 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 89 insertions(+)

diff --git a/fs/btrfs/tests/extent-map-tests.c b/fs/btrfs/tests/extent-map-tests.c
index 16830bf88f46..70c993f01670 100644
--- a/fs/btrfs/tests/extent-map-tests.c
+++ b/fs/btrfs/tests/extent-map-tests.c
@@ -255,6 +255,94 @@ static void test_case_3(struct extent_map_tree *em_tree)
 	__test_case_3(em_tree, (12 * 1024ULL));
 }
 
+static void __test_case_4(struct extent_map_tree *em_tree, u64 start)
+{
+	struct extent_map *em;
+	u64 len = SZ_4K;
+	int ret;
+
+	em = alloc_extent_map();
+	if (!em)
+		/* Skip this test on error. */
+		return;
+
+	/* Add [0K, 8K) */
+	em->start = 0;
+	em->len = SZ_8K;
+	em->block_start = 0;
+	em->block_len = SZ_8K;
+	ret = add_extent_mapping(em_tree, em, 0);
+	ASSERT(ret == 0);
+	free_extent_map(em);
+
+	em = alloc_extent_map();
+	if (!em)
+		goto out;
+
+	/* Add [8K, 24K) */
+	em->start = SZ_8K;
+	em->len = 24 * 1024ULL;
+	em->block_start = SZ_16K; /* avoid merging */
+	em->block_len = 24 * 1024ULL;
+	ret = add_extent_mapping(em_tree, em, 0);
+	ASSERT(ret == 0);
+	free_extent_map(em);
+
+	em = alloc_extent_map();
+	if (!em)
+		goto out;
+	/* Add [0K, 32K) */
+	em->start = 0;
+	em->len = SZ_32K;
+	em->block_start = 0;
+	em->block_len = SZ_32K;
+	ret = btrfs_add_extent_mapping(em_tree, &em, start, len);
+	if (ret)
+		test_msg("case4 [0x%llx 0x%llx): ret %d\n",
+			 start, len, ret);
+	if (em &&
+	    (start < em->start || start + len > extent_map_end(em)))
+		test_msg(
+"case4 [0x%llx 0x%llx): ret %d, added wrong em (start 0x%llx len 0x%llx block_start 0x%llx block_len 0x%llx)\n",
+			 start, len, ret, em->start, em->len, em->block_start,
+			 em->block_len);
+	free_extent_map(em);
+out:
+	/* free memory */
+	free_extent_map_tree(em_tree);
+}
+
+/*
+ * Test scenario:
+ *
+ * Suppose that no extent map has been loaded into memory yet.
+ * There is a file extent [0, 32K), two jobs are running concurrently
+ * against it, t1 is doing dio write to [8K, 32K) and t2 is doing dio
+ * read from [0, 4K) or [4K, 8K).
+ *
+ * t1 goes ahead of t2 and splits em [0, 32K) to em [0K, 8K) and [8K 32K).
+ *
+ *         t1                                t2
+ *  btrfs_get_blocks_direct()	       btrfs_get_blocks_direct()
+ *   -> btrfs_get_extent()              -> btrfs_get_extent()
+ *       -> lookup_extent_mapping()
+ *       -> add_extent_mapping()            -> lookup_extent_mapping()
+ *          # load [0, 32K)
+ *   -> btrfs_new_extent_direct()
+ *       -> btrfs_drop_extent_cache()
+ *          # split [0, 32K)
+ *       -> add_extent_mapping()
+ *          # add [8K, 32K)
+ *                                          -> add_extent_mapping()
+ *                                             # handle -EEXIST when adding
+ *                                             # [0, 32K)
+ */
+static void test_case_4(struct extent_map_tree *em_tree)
+{
+	__test_case_4(em_tree, 0);
+	__test_case_4(em_tree, SZ_4K);
+}
+
 int btrfs_test_extent_map()
 {
 	struct extent_map_tree *em_tree;
@@ -271,6 +359,7 @@ int btrfs_test_extent_map()
 	test_case_1(em_tree);
 	test_case_2(em_tree);
 	test_case_3(em_tree);
+	test_case_4(em_tree);
 
 	kfree(em_tree);
 	return 0;
-- 
cgit v1.2.3


From 9a7e10e7ba66ce23c8fdc1cac18cade7a0f6840d Mon Sep 17 00:00:00 2001
From: Liu Bo <bo.li.liu@oracle.com>
Date: Fri, 5 Jan 2018 12:51:15 -0700
Subject: Btrfs: add WARN_ONCE to detect unexpected error from
 merge_extent_mapping

This is a subtle case, so in order to understand the problem, it'd be good
to know the content of existing and em when any error occurs.

Signed-off-by: Liu Bo <bo.li.liu@oracle.com>
Reviewed-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/extent_map.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 6fe8b14e11cf..914662428dbd 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -562,17 +562,24 @@ int btrfs_add_extent_mapping(struct extent_map_tree *em_tree,
 			*em_in = existing;
 			ret = 0;
 		} else {
+			u64 orig_start = em->start;
+			u64 orig_len = em->len;
+
 			/*
 			 * The existing extent map is the one nearest to
 			 * the [start, start + len) range which overlaps
 			 */
 			ret = merge_extent_mapping(em_tree, existing,
 						   em, start);
-			free_extent_map(existing);
 			if (ret) {
 				free_extent_map(em);
 				*em_in = NULL;
+				WARN_ONCE(ret,
+"unexpected error %d: merge existing(start %llu len %llu) with em(start %llu len %llu)\n",
+					  ret, existing->start, existing->len,
+					  orig_start, orig_len);
 			}
+			free_extent_map(existing);
 		}
 	}
 
-- 
cgit v1.2.3


From 5f4791f4a6479cdeb8caae1a3e5895c8a6e99c09 Mon Sep 17 00:00:00 2001
From: Liu Bo <bo.li.liu@oracle.com>
Date: Fri, 5 Jan 2018 12:51:17 -0700
Subject: Btrfs: noinline merge_extent_mapping

In order to debug subtle bugs around merge_extent_mapping(), perf probe
can be used to check the arguments, but sometimes merge_extent_mapping()
got inlined by compiler and couldn't be probed.

This is adding noinline attribute to merge_extent_mapping().

Signed-off-by: Liu Bo <bo.li.liu@oracle.com>
Reviewed-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/extent_map.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 914662428dbd..d3bd02105d1c 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -480,10 +480,10 @@ static struct extent_map *prev_extent_map(struct extent_map *em)
  * and an extent that you want to insert, deal with overlap and insert
  * the best fitted new extent into the tree.
  */
-static int merge_extent_mapping(struct extent_map_tree *em_tree,
-				struct extent_map *existing,
-				struct extent_map *em,
-				u64 map_start)
+static noinline int merge_extent_mapping(struct extent_map_tree *em_tree,
+					 struct extent_map *existing,
+					 struct extent_map *em,
+					 u64 map_start)
 {
 	struct extent_map *prev;
 	struct extent_map *next;
-- 
cgit v1.2.3


From b03ebd992ff329889d2e1f342cb75ee3ebc21adc Mon Sep 17 00:00:00 2001
From: Nikolay Borisov <nborisov@suse.com>
Date: Thu, 18 Jan 2018 14:47:06 +0200
Subject: btrfs: Use IS_ALIGNED in btrfs_truncate_block instead of opencoding
 it

No functional changes, just makes the code more readable

Signed-off-by: Nikolay Borisov <nborisov@suse.com>
Reviewed-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/inode.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index d55aef46e8ae..c5f31817778b 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -4769,8 +4769,8 @@ int btrfs_truncate_block(struct inode *inode, loff_t from, loff_t len,
 	u64 block_start;
 	u64 block_end;
 
-	if ((offset & (blocksize - 1)) == 0 &&
-	    (!len || ((len & (blocksize - 1)) == 0)))
+	if (IS_ALIGNED(offset, blocksize) &&
+	    (!len || IS_ALIGNED(len, blocksize)))
 		goto out;
 
 	block_start = round_down(from, blocksize);
-- 
cgit v1.2.3


From a848b3e547d167692483ff08622bb56dd3e4a9d6 Mon Sep 17 00:00:00 2001
From: Anand Jain <anand.jain@oracle.com>
Date: Thu, 18 Jan 2018 22:00:33 +0800
Subject: btrfs: no need to check for btrfs_fs_devices::seeding

There is no need to check for btrfs_fs_devices::seeding when we
have checked for btrfs_fs_devices::opened, because we can't sprout
without its seed FS being opened.

Signed-off-by: Anand Jain <anand.jain@oracle.com>
Reviewed-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/volumes.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 5750cd9df417..011bb9b116e6 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -618,8 +618,6 @@ static void btrfs_free_stale_device(struct btrfs_device *cur_dev)
 
 		if (fs_devs->opened)
 			continue;
-		if (fs_devs->seeding)
-			continue;
 
 		list_for_each_entry(dev, &fs_devs->devices, dev_list) {
 
-- 
cgit v1.2.3


From 38cf665d338fca33af4b16f9ec7cad6637fc0fec Mon Sep 17 00:00:00 2001
From: Anand Jain <Anand.Jain@oracle.com>
Date: Thu, 18 Jan 2018 22:00:34 +0800
Subject: btrfs: make btrfs_free_stale_device() to iterate all stales

Let the list iterator iterate further and find other stale
devices and delete it. This is in preparation to add support
for user land request-able stale devices cleanup. Also rename
btrfs_free_stale_device() to btrfs_free_stale_devices().

Signed-off-by: Anand Jain <anand.jain@oracle.com>
Reviewed-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/volumes.c | 24 +++++++++++-------------
 1 file changed, 11 insertions(+), 13 deletions(-)

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 011bb9b116e6..207014fce7ea 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -605,21 +605,22 @@ static void pending_bios_fn(struct btrfs_work *work)
 }
 
 
-static void btrfs_free_stale_device(struct btrfs_device *cur_dev)
+static void btrfs_free_stale_devices(struct btrfs_device *cur_dev)
 {
-	struct btrfs_fs_devices *fs_devs;
-	struct btrfs_device *dev;
+	struct btrfs_fs_devices *fs_devs, *tmp_fs_devs;
+	struct btrfs_device *dev, *tmp_dev;
 
 	if (!cur_dev->name)
 		return;
 
-	list_for_each_entry(fs_devs, &fs_uuids, list) {
-		int del = 1;
+	list_for_each_entry_safe(fs_devs, tmp_fs_devs, &fs_uuids, list) {
 
 		if (fs_devs->opened)
 			continue;
 
-		list_for_each_entry(dev, &fs_devs->devices, dev_list) {
+		list_for_each_entry_safe(dev, tmp_dev,
+					 &fs_devs->devices, dev_list) {
+			int not_found;
 
 			if (dev == cur_dev)
 				continue;
@@ -633,14 +634,12 @@ static void btrfs_free_stale_device(struct btrfs_device *cur_dev)
 			 * either use mapper or non mapper path throughout.
 			 */
 			rcu_read_lock();
-			del = strcmp(rcu_str_deref(dev->name),
+			not_found = strcmp(rcu_str_deref(dev->name),
 						rcu_str_deref(cur_dev->name));
 			rcu_read_unlock();
-			if (!del)
-				break;
-		}
+			if (not_found)
+				continue;
 
-		if (!del) {
 			/* delete the stale device */
 			if (fs_devs->num_devices == 1) {
 				btrfs_sysfs_remove_fsid(fs_devs);
@@ -651,7 +650,6 @@ static void btrfs_free_stale_device(struct btrfs_device *cur_dev)
 				list_del(&dev->dev_list);
 				free_device(dev);
 			}
-			break;
 		}
 	}
 }
@@ -779,7 +777,7 @@ static noinline int device_list_add(const char *path,
 
 		ret = 1;
 		device->fs_devices = fs_devices;
-		btrfs_free_stale_device(device);
+		btrfs_free_stale_devices(device);
 	} else if (!device->name || strcmp(device->name->str, path)) {
 		/*
 		 * When FS is already mounted.
-- 
cgit v1.2.3


From 522f1b45e41d893e4d1debd7f80c1e25e060c137 Mon Sep 17 00:00:00 2001
From: Anand Jain <Anand.Jain@oracle.com>
Date: Thu, 18 Jan 2018 22:00:35 +0800
Subject: btrfs: make btrfs_free_stale_devices() argument optional

This updates btrfs_free_stale_devices() helper function to delete all
unmouted devices, when arg is NULL.

Signed-off-by: Anand Jain <anand.jain@oracle.com>
Reviewed-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/volumes.c | 14 +++++---------
 1 file changed, 5 insertions(+), 9 deletions(-)

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 207014fce7ea..9712377862d9 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -610,9 +610,6 @@ static void btrfs_free_stale_devices(struct btrfs_device *cur_dev)
 	struct btrfs_fs_devices *fs_devs, *tmp_fs_devs;
 	struct btrfs_device *dev, *tmp_dev;
 
-	if (!cur_dev->name)
-		return;
-
 	list_for_each_entry_safe(fs_devs, tmp_fs_devs, &fs_uuids, list) {
 
 		if (fs_devs->opened)
@@ -620,11 +617,9 @@ static void btrfs_free_stale_devices(struct btrfs_device *cur_dev)
 
 		list_for_each_entry_safe(dev, tmp_dev,
 					 &fs_devs->devices, dev_list) {
-			int not_found;
+			int not_found = 0;
 
-			if (dev == cur_dev)
-				continue;
-			if (!dev->name)
+			if (cur_dev && (cur_dev == dev || !dev->name))
 				continue;
 
 			/*
@@ -634,8 +629,9 @@ static void btrfs_free_stale_devices(struct btrfs_device *cur_dev)
 			 * either use mapper or non mapper path throughout.
 			 */
 			rcu_read_lock();
-			not_found = strcmp(rcu_str_deref(dev->name),
-						rcu_str_deref(cur_dev->name));
+			if (cur_dev)
+				not_found = strcmp(rcu_str_deref(dev->name),
+						   rcu_str_deref(cur_dev->name));
 			rcu_read_unlock();
 			if (not_found)
 				continue;
-- 
cgit v1.2.3


From 0d34097f66831843a0e27f65fed23c41d07d44aa Mon Sep 17 00:00:00 2001
From: Anand Jain <anand.jain@oracle.com>
Date: Thu, 18 Jan 2018 22:00:36 +0800
Subject: btrfs: rename btrfs_free_stale_devices() arg to skip_dev

No functional changes.
Rename btrfs_free_stale_devices() arg to skip_dev, so that it
reflects what that arg for.

Signed-off-by: Anand Jain <anand.jain@oracle.com>
Reviewed-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/volumes.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 9712377862d9..7d33bd2a0fb4 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -605,7 +605,7 @@ static void pending_bios_fn(struct btrfs_work *work)
 }
 
 
-static void btrfs_free_stale_devices(struct btrfs_device *cur_dev)
+static void btrfs_free_stale_devices(struct btrfs_device *skip_dev)
 {
 	struct btrfs_fs_devices *fs_devs, *tmp_fs_devs;
 	struct btrfs_device *dev, *tmp_dev;
@@ -619,7 +619,7 @@ static void btrfs_free_stale_devices(struct btrfs_device *cur_dev)
 					 &fs_devs->devices, dev_list) {
 			int not_found = 0;
 
-			if (cur_dev && (cur_dev == dev || !dev->name))
+			if (skip_dev && (skip_dev == dev || !dev->name))
 				continue;
 
 			/*
@@ -629,9 +629,9 @@ static void btrfs_free_stale_devices(struct btrfs_device *cur_dev)
 			 * either use mapper or non mapper path throughout.
 			 */
 			rcu_read_lock();
-			if (cur_dev)
+			if (skip_dev)
 				not_found = strcmp(rcu_str_deref(dev->name),
-						   rcu_str_deref(cur_dev->name));
+						   rcu_str_deref(skip_dev->name));
 			rcu_read_unlock();
 			if (not_found)
 				continue;
-- 
cgit v1.2.3


From d8367db30a101123f82a9408c8c6dd3fb287653f Mon Sep 17 00:00:00 2001
From: Anand Jain <Anand.Jain@oracle.com>
Date: Thu, 18 Jan 2018 22:00:37 +0800
Subject: btrfs: make btrfs_free_stale_devices() to match the path

The btrfs_free_stale_devices() is updated to match for the given device
path and delete it. (It searches for only unmounted list of devices.)
Also drop the comment about different path being used for the same
device, since now we will have cli to clean any device that's not a
concern any more.

Signed-off-by: Anand Jain <anand.jain@oracle.com>
Reviewed-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/volumes.c | 28 ++++++++++++++++------------
 1 file changed, 16 insertions(+), 12 deletions(-)

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 7d33bd2a0fb4..fe5ecc1d84ad 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -604,8 +604,16 @@ static void pending_bios_fn(struct btrfs_work *work)
 	run_scheduled_bios(device);
 }
 
-
-static void btrfs_free_stale_devices(struct btrfs_device *skip_dev)
+/*
+ *  Search and remove all stale (devices which are not mounted) devices.
+ *  When both inputs are NULL, it will search and release all stale devices.
+ *  path:	Optional. When provided will it release all unmounted devices
+ *		matching this path only.
+ *  skip_dev:	Optional. Will skip this device when searching for the stale
+ *		devices.
+ */
+static void btrfs_free_stale_devices(const char *path,
+				     struct btrfs_device *skip_dev)
 {
 	struct btrfs_fs_devices *fs_devs, *tmp_fs_devs;
 	struct btrfs_device *dev, *tmp_dev;
@@ -619,19 +627,15 @@ static void btrfs_free_stale_devices(struct btrfs_device *skip_dev)
 					 &fs_devs->devices, dev_list) {
 			int not_found = 0;
 
-			if (skip_dev && (skip_dev == dev || !dev->name))
+			if (skip_dev && skip_dev == dev)
+				continue;
+			if (path && !dev->name)
 				continue;
 
-			/*
-			 * Todo: This won't be enough. What if the same device
-			 * comes back (with new uuid and) with its mapper path?
-			 * But for now, this does help as mostly an admin will
-			 * either use mapper or non mapper path throughout.
-			 */
 			rcu_read_lock();
-			if (skip_dev)
+			if (path)
 				not_found = strcmp(rcu_str_deref(dev->name),
-						   rcu_str_deref(skip_dev->name));
+						   path);
 			rcu_read_unlock();
 			if (not_found)
 				continue;
@@ -773,7 +777,7 @@ static noinline int device_list_add(const char *path,
 
 		ret = 1;
 		device->fs_devices = fs_devices;
-		btrfs_free_stale_devices(device);
+		btrfs_free_stale_devices(path, device);
 	} else if (!device->name || strcmp(device->name->str, path)) {
 		/*
 		 * When FS is already mounted.
-- 
cgit v1.2.3


From 327f18cc7f44bc09170200514f42e61bce25d022 Mon Sep 17 00:00:00 2001
From: Anand Jain <anand.jain@oracle.com>
Date: Thu, 18 Jan 2018 22:02:33 +0800
Subject: btrfs: move pr_info into device_list_add

Commit 60999ca4b403 ("btrfs: make device scan less noisy")
adds return value 1 to device_list_add(), so that parent function can
call pr_info only when new device is added. Move the pr_info() part
into device_list_add() so that this function can be kept simple.

Signed-off-by: Anand Jain <anand.jain@oracle.com>
Reviewed-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/volumes.c | 26 ++++++++++----------------
 1 file changed, 10 insertions(+), 16 deletions(-)

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index fe5ecc1d84ad..46228e9312b2 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -724,8 +724,7 @@ error_brelse:
  * Add new device to list of registered devices
  *
  * Returns:
- * 1   - first time device is seen
- * 0   - device already known
+ * 0   - device already known or newly added
  * < 0 - error
  */
 static noinline int device_list_add(const char *path,
@@ -735,7 +734,6 @@ static noinline int device_list_add(const char *path,
 	struct btrfs_device *device;
 	struct btrfs_fs_devices *fs_devices;
 	struct rcu_string *name;
-	int ret = 0;
 	u64 found_transid = btrfs_super_generation(disk_super);
 
 	fs_devices = find_fsid(disk_super->fsid);
@@ -775,9 +773,16 @@ static noinline int device_list_add(const char *path,
 		fs_devices->num_devices++;
 		mutex_unlock(&fs_devices->device_list_mutex);
 
-		ret = 1;
 		device->fs_devices = fs_devices;
 		btrfs_free_stale_devices(path, device);
+
+		if (disk_super->label[0])
+			pr_info("BTRFS: device label %s devid %llu transid %llu %s\n",
+				disk_super->label, devid, found_transid, path);
+		else
+			pr_info("BTRFS: device fsid %pU devid %llu transid %llu %s\n",
+				disk_super->fsid, devid, found_transid, path);
+
 	} else if (!device->name || strcmp(device->name->str, path)) {
 		/*
 		 * When FS is already mounted.
@@ -838,7 +843,7 @@ static noinline int device_list_add(const char *path,
 
 	*fs_devices_ret = fs_devices;
 
-	return ret;
+	return 0;
 }
 
 static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
@@ -1177,7 +1182,6 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
 	struct page *page;
 	int ret;
 	u64 devid;
-	u64 transid;
 	u64 total_devices;
 	u64 bytenr;
 
@@ -1203,19 +1207,9 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
 	}
 
 	devid = btrfs_stack_device_id(&disk_super->dev_item);
-	transid = btrfs_super_generation(disk_super);
 	total_devices = btrfs_super_num_devices(disk_super);
 
 	ret = device_list_add(path, disk_super, devid, fs_devices_ret);
-	if (ret > 0) {
-		if (disk_super->label[0])
-			pr_info("BTRFS: device label %s ", disk_super->label);
-		else
-			pr_info("BTRFS: device fsid %pU ", disk_super->fsid);
-
-		pr_cont("devid %llu transid %llu %s\n", devid, transid, path);
-		ret = 0;
-	}
 	if (!ret && fs_devices_ret)
 		(*fs_devices_ret)->total_devices = total_devices;
 
-- 
cgit v1.2.3


From f2788d2f76cafef160e959999624c6a87e71d26d Mon Sep 17 00:00:00 2001
From: Anand Jain <anand.jain@oracle.com>
Date: Thu, 18 Jan 2018 22:02:34 +0800
Subject: btrfs: set the total_devices in device_list_add()

There is no other parent for device_list_add() except for
btrfs_scan_one_device(), which would set btrfs_fs_devices::total_devices
if device_list_add is successful and this can be done with in
device_list_add() itself.

Signed-off-by: Anand Jain <anand.jain@oracle.com>
Reviewed-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/volumes.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 46228e9312b2..4224a735493b 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -841,6 +841,8 @@ static noinline int device_list_add(const char *path,
 	if (!fs_devices->opened)
 		device->generation = found_transid;
 
+	fs_devices->total_devices = btrfs_super_num_devices(disk_super);
+
 	*fs_devices_ret = fs_devices;
 
 	return 0;
@@ -1182,7 +1184,6 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
 	struct page *page;
 	int ret;
 	u64 devid;
-	u64 total_devices;
 	u64 bytenr;
 
 	/*
@@ -1207,11 +1208,8 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
 	}
 
 	devid = btrfs_stack_device_id(&disk_super->dev_item);
-	total_devices = btrfs_super_num_devices(disk_super);
 
 	ret = device_list_add(path, disk_super, devid, fs_devices_ret);
-	if (!ret && fs_devices_ret)
-		(*fs_devices_ret)->total_devices = total_devices;
 
 	btrfs_release_disk_super(page);
 
-- 
cgit v1.2.3


From e124ece53edf3a80d594fa537d5c0a37b26f4eb7 Mon Sep 17 00:00:00 2001
From: Anand Jain <anand.jain@oracle.com>
Date: Thu, 18 Jan 2018 22:02:35 +0800
Subject: btrfs: get device pointer from device_list_add()

Instead of pointer to btrfs_fs_devices as an arg in device_list_add()
better to get pointer to btrfs_device as return value, then we have
both, pointer to btrfs_device and btrfs_fs_devices. btrfs_device is
needed to handle reappearing missing device.

Signed-off-by: Anand Jain <anand.jain@oracle.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/volumes.c | 34 ++++++++++++++++++----------------
 1 file changed, 18 insertions(+), 16 deletions(-)

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 4224a735493b..34c889a66240 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -724,12 +724,11 @@ error_brelse:
  * Add new device to list of registered devices
  *
  * Returns:
- * 0   - device already known or newly added
- * < 0 - error
+ * device pointer which was just added or updated when successful
+ * error pointer when failed
  */
-static noinline int device_list_add(const char *path,
-			   struct btrfs_super_block *disk_super,
-			   u64 devid, struct btrfs_fs_devices **fs_devices_ret)
+static noinline struct btrfs_device *device_list_add(const char *path,
+			   struct btrfs_super_block *disk_super, u64 devid)
 {
 	struct btrfs_device *device;
 	struct btrfs_fs_devices *fs_devices;
@@ -740,7 +739,7 @@ static noinline int device_list_add(const char *path,
 	if (!fs_devices) {
 		fs_devices = alloc_fs_devices(disk_super->fsid);
 		if (IS_ERR(fs_devices))
-			return PTR_ERR(fs_devices);
+			return ERR_CAST(fs_devices);
 
 		list_add(&fs_devices->list, &fs_uuids);
 
@@ -752,19 +751,19 @@ static noinline int device_list_add(const char *path,
 
 	if (!device) {
 		if (fs_devices->opened)
-			return -EBUSY;
+			return ERR_PTR(-EBUSY);
 
 		device = btrfs_alloc_device(NULL, &devid,
 					    disk_super->dev_item.uuid);
 		if (IS_ERR(device)) {
 			/* we can safely leave the fs_devices entry around */
-			return PTR_ERR(device);
+			return device;
 		}
 
 		name = rcu_string_strdup(path, GFP_NOFS);
 		if (!name) {
 			free_device(device);
-			return -ENOMEM;
+			return ERR_PTR(-ENOMEM);
 		}
 		rcu_assign_pointer(device->name, name);
 
@@ -818,12 +817,12 @@ static noinline int device_list_add(const char *path,
 			 * with larger generation number or the last-in if
 			 * generation are equal.
 			 */
-			return -EEXIST;
+			return ERR_PTR(-EEXIST);
 		}
 
 		name = rcu_string_strdup(path, GFP_NOFS);
 		if (!name)
-			return -ENOMEM;
+			return ERR_PTR(-ENOMEM);
 		rcu_string_free(device->name);
 		rcu_assign_pointer(device->name, name);
 		if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) {
@@ -843,9 +842,7 @@ static noinline int device_list_add(const char *path,
 
 	fs_devices->total_devices = btrfs_super_num_devices(disk_super);
 
-	*fs_devices_ret = fs_devices;
-
-	return 0;
+	return device;
 }
 
 static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
@@ -1180,9 +1177,10 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
 			  struct btrfs_fs_devices **fs_devices_ret)
 {
 	struct btrfs_super_block *disk_super;
+	struct btrfs_device *device;
 	struct block_device *bdev;
 	struct page *page;
-	int ret;
+	int ret = 0;
 	u64 devid;
 	u64 bytenr;
 
@@ -1209,7 +1207,11 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
 
 	devid = btrfs_stack_device_id(&disk_super->dev_item);
 
-	ret = device_list_add(path, disk_super, devid, fs_devices_ret);
+	device = device_list_add(path, disk_super, devid);
+	if (IS_ERR(device))
+		ret = PTR_ERR(device);
+	else
+		*fs_devices_ret = device->fs_devices;
 
 	btrfs_release_disk_super(page);
 
-- 
cgit v1.2.3


From 3acbcbfc8f06d4ade2aab2ebba0a2542a05ce90c Mon Sep 17 00:00:00 2001
From: Anand Jain <anand.jain@oracle.com>
Date: Thu, 18 Jan 2018 22:02:36 +0800
Subject: btrfs: drop devid as device_list_add() arg

As struct btrfs_disk_super is being passed, so it can get devid
the same way its parent does.

Signed-off-by: Anand Jain <anand.jain@oracle.com>
Reviewed-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/volumes.c | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 34c889a66240..b5036bd69e6a 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -728,12 +728,13 @@ error_brelse:
  * error pointer when failed
  */
 static noinline struct btrfs_device *device_list_add(const char *path,
-			   struct btrfs_super_block *disk_super, u64 devid)
+			   struct btrfs_super_block *disk_super)
 {
 	struct btrfs_device *device;
 	struct btrfs_fs_devices *fs_devices;
 	struct rcu_string *name;
 	u64 found_transid = btrfs_super_generation(disk_super);
+	u64 devid = btrfs_stack_device_id(&disk_super->dev_item);
 
 	fs_devices = find_fsid(disk_super->fsid);
 	if (!fs_devices) {
@@ -1181,7 +1182,6 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
 	struct block_device *bdev;
 	struct page *page;
 	int ret = 0;
-	u64 devid;
 	u64 bytenr;
 
 	/*
@@ -1205,9 +1205,7 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
 		goto error_bdev_put;
 	}
 
-	devid = btrfs_stack_device_id(&disk_super->dev_item);
-
-	device = device_list_add(path, disk_super, devid);
+	device = device_list_add(path, disk_super);
 	if (IS_ERR(device))
 		ret = PTR_ERR(device);
 	else
-- 
cgit v1.2.3