Merge tag 'for-4.18-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux
author Linus Torvalds <torvalds@linux-foundation.org>
Mon, 4 Jun 2018 21:29:13 +0000 (14:29 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Mon, 4 Jun 2018 21:29:13 +0000 (14:29 -0700)
Pull btrfs updates from David Sterba:
 "User visible features:

   - added support for the ioctl FS_IOC_FSGETXATTR, per-inode flags,
     successor of GET/SETFLAGS; currently supports only the existing
     flags: append, immutable, noatime, nodump, sync

   - 3 new unprivileged ioctls to allow users to enumerate subvolumes

   - dedupe syscall implementation no longer restricts the range to
     16MiB, though it still splits the whole range into 16MiB chunks

   - on user demand, rmdir() is able to delete an empty subvolume,
     export the capability in sysfs

   - fix inode number types in tracepoints, other cleanups

   - send: improved speed when dealing with a large removed directory,
     measurements show decrease from 2000 minutes to 2 minutes on a
     directory with 2 million entries

   - pre-commit check of superblock to detect a mysterious in-memory
     corruption

   - log message updates

  Other changes:

   - orphan inode cleanup improved, does not keep long-standing
     reservations that could lead to early ENOSPC in some cases

   - slight improvement of handling snapshotted NOCOW files by avoiding
     some unnecessary tree searches

   - avoid OOM when dealing with many unmergeable small extents at flush
     time

   - speedup conversion of free space tree representations from/to
     bitmap/tree

   - code refactoring, deletion, cleanups:
      + delayed refs
      + delayed iput
      + redundant argument removals
      + memory barrier cleanups
      + remove a redundant mutex that supposedly prevented several
        ioctls from running in parallel

   - new tracepoints for blockgroup manipulation

   - more sanity checks of compressed headers"

* tag 'for-4.18-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux: (183 commits)
  btrfs: Add unprivileged version of ino_lookup ioctl
  btrfs: Add unprivileged ioctl which returns subvolume's ROOT_REF
  btrfs: Add unprivileged ioctl which returns subvolume information
  Btrfs: clean up error handling in btrfs_truncate()
  btrfs: Factor out write portion of btrfs_get_blocks_direct
  btrfs: Factor out read portion of btrfs_get_blocks_direct
  btrfs: return ENOMEM if path allocation fails in btrfs_cross_ref_exist
  btrfs: raid56: Remove VLA usage
  btrfs: return error value if create_io_em failed in cow_file_range
  btrfs: drop useless member qgroup_reserved of btrfs_pending_snapshot
  btrfs: drop unused parameter qgroup_reserved
  btrfs: balance dirty metadata pages in btrfs_finish_ordered_io
  btrfs: lift some btrfs_cross_ref_exist checks in nocow path
  btrfs: Remove fs_info argument from btrfs_uuid_tree_rem
  btrfs: Remove fs_info argument from btrfs_uuid_tree_add
  Btrfs: remove unused check of skip_locking
  Btrfs: remove always true check in unlock_up
  Btrfs: grab write lock directly if write_lock_level is the max level
  Btrfs: move get root out of btrfs_search_slot to a helper
  Btrfs: use more straightforward extent_buffer_uptodate check
  ...

49 files changed:
fs/btrfs/btrfs_inode.h
fs/btrfs/compression.c
fs/btrfs/compression.h
fs/btrfs/ctree.c
fs/btrfs/ctree.h
fs/btrfs/delayed-inode.c
fs/btrfs/delayed-ref.c
fs/btrfs/delayed-ref.h
fs/btrfs/dev-replace.c
fs/btrfs/disk-io.c
fs/btrfs/extent-tree.c
fs/btrfs/extent_io.c
fs/btrfs/extent_io.h
fs/btrfs/extent_map.c
fs/btrfs/extent_map.h
fs/btrfs/free-space-cache.c
fs/btrfs/free-space-tree.c
fs/btrfs/free-space-tree.h
fs/btrfs/inode.c
fs/btrfs/ioctl.c
fs/btrfs/locking.c
fs/btrfs/lzo.c
fs/btrfs/ordered-data.c
fs/btrfs/print-tree.c
fs/btrfs/qgroup.c
fs/btrfs/raid56.c
fs/btrfs/relocation.c
fs/btrfs/scrub.c
fs/btrfs/send.c
fs/btrfs/super.c
fs/btrfs/sysfs.c
fs/btrfs/sysfs.h
fs/btrfs/tests/btrfs-tests.c
fs/btrfs/tests/btrfs-tests.h
fs/btrfs/tests/extent-buffer-tests.c
fs/btrfs/tests/extent-io-tests.c
fs/btrfs/tests/extent-map-tests.c
fs/btrfs/tests/free-space-tests.c
fs/btrfs/tests/free-space-tree-tests.c
fs/btrfs/tests/inode-tests.c
fs/btrfs/tests/qgroup-tests.c
fs/btrfs/transaction.c
fs/btrfs/transaction.h
fs/btrfs/tree-log.c
fs/btrfs/uuid-tree.c
fs/btrfs/volumes.c
fs/btrfs/volumes.h
include/trace/events/btrfs.h
include/uapi/linux/btrfs.h

index 234bae55b85d2b5c4421c115e2a87bbfe969ed1b..7e075343daa508b7e7c0c6f8746ff2d2ff4a5653 100644 (file)
  * ordered operations list so that we make sure to flush out any
  * new data the application may have written before commit.
  */
-#define BTRFS_INODE_ORDERED_DATA_CLOSE         0
-#define BTRFS_INODE_ORPHAN_META_RESERVED       1
-#define BTRFS_INODE_DUMMY                      2
-#define BTRFS_INODE_IN_DEFRAG                  3
-#define BTRFS_INODE_HAS_ORPHAN_ITEM            4
-#define BTRFS_INODE_HAS_ASYNC_EXTENT           5
-#define BTRFS_INODE_NEEDS_FULL_SYNC            6
-#define BTRFS_INODE_COPY_EVERYTHING            7
-#define BTRFS_INODE_IN_DELALLOC_LIST           8
-#define BTRFS_INODE_READDIO_NEED_LOCK          9
-#define BTRFS_INODE_HAS_PROPS                  10
+enum {
+       BTRFS_INODE_ORDERED_DATA_CLOSE = 0,
+       BTRFS_INODE_DUMMY,
+       BTRFS_INODE_IN_DEFRAG,
+       BTRFS_INODE_HAS_ASYNC_EXTENT,
+       BTRFS_INODE_NEEDS_FULL_SYNC,
+       BTRFS_INODE_COPY_EVERYTHING,
+       BTRFS_INODE_IN_DELALLOC_LIST,
+       BTRFS_INODE_READDIO_NEED_LOCK,
+       BTRFS_INODE_HAS_PROPS,
+};
 
 /* in memory btrfs inode */
 struct btrfs_inode {
index 1061575a7d2520ba15b250e424ad39bea9da15f6..d3e447b45bf793abd7ca555af7e9e6ad40b2da2d 100644 (file)
@@ -990,12 +990,7 @@ static void __free_workspace(int type, struct list_head *workspace,
                btrfs_compress_op[idx]->free_workspace(workspace);
        atomic_dec(total_ws);
 wake:
-       /*
-        * Make sure counter is updated before we wake up waiters.
-        */
-       smp_mb();
-       if (waitqueue_active(ws_wait))
-               wake_up(ws_wait);
+       cond_wake_up(ws_wait);
 }
 
 static void free_workspace(int type, struct list_head *ws)
index cc605f7b23fbf3bf0f0ed56ac625daa9982b149a..ddda9b80bf2044edc3ae210bb401ce8e728fdbe2 100644 (file)
@@ -6,6 +6,8 @@
 #ifndef BTRFS_COMPRESSION_H
 #define BTRFS_COMPRESSION_H
 
+#include <linux/sizes.h>
+
 /*
  * We want to make sure that amount of RAM required to uncompress an extent is
  * reasonable, so we limit the total size in ram of a compressed extent to
index 8c68961925b1482517bfe2c96635f5ee1cc79694..4bc326df472ef4c81af658cb08ea77bae20c2916 100644 (file)
@@ -2330,7 +2330,7 @@ static noinline void unlock_up(struct btrfs_path *path, int level,
                        no_skips = 1;
 
                t = path->nodes[i];
-               if (i >= lowest_unlock && i > skip_level && path->locks[i]) {
+               if (i >= lowest_unlock && i > skip_level) {
                        btrfs_tree_unlock_rw(t, path->locks[i]);
                        path->locks[i] = 0;
                        if (write_lock_level &&
@@ -2432,7 +2432,6 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
        btrfs_unlock_up_safe(p, level + 1);
        btrfs_set_path_blocking(p);
 
-       free_extent_buffer(tmp);
        if (p->reada != READA_NONE)
                reada_for_search(fs_info, p, level, slot, key->objectid);
 
@@ -2446,7 +2445,7 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
                 * and give up so that our caller doesn't loop forever
                 * on our EAGAINs.
                 */
-               if (!btrfs_buffer_uptodate(tmp, 0, 0))
+               if (!extent_buffer_uptodate(tmp))
                        ret = -EIO;
                free_extent_buffer(tmp);
        } else {
@@ -2599,6 +2598,78 @@ int btrfs_find_item(struct btrfs_root *fs_root, struct btrfs_path *path,
        return 0;
 }
 
+static struct extent_buffer *btrfs_search_slot_get_root(struct btrfs_root *root,
+                                                       struct btrfs_path *p,
+                                                       int write_lock_level)
+{
+       struct btrfs_fs_info *fs_info = root->fs_info;
+       struct extent_buffer *b;
+       int root_lock;
+       int level = 0;
+
+       /* We try very hard to do read locks on the root */
+       root_lock = BTRFS_READ_LOCK;
+
+       if (p->search_commit_root) {
+               /* The commit roots are read only so we always do read locks */
+               if (p->need_commit_sem)
+                       down_read(&fs_info->commit_root_sem);
+               b = root->commit_root;
+               extent_buffer_get(b);
+               level = btrfs_header_level(b);
+               if (p->need_commit_sem)
+                       up_read(&fs_info->commit_root_sem);
+               /*
+                * Ensure that all callers have set skip_locking when
+                * p->search_commit_root = 1.
+                */
+               ASSERT(p->skip_locking == 1);
+
+               goto out;
+       }
+
+       if (p->skip_locking) {
+               b = btrfs_root_node(root);
+               level = btrfs_header_level(b);
+               goto out;
+       }
+
+       /*
+        * If the level is set to maximum, we can skip trying to get the read
+        * lock.
+        */
+       if (write_lock_level < BTRFS_MAX_LEVEL) {
+               /*
+                * We don't know the level of the root node until we actually
+                * have it read locked
+                */
+               b = btrfs_read_lock_root_node(root);
+               level = btrfs_header_level(b);
+               if (level > write_lock_level)
+                       goto out;
+
+               /* Whoops, must trade for write lock */
+               btrfs_tree_read_unlock(b);
+               free_extent_buffer(b);
+       }
+
+       b = btrfs_lock_root_node(root);
+       root_lock = BTRFS_WRITE_LOCK;
+
+       /* The level might have changed, check again */
+       level = btrfs_header_level(b);
+
+out:
+       p->nodes[level] = b;
+       if (!p->skip_locking)
+               p->locks[level] = root_lock;
+       /*
+        * Callers are responsible for dropping b's references.
+        */
+       return b;
+}
+
+
 /*
  * btrfs_search_slot - look for a key in a tree and perform necessary
  * modifications to preserve tree invariants.
@@ -2635,7 +2706,6 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root,
        int err;
        int level;
        int lowest_unlock = 1;
-       int root_lock;
        /* everything at write_lock_level or lower must be write locked */
        int write_lock_level = 0;
        u8 lowest_level = 0;
@@ -2673,50 +2743,7 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 
 again:
        prev_cmp = -1;
-       /*
-        * we try very hard to do read locks on the root
-        */
-       root_lock = BTRFS_READ_LOCK;
-       level = 0;
-       if (p->search_commit_root) {
-               /*
-                * the commit roots are read only
-                * so we always do read locks
-                */
-               if (p->need_commit_sem)
-                       down_read(&fs_info->commit_root_sem);
-               b = root->commit_root;
-               extent_buffer_get(b);
-               level = btrfs_header_level(b);
-               if (p->need_commit_sem)
-                       up_read(&fs_info->commit_root_sem);
-               if (!p->skip_locking)
-                       btrfs_tree_read_lock(b);
-       } else {
-               if (p->skip_locking) {
-                       b = btrfs_root_node(root);
-                       level = btrfs_header_level(b);
-               } else {
-                       /* we don't know the level of the root node
-                        * until we actually have it read locked
-                        */
-                       b = btrfs_read_lock_root_node(root);
-                       level = btrfs_header_level(b);
-                       if (level <= write_lock_level) {
-                               /* whoops, must trade for write lock */
-                               btrfs_tree_read_unlock(b);
-                               free_extent_buffer(b);
-                               b = btrfs_lock_root_node(root);
-                               root_lock = BTRFS_WRITE_LOCK;
-
-                               /* the level might have changed, check again */
-                               level = btrfs_header_level(b);
-                       }
-               }
-       }
-       p->nodes[level] = b;
-       if (!p->skip_locking)
-               p->locks[level] = root_lock;
+       b = btrfs_search_slot_get_root(root, p, write_lock_level);
 
        while (b) {
                level = btrfs_header_level(b);
index 0d422c9908b8085f531a752a4bd6d5bf1b430e02..f4bf7874c24a4d63609a30a966d14f9db0a323e0 100644 (file)
@@ -739,6 +739,12 @@ struct btrfs_delayed_root;
  */
 #define BTRFS_FS_NEED_ASYNC_COMMIT             17
 
+/*
+ * Indicate that balance has been set up from the ioctl and is in the main
+ * phase. The fs_info::balance_ctl is initialized.
+ */
+#define BTRFS_FS_BALANCE_RUNNING               18
+
 struct btrfs_fs_info {
        u8 fsid[BTRFS_FSID_SIZE];
        u8 chunk_tree_uuid[BTRFS_UUID_SIZE];
@@ -838,7 +844,6 @@ struct btrfs_fs_info {
        struct mutex transaction_kthread_mutex;
        struct mutex cleaner_mutex;
        struct mutex chunk_mutex;
-       struct mutex volume_mutex;
 
        /*
         * this is taken to make sure we don't set block groups ro after
@@ -1004,7 +1009,6 @@ struct btrfs_fs_info {
        /* restriper state */
        spinlock_t balance_lock;
        struct mutex balance_mutex;
-       atomic_t balance_running;
        atomic_t balance_pause_req;
        atomic_t balance_cancel_req;
        struct btrfs_balance_control *balance_ctl;
@@ -1219,9 +1223,6 @@ struct btrfs_root {
        spinlock_t log_extents_lock[2];
        struct list_head logged_list[2];
 
-       spinlock_t orphan_lock;
-       atomic_t orphan_inodes;
-       struct btrfs_block_rsv *orphan_block_rsv;
        int orphan_cleanup_state;
 
        spinlock_t inode_lock;
@@ -2764,13 +2765,9 @@ void btrfs_delalloc_release_space(struct inode *inode,
 void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start,
                                            u64 len);
 void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans);
-int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans,
-                                 struct btrfs_inode *inode);
-void btrfs_orphan_release_metadata(struct btrfs_inode *inode);
 int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
                                     struct btrfs_block_rsv *rsv,
-                                    int nitems,
-                                    u64 *qgroup_reserved, bool use_global_rsv);
+                                    int nitems, bool use_global_rsv);
 void btrfs_subvolume_release_metadata(struct btrfs_fs_info *fs_info,
                                      struct btrfs_block_rsv *rsv);
 void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes,
@@ -2828,7 +2825,7 @@ void btrfs_wait_for_snapshot_creation(struct btrfs_root *root);
 void check_system_chunk(struct btrfs_trans_handle *trans,
                        struct btrfs_fs_info *fs_info, const u64 type);
 u64 add_new_free_space(struct btrfs_block_group_cache *block_group,
-                      struct btrfs_fs_info *info, u64 start, u64 end);
+                      u64 start, u64 end);
 
 /* ctree.c */
 int btrfs_bin_search(struct extent_buffer *eb, const struct btrfs_key *key,
@@ -3042,11 +3039,9 @@ void btrfs_update_root_times(struct btrfs_trans_handle *trans,
                             struct btrfs_root *root);
 
 /* uuid-tree.c */
-int btrfs_uuid_tree_add(struct btrfs_trans_handle *trans,
-                       struct btrfs_fs_info *fs_info, u8 *uuid, u8 type,
+int btrfs_uuid_tree_add(struct btrfs_trans_handle *trans, u8 *uuid, u8 type,
                        u64 subid);
-int btrfs_uuid_tree_rem(struct btrfs_trans_handle *trans,
-                       struct btrfs_fs_info *fs_info, u8 *uuid, u8 type,
+int btrfs_uuid_tree_remove(struct btrfs_trans_handle *trans, u8 *uuid, u8 type,
                        u64 subid);
 int btrfs_uuid_tree_iterate(struct btrfs_fs_info *fs_info,
                            int (*check_func)(struct btrfs_fs_info *, u8 *, u8,
@@ -3163,18 +3158,6 @@ void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode,
                                     struct extent_map *em);
 
 /* inode.c */
-struct btrfs_delalloc_work {
-       struct inode *inode;
-       int delay_iput;
-       struct completion completion;
-       struct list_head list;
-       struct btrfs_work work;
-};
-
-struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode,
-                                                   int delay_iput);
-void btrfs_wait_and_free_delalloc_work(struct btrfs_delalloc_work *work);
-
 struct extent_map *btrfs_get_extent_fiemap(struct btrfs_inode *inode,
                struct page *page, size_t pg_offset, u64 start,
                u64 len, int create);
@@ -3193,10 +3176,7 @@ int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
 int btrfs_add_link(struct btrfs_trans_handle *trans,
                   struct btrfs_inode *parent_inode, struct btrfs_inode *inode,
                   const char *name, int name_len, int add_backref, u64 index);
-int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
-                       struct btrfs_root *root,
-                       struct inode *dir, u64 objectid,
-                       const char *name, int name_len);
+int btrfs_delete_subvolume(struct inode *dir, struct dentry *dentry);
 int btrfs_truncate_block(struct inode *inode, loff_t from, loff_t len,
                        int front);
 int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
@@ -3204,9 +3184,8 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
                               struct inode *inode, u64 new_size,
                               u32 min_type);
 
-int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput);
-int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int delay_iput,
-                              int nr);
+int btrfs_start_delalloc_inodes(struct btrfs_root *root);
+int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int nr);
 int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,
                              unsigned int extra_bits,
                              struct extent_state **cached_state, int dedupe);
@@ -3240,10 +3219,7 @@ int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans,
 int btrfs_orphan_add(struct btrfs_trans_handle *trans,
                struct btrfs_inode *inode);
 int btrfs_orphan_cleanup(struct btrfs_root *root);
-void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans,
-                             struct btrfs_root *root);
 int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size);
-void btrfs_invalidate_inodes(struct btrfs_root *root);
 void btrfs_add_delayed_iput(struct inode *inode);
 void btrfs_run_delayed_iputs(struct btrfs_fs_info *fs_info);
 int btrfs_prealloc_file_range(struct inode *inode, int mode,
@@ -3262,14 +3238,14 @@ void btrfs_test_inode_set_ops(struct inode *inode);
 long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
 long btrfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
 int btrfs_ioctl_get_supported_features(void __user *arg);
-void btrfs_update_iflags(struct inode *inode);
+void btrfs_sync_inode_flags_to_i_flags(struct inode *inode);
 int btrfs_is_empty_uuid(u8 *uuid);
 int btrfs_defrag_file(struct inode *inode, struct file *file,
                      struct btrfs_ioctl_defrag_range_args *range,
                      u64 newer_than, unsigned long max_pages);
 void btrfs_get_block_group_info(struct list_head *groups_list,
                                struct btrfs_ioctl_space_info *space);
-void update_ioctl_balance_args(struct btrfs_fs_info *fs_info, int lock,
+void btrfs_update_ioctl_balance_args(struct btrfs_fs_info *fs_info,
                               struct btrfs_ioctl_balance_args *bargs);
 ssize_t btrfs_dedupe_file_range(struct file *src_file, u64 loff, u64 olen,
                           struct file *dst_file, u64 dst_loff);
@@ -3767,4 +3743,26 @@ static inline int btrfs_is_testing(struct btrfs_fs_info *fs_info)
        return 0;
 }
 
+static inline void cond_wake_up(struct wait_queue_head *wq)
+{
+       /*
+        * This implies a full smp_mb barrier, see comments for
+        * waitqueue_active why.
+        */
+       if (wq_has_sleeper(wq))
+               wake_up(wq);
+}
+
+static inline void cond_wake_up_nomb(struct wait_queue_head *wq)
+{
+       /*
+        * Special case for conditional wakeup where the barrier required for
+        * waitqueue_active is implied by some of the preceding code. Eg. one
+        * of such atomic operations (atomic_dec_and_return, ...), or a
+        * unlock/lock sequence, etc.
+        */
+       if (waitqueue_active(wq))
+               wake_up(wq);
+}
+
 #endif
index a8d492dbd3e7c100715011a9ddad8a22d82932dd..fe6caa7e698bedf4d86c4fe427a3a49a227dcbc6 100644 (file)
@@ -460,13 +460,10 @@ static void finish_one_item(struct btrfs_delayed_root *delayed_root)
 {
        int seq = atomic_inc_return(&delayed_root->items_seq);
 
-       /*
-        * atomic_dec_return implies a barrier for waitqueue_active
-        */
+       /* atomic_dec_return implies a barrier */
        if ((atomic_dec_return(&delayed_root->items) <
-           BTRFS_DELAYED_BACKGROUND || seq % BTRFS_DELAYED_BATCH == 0) &&
-           waitqueue_active(&delayed_root->wait))
-               wake_up(&delayed_root->wait);
+           BTRFS_DELAYED_BACKGROUND || seq % BTRFS_DELAYED_BATCH == 0))
+               cond_wake_up_nomb(&delayed_root->wait);
 }
 
 static void __btrfs_remove_delayed_item(struct btrfs_delayed_item *delayed_item)
index e1b0651686f7c4e988766d927c669306560c468d..03dec673d12abc4b5d753f1cc8ae585f4e6978f6 100644 (file)
@@ -286,10 +286,10 @@ static bool merge_ref(struct btrfs_trans_handle *trans,
 }
 
 void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans,
-                             struct btrfs_fs_info *fs_info,
                              struct btrfs_delayed_ref_root *delayed_refs,
                              struct btrfs_delayed_ref_head *head)
 {
+       struct btrfs_fs_info *fs_info = trans->fs_info;
        struct btrfs_delayed_ref_node *ref;
        struct rb_node *node;
        u64 seq = 0;
@@ -323,9 +323,7 @@ again:
        }
 }
 
-int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info,
-                           struct btrfs_delayed_ref_root *delayed_refs,
-                           u64 seq)
+int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info, u64 seq)
 {
        struct seq_list *elem;
        int ret = 0;
@@ -336,10 +334,9 @@ int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info,
                                        struct seq_list, list);
                if (seq >= elem->seq) {
                        btrfs_debug(fs_info,
-                               "holding back delayed_ref %#x.%x, lowest is %#x.%x (%p)",
+                               "holding back delayed_ref %#x.%x, lowest is %#x.%x",
                                (u32)(seq >> 32), (u32)seq,
-                               (u32)(elem->seq >> 32), (u32)elem->seq,
-                               delayed_refs);
+                               (u32)(elem->seq >> 32), (u32)elem->seq);
                        ret = 1;
                }
        }
@@ -529,33 +526,20 @@ update_existing_head_ref(struct btrfs_delayed_ref_root *delayed_refs,
        spin_unlock(&existing->lock);
 }
 
-/*
- * helper function to actually insert a head node into the rbtree.
- * this does all the dirty work in terms of maintaining the correct
- * overall modification count.
- */
-static noinline struct btrfs_delayed_ref_head *
-add_delayed_ref_head(struct btrfs_fs_info *fs_info,
-                    struct btrfs_trans_handle *trans,
-                    struct btrfs_delayed_ref_head *head_ref,
-                    struct btrfs_qgroup_extent_record *qrecord,
-                    u64 bytenr, u64 num_bytes, u64 ref_root, u64 reserved,
-                    int action, int is_data, int is_system,
-                    int *qrecord_inserted_ret,
-                    int *old_ref_mod, int *new_ref_mod)
-
+static void init_delayed_ref_head(struct btrfs_delayed_ref_head *head_ref,
+                                 struct btrfs_qgroup_extent_record *qrecord,
+                                 u64 bytenr, u64 num_bytes, u64 ref_root,
+                                 u64 reserved, int action, bool is_data,
+                                 bool is_system)
 {
-       struct btrfs_delayed_ref_head *existing;
-       struct btrfs_delayed_ref_root *delayed_refs;
        int count_mod = 1;
        int must_insert_reserved = 0;
-       int qrecord_inserted = 0;
 
        /* If reserved is provided, it must be a data extent. */
        BUG_ON(!is_data && reserved);
 
        /*
-        * the head node stores the sum of all the mods, so dropping a ref
+        * The head node stores the sum of all the mods, so dropping a ref
         * should drop the sum in the head node by one.
         */
        if (action == BTRFS_UPDATE_DELAYED_HEAD)
@@ -564,12 +548,11 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info,
                count_mod = -1;
 
        /*
-        * BTRFS_ADD_DELAYED_EXTENT means that we need to update
-        * the reserved accounting when the extent is finally added, or
-        * if a later modification deletes the delayed ref without ever
-        * inserting the extent into the extent allocation tree.
-        * ref->must_insert_reserved is the flag used to record
-        * that accounting mods are required.
+        * BTRFS_ADD_DELAYED_EXTENT means that we need to update the reserved
+        * accounting when the extent is finally added, or if a later
+        * modification deletes the delayed ref without ever inserting the
+        * extent into the extent allocation tree.  ref->must_insert_reserved
+        * is the flag used to record that accounting mods are required.
         *
         * Once we record must_insert_reserved, switch the action to
         * BTRFS_ADD_DELAYED_REF because other special casing is not required.
@@ -579,8 +562,6 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info,
        else
                must_insert_reserved = 0;
 
-       delayed_refs = &trans->transaction->delayed_refs;
-
        refcount_set(&head_ref->refs, 1);
        head_ref->bytenr = bytenr;
        head_ref->num_bytes = num_bytes;
@@ -598,7 +579,6 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info,
        spin_lock_init(&head_ref->lock);
        mutex_init(&head_ref->mutex);
 
-       /* Record qgroup extent info if provided */
        if (qrecord) {
                if (ref_root && reserved) {
                        head_ref->qgroup_ref_root = ref_root;
@@ -608,20 +588,44 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info,
                qrecord->bytenr = bytenr;
                qrecord->num_bytes = num_bytes;
                qrecord->old_roots = NULL;
+       }
+}
+
+/*
+ * helper function to actually insert a head node into the rbtree.
+ * this does all the dirty work in terms of maintaining the correct
+ * overall modification count.
+ */
+static noinline struct btrfs_delayed_ref_head *
+add_delayed_ref_head(struct btrfs_trans_handle *trans,
+                    struct btrfs_delayed_ref_head *head_ref,
+                    struct btrfs_qgroup_extent_record *qrecord,
+                    int action, int *qrecord_inserted_ret,
+                    int *old_ref_mod, int *new_ref_mod)
+{
+       struct btrfs_delayed_ref_head *existing;
+       struct btrfs_delayed_ref_root *delayed_refs;
+       int qrecord_inserted = 0;
 
-               if(btrfs_qgroup_trace_extent_nolock(fs_info,
+       delayed_refs = &trans->transaction->delayed_refs;
+
+       /* Record qgroup extent info if provided */
+       if (qrecord) {
+               if (btrfs_qgroup_trace_extent_nolock(trans->fs_info,
                                        delayed_refs, qrecord))
                        kfree(qrecord);
                else
                        qrecord_inserted = 1;
        }
 
-       trace_add_delayed_ref_head(fs_info, head_ref, action);
+       trace_add_delayed_ref_head(trans->fs_info, head_ref, action);
 
        existing = htree_insert(&delayed_refs->href_root,
                                &head_ref->href_node);
        if (existing) {
-               WARN_ON(ref_root && reserved && existing->qgroup_ref_root
+               WARN_ON(qrecord && head_ref->qgroup_ref_root
+                       && head_ref->qgroup_reserved
+                       && existing->qgroup_ref_root
                        && existing->qgroup_reserved);
                update_existing_head_ref(delayed_refs, existing, head_ref,
                                         old_ref_mod);
@@ -634,8 +638,8 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info,
        } else {
                if (old_ref_mod)
                        *old_ref_mod = 0;
-               if (is_data && count_mod < 0)
-                       delayed_refs->pending_csums += num_bytes;
+               if (head_ref->is_data && head_ref->ref_mod < 0)
+                       delayed_refs->pending_csums += head_ref->num_bytes;
                delayed_refs->num_heads++;
                delayed_refs->num_heads_ready++;
                atomic_inc(&delayed_refs->num_entries);
@@ -645,90 +649,48 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info,
                *qrecord_inserted_ret = qrecord_inserted;
        if (new_ref_mod)
                *new_ref_mod = head_ref->total_ref_mod;
-       return head_ref;
-}
-
-/*
- * helper to insert a delayed tree ref into the rbtree.
- */
-static noinline void
-add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
-                    struct btrfs_trans_handle *trans,
-                    struct btrfs_delayed_ref_head *head_ref,
-                    struct btrfs_delayed_ref_node *ref, u64 bytenr,
-                    u64 num_bytes, u64 parent, u64 ref_root, int level,
-                    int action)
-{
-       struct btrfs_delayed_tree_ref *full_ref;
-       struct btrfs_delayed_ref_root *delayed_refs;
-       u64 seq = 0;
-       int ret;
-
-       if (action == BTRFS_ADD_DELAYED_EXTENT)
-               action = BTRFS_ADD_DELAYED_REF;
 
-       if (is_fstree(ref_root))
-               seq = atomic64_read(&fs_info->tree_mod_seq);
-       delayed_refs = &trans->transaction->delayed_refs;
-
-       /* first set the basic ref node struct up */
-       refcount_set(&ref->refs, 1);
-       ref->bytenr = bytenr;
-       ref->num_bytes = num_bytes;
-       ref->ref_mod = 1;
-       ref->action = action;
-       ref->is_head = 0;
-       ref->in_tree = 1;
-       ref->seq = seq;
-       RB_CLEAR_NODE(&ref->ref_node);
-       INIT_LIST_HEAD(&ref->add_list);
-
-       full_ref = btrfs_delayed_node_to_tree_ref(ref);
-       full_ref->parent = parent;
-       full_ref->root = ref_root;
-       if (parent)
-               ref->type = BTRFS_SHARED_BLOCK_REF_KEY;
-       else
-               ref->type = BTRFS_TREE_BLOCK_REF_KEY;
-       full_ref->level = level;
-
-       trace_add_delayed_tree_ref(fs_info, ref, full_ref, action);
-
-       ret = insert_delayed_ref(trans, delayed_refs, head_ref, ref);
-
-       /*
-        * XXX: memory should be freed at the same level allocated.
-        * But bad practice is anywhere... Follow it now. Need cleanup.
-        */
-       if (ret > 0)
-               kmem_cache_free(btrfs_delayed_tree_ref_cachep, full_ref);
+       return head_ref;
 }
 
 /*
- * helper to insert a delayed data ref into the rbtree.
+ * init_delayed_ref_common - Initialize the structure which represents a
+ *                          modification to an extent.
+ *
+ * @fs_info:    Internal structure of the mounted filesystem.
+ *
+ * @ref:       The structure which is going to be initialized.
+ *
+ * @bytenr:    The logical address of the extent for which a modification is
+ *             going to be recorded.
+ *
+ * @num_bytes:  Size of the extent whose modification is being recorded.
+ *
+ * @ref_root:  The id of the root where this modification has originated, this
+ *             can be either one of the well-known metadata trees or the
+ *             subvolume id which references this extent.
+ *
+ * @action:    Can be one of BTRFS_ADD_DELAYED_REF/BTRFS_DROP_DELAYED_REF or
+ *             BTRFS_ADD_DELAYED_EXTENT
+ *
+ * @ref_type:  Holds the type of the extent which is being recorded, can be
+ *             one of BTRFS_SHARED_BLOCK_REF_KEY/BTRFS_TREE_BLOCK_REF_KEY
+ *             when recording a metadata extent or BTRFS_SHARED_DATA_REF_KEY/
+ *             BTRFS_EXTENT_DATA_REF_KEY when recording data extent
  */
-static noinline void
-add_delayed_data_ref(struct btrfs_fs_info *fs_info,
-                    struct btrfs_trans_handle *trans,
-                    struct btrfs_delayed_ref_head *head_ref,
-                    struct btrfs_delayed_ref_node *ref, u64 bytenr,
-                    u64 num_bytes, u64 parent, u64 ref_root, u64 owner,
-                    u64 offset, int action)
+static void init_delayed_ref_common(struct btrfs_fs_info *fs_info,
+                                   struct btrfs_delayed_ref_node *ref,
+                                   u64 bytenr, u64 num_bytes, u64 ref_root,
+                                   int action, u8 ref_type)
 {
-       struct btrfs_delayed_data_ref *full_ref;
-       struct btrfs_delayed_ref_root *delayed_refs;
        u64 seq = 0;
-       int ret;
 
        if (action == BTRFS_ADD_DELAYED_EXTENT)
                action = BTRFS_ADD_DELAYED_REF;
 
-       delayed_refs = &trans->transaction->delayed_refs;
-
        if (is_fstree(ref_root))
                seq = atomic64_read(&fs_info->tree_mod_seq);
 
-       /* first set the basic ref node struct up */
        refcount_set(&ref->refs, 1);
        ref->bytenr = bytenr;
        ref->num_bytes = num_bytes;
@@ -737,25 +699,9 @@ add_delayed_data_ref(struct btrfs_fs_info *fs_info,
        ref->is_head = 0;
        ref->in_tree = 1;
        ref->seq = seq;
+       ref->type = ref_type;
        RB_CLEAR_NODE(&ref->ref_node);
        INIT_LIST_HEAD(&ref->add_list);
-
-       full_ref = btrfs_delayed_node_to_data_ref(ref);
-       full_ref->parent = parent;
-       full_ref->root = ref_root;
-       if (parent)
-               ref->type = BTRFS_SHARED_DATA_REF_KEY;
-       else
-               ref->type = BTRFS_EXTENT_DATA_REF_KEY;
-
-       full_ref->objectid = owner;
-       full_ref->offset = offset;
-
-       trace_add_delayed_data_ref(fs_info, ref, full_ref, action);
-
-       ret = insert_delayed_ref(trans, delayed_refs, head_ref, ref);
-       if (ret > 0)
-               kmem_cache_free(btrfs_delayed_data_ref_cachep, full_ref);
 }
 
 /*
@@ -775,13 +721,25 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
        struct btrfs_delayed_ref_root *delayed_refs;
        struct btrfs_qgroup_extent_record *record = NULL;
        int qrecord_inserted;
-       int is_system = (ref_root == BTRFS_CHUNK_TREE_OBJECTID);
+       bool is_system = (ref_root == BTRFS_CHUNK_TREE_OBJECTID);
+       int ret;
+       u8 ref_type;
 
        BUG_ON(extent_op && extent_op->is_data);
        ref = kmem_cache_alloc(btrfs_delayed_tree_ref_cachep, GFP_NOFS);
        if (!ref)
                return -ENOMEM;
 
+       if (parent)
+               ref_type = BTRFS_SHARED_BLOCK_REF_KEY;
+       else
+               ref_type = BTRFS_TREE_BLOCK_REF_KEY;
+       init_delayed_ref_common(fs_info, &ref->node, bytenr, num_bytes,
+                               ref_root, action, ref_type);
+       ref->root = ref_root;
+       ref->parent = parent;
+       ref->level = level;
+
        head_ref = kmem_cache_alloc(btrfs_delayed_ref_head_cachep, GFP_NOFS);
        if (!head_ref)
                goto free_ref;
@@ -793,6 +751,8 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
                        goto free_head_ref;
        }
 
+       init_delayed_ref_head(head_ref, record, bytenr, num_bytes,
+                             ref_root, 0, action, false, is_system);
        head_ref->extent_op = extent_op;
 
        delayed_refs = &trans->transaction->delayed_refs;
@@ -802,15 +762,19 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
         * insert both the head node and the new ref without dropping
         * the spin lock
         */
-       head_ref = add_delayed_ref_head(fs_info, trans, head_ref, record,
-                                       bytenr, num_bytes, 0, 0, action, 0,
-                                       is_system, &qrecord_inserted,
+       head_ref = add_delayed_ref_head(trans, head_ref, record,
+                                       action, &qrecord_inserted,
                                        old_ref_mod, new_ref_mod);
 
-       add_delayed_tree_ref(fs_info, trans, head_ref, &ref->node, bytenr,
-                            num_bytes, parent, ref_root, level, action);
+       ret = insert_delayed_ref(trans, delayed_refs, head_ref, &ref->node);
        spin_unlock(&delayed_refs->lock);
 
+       trace_add_delayed_tree_ref(fs_info, &ref->node, ref,
+                                  action == BTRFS_ADD_DELAYED_EXTENT ?
+                                  BTRFS_ADD_DELAYED_REF : action);
+       if (ret > 0)
+               kmem_cache_free(btrfs_delayed_tree_ref_cachep, ref);
+
        if (qrecord_inserted)
                btrfs_qgroup_trace_extent_post(fs_info, record);
 
@@ -839,11 +803,25 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info,
        struct btrfs_delayed_ref_root *delayed_refs;
        struct btrfs_qgroup_extent_record *record = NULL;
        int qrecord_inserted;
+       int ret;
+       u8 ref_type;
 
        ref = kmem_cache_alloc(btrfs_delayed_data_ref_cachep, GFP_NOFS);
        if (!ref)
                return -ENOMEM;
 
+       if (parent)
+               ref_type = BTRFS_SHARED_DATA_REF_KEY;
+       else
+               ref_type = BTRFS_EXTENT_DATA_REF_KEY;
+       init_delayed_ref_common(fs_info, &ref->node, bytenr, num_bytes,
+                               ref_root, action, ref_type);
+       ref->root = ref_root;
+       ref->parent = parent;
+       ref->objectid = owner;
+       ref->offset = offset;
+
+
        head_ref = kmem_cache_alloc(btrfs_delayed_ref_head_cachep, GFP_NOFS);
        if (!head_ref) {
                kmem_cache_free(btrfs_delayed_data_ref_cachep, ref);
@@ -861,6 +839,8 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info,
                }
        }
 
+       init_delayed_ref_head(head_ref, record, bytenr, num_bytes, ref_root,
+                             reserved, action, true, false);
        head_ref->extent_op = NULL;
 
        delayed_refs = &trans->transaction->delayed_refs;
@@ -870,16 +850,20 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info,
         * insert both the head node and the new ref without dropping
         * the spin lock
         */
-       head_ref = add_delayed_ref_head(fs_info, trans, head_ref, record,
-                                       bytenr, num_bytes, ref_root, reserved,
-                                       action, 1, 0, &qrecord_inserted,
+       head_ref = add_delayed_ref_head(trans, head_ref, record,
+                                       action, &qrecord_inserted,
                                        old_ref_mod, new_ref_mod);
 
-       add_delayed_data_ref(fs_info, trans, head_ref, &ref->node, bytenr,
-                                  num_bytes, parent, ref_root, owner, offset,
-                                  action);
+       ret = insert_delayed_ref(trans, delayed_refs, head_ref, &ref->node);
        spin_unlock(&delayed_refs->lock);
 
+       trace_add_delayed_data_ref(trans->fs_info, &ref->node, ref,
+                                  action == BTRFS_ADD_DELAYED_EXTENT ?
+                                  BTRFS_ADD_DELAYED_REF : action);
+       if (ret > 0)
+               kmem_cache_free(btrfs_delayed_data_ref_cachep, ref);
+
+
        if (qrecord_inserted)
                return btrfs_qgroup_trace_extent_post(fs_info, record);
        return 0;
@@ -897,19 +881,16 @@ int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info,
        if (!head_ref)
                return -ENOMEM;
 
+       init_delayed_ref_head(head_ref, NULL, bytenr, num_bytes, 0, 0,
+                             BTRFS_UPDATE_DELAYED_HEAD, extent_op->is_data,
+                             false);
        head_ref->extent_op = extent_op;
 
        delayed_refs = &trans->transaction->delayed_refs;
        spin_lock(&delayed_refs->lock);
 
-       /*
-        * extent_ops just modify the flags of an extent and they don't result
-        * in ref count changes, hence it's safe to pass false/0 for is_system
-        * argument
-        */
-       add_delayed_ref_head(fs_info, trans, head_ref, NULL, bytenr,
-                            num_bytes, 0, 0, BTRFS_UPDATE_DELAYED_HEAD,
-                            extent_op->is_data, 0, NULL, NULL, NULL);
+       add_delayed_ref_head(trans, head_ref, NULL, BTRFS_UPDATE_DELAYED_HEAD,
+                            NULL, NULL, NULL);
 
        spin_unlock(&delayed_refs->lock);
        return 0;
index 7f00db50bd242d0a29f35c430339950b90588697..ea1aecb6a50dd977c471d1cf58611406b347954c 100644 (file)
@@ -251,7 +251,6 @@ int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info,
                                u64 bytenr, u64 num_bytes,
                                struct btrfs_delayed_extent_op *extent_op);
 void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans,
-                             struct btrfs_fs_info *fs_info,
                              struct btrfs_delayed_ref_root *delayed_refs,
                              struct btrfs_delayed_ref_head *head);
 
@@ -269,9 +268,7 @@ static inline void btrfs_delayed_ref_unlock(struct btrfs_delayed_ref_head *head)
 struct btrfs_delayed_ref_head *
 btrfs_select_ref_head(struct btrfs_trans_handle *trans);
 
-int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info,
-                           struct btrfs_delayed_ref_root *delayed_refs,
-                           u64 seq);
+int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info, u64 seq);
 
 /*
  * helper functions to cast a node into its container
index f82be266ba4ba6dfb8fe3eb4ec2160beb080d77a..e2ba0419297a51e4ff53fc7522eb2bd780b56840 100644 (file)
@@ -33,8 +33,6 @@ static void btrfs_dev_replace_update_device_in_mapping_tree(
                                                struct btrfs_device *srcdev,
                                                struct btrfs_device *tgtdev);
 static int btrfs_dev_replace_kthread(void *data);
-static int btrfs_dev_replace_continue_on_mount(struct btrfs_fs_info *fs_info);
-
 
 int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info)
 {
@@ -178,6 +176,105 @@ out:
        return ret;
 }
 
+/*
+ * Initialize a new device for device replace target from a given source dev
+ * and path: open @device_path, check it against the fs device list and the
+ * size of @srcdev, then add the new device to fs_info->fs_devices.
+ * Return 0 and new device in @device_out, otherwise return < 0
+ */
+static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
+                                 const char *device_path,
+                                 struct btrfs_device *srcdev,
+                                 struct btrfs_device **device_out)
+{
+       struct btrfs_device *device;
+       struct block_device *bdev;
+       struct list_head *devices;
+       struct rcu_string *name;
+       u64 devid = BTRFS_DEV_REPLACE_DEVID; /* devid handed to btrfs_alloc_device() */
+       int ret = 0;
+
+       *device_out = NULL; /* stays NULL on every error return */
+       if (fs_info->fs_devices->seeding) {
+               btrfs_err(fs_info, "the filesystem is a seed filesystem!");
+               return -EINVAL;
+       }
+
+       bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL,
+                                 fs_info->bdev_holder);
+       if (IS_ERR(bdev)) {
+               btrfs_err(fs_info, "target device %s is invalid!", device_path);
+               return PTR_ERR(bdev);
+       }
+
+       filemap_write_and_wait(bdev->bd_inode->i_mapping);
+
+       devices = &fs_info->fs_devices->devices;
+       list_for_each_entry(device, devices, dev_list) {
+               if (device->bdev == bdev) {
+                       btrfs_err(fs_info,
+                                 "target device is in the filesystem!");
+                       ret = -EEXIST;
+                       goto error;
+               }
+       }
+
+
+       if (i_size_read(bdev->bd_inode) <
+           btrfs_device_get_total_bytes(srcdev)) {
+               btrfs_err(fs_info,
+                         "target device is smaller than source device!");
+               ret = -EINVAL;
+               goto error;
+       }
+
+
+       device = btrfs_alloc_device(NULL, &devid, NULL);
+       if (IS_ERR(device)) {
+               ret = PTR_ERR(device);
+               goto error;
+       }
+
+       name = rcu_string_strdup(device_path, GFP_KERNEL);
+       if (!name) {
+               btrfs_free_device(device);
+               ret = -ENOMEM;
+               goto error;
+       }
+       rcu_assign_pointer(device->name, name);
+
+       mutex_lock(&fs_info->fs_devices->device_list_mutex); /* setup, list insertion and counters happen under this mutex */
+       set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
+       device->generation = 0;
+       device->io_width = fs_info->sectorsize;
+       device->io_align = fs_info->sectorsize;
+       device->sector_size = fs_info->sectorsize;
+       device->total_bytes = btrfs_device_get_total_bytes(srcdev);
+       device->disk_total_bytes = btrfs_device_get_disk_total_bytes(srcdev);
+       device->bytes_used = btrfs_device_get_bytes_used(srcdev);
+       device->commit_total_bytes = srcdev->commit_total_bytes;
+       device->commit_bytes_used = device->bytes_used;
+       device->fs_info = fs_info;
+       device->bdev = bdev;
+       set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
+       set_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);
+       device->mode = FMODE_EXCL;
+       device->dev_stats_valid = 1;
+       set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE);
+       device->fs_devices = fs_info->fs_devices;
+       list_add(&device->dev_list, &fs_info->fs_devices->devices);
+       fs_info->fs_devices->num_devices++;
+       fs_info->fs_devices->open_devices++;
+       mutex_unlock(&fs_info->fs_devices->device_list_mutex);
+
+       *device_out = device;
+       return 0;
+
+error: /* only reached after blkdev_get_by_path() succeeded */
+       blkdev_put(bdev, FMODE_EXCL);
+       return ret;
+}
+
 /*
  * called from commit_transaction. Writes changed device replace state to
  * disk.
@@ -317,18 +414,13 @@ int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info,
        struct btrfs_device *tgt_device = NULL;
        struct btrfs_device *src_device = NULL;
 
-       /* the disk copy procedure reuses the scrub code */
-       mutex_lock(&fs_info->volume_mutex);
        ret = btrfs_find_device_by_devspec(fs_info, srcdevid,
                                            srcdev_name, &src_device);
-       if (ret) {
-               mutex_unlock(&fs_info->volume_mutex);
+       if (ret)
                return ret;
-       }
 
        ret = btrfs_init_dev_replace_tgtdev(fs_info, tgtdev_name,
                                            src_device, &tgt_device);
-       mutex_unlock(&fs_info->volume_mutex);
        if (ret)
                return ret;
 
@@ -360,7 +452,6 @@ int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info,
        dev_replace->cont_reading_from_srcdev_mode = read_src;
        WARN_ON(!src_device);
        dev_replace->srcdev = src_device;
-       WARN_ON(!tgt_device);
        dev_replace->tgtdev = tgt_device;
 
        btrfs_info_in_rcu(fs_info,
@@ -503,7 +594,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
         * flush all outstanding I/O and inode extent mappings before the
         * copy operation is declared as being finished
         */
-       ret = btrfs_start_delalloc_roots(fs_info, 0, -1);
+       ret = btrfs_start_delalloc_roots(fs_info, -1);
        if (ret) {
                mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
                return ret;
@@ -518,7 +609,6 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
        ret = btrfs_commit_transaction(trans);
        WARN_ON(ret);
 
-       mutex_lock(&uuid_mutex);
        /* keep away write_all_supers() during the finishing procedure */
        mutex_lock(&fs_info->fs_devices->device_list_mutex);
        mutex_lock(&fs_info->chunk_mutex);
@@ -545,7 +635,6 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
                btrfs_dev_replace_write_unlock(dev_replace);
                mutex_unlock(&fs_info->chunk_mutex);
                mutex_unlock(&fs_info->fs_devices->device_list_mutex);
-               mutex_unlock(&uuid_mutex);
                btrfs_rm_dev_replace_blocked(fs_info);
                if (tgt_device)
                        btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device);
@@ -596,7 +685,6 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
         */
        mutex_unlock(&fs_info->chunk_mutex);
        mutex_unlock(&fs_info->fs_devices->device_list_mutex);
-       mutex_unlock(&uuid_mutex);
 
        /* replace the sysfs entry */
        btrfs_sysfs_rm_device_link(fs_info->fs_devices, src_device);
@@ -800,7 +888,17 @@ int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info)
        }
        btrfs_dev_replace_write_unlock(dev_replace);
 
-       WARN_ON(test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags));
+       /*
+        * This could collide with a paused balance, but the exclusive op logic
+        * should never allow both to start and pause. We don't want to allow
+        * dev-replace to start anyway.
+        */
+       if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) {
+               btrfs_info(fs_info,
+               "cannot resume dev-replace, other exclusive operation running");
+               return 0;
+       }
+
        task = kthread_run(btrfs_dev_replace_kthread, fs_info, "btrfs-devrepl");
        return PTR_ERR_OR_ZERO(task);
 }
@@ -810,6 +908,7 @@ static int btrfs_dev_replace_kthread(void *data)
        struct btrfs_fs_info *fs_info = data;
        struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
        u64 progress;
+       int ret;
 
        progress = btrfs_dev_replace_progress(fs_info);
        progress = div_u64(progress, 10);
@@ -820,23 +919,14 @@ static int btrfs_dev_replace_kthread(void *data)
                btrfs_dev_name(dev_replace->tgtdev),
                (unsigned int)progress);
 
-       btrfs_dev_replace_continue_on_mount(fs_info);
-       clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
-
-       return 0;
-}
-
-static int btrfs_dev_replace_continue_on_mount(struct btrfs_fs_info *fs_info)
-{
-       struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
-       int ret;
-
        ret = btrfs_scrub_dev(fs_info, dev_replace->srcdev->devid,
                              dev_replace->committed_cursor_left,
                              btrfs_device_get_total_bytes(dev_replace->srcdev),
                              &dev_replace->scrub_progress, 0, 1);
        ret = btrfs_dev_replace_finishing(fs_info, ret);
        WARN_ON(ret);
+
+       clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
        return 0;
 }
 
@@ -916,9 +1006,9 @@ void btrfs_dev_replace_clear_lock_blocking(
        ASSERT(atomic_read(&dev_replace->read_locks) > 0);
        ASSERT(atomic_read(&dev_replace->blocking_readers) > 0);
        read_lock(&dev_replace->lock);
-       if (atomic_dec_and_test(&dev_replace->blocking_readers) &&
-           waitqueue_active(&dev_replace->read_lock_wq))
-               wake_up(&dev_replace->read_lock_wq);
+       /* Barrier implied by atomic_dec_and_test */
+       if (atomic_dec_and_test(&dev_replace->blocking_readers))
+               cond_wake_up_nomb(&dev_replace->read_lock_wq);
 }
 
 void btrfs_bio_counter_inc_noblocked(struct btrfs_fs_info *fs_info)
@@ -929,9 +1019,7 @@ void btrfs_bio_counter_inc_noblocked(struct btrfs_fs_info *fs_info)
 void btrfs_bio_counter_sub(struct btrfs_fs_info *fs_info, s64 amount)
 {
        percpu_counter_sub(&fs_info->bio_counter, amount);
-
-       if (waitqueue_active(&fs_info->replace_wait))
-               wake_up(&fs_info->replace_wait);
+       cond_wake_up_nomb(&fs_info->replace_wait);
 }
 
 void btrfs_bio_counter_inc_blocked(struct btrfs_fs_info *fs_info)
index c3504b4d281b5cd76bb0861b781e228bcec4b3bb..205092dc939082d48cabc43d7d4a4340a1feffb4 100644 (file)
@@ -55,7 +55,6 @@
 static const struct extent_io_ops btree_extent_io_ops;
 static void end_workqueue_fn(struct btrfs_work *work);
 static void free_fs_root(struct btrfs_root *root);
-static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info);
 static void btrfs_destroy_ordered_extents(struct btrfs_root *root);
 static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
                                      struct btrfs_fs_info *fs_info);
@@ -416,7 +415,7 @@ static int btrfs_check_super_csum(struct btrfs_fs_info *fs_info,
 
 static int verify_level_key(struct btrfs_fs_info *fs_info,
                            struct extent_buffer *eb, int level,
-                           struct btrfs_key *first_key)
+                           struct btrfs_key *first_key, u64 parent_transid)
 {
        int found_level;
        struct btrfs_key found_key;
@@ -454,10 +453,11 @@ static int verify_level_key(struct btrfs_fs_info *fs_info,
        if (ret) {
                WARN_ON(1);
                btrfs_err(fs_info,
-"tree first key mismatch detected, bytenr=%llu key expected=(%llu, %u, %llu) has=(%llu, %u, %llu)",
-                         eb->start, first_key->objectid, first_key->type,
-                         first_key->offset, found_key.objectid,
-                         found_key.type, found_key.offset);
+"tree first key mismatch detected, bytenr=%llu parent_transid=%llu key expected=(%llu,%u,%llu) has=(%llu,%u,%llu)",
+                         eb->start, parent_transid, first_key->objectid,
+                         first_key->type, first_key->offset,
+                         found_key.objectid, found_key.type,
+                         found_key.offset);
        }
 #endif
        return ret;
@@ -493,7 +493,7 @@ static int btree_read_extent_buffer_pages(struct btrfs_fs_info *fs_info,
                                                   parent_transid, 0))
                                ret = -EIO;
                        else if (verify_level_key(fs_info, eb, level,
-                                                 first_key))
+                                                 first_key, parent_transid))
                                ret = -EUCLEAN;
                        else
                                break;
@@ -1185,7 +1185,6 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info,
        root->inode_tree = RB_ROOT;
        INIT_RADIX_TREE(&root->delayed_nodes_tree, GFP_ATOMIC);
        root->block_rsv = NULL;
-       root->orphan_block_rsv = NULL;
 
        INIT_LIST_HEAD(&root->dirty_list);
        INIT_LIST_HEAD(&root->root_list);
@@ -1195,7 +1194,6 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info,
        INIT_LIST_HEAD(&root->ordered_root);
        INIT_LIST_HEAD(&root->logged_list[0]);
        INIT_LIST_HEAD(&root->logged_list[1]);
-       spin_lock_init(&root->orphan_lock);
        spin_lock_init(&root->inode_lock);
        spin_lock_init(&root->delalloc_lock);
        spin_lock_init(&root->ordered_extent_lock);
@@ -1216,7 +1214,6 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info,
        atomic_set(&root->log_commit[1], 0);
        atomic_set(&root->log_writers, 0);
        atomic_set(&root->log_batch, 0);
-       atomic_set(&root->orphan_inodes, 0);
        refcount_set(&root->refs, 1);
        atomic_set(&root->will_be_snapshotted, 0);
        root->log_transid = 0;
@@ -2164,7 +2161,6 @@ static void btrfs_init_balance(struct btrfs_fs_info *fs_info)
 {
        spin_lock_init(&fs_info->balance_lock);
        mutex_init(&fs_info->balance_mutex);
-       atomic_set(&fs_info->balance_running, 0);
        atomic_set(&fs_info->balance_pause_req, 0);
        atomic_set(&fs_info->balance_cancel_req, 0);
        fs_info->balance_ctl = NULL;
@@ -2442,6 +2438,211 @@ out:
        return ret;
 }
 
+/*
+ * Real super block validation; every failed check is logged and the remaining
+ * checks still run.  Return 0 if all pass, -EINVAL otherwise.
+ * NOTE: super csum type and incompat features will not be checked here.
+ * @sb:                super block to check
+ * @mirror_num:        the super block number to check its bytenr:
+ *             0       the primary (1st) sb
+ *             1, 2    2nd and 3rd backup copy
+ *            -1       skip bytenr check
+ */
+static int validate_super(struct btrfs_fs_info *fs_info,
+                           struct btrfs_super_block *sb, int mirror_num)
+{
+       u64 nodesize = btrfs_super_nodesize(sb);
+       u64 sectorsize = btrfs_super_sectorsize(sb);
+       int ret = 0;
+
+       if (btrfs_super_magic(sb) != BTRFS_MAGIC) {
+               btrfs_err(fs_info, "no valid FS found");
+               ret = -EINVAL;
+       }
+       if (btrfs_super_flags(sb) & ~BTRFS_SUPER_FLAG_SUPP) {
+               btrfs_err(fs_info, "unrecognized or unsupported super flag: %llu",
+                               btrfs_super_flags(sb) & ~BTRFS_SUPER_FLAG_SUPP);
+               ret = -EINVAL;
+       }
+       if (btrfs_super_root_level(sb) >= BTRFS_MAX_LEVEL) {
+               btrfs_err(fs_info, "tree_root level too big: %d >= %d",
+                               btrfs_super_root_level(sb), BTRFS_MAX_LEVEL);
+               ret = -EINVAL;
+       }
+       if (btrfs_super_chunk_root_level(sb) >= BTRFS_MAX_LEVEL) {
+               btrfs_err(fs_info, "chunk_root level too big: %d >= %d",
+                               btrfs_super_chunk_root_level(sb), BTRFS_MAX_LEVEL);
+               ret = -EINVAL;
+       }
+       if (btrfs_super_log_root_level(sb) >= BTRFS_MAX_LEVEL) {
+               btrfs_err(fs_info, "log_root level too big: %d >= %d",
+                               btrfs_super_log_root_level(sb), BTRFS_MAX_LEVEL);
+               ret = -EINVAL;
+       }
+
+       /*
+        * Check sectorsize and nodesize first, other checks will need them.
+        * Check all possible sectorsizes (4K, 8K, 16K, 32K, 64K) here.
+        */
+       if (!is_power_of_2(sectorsize) || sectorsize < 4096 ||
+           sectorsize > BTRFS_MAX_METADATA_BLOCKSIZE) {
+               btrfs_err(fs_info, "invalid sectorsize %llu", sectorsize);
+               ret = -EINVAL;
+       }
+       /* Only sectorsize == PAGE_SIZE is supported so far */
+       if (sectorsize != PAGE_SIZE) {
+               btrfs_err(fs_info,
+                       "sectorsize %llu not supported yet, only support %lu",
+                       sectorsize, PAGE_SIZE);
+               ret = -EINVAL;
+       }
+       if (!is_power_of_2(nodesize) || nodesize < sectorsize ||
+           nodesize > BTRFS_MAX_METADATA_BLOCKSIZE) {
+               btrfs_err(fs_info, "invalid nodesize %llu", nodesize);
+               ret = -EINVAL;
+       }
+       if (nodesize != le32_to_cpu(sb->__unused_leafsize)) {
+               btrfs_err(fs_info, "invalid leafsize %u, should be %llu",
+                         le32_to_cpu(sb->__unused_leafsize), nodesize);
+               ret = -EINVAL;
+       }
+
+       /* Root alignment check */
+       if (!IS_ALIGNED(btrfs_super_root(sb), sectorsize)) {
+               btrfs_warn(fs_info, "tree_root block unaligned: %llu",
+                          btrfs_super_root(sb));
+               ret = -EINVAL;
+       }
+       if (!IS_ALIGNED(btrfs_super_chunk_root(sb), sectorsize)) {
+               btrfs_warn(fs_info, "chunk_root block unaligned: %llu",
+                          btrfs_super_chunk_root(sb));
+               ret = -EINVAL;
+       }
+       if (!IS_ALIGNED(btrfs_super_log_root(sb), sectorsize)) {
+               btrfs_warn(fs_info, "log_root block unaligned: %llu",
+                          btrfs_super_log_root(sb));
+               ret = -EINVAL;
+       }
+
+       if (memcmp(fs_info->fsid, sb->dev_item.fsid, BTRFS_FSID_SIZE) != 0) {
+               btrfs_err(fs_info,
+                          "dev_item UUID does not match fsid: %pU != %pU",
+                          fs_info->fsid, sb->dev_item.fsid);
+               ret = -EINVAL;
+       }
+
+       /*
+        * Hint to catch really bogus numbers, bitflips or so, more exact checks are
+        * done later
+        */
+       if (btrfs_super_bytes_used(sb) < 6 * btrfs_super_nodesize(sb)) {
+               btrfs_err(fs_info, "bytes_used is too small %llu",
+                         btrfs_super_bytes_used(sb));
+               ret = -EINVAL;
+       }
+       if (!is_power_of_2(btrfs_super_stripesize(sb))) {
+               btrfs_err(fs_info, "invalid stripesize %u",
+                         btrfs_super_stripesize(sb));
+               ret = -EINVAL;
+       }
+       if (btrfs_super_num_devices(sb) > (1UL << 31)) /* warn only, ret is left untouched */
+               btrfs_warn(fs_info, "suspicious number of devices: %llu",
+                          btrfs_super_num_devices(sb));
+       if (btrfs_super_num_devices(sb) == 0) {
+               btrfs_err(fs_info, "number of devices is 0");
+               ret = -EINVAL;
+       }
+
+       if (mirror_num >= 0 &&
+           btrfs_super_bytenr(sb) != btrfs_sb_offset(mirror_num)) {
+               btrfs_err(fs_info, "super offset mismatch %llu != %u",
+                         btrfs_super_bytenr(sb), BTRFS_SUPER_INFO_OFFSET);
+               ret = -EINVAL; /* NOTE(review): message prints BTRFS_SUPER_INFO_OFFSET, not btrfs_sb_offset(mirror_num) - confirm intended */
+       }
+
+       /*
+        * Obvious sys_chunk_array corruptions, it must hold at least one key
+        * and one chunk
+        */
+       if (btrfs_super_sys_array_size(sb) > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE) {
+               btrfs_err(fs_info, "system chunk array too big %u > %u",
+                         btrfs_super_sys_array_size(sb),
+                         BTRFS_SYSTEM_CHUNK_ARRAY_SIZE);
+               ret = -EINVAL;
+       }
+       if (btrfs_super_sys_array_size(sb) < sizeof(struct btrfs_disk_key)
+                       + sizeof(struct btrfs_chunk)) {
+               btrfs_err(fs_info, "system chunk array too small %u < %zu",
+                         btrfs_super_sys_array_size(sb),
+                         sizeof(struct btrfs_disk_key)
+                         + sizeof(struct btrfs_chunk));
+               ret = -EINVAL;
+       }
+
+       /*
+        * The generation is a global counter, we'll trust it more than the others
+        * but it's still possible that it's the one that's wrong.
+        * Both generation checks below only warn; they do not change ret.
+        */
+       if (btrfs_super_generation(sb) < btrfs_super_chunk_root_generation(sb))
+               btrfs_warn(fs_info,
+                       "suspicious: generation < chunk_root_generation: %llu < %llu",
+                       btrfs_super_generation(sb),
+                       btrfs_super_chunk_root_generation(sb));
+       if (btrfs_super_generation(sb) < btrfs_super_cache_generation(sb)
+           && btrfs_super_cache_generation(sb) != (u64)-1)
+               btrfs_warn(fs_info,
+                       "suspicious: generation < cache_generation: %llu < %llu",
+                       btrfs_super_generation(sb),
+                       btrfs_super_cache_generation(sb));
+
+       return ret;
+}
+
+/*
+ * Validation of super block at mount time: checks fs_info->super_copy as the
+ * primary super block (mirror 0).  Checks already done early at mount time,
+ * like csum type and incompat flags, are skipped.
+ */
+static int btrfs_validate_mount_super(struct btrfs_fs_info *fs_info)
+{
+       return validate_super(fs_info, fs_info->super_copy, 0);
+}
+
+/*
+ * Validation of super block at write time.
+ * Some checks like bytenr check will be skipped as their values will be
+ * overwritten soon.  Extra checks like csum type and incompat flags will be
+ * done here.  Return 0, or < 0 (after logging an error) if a check fails.
+ */
+static int btrfs_validate_write_super(struct btrfs_fs_info *fs_info,
+                                     struct btrfs_super_block *sb)
+{
+       int ret;
+
+       ret = validate_super(fs_info, sb, -1); /* -1: skip the bytenr check */
+       if (ret < 0)
+               goto out;
+       if (btrfs_super_csum_type(sb) != BTRFS_CSUM_TYPE_CRC32) {
+               ret = -EUCLEAN;
+               btrfs_err(fs_info, "invalid csum type, has %u want %u",
+                         btrfs_super_csum_type(sb), BTRFS_CSUM_TYPE_CRC32);
+               goto out;
+       }
+       if (btrfs_super_incompat_flags(sb) & ~BTRFS_FEATURE_INCOMPAT_SUPP) {
+               ret = -EUCLEAN;
+               btrfs_err(fs_info,
+               "invalid incompat flags, has 0x%llx valid mask 0x%llx",
+                         btrfs_super_incompat_flags(sb),
+                         (unsigned long long)BTRFS_FEATURE_INCOMPAT_SUPP);
+               goto out;
+       }
+out: /* every failure above funnels through here for one summary message */
+       if (ret < 0)
+               btrfs_err(fs_info,
+               "super block corruption detected before writing it to disk");
+       return ret;
+}
+
 int open_ctree(struct super_block *sb,
               struct btrfs_fs_devices *fs_devices,
               char *options)
@@ -2601,7 +2802,6 @@ int open_ctree(struct super_block *sb,
        mutex_init(&fs_info->chunk_mutex);
        mutex_init(&fs_info->transaction_kthread_mutex);
        mutex_init(&fs_info->cleaner_mutex);
-       mutex_init(&fs_info->volume_mutex);
        mutex_init(&fs_info->ro_block_group_mutex);
        init_rwsem(&fs_info->commit_root_sem);
        init_rwsem(&fs_info->cleanup_work_sem);
@@ -2668,7 +2868,7 @@ int open_ctree(struct super_block *sb,
 
        memcpy(fs_info->fsid, fs_info->super_copy->fsid, BTRFS_FSID_SIZE);
 
-       ret = btrfs_check_super_valid(fs_info);
+       ret = btrfs_validate_mount_super(fs_info);
        if (ret) {
                btrfs_err(fs_info, "superblock contains fatal errors");
                err = -EINVAL;
@@ -3523,7 +3723,7 @@ int btrfs_get_num_tolerated_disk_barrier_failures(u64 flags)
        for (raid_type = 0; raid_type < BTRFS_NR_RAID_TYPES; raid_type++) {
                if (raid_type == BTRFS_RAID_SINGLE)
                        continue;
-               if (!(flags & btrfs_raid_group[raid_type]))
+               if (!(flags & btrfs_raid_array[raid_type].bg_flag))
                        continue;
                min_tolerated = min(min_tolerated,
                                    btrfs_raid_array[raid_type].
@@ -3603,6 +3803,14 @@ int write_all_supers(struct btrfs_fs_info *fs_info, int max_mirrors)
                flags = btrfs_super_flags(sb);
                btrfs_set_super_flags(sb, flags | BTRFS_HEADER_FLAG_WRITTEN);
 
+               ret = btrfs_validate_write_super(fs_info, sb);
+               if (ret < 0) {
+                       mutex_unlock(&fs_info->fs_devices->device_list_mutex);
+                       btrfs_handle_fs_error(fs_info, -EUCLEAN,
+                               "unexpected superblock corruption detected");
+                       return -EUCLEAN;
+               }
+
                ret = write_dev_supers(dev, sb, max_mirrors);
                if (ret)
                        total_errors++;
@@ -3674,8 +3882,6 @@ static void free_fs_root(struct btrfs_root *root)
 {
        iput(root->ino_cache_inode);
        WARN_ON(!RB_EMPTY_ROOT(&root->inode_tree));
-       btrfs_free_block_rsv(root->fs_info, root->orphan_block_rsv);
-       root->orphan_block_rsv = NULL;
        if (root->anon_dev)
                free_anon_bdev(root->anon_dev);
        if (root->subv_writers)
@@ -3766,7 +3972,6 @@ int btrfs_commit_super(struct btrfs_fs_info *fs_info)
 
 void close_ctree(struct btrfs_fs_info *fs_info)
 {
-       struct btrfs_root *root = fs_info->tree_root;
        int ret;
 
        set_bit(BTRFS_FS_CLOSING_START, &fs_info->flags);
@@ -3862,9 +4067,6 @@ void close_ctree(struct btrfs_fs_info *fs_info)
        btrfs_free_stripe_hash_table(fs_info);
        btrfs_free_ref_cache(fs_info);
 
-       __btrfs_free_block_rsv(root->orphan_block_rsv);
-       root->orphan_block_rsv = NULL;
-
        while (!list_empty(&fs_info->pinned_chunks)) {
                struct extent_map *em;
 
@@ -3975,155 +4177,6 @@ int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid, int level,
                                              level, first_key);
 }
 
-static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info)
-{
-       struct btrfs_super_block *sb = fs_info->super_copy;
-       u64 nodesize = btrfs_super_nodesize(sb);
-       u64 sectorsize = btrfs_super_sectorsize(sb);
-       int ret = 0;
-
-       if (btrfs_super_magic(sb) != BTRFS_MAGIC) {
-               btrfs_err(fs_info, "no valid FS found");
-               ret = -EINVAL;
-       }
-       if (btrfs_super_flags(sb) & ~BTRFS_SUPER_FLAG_SUPP) {
-               btrfs_err(fs_info, "unrecognized or unsupported super flag: %llu",
-                               btrfs_super_flags(sb) & ~BTRFS_SUPER_FLAG_SUPP);
-               ret = -EINVAL;
-       }
-       if (btrfs_super_root_level(sb) >= BTRFS_MAX_LEVEL) {
-               btrfs_err(fs_info, "tree_root level too big: %d >= %d",
-                               btrfs_super_root_level(sb), BTRFS_MAX_LEVEL);
-               ret = -EINVAL;
-       }
-       if (btrfs_super_chunk_root_level(sb) >= BTRFS_MAX_LEVEL) {
-               btrfs_err(fs_info, "chunk_root level too big: %d >= %d",
-                               btrfs_super_chunk_root_level(sb), BTRFS_MAX_LEVEL);
-               ret = -EINVAL;
-       }
-       if (btrfs_super_log_root_level(sb) >= BTRFS_MAX_LEVEL) {
-               btrfs_err(fs_info, "log_root level too big: %d >= %d",
-                               btrfs_super_log_root_level(sb), BTRFS_MAX_LEVEL);
-               ret = -EINVAL;
-       }
-
-       /*
-        * Check sectorsize and nodesize first, other check will need it.
-        * Check all possible sectorsize(4K, 8K, 16K, 32K, 64K) here.
-        */
-       if (!is_power_of_2(sectorsize) || sectorsize < 4096 ||
-           sectorsize > BTRFS_MAX_METADATA_BLOCKSIZE) {
-               btrfs_err(fs_info, "invalid sectorsize %llu", sectorsize);
-               ret = -EINVAL;
-       }
-       /* Only PAGE SIZE is supported yet */
-       if (sectorsize != PAGE_SIZE) {
-               btrfs_err(fs_info,
-                       "sectorsize %llu not supported yet, only support %lu",
-                       sectorsize, PAGE_SIZE);
-               ret = -EINVAL;
-       }
-       if (!is_power_of_2(nodesize) || nodesize < sectorsize ||
-           nodesize > BTRFS_MAX_METADATA_BLOCKSIZE) {
-               btrfs_err(fs_info, "invalid nodesize %llu", nodesize);
-               ret = -EINVAL;
-       }
-       if (nodesize != le32_to_cpu(sb->__unused_leafsize)) {
-               btrfs_err(fs_info, "invalid leafsize %u, should be %llu",
-                         le32_to_cpu(sb->__unused_leafsize), nodesize);
-               ret = -EINVAL;
-       }
-
-       /* Root alignment check */
-       if (!IS_ALIGNED(btrfs_super_root(sb), sectorsize)) {
-               btrfs_warn(fs_info, "tree_root block unaligned: %llu",
-                          btrfs_super_root(sb));
-               ret = -EINVAL;
-       }
-       if (!IS_ALIGNED(btrfs_super_chunk_root(sb), sectorsize)) {
-               btrfs_warn(fs_info, "chunk_root block unaligned: %llu",
-                          btrfs_super_chunk_root(sb));
-               ret = -EINVAL;
-       }
-       if (!IS_ALIGNED(btrfs_super_log_root(sb), sectorsize)) {
-               btrfs_warn(fs_info, "log_root block unaligned: %llu",
-                          btrfs_super_log_root(sb));
-               ret = -EINVAL;
-       }
-
-       if (memcmp(fs_info->fsid, sb->dev_item.fsid, BTRFS_FSID_SIZE) != 0) {
-               btrfs_err(fs_info,
-                          "dev_item UUID does not match fsid: %pU != %pU",
-                          fs_info->fsid, sb->dev_item.fsid);
-               ret = -EINVAL;
-       }
-
-       /*
-        * Hint to catch really bogus numbers, bitflips or so, more exact checks are
-        * done later
-        */
-       if (btrfs_super_bytes_used(sb) < 6 * btrfs_super_nodesize(sb)) {
-               btrfs_err(fs_info, "bytes_used is too small %llu",
-                         btrfs_super_bytes_used(sb));
-               ret = -EINVAL;
-       }
-       if (!is_power_of_2(btrfs_super_stripesize(sb))) {
-               btrfs_err(fs_info, "invalid stripesize %u",
-                         btrfs_super_stripesize(sb));
-               ret = -EINVAL;
-       }
-       if (btrfs_super_num_devices(sb) > (1UL << 31))
-               btrfs_warn(fs_info, "suspicious number of devices: %llu",
-                          btrfs_super_num_devices(sb));
-       if (btrfs_super_num_devices(sb) == 0) {
-               btrfs_err(fs_info, "number of devices is 0");
-               ret = -EINVAL;
-       }
-
-       if (btrfs_super_bytenr(sb) != BTRFS_SUPER_INFO_OFFSET) {
-               btrfs_err(fs_info, "super offset mismatch %llu != %u",
-                         btrfs_super_bytenr(sb), BTRFS_SUPER_INFO_OFFSET);
-               ret = -EINVAL;
-       }
-
-       /*
-        * Obvious sys_chunk_array corruptions, it must hold at least one key
-        * and one chunk
-        */
-       if (btrfs_super_sys_array_size(sb) > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE) {
-               btrfs_err(fs_info, "system chunk array too big %u > %u",
-                         btrfs_super_sys_array_size(sb),
-                         BTRFS_SYSTEM_CHUNK_ARRAY_SIZE);
-               ret = -EINVAL;
-       }
-       if (btrfs_super_sys_array_size(sb) < sizeof(struct btrfs_disk_key)
-                       + sizeof(struct btrfs_chunk)) {
-               btrfs_err(fs_info, "system chunk array too small %u < %zu",
-                         btrfs_super_sys_array_size(sb),
-                         sizeof(struct btrfs_disk_key)
-                         + sizeof(struct btrfs_chunk));
-               ret = -EINVAL;
-       }
-
-       /*
-        * The generation is a global counter, we'll trust it more than the others
-        * but it's still possible that it's the one that's wrong.
-        */
-       if (btrfs_super_generation(sb) < btrfs_super_chunk_root_generation(sb))
-               btrfs_warn(fs_info,
-                       "suspicious: generation < chunk_root_generation: %llu < %llu",
-                       btrfs_super_generation(sb),
-                       btrfs_super_chunk_root_generation(sb));
-       if (btrfs_super_generation(sb) < btrfs_super_cache_generation(sb)
-           && btrfs_super_cache_generation(sb) != (u64)-1)
-               btrfs_warn(fs_info,
-                       "suspicious: generation < cache_generation: %llu < %llu",
-                       btrfs_super_generation(sb),
-                       btrfs_super_cache_generation(sb));
-
-       return ret;
-}
-
 static void btrfs_error_commit_super(struct btrfs_fs_info *fs_info)
 {
        /* cleanup FS via transaction */
index 51b5e2da708c4e77566cb686aea81b4a1cfa3497..3d9fe58c0080db89f4933f162f8bf7f8d6df1c85 100644 (file)
@@ -66,10 +66,8 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
                                      u64 flags, u64 owner, u64 offset,
                                      struct btrfs_key *ins, int ref_mod);
 static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
-                                    struct btrfs_fs_info *fs_info,
-                                    u64 parent, u64 root_objectid,
-                                    u64 flags, struct btrfs_disk_key *key,
-                                    int level, struct btrfs_key *ins);
+                                    struct btrfs_delayed_ref_node *node,
+                                    struct btrfs_delayed_extent_op *extent_op);
 static int do_chunk_alloc(struct btrfs_trans_handle *trans,
                          struct btrfs_fs_info *fs_info, u64 flags,
                          int force);
@@ -256,7 +254,7 @@ static int exclude_super_stripes(struct btrfs_fs_info *fs_info,
        for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
                bytenr = btrfs_sb_offset(i);
                ret = btrfs_rmap_block(fs_info, cache->key.objectid,
-                                      bytenr, 0, &logical, &nr, &stripe_len);
+                                      bytenr, &logical, &nr, &stripe_len);
                if (ret)
                        return ret;
 
@@ -343,8 +341,9 @@ static void fragment_free_space(struct btrfs_block_group_cache *block_group)
  * since their free space will be released as soon as the transaction commits.
  */
 u64 add_new_free_space(struct btrfs_block_group_cache *block_group,
-                      struct btrfs_fs_info *info, u64 start, u64 end)
+                      u64 start, u64 end)
 {
+       struct btrfs_fs_info *info = block_group->fs_info;
        u64 extent_start, extent_end, size, total_added = 0;
        int ret;
 
@@ -489,8 +488,7 @@ next:
 
                if (key.type == BTRFS_EXTENT_ITEM_KEY ||
                    key.type == BTRFS_METADATA_ITEM_KEY) {
-                       total_found += add_new_free_space(block_group,
-                                                         fs_info, last,
+                       total_found += add_new_free_space(block_group, last,
                                                          key.objectid);
                        if (key.type == BTRFS_METADATA_ITEM_KEY)
                                last = key.objectid +
@@ -508,7 +506,7 @@ next:
        }
        ret = 0;
 
-       total_found += add_new_free_space(block_group, fs_info, last,
+       total_found += add_new_free_space(block_group, last,
                                          block_group->key.objectid +
                                          block_group->key.offset);
        caching_ctl->progress = (u64)-1;
@@ -744,12 +742,12 @@ static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info,
 }
 
 static void add_pinned_bytes(struct btrfs_fs_info *fs_info, s64 num_bytes,
-                            u64 owner, u64 root_objectid)
+                            bool metadata, u64 root_objectid)
 {
        struct btrfs_space_info *space_info;
        u64 flags;
 
-       if (owner < BTRFS_FIRST_FREE_OBJECTID) {
+       if (metadata) {
                if (root_objectid == BTRFS_CHUNK_TREE_OBJECTID)
                        flags = BTRFS_BLOCK_GROUP_SYSTEM;
                else
@@ -2200,8 +2198,11 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
                                                 &old_ref_mod, &new_ref_mod);
        }
 
-       if (ret == 0 && old_ref_mod < 0 && new_ref_mod >= 0)
-               add_pinned_bytes(fs_info, -num_bytes, owner, root_objectid);
+       if (ret == 0 && old_ref_mod < 0 && new_ref_mod >= 0) {
+               bool metadata = owner < BTRFS_FIRST_FREE_OBJECTID;
+
+               add_pinned_bytes(fs_info, -num_bytes, metadata, root_objectid);
+       }
 
        return ret;
 }
@@ -2428,10 +2429,8 @@ static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
 {
        int ret = 0;
        struct btrfs_delayed_tree_ref *ref;
-       struct btrfs_key ins;
        u64 parent = 0;
        u64 ref_root = 0;
-       bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA);
 
        ref = btrfs_delayed_node_to_tree_ref(node);
        trace_run_delayed_tree_ref(fs_info, node, ref, node->action);
@@ -2440,15 +2439,6 @@ static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
                parent = ref->parent;
        ref_root = ref->root;
 
-       ins.objectid = node->bytenr;
-       if (skinny_metadata) {
-               ins.offset = ref->level;
-               ins.type = BTRFS_METADATA_ITEM_KEY;
-       } else {
-               ins.offset = node->num_bytes;
-               ins.type = BTRFS_EXTENT_ITEM_KEY;
-       }
-
        if (node->ref_mod != 1) {
                btrfs_err(fs_info,
        "btree block(%llu) has %d references rather than 1: action %d ref_root %llu parent %llu",
@@ -2458,11 +2448,7 @@ static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
        }
        if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
                BUG_ON(!extent_op || !extent_op->update_flags);
-               ret = alloc_reserved_tree_block(trans, fs_info,
-                                               parent, ref_root,
-                                               extent_op->flags_to_set,
-                                               &extent_op->key,
-                                               ref->level, &ins);
+               ret = alloc_reserved_tree_block(trans, node, extent_op);
        } else if (node->action == BTRFS_ADD_DELAYED_REF) {
                ret = __btrfs_inc_extent_ref(trans, fs_info, node,
                                             parent, ref_root,
@@ -2594,8 +2580,8 @@ static int cleanup_ref_head(struct btrfs_trans_handle *trans,
        delayed_refs->num_heads--;
        rb_erase(&head->href_node, &delayed_refs->href_root);
        RB_CLEAR_NODE(&head->href_node);
-       spin_unlock(&delayed_refs->lock);
        spin_unlock(&head->lock);
+       spin_unlock(&delayed_refs->lock);
        atomic_dec(&delayed_refs->num_entries);
 
        trace_run_delayed_ref_head(fs_info, head, 0);
@@ -2700,17 +2686,12 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
                 * insert_inline_extent_backref()).
                 */
                spin_lock(&locked_ref->lock);
-               btrfs_merge_delayed_refs(trans, fs_info, delayed_refs,
-                                        locked_ref);
+               btrfs_merge_delayed_refs(trans, delayed_refs, locked_ref);
 
-               /*
-                * locked_ref is the head node, so we have to go one
-                * node back for any delayed ref updates
-                */
                ref = select_delayed_ref(locked_ref);
 
                if (ref && ref->seq &&
-                   btrfs_check_delayed_seq(fs_info, delayed_refs, ref->seq)) {
+                   btrfs_check_delayed_seq(fs_info, ref->seq)) {
                        spin_unlock(&locked_ref->lock);
                        unselect_delayed_ref_head(delayed_refs, locked_ref);
                        locked_ref = NULL;
@@ -3291,7 +3272,7 @@ int btrfs_cross_ref_exist(struct btrfs_root *root, u64 objectid, u64 offset,
 
        path = btrfs_alloc_path();
        if (!path)
-               return -ENOENT;
+               return -ENOMEM;
 
        do {
                ret = check_committed_ref(root, path, objectid,
@@ -4026,8 +4007,7 @@ static const char *alloc_name(u64 flags)
        };
 }
 
-static int create_space_info(struct btrfs_fs_info *info, u64 flags,
-                            struct btrfs_space_info **new)
+static int create_space_info(struct btrfs_fs_info *info, u64 flags)
 {
 
        struct btrfs_space_info *space_info;
@@ -4065,7 +4045,6 @@ static int create_space_info(struct btrfs_fs_info *info, u64 flags,
                return ret;
        }
 
-       *new = space_info;
        list_add_rcu(&space_info->list, &info->space_info);
        if (flags & BTRFS_BLOCK_GROUP_DATA)
                info->data_sinfo = space_info;
@@ -4122,7 +4101,7 @@ static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
  * returns target flags in extended format or 0 if restripe for this
  * chunk_type is not in progress
  *
- * should be called with either volume_mutex or balance_lock held
+ * should be called with balance_lock held
  */
 static u64 get_restripe_target(struct btrfs_fs_info *fs_info, u64 flags)
 {
@@ -4178,7 +4157,7 @@ static u64 btrfs_reduce_alloc_profile(struct btrfs_fs_info *fs_info, u64 flags)
        /* First, mask out the RAID levels which aren't possible */
        for (raid_type = 0; raid_type < BTRFS_NR_RAID_TYPES; raid_type++) {
                if (num_devices >= btrfs_raid_array[raid_type].devs_min)
-                       allowed |= btrfs_raid_group[raid_type];
+                       allowed |= btrfs_raid_array[raid_type].bg_flag;
        }
        allowed &= flags;
 
@@ -4341,7 +4320,7 @@ commit_trans:
                        need_commit--;
 
                        if (need_commit > 0) {
-                               btrfs_start_delalloc_roots(fs_info, 0, -1);
+                               btrfs_start_delalloc_roots(fs_info, -1);
                                btrfs_wait_ordered_roots(fs_info, U64_MAX, 0,
                                                         (u64)-1);
                        }
@@ -4678,12 +4657,14 @@ again:
        trans->allocating_chunk = false;
 
        spin_lock(&space_info->lock);
-       if (ret < 0 && ret != -ENOSPC)
-               goto out;
-       if (ret)
-               space_info->full = 1;
-       else
+       if (ret < 0) {
+               if (ret == -ENOSPC)
+                       space_info->full = 1;
+               else
+                       goto out;
+       } else {
                ret = 1;
+       }
 
        space_info->force_alloc = CHUNK_ALLOC_NO_FORCE;
 out:
@@ -4792,7 +4773,7 @@ static void btrfs_writeback_inodes_sb_nr(struct btrfs_fs_info *fs_info,
                 * the filesystem is readonly(all dirty pages are written to
                 * the disk).
                 */
-               btrfs_start_delalloc_roots(fs_info, 0, nr_items);
+               btrfs_start_delalloc_roots(fs_info, nr_items);
                if (!current->journal_info)
                        btrfs_wait_ordered_roots(fs_info, nr_items, 0, (u64)-1);
        }
@@ -5949,44 +5930,6 @@ void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans)
        trans->chunk_bytes_reserved = 0;
 }
 
-/* Can only return 0 or -ENOSPC */
-int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans,
-                                 struct btrfs_inode *inode)
-{
-       struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb);
-       struct btrfs_root *root = inode->root;
-       /*
-        * We always use trans->block_rsv here as we will have reserved space
-        * for our orphan when starting the transaction, using get_block_rsv()
-        * here will sometimes make us choose the wrong block rsv as we could be
-        * doing a reloc inode for a non refcounted root.
-        */
-       struct btrfs_block_rsv *src_rsv = trans->block_rsv;
-       struct btrfs_block_rsv *dst_rsv = root->orphan_block_rsv;
-
-       /*
-        * We need to hold space in order to delete our orphan item once we've
-        * added it, so this takes the reservation so we can release it later
-        * when we are truly done with the orphan item.
-        */
-       u64 num_bytes = btrfs_calc_trans_metadata_size(fs_info, 1);
-
-       trace_btrfs_space_reservation(fs_info, "orphan", btrfs_ino(inode),
-                       num_bytes, 1);
-       return btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes, 1);
-}
-
-void btrfs_orphan_release_metadata(struct btrfs_inode *inode)
-{
-       struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb);
-       struct btrfs_root *root = inode->root;
-       u64 num_bytes = btrfs_calc_trans_metadata_size(fs_info, 1);
-
-       trace_btrfs_space_reservation(fs_info, "orphan", btrfs_ino(inode),
-                       num_bytes, 0);
-       btrfs_block_rsv_release(fs_info, root->orphan_block_rsv, num_bytes);
-}
-
 /*
  * btrfs_subvolume_reserve_metadata() - reserve space for subvolume operation
  * root: the root of the parent directory
@@ -6004,7 +5947,6 @@ void btrfs_orphan_release_metadata(struct btrfs_inode *inode)
 int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
                                     struct btrfs_block_rsv *rsv,
                                     int items,
-                                    u64 *qgroup_reserved,
                                     bool use_global_rsv)
 {
        u64 num_bytes;
@@ -6022,8 +5964,6 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
                num_bytes = 0;
        }
 
-       *qgroup_reserved = num_bytes;
-
        num_bytes = btrfs_calc_trans_metadata_size(fs_info, items);
        rsv->space_info = __find_space_info(fs_info,
                                            BTRFS_BLOCK_GROUP_METADATA);
@@ -6033,8 +5973,8 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
        if (ret == -ENOSPC && use_global_rsv)
                ret = btrfs_block_rsv_migrate(global_rsv, rsv, num_bytes, 1);
 
-       if (ret && *qgroup_reserved)
-               btrfs_qgroup_free_meta_prealloc(root, *qgroup_reserved);
+       if (ret && num_bytes)
+               btrfs_qgroup_free_meta_prealloc(root, num_bytes);
 
        return ret;
 }
@@ -6354,6 +6294,7 @@ static int update_block_group(struct btrfs_trans_handle *trans,
                        spin_lock(&info->unused_bgs_lock);
                        if (list_empty(&cache->bg_list)) {
                                btrfs_get_block_group(cache);
+                               trace_btrfs_add_unused_block_group(cache);
                                list_add_tail(&cache->bg_list,
                                              &info->unused_bgs);
                        }
@@ -6511,6 +6452,7 @@ int btrfs_exclude_logged_extents(struct btrfs_fs_info *fs_info,
        struct btrfs_key key;
        int found_type;
        int i;
+       int ret = 0;
 
        if (!btrfs_fs_incompat(fs_info, MIXED_GROUPS))
                return 0;
@@ -6527,10 +6469,12 @@ int btrfs_exclude_logged_extents(struct btrfs_fs_info *fs_info,
                        continue;
                key.objectid = btrfs_file_extent_disk_bytenr(eb, item);
                key.offset = btrfs_file_extent_disk_num_bytes(eb, item);
-               __exclude_logged_extent(fs_info, key.objectid, key.offset);
+               ret = __exclude_logged_extent(fs_info, key.objectid, key.offset);
+               if (ret)
+                       break;
        }
 
-       return 0;
+       return ret;
 }
 
 static void
@@ -7122,7 +7066,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
                        }
                }
 
-               ret = add_to_free_space_tree(trans, info, bytenr, num_bytes);
+               ret = add_to_free_space_tree(trans, bytenr, num_bytes);
                if (ret) {
                        btrfs_abort_transaction(trans, ret);
                        goto out;
@@ -7266,7 +7210,7 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
        }
 out:
        if (pin)
-               add_pinned_bytes(fs_info, buf->len, btrfs_header_level(buf),
+               add_pinned_bytes(fs_info, buf->len, true,
                                 root->root_key.objectid);
 
        if (last_ref) {
@@ -7320,8 +7264,11 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans,
                                                 &old_ref_mod, &new_ref_mod);
        }
 
-       if (ret == 0 && old_ref_mod >= 0 && new_ref_mod < 0)
-               add_pinned_bytes(fs_info, num_bytes, owner, root_objectid);
+       if (ret == 0 && old_ref_mod >= 0 && new_ref_mod < 0) {
+               bool metadata = owner < BTRFS_FIRST_FREE_OBJECTID;
+
+               add_pinned_bytes(fs_info, num_bytes, metadata, root_objectid);
+       }
 
        return ret;
 }
@@ -7373,24 +7320,6 @@ wait_block_group_cache_done(struct btrfs_block_group_cache *cache)
        return ret;
 }
 
-static const char *btrfs_raid_type_names[BTRFS_NR_RAID_TYPES] = {
-       [BTRFS_RAID_RAID10]     = "raid10",
-       [BTRFS_RAID_RAID1]      = "raid1",
-       [BTRFS_RAID_DUP]        = "dup",
-       [BTRFS_RAID_RAID0]      = "raid0",
-       [BTRFS_RAID_SINGLE]     = "single",
-       [BTRFS_RAID_RAID5]      = "raid5",
-       [BTRFS_RAID_RAID6]      = "raid6",
-};
-
-static const char *get_raid_name(enum btrfs_raid_types type)
-{
-       if (type >= BTRFS_NR_RAID_TYPES)
-               return NULL;
-
-       return btrfs_raid_type_names[type];
-}
-
 enum btrfs_loop_type {
        LOOP_CACHING_NOWAIT = 0,
        LOOP_CACHING_WAIT = 1,
@@ -7662,7 +7591,7 @@ have_block_group:
                        if (offset) {
                                /* we have a block, we're done */
                                spin_unlock(&last_ptr->refill_lock);
-                               trace_btrfs_reserve_extent_cluster(fs_info,
+                               trace_btrfs_reserve_extent_cluster(
                                                used_block_group,
                                                search_start, num_bytes);
                                if (used_block_group != block_group) {
@@ -7735,7 +7664,7 @@ refill_cluster:
                                if (offset) {
                                        /* we found one, proceed */
                                        spin_unlock(&last_ptr->refill_lock);
-                                       trace_btrfs_reserve_extent_cluster(fs_info,
+                                       trace_btrfs_reserve_extent_cluster(
                                                block_group, search_start,
                                                num_bytes);
                                        goto checks;
@@ -7835,8 +7764,7 @@ checks:
                ins->objectid = search_start;
                ins->offset = num_bytes;
 
-               trace_btrfs_reserve_extent(fs_info, block_group,
-                                          search_start, num_bytes);
+               trace_btrfs_reserve_extent(block_group, search_start, num_bytes);
                btrfs_release_block_group(block_group, delalloc);
                break;
 loop:
@@ -8184,8 +8112,7 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
        btrfs_mark_buffer_dirty(path->nodes[0]);
        btrfs_free_path(path);
 
-       ret = remove_from_free_space_tree(trans, fs_info, ins->objectid,
-                                         ins->offset);
+       ret = remove_from_free_space_tree(trans, ins->objectid, ins->offset);
        if (ret)
                return ret;
 
@@ -8200,37 +8127,52 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
 }
 
 static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
-                                    struct btrfs_fs_info *fs_info,
-                                    u64 parent, u64 root_objectid,
-                                    u64 flags, struct btrfs_disk_key *key,
-                                    int level, struct btrfs_key *ins)
+                                    struct btrfs_delayed_ref_node *node,
+                                    struct btrfs_delayed_extent_op *extent_op)
 {
+       struct btrfs_fs_info *fs_info = trans->fs_info;
        int ret;
        struct btrfs_extent_item *extent_item;
+       struct btrfs_key extent_key;
        struct btrfs_tree_block_info *block_info;
        struct btrfs_extent_inline_ref *iref;
        struct btrfs_path *path;
        struct extent_buffer *leaf;
+       struct btrfs_delayed_tree_ref *ref;
        u32 size = sizeof(*extent_item) + sizeof(*iref);
-       u64 num_bytes = ins->offset;
+       u64 num_bytes;
+       u64 flags = extent_op->flags_to_set;
        bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA);
 
-       if (!skinny_metadata)
+       ref = btrfs_delayed_node_to_tree_ref(node);
+
+       extent_key.objectid = node->bytenr;
+       if (skinny_metadata) {
+               extent_key.offset = ref->level;
+               extent_key.type = BTRFS_METADATA_ITEM_KEY;
+               num_bytes = fs_info->nodesize;
+       } else {
+               extent_key.offset = node->num_bytes;
+               extent_key.type = BTRFS_EXTENT_ITEM_KEY;
                size += sizeof(*block_info);
+               num_bytes = node->num_bytes;
+       }
 
        path = btrfs_alloc_path();
        if (!path) {
-               btrfs_free_and_pin_reserved_extent(fs_info, ins->objectid,
+               btrfs_free_and_pin_reserved_extent(fs_info,
+                                                  extent_key.objectid,
                                                   fs_info->nodesize);
                return -ENOMEM;
        }
 
        path->leave_spinning = 1;
        ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
-                                     ins, size);
+                                     &extent_key, size);
        if (ret) {
                btrfs_free_path(path);
-               btrfs_free_and_pin_reserved_extent(fs_info, ins->objectid,
+               btrfs_free_and_pin_reserved_extent(fs_info,
+                                                  extent_key.objectid,
                                                   fs_info->nodesize);
                return ret;
        }
@@ -8245,42 +8187,41 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
 
        if (skinny_metadata) {
                iref = (struct btrfs_extent_inline_ref *)(extent_item + 1);
-               num_bytes = fs_info->nodesize;
        } else {
                block_info = (struct btrfs_tree_block_info *)(extent_item + 1);
-               btrfs_set_tree_block_key(leaf, block_info, key);
-               btrfs_set_tree_block_level(leaf, block_info, level);
+               btrfs_set_tree_block_key(leaf, block_info, &extent_op->key);
+               btrfs_set_tree_block_level(leaf, block_info, ref->level);
                iref = (struct btrfs_extent_inline_ref *)(block_info + 1);
        }
 
-       if (parent > 0) {
+       if (node->type == BTRFS_SHARED_BLOCK_REF_KEY) {
                BUG_ON(!(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));
                btrfs_set_extent_inline_ref_type(leaf, iref,
                                                 BTRFS_SHARED_BLOCK_REF_KEY);
-               btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
+               btrfs_set_extent_inline_ref_offset(leaf, iref, ref->parent);
        } else {
                btrfs_set_extent_inline_ref_type(leaf, iref,
                                                 BTRFS_TREE_BLOCK_REF_KEY);
-               btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid);
+               btrfs_set_extent_inline_ref_offset(leaf, iref, ref->root);
        }
 
        btrfs_mark_buffer_dirty(leaf);
        btrfs_free_path(path);
 
-       ret = remove_from_free_space_tree(trans, fs_info, ins->objectid,
+       ret = remove_from_free_space_tree(trans, extent_key.objectid,
                                          num_bytes);
        if (ret)
                return ret;
 
-       ret = update_block_group(trans, fs_info, ins->objectid,
+       ret = update_block_group(trans, fs_info, extent_key.objectid,
                                 fs_info->nodesize, 1);
        if (ret) { /* -ENOENT, logic error */
                btrfs_err(fs_info, "update block group failed for %llu %llu",
-                       ins->objectid, ins->offset);
+                       extent_key.objectid, extent_key.offset);
                BUG();
        }
 
-       trace_btrfs_reserved_extent_alloc(fs_info, ins->objectid,
+       trace_btrfs_reserved_extent_alloc(fs_info, extent_key.objectid,
                                          fs_info->nodesize);
        return ret;
 }
@@ -10173,8 +10114,7 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info)
                } else if (btrfs_block_group_used(&cache->item) == 0) {
                        cache->last_byte_to_unpin = (u64)-1;
                        cache->cached = BTRFS_CACHE_FINISHED;
-                       add_new_free_space(cache, info,
-                                          found_key.objectid,
+                       add_new_free_space(cache, found_key.objectid,
                                           found_key.objectid +
                                           found_key.offset);
                        free_excluded_extents(info, cache);
@@ -10204,6 +10144,7 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info)
                        /* Should always be true but just in case. */
                        if (list_empty(&cache->bg_list)) {
                                btrfs_get_block_group(cache);
+                               trace_btrfs_add_unused_block_group(cache);
                                list_add_tail(&cache->bg_list,
                                              &info->unused_bgs);
                        }
@@ -10269,7 +10210,7 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans)
                                               key.offset);
                if (ret)
                        btrfs_abort_transaction(trans, ret);
-               add_block_group_free_space(trans, fs_info, block_group);
+               add_block_group_free_space(trans, block_group);
                /* already aborted the transaction if it failed. */
 next:
                list_del_init(&block_group->bg_list);
@@ -10310,7 +10251,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
                return ret;
        }
 
-       add_new_free_space(cache, fs_info, chunk_offset, chunk_offset + size);
+       add_new_free_space(cache, chunk_offset, chunk_offset + size);
 
        free_excluded_extents(fs_info, cache);
 
@@ -10391,6 +10332,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
        BUG_ON(!block_group);
        BUG_ON(!block_group->ro);
 
+       trace_btrfs_remove_block_group(block_group);
        /*
         * Free the reserved super bytes from this block group before
         * remove it.
@@ -10648,7 +10590,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
 
        mutex_unlock(&fs_info->chunk_mutex);
 
-       ret = remove_block_group_free_space(trans, fs_info, block_group);
+       ret = remove_block_group_free_space(trans, block_group);
        if (ret)
                goto out;
 
@@ -10755,6 +10697,7 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
                         * the ro check in case balance is currently acting on
                         * this block group.
                         */
+                       trace_btrfs_skip_unused_block_group(block_group);
                        spin_unlock(&block_group->lock);
                        up_write(&space_info->groups_sem);
                        goto next;
@@ -10877,7 +10820,6 @@ next:
 
 int btrfs_init_space_info(struct btrfs_fs_info *fs_info)
 {
-       struct btrfs_space_info *space_info;
        struct btrfs_super_block *disk_super;
        u64 features;
        u64 flags;
@@ -10893,21 +10835,21 @@ int btrfs_init_space_info(struct btrfs_fs_info *fs_info)
                mixed = 1;
 
        flags = BTRFS_BLOCK_GROUP_SYSTEM;
-       ret = create_space_info(fs_info, flags, &space_info);
+       ret = create_space_info(fs_info, flags);
        if (ret)
                goto out;
 
        if (mixed) {
                flags = BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA;
-               ret = create_space_info(fs_info, flags, &space_info);
+               ret = create_space_info(fs_info, flags);
        } else {
                flags = BTRFS_BLOCK_GROUP_METADATA;
-               ret = create_space_info(fs_info, flags, &space_info);
+               ret = create_space_info(fs_info, flags);
                if (ret)
                        goto out;
 
                flags = BTRFS_BLOCK_GROUP_DATA;
-               ret = create_space_info(fs_info, flags, &space_info);
+               ret = create_space_info(fs_info, flags);
        }
 out:
        return ret;
@@ -11092,12 +11034,7 @@ int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range)
 void btrfs_end_write_no_snapshotting(struct btrfs_root *root)
 {
        percpu_counter_dec(&root->subv_writers->counter);
-       /*
-        * Make sure counter is updated before we wake up waiters.
-        */
-       smp_mb();
-       if (waitqueue_active(&root->subv_writers->wait))
-               wake_up(&root->subv_writers->wait);
+       cond_wake_up(&root->subv_writers->wait);
 }
 
 int btrfs_start_write_no_snapshotting(struct btrfs_root *root)
index 56d32bb462f941d6890f8a6b80226ee3f1381eb0..51fc015c7d2c69bd198e94c3ffc4b2ef267f7ef2 100644 (file)
@@ -4106,14 +4106,13 @@ int extent_write_locked_range(struct inode *inode, u64 start, u64 end,
        return ret;
 }
 
-int extent_writepages(struct extent_io_tree *tree,
-                     struct address_space *mapping,
+int extent_writepages(struct address_space *mapping,
                      struct writeback_control *wbc)
 {
        int ret = 0;
        struct extent_page_data epd = {
                .bio = NULL,
-               .tree = tree,
+               .tree = &BTRFS_I(mapping->host)->io_tree,
                .extent_locked = 0,
                .sync_io = wbc->sync_mode == WB_SYNC_ALL,
        };
@@ -4123,9 +4122,8 @@ int extent_writepages(struct extent_io_tree *tree,
        return ret;
 }
 
-int extent_readpages(struct extent_io_tree *tree,
-                    struct address_space *mapping,
-                    struct list_head *pages, unsigned nr_pages)
+int extent_readpages(struct address_space *mapping, struct list_head *pages,
+                    unsigned nr_pages)
 {
        struct bio *bio = NULL;
        unsigned page_idx;
@@ -4133,6 +4131,7 @@ int extent_readpages(struct extent_io_tree *tree,
        struct page *pagepool[16];
        struct page *page;
        struct extent_map *em_cached = NULL;
+       struct extent_io_tree *tree = &BTRFS_I(mapping->host)->io_tree;
        int nr = 0;
        u64 prev_em_start = (u64)-1;
 
@@ -4199,8 +4198,7 @@ int extent_invalidatepage(struct extent_io_tree *tree,
  * are locked or under IO and drops the related state bits if it is safe
  * to drop the page.
  */
-static int try_release_extent_state(struct extent_map_tree *map,
-                                   struct extent_io_tree *tree,
+static int try_release_extent_state(struct extent_io_tree *tree,
                                    struct page *page, gfp_t mask)
 {
        u64 start = page_offset(page);
@@ -4235,13 +4233,13 @@ static int try_release_extent_state(struct extent_map_tree *map,
  * in the range corresponding to the page, both state records and extent
  * map records are removed
  */
-int try_release_extent_mapping(struct extent_map_tree *map,
-                              struct extent_io_tree *tree, struct page *page,
-                              gfp_t mask)
+int try_release_extent_mapping(struct page *page, gfp_t mask)
 {
        struct extent_map *em;
        u64 start = page_offset(page);
        u64 end = start + PAGE_SIZE - 1;
+       struct extent_io_tree *tree = &BTRFS_I(page->mapping->host)->io_tree;
+       struct extent_map_tree *map = &BTRFS_I(page->mapping->host)->extent_tree;
 
        if (gfpflags_allow_blocking(mask) &&
            page->mapping->host->i_size > SZ_16M) {
@@ -4275,7 +4273,7 @@ int try_release_extent_mapping(struct extent_map_tree *map,
                        free_extent_map(em);
                }
        }
-       return try_release_extent_state(map, tree, page, mask);
+       return try_release_extent_state(tree, page, mask);
 }
 
 /*
@@ -5617,46 +5615,6 @@ void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src,
        }
 }
 
-void le_bitmap_set(u8 *map, unsigned int start, int len)
-{
-       u8 *p = map + BIT_BYTE(start);
-       const unsigned int size = start + len;
-       int bits_to_set = BITS_PER_BYTE - (start % BITS_PER_BYTE);
-       u8 mask_to_set = BITMAP_FIRST_BYTE_MASK(start);
-
-       while (len - bits_to_set >= 0) {
-               *p |= mask_to_set;
-               len -= bits_to_set;
-               bits_to_set = BITS_PER_BYTE;
-               mask_to_set = ~0;
-               p++;
-       }
-       if (len) {
-               mask_to_set &= BITMAP_LAST_BYTE_MASK(size);
-               *p |= mask_to_set;
-       }
-}
-
-void le_bitmap_clear(u8 *map, unsigned int start, int len)
-{
-       u8 *p = map + BIT_BYTE(start);
-       const unsigned int size = start + len;
-       int bits_to_clear = BITS_PER_BYTE - (start % BITS_PER_BYTE);
-       u8 mask_to_clear = BITMAP_FIRST_BYTE_MASK(start);
-
-       while (len - bits_to_clear >= 0) {
-               *p &= ~mask_to_clear;
-               len -= bits_to_clear;
-               bits_to_clear = BITS_PER_BYTE;
-               mask_to_clear = ~0;
-               p++;
-       }
-       if (len) {
-               mask_to_clear &= BITMAP_LAST_BYTE_MASK(size);
-               *p &= ~mask_to_clear;
-       }
-}
-
 /*
  * eb_bitmap_offset() - calculate the page and offset of the byte containing the
  * given bit number
index a53009694b164dad592013603b8d83bb6df999e9..0bfd4aeb822dd95f42e1cbe9ecc76fb5dc06a41d 100644 (file)
 #define BITMAP_LAST_BYTE_MASK(nbits) \
        (BYTE_MASK >> (-(nbits) & (BITS_PER_BYTE - 1)))
 
-static inline int le_test_bit(int nr, const u8 *addr)
-{
-       return 1U & (addr[BIT_BYTE(nr)] >> (nr & (BITS_PER_BYTE-1)));
-}
-
-void le_bitmap_set(u8 *map, unsigned int start, int len);
-void le_bitmap_clear(u8 *map, unsigned int start, int len);
-
 struct extent_state;
 struct btrfs_root;
 struct btrfs_inode;
@@ -278,9 +270,7 @@ typedef struct extent_map *(get_extent_t)(struct btrfs_inode *inode,
                                          int create);
 
 void extent_io_tree_init(struct extent_io_tree *tree, void *private_data);
-int try_release_extent_mapping(struct extent_map_tree *map,
-                              struct extent_io_tree *tree, struct page *page,
-                              gfp_t mask);
+int try_release_extent_mapping(struct page *page, gfp_t mask);
 int try_release_extent_buffer(struct page *page);
 int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
                     struct extent_state **cached);
@@ -421,14 +411,12 @@ int extent_invalidatepage(struct extent_io_tree *tree,
 int extent_write_full_page(struct page *page, struct writeback_control *wbc);
 int extent_write_locked_range(struct inode *inode, u64 start, u64 end,
                              int mode);
-int extent_writepages(struct extent_io_tree *tree,
-                     struct address_space *mapping,
+int extent_writepages(struct address_space *mapping,
                      struct writeback_control *wbc);
 int btree_write_cache_pages(struct address_space *mapping,
                            struct writeback_control *wbc);
-int extent_readpages(struct extent_io_tree *tree,
-                    struct address_space *mapping,
-                    struct list_head *pages, unsigned nr_pages);
+int extent_readpages(struct address_space *mapping, struct list_head *pages,
+                    unsigned nr_pages);
 int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
                __u64 start, __u64 len);
 void set_page_extent_mapped(struct page *page);
index 1b8a078f92eb0b25400e596566331aaf9647150d..6648d55e5339622c1a835873fe58d01d7aa24f30 100644 (file)
@@ -518,6 +518,7 @@ static noinline int merge_extent_mapping(struct extent_map_tree *em_tree,
 
 /**
  * btrfs_add_extent_mapping - add extent mapping into em_tree
+ * @fs_info - used for tracepoint
  * @em_tree - the extent tree into which we want to insert the extent mapping
  * @em_in   - extent we are inserting
  * @start   - start of the logical range btrfs_get_extent() is requesting
@@ -535,7 +536,8 @@ static noinline int merge_extent_mapping(struct extent_map_tree *em_tree,
  * Return 0 on success, otherwise -EEXIST.
  *
  */
-int btrfs_add_extent_mapping(struct extent_map_tree *em_tree,
+int btrfs_add_extent_mapping(struct btrfs_fs_info *fs_info,
+                            struct extent_map_tree *em_tree,
                             struct extent_map **em_in, u64 start, u64 len)
 {
        int ret;
@@ -553,7 +555,7 @@ int btrfs_add_extent_mapping(struct extent_map_tree *em_tree,
 
                existing = search_extent_mapping(em_tree, start, len);
 
-               trace_btrfs_handle_em_exist(existing, em, start, len);
+               trace_btrfs_handle_em_exist(fs_info, existing, em, start, len);
 
                /*
                 * existing will always be non-NULL, since there must be
index 5fcb80a6ce370b2669c12dc1a6b7401697019c74..25d985e7532a6d5cad742d29b91fb15a40d4c938 100644 (file)
@@ -92,7 +92,8 @@ int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len, u64 gen
 void clear_em_logging(struct extent_map_tree *tree, struct extent_map *em);
 struct extent_map *search_extent_mapping(struct extent_map_tree *tree,
                                         u64 start, u64 len);
-int btrfs_add_extent_mapping(struct extent_map_tree *em_tree,
+int btrfs_add_extent_mapping(struct btrfs_fs_info *fs_info,
+                            struct extent_map_tree *em_tree,
                             struct extent_map **em_in, u64 start, u64 len);
 
 #endif
index e5b569bebc731e816622b40f0c5f531b3d45b295..d5f80cb300bea03a2a417fd5cc03d55c960f8314 100644 (file)
@@ -253,10 +253,8 @@ int btrfs_truncate_free_space_cache(struct btrfs_trans_handle *trans,
        truncate_pagecache(inode, 0);
 
        /*
-        * We don't need an orphan item because truncating the free space cache
-        * will never be split across transactions.
-        * We don't need to check for -EAGAIN because we're a free space
-        * cache inode
+        * We skip the throttling logic for free space cache inodes, so we don't
+        * need to check for -EAGAIN.
         */
        ret = btrfs_truncate_inode_items(trans, root, inode,
                                         0, BTRFS_EXTENT_DATA_KEY);
index 32a0f6cb55948aac654bc30e986adbeeaf5b451b..b5950aacd6975170c59dc3de29ed216eb9d49ffc 100644 (file)
@@ -12,7 +12,6 @@
 #include "transaction.h"
 
 static int __add_block_group_free_space(struct btrfs_trans_handle *trans,
-                                       struct btrfs_fs_info *fs_info,
                                        struct btrfs_block_group_cache *block_group,
                                        struct btrfs_path *path);
 
@@ -45,11 +44,10 @@ void set_free_space_tree_thresholds(struct btrfs_block_group_cache *cache)
 }
 
 static int add_new_free_space_info(struct btrfs_trans_handle *trans,
-                                  struct btrfs_fs_info *fs_info,
                                   struct btrfs_block_group_cache *block_group,
                                   struct btrfs_path *path)
 {
-       struct btrfs_root *root = fs_info->free_space_root;
+       struct btrfs_root *root = trans->fs_info->free_space_root;
        struct btrfs_free_space_info *info;
        struct btrfs_key key;
        struct extent_buffer *leaf;
@@ -138,10 +136,11 @@ static inline u32 free_space_bitmap_size(u64 size, u32 sectorsize)
        return DIV_ROUND_UP((u32)div_u64(size, sectorsize), BITS_PER_BYTE);
 }
 
-static u8 *alloc_bitmap(u32 bitmap_size)
+static unsigned long *alloc_bitmap(u32 bitmap_size)
 {
-       u8 *ret;
+       unsigned long *ret;
        unsigned int nofs_flag;
+       u32 bitmap_rounded_size = round_up(bitmap_size, sizeof(unsigned long));
 
        /*
         * GFP_NOFS doesn't work with kvmalloc(), but we really can't recurse
@@ -152,21 +151,42 @@ static u8 *alloc_bitmap(u32 bitmap_size)
         * know that recursion is unsafe.
         */
        nofs_flag = memalloc_nofs_save();
-       ret = kvzalloc(bitmap_size, GFP_KERNEL);
+       ret = kvzalloc(bitmap_rounded_size, GFP_KERNEL);
        memalloc_nofs_restore(nofs_flag);
        return ret;
 }
 
+static void le_bitmap_set(unsigned long *map, unsigned int start, int len)
+{
+       u8 *p = ((u8 *)map) + BIT_BYTE(start);
+       const unsigned int size = start + len;
+       int bits_to_set = BITS_PER_BYTE - (start % BITS_PER_BYTE);
+       u8 mask_to_set = BITMAP_FIRST_BYTE_MASK(start);
+
+       while (len - bits_to_set >= 0) {
+               *p |= mask_to_set;
+               len -= bits_to_set;
+               bits_to_set = BITS_PER_BYTE;
+               mask_to_set = ~0;
+               p++;
+       }
+       if (len) {
+               mask_to_set &= BITMAP_LAST_BYTE_MASK(size);
+               *p |= mask_to_set;
+       }
+}
+
 int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans,
-                                 struct btrfs_fs_info *fs_info,
                                  struct btrfs_block_group_cache *block_group,
                                  struct btrfs_path *path)
 {
+       struct btrfs_fs_info *fs_info = trans->fs_info;
        struct btrfs_root *root = fs_info->free_space_root;
        struct btrfs_free_space_info *info;
        struct btrfs_key key, found_key;
        struct extent_buffer *leaf;
-       u8 *bitmap, *bitmap_cursor;
+       unsigned long *bitmap;
+       char *bitmap_cursor;
        u64 start, end;
        u64 bitmap_range, i;
        u32 bitmap_size, flags, expected_extent_count;
@@ -255,7 +275,7 @@ int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans,
                goto out;
        }
 
-       bitmap_cursor = bitmap;
+       bitmap_cursor = (char *)bitmap;
        bitmap_range = fs_info->sectorsize * BTRFS_FREE_SPACE_BITMAP_BITS;
        i = start;
        while (i < end) {
@@ -296,21 +316,18 @@ out:
 }
 
 int convert_free_space_to_extents(struct btrfs_trans_handle *trans,
-                                 struct btrfs_fs_info *fs_info,
                                  struct btrfs_block_group_cache *block_group,
                                  struct btrfs_path *path)
 {
+       struct btrfs_fs_info *fs_info = trans->fs_info;
        struct btrfs_root *root = fs_info->free_space_root;
        struct btrfs_free_space_info *info;
        struct btrfs_key key, found_key;
        struct extent_buffer *leaf;
-       u8 *bitmap;
+       unsigned long *bitmap;
        u64 start, end;
-       /* Initialize to silence GCC. */
-       u64 extent_start = 0;
-       u64 offset;
        u32 bitmap_size, flags, expected_extent_count;
-       int prev_bit = 0, bit, bitnr;
+       unsigned long nrbits, start_bit, end_bit;
        u32 extent_count = 0;
        int done = 0, nr;
        int ret;
@@ -348,7 +365,7 @@ int convert_free_space_to_extents(struct btrfs_trans_handle *trans,
                                break;
                        } else if (found_key.type == BTRFS_FREE_SPACE_BITMAP_KEY) {
                                unsigned long ptr;
-                               u8 *bitmap_cursor;
+                               char *bitmap_cursor;
                                u32 bitmap_pos, data_size;
 
                                ASSERT(found_key.objectid >= start);
@@ -358,7 +375,7 @@ int convert_free_space_to_extents(struct btrfs_trans_handle *trans,
                                bitmap_pos = div_u64(found_key.objectid - start,
                                                     fs_info->sectorsize *
                                                     BITS_PER_BYTE);
-                               bitmap_cursor = bitmap + bitmap_pos;
+                               bitmap_cursor = ((char *)bitmap) + bitmap_pos;
                                data_size = free_space_bitmap_size(found_key.offset,
                                                                   fs_info->sectorsize);
 
@@ -392,32 +409,16 @@ int convert_free_space_to_extents(struct btrfs_trans_handle *trans,
        btrfs_mark_buffer_dirty(leaf);
        btrfs_release_path(path);
 
-       offset = start;
-       bitnr = 0;
-       while (offset < end) {
-               bit = !!le_test_bit(bitnr, bitmap);
-               if (prev_bit == 0 && bit == 1) {
-                       extent_start = offset;
-               } else if (prev_bit == 1 && bit == 0) {
-                       key.objectid = extent_start;
-                       key.type = BTRFS_FREE_SPACE_EXTENT_KEY;
-                       key.offset = offset - extent_start;
-
-                       ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
-                       if (ret)
-                               goto out;
-                       btrfs_release_path(path);
+       nrbits = div_u64(block_group->key.offset, block_group->fs_info->sectorsize);
+       start_bit = find_next_bit_le(bitmap, nrbits, 0);
 
-                       extent_count++;
-               }
-               prev_bit = bit;
-               offset += fs_info->sectorsize;
-               bitnr++;
-       }
-       if (prev_bit == 1) {
-               key.objectid = extent_start;
+       while (start_bit < nrbits) {
+               end_bit = find_next_zero_bit_le(bitmap, nrbits, start_bit);
+               ASSERT(start_bit < end_bit);
+
+               key.objectid = start + start_bit * block_group->fs_info->sectorsize;
                key.type = BTRFS_FREE_SPACE_EXTENT_KEY;
-               key.offset = end - extent_start;
+               key.offset = (end_bit - start_bit) * block_group->fs_info->sectorsize;
 
                ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
                if (ret)
@@ -425,6 +426,8 @@ int convert_free_space_to_extents(struct btrfs_trans_handle *trans,
                btrfs_release_path(path);
 
                extent_count++;
+
+               start_bit = find_next_bit_le(bitmap, nrbits, end_bit);
        }
 
        if (extent_count != expected_extent_count) {
@@ -446,7 +449,6 @@ out:
 }
 
 static int update_free_space_extent_count(struct btrfs_trans_handle *trans,
-                                         struct btrfs_fs_info *fs_info,
                                          struct btrfs_block_group_cache *block_group,
                                          struct btrfs_path *path,
                                          int new_extents)
@@ -459,7 +461,8 @@ static int update_free_space_extent_count(struct btrfs_trans_handle *trans,
        if (new_extents == 0)
                return 0;
 
-       info = search_free_space_info(trans, fs_info, block_group, path, 1);
+       info = search_free_space_info(trans, trans->fs_info, block_group, path,
+                                     1);
        if (IS_ERR(info)) {
                ret = PTR_ERR(info);
                goto out;
@@ -474,12 +477,10 @@ static int update_free_space_extent_count(struct btrfs_trans_handle *trans,
 
        if (!(flags & BTRFS_FREE_SPACE_USING_BITMAPS) &&
            extent_count > block_group->bitmap_high_thresh) {
-               ret = convert_free_space_to_bitmaps(trans, fs_info, block_group,
-                                                   path);
+               ret = convert_free_space_to_bitmaps(trans, block_group, path);
        } else if ((flags & BTRFS_FREE_SPACE_USING_BITMAPS) &&
                   extent_count < block_group->bitmap_low_thresh) {
-               ret = convert_free_space_to_extents(trans, fs_info, block_group,
-                                                   path);
+               ret = convert_free_space_to_extents(trans, block_group, path);
        }
 
 out:
@@ -576,12 +577,11 @@ static int free_space_next_bitmap(struct btrfs_trans_handle *trans,
  * the bitmap.
  */
 static int modify_free_space_bitmap(struct btrfs_trans_handle *trans,
-                                   struct btrfs_fs_info *fs_info,
                                    struct btrfs_block_group_cache *block_group,
                                    struct btrfs_path *path,
                                    u64 start, u64 size, int remove)
 {
-       struct btrfs_root *root = fs_info->free_space_root;
+       struct btrfs_root *root = block_group->fs_info->free_space_root;
        struct btrfs_key key;
        u64 end = start + size;
        u64 cur_start, cur_size;
@@ -682,7 +682,7 @@ static int modify_free_space_bitmap(struct btrfs_trans_handle *trans,
        }
 
        btrfs_release_path(path);
-       ret = update_free_space_extent_count(trans, fs_info, block_group, path,
+       ret = update_free_space_extent_count(trans, block_group, path,
                                             new_extents);
 
 out:
@@ -690,12 +690,11 @@ out:
 }
 
 static int remove_free_space_extent(struct btrfs_trans_handle *trans,
-                                   struct btrfs_fs_info *fs_info,
                                    struct btrfs_block_group_cache *block_group,
                                    struct btrfs_path *path,
                                    u64 start, u64 size)
 {
-       struct btrfs_root *root = fs_info->free_space_root;
+       struct btrfs_root *root = trans->fs_info->free_space_root;
        struct btrfs_key key;
        u64 found_start, found_end;
        u64 end = start + size;
@@ -769,7 +768,7 @@ static int remove_free_space_extent(struct btrfs_trans_handle *trans,
        }
 
        btrfs_release_path(path);
-       ret = update_free_space_extent_count(trans, fs_info, block_group, path,
+       ret = update_free_space_extent_count(trans, block_group, path,
                                             new_extents);
 
 out:
@@ -777,7 +776,6 @@ out:
 }
 
 int __remove_from_free_space_tree(struct btrfs_trans_handle *trans,
-                                 struct btrfs_fs_info *fs_info,
                                  struct btrfs_block_group_cache *block_group,
                                  struct btrfs_path *path, u64 start, u64 size)
 {
@@ -786,36 +784,35 @@ int __remove_from_free_space_tree(struct btrfs_trans_handle *trans,
        int ret;
 
        if (block_group->needs_free_space) {
-               ret = __add_block_group_free_space(trans, fs_info, block_group,
-                                                  path);
+               ret = __add_block_group_free_space(trans, block_group, path);
                if (ret)
                        return ret;
        }
 
-       info = search_free_space_info(NULL, fs_info, block_group, path, 0);
+       info = search_free_space_info(NULL, trans->fs_info, block_group, path,
+                                     0);
        if (IS_ERR(info))
                return PTR_ERR(info);
        flags = btrfs_free_space_flags(path->nodes[0], info);
        btrfs_release_path(path);
 
        if (flags & BTRFS_FREE_SPACE_USING_BITMAPS) {
-               return modify_free_space_bitmap(trans, fs_info, block_group,
-                                               path, start, size, 1);
+               return modify_free_space_bitmap(trans, block_group, path,
+                                               start, size, 1);
        } else {
-               return remove_free_space_extent(trans, fs_info, block_group,
-                                               path, start, size);
+               return remove_free_space_extent(trans, block_group, path,
+                                               start, size);
        }
 }
 
 int remove_from_free_space_tree(struct btrfs_trans_handle *trans,
-                               struct btrfs_fs_info *fs_info,
                                u64 start, u64 size)
 {
        struct btrfs_block_group_cache *block_group;
        struct btrfs_path *path;
        int ret;
 
-       if (!btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE))
+       if (!btrfs_fs_compat_ro(trans->fs_info, FREE_SPACE_TREE))
                return 0;
 
        path = btrfs_alloc_path();
@@ -824,7 +821,7 @@ int remove_from_free_space_tree(struct btrfs_trans_handle *trans,
                goto out;
        }
 
-       block_group = btrfs_lookup_block_group(fs_info, start);
+       block_group = btrfs_lookup_block_group(trans->fs_info, start);
        if (!block_group) {
                ASSERT(0);
                ret = -ENOENT;
@@ -832,8 +829,8 @@ int remove_from_free_space_tree(struct btrfs_trans_handle *trans,
        }
 
        mutex_lock(&block_group->free_space_lock);
-       ret = __remove_from_free_space_tree(trans, fs_info, block_group, path,
-                                           start, size);
+       ret = __remove_from_free_space_tree(trans, block_group, path, start,
+                                           size);
        mutex_unlock(&block_group->free_space_lock);
 
        btrfs_put_block_group(block_group);
@@ -845,12 +842,11 @@ out:
 }
 
 static int add_free_space_extent(struct btrfs_trans_handle *trans,
-                                struct btrfs_fs_info *fs_info,
                                 struct btrfs_block_group_cache *block_group,
                                 struct btrfs_path *path,
                                 u64 start, u64 size)
 {
-       struct btrfs_root *root = fs_info->free_space_root;
+       struct btrfs_root *root = trans->fs_info->free_space_root;
        struct btrfs_key key, new_key;
        u64 found_start, found_end;
        u64 end = start + size;
@@ -965,7 +961,7 @@ insert:
                goto out;
 
        btrfs_release_path(path);
-       ret = update_free_space_extent_count(trans, fs_info, block_group, path,
+       ret = update_free_space_extent_count(trans, block_group, path,
                                             new_extents);
 
 out:
@@ -973,17 +969,16 @@ out:
 }
 
 int __add_to_free_space_tree(struct btrfs_trans_handle *trans,
-                            struct btrfs_fs_info *fs_info,
                             struct btrfs_block_group_cache *block_group,
                             struct btrfs_path *path, u64 start, u64 size)
 {
+       struct btrfs_fs_info *fs_info = trans->fs_info;
        struct btrfs_free_space_info *info;
        u32 flags;
        int ret;
 
        if (block_group->needs_free_space) {
-               ret = __add_block_group_free_space(trans, fs_info, block_group,
-                                                  path);
+               ret = __add_block_group_free_space(trans, block_group, path);
                if (ret)
                        return ret;
        }
@@ -995,23 +990,22 @@ int __add_to_free_space_tree(struct btrfs_trans_handle *trans,
        btrfs_release_path(path);
 
        if (flags & BTRFS_FREE_SPACE_USING_BITMAPS) {
-               return modify_free_space_bitmap(trans, fs_info, block_group,
-                                               path, start, size, 0);
+               return modify_free_space_bitmap(trans, block_group, path,
+                                               start, size, 0);
        } else {
-               return add_free_space_extent(trans, fs_info, block_group, path,
-                                            start, size);
+               return add_free_space_extent(trans, block_group, path, start,
+                                            size);
        }
 }
 
 int add_to_free_space_tree(struct btrfs_trans_handle *trans,
-                          struct btrfs_fs_info *fs_info,
                           u64 start, u64 size)
 {
        struct btrfs_block_group_cache *block_group;
        struct btrfs_path *path;
        int ret;
 
-       if (!btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE))
+       if (!btrfs_fs_compat_ro(trans->fs_info, FREE_SPACE_TREE))
                return 0;
 
        path = btrfs_alloc_path();
@@ -1020,7 +1014,7 @@ int add_to_free_space_tree(struct btrfs_trans_handle *trans,
                goto out;
        }
 
-       block_group = btrfs_lookup_block_group(fs_info, start);
+       block_group = btrfs_lookup_block_group(trans->fs_info, start);
        if (!block_group) {
                ASSERT(0);
                ret = -ENOENT;
@@ -1028,8 +1022,7 @@ int add_to_free_space_tree(struct btrfs_trans_handle *trans,
        }
 
        mutex_lock(&block_group->free_space_lock);
-       ret = __add_to_free_space_tree(trans, fs_info, block_group, path, start,
-                                      size);
+       ret = __add_to_free_space_tree(trans, block_group, path, start, size);
        mutex_unlock(&block_group->free_space_lock);
 
        btrfs_put_block_group(block_group);
@@ -1046,10 +1039,9 @@ out:
  * through the normal add/remove hooks.
  */
 static int populate_free_space_tree(struct btrfs_trans_handle *trans,
-                                   struct btrfs_fs_info *fs_info,
                                    struct btrfs_block_group_cache *block_group)
 {
-       struct btrfs_root *extent_root = fs_info->extent_root;
+       struct btrfs_root *extent_root = trans->fs_info->extent_root;
        struct btrfs_path *path, *path2;
        struct btrfs_key key;
        u64 start, end;
@@ -1066,7 +1058,7 @@ static int populate_free_space_tree(struct btrfs_trans_handle *trans,
                return -ENOMEM;
        }
 
-       ret = add_new_free_space_info(trans, fs_info, block_group, path2);
+       ret = add_new_free_space_info(trans, block_group, path2);
        if (ret)
                goto out;
 
@@ -1099,7 +1091,7 @@ static int populate_free_space_tree(struct btrfs_trans_handle *trans,
                                break;
 
                        if (start < key.objectid) {
-                               ret = __add_to_free_space_tree(trans, fs_info,
+                               ret = __add_to_free_space_tree(trans,
                                                               block_group,
                                                               path2, start,
                                                               key.objectid -
@@ -1109,7 +1101,7 @@ static int populate_free_space_tree(struct btrfs_trans_handle *trans,
                        }
                        start = key.objectid;
                        if (key.type == BTRFS_METADATA_ITEM_KEY)
-                               start += fs_info->nodesize;
+                               start += trans->fs_info->nodesize;
                        else
                                start += key.offset;
                } else if (key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) {
@@ -1124,8 +1116,8 @@ static int populate_free_space_tree(struct btrfs_trans_handle *trans,
                        break;
        }
        if (start < end) {
-               ret = __add_to_free_space_tree(trans, fs_info, block_group,
-                                              path2, start, end - start);
+               ret = __add_to_free_space_tree(trans, block_group, path2,
+                                              start, end - start);
                if (ret)
                        goto out_locked;
        }
@@ -1165,7 +1157,7 @@ int btrfs_create_free_space_tree(struct btrfs_fs_info *fs_info)
        while (node) {
                block_group = rb_entry(node, struct btrfs_block_group_cache,
                                       cache_node);
-               ret = populate_free_space_tree(trans, fs_info, block_group);
+               ret = populate_free_space_tree(trans, block_group);
                if (ret)
                        goto abort;
                node = rb_next(node);
@@ -1269,7 +1261,6 @@ abort:
 }
 
 static int __add_block_group_free_space(struct btrfs_trans_handle *trans,
-                                       struct btrfs_fs_info *fs_info,
                                        struct btrfs_block_group_cache *block_group,
                                        struct btrfs_path *path)
 {
@@ -1277,19 +1268,19 @@ static int __add_block_group_free_space(struct btrfs_trans_handle *trans,
 
        block_group->needs_free_space = 0;
 
-       ret = add_new_free_space_info(trans, fs_info, block_group, path);
+       ret = add_new_free_space_info(trans, block_group, path);
        if (ret)
                return ret;
 
-       return __add_to_free_space_tree(trans, fs_info, block_group, path,
+       return __add_to_free_space_tree(trans, block_group, path,
                                        block_group->key.objectid,
                                        block_group->key.offset);
 }
 
 int add_block_group_free_space(struct btrfs_trans_handle *trans,
-                              struct btrfs_fs_info *fs_info,
                               struct btrfs_block_group_cache *block_group)
 {
+       struct btrfs_fs_info *fs_info = trans->fs_info;
        struct btrfs_path *path = NULL;
        int ret = 0;
 
@@ -1306,7 +1297,7 @@ int add_block_group_free_space(struct btrfs_trans_handle *trans,
                goto out;
        }
 
-       ret = __add_block_group_free_space(trans, fs_info, block_group, path);
+       ret = __add_block_group_free_space(trans, block_group, path);
 
 out:
        btrfs_free_path(path);
@@ -1317,10 +1308,9 @@ out:
 }
 
 int remove_block_group_free_space(struct btrfs_trans_handle *trans,
-                                 struct btrfs_fs_info *fs_info,
                                  struct btrfs_block_group_cache *block_group)
 {
-       struct btrfs_root *root = fs_info->free_space_root;
+       struct btrfs_root *root = trans->fs_info->free_space_root;
        struct btrfs_path *path;
        struct btrfs_key key, found_key;
        struct extent_buffer *leaf;
@@ -1328,7 +1318,7 @@ int remove_block_group_free_space(struct btrfs_trans_handle *trans,
        int done = 0, nr;
        int ret;
 
-       if (!btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE))
+       if (!btrfs_fs_compat_ro(trans->fs_info, FREE_SPACE_TREE))
                return 0;
 
        if (block_group->needs_free_space) {
@@ -1439,7 +1429,6 @@ static int load_free_space_bitmaps(struct btrfs_caching_control *caching_ctl,
                                extent_start = offset;
                        } else if (prev_bit == 1 && bit == 0) {
                                total_found += add_new_free_space(block_group,
-                                                                 fs_info,
                                                                  extent_start,
                                                                  offset);
                                if (total_found > CACHING_CTL_WAKE_UP) {
@@ -1453,8 +1442,8 @@ static int load_free_space_bitmaps(struct btrfs_caching_control *caching_ctl,
                }
        }
        if (prev_bit == 1) {
-               total_found += add_new_free_space(block_group, fs_info,
-                                                 extent_start, end);
+               total_found += add_new_free_space(block_group, extent_start,
+                                                 end);
                extent_count++;
        }
 
@@ -1511,8 +1500,7 @@ static int load_free_space_extents(struct btrfs_caching_control *caching_ctl,
 
                caching_ctl->progress = key.objectid;
 
-               total_found += add_new_free_space(block_group, fs_info,
-                                                 key.objectid,
+               total_found += add_new_free_space(block_group, key.objectid,
                                                  key.objectid + key.offset);
                if (total_found > CACHING_CTL_WAKE_UP) {
                        total_found = 0;
index 874b4feecad2022868f0306062166858e0443bf4..3133651d7d706acf0214cae98408604cb888c96a 100644 (file)
@@ -19,16 +19,12 @@ int btrfs_create_free_space_tree(struct btrfs_fs_info *fs_info);
 int btrfs_clear_free_space_tree(struct btrfs_fs_info *fs_info);
 int load_free_space_tree(struct btrfs_caching_control *caching_ctl);
 int add_block_group_free_space(struct btrfs_trans_handle *trans,
-                              struct btrfs_fs_info *fs_info,
                               struct btrfs_block_group_cache *block_group);
 int remove_block_group_free_space(struct btrfs_trans_handle *trans,
-                                 struct btrfs_fs_info *fs_info,
                                  struct btrfs_block_group_cache *block_group);
 int add_to_free_space_tree(struct btrfs_trans_handle *trans,
-                          struct btrfs_fs_info *fs_info,
                           u64 start, u64 size);
 int remove_from_free_space_tree(struct btrfs_trans_handle *trans,
-                               struct btrfs_fs_info *fs_info,
                                u64 start, u64 size);
 
 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
@@ -38,19 +34,15 @@ search_free_space_info(struct btrfs_trans_handle *trans,
                       struct btrfs_block_group_cache *block_group,
                       struct btrfs_path *path, int cow);
 int __add_to_free_space_tree(struct btrfs_trans_handle *trans,
-                            struct btrfs_fs_info *fs_info,
                             struct btrfs_block_group_cache *block_group,
                             struct btrfs_path *path, u64 start, u64 size);
 int __remove_from_free_space_tree(struct btrfs_trans_handle *trans,
-                                 struct btrfs_fs_info *fs_info,
                                  struct btrfs_block_group_cache *block_group,
                                  struct btrfs_path *path, u64 start, u64 size);
 int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans,
-                                 struct btrfs_fs_info *fs_info,
                                  struct btrfs_block_group_cache *block_group,
                                  struct btrfs_path *path);
 int convert_free_space_to_extents(struct btrfs_trans_handle *trans,
-                                 struct btrfs_fs_info *fs_info,
                                  struct btrfs_block_group_cache *block_group,
                                  struct btrfs_path *path);
 int free_space_test_bit(struct btrfs_block_group_cache *block_group,
index 0b86cf10cf2ac79732ea548bbe970819ea758d4e..89b2082017830942e040063a2ca000d1a10f75df 100644 (file)
@@ -1018,8 +1018,10 @@ static noinline int cow_file_range(struct inode *inode,
                                  ram_size, /* ram_bytes */
                                  BTRFS_COMPRESS_NONE, /* compress_type */
                                  BTRFS_ORDERED_REGULAR /* type */);
-               if (IS_ERR(em))
+               if (IS_ERR(em)) {
+                       ret = PTR_ERR(em);
                        goto out_reserve;
+               }
                free_extent_map(em);
 
                ret = btrfs_add_ordered_extent(inode, start, ins.objectid,
@@ -1156,13 +1158,10 @@ static noinline void async_cow_submit(struct btrfs_work *work)
        nr_pages = (async_cow->end - async_cow->start + PAGE_SIZE) >>
                PAGE_SHIFT;
 
-       /*
-        * atomic_sub_return implies a barrier for waitqueue_active
-        */
+       /* atomic_sub_return implies a barrier */
        if (atomic_sub_return(nr_pages, &fs_info->async_delalloc_pages) <
-           5 * SZ_1M &&
-           waitqueue_active(&fs_info->async_submit_wait))
-               wake_up(&fs_info->async_submit_wait);
+           5 * SZ_1M)
+               cond_wake_up_nomb(&fs_info->async_submit_wait);
 
        if (async_cow->inode)
                submit_compressed_extents(async_cow->inode, async_cow);
@@ -1373,6 +1372,13 @@ next_slot:
                            btrfs_file_extent_encryption(leaf, fi) ||
                            btrfs_file_extent_other_encoding(leaf, fi))
                                goto out_check;
+                       /*
+                        * Do the same check as in btrfs_cross_ref_exist but
+                        * without the unnecessary search.
+                        */
+                       if (btrfs_file_extent_generation(leaf, fi) <=
+                           btrfs_root_last_snapshot(&root->root_item))
+                               goto out_check;
                        if (extent_type == BTRFS_FILE_EXTENT_REG && !force)
                                goto out_check;
                        if (btrfs_extent_readonly(fs_info, disk_bytenr))
@@ -1754,6 +1760,7 @@ void __btrfs_del_delalloc_inode(struct btrfs_root *root,
                          &inode->runtime_flags);
                root->nr_delalloc_inodes--;
                if (!root->nr_delalloc_inodes) {
+                       ASSERT(list_empty(&root->delalloc_inodes));
                        spin_lock(&fs_info->delalloc_root_lock);
                        BUG_ON(list_empty(&root->delalloc_root));
                        list_del_init(&root->delalloc_root);
@@ -3158,6 +3165,9 @@ out:
        /* once for the tree */
        btrfs_put_ordered_extent(ordered_extent);
 
+       /* Try to release some metadata so we don't get an OOM but don't wait */
+       btrfs_btree_balance_dirty_nodelay(fs_info);
+
        return ret;
 }
 
@@ -3300,177 +3310,31 @@ void btrfs_run_delayed_iputs(struct btrfs_fs_info *fs_info)
 }
 
 /*
- * This is called in transaction commit time. If there are no orphan
- * files in the subvolume, it removes orphan item and frees block_rsv
- * structure.
- */
-void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans,
-                             struct btrfs_root *root)
-{
-       struct btrfs_fs_info *fs_info = root->fs_info;
-       struct btrfs_block_rsv *block_rsv;
-       int ret;
-
-       if (atomic_read(&root->orphan_inodes) ||
-           root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE)
-               return;
-
-       spin_lock(&root->orphan_lock);
-       if (atomic_read(&root->orphan_inodes)) {
-               spin_unlock(&root->orphan_lock);
-               return;
-       }
-
-       if (root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE) {
-               spin_unlock(&root->orphan_lock);
-               return;
-       }
-
-       block_rsv = root->orphan_block_rsv;
-       root->orphan_block_rsv = NULL;
-       spin_unlock(&root->orphan_lock);
-
-       if (test_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state) &&
-           btrfs_root_refs(&root->root_item) > 0) {
-               ret = btrfs_del_orphan_item(trans, fs_info->tree_root,
-                                           root->root_key.objectid);
-               if (ret)
-                       btrfs_abort_transaction(trans, ret);
-               else
-                       clear_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED,
-                                 &root->state);
-       }
-
-       if (block_rsv) {
-               WARN_ON(block_rsv->size > 0);
-               btrfs_free_block_rsv(fs_info, block_rsv);
-       }
-}
-
-/*
- * This creates an orphan entry for the given inode in case something goes
- * wrong in the middle of an unlink/truncate.
- *
- * NOTE: caller of this function should reserve 5 units of metadata for
- *      this function.
+ * This creates an orphan entry for the given inode in case something goes wrong
+ * in the middle of an unlink.
  */
 int btrfs_orphan_add(struct btrfs_trans_handle *trans,
-               struct btrfs_inode *inode)
+                    struct btrfs_inode *inode)
 {
-       struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb);
-       struct btrfs_root *root = inode->root;
-       struct btrfs_block_rsv *block_rsv = NULL;
-       int reserve = 0;
-       bool insert = false;
        int ret;
 
-       if (!root->orphan_block_rsv) {
-               block_rsv = btrfs_alloc_block_rsv(fs_info,
-                                                 BTRFS_BLOCK_RSV_TEMP);
-               if (!block_rsv)
-                       return -ENOMEM;
-       }
-
-       if (!test_and_set_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
-                             &inode->runtime_flags))
-               insert = true;
-
-       if (!test_and_set_bit(BTRFS_INODE_ORPHAN_META_RESERVED,
-                             &inode->runtime_flags))
-               reserve = 1;
-
-       spin_lock(&root->orphan_lock);
-       /* If someone has created ->orphan_block_rsv, be happy to use it. */
-       if (!root->orphan_block_rsv) {
-               root->orphan_block_rsv = block_rsv;
-       } else if (block_rsv) {
-               btrfs_free_block_rsv(fs_info, block_rsv);
-               block_rsv = NULL;
-       }
-
-       if (insert)
-               atomic_inc(&root->orphan_inodes);
-       spin_unlock(&root->orphan_lock);
-
-       /* grab metadata reservation from transaction handle */
-       if (reserve) {
-               ret = btrfs_orphan_reserve_metadata(trans, inode);
-               ASSERT(!ret);
-               if (ret) {
-                       /*
-                        * dec doesn't need spin_lock as ->orphan_block_rsv
-                        * would be released only if ->orphan_inodes is
-                        * zero.
-                        */
-                       atomic_dec(&root->orphan_inodes);
-                       clear_bit(BTRFS_INODE_ORPHAN_META_RESERVED,
-                                 &inode->runtime_flags);
-                       if (insert)
-                               clear_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
-                                         &inode->runtime_flags);
-                       return ret;
-               }
-       }
-
-       /* insert an orphan item to track this unlinked/truncated file */
-       if (insert) {
-               ret = btrfs_insert_orphan_item(trans, root, btrfs_ino(inode));
-               if (ret) {
-                       if (reserve) {
-                               clear_bit(BTRFS_INODE_ORPHAN_META_RESERVED,
-                                         &inode->runtime_flags);
-                               btrfs_orphan_release_metadata(inode);
-                       }
-                       /*
-                        * btrfs_orphan_commit_root may race with us and set
-                        * ->orphan_block_rsv to zero, in order to avoid that,
-                        * decrease ->orphan_inodes after everything is done.
-                        */
-                       atomic_dec(&root->orphan_inodes);
-                       if (ret != -EEXIST) {
-                               clear_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
-                                         &inode->runtime_flags);
-                               btrfs_abort_transaction(trans, ret);
-                               return ret;
-                       }
-               }
-               ret = 0;
+       ret = btrfs_insert_orphan_item(trans, inode->root, btrfs_ino(inode));
+       if (ret && ret != -EEXIST) {
+               btrfs_abort_transaction(trans, ret);
+               return ret;
        }
 
        return 0;
 }
 
 /*
- * We have done the truncate/delete so we can go ahead and remove the orphan
- * item for this particular inode.
+ * We have done the delete so we can go ahead and remove the orphan item for
+ * this particular inode.
  */
 static int btrfs_orphan_del(struct btrfs_trans_handle *trans,
                            struct btrfs_inode *inode)
 {
-       struct btrfs_root *root = inode->root;
-       int delete_item = 0;
-       int ret = 0;
-
-       if (test_and_clear_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
-                              &inode->runtime_flags))
-               delete_item = 1;
-
-       if (delete_item && trans)
-               ret = btrfs_del_orphan_item(trans, root, btrfs_ino(inode));
-
-       if (test_and_clear_bit(BTRFS_INODE_ORPHAN_META_RESERVED,
-                              &inode->runtime_flags))
-               btrfs_orphan_release_metadata(inode);
-
-       /*
-        * btrfs_orphan_commit_root may race with us and set ->orphan_block_rsv
-        * to zero, in order to avoid that, decrease ->orphan_inodes after
-        * everything is done.
-        */
-       if (delete_item)
-               atomic_dec(&root->orphan_inodes);
-
-       return ret;
+       return btrfs_del_orphan_item(trans, inode->root, btrfs_ino(inode));
 }
 
 /*
@@ -3486,7 +3350,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
        struct btrfs_trans_handle *trans;
        struct inode *inode;
        u64 last_objectid = 0;
-       int ret = 0, nr_unlink = 0, nr_truncate = 0;
+       int ret = 0, nr_unlink = 0;
 
        if (cmpxchg(&root->orphan_cleanup_state, 0, ORPHAN_CLEANUP_STARTED))
                return 0;
@@ -3586,12 +3450,31 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
                                key.offset = found_key.objectid - 1;
                                continue;
                        }
+
                }
+
                /*
-                * Inode is already gone but the orphan item is still there,
-                * kill the orphan item.
+                * If we have an inode with links, there are a couple of
+                * possibilities. Old kernels (before v3.12) used to create an
+                * orphan item for truncate indicating that there were possibly
+                * extent items past i_size that needed to be deleted. In v3.12,
+                * truncate was changed to update i_size in sync with the extent
+                * items, but the (useless) orphan item was still created. Since
+                * v4.18, we don't create the orphan item for truncate at all.
+                *
+                * So, this item could mean that we need to do a truncate, but
+                * only if this filesystem was last used on a pre-v3.12 kernel
+                * and was not cleanly unmounted. The odds of that are quite
+                * slim, and it's a pain to do the truncate now, so just delete
+                * the orphan item.
+                *
+                * It's also possible that this orphan item was supposed to be
+                * deleted but wasn't. The inode number may have been reused,
+                * but either way, we can delete the orphan item.
                 */
-               if (ret == -ENOENT) {
+               if (ret == -ENOENT || inode->i_nlink) {
+                       if (!ret)
+                               iput(inode);
                        trans = btrfs_start_transaction(root, 1);
                        if (IS_ERR(trans)) {
                                ret = PTR_ERR(trans);
@@ -3607,42 +3490,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
                        continue;
                }
 
-               /*
-                * add this inode to the orphan list so btrfs_orphan_del does
-                * the proper thing when we hit it
-                */
-               set_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
-                       &BTRFS_I(inode)->runtime_flags);
-               atomic_inc(&root->orphan_inodes);
-
-               /* if we have links, this was a truncate, lets do that */
-               if (inode->i_nlink) {
-                       if (WARN_ON(!S_ISREG(inode->i_mode))) {
-                               iput(inode);
-                               continue;
-                       }
-                       nr_truncate++;
-
-                       /* 1 for the orphan item deletion. */
-                       trans = btrfs_start_transaction(root, 1);
-                       if (IS_ERR(trans)) {
-                               iput(inode);
-                               ret = PTR_ERR(trans);
-                               goto out;
-                       }
-                       ret = btrfs_orphan_add(trans, BTRFS_I(inode));
-                       btrfs_end_transaction(trans);
-                       if (ret) {
-                               iput(inode);
-                               goto out;
-                       }
-
-                       ret = btrfs_truncate(inode, false);
-                       if (ret)
-                               btrfs_orphan_del(NULL, BTRFS_I(inode));
-               } else {
-                       nr_unlink++;
-               }
+               nr_unlink++;
 
                /* this will do delete_inode and everything for us */
                iput(inode);
@@ -3654,12 +3502,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
 
        root->orphan_cleanup_state = ORPHAN_CLEANUP_DONE;
 
-       if (root->orphan_block_rsv)
-               btrfs_block_rsv_release(fs_info, root->orphan_block_rsv,
-                                       (u64)-1);
-
-       if (root->orphan_block_rsv ||
-           test_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state)) {
+       if (test_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state)) {
                trans = btrfs_join_transaction(root);
                if (!IS_ERR(trans))
                        btrfs_end_transaction(trans);
@@ -3667,8 +3510,6 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
 
        if (nr_unlink)
                btrfs_debug(fs_info, "unlinked %d orphans", nr_unlink);
-       if (nr_truncate)
-               btrfs_debug(fs_info, "truncated %d orphans", nr_truncate);
 
 out:
        if (ret)
@@ -3931,7 +3772,7 @@ cache_acl:
                break;
        }
 
-       btrfs_update_iflags(inode);
+       btrfs_sync_inode_flags_to_i_flags(inode);
        return 0;
 
 make_bad:
@@ -4245,7 +4086,7 @@ out:
        return ret;
 }
 
-int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
+static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
                        struct btrfs_root *root,
                        struct inode *dir, u64 objectid,
                        const char *name, int name_len)
@@ -4326,6 +4167,262 @@ out:
        return ret;
 }
 
+/*
+ * Check whether @root may be deleted: -EPERM if it is the default subvolume,
+ * -ENOTEMPTY if a ROOT_REF item keyed on it exists, else 0 or another errno.
+ */
+static noinline int may_destroy_subvol(struct btrfs_root *root)
+{
+       struct btrfs_fs_info *fs_info = root->fs_info;
+       struct btrfs_path *path;
+       struct btrfs_dir_item *di;
+       struct btrfs_key key;
+       u64 dir_id;
+       int ret;
+
+       path = btrfs_alloc_path();
+       if (!path)
+               return -ENOMEM;
+
+       /* Make sure this root isn't set as the default subvol */
+       dir_id = btrfs_super_root_dir(fs_info->super_copy);
+       di = btrfs_lookup_dir_item(NULL, fs_info->tree_root, path,
+                                  dir_id, "default", 7, 0);
+       if (di && !IS_ERR(di)) { /* a failed lookup is not an error */
+               btrfs_dir_item_key_to_cpu(path->nodes[0], di, &key);
+               if (key.objectid == root->root_key.objectid) {
+                       ret = -EPERM;
+                       btrfs_err(fs_info,
+                                 "deleting default subvolume %llu is not allowed",
+                                 key.objectid);
+                       goto out;
+               }
+               btrfs_release_path(path);
+       }
+
+       key.objectid = root->root_key.objectid;
+       key.type = BTRFS_ROOT_REF_KEY;
+       key.offset = (u64)-1;
+
+       ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
+       if (ret < 0)
+               goto out;
+       BUG_ON(ret == 0); /* an exact match on offset (u64)-1 is fatal */
+
+       ret = 0;
+       if (path->slots[0] > 0) {
+               path->slots[0]--; /* step back to the previous item */
+               btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+               if (key.objectid == root->root_key.objectid &&
+                   key.type == BTRFS_ROOT_REF_KEY)
+                       ret = -ENOTEMPTY;
+       }
+out:
+       btrfs_free_path(path);
+       return ret;
+}
+
+/* Prune dentry aliases of the inodes found in root->inode_tree */
+static void btrfs_prune_dentries(struct btrfs_root *root)
+{
+       struct btrfs_fs_info *fs_info = root->fs_info;
+       struct rb_node *node;
+       struct rb_node *prev;
+       struct btrfs_inode *entry;
+       struct inode *inode;
+       u64 objectid = 0; /* lowest ino still to visit */
+
+       if (!test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
+               WARN_ON(btrfs_root_refs(&root->root_item) != 0);
+
+       spin_lock(&root->inode_lock);
+again: /* re-search the tree starting from objectid */
+       node = root->inode_tree.rb_node;
+       prev = NULL;
+       while (node) {
+               prev = node;
+               entry = rb_entry(node, struct btrfs_inode, rb_node);
+
+               if (objectid < btrfs_ino(BTRFS_I(&entry->vfs_inode)))
+                       node = node->rb_left;
+               else if (objectid > btrfs_ino(BTRFS_I(&entry->vfs_inode)))
+                       node = node->rb_right;
+               else
+                       break;
+       }
+       if (!node) { /* no exact match: scan forward for the next ino */
+               while (prev) {
+                       entry = rb_entry(prev, struct btrfs_inode, rb_node);
+                       if (objectid <= btrfs_ino(BTRFS_I(&entry->vfs_inode))) {
+                               node = prev;
+                               break;
+                       }
+                       prev = rb_next(prev);
+               }
+       }
+       while (node) {
+               entry = rb_entry(node, struct btrfs_inode, rb_node);
+               objectid = btrfs_ino(BTRFS_I(&entry->vfs_inode)) + 1;
+               inode = igrab(&entry->vfs_inode);
+               if (inode) {
+                       spin_unlock(&root->inode_lock);
+                       if (atomic_read(&inode->i_count) > 1)
+                               d_prune_aliases(inode);
+                       /*
+                        * btrfs_drop_inode will have it removed from the inode
+                        * cache when its usage count hits zero.
+                        */
+                       iput(inode);
+                       cond_resched();
+                       spin_lock(&root->inode_lock);
+                       goto again;
+               }
+
+               if (cond_resched_lock(&root->inode_lock))
+                       goto again;
+
+               node = rb_next(node);
+       }
+       spin_unlock(&root->inode_lock);
+}
+
+int btrfs_delete_subvolume(struct inode *dir, struct dentry *dentry)
+{
+       struct btrfs_fs_info *fs_info = btrfs_sb(dentry->d_sb);
+       struct btrfs_root *root = BTRFS_I(dir)->root; /* parent dir's root */
+       struct inode *inode = d_inode(dentry);
+       struct btrfs_root *dest = BTRFS_I(inode)->root; /* root being deleted */
+       struct btrfs_trans_handle *trans;
+       struct btrfs_block_rsv block_rsv;
+       u64 root_flags;
+       int ret; /* result of the latest call */
+       int err; /* value returned to the caller */
+
+       /*
+        * Mark the root dead up front and refuse (-EPERM) if a send is in
+        * progress. This is inside the inode lock so the error handling that
+        * has to drop the bit again is not run concurrently.
+        */
+       spin_lock(&dest->root_item_lock);
+       root_flags = btrfs_root_flags(&dest->root_item);
+       if (dest->send_in_progress == 0) {
+               btrfs_set_root_flags(&dest->root_item,
+                               root_flags | BTRFS_ROOT_SUBVOL_DEAD);
+               spin_unlock(&dest->root_item_lock);
+       } else {
+               spin_unlock(&dest->root_item_lock);
+               btrfs_warn(fs_info,
+                          "attempt to delete subvolume %llu during send",
+                          dest->root_key.objectid);
+               return -EPERM;
+       }
+
+       down_write(&fs_info->subvol_sem);
+
+       err = may_destroy_subvol(dest);
+       if (err)
+               goto out_up_write;
+
+       btrfs_init_block_rsv(&block_rsv, BTRFS_BLOCK_RSV_TEMP);
+       /*
+        * One for dir inode,
+        * two for dir entries,
+        * two for root ref/backref.
+        */
+       err = btrfs_subvolume_reserve_metadata(root, &block_rsv, 5, true);
+       if (err)
+               goto out_up_write;
+
+       trans = btrfs_start_transaction(root, 0);
+       if (IS_ERR(trans)) {
+               err = PTR_ERR(trans);
+               goto out_release;
+       }
+       trans->block_rsv = &block_rsv;
+       trans->bytes_reserved = block_rsv.size;
+
+       btrfs_record_snapshot_destroy(trans, BTRFS_I(dir));
+
+       ret = btrfs_unlink_subvol(trans, root, dir,
+                               dest->root_key.objectid,
+                               dentry->d_name.name,
+                               dentry->d_name.len);
+       if (ret) {
+               err = ret;
+               btrfs_abort_transaction(trans, ret);
+               goto out_end_trans;
+       }
+
+       btrfs_record_root_in_trans(trans, dest);
+
+       memset(&dest->root_item.drop_progress, 0,
+               sizeof(dest->root_item.drop_progress));
+       dest->root_item.drop_level = 0;
+       btrfs_set_root_refs(&dest->root_item, 0);
+
+       if (!test_and_set_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &dest->state)) {
+               ret = btrfs_insert_orphan_item(trans,
+                                       fs_info->tree_root,
+                                       dest->root_key.objectid);
+               if (ret) {
+                       btrfs_abort_transaction(trans, ret);
+                       err = ret;
+                       goto out_end_trans;
+               }
+       }
+
+       ret = btrfs_uuid_tree_remove(trans, dest->root_item.uuid,
+                                 BTRFS_UUID_KEY_SUBVOL,
+                                 dest->root_key.objectid);
+       if (ret && ret != -ENOENT) { /* -ENOENT is tolerated here */
+               btrfs_abort_transaction(trans, ret);
+               err = ret;
+               goto out_end_trans;
+       }
+       if (!btrfs_is_empty_uuid(dest->root_item.received_uuid)) {
+               ret = btrfs_uuid_tree_remove(trans,
+                                         dest->root_item.received_uuid,
+                                         BTRFS_UUID_KEY_RECEIVED_SUBVOL,
+                                         dest->root_key.objectid);
+               if (ret && ret != -ENOENT) {
+                       btrfs_abort_transaction(trans, ret);
+                       err = ret;
+                       goto out_end_trans;
+               }
+       }
+
+out_end_trans:
+       trans->block_rsv = NULL;
+       trans->bytes_reserved = 0;
+       ret = btrfs_end_transaction(trans);
+       if (ret && !err)
+               err = ret;
+       inode->i_flags |= S_DEAD; /* reached on success and on error */
+out_release:
+       btrfs_subvolume_release_metadata(fs_info, &block_rsv);
+out_up_write:
+       up_write(&fs_info->subvol_sem);
+       if (err) { /* failed: undo the SUBVOL_DEAD flag set above */
+               spin_lock(&dest->root_item_lock);
+               root_flags = btrfs_root_flags(&dest->root_item);
+               btrfs_set_root_flags(&dest->root_item,
+                               root_flags & ~BTRFS_ROOT_SUBVOL_DEAD);
+               spin_unlock(&dest->root_item_lock);
+       } else {
+               d_invalidate(dentry);
+               btrfs_prune_dentries(dest);
+               ASSERT(dest->send_in_progress == 0);
+
+               /* the last ref */
+               if (dest->ino_cache_inode) {
+                       iput(dest->ino_cache_inode);
+                       dest->ino_cache_inode = NULL;
+               }
+       }
+
+       return err;
+}
+
 static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
 {
        struct inode *inode = d_inode(dentry);
@@ -4337,7 +4434,7 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
        if (inode->i_size > BTRFS_EMPTY_DIR_SIZE)
                return -ENOTEMPTY;
        if (btrfs_ino(BTRFS_I(inode)) == BTRFS_FIRST_FREE_OBJECTID)
-               return -EPERM;
+               return btrfs_delete_subvolume(dir, dentry);
 
        trans = __unlink_start_trans(dir);
        if (IS_ERR(trans))
@@ -4449,7 +4546,6 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
        int pending_del_slot = 0;
        int extent_type = -1;
        int ret;
-       int err = 0;
        u64 ino = btrfs_ino(BTRFS_I(inode));
        u64 bytes_deleted = 0;
        bool be_nice = false;
@@ -4501,22 +4597,19 @@ search_again:
         * up a huge file in a single leaf.  Most of the time that
         * bytes_deleted is > 0, it will be huge by the time we get here
         */
-       if (be_nice && bytes_deleted > SZ_32M) {
-               if (btrfs_should_end_transaction(trans)) {
-                       err = -EAGAIN;
-                       goto error;
-               }
+       if (be_nice && bytes_deleted > SZ_32M &&
+           btrfs_should_end_transaction(trans)) {
+               ret = -EAGAIN;
+               goto out;
        }
 
-
        path->leave_spinning = 1;
        ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
-       if (ret < 0) {
-               err = ret;
+       if (ret < 0)
                goto out;
-       }
 
        if (ret > 0) {
+               ret = 0;
                /* there are no items in the tree for us to truncate, we're
                 * done
                 */
@@ -4627,7 +4720,7 @@ search_again:
                                 * We have to bail so the last_size is set to
                                 * just before this extent.
                                 */
-                               err = NEED_TRUNCATE_BLOCK;
+                               ret = NEED_TRUNCATE_BLOCK;
                                break;
                        }
 
@@ -4666,7 +4759,10 @@ delete:
                                                extent_num_bytes, 0,
                                                btrfs_header_owner(leaf),
                                                ino, extent_offset);
-                       BUG_ON(ret);
+                       if (ret) {
+                               btrfs_abort_transaction(trans, ret);
+                               break;
+                       }
                        if (btrfs_should_throttle_delayed_refs(trans, fs_info))
                                btrfs_async_run_delayed_refs(fs_info,
                                        trans->delayed_ref_updates * 2,
@@ -4694,7 +4790,7 @@ delete:
                                                pending_del_nr);
                                if (ret) {
                                        btrfs_abort_transaction(trans, ret);
-                                       goto error;
+                                       break;
                                }
                                pending_del_nr = 0;
                        }
@@ -4705,8 +4801,8 @@ delete:
                                        trans->delayed_ref_updates = 0;
                                        ret = btrfs_run_delayed_refs(trans,
                                                                   updates * 2);
-                                       if (ret && !err)
-                                               err = ret;
+                                       if (ret)
+                                               break;
                                }
                        }
                        /*
@@ -4714,8 +4810,8 @@ delete:
                         * and let the transaction restart
                         */
                        if (should_end) {
-                               err = -EAGAIN;
-                               goto error;
+                               ret = -EAGAIN;
+                               break;
                        }
                        goto search_again;
                } else {
@@ -4723,32 +4819,37 @@ delete:
                }
        }
 out:
-       if (pending_del_nr) {
-               ret = btrfs_del_items(trans, root, path, pending_del_slot,
+       if (ret >= 0 && pending_del_nr) {
+               int err;
+
+               err = btrfs_del_items(trans, root, path, pending_del_slot,
                                      pending_del_nr);
-               if (ret)
-                       btrfs_abort_transaction(trans, ret);
+               if (err) {
+                       btrfs_abort_transaction(trans, err);
+                       ret = err;
+               }
        }
-error:
        if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
                ASSERT(last_size >= new_size);
-               if (!err && last_size > new_size)
+               if (!ret && last_size > new_size)
                        last_size = new_size;
                btrfs_ordered_update_i_size(inode, last_size, NULL);
        }
 
        btrfs_free_path(path);
 
-       if (be_nice && bytes_deleted > SZ_32M) {
+       if (be_nice && bytes_deleted > SZ_32M && (ret >= 0 || ret == -EAGAIN)) {
                unsigned long updates = trans->delayed_ref_updates;
+               int err;
+
                if (updates) {
                        trans->delayed_ref_updates = 0;
-                       ret = btrfs_run_delayed_refs(trans, updates * 2);
-                       if (ret && !err)
-                               err = ret;
+                       err = btrfs_run_delayed_refs(trans, updates * 2);
+                       if (err)
+                               ret = err;
                }
        }
-       return err;
+       return ret;
 }
 
 /*
@@ -5090,30 +5191,6 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr)
                        set_bit(BTRFS_INODE_ORDERED_DATA_CLOSE,
                                &BTRFS_I(inode)->runtime_flags);
 
-               /*
-                * 1 for the orphan item we're going to add
-                * 1 for the orphan item deletion.
-                */
-               trans = btrfs_start_transaction(root, 2);
-               if (IS_ERR(trans))
-                       return PTR_ERR(trans);
-
-               /*
-                * We need to do this in case we fail at _any_ point during the
-                * actual truncate.  Once we do the truncate_setsize we could
-                * invalidate pages which forces any outstanding ordered io to
-                * be instantly completed which will give us extents that need
-                * to be truncated.  If we fail to get an orphan inode down we
-                * could have left over extents that were never meant to live,
-                * so we need to guarantee from this point on that everything
-                * will be consistent.
-                */
-               ret = btrfs_orphan_add(trans, BTRFS_I(inode));
-               btrfs_end_transaction(trans);
-               if (ret)
-                       return ret;
-
-               /* we don't support swapfiles, so vmtruncate shouldn't fail */
                truncate_setsize(inode, newsize);
 
                /* Disable nonlocked read DIO to avoid the end less truncate */
@@ -5125,29 +5202,16 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr)
                if (ret && inode->i_nlink) {
                        int err;
 
-                       /* To get a stable disk_i_size */
-                       err = btrfs_wait_ordered_range(inode, 0, (u64)-1);
-                       if (err) {
-                               btrfs_orphan_del(NULL, BTRFS_I(inode));
-                               return err;
-                       }
-
                        /*
-                        * failed to truncate, disk_i_size is only adjusted down
-                        * as we remove extents, so it should represent the true
-                        * size of the inode, so reset the in memory size and
-                        * delete our orphan entry.
+                        * Truncate failed, so fix up the in-memory size. We
+                        * adjusted disk_i_size down as we removed extents, so
+                        * wait for disk_i_size to be stable and then update the
+                        * in-memory size to match.
                         */
-                       trans = btrfs_join_transaction(root);
-                       if (IS_ERR(trans)) {
-                               btrfs_orphan_del(NULL, BTRFS_I(inode));
-                               return ret;
-                       }
-                       i_size_write(inode, BTRFS_I(inode)->disk_i_size);
-                       err = btrfs_orphan_del(trans, BTRFS_I(inode));
+                       err = btrfs_wait_ordered_range(inode, 0, (u64)-1);
                        if (err)
-                               btrfs_abort_transaction(trans, err);
-                       btrfs_end_transaction(trans);
+                               return err;
+                       i_size_write(inode, BTRFS_I(inode)->disk_i_size);
                }
        }
 
@@ -5277,13 +5341,52 @@ static void evict_inode_truncate_pages(struct inode *inode)
        spin_unlock(&io_tree->lock);
 }
 
+static struct btrfs_trans_handle *evict_refill_and_join(struct btrfs_root *root,
+                                                       struct btrfs_block_rsv *rsv,
+                                                       u64 min_size)
+{
+       struct btrfs_fs_info *fs_info = root->fs_info;
+       struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
+       int failures = 0; /* refill attempts that failed so far */
+
+       for (;;) { /* exits by returning a joined handle or an ERR_PTR */
+               struct btrfs_trans_handle *trans;
+               int ret;
+
+               ret = btrfs_block_rsv_refill(root, rsv, min_size,
+                                            BTRFS_RESERVE_FLUSH_LIMIT);
+
+               if (ret && ++failures > 2) { /* third failed refill: give up */
+                       btrfs_warn(fs_info,
+                                  "could not allocate space for a delete; will truncate on mount");
+                       return ERR_PTR(-ENOSPC);
+               }
+
+               trans = btrfs_join_transaction(root);
+               if (IS_ERR(trans) || !ret) /* join error or refill ok */
+                       return trans;
+
+               /*
+                * Try to steal from the global reserve if there is space for
+                * it.
+                */
+               if (!btrfs_check_space_for_delayed_refs(trans, fs_info) &&
+                   !btrfs_block_rsv_migrate(global_rsv, rsv, min_size, 0))
+                       return trans;
+
+               /* If not, commit and try again. */
+               ret = btrfs_commit_transaction(trans);
+               if (ret)
+                       return ERR_PTR(ret);
+       }
+}
+
 void btrfs_evict_inode(struct inode *inode)
 {
        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
        struct btrfs_trans_handle *trans;
        struct btrfs_root *root = BTRFS_I(inode)->root;
-       struct btrfs_block_rsv *rsv, *global_rsv;
-       int steal_from_global = 0;
+       struct btrfs_block_rsv *rsv;
        u64 min_size;
        int ret;
 
@@ -5304,21 +5407,16 @@ void btrfs_evict_inode(struct inode *inode)
             btrfs_is_free_space_inode(BTRFS_I(inode))))
                goto no_delete;
 
-       if (is_bad_inode(inode)) {
-               btrfs_orphan_del(NULL, BTRFS_I(inode));
+       if (is_bad_inode(inode))
                goto no_delete;
-       }
        /* do we really want it for ->i_nlink > 0 and zero btrfs_root_refs? */
        if (!special_file(inode->i_mode))
                btrfs_wait_ordered_range(inode, 0, (u64)-1);
 
        btrfs_free_io_failure_record(BTRFS_I(inode), 0, (u64)-1);
 
-       if (test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags)) {
-               BUG_ON(test_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
-                                &BTRFS_I(inode)->runtime_flags));
+       if (test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags))
                goto no_delete;
-       }
 
        if (inode->i_nlink > 0) {
                BUG_ON(btrfs_root_refs(&root->root_item) != 0 &&
@@ -5327,130 +5425,63 @@ void btrfs_evict_inode(struct inode *inode)
        }
 
        ret = btrfs_commit_inode_delayed_inode(BTRFS_I(inode));
-       if (ret) {
-               btrfs_orphan_del(NULL, BTRFS_I(inode));
+       if (ret)
                goto no_delete;
-       }
 
        rsv = btrfs_alloc_block_rsv(fs_info, BTRFS_BLOCK_RSV_TEMP);
-       if (!rsv) {
-               btrfs_orphan_del(NULL, BTRFS_I(inode));
+       if (!rsv)
                goto no_delete;
-       }
        rsv->size = min_size;
        rsv->failfast = 1;
-       global_rsv = &fs_info->global_block_rsv;
 
        btrfs_i_size_write(BTRFS_I(inode), 0);
 
-       /*
-        * This is a bit simpler than btrfs_truncate since we've already
-        * reserved our space for our orphan item in the unlink, so we just
-        * need to reserve some slack space in case we add bytes and update
-        * inode item when doing the truncate.
-        */
        while (1) {
-               ret = btrfs_block_rsv_refill(root, rsv, min_size,
-                                            BTRFS_RESERVE_FLUSH_LIMIT);
-
-               /*
-                * Try and steal from the global reserve since we will
-                * likely not use this space anyway, we want to try as
-                * hard as possible to get this to work.
-                */
-               if (ret)
-                       steal_from_global++;
-               else
-                       steal_from_global = 0;
-               ret = 0;
-
-               /*
-                * steal_from_global == 0: we reserved stuff, hooray!
-                * steal_from_global == 1: we didn't reserve stuff, boo!
-                * steal_from_global == 2: we've committed, still not a lot of
-                * room but maybe we'll have room in the global reserve this
-                * time.
-                * steal_from_global == 3: abandon all hope!
-                */
-               if (steal_from_global > 2) {
-                       btrfs_warn(fs_info,
-                                  "Could not get space for a delete, will truncate on mount %d",
-                                  ret);
-                       btrfs_orphan_del(NULL, BTRFS_I(inode));
-                       btrfs_free_block_rsv(fs_info, rsv);
-                       goto no_delete;
-               }
-
-               trans = btrfs_join_transaction(root);
-               if (IS_ERR(trans)) {
-                       btrfs_orphan_del(NULL, BTRFS_I(inode));
-                       btrfs_free_block_rsv(fs_info, rsv);
-                       goto no_delete;
-               }
-
-               /*
-                * We can't just steal from the global reserve, we need to make
-                * sure there is room to do it, if not we need to commit and try
-                * again.
-                */
-               if (steal_from_global) {
-                       if (!btrfs_check_space_for_delayed_refs(trans, fs_info))
-                               ret = btrfs_block_rsv_migrate(global_rsv, rsv,
-                                                             min_size, 0);
-                       else
-                               ret = -ENOSPC;
-               }
-
-               /*
-                * Couldn't steal from the global reserve, we have too much
-                * pending stuff built up, commit the transaction and try it
-                * again.
-                */
-               if (ret) {
-                       ret = btrfs_commit_transaction(trans);
-                       if (ret) {
-                               btrfs_orphan_del(NULL, BTRFS_I(inode));
-                               btrfs_free_block_rsv(fs_info, rsv);
-                               goto no_delete;
-                       }
-                       continue;
-               } else {
-                       steal_from_global = 0;
-               }
+               trans = evict_refill_and_join(root, rsv, min_size);
+               if (IS_ERR(trans))
+                       goto free_rsv;
 
                trans->block_rsv = rsv;
 
                ret = btrfs_truncate_inode_items(trans, root, inode, 0, 0);
-               if (ret != -ENOSPC && ret != -EAGAIN)
-                       break;
-
                trans->block_rsv = &fs_info->trans_block_rsv;
                btrfs_end_transaction(trans);
-               trans = NULL;
                btrfs_btree_balance_dirty(fs_info);
+               if (ret && ret != -ENOSPC && ret != -EAGAIN)
+                       goto free_rsv;
+               else if (!ret)
+                       break;
        }
 
-       btrfs_free_block_rsv(fs_info, rsv);
-
        /*
-        * Errors here aren't a big deal, it just means we leave orphan items
-        * in the tree.  They will be cleaned up on the next mount.
+        * Errors here aren't a big deal, it just means we leave orphan items in
+        * the tree. They will be cleaned up on the next mount. If the inode
+        * number gets reused, cleanup deletes the orphan item without doing
+        * anything, and unlink reuses the existing orphan item.
+        *
+        * If it turns out that we are dropping too many of these, we might want
+        * to add a mechanism for retrying these after a commit.
         */
-       if (ret == 0) {
-               trans->block_rsv = root->orphan_block_rsv;
+       trans = evict_refill_and_join(root, rsv, min_size);
+       if (!IS_ERR(trans)) {
+               trans->block_rsv = rsv;
                btrfs_orphan_del(trans, BTRFS_I(inode));
-       } else {
-               btrfs_orphan_del(NULL, BTRFS_I(inode));
+               trans->block_rsv = &fs_info->trans_block_rsv;
+               btrfs_end_transaction(trans);
        }
 
-       trans->block_rsv = &fs_info->trans_block_rsv;
        if (!(root == fs_info->tree_root ||
              root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID))
                btrfs_return_ino(root, btrfs_ino(BTRFS_I(inode)));
 
-       btrfs_end_transaction(trans);
-       btrfs_btree_balance_dirty(fs_info);
+free_rsv:
+       btrfs_free_block_rsv(fs_info, rsv);
 no_delete:
+       /*
+        * If we didn't successfully delete, the orphan item will still be in
+        * the tree and we'll retry on the next mount. Again, we might also want
+        * to retry these periodically in the future.
+        */
        btrfs_remove_delayed_node(BTRFS_I(inode));
        clear_inode(inode);
 }
@@ -5612,84 +5643,21 @@ static void inode_tree_del(struct inode *inode)
        if (!RB_EMPTY_NODE(&BTRFS_I(inode)->rb_node)) {
                rb_erase(&BTRFS_I(inode)->rb_node, &root->inode_tree);
                RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node);
-               empty = RB_EMPTY_ROOT(&root->inode_tree);
-       }
-       spin_unlock(&root->inode_lock);
-
-       if (empty && btrfs_root_refs(&root->root_item) == 0) {
-               synchronize_srcu(&fs_info->subvol_srcu);
-               spin_lock(&root->inode_lock);
-               empty = RB_EMPTY_ROOT(&root->inode_tree);
-               spin_unlock(&root->inode_lock);
-               if (empty)
-                       btrfs_add_dead_root(root);
-       }
-}
-
-void btrfs_invalidate_inodes(struct btrfs_root *root)
-{
-       struct btrfs_fs_info *fs_info = root->fs_info;
-       struct rb_node *node;
-       struct rb_node *prev;
-       struct btrfs_inode *entry;
-       struct inode *inode;
-       u64 objectid = 0;
-
-       if (!test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
-               WARN_ON(btrfs_root_refs(&root->root_item) != 0);
-
-       spin_lock(&root->inode_lock);
-again:
-       node = root->inode_tree.rb_node;
-       prev = NULL;
-       while (node) {
-               prev = node;
-               entry = rb_entry(node, struct btrfs_inode, rb_node);
-
-               if (objectid < btrfs_ino(BTRFS_I(&entry->vfs_inode)))
-                       node = node->rb_left;
-               else if (objectid > btrfs_ino(BTRFS_I(&entry->vfs_inode)))
-                       node = node->rb_right;
-               else
-                       break;
-       }
-       if (!node) {
-               while (prev) {
-                       entry = rb_entry(prev, struct btrfs_inode, rb_node);
-                       if (objectid <= btrfs_ino(BTRFS_I(&entry->vfs_inode))) {
-                               node = prev;
-                               break;
-                       }
-                       prev = rb_next(prev);
-               }
-       }
-       while (node) {
-               entry = rb_entry(node, struct btrfs_inode, rb_node);
-               objectid = btrfs_ino(BTRFS_I(&entry->vfs_inode)) + 1;
-               inode = igrab(&entry->vfs_inode);
-               if (inode) {
-                       spin_unlock(&root->inode_lock);
-                       if (atomic_read(&inode->i_count) > 1)
-                               d_prune_aliases(inode);
-                       /*
-                        * btrfs_drop_inode will have it removed from
-                        * the inode cache when its usage count
-                        * hits zero.
-                        */
-                       iput(inode);
-                       cond_resched();
-                       spin_lock(&root->inode_lock);
-                       goto again;
-               }
-
-               if (cond_resched_lock(&root->inode_lock))
-                       goto again;
-
-               node = rb_next(node);
+               empty = RB_EMPTY_ROOT(&root->inode_tree);
        }
        spin_unlock(&root->inode_lock);
+
+       if (empty && btrfs_root_refs(&root->root_item) == 0) {
+               synchronize_srcu(&fs_info->subvol_srcu);
+               spin_lock(&root->inode_lock);
+               empty = RB_EMPTY_ROOT(&root->inode_tree);
+               spin_unlock(&root->inode_lock);
+               if (empty)
+                       btrfs_add_dead_root(root);
+       }
 }
 
+
 static int btrfs_init_locked_inode(struct inode *inode, void *p)
 {
        struct btrfs_iget_args *args = p;
@@ -5850,11 +5818,6 @@ static int btrfs_dentry_delete(const struct dentry *dentry)
        return 0;
 }
 
-static void btrfs_dentry_release(struct dentry *dentry)
-{
-       kfree(dentry->d_fsdata);
-}
-
 static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry,
                                   unsigned int flags)
 {
@@ -6270,7 +6233,7 @@ static void btrfs_inherit_iflags(struct inode *inode, struct inode *dir)
                        BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM;
        }
 
-       btrfs_update_iflags(inode);
+       btrfs_sync_inode_flags_to_i_flags(inode);
 }
 
 static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
@@ -6705,8 +6668,9 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
         * 2 items for inode and inode ref
         * 2 items for dir items
         * 1 item for parent inode
+        * 1 item for orphan item deletion if O_TMPFILE
         */
-       trans = btrfs_start_transaction(root, 5);
+       trans = btrfs_start_transaction(root, inode->i_nlink ? 5 : 6);
        if (IS_ERR(trans)) {
                err = PTR_ERR(trans);
                trans = NULL;
@@ -7083,7 +7047,7 @@ insert:
 
        err = 0;
        write_lock(&em_tree->lock);
-       err = btrfs_add_extent_mapping(em_tree, &em, start, len);
+       err = btrfs_add_extent_mapping(fs_info, em_tree, &em, start, len);
        write_unlock(&em_tree->lock);
 out:
 
@@ -7368,6 +7332,14 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
            btrfs_file_extent_other_encoding(leaf, fi))
                goto out;
 
+       /*
+        * Do the same check as in btrfs_cross_ref_exist but without the
+        * unnecessary search.
+        */
+       if (btrfs_file_extent_generation(leaf, fi) <=
+           btrfs_root_last_snapshot(&root->root_item))
+               goto out;
+
        backref_offset = btrfs_file_extent_offset(leaf, fi);
 
        if (orig_start) {
@@ -7568,6 +7540,125 @@ static struct extent_map *create_io_em(struct inode *inode, u64 start, u64 len,
        return em;
 }
 
+
+static int btrfs_get_blocks_direct_read(struct extent_map *em,
+                                       struct buffer_head *bh_result,
+                                       struct inode *inode,
+                                       u64 start, u64 len)
+{
+       if (em->block_start == EXTENT_MAP_HOLE ||
+                       test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
+               return -ENOENT;
+
+       len = min(len, em->len - (start - em->start));
+
+       bh_result->b_blocknr = (em->block_start + (start - em->start)) >>
+               inode->i_blkbits;
+       bh_result->b_size = len;
+       bh_result->b_bdev = em->bdev;
+       set_buffer_mapped(bh_result);
+
+       return 0;
+}
+
+static int btrfs_get_blocks_direct_write(struct extent_map **map,
+                                        struct buffer_head *bh_result,
+                                        struct inode *inode,
+                                        struct btrfs_dio_data *dio_data,
+                                        u64 start, u64 len)
+{
+       struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+       struct extent_map *em = *map;
+       int ret = 0;
+
+       /*
+        * We don't allocate a new extent in the following cases
+        *
+        * 1) The inode is marked as NODATACOW. In this case we'll just use the
+        * existing extent.
+        * 2) The extent is marked as PREALLOC. We're good to go here and can
+        * just use the extent.
+        *
+        */
+       if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) ||
+           ((BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) &&
+            em->block_start != EXTENT_MAP_HOLE)) {
+               int type;
+               u64 block_start, orig_start, orig_block_len, ram_bytes;
+
+               if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
+                       type = BTRFS_ORDERED_PREALLOC;
+               else
+                       type = BTRFS_ORDERED_NOCOW;
+               len = min(len, em->len - (start - em->start));
+               block_start = em->block_start + (start - em->start);
+
+               if (can_nocow_extent(inode, start, &len, &orig_start,
+                                    &orig_block_len, &ram_bytes) == 1 &&
+                   btrfs_inc_nocow_writers(fs_info, block_start)) {
+                       struct extent_map *em2;
+
+                       em2 = btrfs_create_dio_extent(inode, start, len,
+                                                     orig_start, block_start,
+                                                     len, orig_block_len,
+                                                     ram_bytes, type);
+                       btrfs_dec_nocow_writers(fs_info, block_start);
+                       if (type == BTRFS_ORDERED_PREALLOC) {
+                               free_extent_map(em);
+                               *map = em = em2;
+                       }
+
+                       if (em2 && IS_ERR(em2)) {
+                               ret = PTR_ERR(em2);
+                               goto out;
+                       }
+                       /*
+                        * For inode marked NODATACOW or extent marked PREALLOC,
+                        * we use the existing or preallocated extent, so we do
+                        * not need to adjust btrfs_space_info's bytes_may_use.
+                        */
+                       btrfs_free_reserved_data_space_noquota(inode, start,
+                                                              len);
+                       goto skip_cow;
+               }
+       }
+
+       /* this will cow the extent */
+       len = bh_result->b_size;
+       free_extent_map(em);
+       *map = em = btrfs_new_extent_direct(inode, start, len);
+       if (IS_ERR(em)) {
+               ret = PTR_ERR(em);
+               goto out;
+       }
+
+       len = min(len, em->len - (start - em->start));
+
+skip_cow:
+       bh_result->b_blocknr = (em->block_start + (start - em->start)) >>
+               inode->i_blkbits;
+       bh_result->b_size = len;
+       bh_result->b_bdev = em->bdev;
+       set_buffer_mapped(bh_result);
+
+       if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
+               set_buffer_new(bh_result);
+
+       /*
+        * Need to update the i_size under the extent lock so buffered
+        * readers will get the updated i_size when we unlock.
+        */
+       if (!dio_data->overwrite && start + len > i_size_read(inode))
+               i_size_write(inode, start + len);
+
+       WARN_ON(dio_data->reserve < len);
+       dio_data->reserve -= len;
+       dio_data->unsubmitted_oe_range_end = start + len;
+       current->journal_info = dio_data;
+out:
+       return ret;
+}
+
 static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
                                   struct buffer_head *bh_result, int create)
 {
@@ -7636,116 +7727,36 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
                goto unlock_err;
        }
 
-       /* Just a good old fashioned hole, return */
-       if (!create && (em->block_start == EXTENT_MAP_HOLE ||
-                       test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
-               free_extent_map(em);
-               goto unlock_err;
-       }
-
-       /*
-        * We don't allocate a new extent in the following cases
-        *
-        * 1) The inode is marked as NODATACOW.  In this case we'll just use the
-        * existing extent.
-        * 2) The extent is marked as PREALLOC.  We're good to go here and can
-        * just use the extent.
-        *
-        */
-       if (!create) {
-               len = min(len, em->len - (start - em->start));
-               lockstart = start + len;
-               goto unlock;
-       }
-
-       if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) ||
-           ((BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) &&
-            em->block_start != EXTENT_MAP_HOLE)) {
-               int type;
-               u64 block_start, orig_start, orig_block_len, ram_bytes;
-
-               if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
-                       type = BTRFS_ORDERED_PREALLOC;
-               else
-                       type = BTRFS_ORDERED_NOCOW;
-               len = min(len, em->len - (start - em->start));
-               block_start = em->block_start + (start - em->start);
-
-               if (can_nocow_extent(inode, start, &len, &orig_start,
-                                    &orig_block_len, &ram_bytes) == 1 &&
-                   btrfs_inc_nocow_writers(fs_info, block_start)) {
-                       struct extent_map *em2;
-
-                       em2 = btrfs_create_dio_extent(inode, start, len,
-                                                     orig_start, block_start,
-                                                     len, orig_block_len,
-                                                     ram_bytes, type);
-                       btrfs_dec_nocow_writers(fs_info, block_start);
-                       if (type == BTRFS_ORDERED_PREALLOC) {
-                               free_extent_map(em);
-                               em = em2;
-                       }
-                       if (em2 && IS_ERR(em2)) {
-                               ret = PTR_ERR(em2);
-                               goto unlock_err;
-                       }
-                       /*
-                        * For inode marked NODATACOW or extent marked PREALLOC,
-                        * use the existing or preallocated extent, so does not
-                        * need to adjust btrfs_space_info's bytes_may_use.
-                        */
-                       btrfs_free_reserved_data_space_noquota(inode,
-                                       start, len);
-                       goto unlock;
-               }
-       }
-
-       /*
-        * this will cow the extent, reset the len in case we changed
-        * it above
-        */
-       len = bh_result->b_size;
-       free_extent_map(em);
-       em = btrfs_new_extent_direct(inode, start, len);
-       if (IS_ERR(em)) {
-               ret = PTR_ERR(em);
-               goto unlock_err;
-       }
-       len = min(len, em->len - (start - em->start));
-unlock:
-       bh_result->b_blocknr = (em->block_start + (start - em->start)) >>
-               inode->i_blkbits;
-       bh_result->b_size = len;
-       bh_result->b_bdev = em->bdev;
-       set_buffer_mapped(bh_result);
        if (create) {
-               if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
-                       set_buffer_new(bh_result);
+               ret = btrfs_get_blocks_direct_write(&em, bh_result, inode,
+                                                   dio_data, start, len);
+               if (ret < 0)
+                       goto unlock_err;
 
+               /* clear and unlock the entire range */
+               clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend,
+                                unlock_bits, 1, 0, &cached_state);
+       } else {
+               ret = btrfs_get_blocks_direct_read(em, bh_result, inode,
+                                                  start, len);
+               /* Can be negative only if we read from a hole */
+               if (ret < 0) {
+                       ret = 0;
+                       free_extent_map(em);
+                       goto unlock_err;
+               }
                /*
-                * Need to update the i_size under the extent lock so buffered
-                * readers will get the updated i_size when we unlock.
+                * We need to unlock only the end area that we aren't using.
+                * The rest is going to be unlocked by the endio routine.
                 */
-               if (!dio_data->overwrite && start + len > i_size_read(inode))
-                       i_size_write(inode, start + len);
-
-               WARN_ON(dio_data->reserve < len);
-               dio_data->reserve -= len;
-               dio_data->unsubmitted_oe_range_end = start + len;
-               current->journal_info = dio_data;
-       }
-
-       /*
-        * In the case of write we need to clear and unlock the entire range,
-        * in the case of read we need to unlock only the end area that we
-        * aren't using if there is any left over space.
-        */
-       if (lockstart < lockend) {
-               clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
-                                lockend, unlock_bits, 1, 0,
-                                &cached_state);
-       } else {
-               free_extent_state(cached_state);
+               lockstart = start + bh_result->b_size;
+               if (lockstart < lockend) {
+                       clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
+                                        lockend, unlock_bits, 1, 0,
+                                        &cached_state);
+               } else {
+                       free_extent_state(cached_state);
+               }
        }
 
        free_extent_map(em);
@@ -8131,7 +8142,6 @@ static void __endio_write_update_ordered(struct inode *inode,
        u64 ordered_offset = offset;
        u64 ordered_bytes = bytes;
        u64 last_offset;
-       int ret;
 
        if (btrfs_is_free_space_inode(BTRFS_I(inode))) {
                wq = fs_info->endio_freespace_worker;
@@ -8141,32 +8151,31 @@ static void __endio_write_update_ordered(struct inode *inode,
                func = btrfs_endio_write_helper;
        }
 
-again:
-       last_offset = ordered_offset;
-       ret = btrfs_dec_test_first_ordered_pending(inode, &ordered,
-                                                  &ordered_offset,
-                                                  ordered_bytes,
-                                                  uptodate);
-       if (!ret)
-               goto out_test;
-
-       btrfs_init_work(&ordered->work, func, finish_ordered_fn, NULL, NULL);
-       btrfs_queue_work(wq, &ordered->work);
-out_test:
-       /*
-        * If btrfs_dec_test_ordered_pending does not find any ordered extent
-        * in the range, we can exit.
-        */
-       if (ordered_offset == last_offset)
-               return;
-       /*
-        * our bio might span multiple ordered extents.  If we haven't
-        * completed the accounting for the whole dio, go back and try again
-        */
-       if (ordered_offset < offset + bytes) {
-               ordered_bytes = offset + bytes - ordered_offset;
-               ordered = NULL;
-               goto again;
+       while (ordered_offset < offset + bytes) {
+               last_offset = ordered_offset;
+               if (btrfs_dec_test_first_ordered_pending(inode, &ordered,
+                                                          &ordered_offset,
+                                                          ordered_bytes,
+                                                          uptodate)) {
+                       btrfs_init_work(&ordered->work, func,
+                                       finish_ordered_fn,
+                                       NULL, NULL);
+                       btrfs_queue_work(wq, &ordered->work);
+               }
+               /*
+                * If btrfs_dec_test_ordered_pending does not find any ordered
+                * extent in the range, we can exit.
+                */
+               if (ordered_offset == last_offset)
+                       return;
+               /*
+                * Our bio might span multiple ordered extents. In this case
+                * we keep going until we have accounted for the whole dio.
+                */
+               if (ordered_offset < offset + bytes) {
+                       ordered_bytes = offset + bytes - ordered_offset;
+                       ordered = NULL;
+               }
        }
 }
 
@@ -8705,29 +8714,19 @@ static int btrfs_writepage(struct page *page, struct writeback_control *wbc)
 static int btrfs_writepages(struct address_space *mapping,
                            struct writeback_control *wbc)
 {
-       struct extent_io_tree *tree;
-
-       tree = &BTRFS_I(mapping->host)->io_tree;
-       return extent_writepages(tree, mapping, wbc);
+       return extent_writepages(mapping, wbc);
 }
 
 static int
 btrfs_readpages(struct file *file, struct address_space *mapping,
                struct list_head *pages, unsigned nr_pages)
 {
-       struct extent_io_tree *tree;
-       tree = &BTRFS_I(mapping->host)->io_tree;
-       return extent_readpages(tree, mapping, pages, nr_pages);
+       return extent_readpages(mapping, pages, nr_pages);
 }
+
 static int __btrfs_releasepage(struct page *page, gfp_t gfp_flags)
 {
-       struct extent_io_tree *tree;
-       struct extent_map_tree *map;
-       int ret;
-
-       tree = &BTRFS_I(page->mapping->host)->io_tree;
-       map = &BTRFS_I(page->mapping->host)->extent_tree;
-       ret = try_release_extent_mapping(map, tree, page, gfp_flags);
+       int ret = try_release_extent_mapping(page, gfp_flags);
        if (ret == 1) {
                ClearPagePrivate(page);
                set_page_private(page, 0);
@@ -8868,8 +8867,8 @@ again:
  *
  * We are not allowed to take the i_mutex here so we have to play games to
  * protect against truncate races as the page could now be beyond EOF.  Because
- * vmtruncate() writes the inode size before removing pages, once we have the
- * page lock we can determine safely if the page is beyond EOF. If it is not
+ * truncate_setsize() writes the inode size before removing pages, once we have
+ * the page lock we can determine safely if the page is beyond EOF. If it is not
  * beyond EOF, then the page is guaranteed safe against truncation until we
  * unlock the page.
  */
@@ -9031,8 +9030,7 @@ static int btrfs_truncate(struct inode *inode, bool skip_writeback)
        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
        struct btrfs_root *root = BTRFS_I(inode)->root;
        struct btrfs_block_rsv *rsv;
-       int ret = 0;
-       int err = 0;
+       int ret;
        struct btrfs_trans_handle *trans;
        u64 mask = fs_info->sectorsize - 1;
        u64 min_size = btrfs_calc_trunc_metadata_size(fs_info, 1);
@@ -9045,39 +9043,31 @@ static int btrfs_truncate(struct inode *inode, bool skip_writeback)
        }
 
        /*
-        * Yes ladies and gentlemen, this is indeed ugly.  The fact is we have
-        * 3 things going on here
+        * Yes ladies and gentlemen, this is indeed ugly.  We have a couple of
+        * things going on here:
         *
-        * 1) We need to reserve space for our orphan item and the space to
-        * delete our orphan item.  Lord knows we don't want to have a dangling
-        * orphan item because we didn't reserve space to remove it.
+        * 1) We need to reserve space to update our inode.
         *
-        * 2) We need to reserve space to update our inode.
-        *
-        * 3) We need to have something to cache all the space that is going to
+        * 2) We need to have something to cache all the space that is going to
         * be free'd up by the truncate operation, but also have some slack
         * space reserved in case it uses space during the truncate (thank you
         * very much snapshotting).
         *
-        * And we need these to all be separate.  The fact is we can use a lot of
+        * And we need these to be separate.  The fact is we can use a lot of
         * space doing the truncate, and we have no earthly idea how much space
         * we will use, so we need the truncate reservation to be separate so it
-        * doesn't end up using space reserved for updating the inode or
-        * removing the orphan item.  We also need to be able to stop the
-        * transaction and start a new one, which means we need to be able to
-        * update the inode several times, and we have no idea of knowing how
-        * many times that will be, so we can't just reserve 1 item for the
-        * entirety of the operation, so that has to be done separately as well.
-        * Then there is the orphan item, which does indeed need to be held on
-        * to for the whole operation, and we need nobody to touch this reserved
-        * space except the orphan code.
+        * doesn't end up using space reserved for updating the inode.  We also
+        * need to be able to stop the transaction and start a new one, which
+        * means we need to be able to update the inode several times, and we
+        * have no idea of knowing how many times that will be, so we can't just
+        * reserve 1 item for the entirety of the operation, so that has to be
+        * done separately as well.
         *
         * So that leaves us with
         *
-        * 1) root->orphan_block_rsv - for the orphan deletion.
-        * 2) rsv - for the truncate reservation, which we will steal from the
+        * 1) rsv - for the truncate reservation, which we will steal from the
         * transaction reservation.
-        * 3) fs_info->trans_block_rsv - this will have 1 items worth left for
+        * 2) fs_info->trans_block_rsv - this will have 1 item's worth left for
         * updating the inode.
         */
        rsv = btrfs_alloc_block_rsv(fs_info, BTRFS_BLOCK_RSV_TEMP);
@@ -9092,7 +9082,7 @@ static int btrfs_truncate(struct inode *inode, bool skip_writeback)
         */
        trans = btrfs_start_transaction(root, 2);
        if (IS_ERR(trans)) {
-               err = PTR_ERR(trans);
+               ret = PTR_ERR(trans);
                goto out;
        }
 
@@ -9116,24 +9106,19 @@ static int btrfs_truncate(struct inode *inode, bool skip_writeback)
                                                 inode->i_size,
                                                 BTRFS_EXTENT_DATA_KEY);
                trans->block_rsv = &fs_info->trans_block_rsv;
-               if (ret != -ENOSPC && ret != -EAGAIN) {
-                       if (ret < 0)
-                               err = ret;
+               if (ret != -ENOSPC && ret != -EAGAIN)
                        break;
-               }
 
                ret = btrfs_update_inode(trans, root, inode);
-               if (ret) {
-                       err = ret;
+               if (ret)
                        break;
-               }
 
                btrfs_end_transaction(trans);
                btrfs_btree_balance_dirty(fs_info);
 
                trans = btrfs_start_transaction(root, 2);
                if (IS_ERR(trans)) {
-                       ret = err = PTR_ERR(trans);
+                       ret = PTR_ERR(trans);
                        trans = NULL;
                        break;
                }
@@ -9166,29 +9151,23 @@ static int btrfs_truncate(struct inode *inode, bool skip_writeback)
                btrfs_ordered_update_i_size(inode, inode->i_size, NULL);
        }
 
-       if (ret == 0 && inode->i_nlink > 0) {
-               trans->block_rsv = root->orphan_block_rsv;
-               ret = btrfs_orphan_del(trans, BTRFS_I(inode));
-               if (ret)
-                       err = ret;
-       }
-
        if (trans) {
+               int ret2;
+
                trans->block_rsv = &fs_info->trans_block_rsv;
-               ret = btrfs_update_inode(trans, root, inode);
-               if (ret && !err)
-                       err = ret;
+               ret2 = btrfs_update_inode(trans, root, inode);
+               if (ret2 && !ret)
+                       ret = ret2;
 
-               ret = btrfs_end_transaction(trans);
+               ret2 = btrfs_end_transaction(trans);
+               if (ret2 && !ret)
+                       ret = ret2;
                btrfs_btree_balance_dirty(fs_info);
        }
 out:
        btrfs_free_block_rsv(fs_info, rsv);
 
-       if (ret && !err)
-               err = ret;
-
-       return err;
+       return ret;
 }
 
 /*
@@ -9324,13 +9303,6 @@ void btrfs_destroy_inode(struct inode *inode)
        if (!root)
                goto free;
 
-       if (test_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
-                    &BTRFS_I(inode)->runtime_flags)) {
-               btrfs_info(fs_info, "inode %llu still on the orphan list",
-                          btrfs_ino(BTRFS_I(inode)));
-               atomic_dec(&root->orphan_inodes);
-       }
-
        while (1) {
                ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1);
                if (!ordered)
@@ -9964,6 +9936,13 @@ static int btrfs_rename2(struct inode *old_dir, struct dentry *old_dentry,
        return btrfs_rename(old_dir, old_dentry, new_dir, new_dentry, flags);
 }
 
+struct btrfs_delalloc_work {
+       struct inode *inode;
+       struct completion completion;
+       struct list_head list;
+       struct btrfs_work work;
+};
+
 static void btrfs_run_delalloc_work(struct btrfs_work *work)
 {
        struct btrfs_delalloc_work *delalloc_work;
@@ -9977,15 +9956,11 @@ static void btrfs_run_delalloc_work(struct btrfs_work *work)
                                &BTRFS_I(inode)->runtime_flags))
                filemap_flush(inode->i_mapping);
 
-       if (delalloc_work->delay_iput)
-               btrfs_add_delayed_iput(inode);
-       else
-               iput(inode);
+       iput(inode);
        complete(&delalloc_work->completion);
 }
 
-struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode,
-                                                   int delay_iput)
+static struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode)
 {
        struct btrfs_delalloc_work *work;
 
@@ -9996,7 +9971,6 @@ struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode,
        init_completion(&work->completion);
        INIT_LIST_HEAD(&work->list);
        work->inode = inode;
-       work->delay_iput = delay_iput;
        WARN_ON_ONCE(!inode);
        btrfs_init_work(&work->work, btrfs_flush_delalloc_helper,
                        btrfs_run_delalloc_work, NULL, NULL);
@@ -10004,18 +9978,11 @@ struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode,
        return work;
 }
 
-void btrfs_wait_and_free_delalloc_work(struct btrfs_delalloc_work *work)
-{
-       wait_for_completion(&work->completion);
-       kfree(work);
-}
-
 /*
  * some fairly slow code that needs optimization. This walks the list
  * of all the inodes with pending delalloc and forces them to disk.
  */
-static int __start_delalloc_inodes(struct btrfs_root *root, int delay_iput,
-                                  int nr)
+static int start_delalloc_inodes(struct btrfs_root *root, int nr)
 {
        struct btrfs_inode *binode;
        struct inode *inode;
@@ -10043,12 +10010,9 @@ static int __start_delalloc_inodes(struct btrfs_root *root, int delay_iput,
                }
                spin_unlock(&root->delalloc_lock);
 
-               work = btrfs_alloc_delalloc_work(inode, delay_iput);
+               work = btrfs_alloc_delalloc_work(inode);
                if (!work) {
-                       if (delay_iput)
-                               btrfs_add_delayed_iput(inode);
-                       else
-                               iput(inode);
+                       iput(inode);
                        ret = -ENOMEM;
                        goto out;
                }
@@ -10066,10 +10030,11 @@ static int __start_delalloc_inodes(struct btrfs_root *root, int delay_iput,
 out:
        list_for_each_entry_safe(work, next, &works, list) {
                list_del_init(&work->list);
-               btrfs_wait_and_free_delalloc_work(work);
+               wait_for_completion(&work->completion);
+               kfree(work);
        }
 
-       if (!list_empty_careful(&splice)) {
+       if (!list_empty(&splice)) {
                spin_lock(&root->delalloc_lock);
                list_splice_tail(&splice, &root->delalloc_inodes);
                spin_unlock(&root->delalloc_lock);
@@ -10078,7 +10043,7 @@ out:
        return ret;
 }
 
-int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
+int btrfs_start_delalloc_inodes(struct btrfs_root *root)
 {
        struct btrfs_fs_info *fs_info = root->fs_info;
        int ret;
@@ -10086,14 +10051,13 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
        if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
                return -EROFS;
 
-       ret = __start_delalloc_inodes(root, delay_iput, -1);
+       ret = start_delalloc_inodes(root, -1);
        if (ret > 0)
                ret = 0;
        return ret;
 }
 
-int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int delay_iput,
-                              int nr)
+int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int nr)
 {
        struct btrfs_root *root;
        struct list_head splice;
@@ -10116,7 +10080,7 @@ int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int delay_iput,
                               &fs_info->delalloc_roots);
                spin_unlock(&fs_info->delalloc_root_lock);
 
-               ret = __start_delalloc_inodes(root, delay_iput, nr);
+               ret = start_delalloc_inodes(root, nr);
                btrfs_put_fs_root(root);
                if (ret < 0)
                        goto out;
@@ -10131,7 +10095,7 @@ int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int delay_iput,
 
        ret = 0;
 out:
-       if (!list_empty_careful(&splice)) {
+       if (!list_empty(&splice)) {
                spin_lock(&fs_info->delalloc_root_lock);
                list_splice_tail(&splice, &fs_info->delalloc_roots);
                spin_unlock(&fs_info->delalloc_root_lock);
@@ -10669,5 +10633,4 @@ static const struct inode_operations btrfs_symlink_inode_operations = {
 
 const struct dentry_operations btrfs_dentry_operations = {
        .d_delete       = btrfs_dentry_delete,
-       .d_release      = btrfs_dentry_release,
 };
index 632e26d6f7ce0cd26284b0e39c140c7f09a1f3d2..d29992f7dc6356b2de853f5dde76cebedc3e30d0 100644 (file)
@@ -93,20 +93,22 @@ static int btrfs_clone(struct inode *src, struct inode *inode,
                       int no_time_update);
 
 /* Mask out flags that are inappropriate for the given type of inode. */
-static unsigned int btrfs_mask_flags(umode_t mode, unsigned int flags)
+static unsigned int btrfs_mask_fsflags_for_type(struct inode *inode,
+               unsigned int flags)
 {
-       if (S_ISDIR(mode))
+       if (S_ISDIR(inode->i_mode))
                return flags;
-       else if (S_ISREG(mode))
+       else if (S_ISREG(inode->i_mode))
                return flags & ~FS_DIRSYNC_FL;
        else
                return flags & (FS_NODUMP_FL | FS_NOATIME_FL);
 }
 
 /*
- * Export inode flags to the format expected by the FS_IOC_GETFLAGS ioctl.
+ * Export internal inode flags to the format expected by the FS_IOC_GETFLAGS
+ * ioctl.
  */
-static unsigned int btrfs_flags_to_ioctl(unsigned int flags)
+static unsigned int btrfs_inode_flags_to_fsflags(unsigned int flags)
 {
        unsigned int iflags = 0;
 
@@ -136,20 +138,20 @@ static unsigned int btrfs_flags_to_ioctl(unsigned int flags)
 /*
  * Update inode->i_flags based on the btrfs internal flags.
  */
-void btrfs_update_iflags(struct inode *inode)
+void btrfs_sync_inode_flags_to_i_flags(struct inode *inode)
 {
-       struct btrfs_inode *ip = BTRFS_I(inode);
+       struct btrfs_inode *binode = BTRFS_I(inode);
        unsigned int new_fl = 0;
 
-       if (ip->flags & BTRFS_INODE_SYNC)
+       if (binode->flags & BTRFS_INODE_SYNC)
                new_fl |= S_SYNC;
-       if (ip->flags & BTRFS_INODE_IMMUTABLE)
+       if (binode->flags & BTRFS_INODE_IMMUTABLE)
                new_fl |= S_IMMUTABLE;
-       if (ip->flags & BTRFS_INODE_APPEND)
+       if (binode->flags & BTRFS_INODE_APPEND)
                new_fl |= S_APPEND;
-       if (ip->flags & BTRFS_INODE_NOATIME)
+       if (binode->flags & BTRFS_INODE_NOATIME)
                new_fl |= S_NOATIME;
-       if (ip->flags & BTRFS_INODE_DIRSYNC)
+       if (binode->flags & BTRFS_INODE_DIRSYNC)
                new_fl |= S_DIRSYNC;
 
        set_mask_bits(&inode->i_flags,
@@ -159,15 +161,16 @@ void btrfs_update_iflags(struct inode *inode)
 
 static int btrfs_ioctl_getflags(struct file *file, void __user *arg)
 {
-       struct btrfs_inode *ip = BTRFS_I(file_inode(file));
-       unsigned int flags = btrfs_flags_to_ioctl(ip->flags);
+       struct btrfs_inode *binode = BTRFS_I(file_inode(file));
+       unsigned int flags = btrfs_inode_flags_to_fsflags(binode->flags);
 
        if (copy_to_user(arg, &flags, sizeof(flags)))
                return -EFAULT;
        return 0;
 }
 
-static int check_flags(unsigned int flags)
+/* Check if @flags are a supported and valid set of FS_*_FL flags */
+static int check_fsflags(unsigned int flags)
 {
        if (flags & ~(FS_IMMUTABLE_FL | FS_APPEND_FL | \
                      FS_NOATIME_FL | FS_NODUMP_FL | \
@@ -186,13 +189,13 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
 {
        struct inode *inode = file_inode(file);
        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
-       struct btrfs_inode *ip = BTRFS_I(inode);
-       struct btrfs_root *root = ip->root;
+       struct btrfs_inode *binode = BTRFS_I(inode);
+       struct btrfs_root *root = binode->root;
        struct btrfs_trans_handle *trans;
-       unsigned int flags, oldflags;
+       unsigned int fsflags, old_fsflags;
        int ret;
-       u64 ip_oldflags;
-       unsigned int i_oldflags;
+       u64 old_flags;
+       unsigned int old_i_flags;
        umode_t mode;
 
        if (!inode_owner_or_capable(inode))
@@ -201,10 +204,10 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
        if (btrfs_root_readonly(root))
                return -EROFS;
 
-       if (copy_from_user(&flags, arg, sizeof(flags)))
+       if (copy_from_user(&fsflags, arg, sizeof(fsflags)))
                return -EFAULT;
 
-       ret = check_flags(flags);
+       ret = check_fsflags(fsflags);
        if (ret)
                return ret;
 
@@ -214,44 +217,44 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
 
        inode_lock(inode);
 
-       ip_oldflags = ip->flags;
-       i_oldflags = inode->i_flags;
+       old_flags = binode->flags;
+       old_i_flags = inode->i_flags;
        mode = inode->i_mode;
 
-       flags = btrfs_mask_flags(inode->i_mode, flags);
-       oldflags = btrfs_flags_to_ioctl(ip->flags);
-       if ((flags ^ oldflags) & (FS_APPEND_FL | FS_IMMUTABLE_FL)) {
+       fsflags = btrfs_mask_fsflags_for_type(inode, fsflags);
+       old_fsflags = btrfs_inode_flags_to_fsflags(binode->flags);
+       if ((fsflags ^ old_fsflags) & (FS_APPEND_FL | FS_IMMUTABLE_FL)) {
                if (!capable(CAP_LINUX_IMMUTABLE)) {
                        ret = -EPERM;
                        goto out_unlock;
                }
        }
 
-       if (flags & FS_SYNC_FL)
-               ip->flags |= BTRFS_INODE_SYNC;
+       if (fsflags & FS_SYNC_FL)
+               binode->flags |= BTRFS_INODE_SYNC;
        else
-               ip->flags &= ~BTRFS_INODE_SYNC;
-       if (flags & FS_IMMUTABLE_FL)
-               ip->flags |= BTRFS_INODE_IMMUTABLE;
+               binode->flags &= ~BTRFS_INODE_SYNC;
+       if (fsflags & FS_IMMUTABLE_FL)
+               binode->flags |= BTRFS_INODE_IMMUTABLE;
        else
-               ip->flags &= ~BTRFS_INODE_IMMUTABLE;
-       if (flags & FS_APPEND_FL)
-               ip->flags |= BTRFS_INODE_APPEND;
+               binode->flags &= ~BTRFS_INODE_IMMUTABLE;
+       if (fsflags & FS_APPEND_FL)
+               binode->flags |= BTRFS_INODE_APPEND;
        else
-               ip->flags &= ~BTRFS_INODE_APPEND;
-       if (flags & FS_NODUMP_FL)
-               ip->flags |= BTRFS_INODE_NODUMP;
+               binode->flags &= ~BTRFS_INODE_APPEND;
+       if (fsflags & FS_NODUMP_FL)
+               binode->flags |= BTRFS_INODE_NODUMP;
        else
-               ip->flags &= ~BTRFS_INODE_NODUMP;
-       if (flags & FS_NOATIME_FL)
-               ip->flags |= BTRFS_INODE_NOATIME;
+               binode->flags &= ~BTRFS_INODE_NODUMP;
+       if (fsflags & FS_NOATIME_FL)
+               binode->flags |= BTRFS_INODE_NOATIME;
        else
-               ip->flags &= ~BTRFS_INODE_NOATIME;
-       if (flags & FS_DIRSYNC_FL)
-               ip->flags |= BTRFS_INODE_DIRSYNC;
+               binode->flags &= ~BTRFS_INODE_NOATIME;
+       if (fsflags & FS_DIRSYNC_FL)
+               binode->flags |= BTRFS_INODE_DIRSYNC;
        else
-               ip->flags &= ~BTRFS_INODE_DIRSYNC;
-       if (flags & FS_NOCOW_FL) {
+               binode->flags &= ~BTRFS_INODE_DIRSYNC;
+       if (fsflags & FS_NOCOW_FL) {
                if (S_ISREG(mode)) {
                        /*
                         * It's safe to turn csums off here, no extents exist.
@@ -259,10 +262,10 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
                         * status of the file and will not set it.
                         */
                        if (inode->i_size == 0)
-                               ip->flags |= BTRFS_INODE_NODATACOW
-                                          | BTRFS_INODE_NODATASUM;
+                               binode->flags |= BTRFS_INODE_NODATACOW
+                                             | BTRFS_INODE_NODATASUM;
                } else {
-                       ip->flags |= BTRFS_INODE_NODATACOW;
+                       binode->flags |= BTRFS_INODE_NODATACOW;
                }
        } else {
                /*
@@ -270,10 +273,10 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
                 */
                if (S_ISREG(mode)) {
                        if (inode->i_size == 0)
-                               ip->flags &= ~(BTRFS_INODE_NODATACOW
+                               binode->flags &= ~(BTRFS_INODE_NODATACOW
                                             | BTRFS_INODE_NODATASUM);
                } else {
-                       ip->flags &= ~BTRFS_INODE_NODATACOW;
+                       binode->flags &= ~BTRFS_INODE_NODATACOW;
                }
        }
 
@@ -282,18 +285,18 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
         * flag may be changed automatically if compression code won't make
         * things smaller.
         */
-       if (flags & FS_NOCOMP_FL) {
-               ip->flags &= ~BTRFS_INODE_COMPRESS;
-               ip->flags |= BTRFS_INODE_NOCOMPRESS;
+       if (fsflags & FS_NOCOMP_FL) {
+               binode->flags &= ~BTRFS_INODE_COMPRESS;
+               binode->flags |= BTRFS_INODE_NOCOMPRESS;
 
                ret = btrfs_set_prop(inode, "btrfs.compression", NULL, 0, 0);
                if (ret && ret != -ENODATA)
                        goto out_drop;
-       } else if (flags & FS_COMPR_FL) {
+       } else if (fsflags & FS_COMPR_FL) {
                const char *comp;
 
-               ip->flags |= BTRFS_INODE_COMPRESS;
-               ip->flags &= ~BTRFS_INODE_NOCOMPRESS;
+               binode->flags |= BTRFS_INODE_COMPRESS;
+               binode->flags &= ~BTRFS_INODE_NOCOMPRESS;
 
                comp = btrfs_compress_type2str(fs_info->compress_type);
                if (!comp || comp[0] == 0)
@@ -308,7 +311,7 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
                ret = btrfs_set_prop(inode, "btrfs.compression", NULL, 0, 0);
                if (ret && ret != -ENODATA)
                        goto out_drop;
-               ip->flags &= ~(BTRFS_INODE_COMPRESS | BTRFS_INODE_NOCOMPRESS);
+               binode->flags &= ~(BTRFS_INODE_COMPRESS | BTRFS_INODE_NOCOMPRESS);
        }
 
        trans = btrfs_start_transaction(root, 1);
@@ -317,7 +320,7 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
                goto out_drop;
        }
 
-       btrfs_update_iflags(inode);
+       btrfs_sync_inode_flags_to_i_flags(inode);
        inode_inc_iversion(inode);
        inode->i_ctime = current_time(inode);
        ret = btrfs_update_inode(trans, root, inode);
@@ -325,8 +328,8 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
        btrfs_end_transaction(trans);
  out_drop:
        if (ret) {
-               ip->flags = ip_oldflags;
-               inode->i_flags = i_oldflags;
+               binode->flags = old_flags;
+               inode->i_flags = old_i_flags;
        }
 
  out_unlock:
@@ -335,6 +338,148 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
        return ret;
 }
 
+/*
+ * Translate btrfs internal inode flags to xflags as expected by the
+ * FS_IOC_FSGETXATTR ioctl. Filter only the supported ones, unknown flags are
+ * silently dropped.
+ */
+static unsigned int btrfs_inode_flags_to_xflags(unsigned int flags)
+{
+       unsigned int xflags = 0;
+
+       if (flags & BTRFS_INODE_APPEND)
+               xflags |= FS_XFLAG_APPEND;
+       if (flags & BTRFS_INODE_IMMUTABLE)
+               xflags |= FS_XFLAG_IMMUTABLE;
+       if (flags & BTRFS_INODE_NOATIME)
+               xflags |= FS_XFLAG_NOATIME;
+       if (flags & BTRFS_INODE_NODUMP)
+               xflags |= FS_XFLAG_NODUMP;
+       if (flags & BTRFS_INODE_SYNC)
+               xflags |= FS_XFLAG_SYNC;
+
+       return xflags;
+}
+
+/* Check if @flags are a supported and valid set of FS_XFLAG_* flags */
+static int check_xflags(unsigned int flags)
+{
+       if (flags & ~(FS_XFLAG_APPEND | FS_XFLAG_IMMUTABLE | FS_XFLAG_NOATIME |
+                     FS_XFLAG_NODUMP | FS_XFLAG_SYNC))
+               return -EOPNOTSUPP;
+       return 0;
+}
+
+/*
+ * Set the xflags from the internal inode flags. The remaining items of fsxattr
+ * are zeroed.
+ */
+static int btrfs_ioctl_fsgetxattr(struct file *file, void __user *arg)
+{
+       struct btrfs_inode *binode = BTRFS_I(file_inode(file));
+       struct fsxattr fa;
+
+       memset(&fa, 0, sizeof(fa));
+       fa.fsx_xflags = btrfs_inode_flags_to_xflags(binode->flags);
+
+       if (copy_to_user(arg, &fa, sizeof(fa)))
+               return -EFAULT;
+
+       return 0;
+}
+
+/*
+ * Setter counterpart of btrfs_ioctl_fsgetxattr(): read a struct fsxattr from
+ * user space and apply its fsx_xflags to the btrfs internal inode flags.
+ *
+ * Only the xflags accepted by check_xflags() are supported, and fsx_extsize,
+ * fsx_projid and fsx_cowextsize must all be zero; otherwise -EOPNOTSUPP is
+ * returned. The caller must pass inode_owner_or_capable() (-EPERM) and the
+ * root must not be read-only (-EROFS).
+ *
+ * The new flags are written to binode->flags under the inode lock, mirrored
+ * into inode->i_flags and stored with btrfs_update_inode() inside a
+ * transaction. If any step after the flags were saved fails, both
+ * binode->flags and inode->i_flags are restored to their previous values.
+ *
+ * Returns 0 on success or a negative errno.
+ */
+static int btrfs_ioctl_fssetxattr(struct file *file, void __user *arg)
+{
+       struct inode *inode = file_inode(file);
+       struct btrfs_inode *binode = BTRFS_I(inode);
+       struct btrfs_root *root = binode->root;
+       struct btrfs_trans_handle *trans;
+       struct fsxattr fa;
+       unsigned old_flags;
+       unsigned old_i_flags;
+       int ret = 0;
+
+       if (!inode_owner_or_capable(inode))
+               return -EPERM;
+
+       if (btrfs_root_readonly(root))
+               return -EROFS;
+
+       memset(&fa, 0, sizeof(fa));
+       if (copy_from_user(&fa, arg, sizeof(fa)))
+               return -EFAULT;
+
+       ret = check_xflags(fa.fsx_xflags);
+       if (ret)
+               return ret;
+
+       /* The non-flag members of fsxattr are not supported and must be 0 */
+       if (fa.fsx_extsize != 0 || fa.fsx_projid != 0 || fa.fsx_cowextsize != 0)
+               return -EOPNOTSUPP;
+
+       ret = mnt_want_write_file(file);
+       if (ret)
+               return ret;
+
+       inode_lock(inode);
+
+       /* Saved so that the error path below can roll both fields back */
+       old_flags = binode->flags;
+       old_i_flags = inode->i_flags;
+
+       /* We need the capabilities to change append-only or immutable inode */
+       if (((old_flags & (BTRFS_INODE_APPEND | BTRFS_INODE_IMMUTABLE)) ||
+            (fa.fsx_xflags & (FS_XFLAG_APPEND | FS_XFLAG_IMMUTABLE))) &&
+           !capable(CAP_LINUX_IMMUTABLE)) {
+               ret = -EPERM;
+               goto out_unlock;
+       }
+
+       /* Each supported flag is set or cleared to match the request */
+       if (fa.fsx_xflags & FS_XFLAG_SYNC)
+               binode->flags |= BTRFS_INODE_SYNC;
+       else
+               binode->flags &= ~BTRFS_INODE_SYNC;
+       if (fa.fsx_xflags & FS_XFLAG_IMMUTABLE)
+               binode->flags |= BTRFS_INODE_IMMUTABLE;
+       else
+               binode->flags &= ~BTRFS_INODE_IMMUTABLE;
+       if (fa.fsx_xflags & FS_XFLAG_APPEND)
+               binode->flags |= BTRFS_INODE_APPEND;
+       else
+               binode->flags &= ~BTRFS_INODE_APPEND;
+       if (fa.fsx_xflags & FS_XFLAG_NODUMP)
+               binode->flags |= BTRFS_INODE_NODUMP;
+       else
+               binode->flags &= ~BTRFS_INODE_NODUMP;
+       if (fa.fsx_xflags & FS_XFLAG_NOATIME)
+               binode->flags |= BTRFS_INODE_NOATIME;
+       else
+               binode->flags &= ~BTRFS_INODE_NOATIME;
+
+       /* 1 item for the inode */
+       trans = btrfs_start_transaction(root, 1);
+       if (IS_ERR(trans)) {
+               ret = PTR_ERR(trans);
+               goto out_unlock;
+       }
+
+       btrfs_sync_inode_flags_to_i_flags(inode);
+       inode_inc_iversion(inode);
+       inode->i_ctime = current_time(inode);
+       ret = btrfs_update_inode(trans, root, inode);
+
+       btrfs_end_transaction(trans);
+
+out_unlock:
+       /* Undo the in-memory flag changes if anything above failed */
+       if (ret) {
+               binode->flags = old_flags;
+               inode->i_flags = old_i_flags;
+       }
+
+       inode_unlock(inode);
+       mnt_drop_write_file(file);
+
+       return ret;
+}
+
 static int btrfs_ioctl_getversion(struct file *file, int __user *arg)
 {
        struct inode *inode = file_inode(file);
@@ -424,7 +569,6 @@ static noinline int create_subvol(struct inode *dir,
        u64 objectid;
        u64 new_dirid = BTRFS_FIRST_FREE_OBJECTID;
        u64 index = 0;
-       u64 qgroup_reserved;
        uuid_le new_uuid;
 
        root_item = kzalloc(sizeof(*root_item), GFP_KERNEL);
@@ -449,8 +593,7 @@ static noinline int create_subvol(struct inode *dir,
         * The same as the snapshot creation, please see the comment
         * of create_snapshot().
         */
-       ret = btrfs_subvolume_reserve_metadata(root, &block_rsv,
-                                              8, &qgroup_reserved, false);
+       ret = btrfs_subvolume_reserve_metadata(root, &block_rsv, 8, false);
        if (ret)
                goto fail_free;
 
@@ -573,7 +716,7 @@ static noinline int create_subvol(struct inode *dir,
                                 btrfs_ino(BTRFS_I(dir)), index, name, namelen);
        BUG_ON(ret);
 
-       ret = btrfs_uuid_tree_add(trans, fs_info, root_item->uuid,
+       ret = btrfs_uuid_tree_add(trans, root_item->uuid,
                                  BTRFS_UUID_KEY_SUBVOL, objectid);
        if (ret)
                btrfs_abort_transaction(trans, ret);
@@ -640,7 +783,7 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir,
        wait_event(root->subv_writers->wait,
                   percpu_counter_sum(&root->subv_writers->counter) == 0);
 
-       ret = btrfs_start_delalloc_inodes(root, 0);
+       ret = btrfs_start_delalloc_inodes(root);
        if (ret)
                goto dec_and_free;
 
@@ -658,7 +801,6 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir,
         */
        ret = btrfs_subvolume_reserve_metadata(BTRFS_I(dir)->root,
                                        &pending_snapshot->block_rsv, 8,
-                                       &pending_snapshot->qgroup_reserved,
                                        false);
        if (ret)
                goto dec_and_free;
@@ -1457,7 +1599,6 @@ static noinline int btrfs_ioctl_resize(struct file *file,
                return BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
        }
 
-       mutex_lock(&fs_info->volume_mutex);
        vol_args = memdup_user(arg, sizeof(*vol_args));
        if (IS_ERR(vol_args)) {
                ret = PTR_ERR(vol_args);
@@ -1565,7 +1706,6 @@ static noinline int btrfs_ioctl_resize(struct file *file,
 out_free:
        kfree(vol_args);
 out:
-       mutex_unlock(&fs_info->volume_mutex);
        clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
        mnt_drop_write_file(file);
        return ret;
@@ -1832,60 +1972,6 @@ out:
        return ret;
 }
 
-/*
- * helper to check if the subvolume references other subvolumes
- */
-static noinline int may_destroy_subvol(struct btrfs_root *root)
-{
-       struct btrfs_fs_info *fs_info = root->fs_info;
-       struct btrfs_path *path;
-       struct btrfs_dir_item *di;
-       struct btrfs_key key;
-       u64 dir_id;
-       int ret;
-
-       path = btrfs_alloc_path();
-       if (!path)
-               return -ENOMEM;
-
-       /* Make sure this root isn't set as the default subvol */
-       dir_id = btrfs_super_root_dir(fs_info->super_copy);
-       di = btrfs_lookup_dir_item(NULL, fs_info->tree_root, path,
-                                  dir_id, "default", 7, 0);
-       if (di && !IS_ERR(di)) {
-               btrfs_dir_item_key_to_cpu(path->nodes[0], di, &key);
-               if (key.objectid == root->root_key.objectid) {
-                       ret = -EPERM;
-                       btrfs_err(fs_info,
-                                 "deleting default subvolume %llu is not allowed",
-                                 key.objectid);
-                       goto out;
-               }
-               btrfs_release_path(path);
-       }
-
-       key.objectid = root->root_key.objectid;
-       key.type = BTRFS_ROOT_REF_KEY;
-       key.offset = (u64)-1;
-
-       ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
-       if (ret < 0)
-               goto out;
-       BUG_ON(ret == 0);
-
-       ret = 0;
-       if (path->slots[0] > 0) {
-               path->slots[0]--;
-               btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
-               if (key.objectid == root->root_key.objectid &&
-                   key.type == BTRFS_ROOT_REF_KEY)
-                       ret = -ENOTEMPTY;
-       }
-out:
-       btrfs_free_path(path);
-       return ret;
-}
-
 static noinline int key_in_sk(struct btrfs_key *key,
                              struct btrfs_ioctl_search_key *sk)
 {
@@ -2066,7 +2152,7 @@ static noinline int search_ioctl(struct inode *inode,
                root = btrfs_read_fs_root_no_name(info, &key);
                if (IS_ERR(root)) {
                        btrfs_free_path(path);
-                       return -ENOENT;
+                       return PTR_ERR(root);
                }
        }
 
@@ -2200,8 +2286,7 @@ static noinline int btrfs_search_path_in_tree(struct btrfs_fs_info *info,
        key.offset = (u64)-1;
        root = btrfs_read_fs_root_no_name(info, &key);
        if (IS_ERR(root)) {
-               btrfs_err(info, "could not find root %llu", tree_id);
-               ret = -ENOENT;
+               ret = PTR_ERR(root);
                goto out;
        }
 
@@ -2237,64 +2322,482 @@ static noinline int btrfs_search_path_in_tree(struct btrfs_fs_info *info,
                        goto out;
                }
 
-               *(ptr + len) = '/';
-               read_extent_buffer(l, ptr, (unsigned long)(iref + 1), len);
-
-               if (key.offset == BTRFS_FIRST_FREE_OBJECTID)
-                       break;
+               *(ptr + len) = '/';
+               read_extent_buffer(l, ptr, (unsigned long)(iref + 1), len);
+
+               if (key.offset == BTRFS_FIRST_FREE_OBJECTID)
+                       break;
+
+               btrfs_release_path(path);
+               key.objectid = key.offset;
+               key.offset = (u64)-1;
+               dirid = key.objectid;
+       }
+       memmove(name, ptr, total_len);
+       name[total_len] = '\0';
+       ret = 0;
+out:
+       btrfs_free_path(path);
+       return ret;
+}
+
+/*
+ * Path lookup helper for the unprivileged ino_lookup ioctl
+ * (btrfs_ioctl_ino_lookup_user()).
+ *
+ * Fills @args->path with the path of directory @args->dirid relative to
+ * @inode (left untouched when @args->dirid is @inode itself) and fills
+ * @args->name with the name stored in the ROOT_REF item that links @inode's
+ * subvolume to subvolume @args->treeid.
+ *
+ * The path is built backwards from the end of @args->path by following
+ * INODE_REF items upwards until @inode is reached. Every directory visited on
+ * the way must grant read+exec permission to the caller.
+ *
+ * Returns 0 on success or a negative errno:
+ *   -ENOMEM        path allocation failed
+ *   -ENAMETOOLONG  the constructed path does not fit into @args->path
+ *   -ENOENT        a required INODE_REF, INODE_ITEM or ROOT_REF is missing
+ *   -EACCES        permission check failed, or the subvolume's top directory
+ *                  was reached without meeting @inode
+ *   -EINVAL        @args->dirid does not match the dirid in the ROOT_REF
+ * Errors from btrfs_read_fs_root_no_name(), btrfs_search_slot(),
+ * btrfs_previous_item() and btrfs_iget() are passed through.
+ */
+static int btrfs_search_path_in_tree_user(struct inode *inode,
+                               struct btrfs_ioctl_ino_lookup_user_args *args)
+{
+       struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
+       struct super_block *sb = inode->i_sb;
+       struct btrfs_key upper_limit = BTRFS_I(inode)->location;
+       u64 treeid = BTRFS_I(inode)->root->root_key.objectid;
+       u64 dirid = args->dirid;
+       unsigned long item_off;
+       unsigned long item_len;
+       struct btrfs_inode_ref *iref;
+       struct btrfs_root_ref *rref;
+       struct btrfs_root *root;
+       struct btrfs_path *path;
+       struct btrfs_key key, key2;
+       struct extent_buffer *leaf;
+       struct inode *temp_inode;
+       char *ptr;
+       int slot;
+       int len;
+       int total_len = 0;
+       int ret;
+
+       path = btrfs_alloc_path();
+       if (!path)
+               return -ENOMEM;
+
+       /*
+        * If the bottom subvolume does not exist directly under upper_limit,
+        * construct the path from the bottom up.
+        */
+       if (dirid != upper_limit.objectid) {
+               ptr = &args->path[BTRFS_INO_LOOKUP_USER_PATH_MAX - 1];
+
+               key.objectid = treeid;
+               key.type = BTRFS_ROOT_ITEM_KEY;
+               key.offset = (u64)-1;
+               root = btrfs_read_fs_root_no_name(fs_info, &key);
+               if (IS_ERR(root)) {
+                       ret = PTR_ERR(root);
+                       goto out;
+               }
+
+               key.objectid = dirid;
+               key.type = BTRFS_INODE_REF_KEY;
+               key.offset = (u64)-1;
+               while (1) {
+                       ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+                       if (ret < 0) {
+                               goto out;
+                       } else if (ret > 0) {
+                               ret = btrfs_previous_item(root, path, dirid,
+                                                         BTRFS_INODE_REF_KEY);
+                               if (ret < 0) {
+                                       goto out;
+                               } else if (ret > 0) {
+                                       ret = -ENOENT;
+                                       goto out;
+                               }
+                       }
+
+                       leaf = path->nodes[0];
+                       slot = path->slots[0];
+                       btrfs_item_key_to_cpu(leaf, &key, slot);
+
+                       /* Prepend "<name>/" in front of what was built so far */
+                       iref = btrfs_item_ptr(leaf, slot, struct btrfs_inode_ref);
+                       len = btrfs_inode_ref_name_len(leaf, iref);
+                       ptr -= len + 1;
+                       total_len += len + 1;
+                       if (ptr < args->path) {
+                               ret = -ENAMETOOLONG;
+                               goto out;
+                       }
+
+                       *(ptr + len) = '/';
+                       read_extent_buffer(leaf, ptr,
+                                       (unsigned long)(iref + 1), len);
+
+                       /* Check the read+exec permission of this directory */
+                       ret = btrfs_previous_item(root, path, dirid,
+                                                 BTRFS_INODE_ITEM_KEY);
+                       if (ret < 0) {
+                               goto out;
+                       } else if (ret > 0) {
+                               ret = -ENOENT;
+                               goto out;
+                       }
+
+                       leaf = path->nodes[0];
+                       slot = path->slots[0];
+                       btrfs_item_key_to_cpu(leaf, &key2, slot);
+                       if (key2.objectid != dirid) {
+                               ret = -ENOENT;
+                               goto out;
+                       }
+
+                       temp_inode = btrfs_iget(sb, &key2, root, NULL);
+                       if (IS_ERR(temp_inode)) {
+                               /*
+                                * An ERR_PTR must not reach inode_permission()
+                                * or iput(), propagate the lookup error.
+                                */
+                               ret = PTR_ERR(temp_inode);
+                               goto out;
+                       }
+                       ret = inode_permission(temp_inode, MAY_READ | MAY_EXEC);
+                       iput(temp_inode);
+                       if (ret) {
+                               ret = -EACCES;
+                               goto out;
+                       }
+
+                       if (key.offset == upper_limit.objectid)
+                               break;
+                       /* Top of the subvolume reached without meeting inode */
+                       if (key.objectid == BTRFS_FIRST_FREE_OBJECTID) {
+                               ret = -EACCES;
+                               goto out;
+                       }
+
+                       /* Continue with the parent directory */
+                       btrfs_release_path(path);
+                       key.objectid = key.offset;
+                       key.offset = (u64)-1;
+                       dirid = key.objectid;
+               }
+
+               /* Move the path built at the buffer's end to its start */
+               memmove(args->path, ptr, total_len);
+               args->path[total_len] = '\0';
+               btrfs_release_path(path);
+       }
+
+       /* Get the bottom subvolume's name from ROOT_REF */
+       root = fs_info->tree_root;
+       key.objectid = treeid;
+       key.type = BTRFS_ROOT_REF_KEY;
+       key.offset = args->treeid;
+       ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+       if (ret < 0) {
+               goto out;
+       } else if (ret > 0) {
+               ret = -ENOENT;
+               goto out;
+       }
+
+       leaf = path->nodes[0];
+       slot = path->slots[0];
+       btrfs_item_key_to_cpu(leaf, &key, slot);
+
+       item_off = btrfs_item_ptr_offset(leaf, slot);
+       item_len = btrfs_item_size_nr(leaf, slot);
+       /* Check if dirid in ROOT_REF corresponds to passed dirid */
+       rref = btrfs_item_ptr(leaf, slot, struct btrfs_root_ref);
+       if (args->dirid != btrfs_root_ref_dirid(leaf, rref)) {
+               ret = -EINVAL;
+               goto out;
+       }
+
+       /* Copy subvolume's name */
+       item_off += sizeof(struct btrfs_root_ref);
+       item_len -= sizeof(struct btrfs_root_ref);
+       read_extent_buffer(leaf, args->name, item_off, item_len);
+       args->name[item_len] = 0;
+
+out:
+       btrfs_free_path(path);
+       return ret;
+}
+
+static noinline int btrfs_ioctl_ino_lookup(struct file *file,
+                                          void __user *argp)
+{
+        struct btrfs_ioctl_ino_lookup_args *args;
+        struct inode *inode;
+       int ret = 0;
+
+       args = memdup_user(argp, sizeof(*args));
+       if (IS_ERR(args))
+               return PTR_ERR(args);
+
+       inode = file_inode(file);
+
+       /*
+        * Unprivileged query to obtain the containing subvolume root id. The
+        * path is reset so it's consistent with btrfs_search_path_in_tree.
+        */
+       if (args->treeid == 0)
+               args->treeid = BTRFS_I(inode)->root->root_key.objectid;
+
+       if (args->objectid == BTRFS_FIRST_FREE_OBJECTID) {
+               args->name[0] = 0;
+               goto out;
+       }
+
+       if (!capable(CAP_SYS_ADMIN)) {
+               ret = -EPERM;
+               goto out;
+       }
+
+       ret = btrfs_search_path_in_tree(BTRFS_I(inode)->root->fs_info,
+                                       args->treeid, args->objectid,
+                                       args->name);
+
+out:
+       if (ret == 0 && copy_to_user(argp, args, sizeof(*args)))
+               ret = -EFAULT;
+
+       kfree(args);
+       return ret;
+}
+
+/*
+ * Version of ino_lookup ioctl (unprivileged)
+ *
+ * The main differences from ino_lookup ioctl are:
+ *
+ *   1. Read + Exec permission will be checked using inode_permission() during
+ *      path construction. -EACCES will be returned in case of failure.
+ *   2. Path construction will be stopped at the inode number which corresponds
+ *      to the fd with which this ioctl is called. If constructed path does not
+ *      exist under fd's inode, -EACCES will be returned.
+ *   3. The name of bottom subvolume is also searched and filled.
+ */
+static int btrfs_ioctl_ino_lookup_user(struct file *file, void __user *argp)
+{
+       struct btrfs_ioctl_ino_lookup_user_args *args;
+       struct inode *inode;
+       int ret;
+
+       args = memdup_user(argp, sizeof(*args));
+       if (IS_ERR(args))
+               return PTR_ERR(args);
+
+       inode = file_inode(file);
+
+       if (args->dirid == BTRFS_FIRST_FREE_OBJECTID &&
+           BTRFS_I(inode)->location.objectid != BTRFS_FIRST_FREE_OBJECTID) {
+               /*
+                * The subvolume does not exist under fd with which this is
+                * called
+                */
+               kfree(args);
+               return -EACCES;
+       }
+
+       ret = btrfs_search_path_in_tree_user(inode, args);
+
+       if (ret == 0 && copy_to_user(argp, args, sizeof(*args)))
+               ret = -EFAULT;
+
+       kfree(args);
+       return ret;
+}
+
+/* Get the subvolume information in BTRFS_ROOT_ITEM and BTRFS_ROOT_BACKREF */
+static int btrfs_ioctl_get_subvol_info(struct file *file, void __user *argp)
+{
+       struct btrfs_ioctl_get_subvol_info_args *subvol_info;
+       struct btrfs_fs_info *fs_info;
+       struct btrfs_root *root;
+       struct btrfs_path *path;
+       struct btrfs_key key;
+       struct btrfs_root_item *root_item;
+       struct btrfs_root_ref *rref;
+       struct extent_buffer *leaf;
+       unsigned long item_off;
+       unsigned long item_len;
+       struct inode *inode;
+       int slot;
+       int ret = 0;
+
+       path = btrfs_alloc_path();
+       if (!path)
+               return -ENOMEM;
+
+       subvol_info = kzalloc(sizeof(*subvol_info), GFP_KERNEL);
+       if (!subvol_info) {
+               btrfs_free_path(path);
+               return -ENOMEM;
+       }
+
+       inode = file_inode(file);
+       fs_info = BTRFS_I(inode)->root->fs_info;
+
+       /* Get root_item of inode's subvolume */
+       key.objectid = BTRFS_I(inode)->root->root_key.objectid;
+       key.type = BTRFS_ROOT_ITEM_KEY;
+       key.offset = (u64)-1;
+       root = btrfs_read_fs_root_no_name(fs_info, &key);
+       if (IS_ERR(root)) {
+               ret = PTR_ERR(root);
+               goto out;
+       }
+       root_item = &root->root_item;
+
+       subvol_info->treeid = key.objectid;
+
+       subvol_info->generation = btrfs_root_generation(root_item);
+       subvol_info->flags = btrfs_root_flags(root_item);
+
+       memcpy(subvol_info->uuid, root_item->uuid, BTRFS_UUID_SIZE);
+       memcpy(subvol_info->parent_uuid, root_item->parent_uuid,
+                                                   BTRFS_UUID_SIZE);
+       memcpy(subvol_info->received_uuid, root_item->received_uuid,
+                                                   BTRFS_UUID_SIZE);
+
+       subvol_info->ctransid = btrfs_root_ctransid(root_item);
+       subvol_info->ctime.sec = btrfs_stack_timespec_sec(&root_item->ctime);
+       subvol_info->ctime.nsec = btrfs_stack_timespec_nsec(&root_item->ctime);
+
+       subvol_info->otransid = btrfs_root_otransid(root_item);
+       subvol_info->otime.sec = btrfs_stack_timespec_sec(&root_item->otime);
+       subvol_info->otime.nsec = btrfs_stack_timespec_nsec(&root_item->otime);
+
+       subvol_info->stransid = btrfs_root_stransid(root_item);
+       subvol_info->stime.sec = btrfs_stack_timespec_sec(&root_item->stime);
+       subvol_info->stime.nsec = btrfs_stack_timespec_nsec(&root_item->stime);
+
+       subvol_info->rtransid = btrfs_root_rtransid(root_item);
+       subvol_info->rtime.sec = btrfs_stack_timespec_sec(&root_item->rtime);
+       subvol_info->rtime.nsec = btrfs_stack_timespec_nsec(&root_item->rtime);
+
+       if (key.objectid != BTRFS_FS_TREE_OBJECTID) {
+               /* Search root tree for ROOT_BACKREF of this subvolume */
+               root = fs_info->tree_root;
+
+               key.type = BTRFS_ROOT_BACKREF_KEY;
+               key.offset = 0;
+               ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+               if (ret < 0) {
+                       goto out;
+               } else if (path->slots[0] >=
+                          btrfs_header_nritems(path->nodes[0])) {
+                       ret = btrfs_next_leaf(root, path);
+                       if (ret < 0) {
+                               goto out;
+                       } else if (ret > 0) {
+                               ret = -EUCLEAN;
+                               goto out;
+                       }
+               }
+
+               leaf = path->nodes[0];
+               slot = path->slots[0];
+               btrfs_item_key_to_cpu(leaf, &key, slot);
+               if (key.objectid == subvol_info->treeid &&
+                   key.type == BTRFS_ROOT_BACKREF_KEY) {
+                       subvol_info->parent_id = key.offset;
+
+                       rref = btrfs_item_ptr(leaf, slot, struct btrfs_root_ref);
+                       subvol_info->dirid = btrfs_root_ref_dirid(leaf, rref);
+
+                       item_off = btrfs_item_ptr_offset(leaf, slot)
+                                       + sizeof(struct btrfs_root_ref);
+                       item_len = btrfs_item_size_nr(leaf, slot)
+                                       - sizeof(struct btrfs_root_ref);
+                       read_extent_buffer(leaf, subvol_info->name,
+                                          item_off, item_len);
+               } else {
+                       ret = -ENOENT;
+                       goto out;
+               }
+       }
+
+       if (copy_to_user(argp, subvol_info, sizeof(*subvol_info)))
+               ret = -EFAULT;
 
-               btrfs_release_path(path);
-               key.objectid = key.offset;
-               key.offset = (u64)-1;
-               dirid = key.objectid;
-       }
-       memmove(name, ptr, total_len);
-       name[total_len] = '\0';
-       ret = 0;
 out:
        btrfs_free_path(path);
+       kzfree(subvol_info);
        return ret;
 }
 
-static noinline int btrfs_ioctl_ino_lookup(struct file *file,
-                                          void __user *argp)
+/*
+ * Return ROOT_REF information (treeid and dirid, but not the name) found in the
+ * tree root for the subvolume containing this inode, from min_treeid onwards.
+ */
+static int btrfs_ioctl_get_subvol_rootref(struct file *file, void __user *argp)
 {
-        struct btrfs_ioctl_ino_lookup_args *args;
-        struct inode *inode;
-       int ret = 0;
+       struct btrfs_ioctl_get_subvol_rootref_args *rootrefs;
+       struct btrfs_root_ref *rref;
+       struct btrfs_root *root;
+       struct btrfs_path *path;
+       struct btrfs_key key;
+       struct extent_buffer *leaf;
+       struct inode *inode;
+       u64 objectid;
+       int slot;
+       int ret;
+       u8 found;
 
-       args = memdup_user(argp, sizeof(*args));
-       if (IS_ERR(args))
-               return PTR_ERR(args);
+       path = btrfs_alloc_path();
+       if (!path)
+               return -ENOMEM;
+
+       rootrefs = memdup_user(argp, sizeof(*rootrefs));
+       if (IS_ERR(rootrefs)) {
+               btrfs_free_path(path);
+               return PTR_ERR(rootrefs);
+       }
 
        inode = file_inode(file);
+       root = BTRFS_I(inode)->root->fs_info->tree_root;
+       objectid = BTRFS_I(inode)->root->root_key.objectid;
 
-       /*
-        * Unprivileged query to obtain the containing subvolume root id. The
-        * path is reset so it's consistent with btrfs_search_path_in_tree.
-        */
-       if (args->treeid == 0)
-               args->treeid = BTRFS_I(inode)->root->root_key.objectid;
+       key.objectid = objectid;
+       key.type = BTRFS_ROOT_REF_KEY;
+       key.offset = rootrefs->min_treeid; /* start offset supplied by the caller */
+       found = 0;
 
-       if (args->objectid == BTRFS_FIRST_FREE_OBJECTID) {
-               args->name[0] = 0;
+       ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+       if (ret < 0) {
                goto out;
+       } else if (path->slots[0] >=
+                  btrfs_header_nritems(path->nodes[0])) {
+               ret = btrfs_next_leaf(root, path);
+               if (ret < 0) {
+                       goto out;
+               } else if (ret > 0) {
+                       ret = -EUCLEAN;
+                       goto out;
+               }
        }
+       while (1) {
+               leaf = path->nodes[0];
+               slot = path->slots[0];
 
-       if (!capable(CAP_SYS_ADMIN)) {
-               ret = -EPERM;
-               goto out;
-       }
+               btrfs_item_key_to_cpu(leaf, &key, slot);
+               if (key.objectid != objectid || key.type != BTRFS_ROOT_REF_KEY) {
+                       ret = 0; /* walked past this subvolume's ROOT_REF items */
+                       goto out;
+               }
 
-       ret = btrfs_search_path_in_tree(BTRFS_I(inode)->root->fs_info,
-                                       args->treeid, args->objectid,
-                                       args->name);
+               if (found == BTRFS_MAX_ROOTREF_BUFFER_NUM) {
+                       ret = -EOVERFLOW; /* buffer is full and another match exists */
+                       goto out;
+               }
+
+               rref = btrfs_item_ptr(leaf, slot, struct btrfs_root_ref);
+               rootrefs->rootref[found].treeid = key.offset;
+               rootrefs->rootref[found].dirid =
+                                 btrfs_root_ref_dirid(leaf, rref);
+               found++;
+
+               ret = btrfs_next_item(root, path);
+               if (ret < 0) {
+                       goto out;
+               } else if (ret > 0) {
+                       ret = -EUCLEAN;
+                       goto out;
+               }
+       }
 
 out:
-       if (ret == 0 && copy_to_user(argp, args, sizeof(*args)))
-               ret = -EFAULT;
+       if (!ret || ret == -EOVERFLOW) {
+               rootrefs->num_items = found;
+               /* Advance min_treeid past the last entry returned, for the next search */
+               if (found)
+                       rootrefs->min_treeid =
+                               rootrefs->rootref[found - 1].treeid + 1;
+               if (copy_to_user(argp, rootrefs, sizeof(*rootrefs)))
+                       ret = -EFAULT;
+       }
+
+       kfree(rootrefs);
+       btrfs_free_path(path);
 
-       kfree(args);
        return ret;
 }
 
@@ -2309,12 +2812,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
        struct btrfs_root *root = BTRFS_I(dir)->root;
        struct btrfs_root *dest = NULL;
        struct btrfs_ioctl_vol_args *vol_args;
-       struct btrfs_trans_handle *trans;
-       struct btrfs_block_rsv block_rsv;
-       u64 root_flags;
-       u64 qgroup_reserved;
        int namelen;
-       int ret;
        int err = 0;
 
        if (!S_ISDIR(dir->i_mode))
@@ -2398,133 +2896,11 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
        }
 
        inode_lock(inode);
-
-       /*
-        * Don't allow to delete a subvolume with send in progress. This is
-        * inside the i_mutex so the error handling that has to drop the bit
-        * again is not run concurrently.
-        */
-       spin_lock(&dest->root_item_lock);
-       root_flags = btrfs_root_flags(&dest->root_item);
-       if (dest->send_in_progress == 0) {
-               btrfs_set_root_flags(&dest->root_item,
-                               root_flags | BTRFS_ROOT_SUBVOL_DEAD);
-               spin_unlock(&dest->root_item_lock);
-       } else {
-               spin_unlock(&dest->root_item_lock);
-               btrfs_warn(fs_info,
-                          "Attempt to delete subvolume %llu during send",
-                          dest->root_key.objectid);
-               err = -EPERM;
-               goto out_unlock_inode;
-       }
-
-       down_write(&fs_info->subvol_sem);
-
-       err = may_destroy_subvol(dest);
-       if (err)
-               goto out_up_write;
-
-       btrfs_init_block_rsv(&block_rsv, BTRFS_BLOCK_RSV_TEMP);
-       /*
-        * One for dir inode, two for dir entries, two for root
-        * ref/backref.
-        */
-       err = btrfs_subvolume_reserve_metadata(root, &block_rsv,
-                                              5, &qgroup_reserved, true);
-       if (err)
-               goto out_up_write;
-
-       trans = btrfs_start_transaction(root, 0);
-       if (IS_ERR(trans)) {
-               err = PTR_ERR(trans);
-               goto out_release;
-       }
-       trans->block_rsv = &block_rsv;
-       trans->bytes_reserved = block_rsv.size;
-
-       btrfs_record_snapshot_destroy(trans, BTRFS_I(dir));
-
-       ret = btrfs_unlink_subvol(trans, root, dir,
-                               dest->root_key.objectid,
-                               dentry->d_name.name,
-                               dentry->d_name.len);
-       if (ret) {
-               err = ret;
-               btrfs_abort_transaction(trans, ret);
-               goto out_end_trans;
-       }
-
-       btrfs_record_root_in_trans(trans, dest);
-
-       memset(&dest->root_item.drop_progress, 0,
-               sizeof(dest->root_item.drop_progress));
-       dest->root_item.drop_level = 0;
-       btrfs_set_root_refs(&dest->root_item, 0);
-
-       if (!test_and_set_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &dest->state)) {
-               ret = btrfs_insert_orphan_item(trans,
-                                       fs_info->tree_root,
-                                       dest->root_key.objectid);
-               if (ret) {
-                       btrfs_abort_transaction(trans, ret);
-                       err = ret;
-                       goto out_end_trans;
-               }
-       }
-
-       ret = btrfs_uuid_tree_rem(trans, fs_info, dest->root_item.uuid,
-                                 BTRFS_UUID_KEY_SUBVOL,
-                                 dest->root_key.objectid);
-       if (ret && ret != -ENOENT) {
-               btrfs_abort_transaction(trans, ret);
-               err = ret;
-               goto out_end_trans;
-       }
-       if (!btrfs_is_empty_uuid(dest->root_item.received_uuid)) {
-               ret = btrfs_uuid_tree_rem(trans, fs_info,
-                                         dest->root_item.received_uuid,
-                                         BTRFS_UUID_KEY_RECEIVED_SUBVOL,
-                                         dest->root_key.objectid);
-               if (ret && ret != -ENOENT) {
-                       btrfs_abort_transaction(trans, ret);
-                       err = ret;
-                       goto out_end_trans;
-               }
-       }
-
-out_end_trans:
-       trans->block_rsv = NULL;
-       trans->bytes_reserved = 0;
-       ret = btrfs_end_transaction(trans);
-       if (ret && !err)
-               err = ret;
-       inode->i_flags |= S_DEAD;
-out_release:
-       btrfs_subvolume_release_metadata(fs_info, &block_rsv);
-out_up_write:
-       up_write(&fs_info->subvol_sem);
-       if (err) {
-               spin_lock(&dest->root_item_lock);
-               root_flags = btrfs_root_flags(&dest->root_item);
-               btrfs_set_root_flags(&dest->root_item,