Merge tag 'for-4.17-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index e0460d7b562273287b5f6adc2ec1f97d928993b4..e08d0d45af4f3d31513107cbb4deaa7d5c244c24 100644
@@ -27,7 +27,7 @@
 #include <linux/ratelimit.h>
 #include <linux/percpu_counter.h>
 #include <linux/lockdep.h>
-#include "hash.h"
+#include <linux/crc32c.h>
 #include "tree-log.h"
 #include "disk-io.h"
 #include "print-tree.h"
@@ -535,13 +535,11 @@ static noinline void caching_thread(struct btrfs_work *work)
        struct btrfs_block_group_cache *block_group;
        struct btrfs_fs_info *fs_info;
        struct btrfs_caching_control *caching_ctl;
-       struct btrfs_root *extent_root;
        int ret;
 
        caching_ctl = container_of(work, struct btrfs_caching_control, work);
        block_group = caching_ctl->block_group;
        fs_info = block_group->fs_info;
-       extent_root = fs_info->extent_root;
 
        mutex_lock(&caching_ctl->mutex);
        down_read(&fs_info->commit_root_sem);
@@ -1203,11 +1201,11 @@ static u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset)
        __le64 lenum;
 
        lenum = cpu_to_le64(root_objectid);
-       high_crc = btrfs_crc32c(high_crc, &lenum, sizeof(lenum));
+       high_crc = crc32c(high_crc, &lenum, sizeof(lenum));
        lenum = cpu_to_le64(owner);
-       low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum));
+       low_crc = crc32c(low_crc, &lenum, sizeof(lenum));
        lenum = cpu_to_le64(offset);
-       low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum));
+       low_crc = crc32c(low_crc, &lenum, sizeof(lenum));
 
        return ((u64)high_crc << 31) ^ (u64)low_crc;
 }
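The hunk above only swaps the btrfs-private btrfs_crc32c() wrapper for the generic crc32c() from <linux/crc32c.h>; the hash itself is unchanged: one CRC32C accumulator over the little-endian root objectid, a second over owner and offset, folded into a 64-bit key as ((u64)high_crc << 31) ^ low_crc. Below is a minimal userspace sketch of that hash. The bitwise crc32c_sw() helper is only a stand-in for the kernel's crc32c(), and the sketch assumes a little-endian host so the cpu_to_le64() conversions become plain assignments.

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/* Bitwise CRC32C (Castagnoli, reflected polynomial 0x82F63B78); a stand-in
 * for the kernel's crc32c() so the sketch builds in userspace.  The seed is
 * used as-is and no final inversion is applied; the caller supplies ~0,
 * just like the kernel code above. */
static uint32_t crc32c_sw(uint32_t crc, const void *buf, size_t len)
{
	const uint8_t *p = buf;

	while (len--) {
		crc ^= *p++;
		for (int i = 0; i < 8; i++)
			crc = (crc >> 1) ^ (0x82F63B78U & -(crc & 1));
	}
	return crc;
}

/* Mirrors hash_extent_data_ref(): one CRC32C accumulator over the root
 * objectid, another over owner and offset, folded into a 64-bit key.
 * Assumes a little-endian host, so cpu_to_le64() becomes a plain store. */
static uint64_t hash_extent_data_ref(uint64_t root_objectid, uint64_t owner,
				     uint64_t offset)
{
	uint32_t high_crc = 0xffffffffU;
	uint32_t low_crc = 0xffffffffU;
	uint64_t lenum;

	lenum = root_objectid;
	high_crc = crc32c_sw(high_crc, &lenum, sizeof(lenum));
	lenum = owner;
	low_crc = crc32c_sw(low_crc, &lenum, sizeof(lenum));
	lenum = offset;
	low_crc = crc32c_sw(low_crc, &lenum, sizeof(lenum));

	return ((uint64_t)high_crc << 31) ^ (uint64_t)low_crc;
}

int main(void)
{
	printf("%llx\n", (unsigned long long)hash_extent_data_ref(5, 257, 0));
	return 0;
}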
@@ -2652,9 +2650,9 @@ static int cleanup_ref_head(struct btrfs_trans_handle *trans,
  * Returns -ENOMEM or -EIO on failure and will abort the transaction.
  */
 static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
-                                            struct btrfs_fs_info *fs_info,
                                             unsigned long nr)
 {
+       struct btrfs_fs_info *fs_info = trans->fs_info;
        struct btrfs_delayed_ref_root *delayed_refs;
        struct btrfs_delayed_ref_node *ref;
        struct btrfs_delayed_ref_head *locked_ref = NULL;
@@ -2994,7 +2992,7 @@ static void delayed_ref_async_start(struct btrfs_work *work)
        if (trans->transid > async->transid)
                goto end;
 
-       ret = btrfs_run_delayed_refs(trans, fs_info, async->count);
+       ret = btrfs_run_delayed_refs(trans, async->count);
        if (ret)
                async->error = ret;
 end:
@@ -3053,8 +3051,9 @@ int btrfs_async_run_delayed_refs(struct btrfs_fs_info *fs_info,
  * Returns <0 on error and aborts the transaction
  */
 int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
-                          struct btrfs_fs_info *fs_info, unsigned long count)
+                          unsigned long count)
 {
+       struct btrfs_fs_info *fs_info = trans->fs_info;
        struct rb_node *node;
        struct btrfs_delayed_ref_root *delayed_refs;
        struct btrfs_delayed_ref_head *head;
@@ -3078,7 +3077,7 @@ again:
        delayed_refs->run_delayed_start = find_middle(&delayed_refs->root);
 #endif
        trans->can_flush_pending_bgs = false;
-       ret = __btrfs_run_delayed_refs(trans, fs_info, count);
+       ret = __btrfs_run_delayed_refs(trans, count);
        if (ret < 0) {
                btrfs_abort_transaction(trans, ret);
                return ret;
@@ -3086,7 +3085,7 @@ again:
 
        if (run_all) {
                if (!list_empty(&trans->new_bgs))
-                       btrfs_create_pending_block_groups(trans, fs_info);
+                       btrfs_create_pending_block_groups(trans);
 
                spin_lock(&delayed_refs->lock);
                node = rb_first(&delayed_refs->href_root);
@@ -3660,9 +3659,9 @@ int btrfs_setup_space_cache(struct btrfs_trans_handle *trans,
  * the commit latency by getting rid of the easy block groups while
  * we're still allowing others to join the commit.
  */
-int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans,
-                                  struct btrfs_fs_info *fs_info)
+int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans)
 {
+       struct btrfs_fs_info *fs_info = trans->fs_info;
        struct btrfs_block_group_cache *cache;
        struct btrfs_transaction *cur_trans = trans->transaction;
        int ret = 0;
@@ -3686,7 +3685,7 @@ again:
         * make sure all the block groups on our dirty list actually
         * exist
         */
-       btrfs_create_pending_block_groups(trans, fs_info);
+       btrfs_create_pending_block_groups(trans);
 
        if (!path) {
                path = btrfs_alloc_path();
@@ -3741,8 +3740,9 @@ again:
                                should_put = 0;
 
                                /*
-                                * the cache_write_mutex is protecting
-                                * the io_list
+                                * The cache_write_mutex is protecting the
+                                * io_list; also refer to the definition of
+                                * btrfs_transaction::io_bgs for more details
                                 */
                                list_add_tail(&cache->io_list, io);
                        } else {
@@ -3800,7 +3800,7 @@ again:
         * go through delayed refs for all the stuff we've just kicked off
         * and then loop back (just once)
         */
-       ret = btrfs_run_delayed_refs(trans, fs_info, 0);
+       ret = btrfs_run_delayed_refs(trans, 0);
        if (!ret && loops == 0) {
                loops++;
                spin_lock(&cur_trans->dirty_bgs_lock);
@@ -3882,7 +3882,7 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
                cache_save_setup(cache, trans, path);
 
                if (!ret)
-                       ret = btrfs_run_delayed_refs(trans, fs_info,
+                       ret = btrfs_run_delayed_refs(trans,
                                                     (unsigned long) -1);
 
                if (!ret && cache->disk_cache_state == BTRFS_DC_SETUP) {
@@ -3934,6 +3934,10 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
        }
        spin_unlock(&cur_trans->dirty_bgs_lock);
 
+       /*
+        * Refer to the definition of the io_bgs member for details on why
+        * it is safe to use it without any locking
+        */
        while (!list_empty(io)) {
                cache = list_first_entry(io, struct btrfs_block_group_cache,
                                         io_list);
@@ -4332,8 +4336,7 @@ again:
 
                /* commit the current transaction and try again */
 commit_trans:
-               if (need_commit &&
-                   !atomic_read(&fs_info->open_ioctl_trans)) {
+               if (need_commit) {
                        need_commit--;
 
                        if (need_commit > 0) {
@@ -4541,7 +4544,7 @@ void check_system_chunk(struct btrfs_trans_handle *trans,
         * Needed because we can end up allocating a system chunk and for an
         * atomic and race free space reservation in the chunk block reserve.
         */
-       ASSERT(mutex_is_locked(&fs_info->chunk_mutex));
+       lockdep_assert_held(&fs_info->chunk_mutex);
 
        info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
        spin_lock(&info->lock);
@@ -4602,11 +4605,7 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
                return -ENOSPC;
 
        space_info = __find_space_info(fs_info, flags);
-       if (!space_info) {
-               ret = create_space_info(fs_info, flags, &space_info);
-               if (ret)
-                       return ret;
-       }
+       ASSERT(space_info);
 
 again:
        spin_lock(&space_info->lock);
@@ -4705,7 +4704,7 @@ out:
         */
        if (trans->can_flush_pending_bgs &&
            trans->chunk_bytes_reserved >= (u64)SZ_2M) {
-               btrfs_create_pending_block_groups(trans, fs_info);
+               btrfs_create_pending_block_groups(trans);
                btrfs_trans_release_chunk_metadata(trans);
        }
        return ret;
@@ -4826,7 +4825,6 @@ static void shrink_delalloc(struct btrfs_fs_info *fs_info, u64 to_reclaim,
        long time_left;
        unsigned long nr_pages;
        int loops;
-       enum btrfs_reserve_flush_enum flush;
 
        /* Calc the number of the pages we need flush for space reservation */
        items = calc_reclaim_items_nr(fs_info, to_reclaim);
@@ -4867,10 +4865,6 @@ static void shrink_delalloc(struct btrfs_fs_info *fs_info, u64 to_reclaim,
                           atomic_read(&fs_info->async_delalloc_pages) <=
                           (int)max_reclaim);
 skip_async:
-               if (!trans)
-                       flush = BTRFS_RESERVE_FLUSH_ALL;
-               else
-                       flush = BTRFS_RESERVE_NO_FLUSH;
                spin_lock(&space_info->lock);
                if (list_empty(&space_info->tickets) &&
                    list_empty(&space_info->priority_tickets)) {
@@ -4993,7 +4987,7 @@ static void flush_space(struct btrfs_fs_info *fs_info,
                        ret = PTR_ERR(trans);
                        break;
                }
-               ret = btrfs_run_delayed_items_nr(trans, fs_info, nr);
+               ret = btrfs_run_delayed_items_nr(trans, nr);
                btrfs_end_transaction(trans);
                break;
        case FLUSH_DELALLOC:
@@ -5388,10 +5382,15 @@ static int reserve_metadata_bytes(struct btrfs_root *root,
                    !block_rsv_use_bytes(global_rsv, orig_bytes))
                        ret = 0;
        }
-       if (ret == -ENOSPC)
+       if (ret == -ENOSPC) {
                trace_btrfs_space_reservation(fs_info, "space_info:enospc",
                                              block_rsv->space_info->flags,
                                              orig_bytes, 1);
+
+               if (btrfs_test_opt(fs_info, ENOSPC_DEBUG))
+                       dump_space_info(fs_info, block_rsv->space_info,
+                                       orig_bytes, 0);
+       }
        return ret;
 }
 
@@ -5760,6 +5759,9 @@ static int btrfs_inode_rsv_refill(struct btrfs_inode *inode,
        if (num_bytes == 0)
                return 0;
 
+       ret = btrfs_qgroup_reserve_meta_prealloc(root, num_bytes, true);
+       if (ret)
+               return ret;
        ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
        if (!ret) {
                block_rsv_add_bytes(block_rsv, num_bytes, 0);
@@ -5772,11 +5774,15 @@ static int btrfs_inode_rsv_refill(struct btrfs_inode *inode,
 /**
  * btrfs_inode_rsv_release - release any excessive reservation.
  * @inode - the inode we need to release from.
+ * @qgroup_free - free or convert qgroup meta.
+ *   Unlike normal operation, qgroup meta reservation needs to know if we are
+ *   freeing qgroup reservation or just converting it into per-trans.  Normally
+ *   @qgroup_free is true for error handling, and false for normal release.
  *
  * This is the same as btrfs_block_rsv_release, except that it handles the
  * tracepoint for the reservation.
  */
-static void btrfs_inode_rsv_release(struct btrfs_inode *inode)
+static void btrfs_inode_rsv_release(struct btrfs_inode *inode, bool qgroup_free)
 {
        struct btrfs_fs_info *fs_info = inode->root->fs_info;
        struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
@@ -5792,6 +5798,10 @@ static void btrfs_inode_rsv_release(struct btrfs_inode *inode)
        if (released > 0)
                trace_btrfs_space_reservation(fs_info, "delalloc",
                                              btrfs_ino(inode), released, 0);
+       if (qgroup_free)
+               btrfs_qgroup_free_meta_prealloc(inode->root, released);
+       else
+               btrfs_qgroup_convert_reserved_meta(inode->root, released);
 }
 
 void btrfs_block_rsv_release(struct btrfs_fs_info *fs_info,
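The new @qgroup_free argument distinguishes two ways of letting go of the qgroup metadata reservation that btrfs_inode_rsv_refill() now takes out: error paths hand the bytes back outright, while a normal release converts them into a per-transaction reservation, as the comment above describes. A rough userspace model of that accounting split is below; the struct and helper are illustrative only, not the btrfs qgroup API.

#include <stdbool.h>
#include <stdio.h>

/* Toy model of the split that @qgroup_free selects between: PREALLOC bytes
 * are either handed back (error/free path) or moved to the PERTRANS bucket
 * (normal release, the reservation is now owned by the transaction).
 * Field and function names are illustrative, not btrfs API. */
struct qgroup_model {
	unsigned long long meta_prealloc;
	unsigned long long meta_pertrans;
};

static void release_meta(struct qgroup_model *qg, unsigned long long bytes,
			 bool qgroup_free)
{
	qg->meta_prealloc -= bytes;
	if (!qgroup_free)
		qg->meta_pertrans += bytes;	/* convert instead of freeing */
}

int main(void)
{
	struct qgroup_model qg = { .meta_prealloc = 16384 };

	release_meta(&qg, 16384, false);	/* normal path: convert */
	printf("prealloc=%llu pertrans=%llu\n",
	       qg.meta_prealloc, qg.meta_pertrans);
	return 0;
}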
@@ -5892,24 +5902,6 @@ static void release_global_block_rsv(struct btrfs_fs_info *fs_info)
        WARN_ON(fs_info->delayed_block_rsv.reserved > 0);
 }
 
-void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
-                                 struct btrfs_fs_info *fs_info)
-{
-       if (!trans->block_rsv) {
-               ASSERT(!trans->bytes_reserved);
-               return;
-       }
-
-       if (!trans->bytes_reserved)
-               return;
-
-       ASSERT(trans->block_rsv == &fs_info->trans_block_rsv);
-       trace_btrfs_space_reservation(fs_info, "transaction",
-                                     trans->transid, trans->bytes_reserved, 0);
-       btrfs_block_rsv_release(fs_info, trans->block_rsv,
-                               trans->bytes_reserved);
-       trans->bytes_reserved = 0;
-}
 
 /*
  * To be called after all the new block groups attached to the transaction
@@ -5951,7 +5943,7 @@ int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans,
         */
        u64 num_bytes = btrfs_calc_trans_metadata_size(fs_info, 1);
 
-       trace_btrfs_space_reservation(fs_info, "orphan", btrfs_ino(inode), 
+       trace_btrfs_space_reservation(fs_info, "orphan", btrfs_ino(inode),
                        num_bytes, 1);
        return btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes, 1);
 }
@@ -5995,7 +5987,7 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
        if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) {
                /* One for parent inode, two for dir entries */
                num_bytes = 3 * fs_info->nodesize;
-               ret = btrfs_qgroup_reserve_meta(root, num_bytes, true);
+               ret = btrfs_qgroup_reserve_meta_prealloc(root, num_bytes, true);
                if (ret)
                        return ret;
        } else {
@@ -6014,7 +6006,7 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
                ret = btrfs_block_rsv_migrate(global_rsv, rsv, num_bytes, 1);
 
        if (ret && *qgroup_reserved)
-               btrfs_qgroup_free_meta(root, *qgroup_reserved);
+               btrfs_qgroup_free_meta_prealloc(root, *qgroup_reserved);
 
        return ret;
 }
@@ -6051,7 +6043,6 @@ static void btrfs_calculate_inode_block_rsv_size(struct btrfs_fs_info *fs_info,
 int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes)
 {
        struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb);
-       struct btrfs_root *root = inode->root;
        unsigned nr_extents;
        enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL;
        int ret = 0;
@@ -6068,13 +6059,13 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes)
        if (btrfs_is_free_space_inode(inode)) {
                flush = BTRFS_RESERVE_NO_FLUSH;
                delalloc_lock = false;
-       } else if (current->journal_info) {
-               flush = BTRFS_RESERVE_FLUSH_LIMIT;
-       }
+       } else {
+               if (current->journal_info)
+                       flush = BTRFS_RESERVE_FLUSH_LIMIT;
 
-       if (flush != BTRFS_RESERVE_NO_FLUSH &&
-           btrfs_transaction_in_commit(fs_info))
-               schedule_timeout(1);
+               if (btrfs_transaction_in_commit(fs_info))
+                       schedule_timeout(1);
+       }
 
        if (delalloc_lock)
                mutex_lock(&inode->delalloc_mutex);
@@ -6089,19 +6080,9 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes)
        btrfs_calculate_inode_block_rsv_size(fs_info, inode);
        spin_unlock(&inode->lock);
 
-       if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) {
-               ret = btrfs_qgroup_reserve_meta(root,
-                               nr_extents * fs_info->nodesize, true);
-               if (ret)
-                       goto out_fail;
-       }
-
        ret = btrfs_inode_rsv_refill(inode, flush);
-       if (unlikely(ret)) {
-               btrfs_qgroup_free_meta(root,
-                                      nr_extents * fs_info->nodesize);
+       if (unlikely(ret))
                goto out_fail;
-       }
 
        if (delalloc_lock)
                mutex_unlock(&inode->delalloc_mutex);
@@ -6115,7 +6096,7 @@ out_fail:
        btrfs_calculate_inode_block_rsv_size(fs_info, inode);
        spin_unlock(&inode->lock);
 
-       btrfs_inode_rsv_release(inode);
+       btrfs_inode_rsv_release(inode, true);
        if (delalloc_lock)
                mutex_unlock(&inode->delalloc_mutex);
        return ret;
@@ -6125,12 +6106,14 @@ out_fail:
  * btrfs_delalloc_release_metadata - release a metadata reservation for an inode
  * @inode: the inode to release the reservation for.
  * @num_bytes: the number of bytes we are releasing.
+ * @qgroup_free: free qgroup reservation or convert it to per-trans reservation
  *
  * This will release the metadata reservation for an inode.  This can be called
  * once we complete IO for a given set of bytes to release their metadata
  * reservations, or on error for the same reason.
  */
-void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes)
+void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes,
+                                    bool qgroup_free)
 {
        struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb);
 
@@ -6143,13 +6126,14 @@ void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes)
        if (btrfs_is_testing(fs_info))
                return;
 
-       btrfs_inode_rsv_release(inode);
+       btrfs_inode_rsv_release(inode, qgroup_free);
 }
 
 /**
  * btrfs_delalloc_release_extents - release our outstanding_extents
  * @inode: the inode to balance the reservation for.
  * @num_bytes: the number of bytes we originally reserved with
+ * @qgroup_free: do we need to free the qgroup meta reservation or convert it.
  *
  * When we reserve space we increase outstanding_extents for the extents we may
  * add.  Once we've set the range as delalloc or created our ordered extents we
@@ -6157,7 +6141,8 @@ void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes)
  * temporarily tracked outstanding_extents.  This _must_ be used in conjunction
  * with btrfs_delalloc_reserve_metadata.
  */
-void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes)
+void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes,
+                                   bool qgroup_free)
 {
        struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb);
        unsigned num_extents;
@@ -6171,7 +6156,7 @@ void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes)
        if (btrfs_is_testing(fs_info))
                return;
 
-       btrfs_inode_rsv_release(inode);
+       btrfs_inode_rsv_release(inode, qgroup_free);
 }
 
 /**
@@ -6227,9 +6212,9 @@ int btrfs_delalloc_reserve_space(struct inode *inode,
  */
 void btrfs_delalloc_release_space(struct inode *inode,
                                  struct extent_changeset *reserved,
-                                 u64 start, u64 len)
+                                 u64 start, u64 len, bool qgroup_free)
 {
-       btrfs_delalloc_release_metadata(BTRFS_I(inode), len);
+       btrfs_delalloc_release_metadata(BTRFS_I(inode), len, qgroup_free);
        btrfs_free_reserved_data_space(inode, reserved, start, len);
 }
 
@@ -6783,9 +6768,9 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info,
        return 0;
 }
 
-int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
-                              struct btrfs_fs_info *fs_info)
+int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans)
 {
+       struct btrfs_fs_info *fs_info = trans->fs_info;
        struct btrfs_block_group_cache *block_group, *tmp;
        struct list_head *deleted_bgs;
        struct extent_io_tree *unpin;
@@ -7351,29 +7336,6 @@ wait_block_group_cache_done(struct btrfs_block_group_cache *cache)
        return ret;
 }
 
-int __get_raid_index(u64 flags)
-{
-       if (flags & BTRFS_BLOCK_GROUP_RAID10)
-               return BTRFS_RAID_RAID10;
-       else if (flags & BTRFS_BLOCK_GROUP_RAID1)
-               return BTRFS_RAID_RAID1;
-       else if (flags & BTRFS_BLOCK_GROUP_DUP)
-               return BTRFS_RAID_DUP;
-       else if (flags & BTRFS_BLOCK_GROUP_RAID0)
-               return BTRFS_RAID_RAID0;
-       else if (flags & BTRFS_BLOCK_GROUP_RAID5)
-               return BTRFS_RAID_RAID5;
-       else if (flags & BTRFS_BLOCK_GROUP_RAID6)
-               return BTRFS_RAID_RAID6;
-
-       return BTRFS_RAID_SINGLE; /* BTRFS_BLOCK_GROUP_SINGLE */
-}
-
-int get_block_group_index(struct btrfs_block_group_cache *cache)
-{
-       return __get_raid_index(cache->flags);
-}
-
 static const char *btrfs_raid_type_names[BTRFS_NR_RAID_TYPES] = {
        [BTRFS_RAID_RAID10]     = "raid10",
        [BTRFS_RAID_RAID1]      = "raid1",
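The open-coded __get_raid_index()/get_block_group_index() pair above is dropped in favour of btrfs_bg_flags_to_raid_index(), which the rest of this patch calls and which performs the same block-group-flags to BTRFS_RAID_* mapping from a shared header. A minimal userspace sketch of that mapping follows; the flag bits, enum and name table are local illustrative copies mirroring the removed function and btrfs_raid_type_names[], not the kernel definitions.

#include <stdint.h>
#include <stdio.h>

/* Illustrative local copies of the relevant flag bits and RAID enum. */
#define BLOCK_GROUP_RAID0	(1ULL << 3)
#define BLOCK_GROUP_RAID1	(1ULL << 4)
#define BLOCK_GROUP_DUP		(1ULL << 5)
#define BLOCK_GROUP_RAID10	(1ULL << 6)
#define BLOCK_GROUP_RAID5	(1ULL << 7)
#define BLOCK_GROUP_RAID6	(1ULL << 8)

enum raid_index {
	RAID_RAID10, RAID_RAID1, RAID_DUP, RAID_RAID0,
	RAID_SINGLE, RAID_RAID5, RAID_RAID6, NR_RAID_TYPES
};

static const char *raid_names[NR_RAID_TYPES] = {
	"raid10", "raid1", "dup", "raid0", "single", "raid5", "raid6",
};

/* Same decision chain as the removed __get_raid_index(). */
static enum raid_index bg_flags_to_raid_index(uint64_t flags)
{
	if (flags & BLOCK_GROUP_RAID10)
		return RAID_RAID10;
	if (flags & BLOCK_GROUP_RAID1)
		return RAID_RAID1;
	if (flags & BLOCK_GROUP_DUP)
		return RAID_DUP;
	if (flags & BLOCK_GROUP_RAID0)
		return RAID_RAID0;
	if (flags & BLOCK_GROUP_RAID5)
		return RAID_RAID5;
	if (flags & BLOCK_GROUP_RAID6)
		return RAID_RAID6;
	return RAID_SINGLE;
}

int main(void)
{
	printf("%s\n", raid_names[bg_flags_to_raid_index(BLOCK_GROUP_RAID1)]);
	return 0;
}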
@@ -7488,7 +7450,7 @@ static noinline int find_free_extent(struct btrfs_fs_info *fs_info,
        u64 empty_cluster = 0;
        struct btrfs_space_info *space_info;
        int loop = 0;
-       int index = __get_raid_index(flags);
+       int index = btrfs_bg_flags_to_raid_index(flags);
        bool failed_cluster_refill = false;
        bool failed_alloc = false;
        bool use_cluster = true;
@@ -7574,7 +7536,8 @@ static noinline int find_free_extent(struct btrfs_fs_info *fs_info,
                                btrfs_put_block_group(block_group);
                                up_read(&space_info->groups_sem);
                        } else {
-                               index = get_block_group_index(block_group);
+                               index = btrfs_bg_flags_to_raid_index(
+                                               block_group->flags);
                                btrfs_lock_block_group(block_group, delalloc);
                                goto have_block_group;
                        }
@@ -7584,7 +7547,7 @@ static noinline int find_free_extent(struct btrfs_fs_info *fs_info,
        }
 search:
        have_caching_bg = false;
-       if (index == 0 || index == __get_raid_index(flags))
+       if (index == 0 || index == btrfs_bg_flags_to_raid_index(flags))
                full_search = true;
        down_read(&space_info->groups_sem);
        list_for_each_entry(block_group, &space_info->block_groups[index],
@@ -7842,7 +7805,8 @@ checks:
 loop:
                failed_cluster_refill = false;
                failed_alloc = false;
-               BUG_ON(index != get_block_group_index(block_group));
+               BUG_ON(btrfs_bg_flags_to_raid_index(block_group->flags) !=
+                      index);
                btrfs_release_block_group(block_group, delalloc);
                cond_resched();
        }
@@ -7996,6 +7960,51 @@ again:
        up_read(&info->groups_sem);
 }
 
+/*
+ * btrfs_reserve_extent - entry point to the extent allocator. Tries to find a
+ *                       hole that is at least as big as @num_bytes.
+ *
+ * @root           -   The root that will contain this extent
+ *
+ * @ram_bytes      -   The amount of space in RAM that @num_bytes takes. This
+ *                     is used for accounting purposes. This value differs
+ *                     from @num_bytes only in the case of compressed extents.
+ *
+ * @num_bytes      -   Number of bytes to allocate on-disk.
+ *
+ * @min_alloc_size -   Indicates the minimum amount of space that the
+ *                     allocator should try to satisfy. In some cases
+ *                     @num_bytes may be larger than what is required and if
+ *                     the filesystem is fragmented then allocation fails.
+ *                     However, the presence of @min_alloc_size gives a
+ *                     chance to try and satisfy the smaller allocation.
+ *
+ * @empty_size     -   A hint that you plan on doing more COW. This is the
+ *                     size in bytes the allocator should try to find free
+ *                     next to the block it returns.  This is just a hint and
+ *                     may be ignored by the allocator.
+ *
+ * @hint_byte      -   Hint to the allocator to start searching above the byte
+ *                     address passed. It might be ignored.
+ *
+ * @ins            -   This key is modified to record the found hole. It will
+ *                     have the following values:
+ *                     ins->objectid == start position
+ *                     ins->flags = BTRFS_EXTENT_ITEM_KEY
+ *                     ins->offset == the size of the hole.
+ *
+ * @is_data        -   Boolean flag indicating whether an extent is
+ *                     allocated for data (true) or metadata (false)
+ *
+ * @delalloc       -   Boolean flag indicating whether this allocation is for
+ *                     delalloc or not. If 'true' data_rwsem of block groups
+ *                     is going to be acquired.
+ *
+ *
+ * Returns 0 when an allocation succeeded or < 0 when an error occurred. In
+ * case -ENOSPC is returned then @ins->offset will contain the size of the
+ * largest available hole the allocator managed to find.
+ */
 int btrfs_reserve_extent(struct btrfs_root *root, u64 ram_bytes,
                         u64 num_bytes, u64 min_alloc_size,
                         u64 empty_size, u64 hint_byte,
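The new comment spells out the role of @min_alloc_size: a fragmented filesystem may have no hole of @num_bytes, yet the allocation can still succeed with a smaller size as long as it satisfies @min_alloc_size. A rough userspace model of that fallback is sketched below; find_free_extent() here is a trivial stub, not the btrfs allocator, and the halving loop is only an illustration of the behaviour the comment describes.

#include <errno.h>
#include <stdint.h>
#include <stdio.h>

#define SZ_64K 65536ULL

/* Stub allocator: pretend the largest contiguous hole is 192K, so any
 * larger request fails with -ENOSPC. */
static int find_free_extent(uint64_t num_bytes, uint64_t *start)
{
	const uint64_t largest_hole = 3 * SZ_64K;

	if (num_bytes > largest_hole)
		return -ENOSPC;
	*start = 1024 * 1024;	/* arbitrary byte address */
	return 0;
}

/* Rough model of the fallback described above: on -ENOSPC halve the request
 * until it reaches @min_alloc_size, and only then give up. */
static int reserve_extent(uint64_t num_bytes, uint64_t min_alloc_size,
			  uint64_t *start, uint64_t *len)
{
	int ret;

	while (1) {
		ret = find_free_extent(num_bytes, start);
		if (ret != -ENOSPC || num_bytes == min_alloc_size)
			break;
		num_bytes /= 2;
		if (num_bytes < min_alloc_size)
			num_bytes = min_alloc_size;
	}
	if (!ret)
		*len = num_bytes;
	return ret;
}

int main(void)
{
	uint64_t start, len;

	if (!reserve_extent(16 * SZ_64K, SZ_64K, &start, &len))
		printf("got %llu bytes at %llu\n",
		       (unsigned long long)len, (unsigned long long)start);
	return 0;
}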
@@ -8699,6 +8708,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
        u64 parent;
        u32 blocksize;
        struct btrfs_key key;
+       struct btrfs_key first_key;
        struct extent_buffer *next;
        int level = wc->level;
        int reada = 0;
@@ -8719,6 +8729,8 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
        }
 
        bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]);
+       btrfs_node_key_to_cpu(path->nodes[level], &first_key,
+                             path->slots[level]);
        blocksize = fs_info->nodesize;
 
        next = find_extent_buffer(fs_info, bytenr);
@@ -8783,7 +8795,8 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
        if (!next) {
                if (reada && level == 1)
                        reada_walk_down(trans, root, wc, path);
-               next = read_tree_block(fs_info, bytenr, generation);
+               next = read_tree_block(fs_info, bytenr, generation, level - 1,
+                                      &first_key);
                if (IS_ERR(next)) {
                        return PTR_ERR(next);
                } else if (!extent_buffer_uptodate(next)) {
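read_tree_block() now also receives the expected level of the child (level - 1) and the first key recorded in the parent slot, so the freshly read block can be checked against what the parent expects instead of being trusted on generation alone. A simplified sketch of that kind of cross-check follows; the structs are illustrative stand-ins, not the btrfs on-disk format.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Illustrative stand-ins for a btrfs key and the header fields of a tree
 * block that the extra read_tree_block() arguments allow us to verify. */
struct key {
	uint64_t objectid;
	uint8_t type;
	uint64_t offset;
};

struct node_header {
	uint64_t generation;
	int level;
	struct key first_key;	/* key in slot 0 of the block */
};

/* The parent records which generation, level and first key it expects for
 * the child; reject the read on any mismatch. */
static bool tree_block_matches(const struct node_header *hdr,
			       uint64_t parent_gen, int expect_level,
			       const struct key *expect)
{
	return hdr->generation == parent_gen &&
	       hdr->level == expect_level &&
	       hdr->first_key.objectid == expect->objectid &&
	       hdr->first_key.type == expect->type &&
	       hdr->first_key.offset == expect->offset;
}

int main(void)
{
	struct key first = { .objectid = 256, .type = 1, .offset = 0 };
	struct node_header child = {
		.generation = 7, .level = 0, .first_key = first,
	};

	printf("match: %d\n", tree_block_matches(&child, 7, 0, &first));
	return 0;
}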
@@ -9648,7 +9661,7 @@ int btrfs_can_relocate(struct btrfs_fs_info *fs_info, u64 bytenr)
         */
        target = get_restripe_target(fs_info, block_group->flags);
        if (target) {
-               index = __get_raid_index(extended_to_chunk(target));
+               index = btrfs_bg_flags_to_raid_index(extended_to_chunk(target));
        } else {
                /*
                 * this is just a balance, so if we were marked as full
@@ -9662,7 +9675,7 @@ int btrfs_can_relocate(struct btrfs_fs_info *fs_info, u64 bytenr)
                        goto out;
                }
 
-               index = get_block_group_index(block_group);
+               index = btrfs_bg_flags_to_raid_index(block_group->flags);
        }
 
        if (index == BTRFS_RAID_RAID10) {
@@ -9911,10 +9924,40 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
        return 0;
 }
 
+/* link_block_group will queue up kobjects to add when we're reclaim-safe */
+void btrfs_add_raid_kobjects(struct btrfs_fs_info *fs_info)
+{
+       struct btrfs_space_info *space_info;
+       struct raid_kobject *rkobj;
+       LIST_HEAD(list);
+       int index;
+       int ret = 0;
+
+       spin_lock(&fs_info->pending_raid_kobjs_lock);
+       list_splice_init(&fs_info->pending_raid_kobjs, &list);
+       spin_unlock(&fs_info->pending_raid_kobjs_lock);
+
+       list_for_each_entry(rkobj, &list, list) {
+               space_info = __find_space_info(fs_info, rkobj->flags);
+               index = btrfs_bg_flags_to_raid_index(rkobj->flags);
+
+               ret = kobject_add(&rkobj->kobj, &space_info->kobj,
+                                 "%s", get_raid_name(index));
+               if (ret) {
+                       kobject_put(&rkobj->kobj);
+                       break;
+               }
+       }
+       if (ret)
+               btrfs_warn(fs_info,
+                          "failed to add kobject for block cache, ignoring");
+}
+
 static void link_block_group(struct btrfs_block_group_cache *cache)
 {
        struct btrfs_space_info *space_info = cache->space_info;
-       int index = get_block_group_index(cache);
+       struct btrfs_fs_info *fs_info = cache->fs_info;
+       int index = btrfs_bg_flags_to_raid_index(cache->flags);
        bool first = false;
 
        down_write(&space_info->groups_sem);
@@ -9924,27 +9967,20 @@ static void link_block_group(struct btrfs_block_group_cache *cache)
        up_write(&space_info->groups_sem);
 
        if (first) {
-               struct raid_kobject *rkobj;
-               int ret;
-
-               rkobj = kzalloc(sizeof(*rkobj), GFP_NOFS);
-               if (!rkobj)
-                       goto out_err;
-               rkobj->raid_type = index;
-               kobject_init(&rkobj->kobj, &btrfs_raid_ktype);
-               ret = kobject_add(&rkobj->kobj, &space_info->kobj,
-                                 "%s", get_raid_name(index));
-               if (ret) {
-                       kobject_put(&rkobj->kobj);
-                       goto out_err;
+               struct raid_kobject *rkobj = kzalloc(sizeof(*rkobj), GFP_NOFS);
+               if (!rkobj) {
+                       btrfs_warn(cache->fs_info,
+                               "couldn't alloc memory for raid level kobject");
+                       return;
                }
+               rkobj->flags = cache->flags;
+               kobject_init(&rkobj->kobj, &btrfs_raid_ktype);
+
+               spin_lock(&fs_info->pending_raid_kobjs_lock);
+               list_add_tail(&rkobj->list, &fs_info->pending_raid_kobjs);
+               spin_unlock(&fs_info->pending_raid_kobjs_lock);
                space_info->block_group_kobjs[index] = &rkobj->kobj;
        }
-
-       return;
-out_err:
-       btrfs_warn(cache->fs_info,
-                  "failed to add kobject for block cache, ignoring");
 }
 
 static struct btrfs_block_group_cache *
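Instead of calling kobject_add() directly from link_block_group(), the raid kobject is now only queued on fs_info->pending_raid_kobjs under a spinlock, and btrfs_add_raid_kobjects() later splices the list and does the registration; per the comment added above, the sysfs work is deferred until the caller is reclaim-safe. A generic userspace sketch of that queue-then-flush pattern follows, with a mutex standing in for the spinlock and a printf for kobject_add(); the names are illustrative, not btrfs API.

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

/* Deferred-registration pattern: producers only queue nodes under the lock,
 * and a later flush walks the spliced-out list and does the expensive part
 * (here a printf standing in for kobject_add()). */
struct pending {
	unsigned long long flags;
	struct pending *next;
};

static struct pending *pending_head;
static pthread_mutex_t pending_lock = PTHREAD_MUTEX_INITIALIZER;

static void queue_raid_kobject(unsigned long long flags)
{
	struct pending *p = malloc(sizeof(*p));

	if (!p)
		return;		/* mirror the "warn and give up" behaviour */
	p->flags = flags;
	pthread_mutex_lock(&pending_lock);
	p->next = pending_head;
	pending_head = p;
	pthread_mutex_unlock(&pending_lock);
}

static void flush_raid_kobjects(void)
{
	struct pending *list, *p;

	/* Splice the whole list out under the lock, then work unlocked. */
	pthread_mutex_lock(&pending_lock);
	list = pending_head;
	pending_head = NULL;
	pthread_mutex_unlock(&pending_lock);

	while ((p = list)) {
		list = p->next;
		printf("registering kobject for flags 0x%llx\n", p->flags);
		free(p);
	}
}

int main(void)
{
	queue_raid_kobject(1 << 4);	/* e.g. a RAID1 space_info */
	queue_raid_kobject(1 << 6);
	flush_raid_kobjects();
	return 0;
}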
@@ -10160,6 +10196,7 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info)
                        inc_block_group_ro(cache, 1);
        }
 
+       btrfs_add_raid_kobjects(info);
        init_global_block_rsv(info);
        ret = 0;
 error:
@@ -10167,9 +10204,9 @@ error:
        return ret;
 }
 
-void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans,
-                                      struct btrfs_fs_info *fs_info)
+void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans)
 {
+       struct btrfs_fs_info *fs_info = trans->fs_info;
        struct btrfs_block_group_cache *block_group, *tmp;
        struct btrfs_root *extent_root = fs_info->extent_root;
        struct btrfs_block_group_item item;
@@ -10254,15 +10291,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
         * with its ->space_info set.
         */
        cache->space_info = __find_space_info(fs_info, cache->flags);
-       if (!cache->space_info) {
-               ret = create_space_info(fs_info, cache->flags,
-                                      &cache->space_info);
-               if (ret) {
-                       btrfs_remove_free_space_cache(cache);
-                       btrfs_put_block_group(cache);
-                       return ret;
-               }
-       }
+       ASSERT(cache->space_info);
 
        ret = btrfs_add_block_group_cache(fs_info, cache);
        if (ret) {
@@ -10334,7 +10363,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
                                  block_group->key.offset);
 
        memcpy(&key, &block_group->key, sizeof(key));
-       index = get_block_group_index(block_group);
+       index = btrfs_bg_flags_to_raid_index(block_group->flags);
        if (block_group->flags & (BTRFS_BLOCK_GROUP_DUP |
                                  BTRFS_BLOCK_GROUP_RAID1 |
                                  BTRFS_BLOCK_GROUP_RAID10))