Merge tag 'for-4.17-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux
author Linus Torvalds <torvalds@linux-foundation.org>
Wed, 4 Apr 2018 20:03:38 +0000 (13:03 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Wed, 4 Apr 2018 20:03:38 +0000 (13:03 -0700)
Pull btrfs updates from David Sterba:
 "There are a several user visible changes, the rest is mostly invisible
  and continues to clean up the whole code base.

  User visible changes:
   - new mount option nossd_spread (pair for ssd_spread)

   - mount option subvolid will detect junk after the number and fail
     the mount

   - add message after cancelled device replace

   - direct module dependency on libcrc32c, removed own crc32c wrappers

   - removed user space transaction ioctls

   - use lighter locking when reading /proc/self/mounts: RCU instead
     of a mutex, to avoid unnecessary contention (see the sketch after
     this list)
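
   A minimal sketch of the RCU read-side pattern, in the spirit of the
   btrfs_show_devname change in this pull (the function name and the
   surrounding details are illustrative, not the exact kernel code):

      /* Readers walk the device list under rcu_read_lock() only;
       * writers keep taking the mutex, so reading /proc/self/mounts
       * no longer contends with device management. */
      static void show_devname_sketch(struct seq_file *m,
                                      struct btrfs_fs_devices *fs_devices)
      {
              struct btrfs_device *device;

              rcu_read_lock();
              list_for_each_entry_rcu(device, &fs_devices->devices,
                                      dev_list) {
                      if (device->name) {
                              seq_escape(m, rcu_str_deref(device->name),
                                         " \t\n\\");
                              break;
                      }
              }
              rcu_read_unlock();
      }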

  Enhancements:
   - skip writeback of last page when truncating file to same size

   - send: do not issue unnecessary truncate operations

   - mount option token specifiers: use %u for unsigned values, more
     validation (see the sketch after this list)

   - selftests: more tree block validations
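
   To illustrate the %u token specifiers, a sketch built on
   <linux/parser.h> (the option table and validation are simplified,
   not the exact btrfs parser):

      enum { Opt_thread_pool, Opt_err };

      static const match_table_t tokens = {
              /* with %u, a leading '-' no longer matches the token */
              { Opt_thread_pool, "thread_pool=%u" },
              { Opt_err, NULL }
      };

      /* inside the per-option parsing loop */
      substring_t args[MAX_OPT_ARGS];
      int intarg;

      switch (match_token(p, tokens, args)) {
      case Opt_thread_pool:
              if (match_int(&args[0], &intarg) || intarg == 0)
                      return -EINVAL;
              fs_info->thread_pool_size = intarg;
              break;
      default:
              break;
      }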

  qgroups:
   - preparatory work for splitting reservation types for data and
     metadata; this should allow more accurate tracking, fix some
     underflow issues and enable further enhancements (see the sketch
     after this list)

   - split metadata reservations for started and joined transactions
     so they do not get mixed up and are accounted correctly at commit
     time

   - with the above, it's possible to revert the patch that could
     deadlock when trying to make more space by explicitly committing
     once the quota limit is hit

   - fix root item corruption when multiple snapshots of the same
     source are created with quotas enabled
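
   A sketch of the split qgroup metadata accounting (compare the new
   qgroup_meta_rsv_* fields in the ctree.h hunk below); the function
   name is illustrative:

      /* PREALLOC covers space reserved before a change is made,
       * PERTRANS covers space that must survive until the transaction
       * commits.  Converting moves bytes between the two counters
       * instead of releasing and re-reserving them. */
      static void convert_meta_prealloc_to_pertrans(struct btrfs_root *root,
                                                    u64 num_bytes)
      {
              spin_lock(&root->qgroup_meta_rsv_lock);
              num_bytes = min(num_bytes, root->qgroup_meta_rsv_prealloc);
              root->qgroup_meta_rsv_prealloc -= num_bytes;
              root->qgroup_meta_rsv_pertrans += num_bytes;
              spin_unlock(&root->qgroup_meta_rsv_lock);
      }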

  RAID56:
   - make sure target is identical to source when raid56 rebuild fails
     after dev-replace

   - faster rebuild during scrub: batch by stripes rather than
     block-by-block (see the sketch after this list)

   - make more use of cached data when rebuilding from a missing device
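
   Conceptually, the scrub rebuild path moves from one repair round
   trip per block to stripe-sized batches; a hypothetical sketch (the
   batch helpers below are made up for illustration):

      /* Before: read, verify and repair one sector at a time.
       * After: collect all bad sectors of a stripe, rebuild once. */
      for_each_set_bit(i, bad_sectors, sectors_per_stripe)
              batch_add_sector(&batch, stripe, i);  /* hypothetical */
      rebuild_stripe_batch(&batch);                 /* hypothetical */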

  Fixes:
   - fix NULL pointer deref when device replace target is missing

   - fix fsync after hole punching when using no-holes feature

   - fix lockdep splat when allocating percpu data with wrong GFP flags

  Cleanups, refactoring, core changes:
   - drop redundant parameters from various functions

   - kill and opencode trivial helpers

   - __cold/__exit function annotations (see the example below)

   - dead code removal

   - continued audit and documentation of memory barriers

   - error handling: handle removal from uuid tree

   - error handling: remove handling of impossible conditions

   - more debugging and error messages

   - updated tracepoints

   - one VLA use removal (and one still left)"
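
The __cold annotations mentioned above look like this in the series
(taken from the compression.c hunk below):

    /* __cold marks rarely executed code (module exit here) so the
     * compiler optimizes it for size and keeps it out of the hot
     * text section. */
    void __cold btrfs_exit_compress(void)
    {
            free_workspaces();
    }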

* tag 'for-4.17-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux: (164 commits)
  btrfs: lift errors from add_extent_changeset to the callers
  Btrfs: print error messages when failing to read trees
  btrfs: use proper type for btrfs_mask_flags flags
  btrfs: split dev-replace locking helpers for read and write
  btrfs: remove stale comments about fs_mutex
  btrfs: use RCU in btrfs_show_devname for device list traversal
  btrfs: update barrier in should_cow_block
  btrfs: use lockdep_assert_held for mutexes
  btrfs: use lockdep_assert_held for spinlocks
  btrfs: Validate child tree block's level and first key
  btrfs: tests/qgroup: Fix wrong tree backref level
  Btrfs: fix copy_items() return value when logging an inode
  Btrfs: fix fsync after hole punching when using no-holes feature
  btrfs: use helper to set ulist aux from a qgroup
  Revert "btrfs: qgroups: Retry after commit on getting EDQUOT"
  btrfs: qgroup: Update trace events for metadata reservation
  btrfs: qgroup: Use root::qgroup_meta_rsv_* to record qgroup meta reserved space
  btrfs: delayed-inode: Use new qgroup meta rsv for delayed inode and item
  btrfs: qgroup: Use separate meta reservation type for delalloc
  btrfs: qgroup: Introduce function to convert META_PREALLOC into META_PERTRANS
  ...

68 files changed:
fs/btrfs/Kconfig
fs/btrfs/Makefile
fs/btrfs/acl.c
fs/btrfs/backref.c
fs/btrfs/backref.h
fs/btrfs/btrfs_inode.h
fs/btrfs/check-integrity.c
fs/btrfs/compression.c
fs/btrfs/compression.h
fs/btrfs/ctree.c
fs/btrfs/ctree.h
fs/btrfs/delayed-inode.c
fs/btrfs/delayed-inode.h
fs/btrfs/delayed-ref.c
fs/btrfs/delayed-ref.h
fs/btrfs/dev-replace.c
fs/btrfs/dev-replace.h
fs/btrfs/dir-item.c
fs/btrfs/disk-io.c
fs/btrfs/disk-io.h
fs/btrfs/extent-tree.c
fs/btrfs/extent_io.c
fs/btrfs/extent_io.h
fs/btrfs/extent_map.c
fs/btrfs/extent_map.h
fs/btrfs/file.c
fs/btrfs/free-space-cache.c
fs/btrfs/free-space-tree.c
fs/btrfs/hash.c [deleted file]
fs/btrfs/hash.h [deleted file]
fs/btrfs/inode-item.c
fs/btrfs/inode-map.c
fs/btrfs/inode.c
fs/btrfs/ioctl.c
fs/btrfs/locking.c
fs/btrfs/lzo.c
fs/btrfs/ordered-data.c
fs/btrfs/ordered-data.h
fs/btrfs/print-tree.c
fs/btrfs/props.c
fs/btrfs/qgroup.c
fs/btrfs/qgroup.h
fs/btrfs/raid56.c
fs/btrfs/reada.c
fs/btrfs/ref-verify.c
fs/btrfs/relocation.c
fs/btrfs/scrub.c
fs/btrfs/send.c
fs/btrfs/super.c
fs/btrfs/sysfs.c
fs/btrfs/tests/btrfs-tests.c
fs/btrfs/tests/extent-map-tests.c
fs/btrfs/tests/qgroup-tests.c
fs/btrfs/transaction.c
fs/btrfs/transaction.h
fs/btrfs/tree-checker.c
fs/btrfs/tree-checker.h
fs/btrfs/tree-defrag.c
fs/btrfs/tree-log.c
fs/btrfs/tree-log.h
fs/btrfs/uuid-tree.c
fs/btrfs/volumes.c
fs/btrfs/volumes.h
fs/btrfs/xattr.c
fs/btrfs/xattr.h
include/linux/crc32c.h
include/trace/events/btrfs.h
lib/libcrc32c.c

diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig
index 273351ee4c469c23fc1a083e17b0bfc2159726d2..167e5dc7eadd7ce8dadf9866c0b8f1311494ee5a 100644
@@ -1,7 +1,6 @@
 config BTRFS_FS
        tristate "Btrfs filesystem support"
-       select CRYPTO
-       select CRYPTO_CRC32C
+       select LIBCRC32C
        select ZLIB_INFLATE
        select ZLIB_DEFLATE
        select LZO_COMPRESS
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index 0c4373628eb4f733d8dff351d69d8df6c15b2682..ca693dd554e93443f4d52af4e14d2154aef0961c 100644
@@ -10,7 +10,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
           export.o tree-log.o free-space-cache.o zlib.o lzo.o zstd.o \
           compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \
           reada.o backref.o ulist.o qgroup.o send.o dev-replace.o raid56.o \
-          uuid-tree.o props.o hash.o free-space-tree.o tree-checker.o
+          uuid-tree.o props.o free-space-tree.o tree-checker.o
 
 btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o
 btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index 1ba49ebe67da39f6daf0a9a6f0ff63fff8554935..0066d95b133f8c168039adbda4837102167d9743 100644
@@ -46,12 +46,12 @@ struct posix_acl *btrfs_get_acl(struct inode *inode, int type)
                BUG();
        }
 
-       size = __btrfs_getxattr(inode, name, "", 0);
+       size = btrfs_getxattr(inode, name, "", 0);
        if (size > 0) {
                value = kzalloc(size, GFP_KERNEL);
                if (!value)
                        return ERR_PTR(-ENOMEM);
-               size = __btrfs_getxattr(inode, name, value, size);
+               size = btrfs_getxattr(inode, name, value, size);
        }
        if (size > 0) {
                acl = posix_acl_from_xattr(&init_user_ns, value, size);
@@ -65,9 +65,6 @@ struct posix_acl *btrfs_get_acl(struct inode *inode, int type)
        return acl;
 }
 
-/*
- * Needs to be called with fs_mutex held
- */
 static int __btrfs_set_acl(struct btrfs_trans_handle *trans,
                         struct inode *inode, struct posix_acl *acl, int type)
 {
@@ -101,7 +98,7 @@ static int __btrfs_set_acl(struct btrfs_trans_handle *trans,
                        goto out;
        }
 
-       ret = __btrfs_setxattr(trans, inode, name, value, size, 0);
+       ret = btrfs_setxattr(trans, inode, name, value, size, 0);
 out:
        kfree(value);
 
@@ -127,11 +124,6 @@ int btrfs_set_acl(struct inode *inode, struct posix_acl *acl, int type)
        return ret;
 }
 
-/*
- * btrfs_init_acl is already generally called under fs_mutex, so the locking
- * stuff has been fixed to work with that.  If the locking stuff changes, we
- * need to re-evaluate the acl locking stuff.
- */
 int btrfs_init_acl(struct btrfs_trans_handle *trans,
                   struct inode *inode, struct inode *dir)
 {
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 26484648d0903298cbb68bff095232873e94673a..571024bc632e9093c0b781133477fd6d07b14388 100644
@@ -170,7 +170,7 @@ int __init btrfs_prelim_ref_init(void)
        return 0;
 }
 
-void btrfs_prelim_ref_exit(void)
+void __cold btrfs_prelim_ref_exit(void)
 {
        kmem_cache_destroy(btrfs_prelim_ref_cache);
 }
@@ -738,7 +738,8 @@ static int add_missing_keys(struct btrfs_fs_info *fs_info,
                BUG_ON(ref->key_for_search.type);
                BUG_ON(!ref->wanted_disk_byte);
 
-               eb = read_tree_block(fs_info, ref->wanted_disk_byte, 0);
+               eb = read_tree_block(fs_info, ref->wanted_disk_byte, 0,
+                                    ref->level - 1, NULL);
                if (IS_ERR(eb)) {
                        free_pref(ref);
                        return PTR_ERR(eb);
@@ -773,15 +774,12 @@ static int add_delayed_refs(const struct btrfs_fs_info *fs_info,
        struct btrfs_delayed_extent_op *extent_op = head->extent_op;
        struct btrfs_key key;
        struct btrfs_key tmp_op_key;
-       struct btrfs_key *op_key = NULL;
        struct rb_node *n;
        int count;
        int ret = 0;
 
-       if (extent_op && extent_op->update_key) {
+       if (extent_op && extent_op->update_key)
                btrfs_disk_key_to_cpu(&tmp_op_key, &extent_op->key);
-               op_key = &tmp_op_key;
-       }
 
        spin_lock(&head->lock);
        for (n = rb_first(&head->ref_tree); n; n = rb_next(n)) {
@@ -1291,7 +1289,8 @@ again:
                            ref->level == 0) {
                                struct extent_buffer *eb;
 
-                               eb = read_tree_block(fs_info, ref->parent, 0);
+                               eb = read_tree_block(fs_info, ref->parent, 0,
+                                                    ref->level, NULL);
                                if (IS_ERR(eb)) {
                                        ret = PTR_ERR(eb);
                                        goto out;
diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h
index 0c2fab8514ffa1fb22d0208f7a96617898866eb0..0a30028d51964812e0e30ec7f2eca76a2a60b4e6 100644
@@ -73,7 +73,7 @@ int btrfs_find_one_extref(struct btrfs_root *root, u64 inode_objectid,
 int btrfs_check_shared(struct btrfs_root *root, u64 inum, u64 bytenr);
 
 int __init btrfs_prelim_ref_init(void);
-void btrfs_prelim_ref_exit(void);
+void __cold btrfs_prelim_ref_exit(void);
 
 struct prelim_ref {
        struct rb_node rbnode;
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 63f0ccc92a7122b4e5353bfdcc668a84170031c2..ca15be569d69c2656c3c7e737b4773c4a6b9d37e 100644
@@ -195,7 +195,6 @@ struct btrfs_inode {
 
        /* Hook into fs_info->delayed_iputs */
        struct list_head delayed_iput;
-       long delayed_iput_count;
 
        /*
         * To avoid races between lockless (i_mutex not held) direct IO writes
@@ -365,6 +364,4 @@ static inline void btrfs_print_data_csum_error(struct btrfs_inode *inode,
                        logical_start, csum, csum_expected, mirror_num);
 }
 
-bool btrfs_page_exists_in_range(struct inode *inode, loff_t start, loff_t end);
-
 #endif
diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c
index 7d51b5a5b5053ea1737d2e03905c2458b35c621f..3baebbc021c54e0c3eef2f6d9c73cc802f42783f 100644
@@ -96,9 +96,9 @@
 #include <linux/blkdev.h>
 #include <linux/mm.h>
 #include <linux/string.h>
+#include <linux/crc32c.h>
 #include "ctree.h"
 #include "disk-io.h"
-#include "hash.h"
 #include "transaction.h"
 #include "extent_io.h"
 #include "volumes.h"
@@ -1736,7 +1736,7 @@ static int btrfsic_test_for_metadata(struct btrfsic_state *state,
                size_t sublen = i ? PAGE_SIZE :
                                    (PAGE_SIZE - BTRFS_CSUM_SIZE);
 
-               crc = btrfs_crc32c(crc, data, sublen);
+               crc = crc32c(crc, data, sublen);
        }
        btrfs_csum_final(crc, csum);
        if (memcmp(csum, h->csum, state->csum_size))
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 07d049c0c20fdb3fb55fe4ebf265bfc531d59c71..562c3e633403d482efd5a90e00e07b25c78d3b2a 100644
@@ -1133,7 +1133,7 @@ int btrfs_decompress(int type, unsigned char *data_in, struct page *dest_page,
        return ret;
 }
 
-void btrfs_exit_compress(void)
+void __cold btrfs_exit_compress(void)
 {
        free_workspaces();
 }
diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h
index 677fa4aa0bd754e924da835ef650f2ea262b2522..ce796557a918ed8c1852b713750900f71e85f242 100644
@@ -76,7 +76,7 @@ struct compressed_bio {
 };
 
 void __init btrfs_init_compress(void);
-void btrfs_exit_compress(void);
+void __cold btrfs_exit_compress(void);
 
 int btrfs_compress_pages(unsigned int type_level, struct address_space *mapping,
                         u64 start, struct page **pages,
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index b88a79e69ddfd896fb4340c963d784c947585c2d..a2c9d21176e270df3400482767faa2c65d0d0f6d 100644
@@ -41,8 +41,6 @@ static int balance_node_right(struct btrfs_trans_handle *trans,
                              struct extent_buffer *src_buf);
 static void del_ptr(struct btrfs_root *root, struct btrfs_path *path,
                    int level, int slot);
-static int tree_mod_log_free_eb(struct btrfs_fs_info *fs_info,
-                                struct extent_buffer *eb);
 
 struct btrfs_path *btrfs_alloc_path(void)
 {
@@ -301,11 +299,6 @@ enum mod_log_op {
        MOD_LOG_ROOT_REPLACE,
 };
 
-struct tree_mod_move {
-       int dst_slot;
-       int nr_items;
-};
-
 struct tree_mod_root {
        u64 logical;
        u8 level;
@@ -328,32 +321,15 @@ struct tree_mod_elem {
        u64 blockptr;
 
        /* this is used for op == MOD_LOG_MOVE_KEYS */
-       struct tree_mod_move move;
+       struct {
+               int dst_slot;
+               int nr_items;
+       } move;
 
        /* this is used for op == MOD_LOG_ROOT_REPLACE */
        struct tree_mod_root old_root;
 };
 
-static inline void tree_mod_log_read_lock(struct btrfs_fs_info *fs_info)
-{
-       read_lock(&fs_info->tree_mod_log_lock);
-}
-
-static inline void tree_mod_log_read_unlock(struct btrfs_fs_info *fs_info)
-{
-       read_unlock(&fs_info->tree_mod_log_lock);
-}
-
-static inline void tree_mod_log_write_lock(struct btrfs_fs_info *fs_info)
-{
-       write_lock(&fs_info->tree_mod_log_lock);
-}
-
-static inline void tree_mod_log_write_unlock(struct btrfs_fs_info *fs_info)
-{
-       write_unlock(&fs_info->tree_mod_log_lock);
-}
-
 /*
  * Pull a new tree mod seq number for our operation.
  */
@@ -373,14 +349,14 @@ static inline u64 btrfs_inc_tree_mod_seq(struct btrfs_fs_info *fs_info)
 u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,
                           struct seq_list *elem)
 {
-       tree_mod_log_write_lock(fs_info);
+       write_lock(&fs_info->tree_mod_log_lock);
        spin_lock(&fs_info->tree_mod_seq_lock);
        if (!elem->seq) {
                elem->seq = btrfs_inc_tree_mod_seq(fs_info);
                list_add_tail(&elem->list, &fs_info->tree_mod_seq_list);
        }
        spin_unlock(&fs_info->tree_mod_seq_lock);
-       tree_mod_log_write_unlock(fs_info);
+       write_unlock(&fs_info->tree_mod_log_lock);
 
        return elem->seq;
 }
@@ -422,7 +398,7 @@ void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
         * anything that's lower than the lowest existing (read: blocked)
         * sequence number can be removed from the tree.
         */
-       tree_mod_log_write_lock(fs_info);
+       write_lock(&fs_info->tree_mod_log_lock);
        tm_root = &fs_info->tree_mod_log;
        for (node = rb_first(tm_root); node; node = next) {
                next = rb_next(node);
@@ -432,7 +408,7 @@ void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
                rb_erase(node, tm_root);
                kfree(tm);
        }
-       tree_mod_log_write_unlock(fs_info);
+       write_unlock(&fs_info->tree_mod_log_lock);
 }
 
 /*
@@ -443,7 +419,7 @@ void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
  * for root replace operations, or the logical address of the affected
  * block for all other operations.
  *
- * Note: must be called with write lock (tree_mod_log_write_lock).
+ * Note: must be called with write lock for fs_info::tree_mod_log_lock.
  */
 static noinline int
 __tree_mod_log_insert(struct btrfs_fs_info *fs_info, struct tree_mod_elem *tm)
@@ -481,7 +457,7 @@ __tree_mod_log_insert(struct btrfs_fs_info *fs_info, struct tree_mod_elem *tm)
  * Determines if logging can be omitted. Returns 1 if it can. Otherwise, it
  * returns zero with the tree_mod_log_lock acquired. The caller must hold
  * this until all tree mod log insertions are recorded in the rb tree and then
- * call tree_mod_log_write_unlock() to release.
+ * write unlock fs_info::tree_mod_log_lock.
  */
 static inline int tree_mod_dont_log(struct btrfs_fs_info *fs_info,
                                    struct extent_buffer *eb) {
@@ -491,9 +467,9 @@ static inline int tree_mod_dont_log(struct btrfs_fs_info *fs_info,
        if (eb && btrfs_header_level(eb) == 0)
                return 1;
 
-       tree_mod_log_write_lock(fs_info);
+       write_lock(&fs_info->tree_mod_log_lock);
        if (list_empty(&(fs_info)->tree_mod_seq_list)) {
-               tree_mod_log_write_unlock(fs_info);
+               write_unlock(&fs_info->tree_mod_log_lock);
                return 1;
        }
 
@@ -536,38 +512,34 @@ alloc_tree_mod_elem(struct extent_buffer *eb, int slot,
        return tm;
 }
 
-static noinline int
-tree_mod_log_insert_key(struct btrfs_fs_info *fs_info,
-                       struct extent_buffer *eb, int slot,
-                       enum mod_log_op op, gfp_t flags)
+static noinline int tree_mod_log_insert_key(struct extent_buffer *eb, int slot,
+               enum mod_log_op op, gfp_t flags)
 {
        struct tree_mod_elem *tm;
        int ret;
 
-       if (!tree_mod_need_log(fs_info, eb))
+       if (!tree_mod_need_log(eb->fs_info, eb))
                return 0;
 
        tm = alloc_tree_mod_elem(eb, slot, op, flags);
        if (!tm)
                return -ENOMEM;
 
-       if (tree_mod_dont_log(fs_info, eb)) {
+       if (tree_mod_dont_log(eb->fs_info, eb)) {
                kfree(tm);
                return 0;
        }
 
-       ret = __tree_mod_log_insert(fs_info, tm);
-       tree_mod_log_write_unlock(fs_info);
+       ret = __tree_mod_log_insert(eb->fs_info, tm);
+       write_unlock(&eb->fs_info->tree_mod_log_lock);
        if (ret)
                kfree(tm);
 
        return ret;
 }
 
-static noinline int
-tree_mod_log_insert_move(struct btrfs_fs_info *fs_info,
-                        struct extent_buffer *eb, int dst_slot, int src_slot,
-                        int nr_items)
+static noinline int tree_mod_log_insert_move(struct extent_buffer *eb,
+               int dst_slot, int src_slot, int nr_items)
 {
        struct tree_mod_elem *tm = NULL;
        struct tree_mod_elem **tm_list = NULL;
@@ -575,7 +547,7 @@ tree_mod_log_insert_move(struct btrfs_fs_info *fs_info,
        int i;
        int locked = 0;
 
-       if (!tree_mod_need_log(fs_info, eb))
+       if (!tree_mod_need_log(eb->fs_info, eb))
                return 0;
 
        tm_list = kcalloc(nr_items, sizeof(struct tree_mod_elem *), GFP_NOFS);
@@ -603,7 +575,7 @@ tree_mod_log_insert_move(struct btrfs_fs_info *fs_info,
                }
        }
 
-       if (tree_mod_dont_log(fs_info, eb))
+       if (tree_mod_dont_log(eb->fs_info, eb))
                goto free_tms;
        locked = 1;
 
@@ -613,26 +585,26 @@ tree_mod_log_insert_move(struct btrfs_fs_info *fs_info,
         * buffer, i.e. dst_slot < src_slot.
         */
        for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
-               ret = __tree_mod_log_insert(fs_info, tm_list[i]);
+               ret = __tree_mod_log_insert(eb->fs_info, tm_list[i]);
                if (ret)
                        goto free_tms;
        }
 
-       ret = __tree_mod_log_insert(fs_info, tm);
+       ret = __tree_mod_log_insert(eb->fs_info, tm);
        if (ret)
                goto free_tms;
-       tree_mod_log_write_unlock(fs_info);
+       write_unlock(&eb->fs_info->tree_mod_log_lock);
        kfree(tm_list);
 
        return 0;
 free_tms:
        for (i = 0; i < nr_items; i++) {
                if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node))
-                       rb_erase(&tm_list[i]->node, &fs_info->tree_mod_log);
+                       rb_erase(&tm_list[i]->node, &eb->fs_info->tree_mod_log);
                kfree(tm_list[i]);
        }
        if (locked)
-               tree_mod_log_write_unlock(fs_info);
+               write_unlock(&eb->fs_info->tree_mod_log_lock);
        kfree(tm_list);
        kfree(tm);
 
@@ -660,12 +632,10 @@ __tree_mod_log_free_eb(struct btrfs_fs_info *fs_info,
        return 0;
 }
 
-static noinline int
-tree_mod_log_insert_root(struct btrfs_fs_info *fs_info,
-                        struct extent_buffer *old_root,
-                        struct extent_buffer *new_root,
-                        int log_removal)
+static noinline int tree_mod_log_insert_root(struct extent_buffer *old_root,
+                        struct extent_buffer *new_root, int log_removal)
 {
+       struct btrfs_fs_info *fs_info = old_root->fs_info;
        struct tree_mod_elem *tm = NULL;
        struct tree_mod_elem **tm_list = NULL;
        int nritems = 0;
@@ -713,7 +683,7 @@ tree_mod_log_insert_root(struct btrfs_fs_info *fs_info,
        if (!ret)
                ret = __tree_mod_log_insert(fs_info, tm);
 
-       tree_mod_log_write_unlock(fs_info);
+       write_unlock(&fs_info->tree_mod_log_lock);
        if (ret)
                goto free_tms;
        kfree(tm_list);
@@ -740,7 +710,7 @@ __tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq,
        struct tree_mod_elem *cur = NULL;
        struct tree_mod_elem *found = NULL;
 
-       tree_mod_log_read_lock(fs_info);
+       read_lock(&fs_info->tree_mod_log_lock);
        tm_root = &fs_info->tree_mod_log;
        node = tm_root->rb_node;
        while (node) {
@@ -768,7 +738,7 @@ __tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq,
                        break;
                }
        }
-       tree_mod_log_read_unlock(fs_info);
+       read_unlock(&fs_info->tree_mod_log_lock);
 
        return found;
 }
@@ -849,7 +819,7 @@ tree_mod_log_eb_copy(struct btrfs_fs_info *fs_info, struct extent_buffer *dst,
                        goto free_tms;
        }
 
-       tree_mod_log_write_unlock(fs_info);
+       write_unlock(&fs_info->tree_mod_log_lock);
        kfree(tm_list);
 
        return 0;
@@ -861,36 +831,13 @@ free_tms:
                kfree(tm_list[i]);
        }
        if (locked)
-               tree_mod_log_write_unlock(fs_info);
+               write_unlock(&fs_info->tree_mod_log_lock);
        kfree(tm_list);
 
        return ret;
 }
 
-static inline void
-tree_mod_log_eb_move(struct btrfs_fs_info *fs_info, struct extent_buffer *dst,
-                    int dst_offset, int src_offset, int nr_items)
-{
-       int ret;
-       ret = tree_mod_log_insert_move(fs_info, dst, dst_offset, src_offset,
-                                      nr_items);
-       BUG_ON(ret < 0);
-}
-
-static noinline void
-tree_mod_log_set_node_key(struct btrfs_fs_info *fs_info,
-                         struct extent_buffer *eb, int slot, int atomic)
-{
-       int ret;
-
-       ret = tree_mod_log_insert_key(fs_info, eb, slot,
-                                       MOD_LOG_KEY_REPLACE,
-                                       atomic ? GFP_ATOMIC : GFP_NOFS);
-       BUG_ON(ret < 0);
-}
-
-static noinline int
-tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, struct extent_buffer *eb)
+static noinline int tree_mod_log_free_eb(struct extent_buffer *eb)
 {
        struct tree_mod_elem **tm_list = NULL;
        int nritems = 0;
@@ -900,7 +847,7 @@ tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, struct extent_buffer *eb)
        if (btrfs_header_level(eb) == 0)
                return 0;
 
-       if (!tree_mod_need_log(fs_info, NULL))
+       if (!tree_mod_need_log(eb->fs_info, NULL))
                return 0;
 
        nritems = btrfs_header_nritems(eb);
@@ -917,11 +864,11 @@ tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, struct extent_buffer *eb)
                }
        }
 
-       if (tree_mod_dont_log(fs_info, eb))
+       if (tree_mod_dont_log(eb->fs_info, eb))
                goto free_tms;
 
-       ret = __tree_mod_log_free_eb(fs_info, tm_list, nritems);
-       tree_mod_log_write_unlock(fs_info);
+       ret = __tree_mod_log_free_eb(eb->fs_info, tm_list, nritems);
+       write_unlock(&eb->fs_info->tree_mod_log_lock);
        if (ret)
                goto free_tms;
        kfree(tm_list);
@@ -936,17 +883,6 @@ free_tms:
        return ret;
 }
 
-static noinline void
-tree_mod_log_set_root_pointer(struct btrfs_root *root,
-                             struct extent_buffer *new_root_node,
-                             int log_removal)
-{
-       int ret;
-       ret = tree_mod_log_insert_root(root->fs_info, root->node,
-                                      new_root_node, log_removal);
-       BUG_ON(ret < 0);
-}
-
 /*
  * check if the tree block can be shared by multiple trees
  */
@@ -1173,7 +1109,8 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
                        parent_start = buf->start;
 
                extent_buffer_get(cow);
-               tree_mod_log_set_root_pointer(root, cow, 1);
+               ret = tree_mod_log_insert_root(root->node, cow, 1);
+               BUG_ON(ret < 0);
                rcu_assign_pointer(root->node, cow);
 
                btrfs_free_tree_block(trans, root, buf, parent_start,
@@ -1182,7 +1119,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
                add_root_to_dirty_list(root);
        } else {
                WARN_ON(trans->transid != btrfs_header_generation(parent));
-               tree_mod_log_insert_key(fs_info, parent, parent_slot,
+               tree_mod_log_insert_key(parent, parent_slot,
                                        MOD_LOG_KEY_REPLACE, GFP_NOFS);
                btrfs_set_node_blockptr(parent, parent_slot,
                                        cow->start);
@@ -1190,7 +1127,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
                                              trans->transid);
                btrfs_mark_buffer_dirty(parent);
                if (last_ref) {
-                       ret = tree_mod_log_free_eb(fs_info, buf);
+                       ret = tree_mod_log_free_eb(buf);
                        if (ret) {
                                btrfs_abort_transaction(trans, ret);
                                return ret;
@@ -1211,9 +1148,8 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
  * returns the logical address of the oldest predecessor of the given root.
  * entries older than time_seq are ignored.
  */
-static struct tree_mod_elem *
-__tree_mod_log_oldest_root(struct btrfs_fs_info *fs_info,
-                          struct extent_buffer *eb_root, u64 time_seq)
+static struct tree_mod_elem *__tree_mod_log_oldest_root(
+               struct extent_buffer *eb_root, u64 time_seq)
 {
        struct tree_mod_elem *tm;
        struct tree_mod_elem *found = NULL;
@@ -1230,7 +1166,7 @@ __tree_mod_log_oldest_root(struct btrfs_fs_info *fs_info,
         * first operation that's logged for this root.
         */
        while (1) {
-               tm = tree_mod_log_search_oldest(fs_info, root_logical,
+               tm = tree_mod_log_search_oldest(eb_root->fs_info, root_logical,
                                                time_seq);
                if (!looped && !tm)
                        return NULL;
@@ -1279,7 +1215,7 @@ __tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *eb,
        unsigned long p_size = sizeof(struct btrfs_key_ptr);
 
        n = btrfs_header_nritems(eb);
-       tree_mod_log_read_lock(fs_info);
+       read_lock(&fs_info->tree_mod_log_lock);
        while (tm && tm->seq >= time_seq) {
                /*
                 * all the operations are recorded with the operator used for
@@ -1334,7 +1270,7 @@ __tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *eb,
                if (tm->logical != first_tm->logical)
                        break;
        }
-       tree_mod_log_read_unlock(fs_info);
+       read_unlock(&fs_info->tree_mod_log_lock);
        btrfs_set_header_nritems(eb, n);
 }
 
@@ -1418,9 +1354,10 @@ get_old_root(struct btrfs_root *root, u64 time_seq)
        struct tree_mod_root *old_root = NULL;
        u64 old_generation = 0;
        u64 logical;
+       int level;
 
        eb_root = btrfs_read_lock_root_node(root);
-       tm = __tree_mod_log_oldest_root(fs_info, eb_root, time_seq);
+       tm = __tree_mod_log_oldest_root(eb_root, time_seq);
        if (!tm)
                return eb_root;
 
@@ -1428,15 +1365,17 @@ get_old_root(struct btrfs_root *root, u64 time_seq)
                old_root = &tm->old_root;
                old_generation = tm->generation;
                logical = old_root->logical;
+               level = old_root->level;
        } else {
                logical = eb_root->start;
+               level = btrfs_header_level(eb_root);
        }
 
        tm = tree_mod_log_search(fs_info, logical, time_seq);
        if (old_root && tm && tm->op != MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
                btrfs_tree_read_unlock(eb_root);
                free_extent_buffer(eb_root);
-               old = read_tree_block(fs_info, logical, 0);
+               old = read_tree_block(fs_info, logical, 0, level, NULL);
                if (WARN_ON(IS_ERR(old) || !extent_buffer_uptodate(old))) {
                        if (!IS_ERR(old))
                                free_extent_buffer(old);
@@ -1484,7 +1423,7 @@ int btrfs_old_root_level(struct btrfs_root *root, u64 time_seq)
        int level;
        struct extent_buffer *eb_root = btrfs_root_node(root);
 
-       tm = __tree_mod_log_oldest_root(root->fs_info, eb_root, time_seq);
+       tm = __tree_mod_log_oldest_root(eb_root, time_seq);
        if (tm && tm->op == MOD_LOG_ROOT_REPLACE) {
                level = tm->old_root.level;
        } else {
@@ -1502,8 +1441,8 @@ static inline int should_cow_block(struct btrfs_trans_handle *trans,
        if (btrfs_is_testing(root->fs_info))
                return 0;
 
-       /* ensure we can see the force_cow */
-       smp_rmb();
+       /* Ensure we can see the FORCE_COW bit */
+       smp_mb__before_atomic();
 
        /*
         * We do not need to cow a block if
@@ -1656,6 +1595,7 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
        btrfs_set_lock_blocking(parent);
 
        for (i = start_slot; i <= end_slot; i++) {
+               struct btrfs_key first_key;
                int close = 1;
 
                btrfs_node_key(parent, &disk_key, i);
@@ -1665,6 +1605,7 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
                progress_passed = 1;
                blocknr = btrfs_node_blockptr(parent, i);
                gen = btrfs_node_ptr_generation(parent, i);
+               btrfs_node_key_to_cpu(parent, &first_key, i);
                if (last_block == 0)
                        last_block = blocknr;
 
@@ -1688,7 +1629,9 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
                        uptodate = 0;
                if (!cur || !uptodate) {
                        if (!cur) {
-                               cur = read_tree_block(fs_info, blocknr, gen);
+                               cur = read_tree_block(fs_info, blocknr, gen,
+                                                     parent_level - 1,
+                                                     &first_key);
                                if (IS_ERR(cur)) {
                                        return PTR_ERR(cur);
                                } else if (!extent_buffer_uptodate(cur)) {
@@ -1696,7 +1639,8 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
                                        return -EIO;
                                }
                        } else if (!uptodate) {
-                               err = btrfs_read_buffer(cur, gen);
+                               err = btrfs_read_buffer(cur, gen,
+                                               parent_level - 1, &first_key);
                                if (err) {
                                        free_extent_buffer(cur);
                                        return err;
@@ -1849,14 +1793,17 @@ read_node_slot(struct btrfs_fs_info *fs_info, struct extent_buffer *parent,
 {
        int level = btrfs_header_level(parent);
        struct extent_buffer *eb;
+       struct btrfs_key first_key;
 
        if (slot < 0 || slot >= btrfs_header_nritems(parent))
                return ERR_PTR(-ENOENT);
 
        BUG_ON(level == 0);
 
+       btrfs_node_key_to_cpu(parent, &first_key, slot);
        eb = read_tree_block(fs_info, btrfs_node_blockptr(parent, slot),
-                            btrfs_node_ptr_generation(parent, slot));
+                            btrfs_node_ptr_generation(parent, slot),
+                            level - 1, &first_key);
        if (!IS_ERR(eb) && !extent_buffer_uptodate(eb)) {
                free_extent_buffer(eb);
                eb = ERR_PTR(-EIO);
@@ -1928,7 +1875,8 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
                        goto enospc;
                }
 
-               tree_mod_log_set_root_pointer(root, child, 1);
+               ret = tree_mod_log_insert_root(root->node, child, 1);
+               BUG_ON(ret < 0);
                rcu_assign_pointer(root->node, child);
 
                add_root_to_dirty_list(root);
@@ -2007,8 +1955,9 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
                } else {
                        struct btrfs_disk_key right_key;
                        btrfs_node_key(right, &right_key, 0);
-                       tree_mod_log_set_node_key(fs_info, parent,
-                                                 pslot + 1, 0);
+                       ret = tree_mod_log_insert_key(parent, pslot + 1,
+                                       MOD_LOG_KEY_REPLACE, GFP_NOFS);
+                       BUG_ON(ret < 0);
                        btrfs_set_node_key(parent, &right_key, pslot + 1);
                        btrfs_mark_buffer_dirty(parent);
                }
@@ -2052,7 +2001,9 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
                /* update the parent key to reflect our changes */
                struct btrfs_disk_key mid_key;
                btrfs_node_key(mid, &mid_key, 0);
-               tree_mod_log_set_node_key(fs_info, parent, pslot, 0);
+               ret = tree_mod_log_insert_key(parent, pslot,
+                               MOD_LOG_KEY_REPLACE, GFP_NOFS);
+               BUG_ON(ret < 0);
                btrfs_set_node_key(parent, &mid_key, pslot);
                btrfs_mark_buffer_dirty(parent);
        }
@@ -2153,7 +2104,9 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
                        struct btrfs_disk_key disk_key;
                        orig_slot += left_nr;
                        btrfs_node_key(mid, &disk_key, 0);
-                       tree_mod_log_set_node_key(fs_info, parent, pslot, 0);
+                       ret = tree_mod_log_insert_key(parent, pslot,
+                                       MOD_LOG_KEY_REPLACE, GFP_NOFS);
+                       BUG_ON(ret < 0);
                        btrfs_set_node_key(parent, &disk_key, pslot);
                        btrfs_mark_buffer_dirty(parent);
                        if (btrfs_header_nritems(left) > orig_slot) {
@@ -2207,8 +2160,9 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
                        struct btrfs_disk_key disk_key;
 
                        btrfs_node_key(right, &disk_key, 0);
-                       tree_mod_log_set_node_key(fs_info, parent,
-                                                 pslot + 1, 0);
+                       ret = tree_mod_log_insert_key(parent, pslot + 1,
+                                       MOD_LOG_KEY_REPLACE, GFP_NOFS);
+                       BUG_ON(ret < 0);
                        btrfs_set_node_key(parent, &disk_key, pslot + 1);
                        btrfs_mark_buffer_dirty(parent);
 
@@ -2445,10 +2399,14 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
        u64 gen;
        struct extent_buffer *b = *eb_ret;
        struct extent_buffer *tmp;
+       struct btrfs_key first_key;
        int ret;
+       int parent_level;
 
        blocknr = btrfs_node_blockptr(b, slot);
        gen = btrfs_node_ptr_generation(b, slot);
+       parent_level = btrfs_header_level(b);
+       btrfs_node_key_to_cpu(b, &first_key, slot);
 
        tmp = find_extent_buffer(fs_info, blocknr);
        if (tmp) {
@@ -2467,7 +2425,7 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
                btrfs_set_path_blocking(p);
 
                /* now we're allowed to do a blocking uptodate check */
-               ret = btrfs_read_buffer(tmp, gen);
+               ret = btrfs_read_buffer(tmp, gen, parent_level - 1, &first_key);
                if (!ret) {
                        *eb_ret = tmp;
                        return 0;
@@ -2494,7 +2452,8 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
        btrfs_release_path(p);
 
        ret = -EAGAIN;
-       tmp = read_tree_block(fs_info, blocknr, 0);
+       tmp = read_tree_block(fs_info, blocknr, 0, parent_level - 1,
+                             &first_key);
        if (!IS_ERR(tmp)) {
                /*
                 * If the read above didn't mark this buffer up to date,
@@ -3161,13 +3120,17 @@ static void fixup_low_keys(struct btrfs_fs_info *fs_info,
 {
        int i;
        struct extent_buffer *t;
+       int ret;
 
        for (i = level; i < BTRFS_MAX_LEVEL; i++) {
                int tslot = path->slots[i];
+
                if (!path->nodes[i])
                        break;
                t = path->nodes[i];
-               tree_mod_log_set_node_key(fs_info, t, tslot, 1);
+               ret = tree_mod_log_insert_key(t, tslot, MOD_LOG_KEY_REPLACE,
+                               GFP_ATOMIC);
+               BUG_ON(ret < 0);
                btrfs_set_node_key(t, key, tslot);
                btrfs_mark_buffer_dirty(path->nodes[i]);
                if (tslot != 0)
@@ -3264,8 +3227,8 @@ static int push_node_left(struct btrfs_trans_handle *trans,
 
        if (push_items < src_nritems) {
                /*
-                * don't call tree_mod_log_eb_move here, key removal was already
-                * fully logged by tree_mod_log_eb_copy above.
+                * Don't call tree_mod_log_insert_move here, key removal was
+                * already fully logged by tree_mod_log_eb_copy above.
                 */
                memmove_extent_buffer(src, btrfs_node_key_ptr_offset(0),
                                      btrfs_node_key_ptr_offset(push_items),
@@ -3320,7 +3283,8 @@ static int balance_node_right(struct btrfs_trans_handle *trans,
        if (max_push < push_items)
                push_items = max_push;
 
-       tree_mod_log_eb_move(fs_info, dst, push_items, 0, dst_nritems);
+       ret = tree_mod_log_insert_move(dst, push_items, 0, dst_nritems);
+       BUG_ON(ret < 0);
        memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(push_items),
                                      btrfs_node_key_ptr_offset(0),
                                      (dst_nritems) *
@@ -3363,6 +3327,7 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
        struct extent_buffer *c;
        struct extent_buffer *old;
        struct btrfs_disk_key lower_key;
+       int ret;
 
        BUG_ON(path->nodes[level]);
        BUG_ON(path->nodes[level-1] != root->node);
@@ -3401,7 +3366,8 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
        btrfs_mark_buffer_dirty(c);
 
        old = root->node;
-       tree_mod_log_set_root_pointer(root, c, 0);
+       ret = tree_mod_log_insert_root(root->node, c, 0);
+       BUG_ON(ret < 0);
        rcu_assign_pointer(root->node, c);
 
        /* the super has an extra ref to root->node */
@@ -3438,17 +3404,19 @@ static void insert_ptr(struct btrfs_trans_handle *trans,
        BUG_ON(slot > nritems);
        BUG_ON(nritems == BTRFS_NODEPTRS_PER_BLOCK(fs_info));
        if (slot != nritems) {
-               if (level)
-                       tree_mod_log_eb_move(fs_info, lower, slot + 1,
-                                            slot, nritems - slot);
+               if (level) {
+                       ret = tree_mod_log_insert_move(lower, slot + 1, slot,
+                                       nritems - slot);
+                       BUG_ON(ret < 0);
+               }
                memmove_extent_buffer(lower,
                              btrfs_node_key_ptr_offset(slot + 1),
                              btrfs_node_key_ptr_offset(slot),
                              (nritems - slot) * sizeof(struct btrfs_key_ptr));
        }
        if (level) {
-               ret = tree_mod_log_insert_key(fs_info, lower, slot,
-                                             MOD_LOG_KEY_ADD, GFP_NOFS);
+               ret = tree_mod_log_insert_key(lower, slot, MOD_LOG_KEY_ADD,
+                               GFP_NOFS);
                BUG_ON(ret < 0);
        }
        btrfs_set_node_key(lower, key, slot);
@@ -4911,17 +4879,19 @@ static void del_ptr(struct btrfs_root *root, struct btrfs_path *path,
 
        nritems = btrfs_header_nritems(parent);
        if (slot != nritems - 1) {
-               if (level)
-                       tree_mod_log_eb_move(fs_info, parent, slot,
-                                            slot + 1, nritems - slot - 1);
+               if (level) {
+                       ret = tree_mod_log_insert_move(parent, slot, slot + 1,
+                                       nritems - slot - 1);
+                       BUG_ON(ret < 0);
+               }
                memmove_extent_buffer(parent,
                              btrfs_node_key_ptr_offset(slot),
                              btrfs_node_key_ptr_offset(slot + 1),
                              sizeof(struct btrfs_key_ptr) *
                              (nritems - slot - 1));
        } else if (level) {
-               ret = tree_mod_log_insert_key(fs_info, parent, slot,
-                                             MOD_LOG_KEY_REMOVE, GFP_NOFS);
+               ret = tree_mod_log_insert_key(parent, slot, MOD_LOG_KEY_REMOVE,
+                               GFP_NOFS);
                BUG_ON(ret < 0);
        }
 
@@ -5145,9 +5115,6 @@ int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
  * into min_key, so you can call btrfs_search_slot with cow=1 on the
  * key and get a writable path.
  *
- * This does lock as it descends, and path->keep_locks should be set
- * to 1 by the caller.
- *
  * This honors path->lowest_level to prevent descent past a given level
  * of the tree.
  *
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index da308774b8a4538c4bbea595a11b49a2a8c5ca5e..0eb55825862a7bac24a6b64c901dfbdaf70bf103 100644
@@ -40,6 +40,7 @@
 #include <linux/sizes.h>
 #include <linux/dynamic_debug.h>
 #include <linux/refcount.h>
+#include <linux/crc32c.h>
 #include "extent_io.h"
 #include "extent_map.h"
 #include "async-thread.h"
@@ -65,6 +66,8 @@ struct btrfs_ordered_sum;
 
 #define BTRFS_MAX_LEVEL 8
 
+#define BTRFS_OLDEST_GENERATION        0ULL
+
 #define BTRFS_COMPAT_EXTENT_TREE_V0
 
 /*
@@ -86,9 +89,9 @@ struct btrfs_ordered_sum;
  */
 #define BTRFS_LINK_MAX 65535U
 
+/* four bytes for CRC32 */
 static const int btrfs_csum_sizes[] = { 4 };
 
-/* four bytes for CRC32 */
 #define BTRFS_EMPTY_DIR_SIZE 0
 
 /* ioprio of readahead is set to idle */
@@ -98,6 +101,7 @@ static const int btrfs_csum_sizes[] = { 4 };
 
 #define BTRFS_MAX_EXTENT_SIZE SZ_128M
 
+
 /*
  * Count how many BTRFS_MAX_EXTENT_SIZE cover the @size
  */
@@ -381,8 +385,9 @@ struct btrfs_dev_replace {
 
 /* For raid type sysfs entries */
 struct raid_kobject {
-       int raid_type;
+       u64 flags;
        struct kobject kobj;
+       struct list_head list;
 };
 
 struct btrfs_space_info {
@@ -707,7 +712,6 @@ struct btrfs_delayed_root;
 #define BTRFS_FS_LOG_RECOVERING                        4
 #define BTRFS_FS_OPEN                          5
 #define BTRFS_FS_QUOTA_ENABLED                 6
-#define BTRFS_FS_QUOTA_ENABLING                        7
 #define BTRFS_FS_UPDATE_UUID_TREE_GEN          9
 #define BTRFS_FS_CREATING_FREE_SPACE_TREE      10
 #define BTRFS_FS_BTREE_ERR                     11
@@ -788,7 +792,7 @@ struct btrfs_fs_info {
        unsigned long pending_changes;
        unsigned long compress_type:4;
        unsigned int compress_level;
-       int commit_interval;
+       u32 commit_interval;
        /*
         * It is a suggestive number, the read side is safe even it gets a
         * wrong number because we will write out the data into a regular
@@ -877,7 +881,6 @@ struct btrfs_fs_info {
        struct rb_root tree_mod_log;
 
        atomic_t async_delalloc_pages;
-       atomic_t open_ioctl_trans;
 
        /*
         * this is used to protect the following list -- ordered_roots.
@@ -935,9 +938,11 @@ struct btrfs_fs_info {
        struct btrfs_workqueue *extent_workers;
        struct task_struct *transaction_kthread;
        struct task_struct *cleaner_kthread;
-       int thread_pool_size;
+       u32 thread_pool_size;
 
        struct kobject *space_info_kobj;
+       struct list_head pending_raid_kobjs;
+       spinlock_t pending_raid_kobjs_lock; /* uncontended */
 
        u64 total_pinned;
 
@@ -952,9 +957,9 @@ struct btrfs_fs_info {
        struct btrfs_fs_devices *fs_devices;
 
        /*
-        * the space_info list is almost entirely read only.  It only changes
-        * when we add a new raid type to the FS, and that happens
-        * very rarely.  RCU is used to protect it.
+        * The space_info list is effectively read only after initial
+        * setup.  It is populated at mount time and cleaned up after
+        * all block groups are removed.  RCU is used to protect it.
         */
        struct list_head space_info;
 
@@ -993,8 +998,8 @@ struct btrfs_fs_info {
        struct btrfs_balance_control *balance_ctl;
        wait_queue_head_t balance_wait_q;
 
-       unsigned data_chunk_allocations;
-       unsigned metadata_ratio;
+       u32 data_chunk_allocations;
+       u32 metadata_ratio;
 
        void *bdev_holder;
 
@@ -1260,12 +1265,13 @@ struct btrfs_root {
        struct btrfs_subvolume_writers *subv_writers;
        atomic_t will_be_snapshotted;
 
-       /* For qgroup metadata space reserve */
-       atomic64_t qgroup_meta_rsv;
+       /* For qgroup metadata reserved space */
+       spinlock_t qgroup_meta_rsv_lock;
+       u64 qgroup_meta_rsv_pertrans;
+       u64 qgroup_meta_rsv_prealloc;
 };
 
 struct btrfs_file_private {
-       struct btrfs_trans_handle *trans;
        void *filldir_buf;
 };
 
@@ -2554,6 +2560,20 @@ BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_cursor_right,
        ((unsigned long)(BTRFS_LEAF_DATA_OFFSET + \
        btrfs_item_offset_nr(leaf, slot)))
 
+static inline u64 btrfs_name_hash(const char *name, int len)
+{
+       return crc32c((u32)~1, name, len);
+}
+
+/*
+ * Figure the key offset of an extended inode ref
+ */
+static inline u64 btrfs_extref_hash(u64 parent_objectid, const char *name,
+                                   int len)
+{
+       return (u64) crc32c(parent_objectid, name, len);
+}
+
 static inline bool btrfs_mixed_space_info(struct btrfs_space_info *space_info)
 {
        return ((space_info->flags & BTRFS_BLOCK_GROUP_METADATA) &&
@@ -2608,7 +2628,7 @@ void btrfs_dec_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr);
 void btrfs_wait_nocow_writers(struct btrfs_block_group_cache *bg);
 void btrfs_put_block_group(struct btrfs_block_group_cache *cache);
 int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
-                          struct btrfs_fs_info *fs_info, unsigned long count);
+                          unsigned long count);
 int btrfs_async_run_delayed_refs(struct btrfs_fs_info *fs_info,
                                 unsigned long count, u64 transid, int wait);
 int btrfs_lookup_data_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len);
@@ -2628,7 +2648,6 @@ struct btrfs_block_group_cache *btrfs_lookup_block_group(
                                                 u64 bytenr);
 void btrfs_get_block_group(struct btrfs_block_group_cache *cache);
 void btrfs_put_block_group(struct btrfs_block_group_cache *cache);
-int get_block_group_index(struct btrfs_block_group_cache *cache);
 struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
                                             struct btrfs_root *root,
                                             u64 parent, u64 root_objectid,
@@ -2668,15 +2687,13 @@ int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info,
 int btrfs_free_and_pin_reserved_extent(struct btrfs_fs_info *fs_info,
                                       u64 start, u64 len);
 void btrfs_prepare_extent_commit(struct btrfs_fs_info *fs_info);
-int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
-                              struct btrfs_fs_info *fs_info);
+int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans);
 int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
                         struct btrfs_root *root,
                         u64 bytenr, u64 num_bytes, u64 parent,
                         u64 root_objectid, u64 owner, u64 offset);
 
-int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans,
-                                  struct btrfs_fs_info *fs_info);
+int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans);
 int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
                                   struct btrfs_fs_info *fs_info);
 int btrfs_setup_space_cache(struct btrfs_trans_handle *trans,
@@ -2688,6 +2705,7 @@ int btrfs_can_relocate(struct btrfs_fs_info *fs_info, u64 bytenr);
 int btrfs_make_block_group(struct btrfs_trans_handle *trans,
                           struct btrfs_fs_info *fs_info, u64 bytes_used,
                           u64 type, u64 chunk_offset, u64 size);
+void btrfs_add_raid_kobjects(struct btrfs_fs_info *fs_info);
 struct btrfs_trans_handle *btrfs_start_trans_remove_block_group(
                                struct btrfs_fs_info *fs_info,
                                const u64 chunk_offset);
@@ -2697,8 +2715,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
 void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info);
 void btrfs_get_block_group_trimming(struct btrfs_block_group_cache *cache);
 void btrfs_put_block_group_trimming(struct btrfs_block_group_cache *cache);
-void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans,
-                                      struct btrfs_fs_info *fs_info);
+void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans);
 u64 btrfs_data_alloc_profile(struct btrfs_fs_info *fs_info);
 u64 btrfs_metadata_alloc_profile(struct btrfs_fs_info *fs_info);
 u64 btrfs_system_alloc_profile(struct btrfs_fs_info *fs_info);
@@ -2730,11 +2747,10 @@ int btrfs_check_data_free_space(struct inode *inode,
 void btrfs_free_reserved_data_space(struct inode *inode,
                        struct extent_changeset *reserved, u64 start, u64 len);
 void btrfs_delalloc_release_space(struct inode *inode,
-                       struct extent_changeset *reserved, u64 start, u64 len);
+                                 struct extent_changeset *reserved,
+                                 u64 start, u64 len, bool qgroup_free);
 void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start,
                                            u64 len);
-void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
-                                 struct btrfs_fs_info *fs_info);
 void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans);
 int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans,
                                  struct btrfs_inode *inode);
@@ -2745,10 +2761,12 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
                                     u64 *qgroup_reserved, bool use_global_rsv);
 void btrfs_subvolume_release_metadata(struct btrfs_fs_info *fs_info,
                                      struct btrfs_block_rsv *rsv);
-void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes);
+void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes,
+                                   bool qgroup_free);
 
 int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes);
-void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes);
+void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes,
+                                    bool qgroup_free);
 int btrfs_delalloc_reserve_space(struct inode *inode,
                        struct extent_changeset **reserved, u64 start, u64 len);
 void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type);
@@ -2792,7 +2810,6 @@ int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range);
 int btrfs_init_space_info(struct btrfs_fs_info *fs_info);
 int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans,
                                         struct btrfs_fs_info *fs_info);
-int __get_raid_index(u64 flags);
 int btrfs_start_write_no_snapshotting(struct btrfs_root *root);
 void btrfs_end_write_no_snapshotting(struct btrfs_root *root);
 void btrfs_wait_for_snapshot_creation(struct btrfs_root *root);
@@ -3195,8 +3212,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb);
 void btrfs_destroy_inode(struct inode *inode);
 int btrfs_drop_inode(struct inode *inode);
 int __init btrfs_init_cachep(void);
-void btrfs_destroy_cachep(void);
-long btrfs_ioctl_trans_end(struct file *file);
+void __cold btrfs_destroy_cachep(void);
 struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
                         struct btrfs_root *root, int *was_new);
 struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
@@ -3246,7 +3262,7 @@ ssize_t btrfs_dedupe_file_range(struct file *src_file, u64 loff, u64 olen,
 
 /* file.c */
 int __init btrfs_auto_defrag_init(void);
-void btrfs_auto_defrag_exit(void);
+void __cold btrfs_auto_defrag_exit(void);
 int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
                           struct btrfs_inode *inode);
 int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info);
@@ -3281,25 +3297,23 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
 
 /* sysfs.c */
 int __init btrfs_init_sysfs(void);
-void btrfs_exit_sysfs(void);
+void __cold btrfs_exit_sysfs(void);
 int btrfs_sysfs_add_mounted(struct btrfs_fs_info *fs_info);
 void btrfs_sysfs_remove_mounted(struct btrfs_fs_info *fs_info);
 
-/* xattr.c */
-ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size);
-
 /* super.c */
 int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
                        unsigned long new_flags);
 int btrfs_sync_fs(struct super_block *sb, int wait);
 
-static inline __printf(2, 3)
+static inline __printf(2, 3) __cold
 void btrfs_no_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...)
 {
 }
 
 #ifdef CONFIG_PRINTK
 __printf(2, 3)
+__cold
 void btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...);
 #else
 #define btrfs_printk(fs_info, fmt, args...) \
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index 0530f6f2e4ba8bff7e93cd11151db804ca70ffac..86ec2edc05e873a0b2b07c04c72dd0f43276e378 100644
@@ -23,6 +23,7 @@
 #include "disk-io.h"
 #include "transaction.h"
 #include "ctree.h"
+#include "qgroup.h"
 
 #define BTRFS_DELAYED_WRITEBACK                512
 #define BTRFS_DELAYED_BACKGROUND       128
@@ -42,7 +43,7 @@ int __init btrfs_delayed_inode_init(void)
        return 0;
 }
 
-void btrfs_delayed_inode_exit(void)
+void __cold btrfs_delayed_inode_exit(void)
 {
        kmem_cache_destroy(delayed_node_cache);
 }
@@ -552,11 +553,12 @@ static struct btrfs_delayed_item *__btrfs_next_delayed_item(
 }
 
 static int btrfs_delayed_item_reserve_metadata(struct btrfs_trans_handle *trans,
-                                              struct btrfs_fs_info *fs_info,
+                                              struct btrfs_root *root,
                                               struct btrfs_delayed_item *item)
 {
        struct btrfs_block_rsv *src_rsv;
        struct btrfs_block_rsv *dst_rsv;
+       struct btrfs_fs_info *fs_info = root->fs_info;
        u64 num_bytes;
        int ret;
 
@@ -578,15 +580,17 @@ static int btrfs_delayed_item_reserve_metadata(struct btrfs_trans_handle *trans,
        return ret;
 }
 
-static void btrfs_delayed_item_release_metadata(struct btrfs_fs_info *fs_info,
+static void btrfs_delayed_item_release_metadata(struct btrfs_root *root,
                                                struct btrfs_delayed_item *item)
 {
        struct btrfs_block_rsv *rsv;
+       struct btrfs_fs_info *fs_info = root->fs_info;
 
        if (!item->bytes_reserved)
                return;
 
        rsv = &fs_info->delayed_block_rsv;
+       btrfs_qgroup_convert_reserved_meta(root, item->bytes_reserved);
        trace_btrfs_space_reservation(fs_info, "delayed_item",
                                      item->key.objectid, item->bytes_reserved,
                                      0);
@@ -611,6 +615,9 @@ static int btrfs_delayed_inode_reserve_metadata(
 
        num_bytes = btrfs_calc_trans_metadata_size(fs_info, 1);
 
+       ret = btrfs_qgroup_reserve_meta_prealloc(root, num_bytes, true);
+       if (ret < 0)
+               return ret;
        /*
         * btrfs_dirty_inode will update the inode under btrfs_join_transaction
         * which doesn't reserve space for speed.  This is a problem since we
@@ -630,8 +637,10 @@ static int btrfs_delayed_inode_reserve_metadata(
                 * EAGAIN to make us stop the transaction we have, so return
                 * ENOSPC instead so that btrfs_dirty_inode knows what to do.
                 */
-               if (ret == -EAGAIN)
+               if (ret == -EAGAIN) {
                        ret = -ENOSPC;
+                       btrfs_qgroup_free_meta_prealloc(root, num_bytes);
+               }
                if (!ret) {
                        node->bytes_reserved = num_bytes;
                        trace_btrfs_space_reservation(fs_info,
@@ -653,7 +662,8 @@ static int btrfs_delayed_inode_reserve_metadata(
 }
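
Note the ordering established above: the qgroup prealloc reservation is taken before the block reservation, so every failure path that follows, including the -EAGAIN to -ENOSPC translation, must hand the qgroup bytes back before returning. A hedged outline of the pairing, with the block-rsv step reduced to a hypothetical placeholder:

        ret = btrfs_qgroup_reserve_meta_prealloc(root, num_bytes, true);
        if (ret < 0)
                return ret;

        ret = reserve_from_block_rsv(trans, node);      /* hypothetical step */
        if (ret < 0) {
                /* undo the qgroup side, or the reservation leaks */
                btrfs_qgroup_free_meta_prealloc(root, num_bytes);
                return ret;
        }
        node->bytes_reserved = num_bytes;
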
 
 static void btrfs_delayed_inode_release_metadata(struct btrfs_fs_info *fs_info,
-                                               struct btrfs_delayed_node *node)
+                                               struct btrfs_delayed_node *node,
+                                               bool qgroup_free)
 {
        struct btrfs_block_rsv *rsv;
 
@@ -665,6 +675,12 @@ static void btrfs_delayed_inode_release_metadata(struct btrfs_fs_info *fs_info,
                                      node->inode_id, node->bytes_reserved, 0);
        btrfs_block_rsv_release(fs_info, rsv,
                                node->bytes_reserved);
+       if (qgroup_free)
+               btrfs_qgroup_free_meta_prealloc(node->root,
+                               node->bytes_reserved);
+       else
+               btrfs_qgroup_convert_reserved_meta(node->root,
+                               node->bytes_reserved);
        node->bytes_reserved = 0;
 }
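
The new qgroup_free flag selects between the two legitimate endings of a PREALLOC metadata reservation: freed back to the quota pool when the inode update failed, or converted to a PERTRANS reservation, released later at transaction commit, when it succeeded. A condensed sketch of the caller contract; the helper name is illustrative, and the real call site appears in the err_out hunk further down:

        ret = run_delayed_inode_update(trans, node);    /* hypothetical */
        btrfs_delayed_inode_release_metadata(fs_info, node,
                                             ret < 0 /* qgroup_free */);
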
 
@@ -766,7 +782,7 @@ static int btrfs_batch_insert_items(struct btrfs_root *root,
                                    curr->data_len);
                slot++;
 
-               btrfs_delayed_item_release_metadata(fs_info, curr);
+               btrfs_delayed_item_release_metadata(root, curr);
 
                list_del(&curr->tree_list);
                btrfs_release_delayed_item(curr);
@@ -788,7 +804,6 @@ static int btrfs_insert_delayed_item(struct btrfs_trans_handle *trans,
                                     struct btrfs_path *path,
                                     struct btrfs_delayed_item *delayed_item)
 {
-       struct btrfs_fs_info *fs_info = root->fs_info;
        struct extent_buffer *leaf;
        char *ptr;
        int ret;
@@ -806,7 +821,7 @@ static int btrfs_insert_delayed_item(struct btrfs_trans_handle *trans,
                            delayed_item->data_len);
        btrfs_mark_buffer_dirty(leaf);
 
-       btrfs_delayed_item_release_metadata(fs_info, delayed_item);
+       btrfs_delayed_item_release_metadata(root, delayed_item);
        return 0;
 }
 
@@ -858,7 +873,6 @@ static int btrfs_batch_delete_items(struct btrfs_trans_handle *trans,
                                    struct btrfs_path *path,
                                    struct btrfs_delayed_item *item)
 {
-       struct btrfs_fs_info *fs_info = root->fs_info;
        struct btrfs_delayed_item *curr, *next;
        struct extent_buffer *leaf;
        struct btrfs_key key;
@@ -908,7 +922,7 @@ static int btrfs_batch_delete_items(struct btrfs_trans_handle *trans,
                goto out;
 
        list_for_each_entry_safe(curr, next, &head, tree_list) {
-               btrfs_delayed_item_release_metadata(fs_info, curr);
+               btrfs_delayed_item_release_metadata(root, curr);
                list_del(&curr->tree_list);
                btrfs_release_delayed_item(curr);
        }
@@ -1051,7 +1065,7 @@ out:
 no_iref:
        btrfs_release_path(path);
 err_out:
-       btrfs_delayed_inode_release_metadata(fs_info, node);
+       btrfs_delayed_inode_release_metadata(fs_info, node, (ret < 0));
        btrfs_release_delayed_inode(node);
 
        return ret;
@@ -1115,9 +1129,9 @@ __btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans,
  * Returns < 0 on error, leaving the transaction aborted and any
  * outstanding delayed items cleaned up.
  */
-static int __btrfs_run_delayed_items(struct btrfs_trans_handle *trans,
-                                    struct btrfs_fs_info *fs_info, int nr)
+static int __btrfs_run_delayed_items(struct btrfs_trans_handle *trans, int nr)
 {
+       struct btrfs_fs_info *fs_info = trans->fs_info;
        struct btrfs_delayed_root *delayed_root;
        struct btrfs_delayed_node *curr_node, *prev_node;
        struct btrfs_path *path;
@@ -1162,16 +1176,14 @@ static int __btrfs_run_delayed_items(struct btrfs_trans_handle *trans,
        return ret;
 }
 
-int btrfs_run_delayed_items(struct btrfs_trans_handle *trans,
-                           struct btrfs_fs_info *fs_info)
+int btrfs_run_delayed_items(struct btrfs_trans_handle *trans)
 {
-       return __btrfs_run_delayed_items(trans, fs_info, -1);
+       return __btrfs_run_delayed_items(trans, -1);
 }
 
-int btrfs_run_delayed_items_nr(struct btrfs_trans_handle *trans,
-                              struct btrfs_fs_info *fs_info, int nr)
+int btrfs_run_delayed_items_nr(struct btrfs_trans_handle *trans, int nr)
 {
-       return __btrfs_run_delayed_items(trans, fs_info, nr);
+       return __btrfs_run_delayed_items(trans, nr);
 }
 
 int btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans,
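
Dropping the fs_info argument is safe because a transaction handle is bound to exactly one filesystem, so the callee can derive it; it also removes the possibility of a caller passing a mismatched pair. The pattern as applied throughout this series, with an illustrative helper name:

static int some_trans_helper(struct btrfs_trans_handle *trans, int nr)
{
        struct btrfs_fs_info *fs_info = trans->fs_info; /* derive, don't pass */

        btrfs_debug(fs_info, "running %d items", nr);   /* illustrative use */
        return 0;
}
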
@@ -1443,7 +1455,7 @@ int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans,
        btrfs_set_stack_dir_type(dir_item, type);
        memcpy((char *)(dir_item + 1), name, name_len);
 
-       ret = btrfs_delayed_item_reserve_metadata(trans, fs_info, delayed_item);
+       ret = btrfs_delayed_item_reserve_metadata(trans, dir->root, delayed_item);
        /*
         * we have reserved enough space when we start a new transaction,
         * so a metadata reservation failure here is impossible
@@ -1480,7 +1492,7 @@ static int btrfs_delete_delayed_insertion_item(struct btrfs_fs_info *fs_info,
                return 1;
        }
 
-       btrfs_delayed_item_release_metadata(fs_info, item);
+       btrfs_delayed_item_release_metadata(node->root, item);
        btrfs_release_delayed_item(item);
        mutex_unlock(&node->mutex);
        return 0;
@@ -1515,7 +1527,7 @@ int btrfs_delete_delayed_dir_index(struct btrfs_trans_handle *trans,
 
        item->key = item_key;
 
-       ret = btrfs_delayed_item_reserve_metadata(trans, fs_info, item);
+       ret = btrfs_delayed_item_reserve_metadata(trans, dir->root, item);
        /*
         * we have reserved enough space when we start a new transaction,
         * so a metadata reservation failure here is impossible.
@@ -1880,7 +1892,7 @@ static void __btrfs_kill_delayed_node(struct btrfs_delayed_node *delayed_node)
        mutex_lock(&delayed_node->mutex);
        curr_item = __btrfs_first_delayed_insertion_item(delayed_node);
        while (curr_item) {
-               btrfs_delayed_item_release_metadata(fs_info, curr_item);
+               btrfs_delayed_item_release_metadata(root, curr_item);
                prev_item = curr_item;
                curr_item = __btrfs_next_delayed_item(prev_item);
                btrfs_release_delayed_item(prev_item);
@@ -1888,7 +1900,7 @@ static void __btrfs_kill_delayed_node(struct btrfs_delayed_node *delayed_node)
 
        curr_item = __btrfs_first_delayed_deletion_item(delayed_node);
        while (curr_item) {
-               btrfs_delayed_item_release_metadata(fs_info, curr_item);
+               btrfs_delayed_item_release_metadata(root, curr_item);
                prev_item = curr_item;
                curr_item = __btrfs_next_delayed_item(prev_item);
                btrfs_release_delayed_item(prev_item);
@@ -1898,7 +1910,7 @@ static void __btrfs_kill_delayed_node(struct btrfs_delayed_node *delayed_node)
                btrfs_release_delayed_iref(delayed_node);
 
        if (test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags)) {
-               btrfs_delayed_inode_release_metadata(fs_info, delayed_node);
+               btrfs_delayed_inode_release_metadata(fs_info, delayed_node, false);
                btrfs_release_delayed_inode(delayed_node);
        }
        mutex_unlock(&delayed_node->mutex);
index c4189d4959343219eadf1db68d8aa5dd3f8d6ee4..100a91e26b557d9ad86f40bc97f7f59028470aa1 100644 (file)
@@ -111,10 +111,8 @@ int btrfs_delete_delayed_dir_index(struct btrfs_trans_handle *trans,
 
 int btrfs_inode_delayed_dir_index_count(struct btrfs_inode *inode);
 
-int btrfs_run_delayed_items(struct btrfs_trans_handle *trans,
-                           struct btrfs_fs_info *fs_info);
-int btrfs_run_delayed_items_nr(struct btrfs_trans_handle *trans,
-                              struct btrfs_fs_info *fs_info, int nr);
+int btrfs_run_delayed_items(struct btrfs_trans_handle *trans);
+int btrfs_run_delayed_items_nr(struct btrfs_trans_handle *trans, int nr);
 
 void btrfs_balance_delayed_items(struct btrfs_fs_info *fs_info);
 
@@ -151,7 +149,7 @@ int btrfs_readdir_delayed_dir_index(struct dir_context *ctx,
 
 /* for init */
 int __init btrfs_delayed_inode_init(void);
-void btrfs_delayed_inode_exit(void);
+void __cold btrfs_delayed_inode_exit(void);
 
 /* for debugging */
 void btrfs_assert_delayed_root_empty(struct btrfs_fs_info *fs_info);
index 7ab5e0128f0ce823101f5623a0664c01208f89a7..2677257c149dcf1f2e5419ba486b0b2b760ec42c 100644 (file)
@@ -216,7 +216,7 @@ int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans,
        struct btrfs_delayed_ref_root *delayed_refs;
 
        delayed_refs = &trans->transaction->delayed_refs;
-       assert_spin_locked(&delayed_refs->lock);
+       lockdep_assert_held(&delayed_refs->lock);
        if (mutex_trylock(&head->mutex))
                return 0;
 
@@ -239,7 +239,7 @@ static inline void drop_delayed_ref(struct btrfs_trans_handle *trans,
                                    struct btrfs_delayed_ref_head *head,
                                    struct btrfs_delayed_ref_node *ref)
 {
-       assert_spin_locked(&head->lock);
+       lockdep_assert_held(&head->lock);
        rb_erase(&ref->ref_node, &head->ref_tree);
        RB_CLEAR_NODE(&ref->ref_node);
        if (!list_empty(&ref->add_list))
@@ -307,7 +307,7 @@ void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans,
        struct rb_node *node;
        u64 seq = 0;
 
-       assert_spin_locked(&head->lock);
+       lockdep_assert_held(&head->lock);
 
        if (RB_EMPTY_ROOT(&head->ref_tree))
                return;
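
Replacing assert_spin_locked() with lockdep_assert_held() is a strict improvement: the former only verifies that the lock is held by somebody, while the latter verifies that the current task holds it, and it compiles away entirely when CONFIG_LOCKDEP is off. A usage sketch with an illustrative function name:

static void mutate_ref_tree(struct btrfs_delayed_ref_head *head)
{
        lockdep_assert_held(&head->lock);  /* no-op unless CONFIG_LOCKDEP=y */
        /* safe to modify head->ref_tree here */
}
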
@@ -930,7 +930,7 @@ btrfs_find_delayed_ref_head(struct btrfs_delayed_ref_root *delayed_refs, u64 byt
        return find_ref_head(&delayed_refs->href_root, bytenr, 0);
 }
 
-void btrfs_delayed_ref_exit(void)
+void __cold btrfs_delayed_ref_exit(void)
 {
        kmem_cache_destroy(btrfs_delayed_ref_head_cachep);
        kmem_cache_destroy(btrfs_delayed_tree_ref_cachep);
index c4f625e5a691c74927d902d7de72cb93f8669ff9..9e3e5aff0937c44bbd1e291931087d4a272dd23b 100644 (file)
@@ -204,7 +204,7 @@ extern struct kmem_cache *btrfs_delayed_data_ref_cachep;
 extern struct kmem_cache *btrfs_delayed_extent_op_cachep;
 
 int __init btrfs_delayed_ref_init(void);
-void btrfs_delayed_ref_exit(void);
+void __cold btrfs_delayed_ref_exit(void);
 
 static inline struct btrfs_delayed_extent_op *
 btrfs_alloc_delayed_extent_op(void)
index 7efbc4d1128b222f0d18c18998b0f955debc2ac9..0d203633bb96493dbc8abda9bb4fe4bf9bb2f3cd 100644 (file)
@@ -44,7 +44,6 @@ static void btrfs_dev_replace_update_device_in_mapping_tree(
                                                struct btrfs_fs_info *fs_info,
                                                struct btrfs_device *srcdev,
                                                struct btrfs_device *tgtdev);
-static u64 __btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info);
 static int btrfs_dev_replace_kthread(void *data);
 static int btrfs_dev_replace_continue_on_mount(struct btrfs_fs_info *fs_info);
 
@@ -174,8 +173,14 @@ no_valid_dev_replace_entry_found:
                        }
                        set_bit(BTRFS_DEV_STATE_REPLACE_TGT,
                                &dev_replace->tgtdev->dev_state);
-                       btrfs_init_dev_replace_tgtdev_for_resume(fs_info,
-                               dev_replace->tgtdev);
+
+                       WARN_ON(fs_info->fs_devices->rw_devices == 0);
+                       dev_replace->tgtdev->io_width = fs_info->sectorsize;
+                       dev_replace->tgtdev->io_align = fs_info->sectorsize;
+                       dev_replace->tgtdev->sector_size = fs_info->sectorsize;
+                       dev_replace->tgtdev->fs_info = fs_info;
+                       set_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
+                               &dev_replace->tgtdev->dev_state);
                }
                break;
        }
@@ -200,13 +205,13 @@ int btrfs_run_dev_replace(struct btrfs_trans_handle *trans,
        struct btrfs_dev_replace_item *ptr;
        struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
 
-       btrfs_dev_replace_lock(dev_replace, 0);
+       btrfs_dev_replace_read_lock(dev_replace);
        if (!dev_replace->is_valid ||
            !dev_replace->item_needs_writeback) {
-               btrfs_dev_replace_unlock(dev_replace, 0);
+               btrfs_dev_replace_read_unlock(dev_replace);
                return 0;
        }
-       btrfs_dev_replace_unlock(dev_replace, 0);
+       btrfs_dev_replace_read_unlock(dev_replace);
 
        key.objectid = 0;
        key.type = BTRFS_DEV_REPLACE_KEY;
@@ -264,7 +269,7 @@ int btrfs_run_dev_replace(struct btrfs_trans_handle *trans,
        ptr = btrfs_item_ptr(eb, path->slots[0],
                             struct btrfs_dev_replace_item);
 
-       btrfs_dev_replace_lock(dev_replace, 1);
+       btrfs_dev_replace_write_lock(dev_replace);
        if (dev_replace->srcdev)
                btrfs_set_dev_replace_src_devid(eb, ptr,
                        dev_replace->srcdev->devid);
@@ -287,7 +292,7 @@ int btrfs_run_dev_replace(struct btrfs_trans_handle *trans,
        btrfs_set_dev_replace_cursor_right(eb, ptr,
                dev_replace->cursor_right);
        dev_replace->item_needs_writeback = 0;
-       btrfs_dev_replace_unlock(dev_replace, 1);
+       btrfs_dev_replace_write_unlock(dev_replace);
 
        btrfs_mark_buffer_dirty(eb);
 
@@ -307,7 +312,7 @@ void btrfs_after_dev_replace_commit(struct btrfs_fs_info *fs_info)
 
 static char* btrfs_dev_name(struct btrfs_device *device)
 {
-       if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
+       if (!device || test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
                return "<missing disk>";
        else
                return rcu_str_deref(device->name);
@@ -352,7 +357,7 @@ int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info,
                return PTR_ERR(trans);
        }
 
-       btrfs_dev_replace_lock(dev_replace, 1);
+       btrfs_dev_replace_write_lock(dev_replace);
        switch (dev_replace->replace_state) {
        case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
        case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
@@ -390,7 +395,7 @@ int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info,
        dev_replace->item_needs_writeback = 1;
        atomic64_set(&dev_replace->num_write_errors, 0);
        atomic64_set(&dev_replace->num_uncorrectable_read_errors, 0);
-       btrfs_dev_replace_unlock(dev_replace, 1);
+       btrfs_dev_replace_write_unlock(dev_replace);
 
        ret = btrfs_sysfs_add_device_link(tgt_device->fs_devices, tgt_device);
        if (ret)
@@ -402,7 +407,7 @@ int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info,
        trans = btrfs_start_transaction(root, 0);
        if (IS_ERR(trans)) {
                ret = PTR_ERR(trans);
-               btrfs_dev_replace_lock(dev_replace, 1);
+               btrfs_dev_replace_write_lock(dev_replace);
                goto leave;
        }
 
@@ -426,7 +431,7 @@ int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info,
 leave:
        dev_replace->srcdev = NULL;
        dev_replace->tgtdev = NULL;
-       btrfs_dev_replace_unlock(dev_replace, 1);
+       btrfs_dev_replace_write_unlock(dev_replace);
        btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device);
        return ret;
 }
@@ -493,18 +498,18 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
        /* don't allow cancel or unmount to disturb the finishing procedure */
        mutex_lock(&dev_replace->lock_finishing_cancel_unmount);
 
-       btrfs_dev_replace_lock(dev_replace, 0);
+       btrfs_dev_replace_read_lock(dev_replace);
        /* was the operation canceled, or is it finished? */
        if (dev_replace->replace_state !=
            BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED) {
-               btrfs_dev_replace_unlock(dev_replace, 0);
+               btrfs_dev_replace_read_unlock(dev_replace);
                mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
                return 0;
        }
 
        tgt_device = dev_replace->tgtdev;
        src_device = dev_replace->srcdev;
-       btrfs_dev_replace_unlock(dev_replace, 0);
+       btrfs_dev_replace_read_unlock(dev_replace);
 
        /*
         * flush all outstanding I/O and inode extent mappings before the
@@ -529,7 +534,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
        /* keep away write_all_supers() during the finishing procedure */
        mutex_lock(&fs_info->fs_devices->device_list_mutex);
        mutex_lock(&fs_info->chunk_mutex);
-       btrfs_dev_replace_lock(dev_replace, 1);
+       btrfs_dev_replace_write_lock(dev_replace);
        dev_replace->replace_state =
                scrub_ret ? BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED
                          : BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED;
@@ -549,7 +554,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
                                 btrfs_dev_name(src_device),
                                 src_device->devid,
                                 rcu_str_deref(tgt_device->name), scrub_ret);
-               btrfs_dev_replace_unlock(dev_replace, 1);
+               btrfs_dev_replace_write_unlock(dev_replace);
                mutex_unlock(&fs_info->chunk_mutex);
                mutex_unlock(&fs_info->fs_devices->device_list_mutex);
                mutex_unlock(&uuid_mutex);
@@ -586,7 +591,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
        list_add(&tgt_device->dev_alloc_list, &fs_info->fs_devices->alloc_list);
        fs_info->fs_devices->rw_devices++;
 
-       btrfs_dev_replace_unlock(dev_replace, 1);
+       btrfs_dev_replace_write_unlock(dev_replace);
 
        btrfs_rm_dev_replace_blocked(fs_info);
 
@@ -679,7 +684,7 @@ void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info,
 {
        struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
 
-       btrfs_dev_replace_lock(dev_replace, 0);
+       btrfs_dev_replace_read_lock(dev_replace);
        /* even if !dev_replace_is_valid, the values are good enough for
         * the replace_status ioctl */
        args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR;
@@ -691,41 +696,36 @@ void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info,
        args->status.num_uncorrectable_read_errors =
                atomic64_read(&dev_replace->num_uncorrectable_read_errors);
        args->status.progress_1000 = btrfs_dev_replace_progress(fs_info);
-       btrfs_dev_replace_unlock(dev_replace, 0);
+       btrfs_dev_replace_read_unlock(dev_replace);
 }
 
-int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info,
-                            struct btrfs_ioctl_dev_replace_args *args)
-{
-       args->result = __btrfs_dev_replace_cancel(fs_info);
-       return 0;
-}
-
-static u64 __btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info)
+int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info)
 {
        struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
        struct btrfs_device *tgt_device = NULL;
+       struct btrfs_device *src_device = NULL;
        struct btrfs_trans_handle *trans;
        struct btrfs_root *root = fs_info->tree_root;
-       u64 result;
+       int result;
        int ret;
 
        if (sb_rdonly(fs_info->sb))
                return -EROFS;
 
        mutex_lock(&dev_replace->lock_finishing_cancel_unmount);
-       btrfs_dev_replace_lock(dev_replace, 1);
+       btrfs_dev_replace_write_lock(dev_replace);
        switch (dev_replace->replace_state) {
        case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
        case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
        case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
                result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NOT_STARTED;
-               btrfs_dev_replace_unlock(dev_replace, 1);
+               btrfs_dev_replace_write_unlock(dev_replace);
                goto leave;
        case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
        case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
                result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR;
                tgt_device = dev_replace->tgtdev;
+               src_device = dev_replace->srcdev;
                dev_replace->tgtdev = NULL;
                dev_replace->srcdev = NULL;
                break;
@@ -733,7 +733,7 @@ static u64 __btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info)
        dev_replace->replace_state = BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED;
        dev_replace->time_stopped = get_seconds();
        dev_replace->item_needs_writeback = 1;
-       btrfs_dev_replace_unlock(dev_replace, 1);
+       btrfs_dev_replace_write_unlock(dev_replace);
        btrfs_scrub_cancel(fs_info);
 
        trans = btrfs_start_transaction(root, 0);
@@ -743,6 +743,12 @@ static u64 __btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info)
        }
        ret = btrfs_commit_transaction(trans);
        WARN_ON(ret);
+
+       btrfs_info_in_rcu(fs_info,
+               "dev_replace from %s (devid %llu) to %s canceled",
+               btrfs_dev_name(src_device), src_device->devid,
+               btrfs_dev_name(tgt_device));
+
        if (tgt_device)
                btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device);
 
@@ -756,7 +762,7 @@ void btrfs_dev_replace_suspend_for_unmount(struct btrfs_fs_info *fs_info)
        struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
 
        mutex_lock(&dev_replace->lock_finishing_cancel_unmount);
-       btrfs_dev_replace_lock(dev_replace, 1);
+       btrfs_dev_replace_write_lock(dev_replace);
        switch (dev_replace->replace_state) {
        case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
        case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
@@ -772,7 +778,7 @@ void btrfs_dev_replace_suspend_for_unmount(struct btrfs_fs_info *fs_info)
                break;
        }
 
-       btrfs_dev_replace_unlock(dev_replace, 1);
+       btrfs_dev_replace_write_unlock(dev_replace);
        mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
 }
 
@@ -782,12 +788,12 @@ int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info)
        struct task_struct *task;
        struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
 
-       btrfs_dev_replace_lock(dev_replace, 1);
+       btrfs_dev_replace_write_lock(dev_replace);
        switch (dev_replace->replace_state) {
        case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
        case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
        case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
-               btrfs_dev_replace_unlock(dev_replace, 1);
+               btrfs_dev_replace_write_unlock(dev_replace);
                return 0;
        case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
                break;
@@ -801,10 +807,10 @@ int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info)
                           "cannot continue dev_replace, tgtdev is missing");
                btrfs_info(fs_info,
                           "you may cancel the operation after 'mount -o degraded'");
-               btrfs_dev_replace_unlock(dev_replace, 1);
+               btrfs_dev_replace_write_unlock(dev_replace);
                return 0;
        }
-       btrfs_dev_replace_unlock(dev_replace, 1);
+       btrfs_dev_replace_write_unlock(dev_replace);
 
        WARN_ON(test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags));
        task = kthread_run(btrfs_dev_replace_kthread, fs_info, "btrfs-devrepl");
@@ -873,37 +879,37 @@ int btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace)
        return 1;
 }
 
-void btrfs_dev_replace_lock(struct btrfs_dev_replace *dev_replace, int rw)
+void btrfs_dev_replace_read_lock(struct btrfs_dev_replace *dev_replace)
 {
-       if (rw == 1) {
-               /* write */
-again:
-               wait_event(dev_replace->read_lock_wq,
-                          atomic_read(&dev_replace->blocking_readers) == 0);
-               write_lock(&dev_replace->lock);
-               if (atomic_read(&dev_replace->blocking_readers)) {
-                       write_unlock(&dev_replace->lock);
-                       goto again;
-               }
-       } else {
-               read_lock(&dev_replace->lock);
-               atomic_inc(&dev_replace->read_locks);
-       }
+       read_lock(&dev_replace->lock);
+       atomic_inc(&dev_replace->read_locks);
+}
+
+void btrfs_dev_replace_read_unlock(struct btrfs_dev_replace *dev_replace)
+{
+       ASSERT(atomic_read(&dev_replace->read_locks) > 0);
+       atomic_dec(&dev_replace->read_locks);
+       read_unlock(&dev_replace->lock);
 }
 
-void btrfs_dev_replace_unlock(struct btrfs_dev_replace *dev_replace, int rw)
+void btrfs_dev_replace_write_lock(struct btrfs_dev_replace *dev_replace)
 {
-       if (rw == 1) {
-               /* write */
-               ASSERT(atomic_read(&dev_replace->blocking_readers) == 0);
+again:
+       wait_event(dev_replace->read_lock_wq,
+                  atomic_read(&dev_replace->blocking_readers) == 0);
+       write_lock(&dev_replace->lock);
+       if (atomic_read(&dev_replace->blocking_readers)) {
                write_unlock(&dev_replace->lock);
-       } else {
-               ASSERT(atomic_read(&dev_replace->read_locks) > 0);
-               atomic_dec(&dev_replace->read_locks);
-               read_unlock(&dev_replace->lock);
+               goto again;
        }
 }
 
+void btrfs_dev_replace_write_unlock(struct btrfs_dev_replace *dev_replace)
+{
+       ASSERT(atomic_read(&dev_replace->blocking_readers) == 0);
+       write_unlock(&dev_replace->lock);
+}
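
The split keeps the old semantics one for one: the read side is a plain rwlock read lock plus bookkeeping, while the write side must loop because a reader can register itself as a blocking reader (see set_lock_blocking below) after dropping the rwlock. What changes is that callers now name their intent explicitly:

        u64 state;

        /* read side: cheap snapshot of the replace state */
        btrfs_dev_replace_read_lock(dev_replace);
        state = dev_replace->replace_state;
        btrfs_dev_replace_read_unlock(dev_replace);

        /* write side: exclusive, waits out all blocking readers first */
        btrfs_dev_replace_write_lock(dev_replace);
        dev_replace->item_needs_writeback = 1;
        btrfs_dev_replace_write_unlock(dev_replace);
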
+
 /* inc blocking cnt and release read lock */
 void btrfs_dev_replace_set_lock_blocking(
                                        struct btrfs_dev_replace *dev_replace)
index f94a76844ae7425a3702f36013b4b415a02457c2..8566a02ef222748e9e69a09a3ef8f1bc38f89b3f 100644 (file)
@@ -32,13 +32,14 @@ int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info,
                int read_src);
 void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info,
                              struct btrfs_ioctl_dev_replace_args *args);
-int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info,
-                            struct btrfs_ioctl_dev_replace_args *args);
+int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info);
 void btrfs_dev_replace_suspend_for_unmount(struct btrfs_fs_info *fs_info);
 int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info);
 int btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace);
-void btrfs_dev_replace_lock(struct btrfs_dev_replace *dev_replace, int rw);
-void btrfs_dev_replace_unlock(struct btrfs_dev_replace *dev_replace, int rw);
+void btrfs_dev_replace_read_lock(struct btrfs_dev_replace *dev_replace);
+void btrfs_dev_replace_read_unlock(struct btrfs_dev_replace *dev_replace);
+void btrfs_dev_replace_write_lock(struct btrfs_dev_replace *dev_replace);
+void btrfs_dev_replace_write_unlock(struct btrfs_dev_replace *dev_replace);
 void btrfs_dev_replace_set_lock_blocking(struct btrfs_dev_replace *dev_replace);
 void btrfs_dev_replace_clear_lock_blocking(
                                        struct btrfs_dev_replace *dev_replace);
index cbe421605cd5ab527fe6c33aa837fe0cb7a5995a..29e967b2c6672ff4d068af6485c60e89a29c3804 100644 (file)
@@ -18,7 +18,6 @@
 
 #include "ctree.h"
 #include "disk-io.h"
-#include "hash.h"
 #include "transaction.h"
 
 /*
index 21f34ad0d41129d1829105c08ccbc6c5c74ddac8..07b5e6f7df6711dbcc41419e97f6797d2a4f24d1 100644 (file)
 #include <linux/uuid.h>
 #include <linux/semaphore.h>
 #include <linux/error-injection.h>
+#include <linux/crc32c.h>
 #include <asm/unaligned.h>
 #include "ctree.h"
 #include "disk-io.h"
-#include "hash.h"
 #include "transaction.h"
 #include "btrfs_inode.h"
 #include "volumes.h"
@@ -110,7 +110,7 @@ int __init btrfs_end_io_wq_init(void)
        return 0;
 }
 
-void btrfs_end_io_wq_exit(void)
+void __cold btrfs_end_io_wq_exit(void)
 {
        kmem_cache_destroy(btrfs_end_io_wq_cache);
 }
@@ -124,8 +124,8 @@ struct async_submit_bio {
        void *private_data;
        struct btrfs_fs_info *fs_info;
        struct bio *bio;
-       extent_submit_bio_hook_t *submit_bio_start;
-       extent_submit_bio_hook_t *submit_bio_done;
+       extent_submit_bio_start_t *submit_bio_start;
+       extent_submit_bio_done_t *submit_bio_done;
        int mirror_num;
        unsigned long bio_flags;
        /*
@@ -270,7 +270,7 @@ out:
 
 u32 btrfs_csum_data(const char *data, u32 seed, size_t len)
 {
-       return btrfs_crc32c(seed, data, len);
+       return crc32c(seed, data, len);
 }
 
 void btrfs_csum_final(u32 crc, u8 *result)
@@ -403,8 +403,7 @@ static int btrfs_check_super_csum(struct btrfs_fs_info *fs_info,
 
        if (csum_type == BTRFS_CSUM_TYPE_CRC32) {
                u32 crc = ~(u32)0;
-               const int csum_size = sizeof(crc);
-               char result[csum_size];
+               char result[sizeof(crc)];
 
                /*
                 * The super_block structure does not span the whole
@@ -415,7 +414,7 @@ static int btrfs_check_super_csum(struct btrfs_fs_info *fs_info,
                                crc, BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE);
                btrfs_csum_final(crc, result);
 
-               if (memcmp(raw_disk_sb, result, csum_size))
+               if (memcmp(raw_disk_sb, result, sizeof(result)))
                        ret = 1;
        }
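
This is the VLA removal mentioned in the changelog: an array dimensioned by a const-qualified variable is still formally a variable-length array in C, since const does not create an integer constant expression, so -Wvla warned on the old code. Sizing the buffer with sizeof directly keeps it a fixed-size array and lets the memcmp length track it automatically:

        u32 crc = ~(u32)0;
        char result[sizeof(crc)];       /* sizeof is a constant expression */

        /* ... checksum computed into result ... */
        if (memcmp(raw_disk_sb, result, sizeof(result)))  /* size can't drift */
                ret = 1;
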
 
@@ -428,13 +427,59 @@ static int btrfs_check_super_csum(struct btrfs_fs_info *fs_info,
        return ret;
 }
 
+static int verify_level_key(struct btrfs_fs_info *fs_info,
+                           struct extent_buffer *eb, int level,
+                           struct btrfs_key *first_key)
+{
+       int found_level;
+       struct btrfs_key found_key;
+       int ret;
+
+       found_level = btrfs_header_level(eb);
+       if (found_level != level) {
+#ifdef CONFIG_BTRFS_DEBUG
+               WARN_ON(1);
+               btrfs_err(fs_info,
+"tree level mismatch detected, bytenr=%llu level expected=%u has=%u",
+                         eb->start, level, found_level);
+#endif
+               return -EIO;
+       }
+
+       if (!first_key)
+               return 0;
+
+       if (found_level)
+               btrfs_node_key_to_cpu(eb, &found_key, 0);
+       else
+               btrfs_item_key_to_cpu(eb, &found_key, 0);
+       ret = btrfs_comp_cpu_keys(first_key, &found_key);
+
+#ifdef CONFIG_BTRFS_DEBUG
+       if (ret) {
+               WARN_ON(1);
+               btrfs_err(fs_info,
+"tree first key mismatch detected, bytenr=%llu key expected=(%llu, %u, %llu) has=(%llu, %u, %llu)",
+                         eb->start, first_key->objectid, first_key->type,
+                         first_key->offset, found_key.objectid,
+                         found_key.type, found_key.offset);
+       }
+#endif
+       return ret;
+}
+
 /*
  * helper to read a given tree block, doing retries as required when
  * the checksums don't match and we have alternate mirrors to try.
+ *
+ * @parent_transid:    expected transid, skip check if 0
+ * @level:             expected level, mandatory check
+ * @first_key:         expected key of first slot, skip check if NULL
  */
 static int btree_read_extent_buffer_pages(struct btrfs_fs_info *fs_info,
                                          struct extent_buffer *eb,
-                                         u64 parent_transid)
+                                         u64 parent_transid, int level,
+                                         struct btrfs_key *first_key)
 {
        struct extent_io_tree *io_tree;
        int failed = 0;
@@ -449,11 +494,14 @@ static int btree_read_extent_buffer_pages(struct btrfs_fs_info *fs_info,
                ret = read_extent_buffer_pages(io_tree, eb, WAIT_COMPLETE,
                                               mirror_num);
                if (!ret) {
-                       if (!verify_parent_transid(io_tree, eb,
+                       if (verify_parent_transid(io_tree, eb,
                                                   parent_transid, 0))
-                               break;
-                       else
                                ret = -EIO;
+                       else if (verify_level_key(fs_info, eb, level,
+                                                 first_key))
+                               ret = -EUCLEAN;
+                       else
+                               break;
                }
 
                /*
@@ -461,7 +509,8 @@ static int btree_read_extent_buffer_pages(struct btrfs_fs_info *fs_info,
                 * there is no reason to read the other copies, they won't be
                 * any less wrong.
                 */
-               if (test_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags))
+               if (test_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags) ||
+                   ret == -EUCLEAN)
                        break;
 
                num_copies = btrfs_num_copies(fs_info,
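
The two error codes now drive different retry policies: -EIO from a transid mismatch may only mean this mirror holds a stale copy, so the loop moves on to the next one, while -EUCLEAN from a level or first-key mismatch is structural corruption that rereading another copy will not improve, so the loop bails out at once. Condensed from the loop above:

        if (test_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags) || ret == -EUCLEAN)
                break;                  /* corruption: stop retrying */
        /* -EIO: fall through and try the next mirror, if any */
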
@@ -602,12 +651,12 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
         * that we don't try and read the other copies of this block, just
         * return -EIO.
         */
-       if (found_level == 0 && btrfs_check_leaf_full(root, eb)) {
+       if (found_level == 0 && btrfs_check_leaf_full(fs_info, eb)) {
                set_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags);
                ret = -EIO;
        }
 
-       if (found_level > 0 && btrfs_check_node(root, eb))
+       if (found_level > 0 && btrfs_check_node(fs_info, eb))
                ret = -EIO;
 
        if (!ret)
@@ -710,14 +759,6 @@ blk_status_t btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
        return 0;
 }
 
-unsigned long btrfs_async_submit_limit(struct btrfs_fs_info *info)
-{
-       unsigned long limit = min_t(unsigned long,
-                                   info->thread_pool_size,
-                                   info->fs_devices->open_devices);
-       return 256 * limit;
-}
-
 static void run_one_async_start(struct btrfs_work *work)
 {
        struct async_submit_bio *async;
@@ -725,7 +766,6 @@ static void run_one_async_start(struct btrfs_work *work)
 
        async = container_of(work, struct  async_submit_bio, work);
        ret = async->submit_bio_start(async->private_data, async->bio,
-                                     async->mirror_num, async->bio_flags,
                                      async->bio_offset);
        if (ret)
                async->status = ret;
@@ -744,8 +784,7 @@ static void run_one_async_done(struct btrfs_work *work)
                return;
        }
 
-       async->submit_bio_done(async->private_data, async->bio, async->mirror_num,
-                              async->bio_flags, async->bio_offset);
+       async->submit_bio_done(async->private_data, async->bio, async->mirror_num);
 }
 
 static void run_one_async_free(struct btrfs_work *work)
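
Splitting the single extent_submit_bio_hook_t into start and done variants means each callback declares only the arguments it actually consumes, and a mismatched hook now fails to compile. Reconstructed from the call sites in these hunks (the authoritative typedefs live in the extent I/O header), they are approximately:

typedef blk_status_t extent_submit_bio_start_t(void *private_data,
                struct bio *bio, u64 bio_offset);
typedef blk_status_t extent_submit_bio_done_t(void *private_data,
                struct bio *bio, int mirror_num);
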
@@ -759,8 +798,8 @@ static void run_one_async_free(struct btrfs_work *work)
 blk_status_t btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
                                 int mirror_num, unsigned long bio_flags,
                                 u64 bio_offset, void *private_data,
-                                extent_submit_bio_hook_t *submit_bio_start,
-                                extent_submit_bio_hook_t *submit_bio_done)
+                                extent_submit_bio_start_t *submit_bio_start,
+                                extent_submit_bio_done_t *submit_bio_done)
 {
        struct async_submit_bio *async;
 
@@ -807,8 +846,7 @@ static blk_status_t btree_csum_one_bio(struct bio *bio)
        return errno_to_blk_status(ret);
 }
 
-static blk_status_t __btree_submit_bio_start(void *private_data, struct bio *bio,
-                                            int mirror_num, unsigned long bio_flags,
+static blk_status_t btree_submit_bio_start(void *private_data, struct bio *bio,
                                             u64 bio_offset)
 {
        /*
@@ -818,9 +856,8 @@ static blk_status_t __btree_submit_bio_start(void *private_data, struct bio *bio
        return btree_csum_one_bio(bio);
 }
 
-static blk_status_t __btree_submit_bio_done(void *private_data, struct bio *bio,
-                                           int mirror_num, unsigned long bio_flags,
-                                           u64 bio_offset)
+static blk_status_t btree_submit_bio_done(void *private_data, struct bio *bio,
+                                           int mirror_num)
 {
        struct inode *inode = private_data;
        blk_status_t ret;
@@ -879,8 +916,8 @@ static blk_status_t btree_submit_bio_hook(void *private_data, struct bio *bio,
                 */
                ret = btrfs_wq_submit_bio(fs_info, bio, mirror_num, 0,
                                          bio_offset, private_data,
-                                         __btree_submit_bio_start,
-                                         __btree_submit_bio_done);
+                                         btree_submit_bio_start,
+                                         btree_submit_bio_done);
        }
 
        if (ret)
@@ -1062,8 +1099,17 @@ void btrfs_wait_tree_block_writeback(struct extent_buffer *buf)
                                buf->start, buf->start + buf->len - 1);
 }
 
+/*
+ * Read tree block at logical address @bytenr and run basic but critical
+ * verification on it.
+ *
+ * @parent_transid:    expected transid of this tree block, skip check if 0
+ * @level:             expected level, mandatory check
+ * @first_key:         expected key in slot 0, skip check if NULL
+ */
 struct extent_buffer *read_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr,
-                                     u64 parent_transid)
+                                     u64 parent_transid, int level,
+                                     struct btrfs_key *first_key)
 {
        struct extent_buffer *buf = NULL;
        int ret;
@@ -1072,7 +1118,8 @@ struct extent_buffer *read_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr,
        if (IS_ERR(buf))
                return buf;
 
-       ret = btree_read_extent_buffer_pages(fs_info, buf, parent_transid);
+       ret = btree_read_extent_buffer_pages(fs_info, buf, parent_transid,
+                                            level, first_key);
        if (ret) {
                free_extent_buffer(buf);
                return ERR_PTR(ret);
@@ -1108,7 +1155,7 @@ static struct btrfs_subvolume_writers *btrfs_alloc_subvolume_writers(void)
        if (!writers)
                return ERR_PTR(-ENOMEM);
 
-       ret = percpu_counter_init(&writers->counter, 0, GFP_KERNEL);
+       ret = percpu_counter_init(&writers->counter, 0, GFP_NOFS);
        if (ret < 0) {
                kfree(writers);
                return ERR_PTR(ret);
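
GFP_KERNEL permits the allocator to recurse into filesystem reclaim, which is what lockdep complained about here: the allocation can happen while holding btrfs locks that the reclaim path may also want, and GFP_NOFS forbids that recursion. A scoped alternative available since v4.12, shown only as a hedged sketch rather than what this patch does, is to keep GFP_KERNEL and mark the region NOFS:

        unsigned int nofs_flag;

        nofs_flag = memalloc_nofs_save();  /* allocations below act as NOFS */
        ret = percpu_counter_init(&writers->counter, 0, GFP_KERNEL);
        memalloc_nofs_restore(nofs_flag);
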
@@ -1160,6 +1207,7 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info,
        spin_lock_init(&root->accounting_lock);
        spin_lock_init(&root->log_extents_lock[0]);
        spin_lock_init(&root->log_extents_lock[1]);
+       spin_lock_init(&root->qgroup_meta_rsv_lock);
        mutex_init(&root->objectid_mutex);
        mutex_init(&root->log_mutex);
        mutex_init(&root->ordered_extent_mutex);
@@ -1176,7 +1224,6 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info,
        atomic_set(&root->orphan_inodes, 0);
        refcount_set(&root->refs, 1);
        atomic_set(&root->will_be_snapshotted, 0);
-       atomic64_set(&root->qgroup_meta_rsv, 0);
        root->log_transid = 0;
        root->log_transid_committed = -1;
        root->last_log_commit = 0;
@@ -1401,6 +1448,7 @@ static struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root,
        struct btrfs_path *path;
        u64 generation;
        int ret;
+       int level;
 
        path = btrfs_alloc_path();
        if (!path)
@@ -1423,9 +1471,10 @@ static struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root,
        }
 
        generation = btrfs_root_generation(&root->root_item);
+       level = btrfs_root_level(&root->root_item);
        root->node = read_tree_block(fs_info,
                                     btrfs_root_bytenr(&root->root_item),
-                                    generation);
+                                    generation, level, NULL);
        if (IS_ERR(root->node)) {
                ret = PTR_ERR(root->node);
                goto find_fail;
@@ -1808,12 +1857,10 @@ sleep:
                if (unlikely(test_bit(BTRFS_FS_STATE_ERROR,
                                      &fs_info->fs_state)))
                        btrfs_cleanup_transaction(fs_info);
-               set_current_state(TASK_INTERRUPTIBLE);
                if (!kthread_should_stop() &&
                                (!btrfs_transaction_blocked(fs_info) ||
                                 cannot_commit))
-                       schedule_timeout(delay);
-               __set_current_state(TASK_RUNNING);
+                       schedule_timeout_interruptible(delay);
        } while (!kthread_should_stop());
        return 0;
 }
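
schedule_timeout_interruptible() is not merely shorter: the task state is now set only on the branch that actually sleeps, so the kthread no longer needs the trailing reset to TASK_RUNNING when it skips the sleep. Its kernel definition is essentially the open-coded pair it replaces:

signed long schedule_timeout_interruptible(signed long timeout)
{
        __set_current_state(TASK_INTERRUPTIBLE);
        return schedule_timeout(timeout);
}
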
@@ -2183,7 +2230,7 @@ static void btrfs_init_qgroup(struct btrfs_fs_info *fs_info)
 static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info,
                struct btrfs_fs_devices *fs_devices)
 {
-       int max_active = fs_info->thread_pool_size;
+       u32 max_active = fs_info->thread_pool_size;
        unsigned int flags = WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_UNBOUND;
 
        fs_info->workers =
@@ -2276,6 +2323,7 @@ static int btrfs_replay_log(struct btrfs_fs_info *fs_info,
        struct btrfs_root *log_tree_root;
        struct btrfs_super_block *disk_super = fs_info->super_copy;
        u64 bytenr = btrfs_super_log_root(disk_super);
+       int level = btrfs_super_log_root_level(disk_super);
 
        if (fs_devices->rw_devices == 0) {
                btrfs_warn(fs_info, "log replay required on RO media");
@@ -2289,7 +2337,8 @@ static int btrfs_replay_log(struct btrfs_fs_info *fs_info,
        __setup_root(log_tree_root, fs_info, BTRFS_TREE_LOG_OBJECTID);
 
        log_tree_root->node = read_tree_block(fs_info, bytenr,
-                                             fs_info->generation + 1);
+                                             fs_info->generation + 1,
+                                             level, NULL);
        if (IS_ERR(log_tree_root->node)) {
                btrfs_warn(fs_info, "failed to read log tree");
                ret = PTR_ERR(log_tree_root->node);
@@ -2334,23 +2383,29 @@ static int btrfs_read_roots(struct btrfs_fs_info *fs_info)
        location.offset = 0;
 
        root = btrfs_read_tree_root(tree_root, &location);
-       if (IS_ERR(root))
-               return PTR_ERR(root);
+       if (IS_ERR(root)) {
+               ret = PTR_ERR(root);
+               goto out;
+       }
        set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
        fs_info->extent_root = root;
 
        location.objectid = BTRFS_DEV_TREE_OBJECTID;
        root = btrfs_read_tree_root(tree_root, &location);
-       if (IS_ERR(root))
-               return PTR_ERR(root);
+       if (IS_ERR(root)) {
+               ret = PTR_ERR(root);
+               goto out;
+       }
        set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
        fs_info->dev_root = root;
        btrfs_init_devices_late(fs_info);
 
        location.objectid = BTRFS_CSUM_TREE_OBJECTID;
        root = btrfs_read_tree_root(tree_root, &location);
-       if (IS_ERR(root))
-               return PTR_ERR(root);
+       if (IS_ERR(root)) {
+               ret = PTR_ERR(root);
+               goto out;
+       }
        set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
        fs_info->csum_root = root;
 
@@ -2367,7 +2422,7 @@ static int btrfs_read_roots(struct btrfs_fs_info *fs_info)
        if (IS_ERR(root)) {
                ret = PTR_ERR(root);
                if (ret != -ENOENT)
-                       return ret;
+                       goto out;
        } else {
                set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
                fs_info->uuid_root = root;
@@ -2376,13 +2431,19 @@ static int btrfs_read_roots(struct btrfs_fs_info *fs_info)
        if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) {
                location.objectid = BTRFS_FREE_SPACE_TREE_OBJECTID;
                root = btrfs_read_tree_root(tree_root, &location);
-               if (IS_ERR(root))
-                       return PTR_ERR(root);
+               if (IS_ERR(root)) {
+                       ret = PTR_ERR(root);
+                       goto out;
+               }
                set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
                fs_info->free_space_root = root;
        }
 
        return 0;
+out:
+       btrfs_warn(fs_info, "failed to read root (objectid=%llu): %d",
+                  location.objectid, ret);
+       return ret;
 }
 
 int open_ctree(struct super_block *sb,
@@ -2404,8 +2465,8 @@ int open_ctree(struct super_block *sb,
        int err = -EINVAL;
        int num_backups_tried = 0;
        int backup_index = 0;
-       int max_active;
        int clear_free_space_tree = 0;
+       int level;
 
        tree_root = fs_info->tree_root = btrfs_alloc_root(fs_info, GFP_KERNEL);
        chunk_root = fs_info->chunk_root = btrfs_alloc_root(fs_info, GFP_KERNEL);
@@ -2447,6 +2508,8 @@ int open_ctree(struct super_block *sb,
        INIT_LIST_HEAD(&fs_info->delayed_iputs);
        INIT_LIST_HEAD(&fs_info->delalloc_roots);
        INIT_LIST_HEAD(&fs_info->caching_block_groups);
+       INIT_LIST_HEAD(&fs_info->pending_raid_kobjs);
+       spin_lock_init(&fs_info->pending_raid_kobjs_lock);
        spin_lock_init(&fs_info->delalloc_root_lock);
        spin_lock_init(&fs_info->trans_lock);
        spin_lock_init(&fs_info->fs_roots_radix_lock);
@@ -2713,8 +2776,6 @@ int open_ctree(struct super_block *sb,
                goto fail_alloc;
        }
 
-       max_active = fs_info->thread_pool_size;
-
        ret = btrfs_init_workqueues(fs_info, fs_devices);
        if (ret) {
                err = ret;
@@ -2741,12 +2802,13 @@ int open_ctree(struct super_block *sb,
        }
 
        generation = btrfs_super_chunk_root_generation(disk_super);
+       level = btrfs_super_chunk_root_level(disk_super);
 
        __setup_root(chunk_root, fs_info, BTRFS_CHUNK_TREE_OBJECTID);
 
        chunk_root->node = read_tree_block(fs_info,
                                           btrfs_super_chunk_root(disk_super),
-                                          generation);
+                                          generation, level, NULL);
        if (IS_ERR(chunk_root->node) ||
            !extent_buffer_uptodate(chunk_root->node)) {
                btrfs_err(fs_info, "failed to read chunk root");
@@ -2768,10 +2830,10 @@ int open_ctree(struct super_block *sb,
        }
 
        /*
-        * keep the device that is marked to be the target device for the
-        * dev_replace procedure
+        * Keep the device that is marked to be the target device for the
+        * device replace procedure
         */
-       btrfs_close_extra_devices(fs_devices, 0);
+       btrfs_free_extra_devids(fs_devices, 0);
 
        if (!fs_devices->latest_bdev) {
                btrfs_err(fs_info, "failed to read devices");
@@ -2780,10 +2842,11 @@ int open_ctree(struct super_block *sb,
 
 retry_root_backup:
        generation = btrfs_super_generation(disk_super);
+       level = btrfs_super_root_level(disk_super);
 
        tree_root->node = read_tree_block(fs_info,
                                          btrfs_super_root(disk_super),
-                                         generation);
+                                         generation, level, NULL);
        if (IS_ERR(tree_root->node) ||
            !extent_buffer_uptodate(tree_root->node)) {
                btrfs_warn(fs_info, "failed to read tree root");
@@ -2834,7 +2897,7 @@ retry_root_backup:
                goto fail_block_groups;
        }
 
-       btrfs_close_extra_devices(fs_devices, 1);
+       btrfs_free_extra_devids(fs_devices, 1);
 
        ret = btrfs_sysfs_add_fsid(fs_devices, NULL);
        if (ret) {
@@ -2953,6 +3016,7 @@ retry_root_backup:
        fs_info->fs_root = btrfs_read_fs_root_no_name(fs_info, &location);
        if (IS_ERR(fs_info->fs_root)) {
                err = PTR_ERR(fs_info->fs_root);
+               btrfs_warn(fs_info, "failed to read fs tree: %d", err);
                goto fail_qgroup;
        }
 
@@ -3290,6 +3354,7 @@ static int wait_dev_supers(struct btrfs_device *device, int max_mirrors)
        struct buffer_head *bh;
        int i;
        int errors = 0;
+       bool primary_failed = false;
        u64 bytenr;
 
        if (max_mirrors == 0)
@@ -3306,11 +3371,16 @@ static int wait_dev_supers(struct btrfs_device *device, int max_mirrors)
                                      BTRFS_SUPER_INFO_SIZE);
                if (!bh) {
                        errors++;
+                       if (i == 0)
+                               primary_failed = true;
                        continue;
                }
                wait_on_buffer(bh);
-               if (!buffer_uptodate(bh))
+               if (!buffer_uptodate(bh)) {
                        errors++;
+                       if (i == 0)
+                               primary_failed = true;
+               }
 
                /* drop our reference */
                brelse(bh);
@@ -3319,6 +3389,13 @@ static int wait_dev_supers(struct btrfs_device *device, int max_mirrors)
                brelse(bh);
        }
 
+       /* log error, force error return */
+       if (primary_failed) {
+               btrfs_err(device->fs_info, "error writing primary super block to device %llu",
+                         device->devid);
+               return -1;
+       }
+
        return errors < i ? 0 : -1;
 }
 
@@ -3851,7 +3928,7 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
         * So here we should only check item pointers, not item data.
         */
        if (btrfs_header_level(buf) == 0 &&
-           btrfs_check_leaf_relaxed(root, buf)) {
+           btrfs_check_leaf_relaxed(fs_info, buf)) {
                btrfs_print_leaf(buf);
                ASSERT(0);
        }
@@ -3890,12 +3967,14 @@ void btrfs_btree_balance_dirty_nodelay(struct btrfs_fs_info *fs_info)
        __btrfs_btree_balance_dirty(fs_info, 0);
 }
 
-int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid)
+int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid, int level,
+                     struct btrfs_key *first_key)
 {
        struct btrfs_root *root = BTRFS_I(buf->pages[0]->mapping->host)->root;
        struct btrfs_fs_info *fs_info = root->fs_info;
 
-       return btree_read_extent_buffer_pages(fs_info, buf, parent_transid);
+       return btree_read_extent_buffer_pages(fs_info, buf, parent_transid,
+                                             level, first_key);
 }
 
 static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info)
@@ -4314,11 +4393,6 @@ void btrfs_cleanup_dirty_bgs(struct btrfs_transaction *cur_trans,
                cache = list_first_entry(&cur_trans->dirty_bgs,
                                         struct btrfs_block_group_cache,
                                         dirty_list);
-               if (!cache) {
-                       btrfs_err(fs_info, "orphan block group dirty_bgs list");
-                       spin_unlock(&cur_trans->dirty_bgs_lock);
-                       return;
-               }
 
                if (!list_empty(&cache->io_list)) {
                        spin_unlock(&cur_trans->dirty_bgs_lock);
@@ -4338,14 +4412,14 @@ void btrfs_cleanup_dirty_bgs(struct btrfs_transaction *cur_trans,
        }
        spin_unlock(&cur_trans->dirty_bgs_lock);
 
+       /*
+        * Refer to the definition of the io_bgs member for details on why
+        * it is safe to use without any locking
+        */
        while (!list_empty(&cur_trans->io_bgs)) {
                cache = list_first_entry(&cur_trans->io_bgs,
                                         struct btrfs_block_group_cache,
                                         io_list);
-               if (!cache) {
-                       btrfs_err(fs_info, "orphan block group on io_bgs list");
-                       return;
-               }
 
                list_del_init(&cache->io_list);
                spin_lock(&cache->lock);
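
The deleted branches are among the changelog's removed "impossible conditions": list_first_entry() is container_of() pointer arithmetic and can never return NULL, and non-emptiness is exactly what the enclosing list_empty() test already guarantees. The surviving shape of both loops:

        while (!list_empty(&cur_trans->io_bgs)) {
                /* non-empty was just checked, so the first entry is valid */
                cache = list_first_entry(&cur_trans->io_bgs,
                                struct btrfs_block_group_cache, io_list);
                list_del_init(&cache->io_list);
                /* ... release reservations, put the block group ... */
        }
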
index 301151a50ac179e52d3bdf0073832046bcf0454b..453ea9f5d4e9d4f9626f4a09b0d3464a83d695e2 100644 (file)
@@ -52,8 +52,9 @@ static inline u64 btrfs_sb_offset(int mirror)
 struct btrfs_device;
 struct btrfs_fs_devices;
 
-struct extent_buffer *read_tree_block(struct btrfs_fs_info *fs_info,
-                                     u64 bytenr, u64 parent_transid);
+struct extent_buffer *read_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr,
+                                     u64 parent_transid, int level,
+                                     struct btrfs_key *first_key);
 void readahead_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr);
 int reada_tree_block_flagged(struct btrfs_fs_info *fs_info, u64 bytenr,
                         int mirror_num, struct extent_buffer **eb);
@@ -123,7 +124,8 @@ static inline void btrfs_put_fs_root(struct btrfs_root *root)
 void btrfs_mark_buffer_dirty(struct extent_buffer *buf);
 int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid,
                          int atomic);
-int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid);
+int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid, int level,
+                     struct btrfs_key *first_key);
 u32 btrfs_csum_data(const char *data, u32 seed, size_t len);
 void btrfs_csum_final(u32 crc, u8 *result);
 blk_status_t btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
@@ -131,9 +133,8 @@ blk_status_t btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
 blk_status_t btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
                        int mirror_num, unsigned long bio_flags,
                        u64 bio_offset, void *private_data,
-                       extent_submit_bio_hook_t *submit_bio_start,
-                       extent_submit_bio_hook_t *submit_bio_done);
-unsigned long btrfs_async_submit_limit(struct btrfs_fs_info *info);
+                       extent_submit_bio_start_t *submit_bio_start,
+                       extent_submit_bio_done_t *submit_bio_done);
 int btrfs_write_tree_block(struct extent_buffer *buf);
 void btrfs_wait_tree_block_writeback(struct extent_buffer *buf);
 int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans,
@@ -154,7 +155,7 @@ struct extent_map *btree_get_extent(struct btrfs_inode *inode,
                int create);
 int btrfs_get_num_tolerated_disk_barrier_failures(u64 flags);
 int __init btrfs_end_io_wq_init(void);
-void btrfs_end_io_wq_exit(void);
+void __cold btrfs_end_io_wq_exit(void);
 
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 void btrfs_init_lockdep(void);
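
The widened prototypes above let callers tell the read path which level and
first key the child block is expected to have, so the block can be validated
against the parent's view before use. A minimal caller sketch (variable names
illustrative; the do_walk_down() hunk later in this diff follows the same
shape):

    struct btrfs_key first_key;
    struct extent_buffer *eb;
    int parent_level = btrfs_header_level(parent);

    btrfs_node_key_to_cpu(parent, &first_key, slot);
    eb = read_tree_block(fs_info, btrfs_node_blockptr(parent, slot),
                         btrfs_node_ptr_generation(parent, slot),
                         parent_level - 1, &first_key);
    if (IS_ERR(eb))
            return PTR_ERR(eb);
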
index e0460d7b562273287b5f6adc2ec1f97d928993b4..e08d0d45af4f3d31513107cbb4deaa7d5c244c24 100644 (file)
@@ -27,7 +27,7 @@
 #include <linux/ratelimit.h>
 #include <linux/percpu_counter.h>
 #include <linux/lockdep.h>
-#include "hash.h"
+#include <linux/crc32c.h>
 #include "tree-log.h"
 #include "disk-io.h"
 #include "print-tree.h"
@@ -535,13 +535,11 @@ static noinline void caching_thread(struct btrfs_work *work)
        struct btrfs_block_group_cache *block_group;
        struct btrfs_fs_info *fs_info;
        struct btrfs_caching_control *caching_ctl;
-       struct btrfs_root *extent_root;
        int ret;
 
        caching_ctl = container_of(work, struct btrfs_caching_control, work);
        block_group = caching_ctl->block_group;
        fs_info = block_group->fs_info;
-       extent_root = fs_info->extent_root;
 
        mutex_lock(&caching_ctl->mutex);
        down_read(&fs_info->commit_root_sem);
@@ -1203,11 +1201,11 @@ static u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset)
        __le64 lenum;
 
        lenum = cpu_to_le64(root_objectid);
-       high_crc = btrfs_crc32c(high_crc, &lenum, sizeof(lenum));
+       high_crc = crc32c(high_crc, &lenum, sizeof(lenum));
        lenum = cpu_to_le64(owner);
-       low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum));
+       low_crc = crc32c(low_crc, &lenum, sizeof(lenum));
        lenum = cpu_to_le64(offset);
-       low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum));
+       low_crc = crc32c(low_crc, &lenum, sizeof(lenum));
 
        return ((u64)high_crc << 31) ^ (u64)low_crc;
 }
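
With the private wrapper gone, the hash is computed by the kernel's crc32c
library directly; the <linux/crc32c.h> include above is what turns the old
internal wrapper into a plain module dependency on libcrc32c. The conversion
is mechanical, e.g. (buffer and length illustrative):

    #include <linux/crc32c.h>

    /* crc32c(seed, buffer, length) -- the same signature the removed
     * btrfs_crc32c() wrapper forwarded to, so call sites convert 1:1 */
    u32 crc = crc32c(~0u, buf, len);
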
@@ -2652,9 +2650,9 @@ static int cleanup_ref_head(struct btrfs_trans_handle *trans,
  * Returns -ENOMEM or -EIO on failure and will abort the transaction.
  */
 static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
-                                            struct btrfs_fs_info *fs_info,
                                             unsigned long nr)
 {
+       struct btrfs_fs_info *fs_info = trans->fs_info;
        struct btrfs_delayed_ref_root *delayed_refs;
        struct btrfs_delayed_ref_node *ref;
        struct btrfs_delayed_ref_head *locked_ref = NULL;
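
This is a pattern repeated throughout the series: a function that already
receives a transaction handle drops its separate fs_info argument and derives
it from the handle, removing a redundancy every caller had to keep
consistent. The shape of the conversion (function name hypothetical):

    /* before: two parameters that must always agree */
    static int do_work(struct btrfs_trans_handle *trans,
                       struct btrfs_fs_info *fs_info, unsigned long nr);

    /* after: the handle already records which filesystem it belongs to */
    static int do_work(struct btrfs_trans_handle *trans, unsigned long nr)
    {
            struct btrfs_fs_info *fs_info = trans->fs_info;

            /* ... */
            return 0;
    }
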
@@ -2994,7 +2992,7 @@ static void delayed_ref_async_start(struct btrfs_work *work)
        if (trans->transid > async->transid)
                goto end;
 
-       ret = btrfs_run_delayed_refs(trans, fs_info, async->count);
+       ret = btrfs_run_delayed_refs(trans, async->count);
        if (ret)
                async->error = ret;
 end:
@@ -3053,8 +3051,9 @@ int btrfs_async_run_delayed_refs(struct btrfs_fs_info *fs_info,
  * Returns <0 on error and aborts the transaction
  */
 int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
-                          struct btrfs_fs_info *fs_info, unsigned long count)
+                          unsigned long count)
 {
+       struct btrfs_fs_info *fs_info = trans->fs_info;
        struct rb_node *node;
        struct btrfs_delayed_ref_root *delayed_refs;
        struct btrfs_delayed_ref_head *head;
@@ -3078,7 +3077,7 @@ again:
        delayed_refs->run_delayed_start = find_middle(&delayed_refs->root);
 #endif
        trans->can_flush_pending_bgs = false;
-       ret = __btrfs_run_delayed_refs(trans, fs_info, count);
+       ret = __btrfs_run_delayed_refs(trans, count);
        if (ret < 0) {
                btrfs_abort_transaction(trans, ret);
                return ret;
@@ -3086,7 +3085,7 @@ again:
 
        if (run_all) {
                if (!list_empty(&trans->new_bgs))
-                       btrfs_create_pending_block_groups(trans, fs_info);
+                       btrfs_create_pending_block_groups(trans);
 
                spin_lock(&delayed_refs->lock);
                node = rb_first(&delayed_refs->href_root);
@@ -3660,9 +3659,9 @@ int btrfs_setup_space_cache(struct btrfs_trans_handle *trans,
  * the commit latency by getting rid of the easy block groups while
  * we're still allowing others to join the commit.
  */
-int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans,
-                                  struct btrfs_fs_info *fs_info)
+int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans)
 {
+       struct btrfs_fs_info *fs_info = trans->fs_info;
        struct btrfs_block_group_cache *cache;
        struct btrfs_transaction *cur_trans = trans->transaction;
        int ret = 0;
@@ -3686,7 +3685,7 @@ again:
         * make sure all the block groups on our dirty list actually
         * exist
         */
-       btrfs_create_pending_block_groups(trans, fs_info);
+       btrfs_create_pending_block_groups(trans);
 
        if (!path) {
                path = btrfs_alloc_path();
@@ -3741,8 +3740,9 @@ again:
                                should_put = 0;
 
                                /*
-                                * the cache_write_mutex is protecting
-                                * the io_list
+                                * The cache_write_mutex protects the io_list;
+                                * see the definition of
+                                * btrfs_transaction::io_bgs for more details.
                                 */
                                list_add_tail(&cache->io_list, io);
                        } else {
@@ -3800,7 +3800,7 @@ again:
         * go through delayed refs for all the stuff we've just kicked off
         * and then loop back (just once)
         */
-       ret = btrfs_run_delayed_refs(trans, fs_info, 0);
+       ret = btrfs_run_delayed_refs(trans, 0);
        if (!ret && loops == 0) {
                loops++;
                spin_lock(&cur_trans->dirty_bgs_lock);
@@ -3882,7 +3882,7 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
                cache_save_setup(cache, trans, path);
 
                if (!ret)
-                       ret = btrfs_run_delayed_refs(trans, fs_info,
+                       ret = btrfs_run_delayed_refs(trans,
                                                     (unsigned long) -1);
 
                if (!ret && cache->disk_cache_state == BTRFS_DC_SETUP) {
@@ -3934,6 +3934,10 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
        }
        spin_unlock(&cur_trans->dirty_bgs_lock);
 
+       /*
+        * Refer to the definition of the io_bgs member for details on why
+        * it is safe to use it without any locking.
+        */
        while (!list_empty(io)) {
                cache = list_first_entry(io, struct btrfs_block_group_cache,
                                         io_list);
@@ -4332,8 +4336,7 @@ again:
 
                /* commit the current transaction and try again */
 commit_trans:
-               if (need_commit &&
-                   !atomic_read(&fs_info->open_ioctl_trans)) {
+               if (need_commit) {
                        need_commit--;
 
                        if (need_commit > 0) {
@@ -4541,7 +4544,7 @@ void check_system_chunk(struct btrfs_trans_handle *trans,
         * Needed because we can end up allocating a system chunk, and for an
         * atomic and race-free space reservation in the chunk block reserve.
         */
-       ASSERT(mutex_is_locked(&fs_info->chunk_mutex));
+       lockdep_assert_held(&fs_info->chunk_mutex);
 
        info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
        spin_lock(&info->lock);
@@ -4602,11 +4605,7 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
                return -ENOSPC;
 
        space_info = __find_space_info(fs_info, flags);
-       if (!space_info) {
-               ret = create_space_info(fs_info, flags, &space_info);
-               if (ret)
-                       return ret;
-       }
+       ASSERT(space_info);
 
 again:
        spin_lock(&space_info->lock);
@@ -4705,7 +4704,7 @@ out:
         */
        if (trans->can_flush_pending_bgs &&
            trans->chunk_bytes_reserved >= (u64)SZ_2M) {
-               btrfs_create_pending_block_groups(trans, fs_info);
+               btrfs_create_pending_block_groups(trans);
                btrfs_trans_release_chunk_metadata(trans);
        }
        return ret;
@@ -4826,7 +4825,6 @@ static void shrink_delalloc(struct btrfs_fs_info *fs_info, u64 to_reclaim,
        long time_left;
        unsigned long nr_pages;
        int loops;
-       enum btrfs_reserve_flush_enum flush;
 
        /* Calc the number of pages we need to flush for space reservation */
        items = calc_reclaim_items_nr(fs_info, to_reclaim);
@@ -4867,10 +4865,6 @@ static void shrink_delalloc(struct btrfs_fs_info *fs_info, u64 to_reclaim,
                           atomic_read(&fs_info->async_delalloc_pages) <=
                           (int)max_reclaim);
 skip_async:
-               if (!trans)
-                       flush = BTRFS_RESERVE_FLUSH_ALL;
-               else
-                       flush = BTRFS_RESERVE_NO_FLUSH;
                spin_lock(&space_info->lock);
                if (list_empty(&space_info->tickets) &&
                    list_empty(&space_info->priority_tickets)) {
@@ -4993,7 +4987,7 @@ static void flush_space(struct btrfs_fs_info *fs_info,
                        ret = PTR_ERR(trans);
                        break;
                }
-               ret = btrfs_run_delayed_items_nr(trans, fs_info, nr);
+               ret = btrfs_run_delayed_items_nr(trans, nr);
                btrfs_end_transaction(trans);
                break;
        case FLUSH_DELALLOC:
@@ -5388,10 +5382,15 @@ static int reserve_metadata_bytes(struct btrfs_root *root,
                    !block_rsv_use_bytes(global_rsv, orig_bytes))
                        ret = 0;
        }
-       if (ret == -ENOSPC)
+       if (ret == -ENOSPC) {
                trace_btrfs_space_reservation(fs_info, "space_info:enospc",
                                              block_rsv->space_info->flags,
                                              orig_bytes, 1);
+
+               if (btrfs_test_opt(fs_info, ENOSPC_DEBUG))
+                       dump_space_info(fs_info, block_rsv->space_info,
+                                       orig_bytes, 0);
+       }
        return ret;
 }
 
@@ -5760,6 +5759,9 @@ static int btrfs_inode_rsv_refill(struct btrfs_inode *inode,
        if (num_bytes == 0)
                return 0;
 
+       ret = btrfs_qgroup_reserve_meta_prealloc(root, num_bytes, true);
+       if (ret)
+               return ret;
        ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
        if (!ret) {
                block_rsv_add_bytes(block_rsv, num_bytes, 0);
@@ -5772,11 +5774,15 @@ static int btrfs_inode_rsv_refill(struct btrfs_inode *inode,
 /**
  * btrfs_inode_rsv_release - release any excessive reservation.
  * @inode - the inode we need to release from.
+ * @qgroup_free - free or convert the qgroup meta reservation.
+ *   Unlike a plain release, the qgroup meta reservation needs to know
+ *   whether we are freeing the reservation or just converting it into a
+ *   per-trans reservation.  Normally @qgroup_free is true for error
+ *   handling, and false for a normal release.
  *
  * This is the same as btrfs_block_rsv_release, except that it handles the
  * tracepoint for the reservation.
  */
-static void btrfs_inode_rsv_release(struct btrfs_inode *inode)
+static void btrfs_inode_rsv_release(struct btrfs_inode *inode, bool qgroup_free)
 {
        struct btrfs_fs_info *fs_info = inode->root->fs_info;
        struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
@@ -5792,6 +5798,10 @@ static void btrfs_inode_rsv_release(struct btrfs_inode *inode)
        if (released > 0)
                trace_btrfs_space_reservation(fs_info, "delalloc",
                                              btrfs_ino(inode), released, 0);
+       if (qgroup_free)
+               btrfs_qgroup_free_meta_prealloc(inode->root, released);
+       else
+               btrfs_qgroup_convert_reserved_meta(inode->root, released);
 }
 
 void btrfs_block_rsv_release(struct btrfs_fs_info *fs_info,
@@ -5892,24 +5902,6 @@ static void release_global_block_rsv(struct btrfs_fs_info *fs_info)
        WARN_ON(fs_info->delayed_block_rsv.reserved > 0);
 }
 
-void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
-                                 struct btrfs_fs_info *fs_info)
-{
-       if (!trans->block_rsv) {
-               ASSERT(!trans->bytes_reserved);
-               return;
-       }
-
-       if (!trans->bytes_reserved)
-               return;
-
-       ASSERT(trans->block_rsv == &fs_info->trans_block_rsv);
-       trace_btrfs_space_reservation(fs_info, "transaction",
-                                     trans->transid, trans->bytes_reserved, 0);
-       btrfs_block_rsv_release(fs_info, trans->block_rsv,
-                               trans->bytes_reserved);
-       trans->bytes_reserved = 0;
-}
 
 /*
  * To be called after all the new block groups attached to the transaction
@@ -5951,7 +5943,7 @@ int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans,
         */
        u64 num_bytes = btrfs_calc_trans_metadata_size(fs_info, 1);
 
-       trace_btrfs_space_reservation(fs_info, "orphan", btrfs_ino(inode), 
+       trace_btrfs_space_reservation(fs_info, "orphan", btrfs_ino(inode),
                        num_bytes, 1);
        return btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes, 1);
 }
@@ -5995,7 +5987,7 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
        if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) {
                /* One for parent inode, two for dir entries */
                num_bytes = 3 * fs_info->nodesize;
-               ret = btrfs_qgroup_reserve_meta(root, num_bytes, true);
+               ret = btrfs_qgroup_reserve_meta_prealloc(root, num_bytes, true);
                if (ret)
                        return ret;
        } else {
@@ -6014,7 +6006,7 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
                ret = btrfs_block_rsv_migrate(global_rsv, rsv, num_bytes, 1);
 
        if (ret && *qgroup_reserved)
-               btrfs_qgroup_free_meta(root, *qgroup_reserved);
+               btrfs_qgroup_free_meta_prealloc(root, *qgroup_reserved);
 
        return ret;
 }
@@ -6051,7 +6043,6 @@ static void btrfs_calculate_inode_block_rsv_size(struct btrfs_fs_info *fs_info,
 int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes)
 {
        struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb);
-       struct btrfs_root *root = inode->root;
        unsigned nr_extents;
        enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL;
        int ret = 0;
@@ -6068,13 +6059,13 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes)
        if (btrfs_is_free_space_inode(inode)) {
                flush = BTRFS_RESERVE_NO_FLUSH;
                delalloc_lock = false;
-       } else if (current->journal_info) {
-               flush = BTRFS_RESERVE_FLUSH_LIMIT;
-       }
+       } else {
+               if (current->journal_info)
+                       flush = BTRFS_RESERVE_FLUSH_LIMIT;
 
-       if (flush != BTRFS_RESERVE_NO_FLUSH &&
-           btrfs_transaction_in_commit(fs_info))
-               schedule_timeout(1);
+               if (btrfs_transaction_in_commit(fs_info))
+                       schedule_timeout(1);
+       }
 
        if (delalloc_lock)
                mutex_lock(&inode->delalloc_mutex);
@@ -6089,19 +6080,9 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes)
        btrfs_calculate_inode_block_rsv_size(fs_info, inode);
        spin_unlock(&inode->lock);
 
-       if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) {
-               ret = btrfs_qgroup_reserve_meta(root,
-                               nr_extents * fs_info->nodesize, true);
-               if (ret)
-                       goto out_fail;
-       }
-
        ret = btrfs_inode_rsv_refill(inode, flush);
-       if (unlikely(ret)) {
-               btrfs_qgroup_free_meta(root,
-                                      nr_extents * fs_info->nodesize);
+       if (unlikely(ret))
                goto out_fail;
-       }
 
        if (delalloc_lock)
                mutex_unlock(&inode->delalloc_mutex);
@@ -6115,7 +6096,7 @@ out_fail:
        btrfs_calculate_inode_block_rsv_size(fs_info, inode);
        spin_unlock(&inode->lock);
 
-       btrfs_inode_rsv_release(inode);
+       btrfs_inode_rsv_release(inode, true);
        if (delalloc_lock)
                mutex_unlock(&inode->delalloc_mutex);
        return ret;
@@ -6125,12 +6106,14 @@ out_fail:
  * btrfs_delalloc_release_metadata - release a metadata reservation for an inode
  * @inode: the inode to release the reservation for.
  * @num_bytes: the number of bytes we are releasing.
+ * @qgroup_free: free the qgroup reservation or convert it to a per-trans
+ *               reservation
  *
  * This will release the metadata reservation for an inode.  This can be called
  * once we complete IO for a given set of bytes to release their metadata
  * reservations, or on error for the same reason.
  */
-void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes)
+void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes,
+                                    bool qgroup_free)
 {
        struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb);
 
@@ -6143,13 +6126,14 @@ void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes)
        if (btrfs_is_testing(fs_info))
                return;
 
-       btrfs_inode_rsv_release(inode);
+       btrfs_inode_rsv_release(inode, qgroup_free);
 }
 
 /**
  * btrfs_delalloc_release_extents - release our outstanding_extents
  * @inode: the inode to balance the reservation for.
  * @num_bytes: the number of bytes we originally reserved with
+ * @qgroup_free: whether to free the qgroup meta reservation or convert it
+ *               to a per-trans reservation.
  *
  * When we reserve space we increase outstanding_extents for the extents we may
  * add.  Once we've set the range as delalloc or created our ordered extents we
@@ -6157,7 +6141,8 @@ void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes)
  * temporarily tracked outstanding_extents.  This _must_ be used in conjunction
  * with btrfs_delalloc_reserve_metadata.
  */
-void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes)
+void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes,
+                                   bool qgroup_free)
 {
        struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb);
        unsigned num_extents;
@@ -6171,7 +6156,7 @@ void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes)
        if (btrfs_is_testing(fs_info))
                return;
 
-       btrfs_inode_rsv_release(inode);
+       btrfs_inode_rsv_release(inode, qgroup_free);
 }
 
 /**
@@ -6227,9 +6212,9 @@ int btrfs_delalloc_reserve_space(struct inode *inode,
  */
 void btrfs_delalloc_release_space(struct inode *inode,
                                  struct extent_changeset *reserved,
-                                 u64 start, u64 len)
+                                 u64 start, u64 len, bool qgroup_free)
 {
-       btrfs_delalloc_release_metadata(BTRFS_I(inode), len);
+       btrfs_delalloc_release_metadata(BTRFS_I(inode), len, qgroup_free);
        btrfs_free_reserved_data_space(inode, reserved, start, len);
 }
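
The qgroup_free flag threads down to the btrfs_qgroup_free_meta_prealloc() /
btrfs_qgroup_convert_reserved_meta() pair seen earlier: error paths hand the
prealloc reservation back in full, success paths convert it into a per-trans
reservation. The inode-cache hunk later in this diff shows the intended
calling pattern:

    ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, prealloc,
                                          prealloc, prealloc, &alloc_hint);
    if (ret) {
            /* error: free the qgroup meta reservation outright */
            btrfs_delalloc_release_extents(BTRFS_I(inode), prealloc, true);
            goto out_put;
    }
    /* success: convert the reservation to per-trans instead */
    btrfs_delalloc_release_extents(BTRFS_I(inode), prealloc, false);
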
 
@@ -6783,9 +6768,9 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info,
        return 0;
 }
 
-int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
-                              struct btrfs_fs_info *fs_info)
+int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans)
 {
+       struct btrfs_fs_info *fs_info = trans->fs_info;
        struct btrfs_block_group_cache *block_group, *tmp;
        struct list_head *deleted_bgs;
        struct extent_io_tree *unpin;
@@ -7351,29 +7336,6 @@ wait_block_group_cache_done(struct btrfs_block_group_cache *cache)
        return ret;
 }
 
-int __get_raid_index(u64 flags)
-{
-       if (flags & BTRFS_BLOCK_GROUP_RAID10)
-               return BTRFS_RAID_RAID10;
-       else if (flags & BTRFS_BLOCK_GROUP_RAID1)
-               return BTRFS_RAID_RAID1;
-       else if (flags & BTRFS_BLOCK_GROUP_DUP)
-               return BTRFS_RAID_DUP;
-       else if (flags & BTRFS_BLOCK_GROUP_RAID0)
-               return BTRFS_RAID_RAID0;
-       else if (flags & BTRFS_BLOCK_GROUP_RAID5)
-               return BTRFS_RAID_RAID5;
-       else if (flags & BTRFS_BLOCK_GROUP_RAID6)
-               return BTRFS_RAID_RAID6;
-
-       return BTRFS_RAID_SINGLE; /* BTRFS_BLOCK_GROUP_SINGLE */
-}
-
-int get_block_group_index(struct btrfs_block_group_cache *cache)
-{
-       return __get_raid_index(cache->flags);
-}
-
 static const char *btrfs_raid_type_names[BTRFS_NR_RAID_TYPES] = {
        [BTRFS_RAID_RAID10]     = "raid10",
        [BTRFS_RAID_RAID1]      = "raid1",
@@ -7488,7 +7450,7 @@ static noinline int find_free_extent(struct btrfs_fs_info *fs_info,
        u64 empty_cluster = 0;
        struct btrfs_space_info *space_info;
        int loop = 0;
-       int index = __get_raid_index(flags);
+       int index = btrfs_bg_flags_to_raid_index(flags);
        bool failed_cluster_refill = false;
        bool failed_alloc = false;
        bool use_cluster = true;
@@ -7574,7 +7536,8 @@ static noinline int find_free_extent(struct btrfs_fs_info *fs_info,
                                btrfs_put_block_group(block_group);
                                up_read(&space_info->groups_sem);
                        } else {
-                               index = get_block_group_index(block_group);
+                               index = btrfs_bg_flags_to_raid_index(
+                                               block_group->flags);
                                btrfs_lock_block_group(block_group, delalloc);
                                goto have_block_group;
                        }
@@ -7584,7 +7547,7 @@ static noinline int find_free_extent(struct btrfs_fs_info *fs_info,
        }
 search:
        have_caching_bg = false;
-       if (index == 0 || index == __get_raid_index(flags))
+       if (index == 0 || index == btrfs_bg_flags_to_raid_index(flags))
                full_search = true;
        down_read(&space_info->groups_sem);
        list_for_each_entry(block_group, &space_info->block_groups[index],
@@ -7842,7 +7805,8 @@ checks:
 loop:
                failed_cluster_refill = false;
                failed_alloc = false;
-               BUG_ON(index != get_block_group_index(block_group));
+               BUG_ON(btrfs_bg_flags_to_raid_index(block_group->flags) !=
+                      index);
                btrfs_release_block_group(block_group, delalloc);
                cond_resched();
        }
@@ -7996,6 +7960,51 @@ again:
        up_read(&info->groups_sem);
 }
 
+/*
+ * btrfs_reserve_extent - entry point to the extent allocator. Tries to find a
+ *                       hole that is at least as big as @num_bytes.
+ *
+ * @root           -   The root that will contain this extent
+ *
+ * @ram_bytes      -   The amount of space in ram that @num_bytes take. This
+ *                     is used for accounting purposes. This value differs
+ *                     from @num_bytes only in the case of compressed extents.
+ *
+ * @num_bytes      -   Number of bytes to allocate on-disk.
+ *
+ * @min_alloc_size -   Indicates the minimum amount of space that the
+ *                     allocator should try to satisfy. In some cases
+ *                     @num_bytes may be larger than what is required, and if
+ *                     the filesystem is fragmented then the allocation fails.
+ *                     However, the presence of @min_alloc_size gives the
+ *                     allocator a chance to satisfy the smaller allocation.
+ *
+ * @empty_size     -   A hint that you plan on doing more COW. This is the
+ *                     size in bytes the allocator should try to find free
+ *                     next to the block it returns.  This is just a hint and
+ *                     may be ignored by the allocator.
+ *
+ * @hint_byte      -   Hint to the allocator to start searching above the byte
+ *                     address passed. It might be ignored.
+ *
+ * @ins            -   This key is modified to record the found hole. It will
+ *                     have the following values:
+ *                     ins->objectid == start position
+ *                     ins->flags = BTRFS_EXTENT_ITEM_KEY
+ *                     ins->offset == the size of the hole.
+ *
+ * @is_data        -   Boolean flag indicating whether an extent is
+ *                     allocated for data (true) or metadata (false)
+ *
+ * @delalloc       -   Boolean flag indicating whether this allocation is for
+ *                     delalloc or not. If 'true' data_rwsem of block groups
+ *                     is going to be acquired.
+ *
+ * Returns 0 when an allocation succeeded or < 0 when an error occurred. In
+ * case -ENOSPC is returned then @ins->offset will contain the size of the
+ * largest available hole the allocator managed to find.
+ */
 int btrfs_reserve_extent(struct btrfs_root *root, u64 ram_bytes,
                         u64 num_bytes, u64 min_alloc_size,
                         u64 empty_size, u64 hint_byte,
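
For a concrete caller, cow_file_range() later in this diff requests
cur_alloc_size bytes but allows the allocator to fall back to as little as
one sector when the filesystem is fragmented:

    ret = btrfs_reserve_extent(root, cur_alloc_size, cur_alloc_size,
                               fs_info->sectorsize, 0, alloc_hint,
                               &ins, 1, 1);
    if (ret == -ENOSPC) {
            /* ins.offset holds the largest hole the allocator found */
    }
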
@@ -8699,6 +8708,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
        u64 parent;
        u32 blocksize;
        struct btrfs_key key;
+       struct btrfs_key first_key;
        struct extent_buffer *next;
        int level = wc->level;
        int reada = 0;
@@ -8719,6 +8729,8 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
        }
 
        bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]);
+       btrfs_node_key_to_cpu(path->nodes[level], &first_key,
+                             path->slots[level]);
        blocksize = fs_info->nodesize;
 
        next = find_extent_buffer(fs_info, bytenr);
@@ -8783,7 +8795,8 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
        if (!next) {
                if (reada && level == 1)
                        reada_walk_down(trans, root, wc, path);
-               next = read_tree_block(fs_info, bytenr, generation);
+               next = read_tree_block(fs_info, bytenr, generation, level - 1,
+                                      &first_key);
                if (IS_ERR(next)) {
                        return PTR_ERR(next);
                } else if (!extent_buffer_uptodate(next)) {
@@ -9648,7 +9661,7 @@ int btrfs_can_relocate(struct btrfs_fs_info *fs_info, u64 bytenr)
         */
        target = get_restripe_target(fs_info, block_group->flags);
        if (target) {
-               index = __get_raid_index(extended_to_chunk(target));
+               index = btrfs_bg_flags_to_raid_index(extended_to_chunk(target));
        } else {
                /*
                 * this is just a balance, so if we were marked as full
@@ -9662,7 +9675,7 @@ int btrfs_can_relocate(struct btrfs_fs_info *fs_info, u64 bytenr)
                        goto out;
                }
 
-               index = get_block_group_index(block_group);
+               index = btrfs_bg_flags_to_raid_index(block_group->flags);
        }
 
        if (index == BTRFS_RAID_RAID10) {
@@ -9911,10 +9924,40 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
        return 0;
 }
 
+/* link_block_group will queue up kobjects to add when we're reclaim-safe */
+void btrfs_add_raid_kobjects(struct btrfs_fs_info *fs_info)
+{
+       struct btrfs_space_info *space_info;
+       struct raid_kobject *rkobj;
+       LIST_HEAD(list);
+       int index;
+       int ret = 0;
+
+       spin_lock(&fs_info->pending_raid_kobjs_lock);
+       list_splice_init(&fs_info->pending_raid_kobjs, &list);
+       spin_unlock(&fs_info->pending_raid_kobjs_lock);
+
+       list_for_each_entry(rkobj, &list, list) {
+               space_info = __find_space_info(fs_info, rkobj->flags);
+               index = btrfs_bg_flags_to_raid_index(rkobj->flags);
+
+               ret = kobject_add(&rkobj->kobj, &space_info->kobj,
+                                 "%s", get_raid_name(index));
+               if (ret) {
+                       kobject_put(&rkobj->kobj);
+                       break;
+               }
+       }
+       if (ret)
+               btrfs_warn(fs_info,
+                          "failed to add kobject for block cache, ignoring");
+}
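
Registering the kobject directly from link_block_group() meant allocating
from a context that is not always reclaim-safe (see the queue-up comment
above btrfs_add_raid_kobjects()); the rework splits it into a two-phase
handoff, queued under a spinlock at creation time and drained later from a
safe caller such as btrfs_read_block_groups(). Sketch of the two halves:

    /* producer (link_block_group): only queue, do not register yet */
    spin_lock(&fs_info->pending_raid_kobjs_lock);
    list_add_tail(&rkobj->list, &fs_info->pending_raid_kobjs);
    spin_unlock(&fs_info->pending_raid_kobjs_lock);

    /* consumer, run once we are reclaim-safe */
    btrfs_add_raid_kobjects(fs_info);
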
+
 static void link_block_group(struct btrfs_block_group_cache *cache)
 {
        struct btrfs_space_info *space_info = cache->space_info;
-       int index = get_block_group_index(cache);
+       struct btrfs_fs_info *fs_info = cache->fs_info;
+       int index = btrfs_bg_flags_to_raid_index(cache->flags);
        bool first = false;
 
        down_write(&space_info->groups_sem);
@@ -9924,27 +9967,20 @@ static void link_block_group(struct btrfs_block_group_cache *cache)
        up_write(&space_info->groups_sem);
 
        if (first) {
-               struct raid_kobject *rkobj;
-               int ret;
-
-               rkobj = kzalloc(sizeof(*rkobj), GFP_NOFS);
-               if (!rkobj)
-                       goto out_err;
-               rkobj->raid_type = index;
-               kobject_init(&rkobj->kobj, &btrfs_raid_ktype);
-               ret = kobject_add(&rkobj->kobj, &space_info->kobj,
-                                 "%s", get_raid_name(index));
-               if (ret) {
-                       kobject_put(&rkobj->kobj);
-                       goto out_err;
+               struct raid_kobject *rkobj = kzalloc(sizeof(*rkobj), GFP_NOFS);
+               if (!rkobj) {
+                       btrfs_warn(cache->fs_info,
+                               "couldn't alloc memory for raid level kobject");
+                       return;
                }
+               rkobj->flags = cache->flags;
+               kobject_init(&rkobj->kobj, &btrfs_raid_ktype);
+
+               spin_lock(&fs_info->pending_raid_kobjs_lock);
+               list_add_tail(&rkobj->list, &fs_info->pending_raid_kobjs);
+               spin_unlock(&fs_info->pending_raid_kobjs_lock);
                space_info->block_group_kobjs[index] = &rkobj->kobj;
        }
-
-       return;
-out_err:
-       btrfs_warn(cache->fs_info,
-                  "failed to add kobject for block cache, ignoring");
 }
 
 static struct btrfs_block_group_cache *
@@ -10160,6 +10196,7 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info)
                        inc_block_group_ro(cache, 1);
        }
 
+       btrfs_add_raid_kobjects(info);
        init_global_block_rsv(info);
        ret = 0;
 error:
@@ -10167,9 +10204,9 @@ error:
        return ret;
 }
 
-void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans,
-                                      struct btrfs_fs_info *fs_info)
+void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans)
 {
+       struct btrfs_fs_info *fs_info = trans->fs_info;
        struct btrfs_block_group_cache *block_group, *tmp;
        struct btrfs_root *extent_root = fs_info->extent_root;
        struct btrfs_block_group_item item;
@@ -10254,15 +10291,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
         * with its ->space_info set.
         */
        cache->space_info = __find_space_info(fs_info, cache->flags);
-       if (!cache->space_info) {
-               ret = create_space_info(fs_info, cache->flags,
-                                      &cache->space_info);
-               if (ret) {
-                       btrfs_remove_free_space_cache(cache);
-                       btrfs_put_block_group(cache);
-                       return ret;
-               }
-       }
+       ASSERT(cache->space_info);
 
        ret = btrfs_add_block_group_cache(fs_info, cache);
        if (ret) {
@@ -10334,7 +10363,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
                                  block_group->key.offset);
 
        memcpy(&key, &block_group->key, sizeof(key));
-       index = get_block_group_index(block_group);
+       index = btrfs_bg_flags_to_raid_index(block_group->flags);
        if (block_group->flags & (BTRFS_BLOCK_GROUP_DUP |
                                  BTRFS_BLOCK_GROUP_RAID1 |
                                  BTRFS_BLOCK_GROUP_RAID10))
index dfeb74a0be77c92c525745c3352d3e72759b377d..47a8fe9d22e890bddd2d3df7bf9abb027db4647a 100644 (file)
@@ -76,8 +76,8 @@ void btrfs_leak_debug_check(void)
 
        while (!list_empty(&buffers)) {
                eb = list_entry(buffers.next, struct extent_buffer, leak_list);
-               pr_err("BTRFS: buffer leak start %llu len %lu refs %d\n",
-                      eb->start, eb->len, atomic_read(&eb->refs));
+               pr_err("BTRFS: buffer leak start %llu len %lu refs %d bflags %lu\n",
+                      eb->start, eb->len, atomic_read(&eb->refs), eb->bflags);
                list_del(&eb->leak_list);
                kmem_cache_free(extent_buffer_cache, eb);
        }
@@ -119,23 +119,22 @@ struct extent_page_data {
        unsigned int sync_io:1;
 };
 
-static void add_extent_changeset(struct extent_state *state, unsigned bits,
+static int add_extent_changeset(struct extent_state *state, unsigned bits,
                                 struct extent_changeset *changeset,
                                 int set)
 {
        int ret;
 
        if (!changeset)
-               return;
+               return 0;
        if (set && (state->state & bits) == bits)
-               return;
+               return 0;
        if (!set && (state->state & bits) == 0)
-               return;
+               return 0;
        changeset->bytes_changed += state->end - state->start + 1;
        ret = ulist_add(&changeset->range_changed, state->start, state->end,
                        GFP_ATOMIC);
-       /* ENOMEM */
-       BUG_ON(ret < 0);
+       return ret;
 }
 
 static void flush_write_bio(struct extent_page_data *epd);
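
add_extent_changeset() now returns the -ENOMEM that ulist_add(..., GFP_ATOMIC)
can produce instead of crashing inside the helper. The two call sites below
still BUG_ON() for now, but the error is visible at the caller, so it could
eventually be propagated; a hedged sketch of what that would look like:

    ret = add_extent_changeset(state, bits_to_set, changeset, 1);
    if (ret < 0)
            return ret;     /* instead of BUG_ON(ret < 0) */
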
@@ -187,7 +186,7 @@ free_state_cache:
        return -ENOMEM;
 }
 
-void extent_io_exit(void)
+void __cold extent_io_exit(void)
 {
        btrfs_leak_debug_check();
 
@@ -527,6 +526,7 @@ static struct extent_state *clear_state_bit(struct extent_io_tree *tree,
 {
        struct extent_state *next;
        unsigned bits_to_clear = *bits & ~EXTENT_CTLBITS;
+       int ret;
 
        if ((bits_to_clear & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) {
                u64 range = state->end - state->start + 1;
@@ -534,7 +534,8 @@ static struct extent_state *clear_state_bit(struct extent_io_tree *tree,
                tree->dirty_bytes -= range;
        }
        clear_state_cb(tree, state, bits);
-       add_extent_changeset(state, bits_to_clear, changeset, 0);
+       ret = add_extent_changeset(state, bits_to_clear, changeset, 0);
+       BUG_ON(ret < 0);
        state->state &= ~bits_to_clear;
        if (wake)
                wake_up(&state->wq);
@@ -805,13 +806,15 @@ static void set_state_bits(struct extent_io_tree *tree,
                           unsigned *bits, struct extent_changeset *changeset)
 {
        unsigned bits_to_set = *bits & ~EXTENT_CTLBITS;
+       int ret;
 
        set_state_cb(tree, state, bits);
        if ((bits_to_set & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) {
                u64 range = state->end - state->start + 1;
                tree->dirty_bytes += range;
        }
-       add_extent_changeset(state, bits_to_set, changeset, 1);
+       ret = add_extent_changeset(state, bits_to_set, changeset, 1);
+       BUG_ON(ret < 0);
        state->state |= bits_to_set;
 }
 
@@ -2744,20 +2747,21 @@ static int __must_check submit_one_bio(struct bio *bio, int mirror_num,
        return blk_status_to_errno(ret);
 }
 
-static int merge_bio(struct extent_io_tree *tree, struct page *page,
-                    unsigned long offset, size_t size, struct bio *bio,
-                    unsigned long bio_flags)
-{
-       int ret = 0;
-       if (tree->ops)
-               ret = tree->ops->merge_bio_hook(page, offset, size, bio,
-                                               bio_flags);
-       return ret;
-
-}
-
 /*
  * @opf:       bio REQ_OP_* and REQ_* flags as one value
+ * @tree:      tree so we can call our merge_bio hook
+ * @wbc:       optional writeback control for io accounting
+ * @page:      page to add to the bio
+ * @pg_offset: offset of the new bio or to check whether we are adding
+ *             a contiguous page to the previous one
+ * @size:      portion of page that we want to write
+ * @offset:    starting offset in the page
+ * @bdev:      attach newly created bios to this bdev
+ * @bio_ret:   must be a valid pointer, a newly allocated bio will be
+ *             stored there
+ * @end_io_func:     end_io callback for the new bio
+ * @mirror_num:      desired mirror to read/write
+ * @prev_bio_flags:  flags of the previous bio to see if we can merge the
+ *                   current one
+ * @bio_flags: flags of the current bio to see if we can merge them
  */
 static int submit_extent_page(unsigned int opf, struct extent_io_tree *tree,
                              struct writeback_control *wbc,
@@ -2773,21 +2777,27 @@ static int submit_extent_page(unsigned int opf, struct extent_io_tree *tree,
 {
        int ret = 0;
        struct bio *bio;
-       int contig = 0;
-       int old_compressed = prev_bio_flags & EXTENT_BIO_COMPRESSED;
        size_t page_size = min_t(size_t, size, PAGE_SIZE);
        sector_t sector = offset >> 9;
 
-       if (bio_ret && *bio_ret) {
+       ASSERT(bio_ret);
+
+       if (*bio_ret) {
+               bool contig;
+               bool can_merge = true;
+
                bio = *bio_ret;
-               if (old_compressed)
+               if (prev_bio_flags & EXTENT_BIO_COMPRESSED)
                        contig = bio->bi_iter.bi_sector == sector;
                else
                        contig = bio_end_sector(bio) == sector;
 
-               if (prev_bio_flags != bio_flags || !contig ||
+               if (tree->ops && tree->ops->merge_bio_hook(page, offset,
+                                       page_size, bio, bio_flags))
+                       can_merge = false;
+
+               if (prev_bio_flags != bio_flags || !contig || !can_merge ||
                    force_bio_submit ||
-                   merge_bio(tree, page, pg_offset, page_size, bio, bio_flags) ||
                    bio_add_page(bio, page, page_size, pg_offset) < page_size) {
                        ret = submit_one_bio(bio, mirror_num, prev_bio_flags);
                        if (ret < 0) {
@@ -2813,10 +2823,7 @@ static int submit_extent_page(unsigned int opf, struct extent_io_tree *tree,
                wbc_account_io(wbc, page, page_size);
        }
 
-       if (bio_ret)
-               *bio_ret = bio;
-       else
-               ret = submit_one_bio(bio, mirror_num, bio_flags);
+       *bio_ret = bio;
 
        return ret;
 }
@@ -2886,8 +2893,7 @@ static int __do_readpage(struct extent_io_tree *tree,
 {
        struct inode *inode = page->mapping->host;
        u64 start = page_offset(page);
-       u64 page_end = start + PAGE_SIZE - 1;
-       u64 end;
+       const u64 end = start + PAGE_SIZE - 1;
        u64 cur = start;
        u64 extent_offset;
        u64 last_byte = i_size_read(inode);
@@ -2905,7 +2911,6 @@ static int __do_readpage(struct extent_io_tree *tree,
 
        set_page_extent_mapped(page);
 
-       end = page_end;
        if (!PageUptodate(page)) {
                if (cleancache_get_page(page) == 0) {
                        BUG_ON(blocksize != PAGE_SIZE);
@@ -5230,11 +5235,6 @@ void set_extent_buffer_uptodate(struct extent_buffer *eb)
        }
 }
 
-int extent_buffer_uptodate(struct extent_buffer *eb)
-{
-       return test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
-}
-
 int read_extent_buffer_pages(struct extent_io_tree *tree,
                             struct extent_buffer *eb, int wait, int mirror_num)
 {
index a7a850abd60082a13baffa9bde3480d00b4718ff..b77d849098637eb95037ff54cc59783917a58e82 100644 (file)
@@ -83,8 +83,8 @@ static inline int le_test_bit(int nr, const u8 *addr)
        return 1U & (addr[BIT_BYTE(nr)] >> (nr & (BITS_PER_BYTE-1)));
 }
 
-extern void le_bitmap_set(u8 *map, unsigned int start, int len);
-extern void le_bitmap_clear(u8 *map, unsigned int start, int len);
+void le_bitmap_set(u8 *map, unsigned int start, int len);
+void le_bitmap_clear(u8 *map, unsigned int start, int len);
 
 struct extent_state;
 struct btrfs_root;
@@ -95,6 +95,13 @@ struct io_failure_record;
 typedef        blk_status_t (extent_submit_bio_hook_t)(void *private_data, struct bio *bio,
                                       int mirror_num, unsigned long bio_flags,
                                       u64 bio_offset);
+
+typedef blk_status_t (extent_submit_bio_start_t)(void *private_data,
+               struct bio *bio, u64 bio_offset);
+
+typedef blk_status_t (extent_submit_bio_done_t)(void *private_data,
+               struct bio *bio, int mirror_num);
+
 struct extent_io_ops {
        /*
         * The following callbacks must always be defined; the function
@@ -286,7 +293,7 @@ int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end);
 int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
                          get_extent_t *get_extent, int mirror_num);
 int __init extent_io_init(void);
-void extent_io_exit(void);
+void __cold extent_io_exit(void);
 
 u64 count_range_bits(struct extent_io_tree *tree,
                     u64 *start, u64 search_end,
@@ -455,6 +462,11 @@ static inline void extent_buffer_get(struct extent_buffer *eb)
        atomic_inc(&eb->refs);
 }
 
+static inline int extent_buffer_uptodate(struct extent_buffer *eb)
+{
+       return test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
+}
+
 int memcmp_extent_buffer(const struct extent_buffer *eb, const void *ptrv,
                         unsigned long start, unsigned long len);
 void read_extent_buffer(const struct extent_buffer *eb, void *dst,
@@ -489,7 +501,6 @@ void clear_extent_buffer_dirty(struct extent_buffer *eb);
 int set_extent_buffer_dirty(struct extent_buffer *eb);
 void set_extent_buffer_uptodate(struct extent_buffer *eb);
 void clear_extent_buffer_uptodate(struct extent_buffer *eb);
-int extent_buffer_uptodate(struct extent_buffer *eb);
 int extent_buffer_under_io(struct extent_buffer *eb);
 int map_private_extent_buffer(const struct extent_buffer *eb,
                              unsigned long offset, unsigned long min_len,
index d3bd02105d1c47ef182c9533f94285eb1d560533..53a0633c6ef731d2e16cec67a5338acb18937eb7 100644 (file)
@@ -2,7 +2,6 @@
 #include <linux/err.h>
 #include <linux/slab.h>
 #include <linux/spinlock.h>
-#include <linux/hardirq.h>
 #include "ctree.h"
 #include "extent_map.h"
 #include "compression.h"
@@ -20,7 +19,7 @@ int __init extent_map_init(void)
        return 0;
 }
 
-void extent_map_exit(void)
+void __cold extent_map_exit(void)
 {
        kmem_cache_destroy(extent_map_cache);
 }
@@ -552,6 +551,9 @@ int btrfs_add_extent_mapping(struct extent_map_tree *em_tree,
                ret = 0;
 
                existing = search_extent_mapping(em_tree, start, len);
+
+               trace_btrfs_handle_em_exist(existing, em, start, len);
+
                /*
                 * existing will always be non-NULL, since there must be
                 * an extent causing the -EEXIST.
index b29f77bc0732eecef20c83a65bf6e6d93ad5c635..f6f8ba114977efa9e2860f3e7f352820e21b79c1 100644 (file)
@@ -86,7 +86,7 @@ void replace_extent_mapping(struct extent_map_tree *tree,
 struct extent_map *alloc_extent_map(void);
 void free_extent_map(struct extent_map *em);
 int __init extent_map_init(void);
-void extent_map_exit(void);
+void __cold extent_map_exit(void);
 int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len, u64 gen);
 void clear_em_logging(struct extent_map_tree *tree, struct extent_map *em);
 struct extent_map *search_extent_mapping(struct extent_map_tree *tree,
index 41ab9073d1d4220efd68859a2c023c85f6a5393c..f247300170e59cad9783a7a2dbaf1a6ca8068cf9 100644 (file)
@@ -1691,7 +1691,7 @@ again:
                                    force_page_uptodate);
                if (ret) {
                        btrfs_delalloc_release_extents(BTRFS_I(inode),
-                                                      reserve_bytes);
+                                                      reserve_bytes, true);
                        break;
                }
 
@@ -1703,7 +1703,7 @@ again:
                        if (extents_locked == -EAGAIN)
                                goto again;
                        btrfs_delalloc_release_extents(BTRFS_I(inode),
-                                                      reserve_bytes);
+                                                      reserve_bytes, true);
                        ret = extents_locked;
                        break;
                }
@@ -1738,7 +1738,7 @@ again:
                                                fs_info->sb->s_blocksize_bits;
                        if (only_release_metadata) {
                                btrfs_delalloc_release_metadata(BTRFS_I(inode),
-                                                               release_bytes);
+                                                       release_bytes, true);
                        } else {
                                u64 __pos;
 
@@ -1747,7 +1747,7 @@ again:
                                        (dirty_pages << PAGE_SHIFT);
                                btrfs_delalloc_release_space(inode,
                                                data_reserved, __pos,
-                                               release_bytes);
+                                               release_bytes, true);
                        }
                }
 
@@ -1760,7 +1760,8 @@ again:
                if (extents_locked)
                        unlock_extent_cached(&BTRFS_I(inode)->io_tree,
                                             lockstart, lockend, &cached_state);
-               btrfs_delalloc_release_extents(BTRFS_I(inode), reserve_bytes);
+               btrfs_delalloc_release_extents(BTRFS_I(inode), reserve_bytes,
+                                              (ret != 0));
                if (ret) {
                        btrfs_drop_pages(pages, num_pages);
                        break;
@@ -1800,11 +1801,11 @@ again:
                if (only_release_metadata) {
                        btrfs_end_write_no_snapshotting(root);
                        btrfs_delalloc_release_metadata(BTRFS_I(inode),
-                                       release_bytes);
+                                       release_bytes, true);
                } else {
                        btrfs_delalloc_release_space(inode, data_reserved,
                                        round_down(pos, fs_info->sectorsize),
-                                       release_bytes);
+                                       release_bytes, true);
                }
        }
 
@@ -1997,8 +1998,6 @@ int btrfs_release_file(struct inode *inode, struct file *filp)
 {
        struct btrfs_file_private *private = filp->private_data;
 
-       if (private && private->trans)
-               btrfs_ioctl_trans_end(filp);
        if (private && private->filldir_buf)
                kfree(private->filldir_buf);
        kfree(private);
@@ -2189,12 +2188,6 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
                goto out;
        }
 
-       /*
-        * ok we haven't committed the transaction yet, lets do a commit
-        */
-       if (file->private_data)
-               btrfs_ioctl_trans_end(file);
-
        /*
         * We use start here because we will need to wait on the IO to complete
         * in btrfs_sync_log, which could require joining a transaction (for
@@ -2214,7 +2207,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
        }
        trans->sync = true;
 
-       ret = btrfs_log_dentry_safe(trans, root, dentry, start, end, &ctx);
+       ret = btrfs_log_dentry_safe(trans, dentry, start, end, &ctx);
        if (ret < 0) {
                /* Fallthrough and commit/free transaction. */
                ret = 1;
@@ -2482,7 +2475,8 @@ static int btrfs_punch_hole_lock_range(struct inode *inode,
                if ((!ordered ||
                    (ordered->file_offset + ordered->len <= lockstart ||
                     ordered->file_offset > lockend)) &&
-                    !btrfs_page_exists_in_range(inode, lockstart, lockend)) {
+                    !filemap_range_has_page(inode->i_mapping,
+                                            lockstart, lockend)) {
                        if (ordered)
                                btrfs_put_ordered_extent(ordered);
                        break;
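
filemap_range_has_page() is the generic pagecache query from
include/linux/fs.h that replaces the btrfs-local
btrfs_page_exists_in_range(); it reports whether any page is cached in the
inclusive byte range:

    bool filemap_range_has_page(struct address_space *mapping,
                                loff_t start_byte, loff_t end_byte);

    /* e.g.: keep waiting while the punched range still has pages */
    if (filemap_range_has_page(inode->i_mapping, lockstart, lockend))
            keep_waiting();     /* helper name illustrative */
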
@@ -3378,7 +3372,7 @@ const struct file_operations btrfs_file_operations = {
        .dedupe_file_range = btrfs_dedupe_file_range,
 };
 
-void btrfs_auto_defrag_exit(void)
+void __cold btrfs_auto_defrag_exit(void)
 {
        kmem_cache_destroy(btrfs_inode_defrag_cachep);
 }
index a9f22ac50d6a92095dd43d35e613091b5dc6fc87..d0dde9e6afd76916ab491fdbb2b42eb75e433afc 100644 (file)
@@ -3547,7 +3547,7 @@ int btrfs_write_out_ino_cache(struct btrfs_root *root,
        if (ret) {
                if (release_metadata)
                        btrfs_delalloc_release_metadata(BTRFS_I(inode),
-                                       inode->i_size);
+                                       inode->i_size, true);
 #ifdef DEBUG
                btrfs_err(fs_info,
                          "failed to write free ino cache for root %llu",
index fe5e0324dca9233205a3c176f10a893dba8c9136..af36a6a971fe56e9151b9bb10dd1a83ab08eacaf 100644 (file)
@@ -1071,7 +1071,7 @@ static int populate_free_space_tree(struct btrfs_trans_handle *trans,
        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;
-       path->reada = 1;
+       path->reada = READA_FORWARD;
 
        path2 = btrfs_alloc_path();
        if (!path2) {
@@ -1573,7 +1573,7 @@ int load_free_space_tree(struct btrfs_caching_control *caching_ctl)
         */
        path->skip_locking = 1;
        path->search_commit_root = 1;
-       path->reada = 1;
+       path->reada = READA_FORWARD;
 
        info = search_free_space_info(NULL, fs_info, block_group, path, 0);
        if (IS_ERR(info)) {
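
path->reada takes values from the readahead enum in ctree.h, so the bare
constant was not just unreadable but wrong: a literal 1 is READA_BACK, not
forward readahead.

    /* from fs/btrfs/ctree.h */
    enum { READA_NONE = 0, READA_BACK, READA_FORWARD };

    path->reada = READA_FORWARD;    /* "= 1" actually meant READA_BACK */
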
diff --git a/fs/btrfs/hash.c b/fs/btrfs/hash.c
deleted file mode 100644 (file)
index baacc18..0000000
+++ /dev/null
@@ -1,54 +0,0 @@
-/*
- * Copyright (C) 2014 Filipe David Borba Manana <fdmanana@gmail.com>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License v2 as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- */
-
-#include <crypto/hash.h>
-#include <linux/err.h>
-#include "hash.h"
-
-static struct crypto_shash *tfm;
-
-int __init btrfs_hash_init(void)
-{
-       tfm = crypto_alloc_shash("crc32c", 0, 0);
-
-       return PTR_ERR_OR_ZERO(tfm);
-}
-
-const char* btrfs_crc32c_impl(void)
-{
-       return crypto_tfm_alg_driver_name(crypto_shash_tfm(tfm));
-}
-
-void btrfs_hash_exit(void)
-{
-       crypto_free_shash(tfm);
-}
-
-u32 btrfs_crc32c(u32 crc, const void *address, unsigned int length)
-{
-       SHASH_DESC_ON_STACK(shash, tfm);
-       u32 *ctx = (u32 *)shash_desc_ctx(shash);
-       u32 retval;
-       int err;
-
-       shash->tfm = tfm;
-       shash->flags = 0;
-       *ctx = crc;
-
-       err = crypto_shash_update(shash, address, length);
-       BUG_ON(err);
-
-       retval = *ctx;
-       barrier_data(ctx);
-       return retval;
-}
diff --git a/fs/btrfs/hash.h b/fs/btrfs/hash.h
deleted file mode 100644 (file)
index c3a2ec5..0000000
+++ /dev/null
@@ -1,43 +0,0 @@
-/*
- * Copyright (C) 2007 Oracle.  All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License v2 as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
- */
-
-#ifndef __HASH__
-#define __HASH__
-
-int __init btrfs_hash_init(void);
-
-void btrfs_hash_exit(void);
-const char* btrfs_crc32c_impl(void);
-
-u32 btrfs_crc32c(u32 crc, const void *address, unsigned int length);
-
-static inline u64 btrfs_name_hash(const char *name, int len)
-{
-       return btrfs_crc32c((u32)~1, name, len);
-}
-
-/*
- * Figure the key offset of an extended inode ref
- */
-static inline u64 btrfs_extref_hash(u64 parent_objectid, const char *name,
-                                   int len)
-{
-       return (u64) btrfs_crc32c(parent_objectid, name, len);
-}
-
-#endif
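
The two name-hash helpers do not go away with this file; in this series they
are presumably re-homed (ctree.h in the upstream version of this cleanup)
with btrfs_crc32c() replaced by a direct crc32c() call. A sketch of the
equivalents under that assumption:

    #include <linux/crc32c.h>

    static inline u64 btrfs_name_hash(const char *name, int len)
    {
            return crc32c((u32)~1, name, len);
    }

    /* figure the key offset of an extended inode ref */
    static inline u64 btrfs_extref_hash(u64 parent_objectid,
                                        const char *name, int len)
    {
            return (u64)crc32c(parent_objectid, name, len);
    }
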
index 65e1a76bf7557441b22b16d4765a531b18b23562..1d5631ef2738888380a9c522acb12b06eaf53e83 100644 (file)
@@ -18,7 +18,6 @@
 
 #include "ctree.h"
 #include "disk-io.h"
-#include "hash.h"
 #include "transaction.h"
 #include "print-tree.h"
 
index 022b19336feeadd7c9fb686fcc94f1266b7b382f..9409dcc7020d78455e5eafc65f1331b6c37f8e85 100644 (file)
@@ -500,12 +500,12 @@ again:
        ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, prealloc,
                                              prealloc, prealloc, &alloc_hint);
        if (ret) {
-               btrfs_delalloc_release_extents(BTRFS_I(inode), prealloc);
+               btrfs_delalloc_release_extents(BTRFS_I(inode), prealloc, true);
                goto out_put;
        }
 
        ret = btrfs_write_out_ino_cache(root, trans, path, inode);
-       btrfs_delalloc_release_extents(BTRFS_I(inode), prealloc);
+       btrfs_delalloc_release_extents(BTRFS_I(inode), prealloc, false);
 out_put:
        iput(inode);
 out_release:
index f53470112670b2de73343e635e4304fd08816468..1f091c2358a408b340b2172848246021b07bb307 100644 (file)
@@ -58,7 +58,6 @@
 #include "free-space-cache.h"
 #include "inode-map.h"
 #include "backref.h"
-#include "hash.h"
 #include "props.h"
 #include "qgroup.h"
 #include "dedupe.h"
@@ -102,7 +101,7 @@ static const unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = {
 };
 
 static int btrfs_setsize(struct inode *inode, struct iattr *attr);
-static int btrfs_truncate(struct inode *inode);
+static int btrfs_truncate(struct inode *inode, bool skip_writeback);
 static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent);
 static noinline int cow_file_range(struct inode *inode,
                                   struct page *locked_page,
@@ -277,12 +276,12 @@ fail:
  * does the checks required to make sure the data is small enough
  * to fit as an inline extent.
  */
-static noinline int cow_file_range_inline(struct btrfs_root *root,
-                                         struct inode *inode, u64 start,
+static noinline int cow_file_range_inline(struct inode *inode, u64 start,
                                          u64 end, size_t compressed_size,
                                          int compress_type,
                                          struct page **compressed_pages)
 {
+       struct btrfs_root *root = BTRFS_I(inode)->root;
        struct btrfs_fs_info *fs_info = root->fs_info;
        struct btrfs_trans_handle *trans;
        u64 isize = i_size_read(inode);
@@ -458,7 +457,6 @@ static noinline void compress_file_range(struct inode *inode,
                                        int *num_added)
 {
        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
-       struct btrfs_root *root = BTRFS_I(inode)->root;
        u64 blocksize = fs_info->sectorsize;
        u64 actual_end;
        u64 isize = i_size_read(inode);
@@ -580,11 +578,11 @@ cont:
                        /* we didn't compress the entire range, try
                         * to make an uncompressed inline extent.
                         */
-                       ret = cow_file_range_inline(root, inode, start, end,
-                                           0, BTRFS_COMPRESS_NONE, NULL);
+                       ret = cow_file_range_inline(inode, start, end, 0,
+                                                   BTRFS_COMPRESS_NONE, NULL);
                } else {
                        /* try making a compressed inline extent */
-                       ret = cow_file_range_inline(root, inode, start, end,
+                       ret = cow_file_range_inline(inode, start, end,
                                                    total_compressed,
                                                    compress_type, pages);
                }
@@ -961,7 +959,6 @@ static noinline int cow_file_range(struct inode *inode,
        u64 alloc_hint = 0;
        u64 num_bytes;
        unsigned long ram_size;
-       u64 disk_num_bytes;
        u64 cur_alloc_size = 0;
        u64 blocksize = fs_info->sectorsize;
        struct btrfs_key ins;
@@ -979,14 +976,14 @@ static noinline int cow_file_range(struct inode *inode,
 
        num_bytes = ALIGN(end - start + 1, blocksize);
        num_bytes = max(blocksize,  num_bytes);
-       disk_num_bytes = num_bytes;
+       ASSERT(num_bytes <= btrfs_super_total_bytes(fs_info->super_copy));
 
        inode_should_defrag(BTRFS_I(inode), start, end, num_bytes, SZ_64K);
 
        if (start == 0) {
                /* lets try to make an inline extent */
-               ret = cow_file_range_inline(root, inode, start, end, 0,
-                                       BTRFS_COMPRESS_NONE, NULL);
+               ret = cow_file_range_inline(inode, start, end, 0,
+                                           BTRFS_COMPRESS_NONE, NULL);
                if (ret == 0) {
                        /*
                         * We use DO_ACCOUNTING here because we need the
@@ -1010,15 +1007,12 @@ static noinline int cow_file_range(struct inode *inode,
                }
        }
 
-       BUG_ON(disk_num_bytes >
-              btrfs_super_total_bytes(fs_info->super_copy));
-
        alloc_hint = get_extent_allocation_hint(inode, start, num_bytes);
        btrfs_drop_extent_cache(BTRFS_I(inode), start,
                        start + num_bytes - 1, 0);
 
-       while (disk_num_bytes > 0) {
-               cur_alloc_size = disk_num_bytes;
+       while (num_bytes > 0) {
+               cur_alloc_size = num_bytes;
                ret = btrfs_reserve_extent(root, cur_alloc_size, cur_alloc_size,
                                           fs_info->sectorsize, 0, alloc_hint,
                                           &ins, 1, 1);
@@ -1082,11 +1076,10 @@ static noinline int cow_file_range(struct inode *inode,
                                             delalloc_end, locked_page,
                                             EXTENT_LOCKED | EXTENT_DELALLOC,
                                             page_ops);
-               if (disk_num_bytes < cur_alloc_size)
-                       disk_num_bytes = 0;
+               if (num_bytes < cur_alloc_size)
+                       num_bytes = 0;
                else
-                       disk_num_bytes -= cur_alloc_size;
-               num_bytes -= cur_alloc_size;
+                       num_bytes -= cur_alloc_size;
                alloc_hint = ins.objectid + ins.offset;
                start += cur_alloc_size;
                extent_reserved = false;
@@ -1262,6 +1255,8 @@ static noinline int csum_exist_in_range(struct btrfs_fs_info *fs_info,
                list_del(&sums->list);
                kfree(sums);
        }
+       if (ret < 0)
+               return ret;
        return 1;
 }
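With the change above, csum_exist_in_range follows a tri-state convention: a negative errno when the csum tree lookup itself fails, 0 when no checksums exist in the range, and 1 when they do. A minimal caller sketch (names mirror the run_delalloc_nocow hunks below; illustrative only):

	ret = csum_exist_in_range(fs_info, disk_bytenr, num_bytes);
	if (ret < 0)
		goto error;	/* e.g. -EIO while reading csum tree metadata */
	if (ret > 0)
		goto out_check;	/* csums exist: cannot nocow, fall back to COW */
	/* ret == 0: no csums in the range, nocow may proceed */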
 
@@ -1394,10 +1389,23 @@ next_slot:
                                goto out_check;
                        if (btrfs_extent_readonly(fs_info, disk_bytenr))
                                goto out_check;
-                       if (btrfs_cross_ref_exist(root, ino,
-                                                 found_key.offset -
-                                                 extent_offset, disk_bytenr))
+                       ret = btrfs_cross_ref_exist(root, ino,
+                                                   found_key.offset -
+                                                   extent_offset, disk_bytenr);
+                       if (ret) {
+                               /*
+                                * ret could be -EIO if the above fails to read
+                                * metadata.
+                                */
+                               if (ret < 0) {
+                                       if (cow_start != (u64)-1)
+                                               cur_offset = cow_start;
+                                       goto error;
+                               }
+
+                               WARN_ON_ONCE(nolock);
                                goto out_check;
+                       }
                        disk_bytenr += extent_offset;
                        disk_bytenr += cur_offset - found_key.offset;
                        num_bytes = min(end + 1, extent_end) - cur_offset;
@@ -1415,10 +1423,22 @@ next_slot:
                         * this ensures that csums for a given extent are
                         * either valid or do not exist.
                         */
-                       if (csum_exist_in_range(fs_info, disk_bytenr,
-                                               num_bytes)) {
+                       ret = csum_exist_in_range(fs_info, disk_bytenr,
+                                                 num_bytes);
+                       if (ret) {
                                if (!nolock)
                                        btrfs_end_write_no_snapshotting(root);
+
+                               /*
+                                * ret could be -EIO if the above fails to read
+                                * metadata.
+                                */
+                               if (ret < 0) {
+                                       if (cow_start != (u64)-1)
+                                               cur_offset = cow_start;
+                                       goto error;
+                               }
+                               WARN_ON_ONCE(nolock);
                                goto out_check;
                        }
                        if (!btrfs_inc_nocow_writers(fs_info, disk_bytenr)) {
@@ -1847,7 +1867,7 @@ static void btrfs_clear_bit_hook(void *private_data,
                 */
                if (*bits & EXTENT_CLEAR_META_RESV &&
                    root != fs_info->tree_root)
-                       btrfs_delalloc_release_metadata(inode, len);
+                       btrfs_delalloc_release_metadata(inode, len, false);
 
                /* For sanity tests. */
                if (btrfs_is_testing(fs_info))
@@ -1921,8 +1941,7 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
  * At IO completion time the csums attached on the ordered extent record
  * are inserted into the btree
  */
-static blk_status_t __btrfs_submit_bio_start(void *private_data, struct bio *bio,
-                                   int mirror_num, unsigned long bio_flags,
+static blk_status_t btrfs_submit_bio_start(void *private_data, struct bio *bio,
                                    u64 bio_offset)
 {
        struct inode *inode = private_data;
@@ -1941,9 +1960,8 @@ static blk_status_t __btrfs_submit_bio_start(void *private_data, struct bio *bio
  * At IO completion time the csums attached on the ordered extent record
  * are inserted into the btree
  */
-static blk_status_t __btrfs_submit_bio_done(void *private_data, struct bio *bio,
-                         int mirror_num, unsigned long bio_flags,
-                         u64 bio_offset)
+static blk_status_t btrfs_submit_bio_done(void *private_data, struct bio *bio,
+                         int mirror_num)
 {
        struct inode *inode = private_data;
        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
@@ -2015,8 +2033,8 @@ static blk_status_t btrfs_submit_bio_hook(void *private_data, struct bio *bio,
                /* we're doing a write, do the async checksumming */
                ret = btrfs_wq_submit_bio(fs_info, bio, mirror_num, bio_flags,
                                          bio_offset, inode,
-                                         __btrfs_submit_bio_start,
-                                         __btrfs_submit_bio_done);
+                                         btrfs_submit_bio_start,
+                                         btrfs_submit_bio_done);
                goto out;
        } else if (!skip_sum) {
                ret = btrfs_csum_one_bio(inode, bio, 0, 0);
@@ -2134,7 +2152,7 @@ again:
 
        ClearPageChecked(page);
        set_page_dirty(page);
-       btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE);
+       btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE, false);
 out:
        unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start, page_end,
                             &cached_state);
@@ -2754,12 +2772,10 @@ static void relink_file_extents(struct new_sa_defrag_extent *new)
        struct sa_defrag_extent_backref *backref;
        struct sa_defrag_extent_backref *prev = NULL;
        struct inode *inode;
-       struct btrfs_root *root;
        struct rb_node *node;
        int ret;
 
        inode = new->inode;
-       root = BTRFS_I(inode)->root;
 
        path = btrfs_alloc_path();
        if (!path)
@@ -3247,6 +3263,16 @@ static int btrfs_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
                                      start, (size_t)(end - start + 1));
 }
 
+/*
+ * btrfs_add_delayed_iput - perform a delayed iput on @inode
+ *
+ * @inode: The inode we want to perform iput on
+ *
+ * This function uses the generic vfs_inode::i_count to track whether we should
+ * just decrement it (when it is > 1) or, if this is the last iput, link the
+ * inode into the delayed iput machinery. Delayed iputs are processed at
+ * transaction commit time, superblock commit, or by the cleaner kthread.
+ */
 void btrfs_add_delayed_iput(struct inode *inode)
 {
        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
@@ -3256,12 +3282,8 @@ void btrfs_add_delayed_iput(struct inode *inode)
                return;
 
        spin_lock(&fs_info->delayed_iput_lock);
-       if (binode->delayed_iput_count == 0) {
-               ASSERT(list_empty(&binode->delayed_iput));
-               list_add_tail(&binode->delayed_iput, &fs_info->delayed_iputs);
-       } else {
-               binode->delayed_iput_count++;
-       }
+       ASSERT(list_empty(&binode->delayed_iput));
+       list_add_tail(&binode->delayed_iput, &fs_info->delayed_iputs);
        spin_unlock(&fs_info->delayed_iput_lock);
 }
 
@@ -3274,13 +3296,7 @@ void btrfs_run_delayed_iputs(struct btrfs_fs_info *fs_info)
 
                inode = list_first_entry(&fs_info->delayed_iputs,
                                struct btrfs_inode, delayed_iput);
-               if (inode->delayed_iput_count) {
-                       inode->delayed_iput_count--;
-                       list_move_tail(&inode->delayed_iput,
-                                       &fs_info->delayed_iputs);
-               } else {
-                       list_del_init(&inode->delayed_iput);
-               }
+               list_del_init(&inode->delayed_iput);
                spin_unlock(&fs_info->delayed_iput_lock);
                iput(&inode->vfs_inode);
                spin_lock(&fs_info->delayed_iput_lock);
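The two hunks above rely on vfs_inode::i_count doing the counting: every iput that is not the last one simply drops i_count, so an inode is queued for a deferred iput at most once and delayed_iput_count becomes redundant. A simplified sketch of the resulting add path (the early-return condition is elided by the hunk context and assumed here):

	/* Not the last reference: a plain decrement is enough. */
	if (atomic_add_unless(&inode->i_count, -1, 1))
		return;

	/* Last reference: queue exactly one deferred iput. */
	spin_lock(&fs_info->delayed_iput_lock);
	ASSERT(list_empty(&binode->delayed_iput));
	list_add_tail(&binode->delayed_iput, &fs_info->delayed_iputs);
	spin_unlock(&fs_info->delayed_iput_lock);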
@@ -3350,7 +3366,7 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans,
        struct btrfs_root *root = inode->root;
        struct btrfs_block_rsv *block_rsv = NULL;
        int reserve = 0;
-       int insert = 0;
+       bool insert = false;
        int ret;
 
        if (!root->orphan_block_rsv) {
@@ -3360,7 +3376,16 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans,
                        return -ENOMEM;
        }
 
+       if (!test_and_set_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
+                             &inode->runtime_flags))
+               insert = true;
+
+       if (!test_and_set_bit(BTRFS_INODE_ORPHAN_META_RESERVED,
+                             &inode->runtime_flags))
+               reserve = 1;
+
        spin_lock(&root->orphan_lock);
+       /* If someone has created ->orphan_block_rsv, be happy to use it. */
        if (!root->orphan_block_rsv) {
                root->orphan_block_rsv = block_rsv;
        } else if (block_rsv) {
@@ -3368,26 +3393,8 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans,
                block_rsv = NULL;
        }
 
-       if (!test_and_set_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
-                             &inode->runtime_flags)) {
-#if 0
-               /*
-                * For proper ENOSPC handling, we should do orphan
-                * cleanup when mounting. But this introduces backward
-                * compatibility issue.
-                */
-               if (!xchg(&root->orphan_item_inserted, 1))
-                       insert = 2;
-               else
-                       insert = 1;
-#endif
-               insert = 1;
+       if (insert)
                atomic_inc(&root->orphan_inodes);
-       }
-
-       if (!test_and_set_bit(BTRFS_INODE_ORPHAN_META_RESERVED,
-                             &inode->runtime_flags))
-               reserve = 1;
        spin_unlock(&root->orphan_lock);
 
        /* grab metadata reservation from transaction handle */
@@ -3411,7 +3418,7 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans,
        }
 
        /* insert an orphan item to track this unlinked/truncated file */
-       if (insert >= 1) {
+       if (insert) {
                ret = btrfs_insert_orphan_item(trans, root, btrfs_ino(inode));
                if (ret) {
                        if (reserve) {
@@ -3435,15 +3442,6 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans,
                ret = 0;
        }
 
-       /* insert an orphan item to track subvolume contains orphan files */
-       if (insert >= 2) {
-               ret = btrfs_insert_orphan_item(trans, fs_info->tree_root,
-                                              root->root_key.objectid);
-               if (ret && ret != -EEXIST) {
-                       btrfs_abort_transaction(trans, ret);
-                       return ret;
-               }
-       }
        return 0;
 }
 
@@ -3644,7 +3642,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
                                goto out;
                        }
 
-                       ret = btrfs_truncate(inode);
+                       ret = btrfs_truncate(inode, false);
                        if (ret)
                                btrfs_orphan_del(NULL, BTRFS_I(inode));
                } else {
@@ -4711,7 +4709,6 @@ delete:
                                if (updates) {
                                        trans->delayed_ref_updates = 0;
                                        ret = btrfs_run_delayed_refs(trans,
-                                                                  fs_info,
                                                                   updates * 2);
                                        if (ret && !err)
                                                err = ret;
@@ -4751,8 +4748,7 @@ error:
                unsigned long updates = trans->delayed_ref_updates;
                if (updates) {
                        trans->delayed_ref_updates = 0;
-                       ret = btrfs_run_delayed_refs(trans, fs_info,
-                                                    updates * 2);
+                       ret = btrfs_run_delayed_refs(trans, updates * 2);
                        if (ret && !err)
                                err = ret;
                }
@@ -4806,8 +4802,8 @@ again:
        page = find_or_create_page(mapping, index, mask);
        if (!page) {
                btrfs_delalloc_release_space(inode, data_reserved,
-                                            block_start, blocksize);
-               btrfs_delalloc_release_extents(BTRFS_I(inode), blocksize);
+                                            block_start, blocksize, true);
+               btrfs_delalloc_release_extents(BTRFS_I(inode), blocksize, true);
                ret = -ENOMEM;
                goto out;
        }
@@ -4874,8 +4870,8 @@ again:
 out_unlock:
        if (ret)
                btrfs_delalloc_release_space(inode, data_reserved, block_start,
-                                            blocksize);
-       btrfs_delalloc_release_extents(BTRFS_I(inode), blocksize);
+                                            blocksize, true);
+       btrfs_delalloc_release_extents(BTRFS_I(inode), blocksize, (ret != 0));
        unlock_page(page);
        put_page(page);
 out:
@@ -5130,7 +5126,7 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr)
                inode_dio_wait(inode);
                btrfs_inode_resume_unlocked_dio(BTRFS_I(inode));
 
-               ret = btrfs_truncate(inode);
+               ret = btrfs_truncate(inode, newsize == oldsize);
                if (ret && inode->i_nlink) {
                        int err;
 
@@ -5466,7 +5462,8 @@ no_delete:
 
 /*
  * this returns the key found in the dir entry in the location pointer.
- * If no dir entries were found, location->objectid is 0.
+ * If no dir entries were found, returns -ENOENT.
+ * If a corrupted location is found in the dir entry, returns -EUCLEAN.
  */
 static int btrfs_inode_by_name(struct inode *dir, struct dentry *dentry,
                               struct btrfs_key *location)
@@ -5484,27 +5481,27 @@ static int btrfs_inode_by_name(struct inode *dir, struct dentry *dentry,
 
        di = btrfs_lookup_dir_item(NULL, root, path, btrfs_ino(BTRFS_I(dir)),
                        name, namelen, 0);
-       if (IS_ERR(di))
+       if (!di) {
+               ret = -ENOENT;
+               goto out;
+       }
+       if (IS_ERR(di)) {
                ret = PTR_ERR(di);
-
-       if (IS_ERR_OR_NULL(di))
-               goto out_err;
+               goto out;
+       }
 
        btrfs_dir_item_key_to_cpu(path->nodes[0], di, location);
        if (location->type != BTRFS_INODE_ITEM_KEY &&
            location->type != BTRFS_ROOT_ITEM_KEY) {
+               ret = -EUCLEAN;
                btrfs_warn(root->fs_info,
 "%s gets something invalid in DIR_ITEM (name %s, directory ino %llu, location(%llu %u %llu))",
                           __func__, name, btrfs_ino(BTRFS_I(dir)),
                           location->objectid, location->type, location->offset);
-               goto out_err;
        }
 out:
        btrfs_free_path(path);
        return ret;
-out_err:
-       location->objectid = 0;
-       goto out;
 }
 
 /*
@@ -5807,9 +5804,6 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
        if (ret < 0)
                return ERR_PTR(ret);
 
-       if (location.objectid == 0)
-               return ERR_PTR(-ENOENT);
-
        if (location.type == BTRFS_INODE_ITEM_KEY) {
                inode = btrfs_iget(dir->i_sb, &location, root, NULL);
                return inode;
@@ -7443,76 +7437,6 @@ out:
        return ret;
 }
 
-bool btrfs_page_exists_in_range(struct inode *inode, loff_t start, loff_t end)
-{
-       struct radix_tree_root *root = &inode->i_mapping->page_tree;
-       bool found = false;
-       void **pagep = NULL;
-       struct page *page = NULL;
-       unsigned long start_idx;
-       unsigned long end_idx;
-
-       start_idx = start >> PAGE_SHIFT;
-
-       /*
-        * end is the last byte in the last page.  end == start is legal
-        */
-       end_idx = end >> PAGE_SHIFT;
-
-       rcu_read_lock();
-
-       /* Most of the code in this while loop is lifted from
-        * find_get_page.  It's been modified to begin searching from a
-        * page and return just the first page found in that range.  If the
-        * found idx is less than or equal to the end idx then we know that
-        * a page exists.  If no pages are found or if those pages are
-        * outside of the range then we're fine (yay!) */
-       while (page == NULL &&
-              radix_tree_gang_lookup_slot(root, &pagep, NULL, start_idx, 1)) {
-               page = radix_tree_deref_slot(pagep);
-               if (unlikely(!page))
-                       break;
-
-               if (radix_tree_exception(page)) {
-                       if (radix_tree_deref_retry(page)) {
-                               page = NULL;
-                               continue;
-                       }
-                       /*
-                        * Otherwise, shmem/tmpfs must be storing a swap entry
-                        * here as an exceptional entry: so return it without
-                        * attempting to raise page count.
-                        */
-                       page = NULL;
-                       break; /* TODO: Is this relevant for this use case? */
-               }
-
-               if (!page_cache_get_speculative(page)) {
-                       page = NULL;
-                       continue;
-               }
-
-               /*
-                * Has the page moved?
-                * This is part of the lockless pagecache protocol. See
-                * include/linux/pagemap.h for details.
-                */
-               if (unlikely(page != *pagep)) {
-                       put_page(page);
-                       page = NULL;
-               }
-       }
-
-       if (page) {
-               if (page->index <= end_idx)
-                       found = true;
-               put_page(page);
-       }
-
-       rcu_read_unlock();
-       return found;
-}
-
 static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
                              struct extent_state **cached_state, int writing)
 {
@@ -7538,8 +7462,8 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
                 * get stale data.
                 */
                if (!ordered &&
-                   (!writing ||
-                    !btrfs_page_exists_in_range(inode, lockstart, lockend)))
+                   (!writing || !filemap_range_has_page(inode->i_mapping,
+                                                        lockstart, lockend)))
                        break;
 
                unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
@@ -8270,9 +8194,8 @@ static void btrfs_endio_direct_write(struct bio *bio)
        bio_put(bio);
 }
 
-static blk_status_t __btrfs_submit_bio_start_direct_io(void *private_data,
-                                   struct bio *bio, int mirror_num,
-                                   unsigned long bio_flags, u64 offset)
+static blk_status_t btrfs_submit_bio_start_direct_io(void *private_data,
+                                   struct bio *bio, u64 offset)
 {
        struct inode *inode = private_data;
        blk_status_t ret;
@@ -8298,13 +8221,13 @@ static void btrfs_end_dio_bio(struct bio *bio)
                err = dip->subio_endio(dip->inode, btrfs_io_bio(bio), err);
 
        if (err) {
-               dip->errors = 1;
-
                /*
-                * before atomic variable goto zero, we must make sure
-                * dip->errors is perceived to be set.
+                * We want the errors flag to be perceived as set before the
+                * reference count is decremented. No extra barrier is needed
+                * since atomic operations with a return value are fully
+                * ordered as per atomic_t.txt
                 */
-               smp_mb__before_atomic();
+               dip->errors = 1;
        }
 
        /* if there are more bios still pending for this dio, just exit */
@@ -8352,9 +8275,8 @@ static inline blk_status_t btrfs_lookup_and_bind_dio_csum(struct inode *inode,
        return 0;
 }
 
-static inline blk_status_t
-__btrfs_submit_dio_bio(struct bio *bio, struct inode *inode, u64 file_offset,
-                      int async_submit)
+static inline blk_status_t btrfs_submit_dio_bio(struct bio *bio,
+               struct inode *inode, u64 file_offset, int async_submit)
 {
        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
        struct btrfs_dio_private *dip = bio->bi_private;
@@ -8377,8 +8299,8 @@ __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode, u64 file_offset,
        if (write && async_submit) {
                ret = btrfs_wq_submit_bio(fs_info, bio, 0, 0,
                                          file_offset, inode,
-                                         __btrfs_submit_bio_start_direct_io,
-                                         __btrfs_submit_bio_done);
+                                         btrfs_submit_bio_start_direct_io,
+                                         btrfs_submit_bio_done);
                goto err;
        } else if (write) {
                /*
@@ -8464,7 +8386,7 @@ static int btrfs_submit_direct_hook(struct btrfs_dio_private *dip)
                 */
                atomic_inc(&dip->pending_bios);
 
-               status = __btrfs_submit_dio_bio(bio, inode, file_offset,
+               status = btrfs_submit_dio_bio(bio, inode, file_offset,
                                                async_submit);
                if (status) {
                        bio_put(bio);
@@ -8484,7 +8406,7 @@ static int btrfs_submit_direct_hook(struct btrfs_dio_private *dip)
        } while (submit_len > 0);
 
 submit:
-       status = __btrfs_submit_dio_bio(bio, inode, file_offset, async_submit);
+       status = btrfs_submit_dio_bio(bio, inode, file_offset, async_submit);
        if (!status)
                return 0;
 
@@ -8492,10 +8414,11 @@ submit:
 out_err:
        dip->errors = 1;
        /*
-        * before atomic variable goto zero, we must
-        * make sure dip->errors is perceived to be set.
+        * Before the atomic variable goes to zero, we must make sure
+        * dip->errors is perceived to be set. This ordering is ensured by the
+        * fact that atomic operations with a return value are fully ordered
+        * as per atomic_t.txt
         */
-       smp_mb__before_atomic();
        if (atomic_dec_and_test(&dip->pending_bios))
                bio_io_error(dip->orig_bio);
 
@@ -8713,7 +8636,7 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
                if (ret < 0 && ret != -EIOCBQUEUED) {
                        if (dio_data.reserve)
                                btrfs_delalloc_release_space(inode, data_reserved,
-                                       offset, dio_data.reserve);
+                                       offset, dio_data.reserve, true);
                        /*
                         * On error we might have left some ordered extents
                         * without submitting corresponding bios for them, so
@@ -8729,8 +8652,8 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
                                        false);
                } else if (ret >= 0 && (size_t)ret < count)
                        btrfs_delalloc_release_space(inode, data_reserved,
-                                       offset, count - (size_t)ret);
-               btrfs_delalloc_release_extents(BTRFS_I(inode), count);
+                                       offset, count - (size_t)ret, true);
+               btrfs_delalloc_release_extents(BTRFS_I(inode), count, false);
        }
 out:
        if (wakeup)
@@ -9045,7 +8968,8 @@ again:
                if (reserved_space < PAGE_SIZE) {
                        end = page_start + reserved_space - 1;
                        btrfs_delalloc_release_space(inode, data_reserved,
-                                       page_start, PAGE_SIZE - reserved_space);
+                                       page_start, PAGE_SIZE - reserved_space,
+                                       true);
                }
        }
 
@@ -9095,23 +9019,23 @@ again:
 
 out_unlock:
        if (!ret) {
-               btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE);
+               btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE, true);
                sb_end_pagefault(inode->i_sb);
                extent_changeset_free(data_reserved);
                return VM_FAULT_LOCKED;
        }
        unlock_page(page);
 out:
-       btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE);
+       btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE, (ret != 0));
        btrfs_delalloc_release_space(inode, data_reserved, page_start,
-                                    reserved_space);
+                                    reserved_space, (ret != 0));
 out_noreserve:
        sb_end_pagefault(inode->i_sb);
        extent_changeset_free(data_reserved);
        return ret;
 }
 
-static int btrfs_truncate(struct inode *inode)
+static int btrfs_truncate(struct inode *inode, bool skip_writeback)
 {
        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
        struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -9122,10 +9046,12 @@ static int btrfs_truncate(struct inode *inode)
        u64 mask = fs_info->sectorsize - 1;
        u64 min_size = btrfs_calc_trunc_metadata_size(fs_info, 1);
 
-       ret = btrfs_wait_ordered_range(inode, inode->i_size & (~mask),
-                                      (u64)-1);
-       if (ret)
-               return ret;
+       if (!skip_writeback) {
+               ret = btrfs_wait_ordered_range(inode, inode->i_size & (~mask),
+                                              (u64)-1);
+               if (ret)
+                       return ret;
+       }
 
        /*
         * Yes ladies and gentlemen, this is indeed ugly.  The fact is we have
@@ -9335,7 +9261,6 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
        ei->dir_index = 0;
        ei->last_unlink_trans = 0;
        ei->last_log_commit = 0;
-       ei->delayed_iput_count = 0;
 
        spin_lock_init(&ei->lock);
        ei->outstanding_extents = 0;
@@ -9455,7 +9380,7 @@ static void init_once(void *foo)
        inode_init_once(&ei->vfs_inode);
 }
 
-void btrfs_destroy_cachep(void)
+void __cold btrfs_destroy_cachep(void)
 {
        /*
         * Make sure all delayed rcu free inodes are flushed before we
index 3278ae592a2c55f7620ac0de368c0fb4ebcad5a4..b2db3988813f8537d1af50cfe9f72e064cdde48a 100644 (file)
@@ -106,7 +106,7 @@ static int btrfs_clone(struct inode *src, struct inode *inode,
                       int no_time_update);
 
 /* Mask out flags that are inappropriate for the given type of inode. */
-static inline __u32 btrfs_mask_flags(umode_t mode, __u32 flags)
+static unsigned int btrfs_mask_flags(umode_t mode, unsigned int flags)
 {
        if (S_ISDIR(mode))
                return flags;
@@ -1197,7 +1197,7 @@ again:
                spin_unlock(&BTRFS_I(inode)->lock);
                btrfs_delalloc_release_space(inode, data_reserved,
                                start_index << PAGE_SHIFT,
-                               (page_cnt - i_done) << PAGE_SHIFT);
+                               (page_cnt - i_done) << PAGE_SHIFT, true);
        }
 
 
@@ -1215,7 +1215,8 @@ again:
                unlock_page(pages[i]);
                put_page(pages[i]);
        }
-       btrfs_delalloc_release_extents(BTRFS_I(inode), page_cnt << PAGE_SHIFT);
+       btrfs_delalloc_release_extents(BTRFS_I(inode), page_cnt << PAGE_SHIFT,
+                                      false);
        extent_changeset_free(data_reserved);
        return i_done;
 out:
@@ -1225,8 +1226,9 @@ out:
        }
        btrfs_delalloc_release_space(inode, data_reserved,
                        start_index << PAGE_SHIFT,
-                       page_cnt << PAGE_SHIFT);
-       btrfs_delalloc_release_extents(BTRFS_I(inode), page_cnt << PAGE_SHIFT);
+                       page_cnt << PAGE_SHIFT, true);
+       btrfs_delalloc_release_extents(BTRFS_I(inode), page_cnt << PAGE_SHIFT,
+                                      true);
        extent_changeset_free(data_reserved);
        return ret;
 
@@ -2600,7 +2602,7 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp)
                        range->len = (u64)-1;
                }
                ret = btrfs_defrag_file(file_inode(file), file,
-                                       range, 0, 0);
+                                       range, BTRFS_OLDEST_GENERATION, 0);
                if (ret > 0)
                        ret = 0;
                kfree(range);
@@ -3936,73 +3938,6 @@ int btrfs_clone_file_range(struct file *src_file, loff_t off,
        return btrfs_clone_files(dst_file, src_file, off, len, destoff);
 }
 
-/*
- * there are many ways the trans_start and trans_end ioctls can lead
- * to deadlocks.  They should only be used by applications that
- * basically own the machine, and have a very in depth understanding
- * of all the possible deadlocks and enospc problems.
- */
-static long btrfs_ioctl_trans_start(struct file *file)
-{
-       struct inode *inode = file_inode(file);
-       struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
-       struct btrfs_root *root = BTRFS_I(inode)->root;
-       struct btrfs_trans_handle *trans;
-       struct btrfs_file_private *private;
-       int ret;
-       static bool warned = false;
-
-       ret = -EPERM;
-       if (!capable(CAP_SYS_ADMIN))
-               goto out;
-
-       if (!warned) {
-               btrfs_warn(fs_info,
-                       "Userspace transaction mechanism is considered "
-                       "deprecated and slated to be removed in 4.17. "
-                       "If you have a valid use case please "
-                       "speak up on the mailing list");
-               WARN_ON(1);
-               warned = true;
-       }
-
-       ret = -EINPROGRESS;
-       private = file->private_data;
-       if (private && private->trans)
-               goto out;
-       if (!private) {
-               private = kzalloc(sizeof(struct btrfs_file_private),
-                                 GFP_KERNEL);
-               if (!private)
-                       return -ENOMEM;
-               file->private_data = private;
-       }
-
-       ret = -EROFS;
-       if (btrfs_root_readonly(root))
-               goto out;
-
-       ret = mnt_want_write_file(file);
-       if (ret)
-               goto out;
-
-       atomic_inc(&fs_info->open_ioctl_trans);
-
-       ret = -ENOMEM;
-       trans = btrfs_start_ioctl_transaction(root);
-       if (IS_ERR(trans))
-               goto out_drop;
-
-       private->trans = trans;
-       return 0;
-
-out_drop:
-       atomic_dec(&fs_info->open_ioctl_trans);
-       mnt_drop_write_file(file);
-out:
-       return ret;
-}
-
 static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
 {
        struct inode *inode = file_inode(file);
@@ -4244,30 +4179,6 @@ out:
        return ret;
 }
 
-/*
- * there are many ways the trans_start and trans_end ioctls can lead
- * to deadlocks.  They should only be used by applications that
- * basically own the machine, and have a very in depth understanding
- * of all the possible deadlocks and enospc problems.
- */
-long btrfs_ioctl_trans_end(struct file *file)
-{
-       struct inode *inode = file_inode(file);
-       struct btrfs_root *root = BTRFS_I(inode)->root;
-       struct btrfs_file_private *private = file->private_data;
-
-       if (!private || !private->trans)
-               return -EINVAL;
-
-       btrfs_end_transaction(private->trans);
-       private->trans = NULL;
-
-       atomic_dec(&root->fs_info->open_ioctl_trans);
-
-       mnt_drop_write_file(file);
-       return 0;
-}
-
 static noinline long btrfs_ioctl_start_sync(struct btrfs_root *root,
                                            void __user *argp)
 {
@@ -4429,7 +4340,8 @@ static long btrfs_ioctl_dev_replace(struct btrfs_fs_info *fs_info,
                ret = 0;
                break;
        case BTRFS_IOCTL_DEV_REPLACE_CMD_CANCEL:
-               ret = btrfs_dev_replace_cancel(fs_info, p);
+               p->result = btrfs_dev_replace_cancel(fs_info);
+               ret = 0;
                break;
        default:
                ret = -EINVAL;
@@ -5138,10 +5050,17 @@ static long _btrfs_ioctl_set_received_subvol(struct file *file,
        received_uuid_changed = memcmp(root_item->received_uuid, sa->uuid,
                                       BTRFS_UUID_SIZE);
        if (received_uuid_changed &&
-           !btrfs_is_empty_uuid(root_item->received_uuid))
-               btrfs_uuid_tree_rem(trans, fs_info, root_item->received_uuid,
-                                   BTRFS_UUID_KEY_RECEIVED_SUBVOL,
-                                   root->root_key.objectid);
+           !btrfs_is_empty_uuid(root_item->received_uuid)) {
+               ret = btrfs_uuid_tree_rem(trans, fs_info,
+                                         root_item->received_uuid,
+                                         BTRFS_UUID_KEY_RECEIVED_SUBVOL,
+                                         root->root_key.objectid);
+               if (ret && ret != -ENOENT) {
+                       btrfs_abort_transaction(trans, ret);
+                       btrfs_end_transaction(trans);
+                       goto out;
+               }
+       }
        memcpy(root_item->received_uuid, sa->uuid, BTRFS_UUID_SIZE);
        btrfs_set_root_stransid(root_item, sa->stransid);
        btrfs_set_root_rtransid(root_item, sa->rtransid);
@@ -5574,10 +5493,6 @@ long btrfs_ioctl(struct file *file, unsigned int
                return btrfs_ioctl_dev_info(fs_info, argp);
        case BTRFS_IOC_BALANCE:
                return btrfs_ioctl_balance(file, NULL);
-       case BTRFS_IOC_TRANS_START:
-               return btrfs_ioctl_trans_start(file);
-       case BTRFS_IOC_TRANS_END:
-               return btrfs_ioctl_trans_end(file);
        case BTRFS_IOC_TREE_SEARCH:
                return btrfs_ioctl_tree_search(file, argp);
        case BTRFS_IOC_TREE_SEARCH_V2:
index d13128c70dddc2b440f29f167efe0862d8792e2e..621083f8932c7e5b61d5d9734bd187e73abd1e69 100644 (file)
@@ -290,7 +290,7 @@ void btrfs_tree_unlock(struct extent_buffer *eb)
                /*
                 * Make sure counter is updated before we wake up waiters.
                 */
-               smp_mb();
+               smp_mb__after_atomic();
                if (waitqueue_active(&eb->write_lock_wq))
                        wake_up(&eb->write_lock_wq);
        } else {
index 6c7f18cd3b61221b083e24969df9e479f5c655d3..1c7f7f70caf4926319597e94e803611a280f08fe 100644 (file)
@@ -382,14 +382,12 @@ static int lzo_decompress(struct list_head *ws, unsigned char *data_in,
        struct workspace *workspace = list_entry(ws, struct workspace, list);
        size_t in_len;
        size_t out_len;
-       size_t tot_len;
        int ret = 0;
        char *kaddr;
        unsigned long bytes;
 
        BUG_ON(srclen < LZO_LEN);
 
-       tot_len = read_compress_length(data_in);
        data_in += LZO_LEN;
 
        in_len = read_compress_length(data_in);
index 5b311aeddcc80d5b58b9c337c3e398483999deb2..661cc3db0c7c023610f0bf57ee92e236e5b493b1 100644 (file)
@@ -610,7 +610,7 @@ void btrfs_remove_ordered_extent(struct inode *inode,
        btrfs_mod_outstanding_extents(btrfs_inode, -1);
        spin_unlock(&btrfs_inode->lock);
        if (root != fs_info->tree_root)
-               btrfs_delalloc_release_metadata(btrfs_inode, entry->len);
+               btrfs_delalloc_release_metadata(btrfs_inode, entry->len, false);
 
        tree = &btrfs_inode->ordered_tree;
        spin_lock_irq(&tree->lock);
@@ -1154,7 +1154,7 @@ int __init ordered_data_init(void)
        return 0;
 }
 
-void ordered_data_exit(void)
+void __cold ordered_data_exit(void)
 {
        kmem_cache_destroy(btrfs_ordered_extent_cache);
 }
index 56c4c0ee6381f2a271216adbc7cbe278d48b98c0..4a1672a13ba6c53a6187ad378056f53ad42a16c7 100644 (file)
@@ -151,7 +151,9 @@ static inline int btrfs_ordered_sum_size(struct btrfs_fs_info *fs_info,
                                         unsigned long bytes)
 {
        int num_sectors = (int)DIV_ROUND_UP(bytes, fs_info->sectorsize);
-       return sizeof(struct btrfs_ordered_sum) + num_sectors * sizeof(u32);
+       int csum_size = btrfs_super_csum_size(fs_info->super_copy);
+
+       return sizeof(struct btrfs_ordered_sum) + num_sectors * csum_size;
 }
 
 static inline void
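A worked instance of the size computation above, assuming crc32c (4-byte checksums, the only type at this point): for a 16 KiB ordered extent with a 4 KiB sectorsize, num_sectors = DIV_ROUND_UP(16384, 4096) = 4, so the allocation is sizeof(struct btrfs_ordered_sum) + 4 * 4 bytes. The old num_sectors * sizeof(u32) form produced the same number only because crc32c checksums happen to be four bytes wide; reading csum_size from the superblock keeps the computation correct if a wider checksum type is ever added.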
@@ -215,5 +217,5 @@ void btrfs_wait_logged_extents(struct btrfs_trans_handle *trans,
                               struct btrfs_root *log, u64 transid);
 void btrfs_free_logged_extents(struct btrfs_root *log, u64 transid);
 int __init ordered_data_init(void);
-void ordered_data_exit(void);
+void __cold ordered_data_exit(void);
 #endif
index 569205e651c7dd46c98a8fe3d0798140322d4ec3..4a8770485f776c59d21ef176f9c46099c19e43e9 100644 (file)
@@ -365,9 +365,13 @@ void btrfs_print_tree(struct extent_buffer *c)
                       btrfs_node_blockptr(c, i));
        }
        for (i = 0; i < nr; i++) {
-               struct extent_buffer *next = read_tree_block(fs_info,
-                                       btrfs_node_blockptr(c, i),
-                                       btrfs_node_ptr_generation(c, i));
+               struct btrfs_key first_key;
+               struct extent_buffer *next;
+
+               btrfs_node_key_to_cpu(c, &first_key, i);
+               next = read_tree_block(fs_info, btrfs_node_blockptr(c, i),
+                                      btrfs_node_ptr_generation(c, i),
+                                      level - 1, &first_key);
                if (IS_ERR(next)) {
                        continue;
                } else if (!extent_buffer_uptodate(next)) {
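Passing the expected level and first key into read_tree_block lets the read path cross-check a child block against what its parent recorded. A hedged sketch of the check (simplified; the real validation lives in the tree-reading code and distinguishes nodes from leaves):

	/* Sketch: reject blocks that don't match the parent's expectations. */
	if (btrfs_header_level(eb) != expected_level)
		return -EUCLEAN;
	if (expected_level)
		btrfs_node_key_to_cpu(eb, &found_key, 0);
	else
		btrfs_item_key_to_cpu(eb, &found_key, 0);
	if (btrfs_comp_cpu_keys(first_key, &found_key))
		return -EUCLEAN;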
index b30a056963ab91a5f110d0a034bd5bd81adeb9f1..5859f7d3cf3e5c28e78bce1dbfea8367f7bb859d 100644 (file)
@@ -19,8 +19,8 @@
 #include <linux/hashtable.h>
 #include "props.h"
 #include "btrfs_inode.h"
-#include "hash.h"
 #include "transaction.h"
+#include "ctree.h"
 #include "xattr.h"
 #include "compression.h"
 
@@ -116,7 +116,7 @@ static int __btrfs_set_prop(struct btrfs_trans_handle *trans,
                return -EINVAL;
 
        if (value_len == 0) {
-               ret = __btrfs_setxattr(trans, inode, handler->xattr_name,
+               ret = btrfs_setxattr(trans, inode, handler->xattr_name,
                                       NULL, 0, flags);
                if (ret)
                        return ret;
@@ -130,13 +130,13 @@ static int __btrfs_set_prop(struct btrfs_trans_handle *trans,
        ret = handler->validate(value, value_len);
        if (ret)
                return ret;
-       ret = __btrfs_setxattr(trans, inode, handler->xattr_name,
+       ret = btrfs_setxattr(trans, inode, handler->xattr_name,
                               value, value_len, flags);
        if (ret)
                return ret;
        ret = handler->apply(inode, value, value_len);
        if (ret) {
-               __btrfs_setxattr(trans, inode, handler->xattr_name,
+               btrfs_setxattr(trans, inode, handler->xattr_name,
                                 NULL, 0, flags);
                return ret;
        }
index aa259d6986e1c24049c8f500184b2393d99b0549..f583f13ff26e9cbf67a57adeb86016950e8714d4 100644 (file)
  *  - check all ioctl parameters
  */
 
+/*
+ * Helpers to access qgroup reservation
+ *
+ * Callers should ensure the lock context and type are valid
+ */
+
+static u64 qgroup_rsv_total(const struct btrfs_qgroup *qgroup)
+{
+       u64 ret = 0;
+       int i;
+
+       for (i = 0; i < BTRFS_QGROUP_RSV_LAST; i++)
+               ret += qgroup->rsv.values[i];
+
+       return ret;
+}
+
+#ifdef CONFIG_BTRFS_DEBUG
+static const char *qgroup_rsv_type_str(enum btrfs_qgroup_rsv_type type)
+{
+       if (type == BTRFS_QGROUP_RSV_DATA)
+               return "data";
+       if (type == BTRFS_QGROUP_RSV_META_PERTRANS)
+               return "meta_pertrans";
+       if (type == BTRFS_QGROUP_RSV_META_PREALLOC)
+               return "meta_prealloc";
+       return NULL;
+}
+#endif
+
+static void qgroup_rsv_add(struct btrfs_fs_info *fs_info,
+                          struct btrfs_qgroup *qgroup, u64 num_bytes,
+                          enum btrfs_qgroup_rsv_type type)
+{
+       trace_qgroup_update_reserve(fs_info, qgroup, num_bytes, type);
+       qgroup->rsv.values[type] += num_bytes;
+}
+
+static void qgroup_rsv_release(struct btrfs_fs_info *fs_info,
+                              struct btrfs_qgroup *qgroup, u64 num_bytes,
+                              enum btrfs_qgroup_rsv_type type)
+{
+       trace_qgroup_update_reserve(fs_info, qgroup, -(s64)num_bytes, type);
+       if (qgroup->rsv.values[type] >= num_bytes) {
+               qgroup->rsv.values[type] -= num_bytes;
+               return;
+       }
+#ifdef CONFIG_BTRFS_DEBUG
+       WARN_RATELIMIT(1,
+               "qgroup %llu %s reserved space underflow, have %llu to free %llu",
+               qgroup->qgroupid, qgroup_rsv_type_str(type),
+               qgroup->rsv.values[type], num_bytes);
+#endif
+       qgroup->rsv.values[type] = 0;
+}
+
+static void qgroup_rsv_add_by_qgroup(struct btrfs_fs_info *fs_info,
+                                    struct btrfs_qgroup *dest,
+                                    struct btrfs_qgroup *src)
+{
+       int i;
+
+       for (i = 0; i < BTRFS_QGROUP_RSV_LAST; i++)
+               qgroup_rsv_add(fs_info, dest, src->rsv.values[i], i);
+}
+
+static void qgroup_rsv_release_by_qgroup(struct btrfs_fs_info *fs_info,
+                                        struct btrfs_qgroup *dest,
+                                         struct btrfs_qgroup *src)
+{
+       int i;
+
+       for (i = 0; i < BTRFS_QGROUP_RSV_LAST; i++)
+               qgroup_rsv_release(fs_info, dest, src->rsv.values[i], i);
+}
+
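A short usage sketch for the typed-reservation helpers above (values illustrative; per the comment, callers must hold the qgroup lock and pass a valid type):

	spin_lock(&fs_info->qgroup_lock);
	/* Account 1 MiB of reserved metadata of the prealloc flavour. */
	qgroup_rsv_add(fs_info, qg, SZ_1M, BTRFS_QGROUP_RSV_META_PREALLOC);
	/* ... and release it later; underflow is clamped to 0 and warned. */
	qgroup_rsv_release(fs_info, qg, SZ_1M, BTRFS_QGROUP_RSV_META_PREALLOC);
	/* Limit checks use the sum over all types. */
	total = qgroup_rsv_total(qg);
	spin_unlock(&fs_info->qgroup_lock);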
 static void btrfs_qgroup_update_old_refcnt(struct btrfs_qgroup *qg, u64 seq,
                                           int mod)
 {
@@ -826,10 +902,8 @@ int btrfs_quota_enable(struct btrfs_trans_handle *trans,
        int slot;
 
        mutex_lock(&fs_info->qgroup_ioctl_lock);
-       if (fs_info->quota_root) {
-               set_bit(BTRFS_FS_QUOTA_ENABLING, &fs_info->flags);
+       if (fs_info->quota_root)
                goto out;
-       }
 
        fs_info->qgroup_ulist = ulist_alloc(GFP_KERNEL);
        if (!fs_info->qgroup_ulist) {
@@ -923,8 +997,15 @@ out_add_root:
        }
        spin_lock(&fs_info->qgroup_lock);
        fs_info->quota_root = quota_root;
-       set_bit(BTRFS_FS_QUOTA_ENABLING, &fs_info->flags);
+       set_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
        spin_unlock(&fs_info->qgroup_lock);
+       ret = qgroup_rescan_init(fs_info, 0, 1);
+       if (!ret) {
+               qgroup_rescan_zero_tracking(fs_info);
+               btrfs_queue_work(fs_info->qgroup_rescan_workers,
+                                &fs_info->qgroup_rescan_work);
+       }
+
 out_free_path:
        btrfs_free_path(path);
 out_free_root:
@@ -991,33 +1072,29 @@ static void qgroup_dirty(struct btrfs_fs_info *fs_info,
                list_add(&qgroup->dirty, &fs_info->dirty_qgroups);
 }
 
-static void report_reserved_underflow(struct btrfs_fs_info *fs_info,
-                                     struct btrfs_qgroup *qgroup,
-                                     u64 num_bytes)
-{
-#ifdef CONFIG_BTRFS_DEBUG
-       WARN_ON(qgroup->reserved < num_bytes);
-       btrfs_debug(fs_info,
-               "qgroup %llu reserved space underflow, have: %llu, to free: %llu",
-               qgroup->qgroupid, qgroup->reserved, num_bytes);
-#endif
-       qgroup->reserved = 0;
-}
 /*
- * The easy accounting, if we are adding/removing the only ref for an extent
- * then this qgroup and all of the parent qgroups get their reference and
- * exclusive counts adjusted.
+ * The easy accounting: we're updating a qgroup relationship whose child
+ * qgroup only has exclusive extents.
+ *
+ * In this case, all exclusive extents will also be exclusive for the parent,
+ * so excl/rfer just get added/removed.
+ *
+ * The same applies to qgroup reservation space, which should also be
+ * added/removed for the parent.
+ * Otherwise, when the child later releases reservation space, the parent
+ * would underflow its reservation (for the relationship-adding case).
  *
  * Caller should hold fs_info->qgroup_lock.
  */
 static int __qgroup_excl_accounting(struct btrfs_fs_info *fs_info,
                                    struct ulist *tmp, u64 ref_root,
-                                   u64 num_bytes, int sign)
+                                   struct btrfs_qgroup *src, int sign)
 {
        struct btrfs_qgroup *qgroup;
        struct btrfs_qgroup_list *glist;
        struct ulist_node *unode;
        struct ulist_iterator uiter;
+       u64 num_bytes = src->excl;
        int ret = 0;
 
        qgroup = find_qgroup_rb(fs_info, ref_root);
@@ -1030,13 +1107,11 @@ static int __qgroup_excl_accounting(struct btrfs_fs_info *fs_info,
        WARN_ON(sign < 0 && qgroup->excl < num_bytes);
        qgroup->excl += sign * num_bytes;
        qgroup->excl_cmpr += sign * num_bytes;
-       if (sign > 0) {
-               trace_qgroup_update_reserve(fs_info, qgroup, -(s64)num_bytes);
-               if (qgroup->reserved < num_bytes)
-                       report_reserved_underflow(fs_info, qgroup, num_bytes);
-               else
-                       qgroup->reserved -= num_bytes;
-       }
+
+       if (sign > 0)
+               qgroup_rsv_add_by_qgroup(fs_info, qgroup, src);
+       else
+               qgroup_rsv_release_by_qgroup(fs_info, qgroup, src);
 
        qgroup_dirty(fs_info, qgroup);
 
@@ -1056,15 +1131,10 @@ static int __qgroup_excl_accounting(struct btrfs_fs_info *fs_info,
                qgroup->rfer_cmpr += sign * num_bytes;
                WARN_ON(sign < 0 && qgroup->excl < num_bytes);
                qgroup->excl += sign * num_bytes;
-               if (sign > 0) {
-                       trace_qgroup_update_reserve(fs_info, qgroup,
-                                                   -(s64)num_bytes);
-                       if (qgroup->reserved < num_bytes)
-                               report_reserved_underflow(fs_info, qgroup,
-                                                         num_bytes);
-                       else
-                               qgroup->reserved -= num_bytes;
-               }
+               if (sign > 0)
+                       qgroup_rsv_add_by_qgroup(fs_info, qgroup, src);
+               else
+                       qgroup_rsv_release_by_qgroup(fs_info, qgroup, src);
                qgroup->excl_cmpr += sign * num_bytes;
                qgroup_dirty(fs_info, qgroup);
 
@@ -1107,7 +1177,7 @@ static int quick_update_accounting(struct btrfs_fs_info *fs_info,
        if (qgroup->excl == qgroup->rfer) {
                ret = 0;
                err = __qgroup_excl_accounting(fs_info, tmp, dst,
-                                              qgroup->excl, sign);
+                                              qgroup, sign);
                if (err < 0) {
                        ret = err;
                        goto out;
@@ -1414,7 +1484,7 @@ int btrfs_qgroup_trace_extent_nolock(struct btrfs_fs_info *fs_info,
        struct btrfs_qgroup_extent_record *entry;
        u64 bytenr = record->bytenr;
 
-       assert_spin_locked(&delayed_refs->lock);
+       lockdep_assert_held(&delayed_refs->lock);
        trace_btrfs_qgroup_trace_extent(fs_info, record);
 
        while (*p) {
@@ -1614,7 +1684,7 @@ int btrfs_qgroup_trace_subtree(struct btrfs_trans_handle *trans,
                return 0;
 
        if (!extent_buffer_uptodate(root_eb)) {
-               ret = btrfs_read_buffer(root_eb, root_gen);
+               ret = btrfs_read_buffer(root_eb, root_gen, root_level, NULL);
                if (ret)
                        goto out;
        }
@@ -1645,6 +1715,7 @@ walk_down:
        level = root_level;
        while (level >= 0) {
                if (path->nodes[level] == NULL) {
+                       struct btrfs_key first_key;
                        int parent_slot;
                        u64 child_gen;
                        u64 child_bytenr;
@@ -1657,8 +1728,10 @@ walk_down:
                        parent_slot = path->slots[level + 1];
                        child_bytenr = btrfs_node_blockptr(eb, parent_slot);
                        child_gen = btrfs_node_ptr_generation(eb, parent_slot);
+                       btrfs_node_key_to_cpu(eb, &first_key, parent_slot);
 
-                       eb = read_tree_block(fs_info, child_bytenr, child_gen);
+                       eb = read_tree_block(fs_info, child_bytenr, child_gen,
+                                            level, &first_key);
                        if (IS_ERR(eb)) {
                                ret = PTR_ERR(eb);
                                goto out;
@@ -2009,9 +2082,9 @@ out_free:
        return ret;
 }
 
-int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans,
-                                struct btrfs_fs_info *fs_info)
+int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans)
 {
+       struct btrfs_fs_info *fs_info = trans->fs_info;
        struct btrfs_qgroup_extent_record *record;
        struct btrfs_delayed_ref_root *delayed_refs;
        struct ulist *new_roots = NULL;
@@ -2080,17 +2153,9 @@ int btrfs_run_qgroups(struct btrfs_trans_handle *trans,
 {
        struct btrfs_root *quota_root = fs_info->quota_root;
        int ret = 0;
-       int start_rescan_worker = 0;
 
        if (!quota_root)
-               goto out;
-
-       if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) &&
-           test_bit(BTRFS_FS_QUOTA_ENABLING, &fs_info->flags))
-               start_rescan_worker = 1;
-
-       if (test_and_clear_bit(BTRFS_FS_QUOTA_ENABLING, &fs_info->flags))
-               set_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
+               return ret;
 
        spin_lock(&fs_info->qgroup_lock);
        while (!list_empty(&fs_info->dirty_qgroups)) {
@@ -2119,18 +2184,6 @@ int btrfs_run_qgroups(struct btrfs_trans_handle *trans,
        if (ret)
                fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
 
-       if (!ret && start_rescan_worker) {
-               ret = qgroup_rescan_init(fs_info, 0, 1);
-               if (!ret) {
-                       qgroup_rescan_zero_tracking(fs_info);
-                       btrfs_queue_work(fs_info->qgroup_rescan_workers,
-                                        &fs_info->qgroup_rescan_work);
-               }
-               ret = 0;
-       }
-
-out:
-
        return ret;
 }
 
@@ -2338,24 +2391,24 @@ out:
 static bool qgroup_check_limits(const struct btrfs_qgroup *qg, u64 num_bytes)
 {
        if ((qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_RFER) &&
-           qg->reserved + (s64)qg->rfer + num_bytes > qg->max_rfer)
+           qgroup_rsv_total(qg) + (s64)qg->rfer + num_bytes > qg->max_rfer)
                return false;
 
        if ((qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_EXCL) &&
-           qg->reserved + (s64)qg->excl + num_bytes > qg->max_excl)
+           qgroup_rsv_total(qg) + (s64)qg->excl + num_bytes > qg->max_excl)
                return false;
 
        return true;
 }
 
-static int qgroup_reserve(struct btrfs_root *root, u64 num_bytes, bool enforce)
+static int qgroup_reserve(struct btrfs_root *root, u64 num_bytes, bool enforce,
+                         enum btrfs_qgroup_rsv_type type)
 {
        struct btrfs_root *quota_root;
        struct btrfs_qgroup *qgroup;
        struct btrfs_fs_info *fs_info = root->fs_info;
        u64 ref_root = root->root_key.objectid;
        int ret = 0;
-       int retried = 0;
        struct ulist_node *unode;
        struct ulist_iterator uiter;
 
@@ -2369,7 +2422,6 @@ static int qgroup_reserve(struct btrfs_root *root, u64 num_bytes, bool enforce)
            capable(CAP_SYS_RESOURCE))
                enforce = false;
 
-retry:
        spin_lock(&fs_info->qgroup_lock);
        quota_root = fs_info->quota_root;
        if (!quota_root)
@@ -2385,7 +2437,7 @@ retry:
         */
        ulist_reinit(fs_info->qgroup_ulist);
        ret = ulist_add(fs_info->qgroup_ulist, qgroup->qgroupid,
-                       (uintptr_t)qgroup, GFP_ATOMIC);
+                       qgroup_to_aux(qgroup), GFP_ATOMIC);
        if (ret < 0)
                goto out;
        ULIST_ITER_INIT(&uiter);
@@ -2396,27 +2448,6 @@ retry:
                qg = unode_aux_to_qgroup(unode);
 
                if (enforce && !qgroup_check_limits(qg, num_bytes)) {
-                       /*
-                        * Commit the tree and retry, since we may have
-                        * deletions which would free up space.
-                        */
-                       if (!retried && qg->reserved > 0) {
-                               struct btrfs_trans_handle *trans;
-
-                               spin_unlock(&fs_info->qgroup_lock);
-                               ret = btrfs_start_delalloc_inodes(root, 0);
-                               if (ret)
-                                       return ret;
-                               btrfs_wait_ordered_extents(root, U64_MAX, 0, (u64)-1);
-                               trans = btrfs_join_transaction(root);
-                               if (IS_ERR(trans))
-                                       return PTR_ERR(trans);
-                               ret = btrfs_commit_transaction(trans);
-                               if (ret)
-                                       return ret;
-                               retried++;
-                               goto retry;
-                       }
                        ret = -EDQUOT;
                        goto out;
                }
@@ -2424,7 +2455,7 @@ retry:
                list_for_each_entry(glist, &qg->groups, next_group) {
                        ret = ulist_add(fs_info->qgroup_ulist,
                                        glist->group->qgroupid,
-                                       (uintptr_t)glist->group, GFP_ATOMIC);
+                                       qgroup_to_aux(glist->group), GFP_ATOMIC);
                        if (ret < 0)
                                goto out;
                }
@@ -2439,8 +2470,8 @@ retry:
 
                qg = unode_aux_to_qgroup(unode);
 
-               trace_qgroup_update_reserve(fs_info, qg, num_bytes);
-               qg->reserved += num_bytes;
+               trace_qgroup_update_reserve(fs_info, qg, num_bytes, type);
+               qgroup_rsv_add(fs_info, qg, num_bytes, type);
        }
 
 out:
@@ -2448,8 +2479,18 @@ out:
        return ret;
 }
 
+/*
+ * Free @num_bytes of reserved space of @type for the qgroup (normally the
+ * level 0 qgroup).
+ *
+ * Will also handle all higher level qgroups.
+ *
+ * NOTE: If @num_bytes is (u64)-1, this means to free all the reserved bytes
+ * of this qgroup.  This special case is only used for the META_PERTRANS type.
+ */
 void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info,
-                              u64 ref_root, u64 num_bytes)
+                              u64 ref_root, u64 num_bytes,
+                              enum btrfs_qgroup_rsv_type type)
 {
        struct btrfs_root *quota_root;
        struct btrfs_qgroup *qgroup;
@@ -2463,6 +2504,10 @@ void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info,
        if (num_bytes == 0)
                return;
 
+       if (num_bytes == (u64)-1 && type != BTRFS_QGROUP_RSV_META_PERTRANS) {
+               WARN(1, "%s: Invalid type to free", __func__);
+               return;
+       }
        spin_lock(&fs_info->qgroup_lock);
 
        quota_root = fs_info->quota_root;
@@ -2473,9 +2518,16 @@ void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info,
        if (!qgroup)
                goto out;
 
+       if (num_bytes == (u64)-1)
+               /*
+                * We're freeing all pertrans rsv; use the reserved value of
+                * the level 0 qgroup as the real num_bytes to free.
+                */
+               num_bytes = qgroup->rsv.values[type];
+
        ulist_reinit(fs_info->qgroup_ulist);
        ret = ulist_add(fs_info->qgroup_ulist, qgroup->qgroupid,
-                       (uintptr_t)qgroup, GFP_ATOMIC);
+                       qgroup_to_aux(qgroup), GFP_ATOMIC);
        if (ret < 0)
                goto out;
        ULIST_ITER_INIT(&uiter);
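For reference, the (u64)-1 form documented above is exercised later in this patch by the per-transaction flush path:

	btrfs_qgroup_free_refroot(fs_info, root->objectid, (u64)-1,
				  BTRFS_QGROUP_RSV_META_PERTRANS);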
@@ -2485,16 +2537,13 @@ void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info,
 
                qg = unode_aux_to_qgroup(unode);
 
-               trace_qgroup_update_reserve(fs_info, qg, -(s64)num_bytes);
-               if (qg->reserved < num_bytes)
-                       report_reserved_underflow(fs_info, qg, num_bytes);
-               else
-                       qg->reserved -= num_bytes;
+               trace_qgroup_update_reserve(fs_info, qg, -(s64)num_bytes, type);
+               qgroup_rsv_release(fs_info, qg, num_bytes, type);
 
                list_for_each_entry(glist, &qg->groups, next_group) {
                        ret = ulist_add(fs_info->qgroup_ulist,
                                        glist->group->qgroupid,
-                                       (uintptr_t)glist->group, GFP_ATOMIC);
+                                       qgroup_to_aux(glist->group), GFP_ATOMIC);
                        if (ret < 0)
                                goto out;
                }
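qgroup_rsv_release() replaces the open-coded underflow check that used to call report_reserved_underflow(). A sketch of the clamping it presumably performs, assuming the same warn-and-clamp policy as the old code:

static void qgroup_rsv_release(struct btrfs_fs_info *fs_info,
			       struct btrfs_qgroup *qgroup, u64 num_bytes,
			       enum btrfs_qgroup_rsv_type type)
{
	if (qgroup->rsv.values[type] < num_bytes) {
		/* Underflow: warn and clamp to zero instead of wrapping */
		WARN_ON(1);
		qgroup->rsv.values[type] = 0;
		return;
	}
	qgroup->rsv.values[type] -= num_bytes;
}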
@@ -2877,7 +2926,7 @@ int btrfs_qgroup_reserve_data(struct inode *inode,
                                        to_reserve, QGROUP_RESERVE);
        if (ret < 0)
                goto cleanup;
-       ret = qgroup_reserve(root, to_reserve, true);
+       ret = qgroup_reserve(root, to_reserve, true, BTRFS_QGROUP_RSV_DATA);
        if (ret < 0)
                goto cleanup;
 
@@ -2940,7 +2989,8 @@ static int qgroup_free_reserved_data(struct inode *inode,
                        goto out;
                freed += changeset.bytes_changed;
        }
-       btrfs_qgroup_free_refroot(root->fs_info, root->objectid, freed);
+       btrfs_qgroup_free_refroot(root->fs_info, root->objectid, freed,
+                                 BTRFS_QGROUP_RSV_DATA);
        ret = freed;
 out:
        extent_changeset_release(&changeset);
@@ -2972,7 +3022,7 @@ static int __btrfs_qgroup_release_data(struct inode *inode,
        if (free)
                btrfs_qgroup_free_refroot(BTRFS_I(inode)->root->fs_info,
                                BTRFS_I(inode)->root->objectid,
-                               changeset.bytes_changed);
+                               changeset.bytes_changed, BTRFS_QGROUP_RSV_DATA);
        ret = changeset.bytes_changed;
 out:
        extent_changeset_release(&changeset);
@@ -3017,8 +3067,48 @@ int btrfs_qgroup_release_data(struct inode *inode, u64 start, u64 len)
        return __btrfs_qgroup_release_data(inode, NULL, start, len, 0);
 }
 
-int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes,
-                             bool enforce)
+static void add_root_meta_rsv(struct btrfs_root *root, int num_bytes,
+                             enum btrfs_qgroup_rsv_type type)
+{
+       if (type != BTRFS_QGROUP_RSV_META_PREALLOC &&
+           type != BTRFS_QGROUP_RSV_META_PERTRANS)
+               return;
+       if (num_bytes == 0)
+               return;
+
+       spin_lock(&root->qgroup_meta_rsv_lock);
+       if (type == BTRFS_QGROUP_RSV_META_PREALLOC)
+               root->qgroup_meta_rsv_prealloc += num_bytes;
+       else
+               root->qgroup_meta_rsv_pertrans += num_bytes;
+       spin_unlock(&root->qgroup_meta_rsv_lock);
+}
+
+static int sub_root_meta_rsv(struct btrfs_root *root, int num_bytes,
+                            enum btrfs_qgroup_rsv_type type)
+{
+       if (type != BTRFS_QGROUP_RSV_META_PREALLOC &&
+           type != BTRFS_QGROUP_RSV_META_PERTRANS)
+               return 0;
+       if (num_bytes == 0)
+               return 0;
+
+       spin_lock(&root->qgroup_meta_rsv_lock);
+       if (type == BTRFS_QGROUP_RSV_META_PREALLOC) {
+               num_bytes = min_t(u64, root->qgroup_meta_rsv_prealloc,
+                                 num_bytes);
+               root->qgroup_meta_rsv_prealloc -= num_bytes;
+       } else {
+               num_bytes = min_t(u64, root->qgroup_meta_rsv_pertrans,
+                                 num_bytes);
+               root->qgroup_meta_rsv_pertrans -= num_bytes;
+       }
+       spin_unlock(&root->qgroup_meta_rsv_lock);
+       return num_bytes;
+}
+
+int __btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes,
+                               enum btrfs_qgroup_rsv_type type, bool enforce)
 {
        struct btrfs_fs_info *fs_info = root->fs_info;
        int ret;
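Because of the min_t() clamp in sub_root_meta_rsv() above, a caller can never release more than what was recorded for the root. A hypothetical snippet to illustrate (the function name is made up; SZ_64K comes from linux/sizes.h):

static int demo_clamped_release(struct btrfs_root *root)
{
	/*
	 * If only 16K of prealloc was recorded for this root, this
	 * returns 16K, not 64K.
	 */
	return sub_root_meta_rsv(root, SZ_64K, BTRFS_QGROUP_RSV_META_PREALLOC);
}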
@@ -3028,31 +3118,39 @@ int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes,
                return 0;
 
        BUG_ON(num_bytes != round_down(num_bytes, fs_info->nodesize));
-       trace_qgroup_meta_reserve(root, (s64)num_bytes);
-       ret = qgroup_reserve(root, num_bytes, enforce);
+       trace_qgroup_meta_reserve(root, type, (s64)num_bytes);
+       ret = qgroup_reserve(root, num_bytes, enforce, type);
        if (ret < 0)
                return ret;
-       atomic64_add(num_bytes, &root->qgroup_meta_rsv);
+       /*
+        * Record what we have reserved into the root.
+        *
+        * This avoids underflow across a quota disabled->enabled transition:
+        * in that case we may try to free space we never actually reserved
+        * (because quota was disabled at reserve time), so later releases
+        * are capped at the amount recorded here.
+        */
+       add_root_meta_rsv(root, num_bytes, type);
        return ret;
 }
 
-void btrfs_qgroup_free_meta_all(struct btrfs_root *root)
+void btrfs_qgroup_free_meta_all_pertrans(struct btrfs_root *root)
 {
        struct btrfs_fs_info *fs_info = root->fs_info;
-       u64 reserved;
 
        if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) ||
            !is_fstree(root->objectid))
                return;
 
-       reserved = atomic64_xchg(&root->qgroup_meta_rsv, 0);
-       if (reserved == 0)
-               return;
-       trace_qgroup_meta_reserve(root, -(s64)reserved);
-       btrfs_qgroup_free_refroot(fs_info, root->objectid, reserved);
+       /* TODO: Update the tracepoint to handle such a free */
+       trace_qgroup_meta_free_all_pertrans(root);
+       /* Special value -1 means to free all reserved space */
+       btrfs_qgroup_free_refroot(fs_info, root->objectid, (u64)-1,
+                                 BTRFS_QGROUP_RSV_META_PERTRANS);
 }
 
-void btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes)
+void __btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes,
+                             enum btrfs_qgroup_rsv_type type)
 {
        struct btrfs_fs_info *fs_info = root->fs_info;
 
@@ -3060,11 +3158,75 @@ void btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes)
            !is_fstree(root->objectid))
                return;
 
+       /*
+        * A reservation of type META_PREALLOC can happen before quota is
+        * enabled, which can lead to underflow.
+        * Make sure here that we only free what we have really reserved.
+        */
+       num_bytes = sub_root_meta_rsv(root, num_bytes, type);
        BUG_ON(num_bytes != round_down(num_bytes, fs_info->nodesize));
-       WARN_ON(atomic64_read(&root->qgroup_meta_rsv) < num_bytes);
-       atomic64_sub(num_bytes, &root->qgroup_meta_rsv);
-       trace_qgroup_meta_reserve(root, -(s64)num_bytes);
-       btrfs_qgroup_free_refroot(fs_info, root->objectid, num_bytes);
+       trace_qgroup_meta_reserve(root, type, -(s64)num_bytes);
+       btrfs_qgroup_free_refroot(fs_info, root->objectid, num_bytes, type);
+}
+
+static void qgroup_convert_meta(struct btrfs_fs_info *fs_info, u64 ref_root,
+                               int num_bytes)
+{
+       struct btrfs_root *quota_root = fs_info->quota_root;
+       struct btrfs_qgroup *qgroup;
+       struct ulist_node *unode;
+       struct ulist_iterator uiter;
+       int ret = 0;
+
+       if (num_bytes == 0)
+               return;
+       if (!quota_root)
+               return;
+
+       spin_lock(&fs_info->qgroup_lock);
+       qgroup = find_qgroup_rb(fs_info, ref_root);
+       if (!qgroup)
+               goto out;
+       ulist_reinit(fs_info->qgroup_ulist);
+       ret = ulist_add(fs_info->qgroup_ulist, qgroup->qgroupid,
+                      qgroup_to_aux(qgroup), GFP_ATOMIC);
+       if (ret < 0)
+               goto out;
+       ULIST_ITER_INIT(&uiter);
+       while ((unode = ulist_next(fs_info->qgroup_ulist, &uiter))) {
+               struct btrfs_qgroup *qg;
+               struct btrfs_qgroup_list *glist;
+
+               qg = unode_aux_to_qgroup(unode);
+
+               qgroup_rsv_release(fs_info, qg, num_bytes,
+                               BTRFS_QGROUP_RSV_META_PREALLOC);
+               qgroup_rsv_add(fs_info, qg, num_bytes,
+                               BTRFS_QGROUP_RSV_META_PERTRANS);
+               list_for_each_entry(glist, &qg->groups, next_group) {
+                       ret = ulist_add(fs_info->qgroup_ulist,
+                                       glist->group->qgroupid,
+                                       qgroup_to_aux(glist->group), GFP_ATOMIC);
+                       if (ret < 0)
+                               goto out;
+               }
+       }
+out:
+       spin_unlock(&fs_info->qgroup_lock);
+}
+
+void btrfs_qgroup_convert_reserved_meta(struct btrfs_root *root, int num_bytes)
+{
+       struct btrfs_fs_info *fs_info = root->fs_info;
+
+       if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) ||
+           !is_fstree(root->objectid))
+               return;
+       /* Same as btrfs_qgroup_free_meta_prealloc() */
+       num_bytes = sub_root_meta_rsv(root, num_bytes,
+                                     BTRFS_QGROUP_RSV_META_PREALLOC);
+       trace_qgroup_meta_convert(root, num_bytes);
+       qgroup_convert_meta(fs_info, root->objectid, num_bytes);
 }
 
 /*
@@ -3092,7 +3254,7 @@ void btrfs_qgroup_check_reserved_leak(struct inode *inode)
                }
                btrfs_qgroup_free_refroot(BTRFS_I(inode)->root->fs_info,
                                BTRFS_I(inode)->root->objectid,
-                               changeset.bytes_changed);
+                               changeset.bytes_changed, BTRFS_QGROUP_RSV_DATA);
 
        }
        extent_changeset_release(&changeset);
index d9984e87cddfcba3066e6755d3d6a72c96bc14ac..e63e2d497a8e95d20a721f20913300faf72f7e13 100644 (file)
@@ -61,6 +61,48 @@ struct btrfs_qgroup_extent_record {
        struct ulist *old_roots;
 };
 
+/*
+ * Qgroup reservation types:
+ *
+ * DATA:
+ *     Space reserved for data.
+ *
+ * META_PERTRANS:
+ *     Space reserved for metadata (per-transaction).
+ *     Because qgroup data is only updated at transaction commit time,
+ *     reserved metadata space must be kept until the transaction commits.
+ *     Any metadata reservation that is used in btrfs_start_transaction()
+ *     should be of this type.
+ *
+ * META_PREALLOC:
+ *     There are cases where metadata space is reserved before starting a
+ *     transaction, followed by btrfs_join_transaction() to get a trans
+ *     handle.  Any metadata reserved for such usage should be of this type.
+ *     After joining the transaction, part (or all) of such a reservation
+ *     should be converted into META_PERTRANS.
+ */
+enum btrfs_qgroup_rsv_type {
+       BTRFS_QGROUP_RSV_DATA = 0,
+       BTRFS_QGROUP_RSV_META_PERTRANS,
+       BTRFS_QGROUP_RSV_META_PREALLOC,
+       BTRFS_QGROUP_RSV_LAST,
+};
+
+/*
+ * Represents how many bytes we have reserved for this qgroup.
+ *
+ * Each type should have different reservation behavior.
+ * E.g. data follows its io_tree flag modification, while *currently*
+ * metadata is just reserve-and-clear during a transaction.
+ *
+ * TODO: Add a new type for reservations which can survive a transaction
+ * commit.  The current metadata reservation behavior is not suitable for
+ * such a case.
+ */
+struct btrfs_qgroup_rsv {
+       u64 values[BTRFS_QGROUP_RSV_LAST];
+};
+
 /*
  * one struct for each qgroup, organized in fs_info->qgroup_tree.
  */
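Taken together, the intended metadata call pattern is reserve-as-prealloc, join the transaction, then convert. A sketch using the wrappers declared further down in this header (the function is illustrative; error paths simplified):

static int demo_meta_rsv_lifecycle(struct btrfs_root *root, int nr_bytes)
{
	struct btrfs_trans_handle *trans;
	int ret;

	/* Reserve before we own a transaction handle */
	ret = btrfs_qgroup_reserve_meta_prealloc(root, nr_bytes, true);
	if (ret < 0)
		return ret;

	trans = btrfs_join_transaction(root);
	if (IS_ERR(trans)) {
		btrfs_qgroup_free_meta_prealloc(root, nr_bytes);
		return PTR_ERR(trans);
	}

	/* Hand the reservation over to the joined transaction */
	btrfs_qgroup_convert_reserved_meta(root, nr_bytes);
	return btrfs_end_transaction(trans);
}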
@@ -87,7 +129,7 @@ struct btrfs_qgroup {
        /*
         * reservation tracking
         */
-       u64 reserved;
+       struct btrfs_qgroup_rsv rsv;
 
        /*
         * lists
@@ -220,20 +262,21 @@ btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans,
                            struct btrfs_fs_info *fs_info,
                            u64 bytenr, u64 num_bytes,
                            struct ulist *old_roots, struct ulist *new_roots);
-int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans,
-                                struct btrfs_fs_info *fs_info);
+int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans);
 int btrfs_run_qgroups(struct btrfs_trans_handle *trans,
                      struct btrfs_fs_info *fs_info);
 int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans,
                         struct btrfs_fs_info *fs_info, u64 srcid, u64 objectid,
                         struct btrfs_qgroup_inherit *inherit);
 void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info,
-                              u64 ref_root, u64 num_bytes);
+                              u64 ref_root, u64 num_bytes,
+                              enum btrfs_qgroup_rsv_type type);
 static inline void btrfs_qgroup_free_delayed_ref(struct btrfs_fs_info *fs_info,
                                                 u64 ref_root, u64 num_bytes)
 {
        trace_btrfs_qgroup_free_delayed_ref(fs_info, ref_root, num_bytes);
-       btrfs_qgroup_free_refroot(fs_info, ref_root, num_bytes);
+       btrfs_qgroup_free_refroot(fs_info, ref_root, num_bytes,
+                                 BTRFS_QGROUP_RSV_DATA);
 }
 
 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
@@ -248,9 +291,54 @@ int btrfs_qgroup_release_data(struct inode *inode, u64 start, u64 len);
 int btrfs_qgroup_free_data(struct inode *inode,
                        struct extent_changeset *reserved, u64 start, u64 len);
 
-int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes,
-                             bool enforce);
-void btrfs_qgroup_free_meta_all(struct btrfs_root *root);
-void btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes);
+int __btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes,
+                               enum btrfs_qgroup_rsv_type type, bool enforce);
+/* Reserve metadata space for the pertrans and prealloc types */
+static inline int btrfs_qgroup_reserve_meta_pertrans(struct btrfs_root *root,
+                               int num_bytes, bool enforce)
+{
+       return __btrfs_qgroup_reserve_meta(root, num_bytes,
+                       BTRFS_QGROUP_RSV_META_PERTRANS, enforce);
+}
+static inline int btrfs_qgroup_reserve_meta_prealloc(struct btrfs_root *root,
+                               int num_bytes, bool enforce)
+{
+       return __btrfs_qgroup_reserve_meta(root, num_bytes,
+                       BTRFS_QGROUP_RSV_META_PREALLOC, enforce);
+}
+
+void __btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes,
+                            enum btrfs_qgroup_rsv_type type);
+
+/* Free per-transaction meta reservation for error handling */
+static inline void btrfs_qgroup_free_meta_pertrans(struct btrfs_root *root,
+                                                  int num_bytes)
+{
+       __btrfs_qgroup_free_meta(root, num_bytes,
+                       BTRFS_QGROUP_RSV_META_PERTRANS);
+}
+
+/* Pre-allocated meta reservations can be freed as needed */
+static inline void btrfs_qgroup_free_meta_prealloc(struct btrfs_root *root,
+                                                  int num_bytes)
+{
+       __btrfs_qgroup_free_meta(root, num_bytes,
+                       BTRFS_QGROUP_RSV_META_PREALLOC);
+}
+
+/*
+ * Per-transaction meta reservations should all be freed at transaction
+ * commit time.
+ */
+void btrfs_qgroup_free_meta_all_pertrans(struct btrfs_root *root);
+
+/*
+ * Convert @num_bytes of META_PREALLOC reservation to META_PERTRANS.
+ *
+ * This is called when a preallocated meta reservation needs to be used,
+ * normally after a btrfs_join_transaction() call.
+ */
+void btrfs_qgroup_convert_reserved_meta(struct btrfs_root *root, int num_bytes);
+
 void btrfs_qgroup_check_reserved_leak(struct inode *inode);
 #endif /* __BTRFS_QGROUP__ */
index fcfc20de2df395bfd70aa5541ad34689e72f8deb..c3a2bc8af675211d75667e0ca0692c112bc75a7e 100644 (file)
@@ -1987,7 +1987,13 @@ cleanup:
        kfree(pointers);
 
 cleanup_io:
-       if (rbio->operation == BTRFS_RBIO_READ_REBUILD) {
+       /*
+        * Similar to READ_REBUILD, REBUILD_MISSING at this point also has a
+        * valid rbio which is consistent with the on-disk content, so such a
+        * valid rbio can be cached to avoid further disk reads.
+        */
+       if (rbio->operation == BTRFS_RBIO_READ_REBUILD ||
+           rbio->operation == BTRFS_RBIO_REBUILD_MISSING) {
                /*
                 * - In case of two failures, where rbio->failb != -1:
                 *
@@ -2008,8 +2014,6 @@ cleanup_io:
                else
                        clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
 
-               rbio_orig_end_io(rbio, err);
-       } else if (rbio->operation == BTRFS_RBIO_REBUILD_MISSING) {
                rbio_orig_end_io(rbio, err);
        } else if (err == BLK_STS_OK) {
                rbio->faila = -1;
@@ -2768,24 +2772,8 @@ raid56_alloc_missing_rbio(struct btrfs_fs_info *fs_info, struct bio *bio,
        return rbio;
 }
 
-static void missing_raid56_work(struct btrfs_work *work)
-{
-       struct btrfs_raid_bio *rbio;
-
-       rbio = container_of(work, struct btrfs_raid_bio, work);
-       __raid56_parity_recover(rbio);
-}
-
-static void async_missing_raid56(struct btrfs_raid_bio *rbio)
-{
-       btrfs_init_work(&rbio->work, btrfs_rmw_helper,
-                       missing_raid56_work, NULL, NULL);
-
-       btrfs_queue_work(rbio->fs_info->rmw_workers, &rbio->work);
-}
-
 void raid56_submit_missing_rbio(struct btrfs_raid_bio *rbio)
 {
        if (!lock_stripe_add(rbio))
-               async_missing_raid56(rbio);
+               async_read_rebuild(rbio);
 }
index ab852b8e3e3715d263a7e8b8b7ad61151c3cbd90..a52dd12af6480d76b5fd8fec376fe543087d7aa8 100644 (file)
@@ -395,20 +395,20 @@ static struct reada_extent *reada_find_extent(struct btrfs_fs_info *fs_info,
                goto error;
 
        /* insert extent in reada_tree + all per-device trees, all or nothing */
-       btrfs_dev_replace_lock(&fs_info->dev_replace, 0);
+       btrfs_dev_replace_read_lock(&fs_info->dev_replace);
        spin_lock(&fs_info->reada_lock);
        ret = radix_tree_insert(&fs_info->reada_tree, index, re);
        if (ret == -EEXIST) {
                re_exist = radix_tree_lookup(&fs_info->reada_tree, index);
                re_exist->refcnt++;
                spin_unlock(&fs_info->reada_lock);
-               btrfs_dev_replace_unlock(&fs_info->dev_replace, 0);
+               btrfs_dev_replace_read_unlock(&fs_info->dev_replace);
                radix_tree_preload_end();
                goto error;
        }
        if (ret) {
                spin_unlock(&fs_info->reada_lock);
-               btrfs_dev_replace_unlock(&fs_info->dev_replace, 0);
+               btrfs_dev_replace_read_unlock(&fs_info->dev_replace);
                radix_tree_preload_end();
                goto error;
        }
@@ -451,13 +451,13 @@ static struct reada_extent *reada_find_extent(struct btrfs_fs_info *fs_info,
                        }
                        radix_tree_delete(&fs_info->reada_tree, index);
                        spin_unlock(&fs_info->reada_lock);
-                       btrfs_dev_replace_unlock(&fs_info->dev_replace, 0);
+                       btrfs_dev_replace_read_unlock(&fs_info->dev_replace);
                        goto error;
                }
                have_zone = 1;
        }
        spin_unlock(&fs_info->reada_lock);
-       btrfs_dev_replace_unlock(&fs_info->dev_replace, 0);
+       btrfs_dev_replace_read_unlock(&fs_info->dev_replace);
 
        if (!have_zone)
                goto error;
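The read/write split replaces the old boolean rw argument with dedicated entry points. Their implementation is not part of this excerpt; a purely hypothetical sketch, assuming an rwlock-style member (the real dev-replace lock is a custom blocking-capable lock):

static inline void btrfs_dev_replace_read_lock(struct btrfs_dev_replace *dev_replace)
{
	read_lock(&dev_replace->lock);	/* assumes an rwlock_t 'lock' member */
}

static inline void btrfs_dev_replace_read_unlock(struct btrfs_dev_replace *dev_replace)
{
	read_unlock(&dev_replace->lock);
}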
index 171f3cce30e6badc8468abb95e8261f681e11e62..35fab67dcbe8a5a3998e6a8fe5d217fe1256033e 100644 (file)
@@ -579,11 +579,16 @@ static int walk_down_tree(struct btrfs_root *root, struct btrfs_path *path,
 
        while (level >= 0) {
                if (level) {
+                       struct btrfs_key first_key;
+
                        block_bytenr = btrfs_node_blockptr(path->nodes[level],
                                                           path->slots[level]);
                        gen = btrfs_node_ptr_generation(path->nodes[level],
                                                        path->slots[level]);
-                       eb = read_tree_block(fs_info, block_bytenr, gen);
+                       btrfs_node_key_to_cpu(path->nodes[level], &first_key,
+                                             path->slots[level]);
+                       eb = read_tree_block(fs_info, block_bytenr, gen,
+                                            level - 1, &first_key);
                        if (IS_ERR(eb))
                                return PTR_ERR(eb);
                        if (!extent_buffer_uptodate(eb)) {
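read_tree_block() now receives the expected level and first key of the child block, so the block read from disk can be validated against the parent's pointer before use. A hypothetical sketch of such a check (the helper name is illustrative):

static int demo_verify_level_key(struct extent_buffer *eb, int level,
				 struct btrfs_key *first_key)
{
	struct btrfs_key found_key;

	if (btrfs_header_level(eb) != level)
		return -EIO;
	if (!first_key || btrfs_header_nritems(eb) == 0)
		return 0;
	if (level)
		btrfs_node_key_to_cpu(eb, &found_key, 0);
	else
		btrfs_item_key_to_cpu(eb, &found_key, 0);
	/* btrfs_comp_cpu_keys() returns 0 when the two keys are equal */
	return btrfs_comp_cpu_keys(first_key, &found_key) ? -EIO : 0;
}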
index cd2298d185dd121bd1412e571a07952c343b0ab5..4874c09f6d3c5e563d74fef62e69f53dab98ca78 100644 (file)
@@ -1839,6 +1839,8 @@ again:
 
        parent = eb;
        while (1) {
+               struct btrfs_key first_key;
+
                level = btrfs_header_level(parent);
                BUG_ON(level < lowest_level);
 
@@ -1852,6 +1854,7 @@ again:
                old_bytenr = btrfs_node_blockptr(parent, slot);