Merge tag 'for-4.19/dm-changes' of git://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm
author	Linus Torvalds <torvalds@linux-foundation.org>
Fri, 17 Aug 2018 16:52:15 +0000 (09:52 -0700)
committer	Linus Torvalds <torvalds@linux-foundation.org>
Fri, 17 Aug 2018 16:52:15 +0000 (09:52 -0700)
Pull device mapper updates from Mike Snitzer:

 - A couple of stable fixes for the DM writecache target.

 - A stable fix for the DM cache target that fixes the potential for
   data corruption after an unclean shutdown of a cache device using
   writeback mode.

 - Update DM integrity target to allow the metadata to be stored on a
   separate device from data.

 - Fix DM kcopyd and the snapshot target to call cond_resched() where
   appropriate and to process completed work more efficiently.

 - A few fixes and improvements for DM crypt.

 - Add a DM delay target feature to configure the delay of flushes
   independently of writes.

 - Update DM thin-provisioning target to include metadata_low_watermark
   threshold in pool status.

 - Fix stale DM thin-provisioning Documentation.

* tag 'for-4.19/dm-changes' of git://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm: (26 commits)
  dm writecache: fix a crash due to reading past end of dirty_bitmap
  dm crypt: don't decrease device limits
  dm cache metadata: set dirty on all cache blocks after a crash
  dm snapshot: remove stale FIXME in snapshot_map()
  dm snapshot: improve performance by switching out_of_order_list to rbtree
  dm kcopyd: avoid softlockup in run_complete_job
  dm cache metadata: save in-core policy_hint_size to on-disk superblock
  dm thin: stop no_space_timeout worker when switching to write-mode
  dm kcopyd: return void from dm_kcopyd_copy()
  dm thin: include metadata_low_watermark threshold in pool status
  dm writecache: report start_sector in status line
  dm crypt: convert essiv from ahash to shash
  dm crypt: use wake_up_process() instead of a wait queue
  dm integrity: recalculate checksums on creation
  dm integrity: flush journal on suspend when using separate metadata device
  dm integrity: use version 2 for separate metadata
  dm integrity: allow separate metadata device
  dm integrity: add ic->start in get_data_sector()
  dm integrity: report provided data sectors in the status
  dm integrity: implement fair range locks
  ...

15 files changed:
Documentation/device-mapper/delay.txt
Documentation/device-mapper/dm-integrity.txt
Documentation/device-mapper/thin-provisioning.txt
drivers/md/dm-cache-metadata.c
drivers/md/dm-cache-target.c
drivers/md/dm-crypt.c
drivers/md/dm-delay.c
drivers/md/dm-integrity.c
drivers/md/dm-kcopyd.c
drivers/md/dm-raid1.c
drivers/md/dm-snap.c
drivers/md/dm-thin.c
drivers/md/dm-writecache.c
drivers/md/dm-zoned-reclaim.c
include/linux/dm-kcopyd.h

index 4b1d22a44ce42d19cb54442d4e608b3f61a7e1e9..6426c45273cbbaf16b169332111538dcf46e3b44 100644 (file)
@@ -5,7 +5,8 @@ Device-Mapper's "delay" target delays reads and/or writes
 and maps them to different devices.
 
 Parameters:
-    <device> <offset> <delay> [<write_device> <write_offset> <write_delay>]
+    <device> <offset> <delay> [<write_device> <write_offset> <write_delay>
+                              [<flush_device> <flush_offset> <flush_delay>]]
 
 With separate write parameters, the first set is only used for reads.
 Offsets are specified in sectors.
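
As a hedged illustration of the extended parameter set (device name and delay
values are invented for the sketch), a table that leaves reads undelayed,
delays writes by 200 ms and passes flushes straight through could be created
with:

    echo "0 `blockdev --getsz /dev/sdb1` delay /dev/sdb1 0 0 /dev/sdb1 0 200 /dev/sdb1 0 0" | dmsetup create delayed
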
index f33e3ade7a09bdaf9f6b338d5142acc512a94cb2..297251b0d2d5715872449d0b4c556a0fb72cd4dc 100644 (file)
@@ -113,6 +113,10 @@ internal_hash:algorithm(:key)      (the key is optional)
        from an upper layer target, such as dm-crypt. The upper layer
        target should check the validity of the integrity tags.
 
+recalculate
+       Recalculate the integrity tags automatically. It is only valid
+       when using internal hash.
+
 journal_crypt:algorithm(:key)  (the key is optional)
        Encrypt the journal using given algorithm to make sure that the
        attacker can't read the journal. You can use a block cipher here
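
A hedged sketch of a table that enables the new flag (device, mode and tag
size are illustrative; the target length must match the provided_data_sectors
value reported by the kernel):

    dmsetup create ic --table \
        "0 <provided_data_sectors> integrity /dev/sdb 0 4 J 2 internal_hash:crc32c recalculate"
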
index 3d01948ea0611f56e4cf603dd2df14548b42471c..883e7ca5f74588aa54b8ad7a50750ca44224ca89 100644 (file)
@@ -28,17 +28,18 @@ administrator some freedom, for example to:
 Status
 ======
 
-These targets are very much still in the EXPERIMENTAL state.  Please
-do not yet rely on them in production.  But do experiment and offer us
-feedback.  Different use cases will have different performance
-characteristics, for example due to fragmentation of the data volume.
+These targets are considered safe for production use.  But different use
+cases will have different performance characteristics, for example due
+to fragmentation of the data volume.
 
 If you find this software is not performing as expected please mail
 dm-devel@redhat.com with details and we'll try our best to improve
 things for you.
 
-Userspace tools for checking and repairing the metadata are under
-development.
+Userspace tools for checking and repairing the metadata have been fully
+developed and are available as 'thin_check' and 'thin_repair'.  The name
+of the package that provides these utilities varies by distribution (on
+a Red Hat distribution it is named 'device-mapper-persistent-data').
 
 Cookbook
 ========
@@ -280,7 +281,7 @@ ii) Status
     <transaction id> <used metadata blocks>/<total metadata blocks>
     <used data blocks>/<total data blocks> <held metadata root>
     ro|rw|out_of_data_space [no_]discard_passdown [error|queue]_if_no_space
-    needs_check|-
+    needs_check|- metadata_low_watermark
 
     transaction id:
        A 64-bit number used by userspace to help synchronise with metadata
@@ -327,6 +328,11 @@ ii) Status
        thin-pool can be made fully operational again.  '-' indicates
        needs_check is not set.
 
+    metadata_low_watermark:
+       Value of metadata low watermark in blocks.  The kernel sets this
+       value internally but userspace needs to know this value to
+       determine if an event was caused by crossing this threshold.
+
 iii) Messages
 
     create_thin <dev id>
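
For illustration, a pool status line in the new format described above might
read as follows (all numbers invented; the trailing field is the
metadata_low_watermark in blocks):

    # dmsetup status pool
    0 419430400 thin-pool 1 892/4096 19/262144 - rw discard_passdown queue_if_no_space - 1024
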
index 0d7212410e2156addae298dfdc5e5a0ba4cbf690..69dddeab124c2e1ac075dc24c14ebe725647e3a3 100644 (file)
@@ -363,7 +363,7 @@ static int __write_initial_superblock(struct dm_cache_metadata *cmd)
        disk_super->version = cpu_to_le32(cmd->version);
        memset(disk_super->policy_name, 0, sizeof(disk_super->policy_name));
        memset(disk_super->policy_version, 0, sizeof(disk_super->policy_version));
-       disk_super->policy_hint_size = 0;
+       disk_super->policy_hint_size = cpu_to_le32(0);
 
        __copy_sm_root(cmd, disk_super);
 
@@ -701,6 +701,7 @@ static int __commit_transaction(struct dm_cache_metadata *cmd,
        disk_super->policy_version[0] = cpu_to_le32(cmd->policy_version[0]);
        disk_super->policy_version[1] = cpu_to_le32(cmd->policy_version[1]);
        disk_super->policy_version[2] = cpu_to_le32(cmd->policy_version[2]);
+       disk_super->policy_hint_size = cpu_to_le32(cmd->policy_hint_size);
 
        disk_super->read_hits = cpu_to_le32(cmd->stats.read_hits);
        disk_super->read_misses = cpu_to_le32(cmd->stats.read_misses);
@@ -1322,6 +1323,7 @@ static int __load_mapping_v1(struct dm_cache_metadata *cmd,
 
        dm_oblock_t oblock;
        unsigned flags;
+       bool dirty = true;
 
        dm_array_cursor_get_value(mapping_cursor, (void **) &mapping_value_le);
        memcpy(&mapping, mapping_value_le, sizeof(mapping));
@@ -1332,8 +1334,10 @@ static int __load_mapping_v1(struct dm_cache_metadata *cmd,
                        dm_array_cursor_get_value(hint_cursor, (void **) &hint_value_le);
                        memcpy(&hint, hint_value_le, sizeof(hint));
                }
+               if (cmd->clean_when_opened)
+                       dirty = flags & M_DIRTY;
 
-               r = fn(context, oblock, to_cblock(cb), flags & M_DIRTY,
+               r = fn(context, oblock, to_cblock(cb), dirty,
                       le32_to_cpu(hint), hints_valid);
                if (r) {
                        DMERR("policy couldn't load cache block %llu",
@@ -1361,7 +1365,7 @@ static int __load_mapping_v2(struct dm_cache_metadata *cmd,
 
        dm_oblock_t oblock;
        unsigned flags;
-       bool dirty;
+       bool dirty = true;
 
        dm_array_cursor_get_value(mapping_cursor, (void **) &mapping_value_le);
        memcpy(&mapping, mapping_value_le, sizeof(mapping));
@@ -1372,8 +1376,9 @@ static int __load_mapping_v2(struct dm_cache_metadata *cmd,
                        dm_array_cursor_get_value(hint_cursor, (void **) &hint_value_le);
                        memcpy(&hint, hint_value_le, sizeof(hint));
                }
+               if (cmd->clean_when_opened)
+                       dirty = dm_bitset_cursor_get_value(dirty_cursor);
 
-               dirty = dm_bitset_cursor_get_value(dirty_cursor);
                r = fn(context, oblock, to_cblock(cb), dirty,
                       le32_to_cpu(hint), hints_valid);
                if (r) {
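
The effect of the two load-mapping hunks above can be condensed into a small
sketch (not the kernel code itself): the on-disk dirty state is trusted only
after a clean shutdown; otherwise every mapped block is assumed dirty so that
writeback mode cannot lose data after a crash.

    /* sketch: choose the dirty state used when loading a cache mapping */
    static bool effective_dirty(bool clean_when_opened, bool on_disk_dirty)
    {
            /* after an unclean shutdown the on-disk dirty bits may be stale */
            return clean_when_opened ? on_disk_dirty : true;
    }
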
index ce14a3d1f609dfe127f6caff75c3a00a6d99254a..a534133717254a88eb91e025c7731e408fbbdfad 100644 (file)
@@ -1188,9 +1188,8 @@ static void copy_complete(int read_err, unsigned long write_err, void *context)
        queue_continuation(mg->cache->wq, &mg->k);
 }
 
-static int copy(struct dm_cache_migration *mg, bool promote)
+static void copy(struct dm_cache_migration *mg, bool promote)
 {
-       int r;
        struct dm_io_region o_region, c_region;
        struct cache *cache = mg->cache;
 
@@ -1203,11 +1202,9 @@ static int copy(struct dm_cache_migration *mg, bool promote)
        c_region.count = cache->sectors_per_block;
 
        if (promote)
-               r = dm_kcopyd_copy(cache->copier, &o_region, 1, &c_region, 0, copy_complete, &mg->k);
+               dm_kcopyd_copy(cache->copier, &o_region, 1, &c_region, 0, copy_complete, &mg->k);
        else
-               r = dm_kcopyd_copy(cache->copier, &c_region, 1, &o_region, 0, copy_complete, &mg->k);
-
-       return r;
+               dm_kcopyd_copy(cache->copier, &c_region, 1, &o_region, 0, copy_complete, &mg->k);
 }
 
 static void bio_drop_shared_lock(struct cache *cache, struct bio *bio)
@@ -1449,12 +1446,7 @@ static void mg_full_copy(struct work_struct *ws)
        }
 
        init_continuation(&mg->k, mg_upgrade_lock);
-
-       if (copy(mg, is_policy_promote)) {
-               DMERR_LIMIT("%s: migration copy failed", cache_device_name(cache));
-               mg->k.input = BLK_STS_IOERR;
-               mg_complete(mg, false);
-       }
+       copy(mg, is_policy_promote);
 }
 
 static void mg_copy(struct work_struct *ws)
@@ -2250,7 +2242,7 @@ static int parse_features(struct cache_args *ca, struct dm_arg_set *as,
                {0, 2, "Invalid number of cache feature arguments"},
        };
 
-       int r;
+       int r, mode_ctr = 0;
        unsigned argc;
        const char *arg;
        struct cache_features *cf = &ca->features;
@@ -2264,14 +2256,20 @@ static int parse_features(struct cache_args *ca, struct dm_arg_set *as,
        while (argc--) {
                arg = dm_shift_arg(as);
 
-               if (!strcasecmp(arg, "writeback"))
+               if (!strcasecmp(arg, "writeback")) {
                        cf->io_mode = CM_IO_WRITEBACK;
+                       mode_ctr++;
+               }
 
-               else if (!strcasecmp(arg, "writethrough"))
+               else if (!strcasecmp(arg, "writethrough")) {
                        cf->io_mode = CM_IO_WRITETHROUGH;
+                       mode_ctr++;
+               }
 
-               else if (!strcasecmp(arg, "passthrough"))
+               else if (!strcasecmp(arg, "passthrough")) {
                        cf->io_mode = CM_IO_PASSTHROUGH;
+                       mode_ctr++;
+               }
 
                else if (!strcasecmp(arg, "metadata2"))
                        cf->metadata_version = 2;
@@ -2282,6 +2280,11 @@ static int parse_features(struct cache_args *ca, struct dm_arg_set *as,
                }
        }
 
+       if (mode_ctr > 1) {
+               *error = "Duplicate cache io_mode features requested";
+               return -EINVAL;
+       }
+
        return 0;
 }
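
Since dm_kcopyd_copy() no longer returns an error, callers learn about
failures exclusively through the completion callback, as the copy() hunks
above show. A minimal caller sketch under that contract (the handle_* helpers
are hypothetical):

    static void my_copy_complete(int read_err, unsigned long write_err, void *context)
    {
            /* the callback is now the only error channel */
            if (read_err || write_err)
                    handle_copy_failure(context);   /* hypothetical */
            else
                    handle_copy_success(context);   /* hypothetical */
    }

    /* ... no return value left to check: */
    dm_kcopyd_copy(kc, &from_region, 1, &to_region, 0, my_copy_complete, ctx);
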
 
index b61b069c33afbafd17af1951f0f5d8fee4ddf9d0..f266c81f396fe7a82032827c92c95896f168deba 100644 (file)
@@ -99,7 +99,7 @@ struct crypt_iv_operations {
 };
 
 struct iv_essiv_private {
-       struct crypto_ahash *hash_tfm;
+       struct crypto_shash *hash_tfm;
        u8 *salt;
 };
 
@@ -144,7 +144,7 @@ struct crypt_config {
        struct workqueue_struct *io_queue;
        struct workqueue_struct *crypt_queue;
 
-       wait_queue_head_t write_thread_wait;
+       spinlock_t write_thread_lock;
        struct task_struct *write_thread;
        struct rb_root write_tree;
 
@@ -327,25 +327,22 @@ static int crypt_iv_plain64be_gen(struct crypt_config *cc, u8 *iv,
 static int crypt_iv_essiv_init(struct crypt_config *cc)
 {
        struct iv_essiv_private *essiv = &cc->iv_gen_private.essiv;
-       AHASH_REQUEST_ON_STACK(req, essiv->hash_tfm);
-       struct scatterlist sg;
+       SHASH_DESC_ON_STACK(desc, essiv->hash_tfm);
        struct crypto_cipher *essiv_tfm;
        int err;
 
-       sg_init_one(&sg, cc->key, cc->key_size);
-       ahash_request_set_tfm(req, essiv->hash_tfm);
-       ahash_request_set_callback(req, CRYPTO_TFM_REQ_MAY_SLEEP, NULL, NULL);
-       ahash_request_set_crypt(req, &sg, essiv->salt, cc->key_size);
+       desc->tfm = essiv->hash_tfm;
+       desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP;
 
-       err = crypto_ahash_digest(req);
-       ahash_request_zero(req);
+       err = crypto_shash_digest(desc, cc->key, cc->key_size, essiv->salt);
+       shash_desc_zero(desc);
        if (err)
                return err;
 
        essiv_tfm = cc->iv_private;
 
        err = crypto_cipher_setkey(essiv_tfm, essiv->salt,
-                           crypto_ahash_digestsize(essiv->hash_tfm));
+                           crypto_shash_digestsize(essiv->hash_tfm));
        if (err)
                return err;
 
@@ -356,7 +353,7 @@ static int crypt_iv_essiv_init(struct crypt_config *cc)
 static int crypt_iv_essiv_wipe(struct crypt_config *cc)
 {
        struct iv_essiv_private *essiv = &cc->iv_gen_private.essiv;
-       unsigned salt_size = crypto_ahash_digestsize(essiv->hash_tfm);
+       unsigned salt_size = crypto_shash_digestsize(essiv->hash_tfm);
        struct crypto_cipher *essiv_tfm;
        int r, err = 0;
 
@@ -408,7 +405,7 @@ static void crypt_iv_essiv_dtr(struct crypt_config *cc)
        struct crypto_cipher *essiv_tfm;
        struct iv_essiv_private *essiv = &cc->iv_gen_private.essiv;
 
-       crypto_free_ahash(essiv->hash_tfm);
+       crypto_free_shash(essiv->hash_tfm);
        essiv->hash_tfm = NULL;
 
        kzfree(essiv->salt);
@@ -426,7 +423,7 @@ static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti,
                              const char *opts)
 {
        struct crypto_cipher *essiv_tfm = NULL;
-       struct crypto_ahash *hash_tfm = NULL;
+       struct crypto_shash *hash_tfm = NULL;
        u8 *salt = NULL;
        int err;
 
@@ -436,14 +433,14 @@ static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti,
        }
 
        /* Allocate hash algorithm */
-       hash_tfm = crypto_alloc_ahash(opts, 0, CRYPTO_ALG_ASYNC);
+       hash_tfm = crypto_alloc_shash(opts, 0, 0);
        if (IS_ERR(hash_tfm)) {
                ti->error = "Error initializing ESSIV hash";
                err = PTR_ERR(hash_tfm);
                goto bad;
        }
 
-       salt = kzalloc(crypto_ahash_digestsize(hash_tfm), GFP_KERNEL);
+       salt = kzalloc(crypto_shash_digestsize(hash_tfm), GFP_KERNEL);
        if (!salt) {
                ti->error = "Error kmallocing salt storage in ESSIV";
                err = -ENOMEM;
@@ -454,7 +451,7 @@ static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti,
        cc->iv_gen_private.essiv.hash_tfm = hash_tfm;
 
        essiv_tfm = alloc_essiv_cipher(cc, ti, salt,
-                                      crypto_ahash_digestsize(hash_tfm));
+                                      crypto_shash_digestsize(hash_tfm));
        if (IS_ERR(essiv_tfm)) {
                crypt_iv_essiv_dtr(cc);
                return PTR_ERR(essiv_tfm);
@@ -465,7 +462,7 @@ static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti,
 
 bad:
        if (hash_tfm && !IS_ERR(hash_tfm))
-               crypto_free_ahash(hash_tfm);
+               crypto_free_shash(hash_tfm);
        kfree(salt);
        return err;
 }
@@ -1620,36 +1617,31 @@ static int dmcrypt_write(void *data)
                struct rb_root write_tree;
                struct blk_plug plug;
 
-               DECLARE_WAITQUEUE(wait, current);
-
-               spin_lock_irq(&cc->write_thread_wait.lock);
+               spin_lock_irq(&cc->write_thread_lock);
 continue_locked:
 
                if (!RB_EMPTY_ROOT(&cc->write_tree))
                        goto pop_from_list;
 
                set_current_state(TASK_INTERRUPTIBLE);
-               __add_wait_queue(&cc->write_thread_wait, &wait);
 
-               spin_unlock_irq(&cc->write_thread_wait.lock);
+               spin_unlock_irq(&cc->write_thread_lock);
 
                if (unlikely(kthread_should_stop())) {
                        set_current_state(TASK_RUNNING);
-                       remove_wait_queue(&cc->write_thread_wait, &wait);
                        break;
                }
 
                schedule();
 
                set_current_state(TASK_RUNNING);
-               spin_lock_irq(&cc->write_thread_wait.lock);
-               __remove_wait_queue(&cc->write_thread_wait, &wait);
+               spin_lock_irq(&cc->write_thread_lock);
                goto continue_locked;
 
 pop_from_list:
                write_tree = cc->write_tree;
                cc->write_tree = RB_ROOT;
-               spin_unlock_irq(&cc->write_thread_wait.lock);
+               spin_unlock_irq(&cc->write_thread_lock);
 
                BUG_ON(rb_parent(write_tree.rb_node));
 
@@ -1693,7 +1685,9 @@ static void kcryptd_crypt_write_io_submit(struct dm_crypt_io *io, int async)
                return;
        }
 
-       spin_lock_irqsave(&cc->write_thread_wait.lock, flags);
+       spin_lock_irqsave(&cc->write_thread_lock, flags);
+       if (RB_EMPTY_ROOT(&cc->write_tree))
+               wake_up_process(cc->write_thread);
        rbp = &cc->write_tree.rb_node;
        parent = NULL;
        sector = io->sector;
@@ -1706,9 +1700,7 @@ static void kcryptd_crypt_write_io_submit(struct dm_crypt_io *io, int async)
        }
        rb_link_node(&io->rb_node, parent, rbp);
        rb_insert_color(&io->rb_node, &cc->write_tree);
-
-       wake_up_locked(&cc->write_thread_wait);
-       spin_unlock_irqrestore(&cc->write_thread_wait.lock, flags);
+       spin_unlock_irqrestore(&cc->write_thread_lock, flags);
 }
 
 static void kcryptd_crypt_write_convert(struct dm_crypt_io *io)
@@ -2831,7 +2823,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
                goto bad;
        }
 
-       init_waitqueue_head(&cc->write_thread_wait);
+       spin_lock_init(&cc->write_thread_lock);
        cc->write_tree = RB_ROOT;
 
        cc->write_thread = kthread_create(dmcrypt_write, cc, "dmcrypt_write");
@@ -3069,11 +3061,11 @@ static void crypt_io_hints(struct dm_target *ti, struct queue_limits *limits)
         */
        limits->max_segment_size = PAGE_SIZE;
 
-       if (cc->sector_size != (1 << SECTOR_SHIFT)) {
-               limits->logical_block_size = cc->sector_size;
-               limits->physical_block_size = cc->sector_size;
-               blk_limits_io_min(limits, cc->sector_size);
-       }
+       limits->logical_block_size =
+               max_t(unsigned short, limits->logical_block_size, cc->sector_size);
+       limits->physical_block_size =
+               max_t(unsigned, limits->physical_block_size, cc->sector_size);
+       limits->io_min = max_t(unsigned, limits->io_min, cc->sector_size);
 }
 
 static struct target_type crypt_target = {
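
The wait-queue removal in the hunks above relies on a classic pattern: the
producer wakes the writer thread only on the empty-to-non-empty transition,
and the consumer changes its task state under the same spinlock it uses to
test for work, so no wakeup can be lost. A stripped-down sketch of the idea
(not the dm-crypt code; kthread stop handling omitted):

    /* producer side, interrupt-safe */
    spin_lock_irqsave(&lock, flags);
    if (RB_EMPTY_ROOT(&tree))               /* consumer may be asleep */
            wake_up_process(consumer_task);
    insert_item(&tree, item);
    spin_unlock_irqrestore(&lock, flags);

    /* consumer thread */
    spin_lock_irq(&lock);
    while (RB_EMPTY_ROOT(&tree)) {
            set_current_state(TASK_INTERRUPTIBLE);  /* before dropping the lock */
            spin_unlock_irq(&lock);
            schedule();
            spin_lock_irq(&lock);
    }
    /* pop work from the tree while still holding the lock */
    spin_unlock_irq(&lock);
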
index 1783d80c9cad029b61c261c41ed5c5dbdc8c0624..2fb7bb4304ad7f0ade6eb10ccd02d07f36271acf 100644 (file)
 
 #define DM_MSG_PREFIX "delay"
 
+struct delay_class {
+       struct dm_dev *dev;
+       sector_t start;
+       unsigned delay;
+       unsigned ops;
+};
+
 struct delay_c {
        struct timer_list delay_timer;
        struct mutex timer_lock;
@@ -25,19 +32,16 @@ struct delay_c {
        struct list_head delayed_bios;
        atomic_t may_delay;
 
-       struct dm_dev *dev_read;
-       sector_t start_read;
-       unsigned read_delay;
-       unsigned reads;
+       struct delay_class read;
+       struct delay_class write;
+       struct delay_class flush;
 
-       struct dm_dev *dev_write;
-       sector_t start_write;
-       unsigned write_delay;
-       unsigned writes;
+       int argc;
 };
 
 struct dm_delay_info {
        struct delay_c *context;
+       struct delay_class *class;
        struct list_head list;
        unsigned long expires;
 };
@@ -77,7 +81,7 @@ static struct bio *flush_delayed_bios(struct delay_c *dc, int flush_all)
 {
        struct dm_delay_info *delayed, *next;
        unsigned long next_expires = 0;
-       int start_timer = 0;
+       unsigned long start_timer = 0;
        struct bio_list flush_bios = { };
 
        mutex_lock(&delayed_bios_lock);
@@ -87,10 +91,7 @@ static struct bio *flush_delayed_bios(struct delay_c *dc, int flush_all)
                                                sizeof(struct dm_delay_info));
                        list_del(&delayed->list);
                        bio_list_add(&flush_bios, bio);
-                       if ((bio_data_dir(bio) == WRITE))
-                               delayed->context->writes--;
-                       else
-                               delayed->context->reads--;
+                       delayed->class->ops--;
                        continue;
                }
 
@@ -100,7 +101,6 @@ static struct bio *flush_delayed_bios(struct delay_c *dc, int flush_all)
                } else
                        next_expires = min(next_expires, delayed->expires);
        }
-
        mutex_unlock(&delayed_bios_lock);
 
        if (start_timer)
@@ -117,6 +117,50 @@ static void flush_expired_bios(struct work_struct *work)
        flush_bios(flush_delayed_bios(dc, 0));
 }
 
+static void delay_dtr(struct dm_target *ti)
+{
+       struct delay_c *dc = ti->private;
+
+       destroy_workqueue(dc->kdelayd_wq);
+
+       if (dc->read.dev)
+               dm_put_device(ti, dc->read.dev);
+       if (dc->write.dev)
+               dm_put_device(ti, dc->write.dev);
+       if (dc->flush.dev)
+               dm_put_device(ti, dc->flush.dev);
+
+       mutex_destroy(&dc->timer_lock);
+
+       kfree(dc);
+}
+
+static int delay_class_ctr(struct dm_target *ti, struct delay_class *c, char **argv)
+{
+       int ret;
+       unsigned long long tmpll;
+       char dummy;
+
+       if (sscanf(argv[1], "%llu%c", &tmpll, &dummy) != 1) {
+               ti->error = "Invalid device sector";
+               return -EINVAL;
+       }
+       c->start = tmpll;
+
+       if (sscanf(argv[2], "%u%c", &c->delay, &dummy) != 1) {
+               ti->error = "Invalid delay";
+               return -EINVAL;
+       }
+
+       ret = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &c->dev);
+       if (ret) {
+               ti->error = "Device lookup failed";
+               return ret;
+       }
+
+       return 0;
+}
+
 /*
  * Mapping parameters:
  *    <device> <offset> <delay> [<write_device> <write_offset> <write_delay>]
@@ -128,134 +172,89 @@ static void flush_expired_bios(struct work_struct *work)
 static int delay_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 {
        struct delay_c *dc;
-       unsigned long long tmpll;
-       char dummy;
        int ret;
 
-       if (argc != 3 && argc != 6) {
-               ti->error = "Requires exactly 3 or 6 arguments";
+       if (argc != 3 && argc != 6 && argc != 9) {
+               ti->error = "Requires exactly 3, 6 or 9 arguments";
                return -EINVAL;
        }
 
-       dc = kmalloc(sizeof(*dc), GFP_KERNEL);
+       dc = kzalloc(sizeof(*dc), GFP_KERNEL);
        if (!dc) {
                ti->error = "Cannot allocate context";
                return -ENOMEM;
        }
 
-       dc->reads = dc->writes = 0;
+       ti->private = dc;
+       timer_setup(&dc->delay_timer, handle_delayed_timer, 0);
+       INIT_WORK(&dc->flush_expired_bios, flush_expired_bios);
+       INIT_LIST_HEAD(&dc->delayed_bios);
+       mutex_init(&dc->timer_lock);
+       atomic_set(&dc->may_delay, 1);
+       dc->argc = argc;
 
-       ret = -EINVAL;
-       if (sscanf(argv[1], "%llu%c", &tmpll, &dummy) != 1) {
-               ti->error = "Invalid device sector";
+       ret = delay_class_ctr(ti, &dc->read, argv);
+       if (ret)
                goto bad;
-       }
-       dc->start_read = tmpll;
 
-       if (sscanf(argv[2], "%u%c", &dc->read_delay, &dummy) != 1) {
-               ti->error = "Invalid delay";
-               goto bad;
+       if (argc == 3) {
+               ret = delay_class_ctr(ti, &dc->write, argv);
+               if (ret)
+                       goto bad;
+               ret = delay_class_ctr(ti, &dc->flush, argv);
+               if (ret)
+                       goto bad;
+               goto out;
        }
 
-       ret = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table),
-                           &dc->dev_read);
-       if (ret) {
-               ti->error = "Device lookup failed";
+       ret = delay_class_ctr(ti, &dc->write, argv + 3);
+       if (ret)
                goto bad;
-       }
-
-       ret = -EINVAL;
-       dc->dev_write = NULL;
-       if (argc == 3)
+       if (argc == 6) {
+               ret = delay_class_ctr(ti, &dc->flush, argv + 3);
+               if (ret)
+                       goto bad;
                goto out;
-
-       if (sscanf(argv[4], "%llu%c", &tmpll, &dummy) != 1) {
-               ti->error = "Invalid write device sector";
-               goto bad_dev_read;
        }
-       dc->start_write = tmpll;
 
-       if (sscanf(argv[5], "%u%c", &dc->write_delay, &dummy) != 1) {
-               ti->error = "Invalid write delay";
-               goto bad_dev_read;
-       }
-
-       ret = dm_get_device(ti, argv[3], dm_table_get_mode(ti->table),
-                           &dc->dev_write);
-       if (ret) {
-               ti->error = "Write device lookup failed";
-               goto bad_dev_read;
-       }
+       ret = delay_class_ctr(ti, &dc->flush, argv + 6);
+       if (ret)
+               goto bad;
 
 out:
-       ret = -EINVAL;
        dc->kdelayd_wq = alloc_workqueue("kdelayd", WQ_MEM_RECLAIM, 0);
        if (!dc->kdelayd_wq) {
+               ret = -EINVAL;
                DMERR("Couldn't start kdelayd");
-               goto bad_queue;
+               goto bad;
        }
 
-       timer_setup(&dc->delay_timer, handle_delayed_timer, 0);
-
-       INIT_WORK(&dc->flush_expired_bios, flush_expired_bios);
-       INIT_LIST_HEAD(&dc->delayed_bios);
-       mutex_init(&dc->timer_lock);
-       atomic_set(&dc->may_delay, 1);
-
        ti->num_flush_bios = 1;
        ti->num_discard_bios = 1;
        ti->per_io_data_size = sizeof(struct dm_delay_info);
-       ti->private = dc;
        return 0;
 
-bad_queue:
-       if (dc->dev_write)
-               dm_put_device(ti, dc->dev_write);
-bad_dev_read:
-       dm_put_device(ti, dc->dev_read);
 bad:
-       kfree(dc);
+       delay_dtr(ti);
        return ret;
 }
 
-static void delay_dtr(struct dm_target *ti)
-{
-       struct delay_c *dc = ti->private;
-
-       destroy_workqueue(dc->kdelayd_wq);
-
-       dm_put_device(ti, dc->dev_read);
-
-       if (dc->dev_write)
-               dm_put_device(ti, dc->dev_write);
-
-       mutex_destroy(&dc->timer_lock);
-
-       kfree(dc);
-}
-
-static int delay_bio(struct delay_c *dc, int delay, struct bio *bio)
+static int delay_bio(struct delay_c *dc, struct delay_class *c, struct bio *bio)
 {
        struct dm_delay_info *delayed;
        unsigned long expires = 0;
 
-       if (!delay || !atomic_read(&dc->may_delay))
+       if (!c->delay || !atomic_read(&dc->may_delay))
                return DM_MAPIO_REMAPPED;
 
        delayed = dm_per_bio_data(bio, sizeof(struct dm_delay_info));
 
        delayed->context = dc;
-       delayed->expires = expires = jiffies + msecs_to_jiffies(delay);
+       delayed->expires = expires = jiffies + msecs_to_jiffies(c->delay);
 
        mutex_lock(&delayed_bios_lock);
-
-       if (bio_data_dir(bio) == WRITE)
-               dc->writes++;
-       else
-               dc->reads++;
-
+       c->ops++;
        list_add_tail(&delayed->list, &dc->delayed_bios);
-
        mutex_unlock(&delayed_bios_lock);
 
        queue_timeout(dc, expires);
@@ -282,23 +281,28 @@ static void delay_resume(struct dm_target *ti)
 static int delay_map(struct dm_target *ti, struct bio *bio)
 {
        struct delay_c *dc = ti->private;
-
-       if ((bio_data_dir(bio) == WRITE) && (dc->dev_write)) {
-               bio_set_dev(bio, dc->dev_write->bdev);
-               if (bio_sectors(bio))
-                       bio->bi_iter.bi_sector = dc->start_write +
-                               dm_target_offset(ti, bio->bi_iter.bi_sector);
-
-               return delay_bio(dc, dc->write_delay, bio);
+       struct delay_class *c;
+       struct dm_delay_info *delayed = dm_per_bio_data(bio, sizeof(struct dm_delay_info));
+
+       if (bio_data_dir(bio) == WRITE) {
+               if (unlikely(bio->bi_opf & REQ_PREFLUSH))
+                       c = &dc->flush;
+               else
+                       c = &dc->write;
+       } else {
+               c = &dc->read;
        }
+       delayed->class = c;
+       bio_set_dev(bio, c->dev->bdev);
+       if (bio_sectors(bio))
+               bio->bi_iter.bi_sector = c->start + dm_target_offset(ti, bio->bi_iter.bi_sector);
 
-       bio_set_dev(bio, dc->dev_read->bdev);
-       bio->bi_iter.bi_sector = dc->start_read +
-               dm_target_offset(ti, bio->bi_iter.bi_sector);
-
-       return delay_bio(dc, dc->read_delay, bio);
+       return delay_bio(dc, c, bio);
 }
 
+#define DMEMIT_DELAY_CLASS(c) \
+       DMEMIT("%s %llu %u", (c)->dev->name, (unsigned long long)(c)->start, (c)->delay)
+
 static void delay_status(struct dm_target *ti, status_type_t type,
                         unsigned status_flags, char *result, unsigned maxlen)
 {
@@ -307,17 +311,19 @@ static void delay_status(struct dm_target *ti, status_type_t type,
 
        switch (type) {
        case STATUSTYPE_INFO:
-               DMEMIT("%u %u", dc->reads, dc->writes);
+               DMEMIT("%u %u %u", dc->read.ops, dc->write.ops, dc->flush.ops);
                break;
 
        case STATUSTYPE_TABLE:
-               DMEMIT("%s %llu %u", dc->dev_read->name,
-                      (unsigned long long) dc->start_read,
-                      dc->read_delay);
-               if (dc->dev_write)
-                       DMEMIT(" %s %llu %u", dc->dev_write->name,
-                              (unsigned long long) dc->start_write,
-                              dc->write_delay);
+               DMEMIT_DELAY_CLASS(&dc->read);
+               if (dc->argc >= 6) {
+                       DMEMIT(" ");
+                       DMEMIT_DELAY_CLASS(&dc->write);
+               }
+               if (dc->argc >= 9) {
+                       DMEMIT(" ");
+                       DMEMIT_DELAY_CLASS(&dc->flush);
+               }
                break;
        }
 }
@@ -328,12 +334,15 @@ static int delay_iterate_devices(struct dm_target *ti,
        struct delay_c *dc = ti->private;
        int ret = 0;
 
-       ret = fn(ti, dc->dev_read, dc->start_read, ti->len, data);
+       ret = fn(ti, dc->read.dev, dc->read.start, ti->len, data);
+       if (ret)
+               goto out;
+       ret = fn(ti, dc->write.dev, dc->write.start, ti->len, data);
+       if (ret)
+               goto out;
+       ret = fn(ti, dc->flush.dev, dc->flush.start, ti->len, data);
        if (ret)
                goto out;
-
-       if (dc->dev_write)
-               ret = fn(ti, dc->dev_write, dc->start_write, ti->len, data);
 
 out:
        return ret;
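
The STATUSTYPE_INFO line now carries three counters, one per class. An
illustrative reading (numbers invented) might be:

    # dmsetup status delayed
    0 16777216 delay 3 11 0

where the fields after the target name are the numbers of read, write and
flush bios currently being delayed.
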
index 86438b2f10dd0a92b0e4fbd46eba6e04fff478e6..37887859946631cb40c42256f0ba3d4d5a471132 100644 (file)
@@ -31,6 +31,8 @@
 #define MIN_LOG2_INTERLEAVE_SECTORS    3
 #define MAX_LOG2_INTERLEAVE_SECTORS    31
 #define METADATA_WORKQUEUE_MAX_ACTIVE  16
+#define RECALC_SECTORS                 8192
+#define RECALC_WRITE_SUPER             16
 
 /*
  * Warning - DEBUG_PRINT prints security-sensitive data to the log,
@@ -44,7 +46,8 @@
  */
 
 #define SB_MAGIC                       "integrt"
-#define SB_VERSION                     1
+#define SB_VERSION_1                   1
+#define SB_VERSION_2                   2
 #define SB_SECTORS                     8
 #define MAX_SECTORS_PER_BLOCK          8
 
@@ -57,9 +60,12 @@ struct superblock {
        __u64 provided_data_sectors;    /* userspace uses this value */
        __u32 flags;
        __u8 log2_sectors_per_block;
+       __u8 pad[3];
+       __u64 recalc_sector;
 };
 
 #define SB_FLAG_HAVE_JOURNAL_MAC       0x1
+#define SB_FLAG_RECALCULATING          0x2
 
 #define        JOURNAL_ENTRY_ROUNDUP           8
 
@@ -139,6 +145,7 @@ struct alg_spec {
 
 struct dm_integrity_c {
        struct dm_dev *dev;
+       struct dm_dev *meta_dev;
        unsigned tag_size;
        __s8 log2_tag_size;
        sector_t start;
@@ -170,7 +177,8 @@ struct dm_integrity_c {
        unsigned short journal_section_sectors;
        unsigned journal_sections;
        unsigned journal_entries;
-       sector_t device_sectors;
+       sector_t data_device_sectors;
+       sector_t meta_device_sectors;
        unsigned initial_sectors;
        unsigned metadata_run;
        __s8 log2_metadata_run;
@@ -178,7 +186,7 @@ struct dm_integrity_c {
        __u8 sectors_per_block;
 
        unsigned char mode;
-       bool suspending;
+       int suspending;
 
        int failed;
 
@@ -186,6 +194,7 @@ struct dm_integrity_c {
 
        /* these variables are locked with endio_wait.lock */
        struct rb_root in_progress;
+       struct list_head wait_list;
        wait_queue_head_t endio_wait;
        struct workqueue_struct *wait_wq;
 
@@ -210,6 +219,11 @@ struct dm_integrity_c {
        struct workqueue_struct *writer_wq;
        struct work_struct writer_work;
 
+       struct workqueue_struct *recalc_wq;
+       struct work_struct recalc_work;
+       u8 *recalc_buffer;
+       u8 *recalc_tags;
+
        struct bio_list flush_bio_list;
 
        unsigned long autocommit_jiffies;
@@ -233,7 +247,14 @@ struct dm_integrity_c {
 struct dm_integrity_range {
        sector_t logical_sector;
        unsigned n_sectors;
-       struct rb_node node;
+       bool waiting;
+       union {
+               struct rb_node node;
+               struct {
+                       struct task_struct *task;
+                       struct list_head wait_entry;
+               };
+       };
 };
 
 struct dm_integrity_io {
@@ -337,10 +358,14 @@ static commit_id_t dm_integrity_commit_id(struct dm_integrity_c *ic, unsigned i,
 static void get_area_and_offset(struct dm_integrity_c *ic, sector_t data_sector,
                                sector_t *area, sector_t *offset)
 {
-       __u8 log2_interleave_sectors = ic->sb->log2_interleave_sectors;
-
-       *area = data_sector >> log2_interleave_sectors;
-       *offset = (unsigned)data_sector & ((1U << log2_interleave_sectors) - 1);
+       if (!ic->meta_dev) {
+               __u8 log2_interleave_sectors = ic->sb->log2_interleave_sectors;
+               *area = data_sector >> log2_interleave_sectors;
+               *offset = (unsigned)data_sector & ((1U << log2_interleave_sectors) - 1);
+       } else {
+               *area = 0;
+               *offset = data_sector;
+       }
 }
 
 #define sector_to_block(ic, n)                                         \
@@ -379,6 +404,9 @@ static sector_t get_data_sector(struct dm_integrity_c *ic, sector_t area, sector
 {
        sector_t result;
 
+       if (ic->meta_dev)
+               return offset;
+
        result = area << ic->sb->log2_interleave_sectors;
        if (likely(ic->log2_metadata_run >= 0))
                result += (area + 1) << ic->log2_metadata_run;
@@ -386,6 +414,8 @@ static sector_t get_data_sector(struct dm_integrity_c *ic, sector_t area, sector
                result += (area + 1) * ic->metadata_run;
 
        result += (sector_t)ic->initial_sectors + offset;
+       result += ic->start;
+
        return result;
 }
 
@@ -395,6 +425,14 @@ static void wraparound_section(struct dm_integrity_c *ic, unsigned *sec_ptr)
                *sec_ptr -= ic->journal_sections;
 }
 
+static void sb_set_version(struct dm_integrity_c *ic)
+{
+       if (ic->meta_dev || ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING))
+               ic->sb->version = SB_VERSION_2;
+       else
+               ic->sb->version = SB_VERSION_1;
+}
+
 static int sync_rw_sb(struct dm_integrity_c *ic, int op, int op_flags)
 {
        struct dm_io_request io_req;
@@ -406,7 +444,7 @@ static int sync_rw_sb(struct dm_integrity_c *ic, int op, int op_flags)
        io_req.mem.ptr.addr = ic->sb;
        io_req.notify.fn = NULL;
        io_req.client = ic->io;
-       io_loc.bdev = ic->dev->bdev;
+       io_loc.bdev = ic->meta_dev ? ic->meta_dev->bdev : ic->dev->bdev;
        io_loc.sector = ic->start;
        io_loc.count = SB_SECTORS;
 
@@ -753,7 +791,7 @@ static void rw_journal(struct dm_integrity_c *ic, int op, int op_flags, unsigned
                io_req.notify.fn = NULL;
        }
        io_req.client = ic->io;
-       io_loc.bdev = ic->dev->bdev;
+       io_loc.bdev = ic->meta_dev ? ic->meta_dev->bdev : ic->dev->bdev;
        io_loc.sector = ic->start + SB_SECTORS + sector;
        io_loc.count = n_sectors;
 
@@ -857,7 +895,7 @@ static void copy_from_journal(struct dm_integrity_c *ic, unsigned section, unsig
        io_req.notify.context = data;
        io_req.client = ic->io;
        io_loc.bdev = ic->dev->bdev;
-       io_loc.sector = ic->start + target;
+       io_loc.sector = target;
        io_loc.count = n_sectors;
 
        r = dm_io(&io_req, 1, &io_loc, NULL);
@@ -867,13 +905,27 @@ static void copy_from_journal(struct dm_integrity_c *ic, unsigned section, unsig
        }
 }
 
-static bool add_new_range(struct dm_integrity_c *ic, struct dm_integrity_range *new_range)
+static bool ranges_overlap(struct dm_integrity_range *range1, struct dm_integrity_range *range2)
+{
+       return range1->logical_sector < range2->logical_sector + range2->n_sectors &&
+              range1->logical_sector + range1->n_sectors > range2->logical_sector;
+}
+
+static bool add_new_range(struct dm_integrity_c *ic, struct dm_integrity_range *new_range, bool check_waiting)
 {
        struct rb_node **n = &ic->in_progress.rb_node;
        struct rb_node *parent;
 
        BUG_ON((new_range->logical_sector | new_range->n_sectors) & (unsigned)(ic->sectors_per_block - 1));
 
+       if (likely(check_waiting)) {
+               struct dm_integrity_range *range;
+               list_for_each_entry(range, &ic->wait_list, wait_entry) {
+                       if (unlikely(ranges_overlap(range, new_range)))
+                               return false;
+               }
+       }
+
        parent = NULL;
 
        while (*n) {
@@ -898,7 +950,22 @@ static bool add_new_range(struct dm_integrity_c *ic, struct dm_integrity_range *
 static void remove_range_unlocked(struct dm_integrity_c *ic, struct dm_integrity_range *range)
 {
        rb_erase(&range->node, &ic->in_progress);
-       wake_up_locked(&ic->endio_wait);
+       while (unlikely(!list_empty(&ic->wait_list))) {
+               struct dm_integrity_range *last_range =
+                       list_first_entry(&ic->wait_list, struct dm_integrity_range, wait_entry);
+               struct task_struct *last_range_task;
+               if (!ranges_overlap(range, last_range))
+                       break;
+               last_range_task = last_range->task;
+               list_del(&last_range->wait_entry);
+               if (!add_new_range(ic, last_range, false)) {
+                       last_range->task = last_range_task;
+                       list_add(&last_range->wait_entry, &ic->wait_list);
+                       break;
+               }
+               last_range->waiting = false;
+               wake_up_process(last_range_task);
+       }
 }
 
 static void remove_range(struct dm_integrity_c *ic, struct dm_integrity_range *range)
@@ -910,6 +977,19 @@ static void remove_range(struct dm_integrity_c *ic, struct dm_integrity_range *r
        spin_unlock_irqrestore(&ic->endio_wait.lock, flags);
 }
 
+static void wait_and_add_new_range(struct dm_integrity_c *ic, struct dm_integrity_range *new_range)
+{
+       new_range->waiting = true;
+       list_add_tail(&new_range->wait_entry, &ic->wait_list);
+       new_range->task = current;
+       do {
+               __set_current_state(TASK_UNINTERRUPTIBLE);
+               spin_unlock_irq(&ic->endio_wait.lock);
+               io_schedule();
+               spin_lock_irq(&ic->endio_wait.lock);
+       } while (unlikely(new_range->waiting));
+}
+
 static void init_journal_node(struct journal_node *node)
 {
        RB_CLEAR_NODE(&node->node);
@@ -1599,8 +1679,12 @@ retry:
 
                        dio->range.n_sectors = min(dio->range.n_sectors,
                                                   ic->free_sectors << ic->sb->log2_sectors_per_block);
-                       if (unlikely(!dio->range.n_sectors))
-                               goto sleep;
+                       if (unlikely(!dio->range.n_sectors)) {
+                               if (from_map)
+                                       goto offload_to_thread;
+                               sleep_on_endio_wait(ic);
+                               goto retry;
+                       }
                        range_sectors = dio->range.n_sectors >> ic->sb->log2_sectors_per_block;
                        ic->free_sectors -= range_sectors;
                        journal_section = ic->free_section;
@@ -1654,22 +1738,20 @@ retry:
                        }
                }
        }
-       if (unlikely(!add_new_range(ic, &dio->range))) {
+       if (unlikely(!add_new_range(ic, &dio->range, true))) {
                /*
                 * We must not sleep in the request routine because it could
                 * stall bios on current->bio_list.
                 * So, we offload the bio to a workqueue if we have to sleep.
                 */
-sleep:
                if (from_map) {
+offload_to_thread:
                        spin_unlock_irq(&ic->endio_wait.lock);
                        INIT_WORK(&dio->work, integrity_bio_wait);
                        queue_work(ic->wait_wq, &dio->work);
                        return;
-               } else {
-                       sleep_on_endio_wait(ic);
-                       goto retry;
                }
+               wait_and_add_new_range(ic, &dio->range);
        }
        spin_unlock_irq(&ic->endio_wait.lock);
 
@@ -1701,14 +1783,18 @@ sleep:
        bio->bi_end_io = integrity_end_io;
 
        bio->bi_iter.bi_size = dio->range.n_sectors << SECTOR_SHIFT;
-       bio->bi_iter.bi_sector += ic->start;
        generic_make_request(bio);
 
        if (need_sync_io) {
                wait_for_completion_io(&read_comp);
+               if (unlikely(ic->recalc_wq != NULL) &&
+                   ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING) &&
+                   dio->range.logical_sector + dio->range.n_sectors > le64_to_cpu(ic->sb->recalc_sector))
+                       goto skip_check;
                if (likely(!bio->bi_status))
                        integrity_metadata(&dio->work);
                else
+skip_check:
                        dec_in_flight(dio);
 
        } else {
@@ -1892,8 +1978,8 @@ static void do_journal_write(struct dm_integrity_c *ic, unsigned write_start,
                        io->range.n_sectors = (k - j) << ic->sb->log2_sectors_per_block;
 
                        spin_lock_irq(&ic->endio_wait.lock);
-                       while (unlikely(!add_new_range(ic, &io->range)))
-                               sleep_on_endio_wait(ic);
+                       if (unlikely(!add_new_range(ic, &io->range, true)))
+                               wait_and_add_new_range(ic, &io->range);
 
                        if (likely(!from_replay)) {
                                struct journal_node *section_node = &ic->journal_tree[i * ic->journal_section_entries];
@@ -1981,7 +2067,7 @@ static void integrity_writer(struct work_struct *w)
        unsigned prev_free_sectors;
 
        /* the following test is not needed, but it tests the replay code */
-       if (READ_ONCE(ic->suspending))
+       if (READ_ONCE(ic->suspending) && !ic->meta_dev)
                return;
 
        spin_lock_irq(&ic->endio_wait.lock);
@@ -2008,6 +2094,108 @@ static void integrity_writer(struct work_struct *w)
        spin_unlock_irq(&ic->endio_wait.lock);
 }
 
+static void recalc_write_super(struct dm_integrity_c *ic)
+{
+       int r;
+
+       dm_integrity_flush_buffers(ic);
+       if (dm_integrity_failed(ic))
+               return;
+
+       sb_set_version(ic);
+       r = sync_rw_sb(ic, REQ_OP_WRITE, 0);
+       if (unlikely(r))
+               dm_integrity_io_error(ic, "writing superblock", r);
+}
+
+static void integrity_recalc(struct work_struct *w)
+{
+       struct dm_integrity_c *ic = container_of(w, struct dm_integrity_c, recalc_work);
+       struct dm_integrity_range range;
+       struct dm_io_request io_req;
+       struct dm_io_region io_loc;
+       sector_t area, offset;
+       sector_t metadata_block;
+       unsigned metadata_offset;
+       __u8 *t;
+       unsigned i;
+       int r;
+       unsigned super_counter = 0;
+
+       spin_lock_irq(&ic->endio_wait.lock);
+
+next_chunk:
+
+       if (unlikely(READ_ONCE(ic->suspending)))
+               goto unlock_ret;
+
+       range.logical_sector = le64_to_cpu(ic->sb->recalc_sector);
+       if (unlikely(range.logical_sector >= ic->provided_data_sectors))
+               goto unlock_ret;
+
+       get_area_and_offset(ic, range.logical_sector, &area, &offset);
+       range.n_sectors = min((sector_t)RECALC_SECTORS, ic->provided_data_sectors - range.logical_sector);
+       if (!ic->meta_dev)
+               range.n_sectors = min(range.n_sectors, (1U << ic->sb->log2_interleave_sectors) - (unsigned)offset);
+
+       if (unlikely(!add_new_range(ic, &range, true)))
+               wait_and_add_new_range(ic, &range);
+
+       spin_unlock_irq(&ic->endio_wait.lock);
+
+       if (unlikely(++super_counter == RECALC_WRITE_SUPER)) {
+               recalc_write_super(ic);
+               super_counter = 0;
+       }
+
+       if (unlikely(dm_integrity_failed(ic)))
+               goto err;
+
+       io_req.bi_op = REQ_OP_READ;
+       io_req.bi_op_flags = 0;
+       io_req.mem.type = DM_IO_VMA;
+       io_req.mem.ptr.addr = ic->recalc_buffer;
+       io_req.notify.fn = NULL;
+       io_req.client = ic->io;
+       io_loc.bdev = ic->dev->bdev;
+       io_loc.sector = get_data_sector(ic, area, offset);
+       io_loc.count = range.n_sectors;
+
+       r = dm_io(&io_req, 1, &io_loc, NULL);
+       if (unlikely(r)) {
+               dm_integrity_io_error(ic, "reading data", r);
+               goto err;
+       }
+
+       t = ic->recalc_tags;
+       for (i = 0; i < range.n_sectors; i += ic->sectors_per_block) {
+               integrity_sector_checksum(ic, range.logical_sector + i, ic->recalc_buffer + (i << SECTOR_SHIFT), t);
+               t += ic->tag_size;
+       }
+
+       metadata_block = get_metadata_sector_and_offset(ic, area, offset, &metadata_offset);
+
+       r = dm_integrity_rw_tag(ic, ic->recalc_tags, &metadata_block, &metadata_offset, t - ic->recalc_tags, TAG_WRITE);
+       if (unlikely(r)) {
+               dm_integrity_io_error(ic, "writing tags", r);
+               goto err;
+       }
+
+       spin_lock_irq(&ic->endio_wait.lock);
+       remove_range_unlocked(ic, &range);
+       ic->sb->recalc_sector = cpu_to_le64(range.logical_sector + range.n_sectors);
+       goto next_chunk;
+
+err:
+       remove_range(ic, &range);
+       return;
+
+unlock_ret:
+       spin_unlock_irq(&ic->endio_wait.lock);
+
+       recalc_write_super(ic);
+}
+
 static void init_journal(struct dm_integrity_c *ic, unsigned start_section,
                         unsigned n_sections, unsigned char commit_seq)
 {
@@ -2210,17 +2398,22 @@ static void dm_integrity_postsuspend(struct dm_target *ti)
 
        del_timer_sync(&ic->autocommit_timer);
 
-       ic->suspending = true;
+       WRITE_ONCE(ic->suspending, 1);
+
+       if (ic->recalc_wq)
+               drain_workqueue(ic->recalc_wq);
 
        queue_work(ic->commit_wq, &ic->commit_work);
        drain_workqueue(ic->commit_wq);
 
        if (ic->mode == 'J') {
+               if (ic->meta_dev)
+                       queue_work(ic->writer_wq, &ic->writer_work);
                drain_workqueue(ic->writer_wq);
                dm_integrity_flush_buffers(ic);
        }
 
-       ic->suspending = false;
+       WRITE_ONCE(ic->suspending, 0);
 
        BUG_ON(!RB_EMPTY_ROOT(&ic->in_progress));
 
@@ -2232,6 +2425,16 @@ static void dm_integrity_resume(struct dm_target *ti)
        struct dm_integrity_c *ic = (struct dm_integrity_c *)ti->private;
 
        replay_journal(ic);
+
+       if (ic->recalc_wq && ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING)) {
+               __u64 recalc_pos = le64_to_cpu(ic->sb->recalc_sector);
+               if (recalc_pos < ic->provided_data_sectors) {
+                       queue_work(ic->recalc_wq, &ic->recalc_work);
+               } else if (recalc_pos > ic->provided_data_sectors) {
+                       ic->sb->recalc_sector = cpu_to_le64(ic->provided_data_sectors);
+                       recalc_write_super(ic);
+               }
+       }
 }
 
 static void dm_integrity_status(struct dm_target *ti, status_type_t type,
@@ -2243,7 +2446,13 @@ static void dm_integrity_status(struct dm_target *ti, status_type_t type,
 
        switch (type) {
        case STATUSTYPE_INFO:
-               DMEMIT("%llu", (unsigned long long)atomic64_read(&ic->number_of_mismatches));
+               DMEMIT("%llu %llu",
+                       (unsigned long long)atomic64_read(&ic->number_of_mismatches),
+                       (unsigned long long)ic->provided_data_sectors);
+               if (ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING))
+                       DMEMIT(" %llu", (unsigned long long)le64_to_cpu(ic->sb->recalc_sector));
+               else
+                       DMEMIT(" -");
                break;
 
        case STATUSTYPE_TABLE: {
@@ -2251,19 +2460,25 @@ static void dm_integrity_status(struct dm_target *ti, status_type_t type,
                watermark_percentage += ic->journal_entries / 2;
                do_div(watermark_percentage, ic->journal_entries);
                arg_count = 5;
+               arg_count += !!ic->meta_dev;
                arg_count += ic->sectors_per_block != 1;
+               arg_count += !!(ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING));
                arg_count += !!ic->internal_hash_alg.alg_string;
                arg_count += !!ic->journal_crypt_alg.alg_string;
                arg_count += !!ic->journal_mac_alg.alg_string;
                DMEMIT("%s %llu %u %c %u", ic->dev->name, (unsigned long long)ic->start,
                       ic->tag_size, ic->mode, arg_count);
+               if (ic->meta_dev)
+                       DMEMIT(" meta_device:%s", ic->meta_dev->name);
+               if (ic->sectors_per_block != 1)
+                       DMEMIT(" block_size:%u", ic->sectors_per_block << SECTOR_SHIFT);
+               if (ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING))
+                       DMEMIT(" recalculate");
                DMEMIT(" journal_sectors:%u", ic->initial_sectors - SB_SECTORS);
                DMEMIT(" interleave_sectors:%u", 1U << ic->sb->log2_interleave_sectors);
                DMEMIT(" buffer_sectors:%u", 1U << ic->log2_buffer_sectors);
                DMEMIT(" journal_watermark:%u", (unsigned)watermark_percentage);
                DMEMIT(" commit_time:%u", ic->autocommit_msec);
-               if (ic->sectors_per_block != 1)
-                       DMEMIT(" block_size:%u", ic->sectors_per_block << SECTOR_SHIFT);
 
 #define EMIT_ALG(a, n)                                                 \
                do {                                                    \
@@ -2286,7 +2501,10 @@ static int dm_integrity_iterate_devices(struct dm_target *ti,
 {
        struct dm_integrity_c *ic = ti->private;
 
-       return fn(ti, ic->dev, ic->start + ic->initial_sectors + ic->metadata_run, ti->len, data);
+       if (!ic->meta_dev)
+               return fn(ti, ic->dev, ic->start + ic->initial_sectors + ic->metadata_run, ti->len, data);
+       else
+               return fn(ti, ic->dev, 0, ti->len, data);
 }
 
 static void dm_integrity_io_hints(struct dm_target *ti, struct queue_limits *limits)
@@ -2319,26 +2537,38 @@ static void calculate_journal_section_size(struct dm_integrity_c *ic)
 static int calculate_device_limits(struct dm_integrity_c *ic)
 {
        __u64 initial_sectors;
-       sector_t last_sector, last_area, last_offset;
 
        calculate_journal_section_size(ic);
        initial_sectors = SB_SECTORS + (__u64)ic->journal_section_sectors * ic->journal_sections;
-       if (initial_sectors + METADATA_PADDING_SECTORS >= ic->device_sectors || initial_sectors > UINT_MAX)
+       if (initial_sectors + METADATA_PADDING_SECTORS >= ic->meta_device_sectors || initial_sectors > UINT_MAX)
                return -EINVAL;
        ic->initial_sectors = initial_sectors;
 
-       ic->metadata_run = roundup((__u64)ic->tag_size << (ic->sb->log2_interleave_sectors - ic->sb->log2_sectors_per_block),
-                                  (__u64)(1 << SECTOR_SHIFT << METADATA_PADDING_SECTORS)) >> SECTOR_SHIFT;
-       if (!(ic->metadata_run & (ic->metadata_run - 1)))
-               ic->log2_metadata_run = __ffs(ic->metadata_run);
-       else
-               ic->log2_metadata_run = -1;
+       if (!ic->meta_dev) {
+               sector_t last_sector, last_area, last_offset;
 
-       get_area_and_offset(ic, ic->provided_data_sectors - 1, &last_area, &last_offset);
-       last_sector = get_data_sector(ic, last_area, last_offset);
+               ic->metadata_run = roundup((__u64)ic->tag_size << (ic->sb->log2_interleave_sectors - ic->sb->log2_sectors_per_block),
+                                          (__u64)(1 << SECTOR_SHIFT << METADATA_PADDING_SECTORS)) >> SECTOR_SHIFT;
+               if (!(ic->metadata_run & (ic->metadata_run - 1)))
+                       ic->log2_metadata_run = __ffs(ic->metadata_run);
+               else
+                       ic->log2_metadata_run = -1;
 
-       if (ic->start + last_sector < last_sector || ic->start + last_sector >= ic->device_sectors)
-               return -EINVAL;
+               get_area_and_offset(ic, ic->provided_data_sectors - 1, &last_area, &last_offset);
+               last_sector = get_data_sector(ic, last_area, last_offset);
+               if (last_sector < ic->start || last_sector >= ic->meta_device_sectors)
+                       return -EINVAL;
+       } else {
+               __u64 meta_size = ic->provided_data_sectors * ic->tag_size;
+               meta_size = (meta_size + ((1U << (ic->log2_buffer_sectors + SECTOR_SHIFT)) - 1))
+                               >> (ic->log2_buffer_sectors + SECTOR_SHIFT);
+               meta_size <<= ic->log2_buffer_sectors;
+               if (ic->initial_sectors + meta_size < ic->initial_sectors ||
+                   ic->initial_sectors + meta_size > ic->meta_device_sectors)
+                       return -EINVAL;
+               ic->metadata_run = 1;
+               ic->log2_metadata_run = 0;
+       }
 
        return 0;
 }
@@ -2350,7 +2580,6 @@ static int initialize_superblock(struct dm_integrity_c *ic, unsigned journal_sec
 
        memset(ic->sb, 0, SB_SECTORS << SECTOR_SHIFT);
        memcpy(ic->sb->magic, SB_MAGIC, 8);
-       ic->sb->version = SB_VERSION;
        ic->sb->integrity_tag_size = cpu_to_le16(ic->tag_size);
        ic->sb->log2_sectors_per_block = __ffs(ic->sectors_per_block);
        if (ic->journal_mac_alg.alg_string)
@@ -2360,28 +2589,55 @@ static int initialize_superblock(struct dm_integrity_c *ic, unsigned journal_sec
        journal_sections = journal_sectors / ic->journal_section_sectors;
        if (!journal_sections)
                journal_sections = 1;
-       ic->sb->journal_sections = cpu_to_le32(journal_sections);
 
-       if (!interleave_sectors)
-               interleave_sectors = DEFAULT_INTERLEAVE_SECTORS;
-       ic->sb->log2_interleave_sectors = __fls(interleave_sectors);
-       ic->sb->log2_interleave_sectors = max((__u8)MIN_LOG2_INTERLEAVE_SECTORS, ic->sb->log2_interleave_sectors);
-       ic->sb->log2_interleave_sectors = min((__u8)MAX_LOG2_INTERLEAVE_SECTORS, ic->sb->log2_interleave_sectors);
-
-       ic->provided_data_sectors = 0;
-       for (test_bit = fls64(ic->device_sectors) - 1; test_bit >= 3; test_bit--) {
-               __u64 prev_data_sectors = ic->provided_data_sectors;
+       if (!ic->meta_dev) {
+               ic->sb->journal_sections = cpu_to_le32(journal_sections);
+               if (!interleave_sectors)
+                       interleave_sectors = DEFAULT_INTERLEAVE_SECTORS;
+               ic->sb->log2_interleave_sectors = __fls(interleave_sectors);
+               ic->sb->log2_interleave_sectors = max((__u8)MIN_LOG2_INTERLEAVE_SECTORS, ic->sb->log2_interleave_sectors);
+               ic->sb->log2_interleave_sectors = min((__u8)MAX_LOG2_INTERLEAVE_SECTORS, ic->sb->log2_interleave_sectors);
+
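+               /*
+                * Grow provided_data_sectors greedily, highest bit first,
+                * keeping each bit only if calculate_device_limits() still
+                * succeeds.
+                */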
+               ic->provided_data_sectors = 0;
+               for (test_bit = fls64(ic->meta_device_sectors) - 1; test_bit >= 3; test_bit--) {
+                       __u64 prev_data_sectors = ic->provided_data_sectors;
+
+                       ic->provided_data_sectors |= (sector_t)1 << test_bit;
+                       if (calculate_device_limits(ic))
+                               ic->provided_data_sectors = prev_data_sectors;
+               }
+               if (!ic->provided_data_sectors)
+                       return -EINVAL;
+       } else {
+               ic->sb->log2_interleave_sectors = 0;
+               ic->provided_data_sectors = ic->data_device_sectors;
+               ic->provided_data_sectors &= ~(sector_t)(ic->sectors_per_block - 1);
+
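+               /*
+                * Find the largest journal_sections value, built bit by bit
+                * from the top, that calculate_device_limits() accepts; if
+                * no journal fits at all, retry with a smaller buffer size.
+                */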
+try_smaller_buffer:
+               ic->sb->journal_sections = cpu_to_le32(0);
+               for (test_bit = fls(journal_sections) - 1; test_bit >= 0; test_bit--) {
+                       __u32 prev_journal_sections = le32_to_cpu(ic->sb->journal_sections);
+                       __u32 test_journal_sections = prev_journal_sections | (1U << test_bit);
+                       if (test_journal_sections > journal_sections)
+                               continue;
+                       ic->sb->journal_sections = cpu_to_le32(test_journal_sections);
+                       if (calculate_device_limits(ic))
+                               ic->sb->journal_sections = cpu_to_le32(prev_journal_sections);
 
-               ic->provided_data_sectors |= (sector_t)1 << test_bit;
-               if (calculate_device_limits(ic))
-                       ic->provided_data_sectors = prev_data_sectors;
+               }
+               if (!le32_to_cpu(ic->sb->journal_sections)) {
+                       if (ic->log2_buffer_sectors > 3) {
+                               ic->log2_buffer_sectors--;
+                               goto try_smaller_buffer;
+                       }
+                       return -EINVAL;
+               }
        }
 
-       if (!ic->provided_data_sectors)
-               return -EINVAL;
-
        ic->sb->provided_data_sectors = cpu_to_le64(ic->provided_data_sectors);
 
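+       /* version 2 is used when a separate metadata device is present */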
+       sb_set_version(ic);
+
        return 0;
 }
 
@@ -2828,6 +3084,7 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv)
                {0, 9, "Invalid number of feature args"},
        };
        unsigned journal_sectors, interleave_sectors, buffer_sectors, journal_watermark, sync_msec;
+       bool recalculate;
        bool should_write_sb;
        __u64 threshold;
        unsigned long long start;
@@ -2848,6 +3105,7 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv)
        ti->per_io_data_size = sizeof(struct dm_integrity_io);
 
        ic->in_progress = RB_ROOT;
+       INIT_LIST_HEAD(&ic->wait_list);
        init_waitqueue_head(&ic->endio_wait);
        bio_list_init(&ic->flush_bio_list);
        init_waitqueue_head(&ic->copy_to_journal_wait);
@@ -2883,13 +3141,12 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv)
                goto bad;
        }
 
-       ic->device_sectors = i_size_read(ic->dev->bdev->bd_inode) >> SECTOR_SHIFT;
-       journal_sectors = min((sector_t)DEFAULT_MAX_JOURNAL_SECTORS,
-                       ic->device_sectors >> DEFAULT_JOURNAL_SIZE_FACTOR);
+       journal_sectors = 0;
        interleave_sectors = DEFAULT_INTERLEAVE_SECTORS;
        buffer_sectors = DEFAULT_BUFFER_SECTORS;
        journal_watermark = DEFAULT_JOURNAL_WATERMARK;
        sync_msec = DEFAULT_SYNC_MSEC;
+       recalculate = false;
        ic->sectors_per_block = 1;
 
        as.argc = argc - DIRECT_ARGUMENTS;
@@ -2908,7 +3165,7 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv)
                        goto bad;
                }
                if (sscanf(opt_string, "journal_sectors:%u%c", &val, &dummy) == 1)
-                       journal_sectors = val;
+                       journal_sectors = val ? val : 1;
                else if (sscanf(opt_string, "interleave_sectors:%u%c", &val, &dummy) == 1)
                        interleave_sectors = val;
                else if (sscanf(opt_string, "buffer_sectors:%u%c", &val, &dummy) == 1)
@@ -2917,7 +3174,17 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv)
                        journal_watermark = val;
                else if (sscanf(opt_string, "commit_time:%u%c", &val, &dummy) == 1)
                        sync_msec = val;
-               else if (sscanf(opt_string, "block_size:%u%c", &val, &dummy) == 1) {
+               else if (!memcmp(opt_string, "meta_device:", strlen("meta_device:"))) {
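+                       /* a later meta_device: argument replaces the previous one */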
+                       if (ic->meta_dev) {
+                               dm_put_device(ti, ic->meta_dev);
+                               ic->meta_dev = NULL;
+                       }
+                       r = dm_get_device(ti, strchr(opt_string, ':') + 1, dm_table_get_mode(ti->table), &ic->meta_dev);
+                       if (r) {
+                               ti->error = "Device lookup failed";
+                               goto bad;
+                       }
+               } else if (sscanf(opt_string, "block_size:%u%c", &val, &dummy) == 1) {
                        if (val < 1 << SECTOR_SHIFT ||
                            val > MAX_SECTORS_PER_BLOCK << SECTOR_SHIFT ||
                            (val & (val - 1))) {
@@ -2941,6 +3208,8 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv)
                                            "Invalid journal_mac argument");
                        if (r)
                                goto bad;
+               } else if (!strcmp(opt_string, "recalculate")) {
+                       recalculate = true;
                } else {
                        r = -EINVAL;
                        ti->error = "Invalid argument";
@@ -2948,6 +3217,21 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv)
                }
        }
 
+       ic->data_device_sectors = i_size_read(ic->dev->bdev->bd_inode) >> SECTOR_SHIFT;
+       if (!ic->meta_dev)
+               ic->meta_device_sectors = ic->data_device_sectors;
+       else
+               ic->meta_device_sectors = i_size_read(ic->meta_dev->bdev->bd_inode) >> SECTOR_SHIFT;
+
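+       /*
+        * journal_sectors is still zero only if no journal_sectors argument
+        * was given; base the default on the data device size, which is
+        * known only after argument parsing.
+        */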
+       if (!journal_sectors) {
+               journal_sectors = min((sector_t)DEFAULT_MAX_JOURNAL_SECTORS,
+                       ic->data_device_sectors >> DEFAULT_JOURNAL_SIZE_FACTOR);
+       }
+
+       if (!buffer_sectors)
+               buffer_sectors = 1;
+       ic->log2_buffer_sectors = min((int)__fls(buffer_sectors), 31 - SECTOR_SHIFT);
+
        r = get_mac(&ic->internal_hash, &ic->internal_hash_alg, &ti->error,
                    "Invalid internal hash", "Error setting internal hash key");
        if (r)
@@ -3062,7 +3346,7 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv)
                        should_write_sb = true;
        }
 
-       if (ic->sb->version != SB_VERSION) {
+       if (!ic->sb->version || ic->sb->version > SB_VERSION_2) {
                r = -EINVAL;
                ti->error = "Unknown version";
                goto bad;
@@ -3083,11 +3367,19 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv)
                goto bad;
        }
        /* make sure that ti->max_io_len doesn't overflow */
-       if (ic->sb->log2_interleave_sectors < MIN_LOG2_INTERLEAVE_SECTORS ||
-           ic->sb->log2_interleave_sectors > MAX_LOG2_INTERLEAVE_SECTORS) {
-               r = -EINVAL;
-               ti->error = "Invalid interleave_sectors in the superblock";
-               goto bad;
+       if (!ic->meta_dev) {
+               if (ic->sb->log2_interleave_sectors < MIN_LOG2_INTERLEAVE_SECTORS ||
+                   ic->sb->log2_interleave_sectors > MAX_LOG2_INTERLEAVE_SECTORS) {
+                       r = -EINVAL;
+                       ti->error = "Invalid interleave_sectors in the superblock";
+                       goto bad;
+               }
+       } else {
+               if (ic->sb->log2_interleave_sectors) {
+                       r = -EINVAL;
+                       ti->error = "Invalid interleave_sectors in the superblock";
+                       goto bad;
+               }
        }
        ic->provided_data_sectors = le64_to_cpu(ic->sb->provided_data_sectors);
        if (ic->provided_data_sectors != le64_to_cpu(ic->sb->provided_data_sectors)) {
@@ -3101,20 +3393,28 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv)
                ti->error = "Journal mac mismatch";
                goto bad;
        }
+
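+       /*
+        * With a separate metadata device, a smaller buffer size may let
+        * the metadata fit; retry before reporting failure.
+        */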
+try_smaller_buffer:
        r = calculate_device_limits(ic);
        if (r) {
+               if (ic->meta_dev) {
+                       if (ic->log2_buffer_sectors > 3) {
+                               ic->log2_buffer_sectors--;
+                               goto try_smaller_buffer;
+                       }
+               }
                ti->error = "The device is too small";
                goto bad;
        }
+       if (!ic->meta_dev)
+               ic->log2_buffer_sectors = min(ic->log2_buffer_sectors, (__u8)__ffs(ic->metadata_run));
+
        if (ti->len > ic->provided_data_sectors) {
                r = -EINVAL;
                ti->error = "Not enough provided sectors for requested mapping size";
                goto bad;
        }
 
-       if (!buffer_sectors)
-               buffer_sectors = 1;
-       ic->log2_buffer_sectors = min3((int)__fls(buffer_sectors), (int)__ffs(ic->metadata_run), 31 - SECTOR_SHIFT);
 
        threshold = (__u64)ic->journal_entries * (100 - journal_watermark);
        threshold += 50;
@@ -3138,8 +3438,40 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv)
                    (unsigned long long)ic->provided_data_sectors);
        DEBUG_print("   log2_buffer_sectors %u\n", ic->log2_buffer_sectors);
 
-       ic->bufio = dm_bufio_client_create(ic->dev->bdev, 1U << (SECTOR_SHIFT + ic->log2_buffer_sectors),
-                                          1, 0, NULL, NULL);
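+       /*
+        * "recalculate" starts background recalculation of checksums; the
+        * flag and the progress pointer (recalc_sector) are kept in the
+        * superblock so that an interrupted run can resume.
+        */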
+       if (recalculate && !(ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING))) {
+               ic->sb->flags |= cpu_to_le32(SB_FLAG_RECALCULATING);
+               ic->sb->recalc_sector = cpu_to_le64(0);
+       }
+
+       if (ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING)) {
+               if (!ic->internal_hash) {
+                       r = -EINVAL;
+                       ti->error = "Recalculate is only valid with internal hash";
+                       goto bad;
+               }
+               ic->recalc_wq = alloc_workqueue("dm-integrity-recalc", WQ_MEM_RECLAIM, 1);
+               if (!ic->recalc_wq) {
+                       ti->error = "Cannot allocate workqueue";
+                       r = -ENOMEM;
+                       goto bad;
+               }
+               INIT_WORK(&ic->recalc_work, integrity_recalc);
+               ic->recalc_buffer = vmalloc(RECALC_SECTORS << SECTOR_SHIFT);
+               if (!ic->recalc_buffer) {
+                       ti->error = "Cannot allocate buffer for recalculating";
+                       r = -ENOMEM;
+                       goto bad;
+               }
+               ic->recalc_tags = kvmalloc((RECALC_SECTORS >> ic->sb->log2_sectors_per_block) * ic->tag_size, GFP_KERNEL);
+               if (!ic->recalc_tags) {
+                       ti->error = "Cannot allocate tags for recalculating";
+                       r = -ENOMEM;
+                       goto bad;
+               }
+       }
+
+       ic->bufio = dm_bufio_client_create(ic->meta_dev ? ic->meta_dev->bdev : ic->dev->bdev,
+                       1U << (SECTOR_SHIFT + ic->log2_buffer_sectors), 1, 0, NULL, NULL);
        if (IS_ERR(ic->bufio)) {
                r = PTR_ERR(ic->bufio);
                ti->error = "Cannot initialize dm-bufio";
@@ -3171,9 +3503,11 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv)
                ic->just_formatted = true;
        }
 
-       r = dm_set_target_max_io_len(ti, 1U << ic->sb->log2_interleave_sectors);
-       if (r)
-               goto bad;
+       if (!ic->meta_dev) {
+               r = dm_set_target_max_io_len(ti, 1U << ic->sb->log2_interleave_sectors);
+               if (r)
+                       goto bad;
+       }
 
        if (!ic->internal_hash)
                dm_integrity_set(ti, ic);
@@ -3192,6 +3526,7 @@ static void dm_integrity_dtr(struct dm_target *ti)
        struct dm_integrity_c *ic = ti->private;
 
        BUG_ON(!RB_EMPTY_ROOT(&ic->in_progress));
+       BUG_ON(!list_empty(&ic->wait_list));
 
        if (ic->metadata_wq)
                destroy_workqueue(ic->metadata_wq);
@@ -3201,6 +3536,12 @@ static void dm_integrity_dtr(struct dm_target *ti)
                destroy_workqueue(ic->commit_wq);
        if (ic->writer_wq)
                destroy_workqueue(ic->writer_wq);
+       if (ic->recalc_wq)
+               destroy_workqueue(ic->recalc_wq);
+       if (ic->recalc_buffer)
+               vfree(ic->recalc_buffer);
+       if (ic->recalc_tags)
+               kvfree(ic->recalc_tags);
        if (ic->bufio)
                dm_bufio_client_destroy(ic->bufio);
        mempool_exit(&ic->journal_io_mempool);
@@ -3208,6 +3549,8 @@ static void dm_integrity_dtr(struct dm_target *ti)
                dm_io_client_destroy(ic->io);
        if (ic->dev)
                dm_put_device(ti, ic->dev);
+       if (ic->meta_dev)
+               dm_put_device(ti, ic->meta_dev);
        dm_integrity_free_page_list(ic, ic->journal);
        dm_integrity_free_page_list(ic, ic->journal_io);
        dm_integrity_free_page_list(ic, ic->journal_xor);
@@ -3248,7 +3591,7 @@ static void dm_integrity_dtr(struct dm_target *ti)
 
 static struct target_type integrity_target = {
        .name                   = "integrity",
-       .version                = {1, 1, 0},
+       .version                = {1, 2, 0},
        .module                 = THIS_MODULE,
        .features               = DM_TARGET_SINGLETON | DM_TARGET_INTEGRITY,
        .ctr                    = dm_integrity_ctr,
index 3c7547a3c3715f38ed7fa3e265ed5dea8f6183d0..2fc4213e02b5fa396c4fc380d0b6d886750e1d17 100644 (file)
@@ -487,6 +487,8 @@ static int run_complete_job(struct kcopyd_job *job)
        if (atomic_dec_and_test(&kc->nr_jobs))
                wake_up(&kc->destroyq);
 
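+       /*
+        * run_complete_job() can be called for many completed jobs in a
+        * row; give up the CPU periodically to avoid soft lockups.
+        */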
+       cond_resched();
+
        return 0;
 }
 
@@ -741,9 +743,9 @@ static void split_job(struct kcopyd_job *master_job)
        }
 }
 
-int dm_kcopyd_copy(struct dm_kcopyd_client *kc, struct dm_io_region *from,
-                  unsigned int num_dests, struct dm_io_region *dests,
-                  unsigned int flags, dm_kcopyd_notify_fn fn, void *context)
+void dm_kcopyd_copy(struct dm_kcopyd_client *kc, struct dm_io_region *from,
+                   unsigned int num_dests, struct dm_io_region *dests,
+                   unsigned int flags, dm_kcopyd_notify_fn fn, void *context)
 {
        struct kcopyd_job *job;
        int i;
@@ -818,16 +820,14 @@ int dm_kcopyd_copy(struct dm_kcopyd_client *kc, struct dm_io_region *from,
                job->progress = 0;
                split_job(job);
        }
-
-       return 0;
 }
 EXPORT_SYMBOL(dm_kcopyd_copy);
 
-int dm_kcopyd_zero(struct dm_kcopyd_client *kc,
-                  unsigned num_dests, struct dm_io_region *dests,
-                  unsigned flags, dm_kcopyd_notify_fn fn, void *context)
+void dm_kcopyd_zero(struct dm_kcopyd_client *kc,
+                   unsigned num_dests, struct dm_io_region *dests,
+                   unsigned flags, dm_kcopyd_notify_fn fn, void *context)
 {
-       return dm_kcopyd_copy(kc, NULL, num_dests, dests, flags, fn, context);
+       dm_kcopyd_copy(kc, NULL, num_dests, dests, flags, fn, context);
 }
 EXPORT_SYMBOL(dm_kcopyd_zero);
 
index 5903e492bb34a307deee617726d91464d7e3d462..79eab1071ec22ba73458cc99c5841de30f2105b0 100644 (file)
@@ -326,9 +326,8 @@ static void recovery_complete(int read_err, unsigned long write_err,
        dm_rh_recovery_end(reg, !(read_err || write_err));
 }
 
-static int recover(struct mirror_set *ms, struct dm_region *reg)
+static void recover(struct mirror_set *ms, struct dm_region *reg)
 {
-       int r;
        unsigned i;
        struct dm_io_region from, to[DM_KCOPYD_MAX_REGIONS], *dest;
        struct mirror *m;
@@ -367,10 +366,8 @@ static int recover(struct mirror_set *ms, struct dm_region *reg)
        if (!errors_handled(ms))
                set_bit(DM_KCOPYD_IGNORE_ERROR, &flags);
 
-       r = dm_kcopyd_copy(ms->kcopyd_client, &from, ms->nr_mirrors - 1, to,
-                          flags, recovery_complete, reg);
-
-       return r;
+       dm_kcopyd_copy(ms->kcopyd_client, &from, ms->nr_mirrors - 1, to,
+                      flags, recovery_complete, reg);
 }
 
 static void reset_ms_flags(struct mirror_set *ms)
@@ -388,7 +385,6 @@ static void do_recovery(struct mirror_set *ms)
 {
        struct dm_region *reg;
        struct dm_dirty_log *log = dm_rh_dirty_log(ms->rh);
-       int r;
 
        /*
         * Start quiescing some regions.
@@ -398,11 +394,8 @@ static void do_recovery(struct mirror_set *ms)
        /*
         * Copy any already quiesced regions.
         */
-       while ((reg = dm_rh_recovery_start(ms->rh))) {
-               r = recover(ms, reg);
-               if (r)
-                       dm_rh_recovery_end(reg, 0);
-       }
+       while ((reg = dm_rh_recovery_start(ms->rh)))
+               recover(ms, reg);
 
        /*
         * Update the in sync flag.
index 97de7a7334d4c59a0e051372b57e8538b9182e5a..ae4b33d109246e305c73f8231d122ce5b5cfdb41 100644 (file)
@@ -85,7 +85,7 @@ struct dm_snapshot {
         * A list of pending exceptions that completed out of order.
         * Protected by kcopyd single-threaded callback.
         */
-       struct list_head out_of_order_list;
+       struct rb_root out_of_order_tree;
 
        mempool_t pending_pool;
 
@@ -200,7 +200,7 @@ struct dm_snap_pending_exception {
        /* A sequence number, it is used for in-order completion. */
        sector_t exception_sequence;
 
-       struct list_head out_of_order_entry;
+       struct rb_node out_of_order_node;
 
        /*
         * For writing a complete chunk, bypassing the copy.
@@ -1173,7 +1173,7 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
        atomic_set(&s->pending_exceptions_count, 0);
        s->exception_start_sequence = 0;
        s->exception_complete_sequence = 0;
-       INIT_LIST_HEAD(&s->out_of_order_list);
+       s->out_of_order_tree = RB_ROOT;
        mutex_init(&s->lock);
        INIT_LIST_HEAD(&s->list);
        spin_lock_init(&s->pe_lock);
@@ -1539,28 +1539,41 @@ static void copy_callback(int read_err, unsigned long write_err, void *context)
        pe->copy_error = read_err || write_err;
 
        if (pe->exception_sequence == s->exception_complete_sequence) {
+               struct rb_node *next;
+
                s->exception_complete_sequence++;
                complete_exception(pe);
 
-               while (!list_empty(&s->out_of_order_list)) {
-                       pe = list_entry(s->out_of_order_list.next,
-                                       struct dm_snap_pending_exception, out_of_order_entry);
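+               /*
+                * Complete, in order, any queued exceptions that are now
+                * next in sequence; rb_first() has the lowest sequence.
+                */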
+               next = rb_first(&s->out_of_order_tree);
+               while (next) {
+                       pe = rb_entry(next, struct dm_snap_pending_exception,
+                                       out_of_order_node);
                        if (pe->exception_sequence != s->exception_complete_sequence)
                                break;
+                       next = rb_next(next);
                        s->exception_complete_sequence++;
-                       list_del(&pe->out_of_order_entry);
+                       rb_erase(&pe->out_of_order_node, &s->out_of_order_tree);
                        complete_exception(pe);
+                       cond_resched();
                }
        } else {
-               struct list_head *lh;
+               struct rb_node *parent = NULL;
+               struct rb_node **p = &s->out_of_order_tree.rb_node;
                struct dm_snap_pending_exception *pe2;
 
-               list_for_each_prev(lh, &s->out_of_order_list) {
-                       pe2 = list_entry(lh, struct dm_snap_pending_exception, out_of_order_entry);
-                       if (pe2->exception_sequence < pe->exception_sequence)
-                               break;
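+               /* insert into the rbtree, ordered by exception_sequence */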
+               while (*p) {
+                       pe2 = rb_entry(*p, struct dm_snap_pending_exception, out_of_order_node);
+                       parent = *p;
+
+                       BUG_ON(pe->exception_sequence == pe2->exception_sequence);
+                       if (pe->exception_sequence < pe2->exception_sequence)
+                               p = &((*p)->rb_left);
+                       else
+                               p = &((*p)->rb_right);
                }
-               list_add(&pe->out_of_order_entry, lh);
+
+               rb_link_node(&pe->out_of_order_node, parent, p);
+               rb_insert_color(&pe->out_of_order_node, &s->out_of_order_tree);
        }
 }
 
@@ -1694,8 +1707,6 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio)
        if (!s->valid)
                return DM_MAPIO_KILL;
 
-       /* FIXME: should only take write lock if we need
-        * to copy an exception */
        mutex_lock(&s->lock);
 
        if (!s->valid || (unlikely(s->snapshot_overflowed) &&
index b900723bbd0fae4845a17ef67dadcf33dc5cc67b..7bd60a150f8faec85071cf1d984bd35d27b08391 100644 (file)
@@ -1220,18 +1220,13 @@ static struct dm_thin_new_mapping *get_next_mapping(struct pool *pool)
 static void ll_zero(struct thin_c *tc, struct dm_thin_new_mapping *m,
                    sector_t begin, sector_t end)
 {
-       int r;
        struct dm_io_region to;
 
        to.bdev = tc->pool_dev->bdev;
        to.sector = begin;
        to.count = end - begin;
 
-       r = dm_kcopyd_zero(tc->pool->copier, 1, &to, 0, copy_complete, m);
-       if (r < 0) {
-               DMERR_LIMIT("dm_kcopyd_zero() failed");
-               copy_complete(1, 1, m);
-       }
+       dm_kcopyd_zero(tc->pool->copier, 1, &to, 0, copy_complete, m);
 }
 
 static void remap_and_issue_overwrite(struct thin_c *tc, struct bio *bio,
@@ -1257,7 +1252,6 @@ static void schedule_copy(struct thin_c *tc, dm_block_t virt_block,
                          struct dm_bio_prison_cell *cell, struct bio *bio,
                          sector_t len)
 {
-       int r;
        struct pool *pool = tc->pool;
        struct dm_thin_new_mapping *m = get_next_mapping(pool);
 
@@ -1296,19 +1290,8 @@ static void schedule_copy(struct thin_c *tc, dm_block_t virt_block,
                to.sector = data_dest * pool->sectors_per_block;
                to.count = len;
 
-               r = dm_kcopyd_copy(pool->copier, &from, 1, &to,
-                                  0, copy_complete, m);
-               if (r < 0) {
-                       DMERR_LIMIT("dm_kcopyd_copy() failed");
-                       copy_complete(1, 1, m);
-
-                       /*
-                        * We allow the zero to be issued, to simplify the
-                        * error path.  Otherwise we'd need to start
-                        * worrying about decrementing the prepare_actions
-                        * counter.
-                        */
-               }
+               dm_kcopyd_copy(pool->copier, &from, 1, &to,
+                              0, copy_complete, m);
 
                /*
                 * Do we need to zero a tail region?
@@ -2520,6 +2503,8 @@ static void set_pool_mode(struct pool *pool, enum pool_mode new_mode)
        case PM_WRITE:
                if (old_mode != new_mode)
                        notify_of_pool_mode_change(pool, "write");
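+               /*
+                * A no_space_timeout queued while the pool was out of data
+                * space must not fire once we are back in write mode.
+                */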
+               if (old_mode == PM_OUT_OF_DATA_SPACE)
+                       cancel_delayed_work_sync(&pool->no_space_timeout);
                pool->out_of_data_space = false;
                pool->pf.error_if_no_space = pt->requested_pf.error_if_no_space;
                dm_pool_metadata_read_write(pool->pmd);
@@ -3890,6 +3875,8 @@ static void pool_status(struct dm_target *ti, status_type_t type,
                else
                        DMEMIT("- ");
 
+               DMEMIT("%llu ", (unsigned long long)calc_metadata_threshold(pt));
+
                break;
 
        case STATUSTYPE_TABLE:
@@ -3979,7 +3966,7 @@ static struct target_type pool_target = {
        .name = "thin-pool",
        .features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE |
                    DM_TARGET_IMMUTABLE,
-       .version = {1, 19, 0},
+       .version = {1, 20, 0},
        .module = THIS_MODULE,
        .ctr = pool_ctr,
        .dtr = pool_dtr,
@@ -4353,7 +4340,7 @@ static void thin_io_hints(struct dm_target *ti, struct queue_limits *limits)
 
 static struct target_type thin_target = {
        .name = "thin",
-       .version = {1, 19, 0},
+       .version = {1, 20, 0},
        .module = THIS_MODULE,
        .ctr = thin_ctr,
        .dtr = thin_dtr,
index 87107c995cb5be3a25b33b10b737ec1ac0dc3fea..3a28a68f184ca5baccaa541ad68fd60eade09df7 100644 (file)
@@ -457,7 +457,7 @@ static void ssd_commit_flushed(struct dm_writecache *wc)
                COMPLETION_INITIALIZER_ONSTACK(endio.c),
                ATOMIC_INIT(1),
        };
-       unsigned bitmap_bits = wc->dirty_bitmap_size * BITS_PER_LONG;
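+       /* dirty_bitmap_size is in bytes, so the bitmap holds size * 8 bits */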
+       unsigned bitmap_bits = wc->dirty_bitmap_size * 8;
        unsigned i = 0;
 
        while (1) {
@@ -2240,6 +2240,8 @@ static void writecache_status(struct dm_target *ti, status_type_t type,
                DMEMIT("%c %s %s %u ", WC_MODE_PMEM(wc) ? 'p' : 's',
                                wc->dev->name, wc->ssd_dev->name, wc->block_size);
                extra_args = 0;
+               if (wc->start_sector)
+                       extra_args += 2;
                if (wc->high_wm_percent_set)
                        extra_args += 2;
                if (wc->low_wm_percent_set)
@@ -2254,6 +2256,8 @@ static void writecache_status(struct dm_target *ti, status_type_t type,
                        extra_args++;
 
                DMEMIT("%u", extra_args);
+               if (wc->start_sector)
+                       DMEMIT(" start_sector %llu", (unsigned long long)wc->start_sector);
                if (wc->high_wm_percent_set) {
                        x = (uint64_t)wc->freelist_high_watermark * 100;
                        x += wc->n_blocks / 2;
@@ -2280,7 +2284,7 @@ static void writecache_status(struct dm_target *ti, status_type_t type,
 
 static struct target_type writecache_target = {
        .name                   = "writecache",
-       .version                = {1, 1, 0},
+       .version                = {1, 1, 1},
        .module                 = THIS_MODULE,
        .ctr                    = writecache_ctr,
        .dtr                    = writecache_dtr,
index 44a119e12f1abd8eb5b4e4ae15689c83a9502536..edf4b95eb0750dc6485513d49c240b2982017114 100644 (file)
@@ -161,10 +161,8 @@ static int dmz_reclaim_copy(struct dmz_reclaim *zrc,
 
                /* Copy the valid region */
                set_bit(DMZ_RECLAIM_KCOPY, &zrc->flags);
-               ret = dm_kcopyd_copy(zrc->kc, &src, 1, &dst, flags,
-                                    dmz_reclaim_kcopy_end, zrc);
-               if (ret)
-                       return ret;
+               dm_kcopyd_copy(zrc->kc, &src, 1, &dst, flags,
+                              dmz_reclaim_kcopy_end, zrc);
 
                /* Wait for copy to complete */
                wait_on_bit_io(&zrc->flags, DMZ_RECLAIM_KCOPY,
index cfac8588ed56d84f2bf1047bfdaa6837873fc930..e42de7750c884ec6f7c56e250db963bbc154d9f9 100644 (file)
@@ -62,9 +62,9 @@ void dm_kcopyd_client_destroy(struct dm_kcopyd_client *kc);
 typedef void (*dm_kcopyd_notify_fn)(int read_err, unsigned long write_err,
                                    void *context);
 
-int dm_kcopyd_copy(struct dm_kcopyd_client *kc, struct dm_io_region *from,
-                  unsigned num_dests, struct dm_io_region *dests,
-                  unsigned flags, dm_kcopyd_notify_fn fn, void *context);
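+/*
+ * dm_kcopyd_copy() cannot fail (jobs are allocated from a mempool), so it
+ * returns void; I/O errors are still reported via the notify callback.
+ */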
+void dm_kcopyd_copy(struct dm_kcopyd_client *kc, struct dm_io_region *from,
+                   unsigned num_dests, struct dm_io_region *dests,
+                   unsigned flags, dm_kcopyd_notify_fn fn, void *context);
 
 /*
  * Prepare a callback and submit it via the kcopyd thread.
@@ -81,9 +81,9 @@ void *dm_kcopyd_prepare_callback(struct dm_kcopyd_client *kc,
                                 dm_kcopyd_notify_fn fn, void *context);
 void dm_kcopyd_do_callback(void *job, int read_err, unsigned long write_err);
 
-int dm_kcopyd_zero(struct dm_kcopyd_client *kc,
-                  unsigned num_dests, struct dm_io_region *dests,
-                  unsigned flags, dm_kcopyd_notify_fn fn, void *context);
+void dm_kcopyd_zero(struct dm_kcopyd_client *kc,
+                   unsigned num_dests, struct dm_io_region *dests,
+                   unsigned flags, dm_kcopyd_notify_fn fn, void *context);
 
 #endif /* __KERNEL__ */
 #endif /* _LINUX_DM_KCOPYD_H */