Merge tag 'for-4.16/dm-changes' of git://git.kernel.org/pub/scm/linux/kernel/git...
[muen/linux.git] / drivers / md / dm-raid.c
index e5ef0757fe230a8b4712c148564246a287e04739..7ef469e902c620126b95f69d899e528ae114b3bc 100644 (file)
@@ -29,6 +29,9 @@
  */
 #define        MIN_RAID456_JOURNAL_SPACE (4*2048)
 
+/* Global list of all raid sets; NOTE(review): no lock protecting it is visible in this diff -- confirm serialization */
+static LIST_HEAD(raid_sets);
+
 static bool devices_handle_discard_safely = false;
 
 /*
@@ -105,8 +108,6 @@ struct raid_dev {
 #define CTR_FLAG_JOURNAL_DEV           (1 << __CTR_FLAG_JOURNAL_DEV)
 #define CTR_FLAG_JOURNAL_MODE          (1 << __CTR_FLAG_JOURNAL_MODE)
 
-#define RESUME_STAY_FROZEN_FLAGS (CTR_FLAG_DELTA_DISKS | CTR_FLAG_DATA_OFFSET)
-
 /*
  * Definitions of various constructor flags to
  * be used in checks of valid / invalid flags
@@ -209,6 +210,8 @@ struct raid_dev {
 #define RT_FLAG_UPDATE_SBS             3
 #define RT_FLAG_RESHAPE_RS             4
 #define RT_FLAG_RS_SUSPENDED           5
+#define RT_FLAG_RS_IN_SYNC             6
+#define RT_FLAG_RS_RESYNCING           7
 
 /* Array elements of 64 bit needed for rebuild/failed disk bits */
 #define DISKS_ARRAY_ELEMS ((MAX_RAID_DEVICES + (sizeof(uint64_t) * 8 - 1)) / sizeof(uint64_t) / 8)
@@ -224,8 +227,8 @@ struct rs_layout {
 
 struct raid_set {
        struct dm_target *ti;
+       struct list_head list;
 
-       uint32_t bitmap_loaded;
        uint32_t stripe_cache_entries;
        unsigned long ctr_flags;
        unsigned long runtime_flags;
@@ -270,6 +273,19 @@ static void rs_config_restore(struct raid_set *rs, struct rs_layout *l)
        mddev->new_chunk_sectors = l->new_chunk_sectors;
 }
 
+/* Return the first raid_set on the global list, other than @rs, whose table has the same mapped device as @rs (NULL if none) */
+static struct raid_set *rs_find_active(struct raid_set *rs)
+{
+       struct mapped_device *md = dm_table_get_md(rs->ti->table);
+       struct raid_set *other;
+
+       list_for_each_entry(other, &raid_sets, list)
+               if (other != rs && md == dm_table_get_md(other->ti->table))
+                       return other;
+
+       return NULL;
+}
+
 /* raid10 algorithms (i.e. formats) */
 #define        ALGORITHM_RAID10_DEFAULT        0
 #define        ALGORITHM_RAID10_NEAR           1
@@ -572,7 +588,7 @@ static const char *raid10_md_layout_to_format(int layout)
 }
 
 /* Return md raid10 algorithm for @name */
-static int raid10_name_to_format(const char *name)
+static const int raid10_name_to_format(const char *name)
 {
        if (!strcasecmp(name, "near"))
                return ALGORITHM_RAID10_NEAR;
@@ -675,15 +691,11 @@ static struct raid_type *get_raid_type_by_ll(const int level, const int layout)
        return NULL;
 }
 
-/*
- * Conditionally change bdev capacity of @rs
- * in case of a disk add/remove reshape
- */
-static void rs_set_capacity(struct raid_set *rs)
+/* Adjust rdev sectors */
+static void rs_set_rdev_sectors(struct raid_set *rs)
 {
        struct mddev *mddev = &rs->md;
        struct md_rdev *rdev;
-       struct gendisk *gendisk = dm_disk(dm_table_get_md(rs->ti->table));
 
        /*
         * raid10 sets rdev->sector to the device size, which
@@ -692,8 +704,16 @@ static void rs_set_capacity(struct raid_set *rs)
        rdev_for_each(rdev, mddev)
                if (!test_bit(Journal, &rdev->flags))
                        rdev->sectors = mddev->dev_sectors;
+}
 
-       set_capacity(gendisk, mddev->array_sectors);
+/*
+ * Set the gendisk capacity of @rs's mapped device to md.array_sectors and revalidate it (disk add/remove reshape)
+ */
+static void rs_set_capacity(struct raid_set *rs)
+{
+       struct gendisk *gendisk = dm_disk(dm_table_get_md(rs->ti->table));
+
+       set_capacity(gendisk, rs->md.array_sectors);
        revalidate_disk(gendisk);
 }
 
@@ -744,6 +764,7 @@ static struct raid_set *raid_set_alloc(struct dm_target *ti, struct raid_type *r
 
        mddev_init(&rs->md);
 
+       INIT_LIST_HEAD(&rs->list);
        rs->raid_disks = raid_devs;
        rs->delta_disks = 0;
 
@@ -761,6 +782,9 @@ static struct raid_set *raid_set_alloc(struct dm_target *ti, struct raid_type *r
        for (i = 0; i < raid_devs; i++)
                md_rdev_init(&rs->dev[i].rdev);
 
+       /* Add @rs to global list. */
+       list_add(&rs->list, &raid_sets);
+
        /*
         * Remaining items to be initialized by further RAID params:
         *  rs->md.persistent
@@ -773,6 +797,7 @@ static struct raid_set *raid_set_alloc(struct dm_target *ti, struct raid_type *r
        return rs;
 }
 
+/* Free all @rs allocations and remove it from global list. */
 static void raid_set_free(struct raid_set *rs)
 {
        int i;
@@ -790,6 +815,8 @@ static void raid_set_free(struct raid_set *rs)
                        dm_put_device(rs->ti, rs->dev[i].data_dev);
        }
 
+       list_del(&rs->list);
+
        kfree(rs);
 }
 
@@ -1002,7 +1029,7 @@ static int validate_raid_redundancy(struct raid_set *rs)
                    !rs->dev[i].rdev.sb_page)
                        rebuild_cnt++;
 
-       switch (rs->raid_type->level) {
+       switch (rs->md.level) {
        case 0:
                break;
        case 1:
@@ -1017,6 +1044,11 @@ static int validate_raid_redundancy(struct raid_set *rs)
                break;
        case 10:
                copies = raid10_md_layout_to_copies(rs->md.new_layout);
+               if (copies < 2) {
+                       DMERR("Bogus raid10 data copies < 2!");
+                       return -EINVAL;
+               }
+
                if (rebuild_cnt < copies)
                        break;
 
@@ -1576,6 +1608,24 @@ static sector_t __rdev_sectors(struct raid_set *rs)
        return 0;
 }
 
+/* Check that calculated dev_sectors fits all component devices. */
+static int _check_data_dev_sectors(struct raid_set *rs)
+{
+       sector_t ds = ~0;
+       struct md_rdev *rdev;
+
+       rdev_for_each(rdev, &rs->md)
+               if (!test_bit(Journal, &rdev->flags) && rdev->bdev) {
+                       ds = min(ds, to_sector(i_size_read(rdev->bdev->bd_inode)));
+                       if (ds < rs->md.dev_sectors) {
+                               rs->ti->error = "Component device(s) too small";
+                               return -EINVAL;
+                       }
+               }
+
+       return 0;
+}
+
 /* Calculate the sectors per device and per array used for @rs */
 static int rs_set_dev_and_array_sectors(struct raid_set *rs, bool use_mddev)
 {
@@ -1625,7 +1675,7 @@ static int rs_set_dev_and_array_sectors(struct raid_set *rs, bool use_mddev)
        mddev->array_sectors = array_sectors;
        mddev->dev_sectors = dev_sectors;
 
-       return 0;
+       return _check_data_dev_sectors(rs);
 bad:
        rs->ti->error = "Target length not divisible by number of data devices";
        return -EINVAL;
@@ -1674,8 +1724,11 @@ static void do_table_event(struct work_struct *ws)
        struct raid_set *rs = container_of(ws, struct raid_set, md.event_work);
 
        smp_rmb(); /* Make sure we access most actual mddev properties */
-       if (!rs_is_reshaping(rs))
+       if (!rs_is_reshaping(rs)) {
+               if (rs_is_raid10(rs))
+                       rs_set_rdev_sectors(rs);
                rs_set_capacity(rs);
+       }
        dm_table_event(rs->ti->table);
 }
 
@@ -1860,7 +1913,7 @@ static bool rs_reshape_requested(struct raid_set *rs)
        if (rs_takeover_requested(rs))
                return false;
 
-       if (!mddev->level)
+       if (rs_is_raid0(rs))
                return false;
 
        change = mddev->new_layout != mddev->layout ||
@@ -1868,7 +1921,7 @@ static bool rs_reshape_requested(struct raid_set *rs)
                 rs->delta_disks;
 
        /* Historical case to support raid1 reshape without delta disks */
-       if (mddev->level == 1) {
+       if (rs_is_raid1(rs)) {
                if (rs->delta_disks)
                        return !!rs->delta_disks;
 
@@ -1876,7 +1929,7 @@ static bool rs_reshape_requested(struct raid_set *rs)
                       mddev->raid_disks != rs->raid_disks;
        }
 
-       if (mddev->level == 10)
+       if (rs_is_raid10(rs))
                return change &&
                       !__is_raid10_far(mddev->new_layout) &&
                       rs->delta_disks >= 0;
@@ -2340,7 +2393,7 @@ static int super_init_validation(struct raid_set *rs, struct md_rdev *rdev)
                        DMERR("new device%s provided without 'rebuild'",
                              new_devs > 1 ? "s" : "");
                        return -EINVAL;
-               } else if (rs_is_recovering(rs)) {
+               } else if (!test_bit(__CTR_FLAG_REBUILD, &rs->ctr_flags) && rs_is_recovering(rs)) {
                        DMERR("'rebuild' specified while raid set is not in-sync (recovery_cp=%llu)",
                              (unsigned long long) mddev->recovery_cp);
                        return -EINVAL;
@@ -2640,12 +2693,19 @@ static int rs_adjust_data_offsets(struct raid_set *rs)
         * Make sure we got a minimum amount of free sectors per device
         */
        if (rs->data_offset &&
-           to_sector(i_size_read(rdev->bdev->bd_inode)) - rdev->sectors < MIN_FREE_RESHAPE_SPACE) {
+           to_sector(i_size_read(rdev->bdev->bd_inode)) - rs->md.dev_sectors < MIN_FREE_RESHAPE_SPACE) {
                rs->ti->error = data_offset ? "No space for forward reshape" :
                                              "No space for backward reshape";
                return -ENOSPC;
        }
 out:
+       /*
+        * Raise recovery_cp in case data_offset != 0 to
+        * avoid false recovery positives in the constructor.
+        */
+       if (rs->md.recovery_cp < rs->md.dev_sectors)
+               rs->md.recovery_cp += rs->dev[0].rdev.data_offset;
+
        /* Adjust data offsets on all rdevs but on any raid4/5/6 journal device */
        rdev_for_each(rdev, &rs->md) {
                if (!test_bit(Journal, &rdev->flags)) {
@@ -2682,14 +2742,14 @@ static int rs_setup_takeover(struct raid_set *rs)
        sector_t new_data_offset = rs->dev[0].rdev.data_offset ? 0 : rs->data_offset;
 
        if (rt_is_raid10(rs->raid_type)) {
-               if (mddev->level == 0) {
+               if (rs_is_raid0(rs)) {
                        /* Userpace reordered disks -> adjust raid_disk indexes */
                        __reorder_raid_disk_indexes(rs);
 
                        /* raid0 -> raid10_far layout */
                        mddev->layout = raid10_format_to_md_layout(rs, ALGORITHM_RAID10_FAR,
                                                                   rs->raid10_copies);
-               } else if (mddev->level == 1)
+               } else if (rs_is_raid1(rs))
                        /* raid1 -> raid10_near layout */
                        mddev->layout = raid10_format_to_md_layout(rs, ALGORITHM_RAID10_NEAR,
                                                                   rs->raid_disks);
@@ -2777,6 +2837,23 @@ static int rs_prepare_reshape(struct raid_set *rs)
        return 0;
 }
 
+/* Return the larger of the first non-journal rdev's data offset delta and rs->data_offset */
+static sector_t _get_reshape_sectors(struct raid_set *rs)
+{
+       sector_t delta = 0;
+       struct md_rdev *rdev;
+
+       rdev_for_each(rdev, &rs->md)
+               if (!test_bit(Journal, &rdev->flags)) {
+                       if (rdev->data_offset > rdev->new_data_offset)
+                               delta = rdev->data_offset - rdev->new_data_offset;
+                       else
+                               delta = rdev->new_data_offset - rdev->data_offset;
+                       break;
+               }
+       return max(delta, (sector_t) rs->data_offset);
+}
+
 /*
  *
  * - change raid layout
@@ -2788,6 +2865,7 @@ static int rs_setup_reshape(struct raid_set *rs)
 {
        int r = 0;
        unsigned int cur_raid_devs, d;
+       sector_t reshape_sectors = _get_reshape_sectors(rs);
        struct mddev *mddev = &rs->md;
        struct md_rdev *rdev;
 
@@ -2804,13 +2882,13 @@ static int rs_setup_reshape(struct raid_set *rs)
        /*
         * Adjust array size:
         *
-        * - in case of adding disks, array size has
+        * - in case of adding disk(s), array size has
         *   to grow after the disk adding reshape,
         *   which'll hapen in the event handler;
         *   reshape will happen forward, so space has to
         *   be available at the beginning of each disk
         *
-        * - in case of removing disks, array size
+        * - in case of removing disk(s), array size
         *   has to shrink before starting the reshape,
         *   which'll happen here;
         *   reshape will happen backward, so space has to
@@ -2841,7 +2919,7 @@ static int rs_setup_reshape(struct raid_set *rs)
                        rdev->recovery_offset = rs_is_raid1(rs) ? 0 : MaxSector;
                }
 
-               mddev->reshape_backwards = 0; /* adding disks -> forward reshape */
+               mddev->reshape_backwards = 0; /* adding disk(s) -> forward reshape */
 
        /* Remove disk(s) */
        } else if (rs->delta_disks < 0) {
@@ -2874,6 +2952,15 @@ static int rs_setup_reshape(struct raid_set *rs)
                mddev->reshape_backwards = rs->dev[0].rdev.data_offset ? 0 : 1;
        }
 
+       /*
+        * Adjust device size for forward reshape
+        * because md_finish_reshape() reduces it.
+        */
+       if (!mddev->reshape_backwards)
+               rdev_for_each(rdev, &rs->md)
+                       if (!test_bit(Journal, &rdev->flags))
+                               rdev->sectors += reshape_sectors;
+
        return r;
 }
 
@@ -2890,7 +2977,7 @@ static void configure_discard_support(struct raid_set *rs)
        /*
         * XXX: RAID level 4,5,6 require zeroing for safety.
         */
-       raid456 = (rs->md.level == 4 || rs->md.level == 5 || rs->md.level == 6);
+       raid456 = rs_is_raid456(rs);
 
        for (i = 0; i < rs->raid_disks; i++) {
                struct request_queue *q;
@@ -2915,7 +3002,7 @@ static void configure_discard_support(struct raid_set *rs)
         * RAID1 and RAID10 personalities require bio splitting,
         * RAID0/4/5/6 don't and process large discard bios properly.
         */
-       ti->split_discard_bios = !!(rs->md.level == 1 || rs->md.level == 10);
+       ti->split_discard_bios = !!(rs_is_raid1(rs) || rs_is_raid10(rs));
        ti->num_discard_bios = 1;
 }
 
@@ -2935,10 +3022,10 @@ static void configure_discard_support(struct raid_set *rs)
 static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 {
        int r;
-       bool resize;
+       bool resize = false;
        struct raid_type *rt;
        unsigned int num_raid_params, num_raid_devs;
-       sector_t calculated_dev_sectors, rdev_sectors;
+       sector_t calculated_dev_sectors, rdev_sectors, reshape_sectors;
        struct raid_set *rs = NULL;
        const char *arg;
        struct rs_layout rs_layout;
@@ -3021,7 +3108,10 @@ static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv)
                goto bad;
        }
 
-       resize = calculated_dev_sectors != rdev_sectors;
+
+       reshape_sectors = _get_reshape_sectors(rs);
+       if (calculated_dev_sectors != rdev_sectors)
+               resize = calculated_dev_sectors != (reshape_sectors ? rdev_sectors - reshape_sectors : rdev_sectors);
 
        INIT_WORK(&rs->md.event_work, do_table_event);
        ti->private = rs;
@@ -3105,19 +3195,22 @@ static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv)
                        goto bad;
                }
 
-               /*
-                 * We can only prepare for a reshape here, because the
-                 * raid set needs to run to provide the repective reshape
-                 * check functions via its MD personality instance.
-                 *
-                 * So do the reshape check after md_run() succeeded.
-                 */
-               r = rs_prepare_reshape(rs);
-               if (r)
-                       return r;
+               /* Out-of-place space has to be available to allow for a reshape unless raid1! */
+               if (reshape_sectors || rs_is_raid1(rs)) {
+                       /*
+                         * We can only prepare for a reshape here, because the
+                         * raid set needs to run to provide the respective reshape
+                         * check functions via its MD personality instance.
+                         *
+                         * So do the reshape check after md_run() succeeded.
+                         */
+                       r = rs_prepare_reshape(rs);
+                       if (r)
+                               return r;
 
-               /* Reshaping ain't recovery, so disable recovery */
-               rs_setup_recovery(rs, MaxSector);
+                       /* Reshaping ain't recovery, so disable recovery */
+                       rs_setup_recovery(rs, MaxSector);
+               }
                rs_set_cur(rs);
        } else {
                /* May not set recovery when a device rebuild is requested */
@@ -3144,7 +3237,6 @@ static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv)
        mddev_lock_nointr(&rs->md);
        r = md_run(&rs->md);
        rs->md.in_sync = 0; /* Assume already marked dirty */
-
        if (r) {
                ti->error = "Failed to run raid array";
                mddev_unlock(&rs->md);
@@ -3248,25 +3340,27 @@ static int raid_map(struct dm_target *ti, struct bio *bio)
 }
 
 /* Return string describing the current sync action of @mddev */
-static const char *decipher_sync_action(struct mddev *mddev)
+static const char *decipher_sync_action(struct mddev *mddev, unsigned long recovery)
 {
-       if (test_bit(MD_RECOVERY_FROZEN, &mddev->recovery))
+       if (test_bit(MD_RECOVERY_FROZEN, &recovery))
                return "frozen";
 
-       if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
-           (!mddev->ro && test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))) {
-               if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
+       /* The MD sync thread can be done with io but still be running */
+       if (!test_bit(MD_RECOVERY_DONE, &recovery) &&
+           (test_bit(MD_RECOVERY_RUNNING, &recovery) ||
+            (!mddev->ro && test_bit(MD_RECOVERY_NEEDED, &recovery)))) {
+               if (test_bit(MD_RECOVERY_RESHAPE, &recovery))
                        return "reshape";
 
-               if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
-                       if (!test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
+               if (test_bit(MD_RECOVERY_SYNC, &recovery)) {
+                       if (!test_bit(MD_RECOVERY_REQUESTED, &recovery))
                                return "resync";
-                       else if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery))
+                       else if (test_bit(MD_RECOVERY_CHECK, &recovery))
                                return "check";
                        return "repair";
                }
 
-               if (test_bit(MD_RECOVERY_RECOVER, &mddev->recovery))
+               if (test_bit(MD_RECOVERY_RECOVER, &recovery))
                        return "recover";
        }
 
@@ -3283,7 +3377,7 @@ static const char *decipher_sync_action(struct mddev *mddev)
  *  'A' = Alive and in-sync raid set component _or_ alive raid4/5/6 'write_through' journal device
  *  '-' = Non-existing device (i.e. uspace passed '- -' into the ctr)
  */
-static const char *__raid_dev_status(struct raid_set *rs, struct md_rdev *rdev, bool array_in_sync)
+static const char *__raid_dev_status(struct raid_set *rs, struct md_rdev *rdev)
 {
        if (!rdev->bdev)
                return "-";
@@ -3291,85 +3385,108 @@ static const char *__raid_dev_status(struct raid_set *rs, struct md_rdev *rdev,
                return "D";
        else if (test_bit(Journal, &rdev->flags))
                return (rs->journal_dev.mode == R5C_JOURNAL_MODE_WRITE_THROUGH) ? "A" : "a";
-       else if (!array_in_sync || !test_bit(In_sync, &rdev->flags))
+       else if (test_bit(RT_FLAG_RS_RESYNCING, &rs->runtime_flags) ||
+                (!test_bit(RT_FLAG_RS_IN_SYNC, &rs->runtime_flags) &&
+                 !test_bit(In_sync, &rdev->flags)))
                return "a";
        else
                return "A";
 }
 
-/* Helper to return resync/reshape progress for @rs and @array_in_sync */
-static sector_t rs_get_progress(struct raid_set *rs,
-                               sector_t resync_max_sectors, bool *array_in_sync)
+/* Return resync/reshape progress of @rs capped at @resync_max_sectors; updates RT_FLAG_RS_IN_SYNC / RT_FLAG_RS_RESYNCING in rs->runtime_flags */
+static sector_t rs_get_progress(struct raid_set *rs, unsigned long recovery,
+                               sector_t resync_max_sectors)
 {
-       sector_t r, curr_resync_completed;
+       sector_t r;
        struct mddev *mddev = &rs->md;
 
-       curr_resync_completed = mddev->curr_resync_completed ?: mddev->recovery_cp;
-       *array_in_sync = false;
+       clear_bit(RT_FLAG_RS_IN_SYNC, &rs->runtime_flags);
+       clear_bit(RT_FLAG_RS_RESYNCING, &rs->runtime_flags);
 
        if (rs_is_raid0(rs)) {
                r = resync_max_sectors;
-               *array_in_sync = true;
+               set_bit(RT_FLAG_RS_IN_SYNC, &rs->runtime_flags);
 
        } else {
-               r = mddev->reshape_position;
-
-               /* Reshape is relative to the array size */
-               if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) ||
-                   r != MaxSector) {
-                       if (r == MaxSector) {
-                               *array_in_sync = true;
-                               r = resync_max_sectors;
-                       } else {
-                               /* Got to reverse on backward reshape */
-                               if (mddev->reshape_backwards)
-                                       r = mddev->array_sectors - r;
-
-                               /* Devide by # of data stripes */
-                               sector_div(r, mddev_data_stripes(rs));
-                       }
-
-               /* Sync is relative to the component device size */
-               } else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
-                       r = curr_resync_completed;
+               if (test_bit(MD_RECOVERY_NEEDED, &recovery) ||
+                   test_bit(MD_RECOVERY_RESHAPE, &recovery) ||
+                   test_bit(MD_RECOVERY_RUNNING, &recovery))
+                       r = mddev->curr_resync_completed;
                else
                        r = mddev->recovery_cp;
 
-               if ((r == MaxSector) ||
-                   (test_bit(MD_RECOVERY_DONE, &mddev->recovery) &&
-                    (mddev->curr_resync_completed == resync_max_sectors))) {
+               if (r >= resync_max_sectors &&
+                   (!test_bit(MD_RECOVERY_REQUESTED, &recovery) ||
+                    (!test_bit(MD_RECOVERY_FROZEN, &recovery) &&
+                     !test_bit(MD_RECOVERY_NEEDED, &recovery) &&
+                     !test_bit(MD_RECOVERY_RUNNING, &recovery)))) {
                        /*
                         * Sync complete.
                         */
-                       *array_in_sync = true;
-                       r = resync_max_sectors;
-               } else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
+                       /* In case we have finished recovering, the array is in sync. */
+                       if (test_bit(MD_RECOVERY_RECOVER, &recovery))
+                               set_bit(RT_FLAG_RS_IN_SYNC, &rs->runtime_flags);
+
+               } else if (test_bit(MD_RECOVERY_RECOVER, &recovery)) {
+                       /*
+                        * In case we are recovering, the array is not in sync
+                        * and health chars should show the recovering legs.
+                        */
+                       ;
+
+               } else if (test_bit(MD_RECOVERY_SYNC, &recovery) &&
+                          !test_bit(MD_RECOVERY_REQUESTED, &recovery)) {
+                       /*
+                        * If "resync" is occurring, the raid set
+                        * is or may be out of sync hence the health
+                        * characters shall be 'a'.
+                        */
+                       set_bit(RT_FLAG_RS_RESYNCING, &rs->runtime_flags);
+
+               } else if (test_bit(MD_RECOVERY_RESHAPE, &recovery) &&
+                          !test_bit(MD_RECOVERY_REQUESTED, &recovery)) {
+                       /*
+                        * If "reshape" is occurring, the raid set
+                        * is or may be out of sync hence the health
+                        * characters shall be 'a'.
+                        */
+                       set_bit(RT_FLAG_RS_RESYNCING, &rs->runtime_flags);
+
+               } else if (test_bit(MD_RECOVERY_REQUESTED, &recovery)) {
                        /*
                         * If "check" or "repair" is occurring, the raid set has
                         * undergone an initial sync and the health characters
                         * should not be 'a' anymore.
                         */
-                       *array_in_sync = true;
+                       set_bit(RT_FLAG_RS_IN_SYNC, &rs->runtime_flags);
+
                } else {
                        struct md_rdev *rdev;
 
+                       /*
+                        * We are idle and recovery is needed, prevent 'A' chars race
+                        * caused by components still set to in-sync by constructor.
+                        */
+                       if (test_bit(MD_RECOVERY_NEEDED, &recovery))
+                               set_bit(RT_FLAG_RS_RESYNCING, &rs->runtime_flags);
+
                        /*
                         * The raid set may be doing an initial sync, or it may
                         * be rebuilding individual components.  If all the
                         * devices are In_sync, then it is the raid set that is
                         * being initialized.
                         */
+                       set_bit(RT_FLAG_RS_IN_SYNC, &rs->runtime_flags);
                        rdev_for_each(rdev, mddev)
                                if (!test_bit(Journal, &rdev->flags) &&
-                                   !test_bit(In_sync, &rdev->flags))
-                                       *array_in_sync = true;
-#if 0
-                       r = 0; /* HM FIXME: TESTME: https://bugzilla.redhat.com/show_bug.cgi?id=1210637 ? */
-#endif
+                                   !test_bit(In_sync, &rdev->flags)) {
+                                       clear_bit(RT_FLAG_RS_IN_SYNC, &rs->runtime_flags);
+                                       break;
+                               }
                }
        }
 
-       return r;
+       return min(r, resync_max_sectors);
 }
 
 /* Helper to return @dev name or "-" if !@dev */
@@ -3385,7 +3502,7 @@ static void raid_status(struct dm_target *ti, status_type_t type,
        struct mddev *mddev = &rs->md;
        struct r5conf *conf = mddev->private;
        int i, max_nr_stripes = conf ? conf->max_nr_stripes : 0;
-       bool array_in_sync;
+       unsigned long recovery;
        unsigned int raid_param_cnt = 1; /* at least 1 for chunksize */
        unsigned int sz = 0;
        unsigned int rebuild_disks;
@@ -3405,17 +3522,18 @@ static void raid_status(struct dm_target *ti, status_type_t type,
 
                /* Access most recent mddev properties for status output */
                smp_rmb();
+               recovery = rs->md.recovery;
                /* Get sensible max sectors even if raid set not yet started */
                resync_max_sectors = test_bit(RT_FLAG_RS_PRERESUMED, &rs->runtime_flags) ?
                                      mddev->resync_max_sectors : mddev->dev_sectors;
-               progress = rs_get_progress(rs, resync_max_sectors, &array_in_sync);
+               progress = rs_get_progress(rs, recovery, resync_max_sectors);
                resync_mismatches = (mddev->last_sync_action && !strcasecmp(mddev->last_sync_action, "check")) ?
                                    atomic64_read(&mddev->resync_mismatches) : 0;
-               sync_action = decipher_sync_action(&rs->md);
+               sync_action = decipher_sync_action(&rs->md, recovery);
 
                /* HM FIXME: do we want another state char for raid0? It shows 'D'/'A'/'-' now */
                for (i = 0; i < rs->raid_disks; i++)
-                       DMEMIT(__raid_dev_status(rs, &rs->dev[i].rdev, array_in_sync));
+                       DMEMIT(__raid_dev_status(rs, &rs->dev[i].rdev));
 
                /*
                 * In-sync/Reshape ratio:
@@ -3466,7 +3584,7 @@ static void raid_status(struct dm_target *ti, status_type_t type,
                 * v1.10.0+:
                 */
                DMEMIT(" %s", test_bit(__CTR_FLAG_JOURNAL_DEV, &rs->ctr_flags) ?
-                             __raid_dev_status(rs, &rs->journal_dev.rdev, 0) : "-");
+                             __raid_dev_status(rs, &rs->journal_dev.rdev) : "-");
                break;
 
        case STATUSTYPE_TABLE:
@@ -3622,24 +3740,19 @@ static void raid_io_hints(struct dm_target *ti, struct queue_limits *limits)
        blk_limits_io_opt(limits, chunk_size * mddev_data_stripes(rs));
 }
 
-static void raid_presuspend(struct dm_target *ti)
-{
-       struct raid_set *rs = ti->private;
-
-       md_stop_writes(&rs->md);
-}
-
 static void raid_postsuspend(struct dm_target *ti)
 {
        struct raid_set *rs = ti->private;
 
        if (!test_and_set_bit(RT_FLAG_RS_SUSPENDED, &rs->runtime_flags)) {
+               /* Stop MD writes before suspending to avoid deadlocks; skipped when MD_RECOVERY_FROZEN is already set. */
+               if (!test_bit(MD_RECOVERY_FROZEN, &rs->md.recovery))
+                       md_stop_writes(&rs->md);
+
                mddev_lock_nointr(&rs->md);
                mddev_suspend(&rs->md);
                mddev_unlock(&rs->md);
        }
-
-       rs->md.ro = 1;
 }
 
 static void attempt_restore_of_faulty_devices(struct raid_set *rs)
@@ -3816,10 +3929,33 @@ static int raid_preresume(struct dm_target *ti)
        struct raid_set *rs = ti->private;
        struct mddev *mddev = &rs->md;
 
-       /* This is a resume after a suspend of the set -> it's already started */
+       /* This is a resume after a suspend of the set -> it's already started. */
        if (test_and_set_bit(RT_FLAG_RS_PRERESUMED, &rs->runtime_flags))
                return 0;
 
+       if (!test_bit(__CTR_FLAG_REBUILD, &rs->ctr_flags)) {
+               struct raid_set *rs_active = rs_find_active(rs);
+
+               if (rs_active) {
+                       /*
+                        * In case no rebuilds have been requested
+                        * and an active table slot exists, copy
+                        * current resynchronization completed and
+                        * reshape position pointers across from
+                        * suspended raid set in the active slot.
+                        *
+                        * This resumes the new mapping at current
+                        * offsets to continue recover/reshape without
+                        * necessarily redoing a raid set partially or
+                        * causing data corruption in case of a reshape.
+                        */
+                       if (rs_active->md.curr_resync_completed != MaxSector)
+                               mddev->curr_resync_completed = rs_active->md.curr_resync_completed;
+                       if (rs_active->md.reshape_position != MaxSector)
+                               mddev->reshape_position = rs_active->md.reshape_position;
+               }
+       }
+
        /*
         * The superblocks need to be updated on disk if the
         * array is new or new devices got added (thus zeroed
@@ -3851,11 +3987,10 @@ static int raid_preresume(struct dm_target *ti)
                mddev->resync_min = mddev->recovery_cp;
        }
 
-       rs_set_capacity(rs);
-
        /* Check for any reshape request unless new raid set */
-       if (test_and_clear_bit(RT_FLAG_RESHAPE_RS, &rs->runtime_flags)) {
+       if (test_bit(RT_FLAG_RESHAPE_RS, &rs->runtime_flags)) {
                /* Initiate a reshape. */
+               rs_set_rdev_sectors(rs);
                mddev_lock_nointr(mddev);
                r = rs_start_reshape(rs);
                mddev_unlock(mddev);
@@ -3881,21 +4016,15 @@ static void raid_resume(struct dm_target *ti)
                attempt_restore_of_faulty_devices(rs);
        }
 
-       mddev->ro = 0;
-       mddev->in_sync = 0;
-
-       /*
-        * Keep the RAID set frozen if reshape/rebuild flags are set.
-        * The RAID set is unfrozen once the next table load/resume,
-        * which clears the reshape/rebuild flags, occurs.
-        * This ensures that the constructor for the inactive table
-        * retrieves an up-to-date reshape_position.
-        */
-       if (!(rs->ctr_flags & RESUME_STAY_FROZEN_FLAGS))
-               clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
-
        if (test_and_clear_bit(RT_FLAG_RS_SUSPENDED, &rs->runtime_flags)) {
+               /* Only reduce raid set size before running a disk removing reshape. */
+               if (mddev->delta_disks < 0)
+                       rs_set_capacity(rs);
+
                mddev_lock_nointr(mddev);
+               clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
+               mddev->ro = 0;
+               mddev->in_sync = 0;
                mddev_resume(mddev);
                mddev_unlock(mddev);
        }
@@ -3903,7 +4032,7 @@ static void raid_resume(struct dm_target *ti)
 
 static struct target_type raid_target = {
        .name = "raid",
-       .version = {1, 13, 0},
+       .version = {1, 13, 2},
        .module = THIS_MODULE,
        .ctr = raid_ctr,
        .dtr = raid_dtr,
@@ -3912,7 +4041,6 @@ static struct target_type raid_target = {
        .message = raid_message,
        .iterate_devices = raid_iterate_devices,
        .io_hints = raid_io_hints,
-       .presuspend = raid_presuspend,
        .postsuspend = raid_postsuspend,
        .preresume = raid_preresume,
        .resume = raid_resume,