Merge tag 'ceph-for-4.13-rc4' of git://github.com/ceph/ceph-client
author Linus Torvalds <torvalds@linux-foundation.org>
Fri, 4 Aug 2017 17:15:11 +0000 (10:15 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Fri, 4 Aug 2017 17:15:11 +0000 (10:15 -0700)
Pull ceph fixes from Ilya Dryomov:
 "A bunch of fixes and follow-ups for -rc1 Luminous patches: issues with
  ->reencode_message() and last minute RADOS semantic changes in
  v12.1.2"

* tag 'ceph-for-4.13-rc4' of git://github.com/ceph/ceph-client:
  libceph: make RECOVERY_DELETES feature create a new interval
  libceph: upmap semantic changes
  crush: assume weight_set != null implies weight_set_size > 0
  libceph: fallback for when there isn't a pool-specific choose_arg
  libceph: don't call ->reencode_message() more than once per message
  libceph: make encode_request_*() work with r_mempool requests

include/linux/ceph/osd_client.h
include/linux/ceph/osdmap.h
include/linux/ceph/rados.h
include/linux/crush/crush.h
net/ceph/crush/mapper.c
net/ceph/messenger.c
net/ceph/osd_client.c
net/ceph/osdmap.c

index c6d96a5f46fd6127c712d1cea79e4e47075a24e7..adf670ecaf94688449f92f240dd3f0bde3b87500 100644 (file)
@@ -148,6 +148,7 @@ struct ceph_osd_request_target {
        int size;
        int min_size;
        bool sort_bitwise;
+       bool recovery_deletes;
 
        unsigned int flags;                /* CEPH_OSD_FLAG_* */
        bool paused;
index a0996cb9faeddfff86a9837676aa3a79da1f01bf..af3444a5bfdd1a0e45493d9603596fd38c042a24 100644 (file)
@@ -272,6 +272,8 @@ bool ceph_is_new_interval(const struct ceph_osds *old_acting,
                          u32 new_pg_num,
                          bool old_sort_bitwise,
                          bool new_sort_bitwise,
+                         bool old_recovery_deletes,
+                         bool new_recovery_deletes,
                          const struct ceph_pg *pgid);
 bool ceph_osds_changed(const struct ceph_osds *old_acting,
                       const struct ceph_osds *new_acting,
index 385db08bb8b2d0a16b6fdc0188e7c5701fa9ef50..b8281feda9c77ac7a821a06f28da03fcac5565fe 100644 (file)
@@ -158,6 +158,10 @@ extern const char *ceph_osd_state_name(int s);
 #define CEPH_OSDMAP_NOTIERAGENT (1<<13) /* disable tiering agent */
 #define CEPH_OSDMAP_NOREBALANCE (1<<14) /* block osd backfill unless pg is degraded */
 #define CEPH_OSDMAP_SORTBITWISE (1<<15) /* use bitwise hobject_t sort */
+#define CEPH_OSDMAP_REQUIRE_JEWEL    (1<<16) /* require jewel for booting osds */
+#define CEPH_OSDMAP_REQUIRE_KRAKEN   (1<<17) /* require kraken for booting osds */
+#define CEPH_OSDMAP_REQUIRE_LUMINOUS (1<<18) /* require l for booting osds */
+#define CEPH_OSDMAP_RECOVERY_DELETES (1<<19) /* deletes performed during recovery instead of peering */
 
 /*
  * The error code to return when an OSD can't handle a write
index 92e165d417a6119d200ce1dbf2972c7297d6da1a..07eed95e10c7d67b8a2c0543c152672bc6019203 100644 (file)
@@ -193,7 +193,7 @@ struct crush_choose_arg {
 struct crush_choose_arg_map {
 #ifdef __KERNEL__
        struct rb_node node;
-       u64 choose_args_index;
+       s64 choose_args_index;
 #endif
        struct crush_choose_arg *args; /*!< replacement for each bucket
                                             in the crushmap */
index 746b145bfd113975e1f17399d845e2fcbdc7541a..417df675c71b0a2e53a25b0f7e9195c6f3c7804e 100644 (file)
@@ -306,7 +306,7 @@ static __u32 *get_choose_arg_weights(const struct crush_bucket_straw2 *bucket,
                                     const struct crush_choose_arg *arg,
                                     int position)
 {
-       if (!arg || !arg->weight_set || arg->weight_set_size == 0)
+       if (!arg || !arg->weight_set)
                return bucket->item_weights;
 
        if (position >= arg->weight_set_size)
index b7cc615d42efdb2219771c7a83c0acd5fa9ea9f5..a67298c7e0cd4f8f7a441fa938af57e37c4bd4da 100644 (file)
@@ -1287,10 +1287,10 @@ static void prepare_write_message(struct ceph_connection *con)
        if (m->needs_out_seq) {
                m->hdr.seq = cpu_to_le64(++con->out_seq);
                m->needs_out_seq = false;
-       }
 
-       if (con->ops->reencode_message)
-               con->ops->reencode_message(m);
+               if (con->ops->reencode_message)
+                       con->ops->reencode_message(m);
+       }
 
        dout("prepare_write_message %p seq %lld type %d len %d+%d+%zd\n",
             m, con->out_seq, le16_to_cpu(m->hdr.type),
index 901bb8221366253efb0baa139aee4fa62e8e71e0..dcfbdd74dfd1f13ce51cde01f541974517b35f9b 100644 (file)
@@ -1337,6 +1337,8 @@ static enum calc_target_result calc_target(struct ceph_osd_client *osdc,
        bool legacy_change;
        bool split = false;
        bool sort_bitwise = ceph_osdmap_flag(osdc, CEPH_OSDMAP_SORTBITWISE);
+       bool recovery_deletes = ceph_osdmap_flag(osdc,
+                                                CEPH_OSDMAP_RECOVERY_DELETES);
        enum calc_target_result ct_res;
        int ret;
 
@@ -1399,6 +1401,8 @@ static enum calc_target_result calc_target(struct ceph_osd_client *osdc,
                                 pi->pg_num,
                                 t->sort_bitwise,
                                 sort_bitwise,
+                                t->recovery_deletes,
+                                recovery_deletes,
                                 &last_pgid))
                force_resend = true;
 
@@ -1421,6 +1425,7 @@ static enum calc_target_result calc_target(struct ceph_osd_client *osdc,
                t->pg_num = pi->pg_num;
                t->pg_num_mask = pi->pg_num_mask;
                t->sort_bitwise = sort_bitwise;
+               t->recovery_deletes = recovery_deletes;
 
                t->osd = acting.primary;
        }
@@ -1918,10 +1923,12 @@ static void encode_request_partial(struct ceph_osd_request *req,
        }
 
        ceph_encode_32(&p, req->r_attempts); /* retry_attempt */
-       BUG_ON(p != end - 8); /* space for features */
+       BUG_ON(p > end - 8); /* space for features */
 
        msg->hdr.version = cpu_to_le16(8); /* MOSDOp v8 */
        /* front_len is finalized in encode_request_finish() */
+       msg->front.iov_len = p - msg->front.iov_base;
+       msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
        msg->hdr.data_len = cpu_to_le32(data_len);
        /*
         * The header "data_off" is a hint to the receiver allowing it
@@ -1937,11 +1944,12 @@ static void encode_request_partial(struct ceph_osd_request *req,
 static void encode_request_finish(struct ceph_msg *msg)
 {
        void *p = msg->front.iov_base;
+       void *const partial_end = p + msg->front.iov_len;
        void *const end = p + msg->front_alloc_len;
 
        if (CEPH_HAVE_FEATURE(msg->con->peer_features, RESEND_ON_SPLIT)) {
                /* luminous OSD -- encode features and be done */
-               p = end - 8;
+               p = partial_end;
                ceph_encode_64(&p, msg->con->peer_features);
        } else {
                struct {
@@ -1984,7 +1992,7 @@ static void encode_request_finish(struct ceph_msg *msg)
                oid_len = p - oid;
 
                tail = p;
-               tail_len = (end - p) - 8;
+               tail_len = partial_end - p;
 
                p = msg->front.iov_base;
                ceph_encode_copy(&p, &head.client_inc, sizeof(head.client_inc));
index 64ae9f89773a1c44f84858eb7622599a81f25de6..f358d0bfa76b35cb978e9e92b57aea38a4e3b391 100644 (file)
@@ -295,6 +295,10 @@ static int decode_choose_args(void **p, void *end, struct crush_map *c)
                        ret = decode_choose_arg(p, end, arg);
                        if (ret)
                                goto fail;
+
+                       if (arg->ids_size &&
+                           arg->ids_size != c->buckets[bucket_index]->size)
+                               goto e_inval;
                }
 
                insert_choose_arg_map(&c->choose_args, arg_map);
@@ -2078,6 +2082,8 @@ bool ceph_is_new_interval(const struct ceph_osds *old_acting,
                          u32 new_pg_num,
                          bool old_sort_bitwise,
                          bool new_sort_bitwise,
+                         bool old_recovery_deletes,
+                         bool new_recovery_deletes,
                          const struct ceph_pg *pgid)
 {
        return !osds_equal(old_acting, new_acting) ||
@@ -2085,7 +2091,8 @@ bool ceph_is_new_interval(const struct ceph_osds *old_acting,
               old_size != new_size ||
               old_min_size != new_min_size ||
               ceph_pg_is_split(pgid, old_pg_num, new_pg_num) ||
-              old_sort_bitwise != new_sort_bitwise;
+              old_sort_bitwise != new_sort_bitwise ||
+              old_recovery_deletes != new_recovery_deletes;
 }
 
 static int calc_pg_rank(int osd, const struct ceph_osds *acting)
@@ -2301,10 +2308,17 @@ static u32 raw_pg_to_pps(struct ceph_pg_pool_info *pi,
        }
 }
 
+/*
+ * Magic value used for a "default" fallback choose_args, used if the
+ * crush_choose_arg_map passed to do_crush() does not exist.  If this
+ * also doesn't exist, fall back to canonical weights.
+ */
+#define CEPH_DEFAULT_CHOOSE_ARGS       -1
+
 static int do_crush(struct ceph_osdmap *map, int ruleno, int x,
                    int *result, int result_max,
                    const __u32 *weight, int weight_max,
-                   u64 choose_args_index)
+                   s64 choose_args_index)
 {
        struct crush_choose_arg_map *arg_map;
        int r;
@@ -2313,6 +2327,9 @@ static int do_crush(struct ceph_osdmap *map, int ruleno, int x,
 
        arg_map = lookup_choose_arg_map(&map->crush->choose_args,
                                        choose_args_index);
+       if (!arg_map)
+               arg_map = lookup_choose_arg_map(&map->crush->choose_args,
+                                               CEPH_DEFAULT_CHOOSE_ARGS);
 
        mutex_lock(&map->crush_workspace_mutex);
        r = crush_do_rule(map->crush, ruleno, x, result, result_max,
@@ -2423,40 +2440,23 @@ static void apply_upmap(struct ceph_osdmap *osdmap,
                for (i = 0; i < pg->pg_upmap.len; i++)
                        raw->osds[i] = pg->pg_upmap.osds[i];
                raw->size = pg->pg_upmap.len;
-               return;
+               /* check and apply pg_upmap_items, if any */
        }
 
        pg = lookup_pg_mapping(&osdmap->pg_upmap_items, pgid);
        if (pg) {
-               /*
-                * Note: this approach does not allow a bidirectional swap,
-                * e.g., [[1,2],[2,1]] applied to [0,1,2] -> [0,2,1].
-                */
-               for (i = 0; i < pg->pg_upmap_items.len; i++) {
-                       int from = pg->pg_upmap_items.from_to[i][0];
-                       int to = pg->pg_upmap_items.from_to[i][1];
-                       int pos = -1;
-                       bool exists = false;
-
-                       /* make sure replacement doesn't already appear */
-                       for (j = 0; j < raw->size; j++) {
-                               int osd = raw->osds[j];
-
-                               if (osd == to) {
-                                       exists = true;
+               for (i = 0; i < raw->size; i++) {
+                       for (j = 0; j < pg->pg_upmap_items.len; j++) {
+                               int from = pg->pg_upmap_items.from_to[j][0];
+                               int to = pg->pg_upmap_items.from_to[j][1];
+
+                               if (from == raw->osds[i]) {
+                                       if (!(to != CRUSH_ITEM_NONE &&
+                                             to < osdmap->max_osd &&
+                                             osdmap->osd_weight[to] == 0))
+                                               raw->osds[i] = to;
                                        break;
                                }
-                               /* ignore mapping if target is marked out */
-                               if (osd == from && pos < 0 &&
-                                   !(to != CRUSH_ITEM_NONE &&
-                                     to < osdmap->max_osd &&
-                                     osdmap->osd_weight[to] == 0)) {
-                                       pos = j;
-                               }
-                       }
-                       if (!exists && pos >= 0) {
-                               raw->osds[pos] = to;
-                               return;
                        }
                }
        }