Merge branch 'lru-map-fix'
authorAlexei Starovoitov <ast@kernel.org>
Tue, 14 May 2019 17:47:29 +0000 (10:47 -0700)
committerAlexei Starovoitov <ast@kernel.org>
Tue, 14 May 2019 17:47:30 +0000 (10:47 -0700)
Daniel Borkmann says:

====================
This set fixes LRU map eviction in combination with map lookups out
of system call side from user space. Main patch is the second one and
test cases are adapted and added in the last one. Thanks!
====================

Acked-by: Andrii Nakryiko <andriin@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
include/linux/bpf.h
kernel/bpf/hashtab.c
kernel/bpf/syscall.c
tools/testing/selftests/bpf/test_lru_map.c

index 59631dd..4fb3aa2 100644 (file)
@@ -36,6 +36,7 @@ struct bpf_map_ops {
        void (*map_free)(struct bpf_map *map);
        int (*map_get_next_key)(struct bpf_map *map, void *key, void *next_key);
        void (*map_release_uref)(struct bpf_map *map);
+       void *(*map_lookup_elem_sys_only)(struct bpf_map *map, void *key);
 
        /* funcs callable from userspace and from eBPF programs */
        void *(*map_lookup_elem)(struct bpf_map *map, void *key);
index 192d32e..0f2708f 100644 (file)
@@ -527,18 +527,30 @@ static u32 htab_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf)
        return insn - insn_buf;
 }
 
-static void *htab_lru_map_lookup_elem(struct bpf_map *map, void *key)
+static __always_inline void *__htab_lru_map_lookup_elem(struct bpf_map *map,
+                                                       void *key, const bool mark)
 {
        struct htab_elem *l = __htab_map_lookup_elem(map, key);
 
        if (l) {
-               bpf_lru_node_set_ref(&l->lru_node);
+               if (mark)
+                       bpf_lru_node_set_ref(&l->lru_node);
                return l->key + round_up(map->key_size, 8);
        }
 
        return NULL;
 }
 
+static void *htab_lru_map_lookup_elem(struct bpf_map *map, void *key)
+{
+       return __htab_lru_map_lookup_elem(map, key, true);
+}
+
+static void *htab_lru_map_lookup_elem_sys(struct bpf_map *map, void *key)
+{
+       return __htab_lru_map_lookup_elem(map, key, false);
+}
+
 static u32 htab_lru_map_gen_lookup(struct bpf_map *map,
                                   struct bpf_insn *insn_buf)
 {
@@ -1250,6 +1262,7 @@ const struct bpf_map_ops htab_lru_map_ops = {
        .map_free = htab_map_free,
        .map_get_next_key = htab_map_get_next_key,
        .map_lookup_elem = htab_lru_map_lookup_elem,
+       .map_lookup_elem_sys_only = htab_lru_map_lookup_elem_sys,
        .map_update_elem = htab_lru_map_update_elem,
        .map_delete_elem = htab_lru_map_delete_elem,
        .map_gen_lookup = htab_lru_map_gen_lookup,
@@ -1281,7 +1294,6 @@ static void *htab_lru_percpu_map_lookup_elem(struct bpf_map *map, void *key)
 
 int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value)
 {
-       struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
        struct htab_elem *l;
        void __percpu *pptr;
        int ret = -ENOENT;
@@ -1297,8 +1309,9 @@ int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value)
        l = __htab_map_lookup_elem(map, key);
        if (!l)
                goto out;
-       if (htab_is_lru(htab))
-               bpf_lru_node_set_ref(&l->lru_node);
+       /* We do not mark LRU map element here in order to not mess up
+        * eviction heuristics when user space does a map walk.
+        */
        pptr = htab_elem_get_ptr(l, map->key_size);
        for_each_possible_cpu(cpu) {
                bpf_long_memcpy(value + off,
index ad3ccf8..cb5440b 100644 (file)
@@ -808,7 +808,10 @@ static int map_lookup_elem(union bpf_attr *attr)
                err = map->ops->map_peek_elem(map, value);
        } else {
                rcu_read_lock();
-               ptr = map->ops->map_lookup_elem(map, key);
+               if (map->ops->map_lookup_elem_sys_only)
+                       ptr = map->ops->map_lookup_elem_sys_only(map, key);
+               else
+                       ptr = map->ops->map_lookup_elem(map, key);
                if (IS_ERR(ptr)) {
                        err = PTR_ERR(ptr);
                } else if (!ptr) {
index 781c7de..1b25a7e 100644 (file)
 #include <sys/wait.h>
 
 #include <bpf/bpf.h>
+#include <bpf/libbpf.h>
 
 #include "bpf_util.h"
 #include "bpf_rlimit.h"
+#include "../../../include/linux/filter.h"
 
 #define LOCAL_FREE_TARGET      (128)
 #define PERCPU_FREE_TARGET     (4)
@@ -40,6 +42,68 @@ static int create_map(int map_type, int map_flags, unsigned int size)
        return map_fd;
 }
 
+static int bpf_map_lookup_elem_with_ref_bit(int fd, unsigned long long key,
+                                           void *value)
+{
+       struct bpf_load_program_attr prog;
+       struct bpf_create_map_attr map;
+       struct bpf_insn insns[] = {
+               BPF_LD_MAP_VALUE(BPF_REG_9, 0, 0),
+               BPF_LD_MAP_FD(BPF_REG_1, fd),
+               BPF_LD_IMM64(BPF_REG_3, key),
+               BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+               BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+               BPF_STX_MEM(BPF_DW, BPF_REG_2, BPF_REG_3, 0),
+               BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
+               BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 4),
+               BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_0, 0),
+               BPF_STX_MEM(BPF_DW, BPF_REG_9, BPF_REG_1, 0),
+               BPF_MOV64_IMM(BPF_REG_0, 42),
+               BPF_JMP_IMM(BPF_JA, 0, 0, 1),
+               BPF_MOV64_IMM(BPF_REG_0, 1),
+               BPF_EXIT_INSN(),
+       };
+       __u8 data[64] = {};
+       int mfd, pfd, ret, zero = 0;
+       __u32 retval = 0;
+
+       memset(&map, 0, sizeof(map));
+       map.map_type = BPF_MAP_TYPE_ARRAY;
+       map.key_size = sizeof(int);
+       map.value_size = sizeof(unsigned long long);
+       map.max_entries = 1;
+
+       mfd = bpf_create_map_xattr(&map);
+       if (mfd < 0)
+               return -1;
+
+       insns[0].imm = mfd;
+
+       memset(&prog, 0, sizeof(prog));
+       prog.prog_type = BPF_PROG_TYPE_SCHED_CLS;
+       prog.insns = insns;
+       prog.insns_cnt = ARRAY_SIZE(insns);
+       prog.license = "GPL";
+
+       pfd = bpf_load_program_xattr(&prog, NULL, 0);
+       if (pfd < 0) {
+               close(mfd);
+               return -1;
+       }
+
+       ret = bpf_prog_test_run(pfd, 1, data, sizeof(data),
+                               NULL, NULL, &retval, NULL);
+       if (ret < 0 || retval != 42) {
+               ret = -1;
+       } else {
+               assert(!bpf_map_lookup_elem(mfd, &zero, value));
+               ret = 0;
+       }
+       close(pfd);
+       close(mfd);
+       return ret;
+}
+
 static int map_subset(int map0, int map1)
 {
        unsigned long long next_key = 0;
@@ -87,7 +151,7 @@ static int sched_next_online(int pid, int *next_to_try)
        return ret;
 }
 
-/* Size of the LRU amp is 2
+/* Size of the LRU map is 2
  * Add key=1 (+1 key)
  * Add key=2 (+1 key)
  * Lookup Key=1
@@ -157,7 +221,7 @@ static void test_lru_sanity0(int map_type, int map_flags)
         * stop LRU from removing key=1
         */
        key = 1;
-       assert(!bpf_map_lookup_elem(lru_map_fd, &key, value));
+       assert(!bpf_map_lookup_elem_with_ref_bit(lru_map_fd, key, value));
        assert(value[0] == 1234);
 
        key = 3;
@@ -167,7 +231,8 @@ static void test_lru_sanity0(int map_type, int map_flags)
 
        /* key=2 has been removed from the LRU */
        key = 2;
-       assert(bpf_map_lookup_elem(lru_map_fd, &key, value) == -1);
+       assert(bpf_map_lookup_elem(lru_map_fd, &key, value) == -1 &&
+              errno == ENOENT);
 
        assert(map_equal(lru_map_fd, expected_map_fd));
 
@@ -221,7 +286,7 @@ static void test_lru_sanity1(int map_type, int map_flags, unsigned int tgt_free)
        /* Lookup 1 to tgt_free/2 */
        end_key = 1 + batch_size;
        for (key = 1; key < end_key; key++) {
-               assert(!bpf_map_lookup_elem(lru_map_fd, &key, value));
+               assert(!bpf_map_lookup_elem_with_ref_bit(lru_map_fd, key, value));
                assert(!bpf_map_update_elem(expected_map_fd, &key, value,
                                            BPF_NOEXIST));
        }
@@ -322,10 +387,11 @@ static void test_lru_sanity2(int map_type, int map_flags, unsigned int tgt_free)
        end_key = 1 + batch_size;
        value[0] = 4321;
        for (key = 1; key < end_key; key++) {
-               assert(bpf_map_lookup_elem(lru_map_fd, &key, value));
+               assert(bpf_map_lookup_elem(lru_map_fd, &key, value) == -1 &&
+                      errno == ENOENT);
                assert(!bpf_map_update_elem(lru_map_fd, &key, value,
                                            BPF_NOEXIST));
-               assert(!bpf_map_lookup_elem(lru_map_fd, &key, value));
+               assert(!bpf_map_lookup_elem_with_ref_bit(lru_map_fd, key, value));
                assert(value[0] == 4321);
                assert(!bpf_map_update_elem(expected_map_fd, &key, value,
                                            BPF_NOEXIST));
@@ -404,7 +470,7 @@ static void test_lru_sanity3(int map_type, int map_flags, unsigned int tgt_free)
        /* Lookup key 1 to tgt_free*3/2 */
        end_key = tgt_free + batch_size;
        for (key = 1; key < end_key; key++) {
-               assert(!bpf_map_lookup_elem(lru_map_fd, &key, value));
+               assert(!bpf_map_lookup_elem_with_ref_bit(lru_map_fd, key, value));
                assert(!bpf_map_update_elem(expected_map_fd, &key, value,
                                            BPF_NOEXIST));
        }
@@ -463,7 +529,7 @@ static void test_lru_sanity4(int map_type, int map_flags, unsigned int tgt_free)
        assert(bpf_map_update_elem(lru_map_fd, &key, value, BPF_NOEXIST));
 
        for (key = 1; key <= tgt_free; key++) {
-               assert(!bpf_map_lookup_elem(lru_map_fd, &key, value));
+               assert(!bpf_map_lookup_elem_with_ref_bit(lru_map_fd, key, value));
                assert(!bpf_map_update_elem(expected_map_fd, &key, value,
                                            BPF_NOEXIST));
        }
@@ -494,16 +560,16 @@ static void do_test_lru_sanity5(unsigned long long last_key, int map_fd)
        unsigned long long key, value[nr_cpus];
 
        /* Ensure the last key inserted by previous CPU can be found */
-       assert(!bpf_map_lookup_elem(map_fd, &last_key, value));
-
+       assert(!bpf_map_lookup_elem_with_ref_bit(map_fd, last_key, value));
        value[0] = 1234;
 
        key = last_key + 1;
        assert(!bpf_map_update_elem(map_fd, &key, value, BPF_NOEXIST));
-       assert(!bpf_map_lookup_elem(map_fd, &key, value));
+       assert(!bpf_map_lookup_elem_with_ref_bit(map_fd, key, value));
 
        /* Cannot find the last key because it was removed by LRU */
-       assert(bpf_map_lookup_elem(map_fd, &last_key, value));
+       assert(bpf_map_lookup_elem(map_fd, &last_key, value) == -1 &&
+              errno == ENOENT);
 }
 
 /* Test map with only one element */
@@ -590,8 +656,8 @@ static void test_lru_sanity6(int map_type, int map_flags, int tgt_free)
                /* Make ref bit sticky for key: [1, tgt_free] */
                for (stable_key = 1; stable_key <= tgt_free; stable_key++) {
                        /* Mark the ref bit */
-                       assert(!bpf_map_lookup_elem(lru_map_fd, &stable_key,
-                                                   value));
+                       assert(!bpf_map_lookup_elem_with_ref_bit(lru_map_fd,
+                                                                stable_key, value));
                }
                assert(!bpf_map_update_elem(lru_map_fd, &key, value,
                                            BPF_NOEXIST));
@@ -612,6 +678,198 @@ static void test_lru_sanity6(int map_type, int map_flags, int tgt_free)
        printf("Pass\n");
 }
 
+/* Size of the LRU map is 2
+ * Add key=1 (+1 key)
+ * Add key=2 (+1 key)
+ * Lookup Key=1 (datapath)
+ * Lookup Key=2 (syscall)
+ * Add Key=3
+ *   => Key=2 will be removed by LRU
+ * Iterate map.  Only found key=1 and key=3
+ */
+static void test_lru_sanity7(int map_type, int map_flags)
+{
+       unsigned long long key, value[nr_cpus];
+       int lru_map_fd, expected_map_fd;
+       int next_cpu = 0;
+
+       printf("%s (map_type:%d map_flags:0x%X): ", __func__, map_type,
+              map_flags);
+
+       assert(sched_next_online(0, &next_cpu) != -1);
+
+       if (map_flags & BPF_F_NO_COMMON_LRU)
+               lru_map_fd = create_map(map_type, map_flags, 2 * nr_cpus);
+       else
+               lru_map_fd = create_map(map_type, map_flags, 2);
+       assert(lru_map_fd != -1);
+
+       expected_map_fd = create_map(BPF_MAP_TYPE_HASH, 0, 2);
+       assert(expected_map_fd != -1);
+
+       value[0] = 1234;
+
+       /* insert key=1 element */
+
+       key = 1;
+       assert(!bpf_map_update_elem(lru_map_fd, &key, value, BPF_NOEXIST));
+       assert(!bpf_map_update_elem(expected_map_fd, &key, value,
+                                   BPF_NOEXIST));
+
+       /* BPF_NOEXIST means: add new element if it doesn't exist */
+       assert(bpf_map_update_elem(lru_map_fd, &key, value, BPF_NOEXIST) == -1
+              /* key=1 already exists */
+              && errno == EEXIST);
+
+       /* insert key=2 element */
+
+       /* check that key=2 is not found */
+       key = 2;
+       assert(bpf_map_lookup_elem(lru_map_fd, &key, value) == -1 &&
+              errno == ENOENT);
+
+       /* BPF_EXIST means: update existing element */
+       assert(bpf_map_update_elem(lru_map_fd, &key, value, BPF_EXIST) == -1 &&
+              /* key=2 is not there */
+              errno == ENOENT);
+
+       assert(!bpf_map_update_elem(lru_map_fd, &key, value, BPF_NOEXIST));
+
+       /* insert key=3 element */
+
+       /* check that key=3 is not found */
+       key = 3;
+       assert(bpf_map_lookup_elem(lru_map_fd, &key, value) == -1 &&
+              errno == ENOENT);
+
+       /* check that key=1 can be found and mark the ref bit to
+        * stop LRU from removing key=1
+        */
+       key = 1;
+       assert(!bpf_map_lookup_elem_with_ref_bit(lru_map_fd, key, value));
+       assert(value[0] == 1234);
+
+       /* check that key=2 can be found and do _not_ mark ref bit.
+        * this will be evicted on next update.
+        */
+       key = 2;
+       assert(!bpf_map_lookup_elem(lru_map_fd, &key, value));
+       assert(value[0] == 1234);
+
+       key = 3;
+       assert(!bpf_map_update_elem(lru_map_fd, &key, value, BPF_NOEXIST));
+       assert(!bpf_map_update_elem(expected_map_fd, &key, value,
+                                   BPF_NOEXIST));
+
+       /* key=2 has been removed from the LRU */
+       key = 2;
+       assert(bpf_map_lookup_elem(lru_map_fd, &key, value) == -1 &&
+              errno == ENOENT);
+
+       assert(map_equal(lru_map_fd, expected_map_fd));
+
+       close(expected_map_fd);
+       close(lru_map_fd);
+
+       printf("Pass\n");
+}
+
+/* Size of the LRU map is 2
+ * Add key=1 (+1 key)
+ * Add key=2 (+1 key)
+ * Lookup Key=1 (syscall)
+ * Lookup Key=2 (datapath)
+ * Add Key=3
+ *   => Key=1 will be removed by LRU
+ * Iterate map.  Only found key=2 and key=3
+ */
+static void test_lru_sanity8(int map_type, int map_flags)
+{
+       unsigned long long key, value[nr_cpus];
+       int lru_map_fd, expected_map_fd;
+       int next_cpu = 0;
+
+       printf("%s (map_type:%d map_flags:0x%X): ", __func__, map_type,
+              map_flags);
+
+       assert(sched_next_online(0, &next_cpu) != -1);
+
+       if (map_flags & BPF_F_NO_COMMON_LRU)
+               lru_map_fd = create_map(map_type, map_flags, 2 * nr_cpus);
+       else
+               lru_map_fd = create_map(map_type, map_flags, 2);
+       assert(lru_map_fd != -1);
+
+       expected_map_fd = create_map(BPF_MAP_TYPE_HASH, 0, 2);
+       assert(expected_map_fd != -1);
+
+       value[0] = 1234;
+
+       /* insert key=1 element */
+
+       key = 1;
+       assert(!bpf_map_update_elem(lru_map_fd, &key, value, BPF_NOEXIST));
+
+       /* BPF_NOEXIST means: add new element if it doesn't exist */
+       assert(bpf_map_update_elem(lru_map_fd, &key, value, BPF_NOEXIST) == -1
+              /* key=1 already exists */
+              && errno == EEXIST);
+
+       /* insert key=2 element */
+
+       /* check that key=2 is not found */
+       key = 2;
+       assert(bpf_map_lookup_elem(lru_map_fd, &key, value) == -1 &&
+              errno == ENOENT);
+
+       /* BPF_EXIST means: update existing element */
+       assert(bpf_map_update_elem(lru_map_fd, &key, value, BPF_EXIST) == -1 &&
+              /* key=2 is not there */
+              errno == ENOENT);
+
+       assert(!bpf_map_update_elem(lru_map_fd, &key, value, BPF_NOEXIST));
+       assert(!bpf_map_update_elem(expected_map_fd, &key, value,
+                                   BPF_NOEXIST));
+
+       /* insert key=3 element */
+
+       /* check that key=3 is not found */
+       key = 3;
+       assert(bpf_map_lookup_elem(lru_map_fd, &key, value) == -1 &&
+              errno == ENOENT);
+
+       /* check that key=1 can be found and do _not_ mark ref bit.
+        * this will be evicted on next update.
+        */
+       key = 1;
+       assert(!bpf_map_lookup_elem(lru_map_fd, &key, value));
+       assert(value[0] == 1234);
+
+       /* check that key=2 can be found and mark the ref bit to
+        * stop LRU from removing key=2
+        */
+       key = 2;
+       assert(!bpf_map_lookup_elem_with_ref_bit(lru_map_fd, key, value));
+       assert(value[0] == 1234);
+
+       key = 3;
+       assert(!bpf_map_update_elem(lru_map_fd, &key, value, BPF_NOEXIST));
+       assert(!bpf_map_update_elem(expected_map_fd, &key, value,
+                                   BPF_NOEXIST));
+
+       /* key=1 has been removed from the LRU */
+       key = 1;
+       assert(bpf_map_lookup_elem(lru_map_fd, &key, value) == -1 &&
+              errno == ENOENT);
+
+       assert(map_equal(lru_map_fd, expected_map_fd));
+
+       close(expected_map_fd);
+       close(lru_map_fd);
+
+       printf("Pass\n");
+}
+
 int main(int argc, char **argv)
 {
        int map_types[] = {BPF_MAP_TYPE_LRU_HASH,
@@ -637,6 +895,8 @@ int main(int argc, char **argv)
                        test_lru_sanity4(map_types[t], map_flags[f], tgt_free);
                        test_lru_sanity5(map_types[t], map_flags[f]);
                        test_lru_sanity6(map_types[t], map_flags[f], tgt_free);
+                       test_lru_sanity7(map_types[t], map_flags[f]);
+                       test_lru_sanity8(map_types[t], map_flags[f]);
 
                        printf("\n");
                }