Merge git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next
author David S. Miller <davem@davemloft.net>
Wed, 21 Mar 2018 16:08:01 +0000 (12:08 -0400)
committer David S. Miller <davem@davemloft.net>
Wed, 21 Mar 2018 16:08:01 +0000 (12:08 -0400)
Daniel Borkmann says:

====================
pull-request: bpf-next 2018-03-21

The following pull-request contains BPF updates for your *net-next* tree.

The main changes are:

1) Add a BPF hook for sendmsg and sendfile by reusing the ULP infrastructure
   and sockmap. Three helpers are added along with this, bpf_msg_apply_bytes(),
   bpf_msg_cork_bytes(), and bpf_msg_pull_data(). The first is used to tell
   how many bytes the verdict should be applied to, the second to tell
   that x bytes need to be queued first to retrigger the BPF program for a
   verdict, and the third helper is mainly for the sendfile case to pull in
   data for making it private for reading and/or writing, from John.

2) Improve address to symbol resolution of user stack traces in BPF stackmap.
   Currently, the latter stores the address for each entry in the call trace,
   however to map these addresses to user space files, it is necessary to
   maintain the mapping from these virtual addresses to symbols in the binary
   which is not practical for system-wide profiling. Instead, this option for
   the stackmap rather stores the ELF build id and offset for the call trace
   entries, from Song.

3) Add support that allows BPF programs attached to perf events to read the
   address values recorded with the perf events. They are requested through
   PERF_SAMPLE_ADDR via perf_event_open(). Main motivation behind it is to
   support building memory or lock access profiling and tracing tools with
   the help of BPF, from Teng.

4) Several improvements to the tools/bpf/ Makefiles. The 'make bpf' in the
   tools directory does not provide the standard quiet output except for
   bpftool and it also does not respect specifying a build output directory.
   The 'make bpf_install' command respects neither the specified destination
   nor the prefix, all from Jiri. In addition, Jakub fixes several other
   minor issues in the Makefiles on top of that, e.g. fixing dependency
   paths, phony targets and more.

5) Various doc updates e.g. add a comment for BPF fs about reserved names
   to make the dentry lookup from there a bit more obvious, and a comment
   to the bpf_devel_QA file in order to explain the diff between native
   and bpf target clang usage with regards to pointer size, from Quentin
   and Daniel.
====================

Signed-off-by: David S. Miller <davem@davemloft.net>
39 files changed:
Documentation/bpf/bpf_devel_QA.txt
include/linux/bpf.h
include/linux/bpf_types.h
include/linux/filter.h
include/linux/socket.h
include/net/sock.h
include/uapi/linux/bpf.h
include/uapi/linux/bpf_perf_event.h
kernel/bpf/inode.c
kernel/bpf/sockmap.c
kernel/bpf/stackmap.c
kernel/bpf/syscall.c
kernel/bpf/verifier.c
kernel/trace/bpf_trace.c
net/core/filter.c
net/core/sock.c
net/ipv4/tcp.c
net/tls/tls_sw.c
samples/bpf/bpf_load.c
samples/bpf/trace_event_kern.c
samples/bpf/trace_event_user.c
samples/sockmap/sockmap_kern.c
samples/sockmap/sockmap_test.sh [new file with mode: 0755]
samples/sockmap/sockmap_user.c
tools/bpf/Makefile
tools/bpf/bpftool/Makefile
tools/bpf/bpftool/xlated_dumper.h
tools/include/uapi/linux/bpf.h
tools/lib/bpf/libbpf.c
tools/testing/selftests/bpf/Makefile
tools/testing/selftests/bpf/bpf_helpers.h
tools/testing/selftests/bpf/sockmap_parse_prog.c
tools/testing/selftests/bpf/sockmap_tcp_msg_prog.c [new file with mode: 0644]
tools/testing/selftests/bpf/sockmap_verdict_prog.c
tools/testing/selftests/bpf/test_maps.c
tools/testing/selftests/bpf/test_progs.c
tools/testing/selftests/bpf/test_stacktrace_build_id.c [new file with mode: 0644]
tools/testing/selftests/bpf/test_verifier.c
tools/testing/selftests/bpf/urandom_read.c [new file with mode: 0644]

index 84cbb302f2b56e1fe9b4ecc46dfb2f94b1ff392d..1a0b704e1a38f2a8749c83bb3f647d9a952c2787 100644 (file)
@@ -539,6 +539,18 @@ A: Although LLVM IR generation and optimization try to stay architecture
        The clang option "-fno-jump-tables" can be used to disable
        switch table generation.
 
+     - For clang -target bpf, it is guaranteed that pointer or long /
+       unsigned long types will always have a width of 64 bit, no matter
+       whether underlying clang binary or default target (or kernel) is
+       32 bit. However, when native clang target is used, then it will
+       compile these types based on the underlying architecture's conventions,
+       meaning in case of 32 bit architecture, pointer or long / unsigned
+       long types e.g. in BPF context structure will have width of 32 bit
+       while the BPF LLVM back end still operates in 64 bit. The native
+       target is mostly needed in tracing for the case of walking pt_regs
+       or other kernel structures where CPU's register width matters.
+       Otherwise, clang -target bpf is generally recommended.
+
    You should use default target when:
 
      - Your program includes a header file, e.g., ptrace.h, which eventually
index 66df387106de4bcb62547b9698cd8872269e7f37..819229c80ecaed8343cf12f3ec663af7ba031353 100644 (file)
@@ -21,6 +21,7 @@ struct bpf_verifier_env;
 struct perf_event;
 struct bpf_prog;
 struct bpf_map;
+struct sock;
 
 /* map is generic key/value storage optionally accesible by eBPF programs */
 struct bpf_map_ops {
index 19b8349a38094cbfcb52ff3ade1e8893018bb0c0..5e2e8a49fb21fc16dbffae29987f75e55385f725 100644 (file)
@@ -13,6 +13,7 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_OUT, lwt_inout)
 BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_XMIT, lwt_xmit)
 BPF_PROG_TYPE(BPF_PROG_TYPE_SOCK_OPS, sock_ops)
 BPF_PROG_TYPE(BPF_PROG_TYPE_SK_SKB, sk_skb)
+BPF_PROG_TYPE(BPF_PROG_TYPE_SK_MSG, sk_msg)
 #endif
 #ifdef CONFIG_BPF_EVENTS
 BPF_PROG_TYPE(BPF_PROG_TYPE_KPROBE, kprobe)
index fdb691b520c07d5d031bb0169a82d878950cd302..109d05ccea9a4f2cc6de63871f473378088c4a8f 100644 (file)
@@ -507,6 +507,22 @@ struct xdp_buff {
        struct xdp_rxq_info *rxq;
 };
 
+struct sk_msg_buff {
+       void *data;                     /* start of sg_data[sg_start]'s buffer, or NULL */
+       void *data_end;                 /* data + that element's length, or NULL */
+       __u32 apply_bytes;              /* copied to the psock after the program runs */
+       __u32 cork_bytes;               /* compared with queued size to decide on corking */
+       int sg_copybreak;               /* byte offset already filled in sg_data[sg_curr] */
+       int sg_start;                   /* ring index of the first in-use element */
+       int sg_curr;                    /* ring index where copy-in resumes */
+       int sg_end;                     /* ring index one past the last in-use element */
+       struct scatterlist sg_data[MAX_SKB_FRAGS]; /* ring; indices wrap at MAX_SKB_FRAGS */
+       bool sg_copy[MAX_SKB_FRAGS];    /* set for sendpage pages; hides data/data_end */
+       __u32 key;                      /* NOTE(review): presumably redirect lookup key */
+       __u32 flags;                    /* NOTE(review): presumably redirect flags */
+       struct bpf_map *map;            /* non-NULL with SK_PASS means redirect verdict */
+};
+
 /* Compute the linear packet data range [data, data_end) which
  * will be accessed by various program types (cls_bpf, act_bpf,
  * lwt, ...). Subsystems allowing direct data access must (!)
@@ -771,6 +787,7 @@ xdp_data_meta_unsupported(const struct xdp_buff *xdp)
 void bpf_warn_invalid_xdp_action(u32 act);
 
 struct sock *do_sk_redirect_map(struct sk_buff *skb);
+struct sock *do_msg_redirect_map(struct sk_msg_buff *md);
 
 #ifdef CONFIG_BPF_JIT
 extern int bpf_jit_enable;
index 1ce1f768a58ca5a75e9cd6a0e2f3a2dd385d3910..60e01482a9c4a290f9a1265b2e0681d03911511f 100644 (file)
@@ -287,6 +287,7 @@ struct ucred {
 #define MSG_SENDPAGE_NOTLAST 0x20000 /* sendpage() internal : not the last page */
 #define MSG_BATCH      0x40000 /* sendmmsg(): more messages coming */
 #define MSG_EOF         MSG_FIN
+#define MSG_NO_SHARED_FRAGS 0x80000 /* sendpage() internal : page frags are not shared */
 
 #define MSG_ZEROCOPY   0x4000000       /* Use user data in kernel path */
 #define MSG_FASTOPEN   0x20000000      /* Send data in TCP SYN */
index b9624581d639ff7f1f9466d7e7cd50c6b87cabf5..b7c75e024e373eaaa4600888fba1a5382a29cef2 100644 (file)
@@ -2141,6 +2141,10 @@ static inline struct page_frag *sk_page_frag(struct sock *sk)
 
 bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag);
 
+int sk_alloc_sg(struct sock *sk, int len, struct scatterlist *sg,
+               int sg_start, int *sg_curr, unsigned int *sg_size,
+               int first_coalesce);
+
 /*
  *     Default write policy as shown to user space via poll/select/SIGIO
  */
index 2a66769e58753f802c5f2df8c92d131e0ed5edd9..18b7c510c511df9247a82e0bf40c199c14b229b4 100644 (file)
@@ -133,6 +133,7 @@ enum bpf_prog_type {
        BPF_PROG_TYPE_SOCK_OPS,
        BPF_PROG_TYPE_SK_SKB,
        BPF_PROG_TYPE_CGROUP_DEVICE,
+       BPF_PROG_TYPE_SK_MSG,
 };
 
 enum bpf_attach_type {
@@ -143,6 +144,7 @@ enum bpf_attach_type {
        BPF_SK_SKB_STREAM_PARSER,
        BPF_SK_SKB_STREAM_VERDICT,
        BPF_CGROUP_DEVICE,
+       BPF_SK_MSG_VERDICT,
        __MAX_BPF_ATTACH_TYPE
 };
 
@@ -231,6 +233,28 @@ enum bpf_attach_type {
 #define BPF_F_RDONLY           (1U << 3)
 #define BPF_F_WRONLY           (1U << 4)
 
+/* Flag for stack_map, store build_id+offset instead of pointer */
+#define BPF_F_STACK_BUILD_ID   (1U << 5)
+
+enum bpf_stack_build_id_status {
+       /* user space needs an empty entry to identify end of a trace */
+       BPF_STACK_BUILD_ID_EMPTY = 0,
+       /* with valid build_id and offset */
+       BPF_STACK_BUILD_ID_VALID = 1,
+       /* couldn't get build_id, fallback to ip */
+       BPF_STACK_BUILD_ID_IP = 2,
+};
+
+#define BPF_BUILD_ID_SIZE 20
+struct bpf_stack_build_id {
+       __s32           status;
+       unsigned char   build_id[BPF_BUILD_ID_SIZE];
+       union {
+               __u64   offset;
+               __u64   ip;
+       };
+};
+
 union bpf_attr {
        struct { /* anonymous struct used by BPF_MAP_CREATE command */
                __u32   map_type;       /* one of enum bpf_map_type */
@@ -696,6 +720,15 @@ union bpf_attr {
  * int bpf_override_return(pt_regs, rc)
  *     @pt_regs: pointer to struct pt_regs
  *     @rc: the return value to set
+ *
+ * int bpf_msg_redirect_map(map, key, flags)
+ *     Redirect msg to a sock in map using key as a lookup key for the
+ *     sock in map.
+ *     @map: pointer to sockmap
+ *     @key: key to lookup sock in map
+ *     @flags: reserved for future use
+ *     Return: SK_PASS
+ *
  */
 #define __BPF_FUNC_MAPPER(FN)          \
        FN(unspec),                     \
@@ -757,7 +790,11 @@ union bpf_attr {
        FN(perf_prog_read_value),       \
        FN(getsockopt),                 \
        FN(override_return),            \
-       FN(sock_ops_cb_flags_set),
+       FN(sock_ops_cb_flags_set),      \
+       FN(msg_redirect_map),           \
+       FN(msg_apply_bytes),            \
+       FN(msg_cork_bytes),             \
+       FN(msg_pull_data),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
@@ -920,6 +957,14 @@ enum sk_action {
        SK_PASS,
 };
 
+/* user accessible metadata for SK_MSG packet hook, new fields must
+ * be added to the end of this structure
+ */
+struct sk_msg_md {
+       void *data;
+       void *data_end;
+};
+
 #define BPF_TAG_SIZE   8
 
 struct bpf_prog_info {
index 8f95303f9d807d10d4fd6850d91a2486b0a490ec..eb1b9d21250c6e83233721a777e5760c26d75481 100644 (file)
@@ -13,6 +13,7 @@
 struct bpf_perf_event_data {
        bpf_user_pt_regs_t regs;
        __u64 sample_period;
+       __u64 addr;
 };
 
 #endif /* _UAPI__LINUX_BPF_PERF_EVENT_H__ */
index 81e2f6995adb14ea7e99e2a5cf6c27203ed726ef..bf6da59ae0d012b0e1328036fcfb1ddfd08347c5 100644 (file)
@@ -178,6 +178,9 @@ static int bpf_mkmap(struct dentry *dentry, umode_t mode, void *arg)
 static struct dentry *
 bpf_lookup(struct inode *dir, struct dentry *dentry, unsigned flags)
 {
+       /* Dots in names (e.g. "/sys/fs/bpf/foo.bar") are reserved for future
+        * extensions.
+        */
        if (strchr(dentry->d_name.name, '.'))
                return ERR_PTR(-EPERM);
 
index a927e89dad6e9591066c3a87afc497a196ebd887..69c5bccabd229f801537f6137e311d075b79db72 100644 (file)
@@ -38,6 +38,7 @@
 #include <linux/skbuff.h>
 #include <linux/workqueue.h>
 #include <linux/list.h>
+#include <linux/mm.h>
 #include <net/strparser.h>
 #include <net/tcp.h>
 
@@ -47,6 +48,7 @@
 struct bpf_stab {
        struct bpf_map map;
        struct sock **sock_map;
+       struct bpf_prog *bpf_tx_msg;
        struct bpf_prog *bpf_parse;
        struct bpf_prog *bpf_verdict;
 };
@@ -62,8 +64,7 @@ struct smap_psock_map_entry {
 
 struct smap_psock {
        struct rcu_head rcu;
-       /* refcnt is used inside sk_callback_lock */
-       u32 refcnt;
+       refcount_t refcnt;
 
        /* datapath variables */
        struct sk_buff_head rxqueue;
@@ -74,7 +75,16 @@ struct smap_psock {
        int save_off;
        struct sk_buff *save_skb;
 
+       /* datapath variables for tx_msg ULP */
+       struct sock *sk_redir;
+       int apply_bytes;
+       int cork_bytes;
+       int sg_size;
+       int eval;
+       struct sk_msg_buff *cork;
+
        struct strparser strp;
+       struct bpf_prog *bpf_tx_msg;
        struct bpf_prog *bpf_parse;
        struct bpf_prog *bpf_verdict;
        struct list_head maps;
@@ -92,6 +102,11 @@ struct smap_psock {
        void (*save_write_space)(struct sock *sk);
 };
 
+static void smap_release_sock(struct smap_psock *psock, struct sock *sock);
+static int bpf_tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size);
+static int bpf_tcp_sendpage(struct sock *sk, struct page *page,
+                           int offset, size_t size, int flags);
+
 static inline struct smap_psock *smap_psock_sk(const struct sock *sk)
 {
        return rcu_dereference_sk_user_data(sk);
@@ -116,27 +131,41 @@ static int bpf_tcp_init(struct sock *sk)
 
        psock->save_close = sk->sk_prot->close;
        psock->sk_proto = sk->sk_prot;
+
+       if (psock->bpf_tx_msg) {
+               tcp_bpf_proto.sendmsg = bpf_tcp_sendmsg;
+               tcp_bpf_proto.sendpage = bpf_tcp_sendpage;
+       }
+
        sk->sk_prot = &tcp_bpf_proto;
        rcu_read_unlock();
        return 0;
 }
 
+static void smap_release_sock(struct smap_psock *psock, struct sock *sock);
+static int free_start_sg(struct sock *sk, struct sk_msg_buff *md);
+
 static void bpf_tcp_release(struct sock *sk)
 {
        struct smap_psock *psock;
 
        rcu_read_lock();
        psock = smap_psock_sk(sk);
+       if (unlikely(!psock))
+               goto out;
 
-       if (likely(psock)) {
-               sk->sk_prot = psock->sk_proto;
-               psock->sk_proto = NULL;
+       if (psock->cork) {
+               free_start_sg(psock->sock, psock->cork);
+               kfree(psock->cork);
+               psock->cork = NULL;
        }
+
+       sk->sk_prot = psock->sk_proto;
+       psock->sk_proto = NULL;
+out:
        rcu_read_unlock();
 }
 
-static void smap_release_sock(struct smap_psock *psock, struct sock *sock);
-
 static void bpf_tcp_close(struct sock *sk, long timeout)
 {
        void (*close_fun)(struct sock *sk, long timeout);
@@ -175,6 +204,7 @@ enum __sk_action {
        __SK_DROP = 0,
        __SK_PASS,
        __SK_REDIRECT,
+       __SK_NONE,
 };
 
 static struct tcp_ulp_ops bpf_tcp_ulp_ops __read_mostly = {
@@ -186,10 +216,621 @@ static struct tcp_ulp_ops bpf_tcp_ulp_ops __read_mostly = {
        .release        = bpf_tcp_release,
 };
 
+static int memcopy_from_iter(struct sock *sk,
+                            struct sk_msg_buff *md,
+                            struct iov_iter *from, int bytes)
+{
+       struct scatterlist *sg = md->sg_data;
+       int i = md->sg_curr, rc = -ENOSPC;
+
+       do {
+               int copy;
+               char *to;
+
+               if (md->sg_copybreak >= sg[i].length) {
+                       md->sg_copybreak = 0;
+
+                       if (++i == MAX_SKB_FRAGS)
+                               i = 0;
+
+                       if (i == md->sg_end)
+                               break;
+               }
+
+               copy = sg[i].length - md->sg_copybreak;
+               to = sg_virt(&sg[i]) + md->sg_copybreak;
+               md->sg_copybreak += copy;
+
+               if (sk->sk_route_caps & NETIF_F_NOCACHE_COPY)
+                       rc = copy_from_iter_nocache(to, copy, from);
+               else
+                       rc = copy_from_iter(to, copy, from);
+
+               if (rc != copy) {
+                       rc = -EFAULT;
+                       goto out;
+               }
+
+               bytes -= copy;
+               if (!bytes)
+                       break;
+
+               md->sg_copybreak = 0;
+               if (++i == MAX_SKB_FRAGS)
+                       i = 0;
+       } while (i != md->sg_end);
+out:
+       md->sg_curr = i;
+       return rc;
+}
+
+static int bpf_tcp_push(struct sock *sk, int apply_bytes,
+                       struct sk_msg_buff *md,
+                       int flags, bool uncharge)
+{
+       bool apply = apply_bytes;
+       struct scatterlist *sg;
+       int offset, ret = 0;
+       struct page *p;
+       size_t size;
+
+       while (1) {
+               sg = md->sg_data + md->sg_start;
+               size = (apply && apply_bytes < sg->length) ?
+                       apply_bytes : sg->length;
+               offset = sg->offset;
+
+               tcp_rate_check_app_limited(sk);
+               p = sg_page(sg);
+retry:
+               ret = do_tcp_sendpages(sk, p, offset, size, flags);
+               if (ret != size) {
+                       if (ret > 0) {
+                               if (apply)
+                                       apply_bytes -= ret;
+                               size -= ret;
+                               offset += ret;
+                               if (uncharge)
+                                       sk_mem_uncharge(sk, ret);
+                               goto retry;
+                       }
+
+                       sg->length = size;
+                       sg->offset = offset;
+                       return ret;
+               }
+
+               if (apply)
+                       apply_bytes -= ret;
+               sg->offset += ret;
+               sg->length -= ret;
+               if (uncharge)
+                       sk_mem_uncharge(sk, ret);
+
+               if (!sg->length) {
+                       put_page(p);
+                       md->sg_start++;
+                       if (md->sg_start == MAX_SKB_FRAGS)
+                               md->sg_start = 0;
+                       memset(sg, 0, sizeof(*sg));
+
+                       if (md->sg_start == md->sg_end)
+                               break;
+               }
+
+               if (apply && !apply_bytes)
+                       break;
+       }
+       return 0;
+}
+
+static inline void bpf_compute_data_pointers_sg(struct sk_msg_buff *md)
+{
+       struct scatterlist *sg = md->sg_data + md->sg_start;
+
+       if (md->sg_copy[md->sg_start]) {
+               md->data = md->data_end = 0;
+       } else {
+               md->data = sg_virt(sg);
+               md->data_end = md->data + sg->length;
+       }
+}
+
+static void return_mem_sg(struct sock *sk, int bytes, struct sk_msg_buff *md)
+{
+       struct scatterlist *sg = md->sg_data;
+       int i = md->sg_start;
+
+       do {
+               int uncharge = (bytes < sg[i].length) ? bytes : sg[i].length;
+
+               sk_mem_uncharge(sk, uncharge);
+               bytes -= uncharge;
+               if (!bytes)
+                       break;
+               i++;
+               if (i == MAX_SKB_FRAGS)
+                       i = 0;
+       } while (i != md->sg_end);
+}
+
+static void free_bytes_sg(struct sock *sk, int bytes, struct sk_msg_buff *md)
+{
+       struct scatterlist *sg = md->sg_data;
+       int i = md->sg_start, free;
+
+       while (bytes && sg[i].length) {
+               free = sg[i].length;
+               if (bytes < free) {
+                       sg[i].length -= bytes;
+                       sg[i].offset += bytes;
+                       sk_mem_uncharge(sk, bytes);
+                       break;
+               }
+
+               sk_mem_uncharge(sk, sg[i].length);
+               put_page(sg_page(&sg[i]));
+               bytes -= sg[i].length;
+               sg[i].length = 0;
+               sg[i].page_link = 0;
+               sg[i].offset = 0;
+               i++;
+
+               if (i == MAX_SKB_FRAGS)
+                       i = 0;
+       }
+}
+
+static int free_sg(struct sock *sk, int start, struct sk_msg_buff *md)
+{
+       struct scatterlist *sg = md->sg_data;
+       int i = start, free = 0;
+
+       while (sg[i].length) {
+               free += sg[i].length;
+               sk_mem_uncharge(sk, sg[i].length);
+               put_page(sg_page(&sg[i]));
+               sg[i].length = 0;
+               sg[i].page_link = 0;
+               sg[i].offset = 0;
+               i++;
+
+               if (i == MAX_SKB_FRAGS)
+                       i = 0;
+       }
+
+       return free;
+}
+
+static int free_start_sg(struct sock *sk, struct sk_msg_buff *md)
+{
+       int free = free_sg(sk, md->sg_start, md);
+
+       md->sg_start = md->sg_end;
+       return free;
+}
+
+static int free_curr_sg(struct sock *sk, struct sk_msg_buff *md)
+{
+       return free_sg(sk, md->sg_curr, md);
+}
+
+static int bpf_map_msg_verdict(int _rc, struct sk_msg_buff *md)
+{
+       return ((_rc == SK_PASS) ?
+              (md->map ? __SK_REDIRECT : __SK_PASS) :
+              __SK_DROP);
+}
+
+static unsigned int smap_do_tx_msg(struct sock *sk,
+                                  struct smap_psock *psock,
+                                  struct sk_msg_buff *md)
+{
+       struct bpf_prog *prog;
+       unsigned int rc, _rc;
+
+       preempt_disable();
+       rcu_read_lock();
+
+       /* If the policy was removed mid-send then default to 'accept' */
+       prog = READ_ONCE(psock->bpf_tx_msg);
+       if (unlikely(!prog)) {
+               _rc = SK_PASS;
+               goto verdict;
+       }
+
+       bpf_compute_data_pointers_sg(md);
+       rc = (*prog->bpf_func)(md, prog->insnsi);
+       psock->apply_bytes = md->apply_bytes;
+
+       /* Moving return codes from UAPI namespace into internal namespace */
+       _rc = bpf_map_msg_verdict(rc, md);
+
+       /* The psock has a refcount on the sock but not on the map and because
+        * we need to drop rcu read lock here it's possible the map could be
+        * removed between here and when we need it to execute the sock
+        * redirect. So do the map lookup now for future use.
+        */
+       if (_rc == __SK_REDIRECT) {
+               if (psock->sk_redir)
+                       sock_put(psock->sk_redir);
+               psock->sk_redir = do_msg_redirect_map(md);
+               if (!psock->sk_redir) {
+                       _rc = __SK_DROP;
+                       goto verdict;
+               }
+               sock_hold(psock->sk_redir);
+       }
+verdict:
+       rcu_read_unlock();
+       preempt_enable();
+
+       return _rc;
+}
+
+static int bpf_tcp_sendmsg_do_redirect(struct sock *sk, int send,
+                                      struct sk_msg_buff *md,
+                                      int flags)
+{
+       struct smap_psock *psock;
+       struct scatterlist *sg;
+       int i, err, free = 0;
+
+       sg = md->sg_data;
+
+       rcu_read_lock();
+       psock = smap_psock_sk(sk);
+       if (unlikely(!psock))
+               goto out_rcu;
+
+       if (!refcount_inc_not_zero(&psock->refcnt))
+               goto out_rcu;
+
+       rcu_read_unlock();
+       lock_sock(sk);
+       err = bpf_tcp_push(sk, send, md, flags, false);
+       release_sock(sk);
+       smap_release_sock(psock, sk);
+       if (unlikely(err))
+               goto out;
+       return 0;
+out_rcu:
+       rcu_read_unlock();
+out:
+       i = md->sg_start;
+       while (sg[i].length) {
+               free += sg[i].length;
+               put_page(sg_page(&sg[i]));
+               sg[i].length = 0;
+               i++;
+               if (i == MAX_SKB_FRAGS)
+                       i = 0;
+       }
+       return free;
+}
+
+static inline void bpf_md_init(struct smap_psock *psock)
+{
+       if (!psock->apply_bytes) {
+               psock->eval =  __SK_NONE;
+               if (psock->sk_redir) {
+                       sock_put(psock->sk_redir);
+                       psock->sk_redir = NULL;
+               }
+       }
+}
+
+static void apply_bytes_dec(struct smap_psock *psock, int i)
+{
+       if (psock->apply_bytes) {
+               if (psock->apply_bytes < i)
+                       psock->apply_bytes = 0;
+               else
+                       psock->apply_bytes -= i;
+       }
+}
+
+static int bpf_exec_tx_verdict(struct smap_psock *psock,
+                              struct sk_msg_buff *m,
+                              struct sock *sk,
+                              int *copied, int flags)
+{
+       bool cork = false, enospc = (m->sg_start == m->sg_end);
+       struct sock *redir;
+       int err = 0;
+       int send;
+
+more_data:
+       if (psock->eval == __SK_NONE)
+               psock->eval = smap_do_tx_msg(sk, psock, m);
+
+       if (m->cork_bytes &&
+           m->cork_bytes > psock->sg_size && !enospc) {
+               psock->cork_bytes = m->cork_bytes - psock->sg_size;
+               if (!psock->cork) {
+                       psock->cork = kcalloc(1,
+                                       sizeof(struct sk_msg_buff),
+                                       GFP_ATOMIC | __GFP_NOWARN);
+
+                       if (!psock->cork) {
+                               err = -ENOMEM;
+                               goto out_err;
+                       }
+               }
+               memcpy(psock->cork, m, sizeof(*m));
+               goto out_err;
+       }
+
+       send = psock->sg_size;
+       if (psock->apply_bytes && psock->apply_bytes < send)
+               send = psock->apply_bytes;
+
+       switch (psock->eval) {
+       case __SK_PASS:
+               err = bpf_tcp_push(sk, send, m, flags, true);
+               if (unlikely(err)) {
+                       *copied -= free_start_sg(sk, m);
+                       break;
+               }
+
+               apply_bytes_dec(psock, send);
+               psock->sg_size -= send;
+               break;
+       case __SK_REDIRECT:
+               redir = psock->sk_redir;
+               apply_bytes_dec(psock, send);
+
+               if (psock->cork) {
+                       cork = true;
+                       psock->cork = NULL;
+               }
+
+               return_mem_sg(sk, send, m);
+               release_sock(sk);
+
+               err = bpf_tcp_sendmsg_do_redirect(redir, send, m, flags);
+               lock_sock(sk);
+
+               if (cork) {
+                       free_start_sg(sk, m);
+                       kfree(m);
+                       m = NULL;
+               }
+               if (unlikely(err))
+                       *copied -= err;
+               else
+                       psock->sg_size -= send;
+               break;
+       case __SK_DROP:
+       default:
+               free_bytes_sg(sk, send, m);
+               apply_bytes_dec(psock, send);
+               *copied -= send;
+               psock->sg_size -= send;
+               err = -EACCES;
+               break;
+       }
+
+       if (likely(!err)) {
+               bpf_md_init(psock);
+               if (m &&
+                   m->sg_data[m->sg_start].page_link &&
+                   m->sg_data[m->sg_start].length)
+                       goto more_data;
+       }
+
+out_err:
+       return err;
+}
+
+static int bpf_tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
+{
+       int flags = msg->msg_flags | MSG_NO_SHARED_FRAGS;
+       struct sk_msg_buff md = {0};
+       unsigned int sg_copy = 0;
+       struct smap_psock *psock;
+       int copied = 0, err = 0;
+       struct scatterlist *sg;
+       long timeo;
+
+       /* It's possible a sock event or user removed the psock _but_ the ops
+        * have not been reprogrammed yet so we get here. In this case fall back
+        * to tcp_sendmsg. Note this only works because we _only_ ever allow
+        * a single ULP; there is no hierarchy here.
+        */
+       rcu_read_lock();
+       psock = smap_psock_sk(sk);
+       if (unlikely(!psock)) {
+               rcu_read_unlock();
+               return tcp_sendmsg(sk, msg, size);
+       }
+
+       /* Increment the psock refcnt to ensure it's not released while sending a
+        * message. Required because sk lookup and bpf programs are used in
+        * separate rcu critical sections. It's OK if we lose the map entry
+        * but we can't lose the sock reference.
+        */
+       if (!refcount_inc_not_zero(&psock->refcnt)) {
+               rcu_read_unlock();
+               return tcp_sendmsg(sk, msg, size);
+       }
+
+       sg = md.sg_data;
+       sg_init_table(sg, MAX_SKB_FRAGS);
+       rcu_read_unlock();
+
+       lock_sock(sk);
+       timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
+
+       while (msg_data_left(msg)) {
+               struct sk_msg_buff *m;
+               bool enospc = false;
+               int copy;
+
+               if (sk->sk_err) {
+                       err = sk->sk_err;
+                       goto out_err;
+               }
+
+               copy = msg_data_left(msg);
+               if (!sk_stream_memory_free(sk))
+                       goto wait_for_sndbuf;
+
+               m = psock->cork_bytes ? psock->cork : &md;
+               m->sg_curr = m->sg_copybreak ? m->sg_curr : m->sg_end;
+               err = sk_alloc_sg(sk, copy, m->sg_data,
+                                 m->sg_start, &m->sg_end, &sg_copy,
+                                 m->sg_end - 1);
+               if (err) {
+                       if (err != -ENOSPC)
+                               goto wait_for_memory;
+                       enospc = true;
+                       copy = sg_copy;
+               }
+
+               err = memcopy_from_iter(sk, m, &msg->msg_iter, copy);
+               if (err < 0) {
+                       free_curr_sg(sk, m);
+                       goto out_err;
+               }
+
+               psock->sg_size += copy;
+               copied += copy;
+               sg_copy = 0;
+
+               /* When bytes are being corked skip running BPF program and
+                * applying verdict unless there is no more buffer space. In
+                * the ENOSPC case simply run BPF program with currently
+                * accumulated data. We don't have much choice at this point;
+                * we could try extending the page frags or chaining complex
+                * frags but even in these cases _eventually_ we will hit an
+                * OOM scenario. More complex recovery schemes may be
+                * implemented in the future, but BPF programs must handle
+                * the case where cork_bytes requests are not honored. The
+                * canonical method to verify this is to check data length.
+                */
+               if (psock->cork_bytes) {
+                       if (copy > psock->cork_bytes)
+                               psock->cork_bytes = 0;
+                       else
+                               psock->cork_bytes -= copy;
+
+                       if (psock->cork_bytes && !enospc)
+                               goto out_cork;
+
+                       /* All cork bytes accounted for re-run filter */
+                       psock->eval = __SK_NONE;
+                       psock->cork_bytes = 0;
+               }
+
+               err = bpf_exec_tx_verdict(psock, m, sk, &copied, flags);
+               if (unlikely(err < 0))
+                       goto out_err;
+               continue;
+wait_for_sndbuf:
+               set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
+wait_for_memory:
+               err = sk_stream_wait_memory(sk, &timeo);
+               if (err)
+                       goto out_err;
+       }
+out_err:
+       if (err < 0)
+               err = sk_stream_error(sk, msg->msg_flags, err);
+out_cork:
+       release_sock(sk);
+       smap_release_sock(psock, sk);
+       return copied ? copied : err;
+}
+
+static int bpf_tcp_sendpage(struct sock *sk, struct page *page,
+                           int offset, size_t size, int flags)
+{
+       struct sk_msg_buff md = {0}, *m = NULL;
+       int err = 0, copied = 0;
+       struct smap_psock *psock;
+       struct scatterlist *sg;
+       bool enospc = false;
+
+       rcu_read_lock();
+       psock = smap_psock_sk(sk);
+       if (unlikely(!psock))
+               goto accept;
+
+       if (!refcount_inc_not_zero(&psock->refcnt))
+               goto accept;
+       rcu_read_unlock();
+
+       lock_sock(sk);
+
+       if (psock->cork_bytes)
+               m = psock->cork;
+       else
+               m = &md;
+
+       /* Catch case where ring is full and sendpage is stalled. */
+       if (unlikely(m->sg_end == m->sg_start &&
+           m->sg_data[m->sg_end].length))
+               goto out_err;
+
+       psock->sg_size += size;
+       sg = &m->sg_data[m->sg_end];
+       sg_set_page(sg, page, size, offset);
+       get_page(page);
+       m->sg_copy[m->sg_end] = true;
+       sk_mem_charge(sk, size);
+       m->sg_end++;
+       copied = size;
+
+       if (m->sg_end == MAX_SKB_FRAGS)
+               m->sg_end = 0;
+
+       if (m->sg_end == m->sg_start)
+               enospc = true;
+
+       if (psock->cork_bytes) {
+               if (size > psock->cork_bytes)
+                       psock->cork_bytes = 0;
+               else
+                       psock->cork_bytes -= size;
+
+               if (psock->cork_bytes && !enospc)
+                       goto out_err;
+
+               /* All cork bytes accounted for re-run filter */
+               psock->eval = __SK_NONE;
+               psock->cork_bytes = 0;
+       }
+
+       err = bpf_exec_tx_verdict(psock, m, sk, &copied, flags);
+out_err:
+       release_sock(sk);
+       smap_release_sock(psock, sk);
+       return copied ? copied : err;
+accept:
+       rcu_read_unlock();
+       return tcp_sendpage(sk, page, offset, size, flags);
+}
+
+static void bpf_tcp_msg_add(struct smap_psock *psock,
+                           struct sock *sk,
+                           struct bpf_prog *tx_msg)
+{
+       struct bpf_prog *orig_tx_msg;
+
+       orig_tx_msg = xchg(&psock->bpf_tx_msg, tx_msg);
+       if (orig_tx_msg)
+               bpf_prog_put(orig_tx_msg);
+}
+
 static int bpf_tcp_ulp_register(void)
 {
        tcp_bpf_proto = tcp_prot;
        tcp_bpf_proto.close = bpf_tcp_close;
+       /* Once BPF TX ULP is registered it is never unregistered. It
+        * will be in the ULP list for the lifetime of the system. Doing
+        * duplicate registers is not a problem.
+        */
        return tcp_register_ulp(&bpf_tcp_ulp_ops);
 }
 
@@ -373,15 +1014,13 @@ static void smap_destroy_psock(struct rcu_head *rcu)
 
 static void smap_release_sock(struct smap_psock *psock, struct sock *sock)
 {
-       psock->refcnt--;
-       if (psock->refcnt)
-               return;
-
-       tcp_cleanup_ulp(sock);
-       smap_stop_sock(psock, sock);
-       clear_bit(SMAP_TX_RUNNING, &psock->state);
-       rcu_assign_sk_user_data(sock, NULL);
-       call_rcu_sched(&psock->rcu, smap_destroy_psock);
+       if (refcount_dec_and_test(&psock->refcnt)) {
+               tcp_cleanup_ulp(sock);
+               smap_stop_sock(psock, sock);
+               clear_bit(SMAP_TX_RUNNING, &psock->state);
+               rcu_assign_sk_user_data(sock, NULL);
+               call_rcu_sched(&psock->rcu, smap_destroy_psock);
+       }
 }
 
 static int smap_parse_func_strparser(struct strparser *strp,
@@ -415,7 +1054,6 @@ static int smap_parse_func_strparser(struct strparser *strp,
        return rc;
 }
 
-
 static int smap_read_sock_done(struct strparser *strp, int err)
 {
        return err;
@@ -485,12 +1123,22 @@ static void smap_gc_work(struct work_struct *w)
                bpf_prog_put(psock->bpf_parse);
        if (psock->bpf_verdict)
                bpf_prog_put(psock->bpf_verdict);
+       if (psock->bpf_tx_msg)
+               bpf_prog_put(psock->bpf_tx_msg);
+
+       if (psock->cork) {
+               free_start_sg(psock->sock, psock->cork);
+               kfree(psock->cork);
+       }
 
        list_for_each_entry_safe(e, tmp, &psock->maps, list) {
                list_del(&e->list);
                kfree(e);
        }
 
+       if (psock->sk_redir)
+               sock_put(psock->sk_redir);
+
        sock_put(psock->sock);
        kfree(psock);
 }
@@ -506,12 +1154,13 @@ static struct smap_psock *smap_init_psock(struct sock *sock,
        if (!psock)
                return ERR_PTR(-ENOMEM);
 
+       psock->eval =  __SK_NONE;
        psock->sock = sock;
        skb_queue_head_init(&psock->rxqueue);
        INIT_WORK(&psock->tx_work, smap_tx_work);
        INIT_WORK(&psock->gc_work, smap_gc_work);
        INIT_LIST_HEAD(&psock->maps);
-       psock->refcnt = 1;
+       refcount_set(&psock->refcnt, 1);
 
        rcu_assign_sk_user_data(sock, psock);
        sock_hold(sock);
@@ -714,10 +1363,11 @@ static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops,
 {
        struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
        struct smap_psock_map_entry *e = NULL;
-       struct bpf_prog *verdict, *parse;
+       struct bpf_prog *verdict, *parse, *tx_msg;
        struct sock *osock, *sock;
        struct smap_psock *psock;
        u32 i = *(u32 *)key;
+       bool new = false;
        int err;
 
        if (unlikely(flags > BPF_EXIST))
@@ -740,6 +1390,7 @@ static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops,
         */
        verdict = READ_ONCE(stab->bpf_verdict);
        parse = READ_ONCE(stab->bpf_parse);
+       tx_msg = READ_ONCE(stab->bpf_tx_msg);
 
        if (parse && verdict) {
                /* bpf prog refcnt may be zero if a concurrent attach operation
@@ -758,6 +1409,17 @@ static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops,
                }
        }
 
+       if (tx_msg) {
+               tx_msg = bpf_prog_inc_not_zero(stab->bpf_tx_msg);
+               if (IS_ERR(tx_msg)) {
+                       if (verdict)
+                               bpf_prog_put(verdict);
+                       if (parse)
+                               bpf_prog_put(parse);
+                       return PTR_ERR(tx_msg);
+               }
+       }
+
        write_lock_bh(&sock->sk_callback_lock);
        psock = smap_psock_sk(sock);
 
@@ -772,7 +1434,14 @@ static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops,
                        err = -EBUSY;
                        goto out_progs;
                }
-               psock->refcnt++;
+               if (READ_ONCE(psock->bpf_tx_msg) && tx_msg) {
+                       err = -EBUSY;
+                       goto out_progs;
+               }
+               if (!refcount_inc_not_zero(&psock->refcnt)) {
+                       err = -EAGAIN;
+                       goto out_progs;
+               }
        } else {
                psock = smap_init_psock(sock, stab);
                if (IS_ERR(psock)) {
@@ -780,11 +1449,8 @@ static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops,
                        goto out_progs;
                }
 
-               err = tcp_set_ulp_id(sock, TCP_ULP_BPF);
-               if (err)
-                       goto out_progs;
-
                set_bit(SMAP_TX_RUNNING, &psock->state);
+               new = true;
        }
 
        e = kzalloc(sizeof(*e), GFP_ATOMIC | __GFP_NOWARN);
@@ -797,6 +1463,14 @@ static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops,
        /* 3. At this point we have a reference to a valid psock that is
         * running. Attach any BPF programs needed.
         */
+       if (tx_msg)
+               bpf_tcp_msg_add(psock, sock, tx_msg);
+       if (new) {
+               err = tcp_set_ulp_id(sock, TCP_ULP_BPF);
+               if (err)
+                       goto out_free;
+       }
+
        if (parse && verdict && !psock->strp_enabled) {
                err = smap_init_sock(psock, sock);
                if (err)
@@ -818,8 +1492,6 @@ static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops,
                struct smap_psock *opsock = smap_psock_sk(osock);
 
                write_lock_bh(&osock->sk_callback_lock);
-               if (osock != sock && parse)
-                       smap_stop_sock(opsock, osock);
                smap_list_remove(opsock, &stab->sock_map[i]);
                smap_release_sock(opsock, osock);
                write_unlock_bh(&osock->sk_callback_lock);
@@ -832,6 +1504,8 @@ out_progs:
                bpf_prog_put(verdict);
        if (parse)
                bpf_prog_put(parse);
+       if (tx_msg)
+               bpf_prog_put(tx_msg);
        write_unlock_bh(&sock->sk_callback_lock);
        kfree(e);
        return err;
@@ -846,6 +1520,9 @@ int sock_map_prog(struct bpf_map *map, struct bpf_prog *prog, u32 type)
                return -EINVAL;
 
        switch (type) {
+       case BPF_SK_MSG_VERDICT:
+               orig = xchg(&stab->bpf_tx_msg, prog);
+               break;
        case BPF_SK_SKB_STREAM_PARSER:
                orig = xchg(&stab->bpf_parse, prog);
                break;
@@ -907,6 +1584,10 @@ static void sock_map_release(struct bpf_map *map, struct file *map_file)
        orig = xchg(&stab->bpf_verdict, NULL);
        if (orig)
                bpf_prog_put(orig);
+
+       orig = xchg(&stab->bpf_tx_msg, NULL);
+       if (orig)
+               bpf_prog_put(orig);
 }
 
 const struct bpf_map_ops sock_map_ops = {
index b0ecf43f5894d12de9a20f4399e79e0b6d2979b8..57eeb1234b67e7dabd555e9562b0a0b59cd57abb 100644 (file)
@@ -9,16 +9,19 @@
 #include <linux/filter.h>
 #include <linux/stacktrace.h>
 #include <linux/perf_event.h>
+#include <linux/elf.h>
+#include <linux/pagemap.h>
 #include "percpu_freelist.h"
 
-#define STACK_CREATE_FLAG_MASK \
-       (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)
+#define STACK_CREATE_FLAG_MASK                                 \
+       (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY |        \
+        BPF_F_STACK_BUILD_ID)
 
 struct stack_map_bucket {
        struct pcpu_freelist_node fnode;
        u32 hash;
        u32 nr;
-       u64 ip[];
+       u64 data[];
 };
 
 struct bpf_stack_map {
@@ -29,6 +32,17 @@ struct bpf_stack_map {
        struct stack_map_bucket *buckets[];
 };
 
+/* True when @map carries the BPF_F_STACK_BUILD_ID flag. */
+static inline bool stack_map_use_build_id(struct bpf_map *map)
+{
+       return !!(map->map_flags & BPF_F_STACK_BUILD_ID);
+}
+
+/* Size in bytes of one stack entry stored in @map: a
+ * struct bpf_stack_build_id for build-id maps, a u64 otherwise.
+ */
+static inline int stack_map_data_size(struct bpf_map *map)
+{
+       if (stack_map_use_build_id(map))
+               return sizeof(struct bpf_stack_build_id);
+
+       return sizeof(u64);
+}
+
 static int prealloc_elems_and_freelist(struct bpf_stack_map *smap)
 {
        u32 elem_size = sizeof(struct stack_map_bucket) + smap->map.value_size;
@@ -68,8 +82,16 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr)
 
        /* check sanity of attributes */
        if (attr->max_entries == 0 || attr->key_size != 4 ||
-           value_size < 8 || value_size % 8 ||
-           value_size / 8 > sysctl_perf_event_max_stack)
+           value_size < 8 || value_size % 8)
+               return ERR_PTR(-EINVAL);
+
+       BUILD_BUG_ON(sizeof(struct bpf_stack_build_id) % sizeof(u64));
+       if (attr->map_flags & BPF_F_STACK_BUILD_ID) {
+               if (value_size % sizeof(struct bpf_stack_build_id) ||
+                   value_size / sizeof(struct bpf_stack_build_id)
+                   > sysctl_perf_event_max_stack)
+                       return ERR_PTR(-EINVAL);
+       } else if (value_size / 8 > sysctl_perf_event_max_stack)
                return ERR_PTR(-EINVAL);
 
        /* hash table size must be power of 2 */
@@ -114,13 +136,184 @@ free_smap:
        return ERR_PTR(err);
 }
 
+#define BPF_BUILD_ID 3
+/*
+ * Parse build id from the note segment. This logic can be shared between
+ * 32-bit and 64-bit system, because Elf32_Nhdr and Elf64_Nhdr are
+ * identical.
+ *
+ * @page_addr:  start of the page holding the beginning of the ELF file
+ * @build_id:   output buffer, at least BPF_BUILD_ID_SIZE bytes
+ * @note_start: start of the note segment to scan
+ * @note_size:  size of that segment in bytes
+ *
+ * Returns 0 and fills @build_id when a build-id note named "GNU" with a
+ * BPF_BUILD_ID_SIZE descriptor is found, -EINVAL otherwise. @build_id is
+ * only written on success.
+ */
+static inline int stack_map_parse_build_id(void *page_addr,
+                                          unsigned char *build_id,
+                                          void *note_start,
+                                          Elf32_Word note_size)
+{
+       const Elf32_Word name_sz = ALIGN(sizeof("GNU"), 4);
+       Elf32_Word note_offs = 0, new_offs;
+
+       /* check for overflow */
+       if (note_start < page_addr || note_start + note_size < note_start)
+               return -EINVAL;
+
+       /* only supports note that fits in the first page */
+       if (note_start + note_size > page_addr + PAGE_SIZE)
+               return -EINVAL;
+
+       while (note_offs + sizeof(Elf32_Nhdr) < note_size) {
+               Elf32_Nhdr *nhdr = (Elf32_Nhdr *)(note_start + note_offs);
+               /* the loop condition guarantees data_offs < note_size */
+               Elf32_Word data_offs = note_offs + sizeof(Elf32_Nhdr);
+
+               if (nhdr->n_type == BPF_BUILD_ID &&
+                   nhdr->n_namesz == sizeof("GNU") &&
+                   nhdr->n_descsz == BPF_BUILD_ID_SIZE) {
+                       /* a truncated note must not make us read past
+                        * the end of the segment
+                        */
+                       if (note_size - data_offs <
+                           name_sz + BPF_BUILD_ID_SIZE)
+                               break;
+
+                       /* type and sizes alone do not identify the note
+                        * owner, the name has to match as well
+                        */
+                       if (!memcmp(note_start + data_offs, "GNU",
+                                   sizeof("GNU"))) {
+                               memcpy(build_id,
+                                      note_start + data_offs + name_sz,
+                                      BPF_BUILD_ID_SIZE);
+                               return 0;
+                       }
+               }
+               new_offs = note_offs + sizeof(Elf32_Nhdr) +
+                       ALIGN(nhdr->n_namesz, 4) + ALIGN(nhdr->n_descsz, 4);
+               if (new_offs <= note_offs)  /* overflow */
+                       break;
+               note_offs = new_offs;
+       }
+       return -EINVAL;
+}
+
+/* Parse build ID from 32-bit ELF
+ *
+ * @page_addr: start of the page holding the beginning of the ELF file
+ * @build_id:  output buffer handed to stack_map_parse_build_id()
+ *
+ * The program headers are read from directly behind the ELF header.
+ * Every PT_NOTE segment is tried in turn, since a file may carry several
+ * and the build id need not be in the first one. Returns 0 on success,
+ * -EINVAL if no segment yields a build id.
+ */
+static int stack_map_get_build_id_32(void *page_addr,
+                                    unsigned char *build_id)
+{
+       Elf32_Ehdr *ehdr = (Elf32_Ehdr *)page_addr;
+       Elf32_Phdr *phdr;
+       int i;
+
+       /* only supports phdr that fits in one page */
+       if (ehdr->e_phnum >
+           (PAGE_SIZE - sizeof(Elf32_Ehdr)) / sizeof(Elf32_Phdr))
+               return -EINVAL;
+
+       phdr = (Elf32_Phdr *)(page_addr + sizeof(Elf32_Ehdr));
+
+       for (i = 0; i < ehdr->e_phnum; ++i) {
+               if (phdr[i].p_type != PT_NOTE)
+                       continue;
+               /* keep scanning when this segment has no build id */
+               if (!stack_map_parse_build_id(page_addr, build_id,
+                                             page_addr + phdr[i].p_offset,
+                                             phdr[i].p_filesz))
+                       return 0;
+       }
+       return -EINVAL;
+}
+
+/* Parse build ID from 64-bit ELF
+ *
+ * @page_addr: start of the page holding the beginning of the ELF file
+ * @build_id:  output buffer handed to stack_map_parse_build_id()
+ *
+ * The program headers are read from directly behind the ELF header.
+ * Every PT_NOTE segment is tried in turn, since a file may carry several
+ * and the build id need not be in the first one. Returns 0 on success,
+ * -EINVAL if no segment yields a build id.
+ */
+static int stack_map_get_build_id_64(void *page_addr,
+                                    unsigned char *build_id)
+{
+       Elf64_Ehdr *ehdr = (Elf64_Ehdr *)page_addr;
+       Elf64_Phdr *phdr;
+       int i;
+
+       /* only supports phdr that fits in one page */
+       if (ehdr->e_phnum >
+           (PAGE_SIZE - sizeof(Elf64_Ehdr)) / sizeof(Elf64_Phdr))
+               return -EINVAL;
+
+       phdr = (Elf64_Phdr *)(page_addr + sizeof(Elf64_Ehdr));
+
+       for (i = 0; i < ehdr->e_phnum; ++i) {
+               if (phdr[i].p_type != PT_NOTE)
+                       continue;
+               /* keep scanning when this segment has no build id */
+               if (!stack_map_parse_build_id(page_addr, build_id,
+                                             page_addr + phdr[i].p_offset,
+                                             phdr[i].p_filesz))
+                       return 0;
+       }
+       return -EINVAL;
+}
+
+/* Parse build ID of ELF file mapped to vma
+ *
+ * @vma:      mapping whose backing file is inspected
+ * @build_id: output buffer handed to the class specific parser
+ *
+ * Only page 0 of the file is examined, and only if find_get_page()
+ * returns it. Returns 0 on success, -EFAULT when that page is not
+ * available, -EINVAL when the vma has no file, the page is not an
+ * ET_EXEC/ET_DYN ELF image of class 32 or 64, or no build id is found.
+ */
+static int stack_map_get_build_id(struct vm_area_struct *vma,
+                                 unsigned char *build_id)
+{
+       Elf32_Ehdr *ehdr;
+       struct page *page;
+       void *page_addr;
+       int ret;
+
+       /* only works for page backed storage  */
+       if (!vma->vm_file)
+               return -EINVAL;
+
+       page = find_get_page(vma->vm_file->f_mapping, 0);
+       if (!page)
+               return -EFAULT; /* page not mapped */
+
+       ret = -EINVAL;
+       /* The page may live in highmem, where page_address() can return
+        * NULL, so map it explicitly for the duration of the parse.
+        */
+       page_addr = kmap_atomic(page);
+       ehdr = (Elf32_Ehdr *)page_addr;
+
+       /* compare magic x7f "ELF" */
+       if (memcmp(ehdr->e_ident, ELFMAG, SELFMAG) != 0)
+               goto out;
+
+       /* only support executable file and shared object file */
+       if (ehdr->e_type != ET_EXEC && ehdr->e_type != ET_DYN)
+               goto out;
+
+       if (ehdr->e_ident[EI_CLASS] == ELFCLASS32)
+               ret = stack_map_get_build_id_32(page_addr, build_id);
+       else if (ehdr->e_ident[EI_CLASS] == ELFCLASS64)
+               ret = stack_map_get_build_id_64(page_addr, build_id);
+out:
+       kunmap_atomic(page_addr);
+       put_page(page);
+       return ret;
+}
+
+/* Fill @bucket with one struct bpf_stack_build_id per entry of @ips.
+ *
+ * @map:      not referenced by this function
+ * @bucket:   destination; ->nr is set to @trace_nr and ->data is filled
+ * @ips:      instruction pointers of the trace
+ * @trace_nr: number of entries in @ips
+ * @user:     true when @ips is a user space stack
+ *
+ * Entries that resolve get status BPF_STACK_BUILD_ID_VALID plus build id
+ * and file offset. All others fall back to BPF_STACK_BUILD_ID_IP with
+ * the raw ip, and their build_id bytes are zeroed so that no stale
+ * content of a reused bucket is compared or copied out later.
+ */
+static void stack_map_get_build_id_offset(struct bpf_map *map,
+                                         struct stack_map_bucket *bucket,
+                                         u64 *ips, u32 trace_nr, bool user)
+{
+       u32 i;
+       struct vm_area_struct *vma;
+       struct bpf_stack_build_id *id_offs;
+
+       bucket->nr = trace_nr;
+       id_offs = (struct bpf_stack_build_id *)bucket->data;
+
+       /*
+        * We cannot do up_read() in nmi context, so build_id lookup is
+        * only supported for non-nmi events. If at some point, it is
+        * possible to run find_vma() without taking the semaphore, we
+        * would like to allow build_id lookup in nmi context.
+        *
+        * Same fallback is used for kernel stack (!user) on a stackmap
+        * with build_id.
+        */
+       if (!user || !current || !current->mm || in_nmi() ||
+           down_read_trylock(&current->mm->mmap_sem) == 0) {
+               /* cannot access current->mm, fall back to ips */
+               for (i = 0; i < trace_nr; i++) {
+                       id_offs[i].status = BPF_STACK_BUILD_ID_IP;
+                       id_offs[i].ip = ips[i];
+                       memset(id_offs[i].build_id, 0, BPF_BUILD_ID_SIZE);
+               }
+               return;
+       }
+
+       for (i = 0; i < trace_nr; i++) {
+               vma = find_vma(current->mm, ips[i]);
+               if (!vma || stack_map_get_build_id(vma, id_offs[i].build_id)) {
+                       /* per entry fall back to ips */
+                       id_offs[i].status = BPF_STACK_BUILD_ID_IP;
+                       id_offs[i].ip = ips[i];
+                       memset(id_offs[i].build_id, 0, BPF_BUILD_ID_SIZE);
+                       continue;
+               }
+               id_offs[i].offset = (vma->vm_pgoff << PAGE_SHIFT) + ips[i]
+                       - vma->vm_start;
+               id_offs[i].status = BPF_STACK_BUILD_ID_VALID;
+       }
+       up_read(&current->mm->mmap_sem);
+}
+
 BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map,
           u64, flags)
 {
        struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map);
        struct perf_callchain_entry *trace;
        struct stack_map_bucket *bucket, *new_bucket, *old_bucket;
-       u32 max_depth = map->value_size / 8;
+       u32 max_depth = map->value_size / stack_map_data_size(map);
        /* stack_map_alloc() checks that max_depth <= sysctl_perf_event_max_stack */
        u32 init_nr = sysctl_perf_event_max_stack - max_depth;
        u32 skip = flags & BPF_F_SKIP_FIELD_MASK;
@@ -128,6 +321,7 @@ BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map,
        bool user = flags & BPF_F_USER_STACK;
        bool kernel = !user;
        u64 *ips;
+       bool hash_matches;
 
        if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK |
                               BPF_F_FAST_STACK_CMP | BPF_F_REUSE_STACKID)))
@@ -156,24 +350,43 @@ BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map,
        id = hash & (smap->n_buckets - 1);
        bucket = READ_ONCE(smap->buckets[id]);
 
-       if (bucket && bucket->hash == hash) {
-               if (flags & BPF_F_FAST_STACK_CMP)
+       hash_matches = bucket && bucket->hash == hash;
+       /* fast cmp */
+       if (hash_matches && flags & BPF_F_FAST_STACK_CMP)
+               return id;
+
+       if (stack_map_use_build_id(map)) {
+               /* for build_id+offset, pop a bucket before slow cmp */
+               new_bucket = (struct stack_map_bucket *)
+                       pcpu_freelist_pop(&smap->freelist);
+               if (unlikely(!new_bucket))
+                       return -ENOMEM;
+               stack_map_get_build_id_offset(map, new_bucket, ips,
+                                             trace_nr, user);
+               trace_len = trace_nr * sizeof(struct bpf_stack_build_id);
+               if (hash_matches && bucket->nr == trace_nr &&
+                   memcmp(bucket->data, new_bucket->data, trace_len) == 0) {
+                       pcpu_freelist_push(&smap->freelist, &new_bucket->fnode);
                        return id;
-               if (bucket->nr == trace_nr &&
-                   memcmp(bucket->ip, ips, trace_len) == 0)
+               }
+               if (bucket && !(flags & BPF_F_REUSE_STACKID)) {
+                       pcpu_freelist_push(&smap->freelist, &new_bucket->fnode);
+                       return -EEXIST;
+               }
+       } else {
+               if (hash_matches && bucket->nr == trace_nr &&
+                   memcmp(bucket->data, ips, trace_len) == 0)
                        return id;
+               if (bucket && !(flags & BPF_F_REUSE_STACKID))
+                       return -EEXIST;
+
+               new_bucket = (struct stack_map_bucket *)
+                       pcpu_freelist_pop(&smap->freelist);
+               if (unlikely(!new_bucket))
+                       return -ENOMEM;
+               memcpy(new_bucket->data, ips, trace_len);
        }
 
-       /* this call stack is not in the map, try to add it */
-       if (bucket && !(flags & BPF_F_REUSE_STACKID))
-               return -EEXIST;
-
-       new_bucket = (struct stack_map_bucket *)
-               pcpu_freelist_pop(&smap->freelist);
-       if (unlikely(!new_bucket))
-               return -ENOMEM;
-
-       memcpy(new_bucket->ip, ips, trace_len);
        new_bucket->hash = hash;
        new_bucket->nr = trace_nr;
 
@@ -212,8 +425,8 @@ int bpf_stackmap_copy(struct bpf_map *map, void *key, void *value)
        if (!bucket)
                return -ENOENT;
 
-       trace_len = bucket->nr * sizeof(u64);
-       memcpy(value, bucket->ip, trace_len);
+       trace_len = bucket->nr * stack_map_data_size(map);
+       memcpy(value, bucket->data, trace_len);
        memset(value + trace_len, 0, map->value_size - trace_len);
 
        old_bucket = xchg(&smap->buckets[id], bucket);
index e24aa3241387de91483a89160e50ee41deede775..3aeb4ea2a93a80abe51570aa02174e61b503758d 100644 (file)
@@ -1315,7 +1315,8 @@ static int bpf_obj_get(const union bpf_attr *attr)
 
 #define BPF_PROG_ATTACH_LAST_FIELD attach_flags
 
-static int sockmap_get_from_fd(const union bpf_attr *attr, bool attach)
+static int sockmap_get_from_fd(const union bpf_attr *attr,
+                              int type, bool attach)
 {
        struct bpf_prog *prog = NULL;
        int ufd = attr->target_fd;
@@ -1329,8 +1330,7 @@ static int sockmap_get_from_fd(const union bpf_attr *attr, bool attach)
                return PTR_ERR(map);
 
        if (attach) {
-               prog = bpf_prog_get_type(attr->attach_bpf_fd,
-                                        BPF_PROG_TYPE_SK_SKB);
+               prog = bpf_prog_get_type(attr->attach_bpf_fd, type);
                if (IS_ERR(prog)) {
                        fdput(f);
                        return PTR_ERR(prog);
@@ -1382,9 +1382,11 @@ static int bpf_prog_attach(const union bpf_attr *attr)
        case BPF_CGROUP_DEVICE:
                ptype = BPF_PROG_TYPE_CGROUP_DEVICE;
                break;
+       case BPF_SK_MSG_VERDICT:
+               return sockmap_get_from_fd(attr, BPF_PROG_TYPE_SK_MSG, true);
        case BPF_SK_SKB_STREAM_PARSER:
        case BPF_SK_SKB_STREAM_VERDICT:
-               return sockmap_get_from_fd(attr, true);
+               return sockmap_get_from_fd(attr, BPF_PROG_TYPE_SK_SKB, true);
        default:
                return -EINVAL;
        }
@@ -1437,9 +1439,11 @@ static int bpf_prog_detach(const union bpf_attr *attr)
        case BPF_CGROUP_DEVICE:
                ptype = BPF_PROG_TYPE_CGROUP_DEVICE;
                break;
+       case BPF_SK_MSG_VERDICT:
+               return sockmap_get_from_fd(attr, BPF_PROG_TYPE_SK_MSG, false);
        case BPF_SK_SKB_STREAM_PARSER:
        case BPF_SK_SKB_STREAM_VERDICT:
-               return sockmap_get_from_fd(attr, false);
+               return sockmap_get_from_fd(attr, BPF_PROG_TYPE_SK_SKB, false);
        default:
                return -EINVAL;
        }
index eb79a34359c05b98e5d1f294ba5d5eff818e534b..e9f7c20691c1e685ee3feda6e684918118cd6c57 100644 (file)
@@ -1248,6 +1248,7 @@ static bool may_access_direct_pkt_data(struct bpf_verifier_env *env,
        case BPF_PROG_TYPE_XDP:
        case BPF_PROG_TYPE_LWT_XMIT:
        case BPF_PROG_TYPE_SK_SKB:
+       case BPF_PROG_TYPE_SK_MSG:
                if (meta)
                        return meta->pkt_access;
 
@@ -2071,7 +2072,8 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
        case BPF_MAP_TYPE_SOCKMAP:
                if (func_id != BPF_FUNC_sk_redirect_map &&
                    func_id != BPF_FUNC_sock_map_update &&
-                   func_id != BPF_FUNC_map_delete_elem)
+                   func_id != BPF_FUNC_map_delete_elem &&
+                   func_id != BPF_FUNC_msg_redirect_map)
                        goto error;
                break;
        default:
@@ -2109,6 +2111,7 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
                        goto error;
                break;
        case BPF_FUNC_sk_redirect_map:
+       case BPF_FUNC_msg_redirect_map:
                if (map->map_type != BPF_MAP_TYPE_SOCKMAP)
                        goto error;
                break;
index c0a9e310d71501948d60de5f499aa17cb087eaf6..c634e093951fc97a27eef1bf758985498d702a2a 100644 (file)
@@ -726,8 +726,7 @@ const struct bpf_prog_ops tracepoint_prog_ops = {
 static bool pe_prog_is_valid_access(int off, int size, enum bpf_access_type type,
                                    struct bpf_insn_access_aux *info)
 {
-       const int size_sp = FIELD_SIZEOF(struct bpf_perf_event_data,
-                                        sample_period);
+       const int size_u64 = sizeof(u64);
 
        if (off < 0 || off >= sizeof(struct bpf_perf_event_data))
                return false;
@@ -738,8 +737,13 @@ static bool pe_prog_is_valid_access(int off, int size, enum bpf_access_type type
 
        switch (off) {
        case bpf_ctx_range(struct bpf_perf_event_data, sample_period):
-               bpf_ctx_record_field_size(info, size_sp);
-               if (!bpf_ctx_narrow_access_ok(off, size, size_sp))
+               bpf_ctx_record_field_size(info, size_u64);
+               if (!bpf_ctx_narrow_access_ok(off, size, size_u64))
+                       return false;
+               break;
+       case bpf_ctx_range(struct bpf_perf_event_data, addr):
+               bpf_ctx_record_field_size(info, size_u64);
+               if (!bpf_ctx_narrow_access_ok(off, size, size_u64))
                        return false;
                break;
        default:
@@ -766,6 +770,14 @@ static u32 pe_prog_convert_ctx_access(enum bpf_access_type type,
                                      bpf_target_off(struct perf_sample_data, period, 8,
                                                     target_size));
                break;
+       case offsetof(struct bpf_perf_event_data, addr):
+               *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_perf_event_data_kern,
+                                                      data), si->dst_reg, si->src_reg,
+                                     offsetof(struct bpf_perf_event_data_kern, data));
+               *insn++ = BPF_LDX_MEM(BPF_DW, si->dst_reg, si->dst_reg,
+                                     bpf_target_off(struct perf_sample_data, addr, 8,
+                                                    target_size));
+               break;
        default:
                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_perf_event_data_kern,
                                                       regs), si->dst_reg, si->src_reg,
index 33edfa8372fd5e011f2b67df4039c21c46ca2448..c86f03fd9ea5cd43d78297842ecbc75a6dd426cb 100644 (file)
@@ -1890,6 +1890,202 @@ static const struct bpf_func_proto bpf_sk_redirect_map_proto = {
        .arg4_type      = ARG_ANYTHING,
 };
 
+/* Record the (map, key) redirect target in @msg and return SK_PASS.
+ * A non-zero @flags is treated as invalid input: @msg is left untouched
+ * and SK_DROP is returned.
+ */
+BPF_CALL_4(bpf_msg_redirect_map, struct sk_msg_buff *, msg,
+          struct bpf_map *, map, u32, key, u64, flags)
+{
+       if (likely(!flags)) {
+               msg->map = map;
+               msg->key = key;
+               msg->flags = flags;
+               return SK_PASS;
+       }
+
+       /* If user passes invalid input drop the packet. */
+       return SK_DROP;
+}
+
+/* Look up the socket selected by msg->map / msg->key and clear both
+ * fields afterwards. Returns NULL when no map has been recorded,
+ * otherwise the result of __sock_map_lookup_elem().
+ */
+struct sock *do_msg_redirect_map(struct sk_msg_buff *msg)
+{
+       struct sock *target;
+
+       if (!msg->map)
+               return NULL;
+
+       target = __sock_map_lookup_elem(msg->map, msg->key);
+       msg->key = 0;
+       msg->map = NULL;
+
+       return target;
+}
+
+/* Helper prototype for bpf_msg_redirect_map(): ctx, constant map
+ * pointer, then two unconstrained arguments (key, flags).
+ */
+static const struct bpf_func_proto bpf_msg_redirect_map_proto = {
+       .func           = bpf_msg_redirect_map,
+       .gpl_only       = false,
+       .ret_type       = RET_INTEGER,
+       .arg1_type      = ARG_PTR_TO_CTX,
+       .arg2_type      = ARG_CONST_MAP_PTR,
+       .arg3_type      = ARG_ANYTHING,
+       .arg4_type      = ARG_ANYTHING,
+};
+
+/* Store @bytes in msg->apply_bytes. Always returns 0. */
+BPF_CALL_2(bpf_msg_apply_bytes, struct sk_msg_buff *, msg, u32, bytes)
+{
+       msg->apply_bytes = bytes;
+       return 0;
+}
+
+/* Helper prototype for bpf_msg_apply_bytes(): ctx plus one
+ * unconstrained argument (bytes).
+ */
+static const struct bpf_func_proto bpf_msg_apply_bytes_proto = {
+       .func           = bpf_msg_apply_bytes,
+       .gpl_only       = false,
+       .ret_type       = RET_INTEGER,
+       .arg1_type      = ARG_PTR_TO_CTX,
+       .arg2_type      = ARG_ANYTHING,
+};
+
+/* Store @bytes in msg->cork_bytes. Always returns 0. */
+BPF_CALL_2(bpf_msg_cork_bytes, struct sk_msg_buff *, msg, u32, bytes)
+{
+       msg->cork_bytes = bytes;
+       return 0;
+}
+
+/* Helper prototype for bpf_msg_cork_bytes(): ctx plus one
+ * unconstrained argument (bytes).
+ */
+static const struct bpf_func_proto bpf_msg_cork_bytes_proto = {
+       .func           = bpf_msg_cork_bytes,
+       .gpl_only       = false,
+       .ret_type       = RET_INTEGER,
+       .arg1_type      = ARG_PTR_TO_CTX,
+       .arg2_type      = ARG_ANYTHING,
+};
+
+/* Make the byte range [@start, @end) of @msg linear and point
+ * msg->data / msg->data_end at it.
+ *
+ * @msg:   message whose scatterlist ring (sg_start..sg_end) is walked
+ * @start: offset of the first byte, counted from the element at sg_start
+ * @end:   offset one past the last byte, must be greater than @start
+ * @flags: must be zero
+ *
+ * If the range lies inside one element that is not marked in sg_copy,
+ * that element is used as is. Otherwise every element touched by the
+ * range is copied whole into a newly allocated buffer which replaces
+ * them in the ring, and the remaining entries are shifted down.
+ *
+ * Returns 0 on success, -EINVAL for bad arguments or a range outside the
+ * message, -ENOMEM if the buffer cannot be allocated.
+ */
+BPF_CALL_4(bpf_msg_pull_data,
+          struct sk_msg_buff *, msg, u32, start, u32, end, u64, flags)
+{
+       unsigned int len = 0, offset = 0, copy = 0, poffset = 0;
+       struct scatterlist *sg = msg->sg_data;
+       int first_sg, last_sg, i, shift;
+       unsigned int bytes_sg_total;
+       unsigned char *p, *to, *from;
+       int bytes = end - start;
+       struct page *page;
+
+       if (unlikely(flags || end <= start))
+               return -EINVAL;
+
+       /* First find the starting scatterlist element. offset is the
+        * number of bytes in front of element i, so it must only be
+        * advanced once element i is known not to contain start.
+        */
+       i = msg->sg_start;
+       do {
+               len = sg[i].length;
+               if (start < offset + len)
+                       break;
+               offset += len;
+               i++;
+               if (i == MAX_SKB_FRAGS)
+                       i = 0;
+       } while (i != msg->sg_end);
+
+       if (unlikely(start >= offset + len))
+               return -EINVAL;
+
+       first_sg = i;
+       /* The start may point into the sg element so we need to also
+        * account for the headroom.
+        */
+       bytes_sg_total = start - offset + bytes;
+       if (!msg->sg_copy[first_sg] && bytes_sg_total <= len)
+               goto out;
+
+       /* At this point we need to linearize multiple scatterlist
+        * elements or a single shared page. Either way we need to
+        * copy into a linear buffer exclusively owned by BPF. Then
+        * place the buffer in the scatterlist and fixup the original
+        * entries by removing the entries now in the linear buffer
+        * and shifting the remaining entries. For now we do not try
+        * to copy partial entries to avoid complexity of running out
+        * of sg_entry slots. The downside is reading a single byte
+        * will copy the entire sg entry.
+        */
+       do {
+               copy += sg[i].length;
+               i++;
+               if (i == MAX_SKB_FRAGS)
+                       i = 0;
+               if (bytes_sg_total <= copy)
+                       break;
+       } while (i != msg->sg_end);
+       last_sg = i;
+
+       if (unlikely(bytes_sg_total > copy))
+               return -EINVAL;
+
+       /* __GFP_COMP so that a multi page buffer is refcounted as one
+        * unit when it is later dropped through put_page().
+        */
+       page = alloc_pages(__GFP_NOWARN | GFP_ATOMIC | __GFP_COMP,
+                          get_order(copy));
+       if (unlikely(!page))
+               return -ENOMEM;
+       p = page_address(page);
+
+       /* poffset tracks the write position in the new buffer; offset
+        * must stay untouched, it is needed to compute msg->data below.
+        */
+       i = first_sg;
+       do {
+               from = sg_virt(&sg[i]);
+               len = sg[i].length;
+               to = p + poffset;
+
+               memcpy(to, from, len);
+               poffset += len;
+               sg[i].length = 0;
+               put_page(sg_page(&sg[i]));
+
+               i++;
+               if (i == MAX_SKB_FRAGS)
+                       i = 0;
+       } while (i != last_sg);
+
+       sg_set_page(&sg[first_sg], page, copy, 0);
+
+       /* To repair sg ring we need to shift entries. If we only
+        * had a single entry though we can just replace it and
+        * be done. Otherwise walk the ring and shift the entries.
+        * last_sg may have wrapped around and be below first_sg.
+        */
+       shift = last_sg > first_sg ?
+               last_sg - first_sg - 1 :
+               MAX_SKB_FRAGS - first_sg + last_sg - 1;
+       if (!shift)
+               goto out;
+
+       i = first_sg + 1;
+       if (i == MAX_SKB_FRAGS)
+               i = 0;
+       do {
+               int move_from;
+
+               if (i + shift >= MAX_SKB_FRAGS)
+                       move_from = i + shift - MAX_SKB_FRAGS;
+               else
+                       move_from = i + shift;
+
+               if (move_from == msg->sg_end)
+                       break;
+
+               sg[i] = sg[move_from];
+               sg[move_from].length = 0;
+               sg[move_from].page_link = 0;
+               sg[move_from].offset = 0;
+
+               i++;
+               if (i == MAX_SKB_FRAGS)
+                       i = 0;
+       } while (1);
+       msg->sg_end -= shift;
+       if (msg->sg_end < 0)
+               msg->sg_end += MAX_SKB_FRAGS;
+out:
+       /* start - offset is the headroom inside the first element */
+       msg->data = sg_virt(&sg[first_sg]) + start - offset;
+       msg->data_end = msg->data + bytes;
+
+       return 0;
+}
+
+/* Helper prototype for bpf_msg_pull_data(): ctx plus three
+ * unconstrained arguments (start, end, flags).
+ */
+static const struct bpf_func_proto bpf_msg_pull_data_proto = {
+       .func           = bpf_msg_pull_data,
+       .gpl_only       = false,
+       .ret_type       = RET_INTEGER,
+       .arg1_type      = ARG_PTR_TO_CTX,
+       .arg2_type      = ARG_ANYTHING,
+       .arg3_type      = ARG_ANYTHING,
+       .arg4_type      = ARG_ANYTHING,
+};
+
 BPF_CALL_1(bpf_get_cgroup_classid, const struct sk_buff *, skb)
 {
        return task_get_classid(skb);
@@ -2831,7 +3027,8 @@ bool bpf_helper_changes_pkt_data(void *func)
            func == bpf_l3_csum_replace ||
            func == bpf_l4_csum_replace ||
            func == bpf_xdp_adjust_head ||
-           func == bpf_xdp_adjust_meta)
+           func == bpf_xdp_adjust_meta ||
+           func == bpf_msg_pull_data)
                return true;
 
        return false;
@@ -3591,6 +3788,22 @@ static const struct bpf_func_proto *
        }
 }
 
+/* Map a helper id to the prototype an sk_msg program is allowed to call.
+ * The four msg helpers are resolved here; any other id is handed on to
+ * bpf_base_func_proto().
+ */
+static const struct bpf_func_proto *sk_msg_func_proto(enum bpf_func_id func_id)
+{
+       if (func_id == BPF_FUNC_msg_redirect_map)
+               return &bpf_msg_redirect_map_proto;
+       if (func_id == BPF_FUNC_msg_apply_bytes)
+               return &bpf_msg_apply_bytes_proto;
+       if (func_id == BPF_FUNC_msg_cork_bytes)
+               return &bpf_msg_cork_bytes_proto;
+       if (func_id == BPF_FUNC_msg_pull_data)
+               return &bpf_msg_pull_data_proto;
+
+       return bpf_base_func_proto(func_id);
+}
+
 static const struct bpf_func_proto *sk_skb_func_proto(enum bpf_func_id func_id)
 {
        switch (func_id) {
@@ -3980,6 +4193,32 @@ static bool sk_skb_is_valid_access(int off, int size,
        return bpf_skb_is_valid_access(off, size, type, info);
 }
 
+/* Decide whether an sk_msg program may access its context at @off.
+ *
+ * Only reads are permitted, and only naturally aligned 8-byte loads that
+ * start inside struct sk_msg_md. Loads of the data and data_end fields
+ * are additionally typed as packet pointers through @info.
+ *
+ * Returns true when the access is allowed, false otherwise.
+ */
+static bool sk_msg_is_valid_access(int off, int size,
+                                  enum bpf_access_type type,
+                                  struct bpf_insn_access_aux *info)
+{
+       if (type == BPF_WRITE)
+               return false;
+
+       /* Validate the width before it is used as a divisor below. */
+       if (size != sizeof(__u64))
+               return false;
+       if (off < 0 || off >= sizeof(struct sk_msg_md))
+               return false;
+       if (off % size != 0)
+               return false;
+
+       /* Only tag the register type once the access is known to be valid. */
+       switch (off) {
+       case offsetof(struct sk_msg_md, data):
+               info->reg_type = PTR_TO_PACKET;
+               break;
+       case offsetof(struct sk_msg_md, data_end):
+               info->reg_type = PTR_TO_PACKET_END;
+               break;
+       }
+
+       return true;
+}
+
 static u32 bpf_convert_ctx_access(enum bpf_access_type type,
                                  const struct bpf_insn *si,
                                  struct bpf_insn *insn_buf,
@@ -4778,6 +5017,29 @@ static u32 sk_skb_convert_ctx_access(enum bpf_access_type type,
        return insn - insn_buf;
 }
 
+/* Rewrite a load from struct sk_msg_md into a load of the matching field
+ * of struct sk_msg_buff. A single instruction is written to @insn_buf for
+ * the data and data_end fields; any other offset emits nothing.
+ *
+ * Returns the number of instructions written (1 or 0).
+ */
+static u32 sk_msg_convert_ctx_access(enum bpf_access_type type,
+                                    const struct bpf_insn *si,
+                                    struct bpf_insn *insn_buf,
+                                    struct bpf_prog *prog, u32 *target_size)
+{
+       switch (si->off) {
+       case offsetof(struct sk_msg_md, data_end):
+               insn_buf[0] = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_msg_buff, data_end),
+                                         si->dst_reg, si->src_reg,
+                                         offsetof(struct sk_msg_buff, data_end));
+               return 1;
+       case offsetof(struct sk_msg_md, data):
+               insn_buf[0] = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_msg_buff, data),
+                                         si->dst_reg, si->src_reg,
+                                         offsetof(struct sk_msg_buff, data));
+               return 1;
+       default:
+               return 0;
+       }
+}
+
 const struct bpf_verifier_ops sk_filter_verifier_ops = {
        .get_func_proto         = sk_filter_func_proto,
        .is_valid_access        = sk_filter_is_valid_access,
@@ -4868,6 +5130,15 @@ const struct bpf_verifier_ops sk_skb_verifier_ops = {
 const struct bpf_prog_ops sk_skb_prog_ops = {
 };
 
+/* Verifier callbacks for sk_msg programs: helper lookup, context access
+ * validation and context access rewriting.
+ */
+const struct bpf_verifier_ops sk_msg_verifier_ops = {
+       .get_func_proto         = sk_msg_func_proto,
+       .is_valid_access        = sk_msg_is_valid_access,
+       .convert_ctx_access     = sk_msg_convert_ctx_access,
+};
+
+/* No program-level callbacks are set for sk_msg programs. */
+const struct bpf_prog_ops sk_msg_prog_ops = {
+};
+
 int sk_detach_filter(struct sock *sk)
 {
        int ret = -ENOENT;
index a8962d9128953e20815165f55f1e1ec83daea0c6..f704324d12194331a411f60b5e1d8446530e85a4 100644 (file)
@@ -2237,6 +2237,67 @@ bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
 }
 EXPORT_SYMBOL(sk_page_frag_refill);
 
+/* Grow a scatterlist ring with memory taken from the socket's page frag.
+ *
+ * @sk:             socket providing the page frag; its memory accounting
+ *                  is charged for every chunk that is added
+ * @len:            total number of bytes the ring should describe; only
+ *                  the part beyond *@sg_curr_size is allocated
+ * @sg:             base of the scatterlist array, used as a ring of
+ *                  MAX_SKB_FRAGS entries
+ * @sg_start:       index at which the ring is considered full
+ * @sg_curr_index:  in/out, index of the next free entry
+ * @sg_curr_size:   in/out, number of bytes described so far
+ * @first_coalesce: a chunk may only be merged into the previous entry
+ *                  while the current index is greater than this value
+ *
+ * Returns 0 on success, -ENOMEM when the page frag cannot be refilled or
+ * the memory cannot be scheduled, and -ENOSPC when the index wraps around
+ * to @sg_start. *@sg_curr_index and *@sg_curr_size are written back on
+ * every path, including errors.
+ */
+int sk_alloc_sg(struct sock *sk, int len, struct scatterlist *sg,
+               int sg_start, int *sg_curr_index, unsigned int *sg_curr_size,
+               int first_coalesce)
+{
+       int sg_curr = *sg_curr_index, use = 0, rc = 0;
+       unsigned int size = *sg_curr_size;
+       struct page_frag *pfrag;
+       struct scatterlist *sge;
+
+       len -= size;
+       pfrag = sk_page_frag(sk);
+
+       while (len > 0) {
+               unsigned int orig_offset;
+
+               if (!sk_page_frag_refill(sk, pfrag)) {
+                       rc = -ENOMEM;
+                       goto out;
+               }
+
+               use = min_t(int, len, pfrag->size - pfrag->offset);
+
+               if (!sk_wmem_schedule(sk, use)) {
+                       rc = -ENOMEM;
+                       goto out;
+               }
+
+               sk_mem_charge(sk, use);
+               size += use;
+               orig_offset = pfrag->offset;
+               pfrag->offset += use;
+
+               /* Try to merge with the previous entry (sge), not with the
+                * first element of the array: the new chunk is contiguous
+                * only with whatever was added last.
+                */
+               sge = sg + sg_curr - 1;
+               if (sg_curr > first_coalesce && sg_page(sge) == pfrag->page &&
+                   sge->offset + sge->length == orig_offset) {
+                       sge->length += use;
+               } else {
+                       sge = sg + sg_curr;
+                       sg_unmark_end(sge);
+                       sg_set_page(sge, pfrag->page, use, orig_offset);
+                       /* The new entry holds its own page reference. */
+                       get_page(pfrag->page);
+                       sg_curr++;
+
+                       if (sg_curr == MAX_SKB_FRAGS)
+                               sg_curr = 0;
+
+                       /* Wrapped onto the start index: the ring is full. */
+                       if (sg_curr == sg_start) {
+                               rc = -ENOSPC;
+                               break;
+                       }
+               }
+
+               len -= use;
+       }
+out:
+       *sg_curr_size = size;
+       *sg_curr_index = sg_curr;
+       return rc;
+}
+EXPORT_SYMBOL(sk_alloc_sg);
+
 static void __lock_sock(struct sock *sk)
        __releases(&sk->sk_lock.slock)
        __acquires(&sk->sk_lock.slock)
index e553f84bde83dba139f18400065bfec21a9b14f6..d763fae1b574187908fc7882e38ff979f9c09623 100644 (file)
@@ -994,7 +994,9 @@ new_segment:
                        get_page(page);
                        skb_fill_page_desc(skb, i, page, offset, copy);
                }
-               skb_shinfo(skb)->tx_flags |= SKBTX_SHARED_FRAG;
+
+               if (!(flags & MSG_NO_SHARED_FRAGS))
+                       skb_shinfo(skb)->tx_flags |= SKBTX_SHARED_FRAG;
 
                skb->len += copy;
                skb->data_len += copy;
index f26376e954aeccadc0162a74b3c37af2d4ab0051..057a558ed6d71ffbb71db7e0a898c98303f39534 100644 (file)
@@ -87,71 +87,16 @@ static void trim_both_sgl(struct sock *sk, int target_size)
                target_size);
 }
 
-static int alloc_sg(struct sock *sk, int len, struct scatterlist *sg,
-                   int *sg_num_elem, unsigned int *sg_size,
-                   int first_coalesce)
-{
-       struct page_frag *pfrag;
-       unsigned int size = *sg_size;
-       int num_elem = *sg_num_elem, use = 0, rc = 0;
-       struct scatterlist *sge;
-       unsigned int orig_offset;
-
-       len -= size;
-       pfrag = sk_page_frag(sk);
-
-       while (len > 0) {
-               if (!sk_page_frag_refill(sk, pfrag)) {
-                       rc = -ENOMEM;
-                       goto out;
-               }
-
-               use = min_t(int, len, pfrag->size - pfrag->offset);
-
-               if (!sk_wmem_schedule(sk, use)) {
-                       rc = -ENOMEM;
-                       goto out;
-               }
-
-               sk_mem_charge(sk, use);
-               size += use;
-               orig_offset = pfrag->offset;
-               pfrag->offset += use;
-
-               sge = sg + num_elem - 1;
-               if (num_elem > first_coalesce && sg_page(sg) == pfrag->page &&
-                   sg->offset + sg->length == orig_offset) {
-                       sg->length += use;
-               } else {
-                       sge++;
-                       sg_unmark_end(sge);
-                       sg_set_page(sge, pfrag->page, use, orig_offset);
-                       get_page(pfrag->page);
-                       ++num_elem;
-                       if (num_elem == MAX_SKB_FRAGS) {
-                               rc = -ENOSPC;
-                               break;
-                       }
-               }
-
-               len -= use;
-       }
-       goto out;
-
-out:
-       *sg_size = size;
-       *sg_num_elem = num_elem;
-       return rc;
-}
-
 static int alloc_encrypted_sg(struct sock *sk, int len)
 {
        struct tls_context *tls_ctx = tls_get_ctx(sk);
        struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx);
        int rc = 0;
 
-       rc = alloc_sg(sk, len, ctx->sg_encrypted_data,
-                     &ctx->sg_encrypted_num_elem, &ctx->sg_encrypted_size, 0);
+       rc = sk_alloc_sg(sk, len,
+                        ctx->sg_encrypted_data, 0,
+                        &ctx->sg_encrypted_num_elem,
+                        &ctx->sg_encrypted_size, 0);
 
        return rc;
 }
@@ -162,9 +107,9 @@ static int alloc_plaintext_sg(struct sock *sk, int len)
        struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx);
        int rc = 0;
 
-       rc = alloc_sg(sk, len, ctx->sg_plaintext_data,
-                     &ctx->sg_plaintext_num_elem, &ctx->sg_plaintext_size,
-                     tls_ctx->pending_open_record_frags);
+       rc = sk_alloc_sg(sk, len, ctx->sg_plaintext_data, 0,
+                        &ctx->sg_plaintext_num_elem, &ctx->sg_plaintext_size,
+                        tls_ctx->pending_open_record_frags);
 
        return rc;
 }
index 69806d74fa53bac11aa4b113661539777a300a6b..b1a310c3ae899b043c70085fc71a08b698ea0308 100644 (file)
@@ -67,6 +67,7 @@ static int load_and_attach(const char *event, struct bpf_insn *prog, int size)
        bool is_cgroup_sk = strncmp(event, "cgroup/sock", 11) == 0;
        bool is_sockops = strncmp(event, "sockops", 7) == 0;
        bool is_sk_skb = strncmp(event, "sk_skb", 6) == 0;
+       bool is_sk_msg = strncmp(event, "sk_msg", 6) == 0;
        size_t insns_cnt = size / sizeof(struct bpf_insn);
        enum bpf_prog_type prog_type;
        char buf[256];
@@ -96,6 +97,8 @@ static int load_and_attach(const char *event, struct bpf_insn *prog, int size)
                prog_type = BPF_PROG_TYPE_SOCK_OPS;
        } else if (is_sk_skb) {
                prog_type = BPF_PROG_TYPE_SK_SKB;
+       } else if (is_sk_msg) {
+               prog_type = BPF_PROG_TYPE_SK_MSG;
        } else {
                printf("Unknown event '%s'\n", event);
                return -1;
@@ -113,7 +116,7 @@ static int load_and_attach(const char *event, struct bpf_insn *prog, int size)
        if (is_xdp || is_perf_event || is_cgroup_skb || is_cgroup_sk)
                return 0;
 
-       if (is_socket || is_sockops || is_sk_skb) {
+       if (is_socket || is_sockops || is_sk_skb || is_sk_msg) {
                if (is_socket)
                        event += 6;
                else
@@ -589,7 +592,8 @@ static int do_load_bpf_file(const char *path, fixup_map_cb fixup_map)
                    memcmp(shname, "socket", 6) == 0 ||
                    memcmp(shname, "cgroup/", 7) == 0 ||
                    memcmp(shname, "sockops", 7) == 0 ||
-                   memcmp(shname, "sk_skb", 6) == 0) {
+                   memcmp(shname, "sk_skb", 6) == 0 ||
+                   memcmp(shname, "sk_msg", 6) == 0) {
                        ret = load_and_attach(shname, data->d_buf,
                                              data->d_size);
                        if (ret != 0)
index a77a583d94d42ca375a7519a066ff4f648e05226..7068fbdde951f1ad7b64ff6ba28ff50c382a14bf 100644 (file)
@@ -39,6 +39,7 @@ int bpf_prog1(struct bpf_perf_event_data *ctx)
 {
        char time_fmt1[] = "Time Enabled: %llu, Time Running: %llu";
        char time_fmt2[] = "Get Time Failed, ErrCode: %d";
+       char addr_fmt[] = "Address recorded on event: %llx";
        char fmt[] = "CPU-%d period %lld ip %llx";
        u32 cpu = bpf_get_smp_processor_id();
        struct bpf_perf_event_value value_buf;
@@ -64,6 +65,9 @@ int bpf_prog1(struct bpf_perf_event_data *ctx)
        else
          bpf_trace_printk(time_fmt2, sizeof(time_fmt2), ret);
 
+       if (ctx->addr != 0)
+         bpf_trace_printk(addr_fmt, sizeof(addr_fmt), ctx->addr);
+
        val = bpf_map_lookup_elem(&counts, &key);
        if (val)
                (*val)++;
index bf4f1b6d9a52e0d33e17189be30900f90defd266..56f7a259a7c92500c41a850b45ff66ab63e47158 100644 (file)
@@ -215,6 +215,17 @@ static void test_bpf_perf_event(void)
                /* Intel Instruction Retired */
                .config = 0xc0,
        };
+       struct perf_event_attr attr_type_raw_lock_load = {
+               .sample_freq = SAMPLE_FREQ,
+               .freq = 1,
+               .type = PERF_TYPE_RAW,
+               /* Intel MEM_UOPS_RETIRED.LOCK_LOADS */
+               .config = 0x21d0,
+               /* Request to record lock address from PEBS */
+               .sample_type = PERF_SAMPLE_ADDR,
+               /* Record address value requires precise event */
+               .precise_ip = 2,
+       };
 
        printf("Test HW_CPU_CYCLES\n");
        test_perf_event_all_cpu(&attr_type_hw);
@@ -236,6 +247,10 @@ static void test_bpf_perf_event(void)
        test_perf_event_all_cpu(&attr_type_raw);
        test_perf_event_task(&attr_type_raw);
 
+       printf("Test Lock Load\n");
+       test_perf_event_all_cpu(&attr_type_raw_lock_load);
+       test_perf_event_task(&attr_type_raw_lock_load);
+
        printf("*** PASS ***\n");
 }
 
index 52b0053274f425a6a07eb70dfbce89ca9c32d4fb..9ad5ba79c85ae1a16710a337e32d4aaf08bfe776 100644 (file)
@@ -43,6 +43,42 @@ struct bpf_map_def SEC("maps") sock_map = {
        .max_entries = 20,
 };
 
+/* Sockmap with 20 slots. NOTE(review): presumably holds the sockets the
+ * sk_msg programs get attached to from user space -- confirm in the loader.
+ */
+struct bpf_map_def SEC("maps") sock_map_txmsg = {
+       .type = BPF_MAP_TYPE_SOCKMAP,
+       .key_size = sizeof(int),
+       .value_size = sizeof(int),
+       .max_entries = 20,
+};
+
+/* Single-slot sockmap; the redirecting sk_msg programs pass it, with
+ * key 0, to bpf_msg_redirect_map().
+ */
+struct bpf_map_def SEC("maps") sock_map_redir = {
+       .type = BPF_MAP_TYPE_SOCKMAP,
+       .key_size = sizeof(int),
+       .value_size = sizeof(int),
+       .max_entries = 1,
+};
+
+/* One-element array; element 0 is the byte count the sk_msg programs
+ * pass to bpf_msg_apply_bytes().
+ */
+struct bpf_map_def SEC("maps") sock_apply_bytes = {
+       .type = BPF_MAP_TYPE_ARRAY,
+       .key_size = sizeof(int),
+       .value_size = sizeof(int),
+       .max_entries = 1
+};
+
+/* One-element array; element 0 is the byte count the sk_msg programs
+ * pass to bpf_msg_cork_bytes().
+ */
+struct bpf_map_def SEC("maps") sock_cork_bytes = {
+       .type = BPF_MAP_TYPE_ARRAY,
+       .key_size = sizeof(int),
+       .value_size = sizeof(int),
+       .max_entries = 1
+};
+
+/* Two-element array; elements 0 and 1 are the start and end arguments
+ * the sk_msg programs pass to bpf_msg_pull_data().
+ */
+struct bpf_map_def SEC("maps") sock_pull_bytes = {
+       .type = BPF_MAP_TYPE_ARRAY,
+       .key_size = sizeof(int),
+       .value_size = sizeof(int),
+       .max_entries = 2
+};
+
+
 SEC("sk_skb1")
 int bpf_prog1(struct __sk_buff *skb)
 {
@@ -105,4 +141,165 @@ int bpf_sockmap(struct bpf_sock_ops *skops)
 
        return 0;
 }
+
+/* sk_msg1: apply whatever apply/cork/pull settings are found in the
+ * control maps (helper return codes are ignored) and pass the message.
+ */
+SEC("sk_msg1")
+int bpf_prog4(struct sk_msg_md *msg)
+{
+       int key0 = 0, key1 = 1;
+       int *apply, *cork, *pull_start, *pull_end;
+
+       apply = bpf_map_lookup_elem(&sock_apply_bytes, &key0);
+       if (apply)
+               bpf_msg_apply_bytes(msg, *apply);
+
+       cork = bpf_map_lookup_elem(&sock_cork_bytes, &key0);
+       if (cork)
+               bpf_msg_cork_bytes(msg, *cork);
+
+       pull_start = bpf_map_lookup_elem(&sock_pull_bytes, &key0);
+       pull_end = bpf_map_lookup_elem(&sock_pull_bytes, &key1);
+       if (pull_start && pull_end)
+               bpf_msg_pull_data(msg, *pull_start, *pull_end, 0);
+
+       return SK_PASS;
+}
+
+/* sk_msg2: noisy variant of the pass program. Applies the apply/cork/pull
+ * settings found in the control maps, traces the helper results and the
+ * message length before and after the pull, then passes the message.
+ */
+SEC("sk_msg2")
+int bpf_prog5(struct sk_msg_md *msg)
+{
+       int err1 = -1, err2 = -1, zero = 0, one = 1;
+       int *bytes, *start, *end, len1, len2;
+
+       bytes = bpf_map_lookup_elem(&sock_apply_bytes, &zero);
+       if (bytes)
+               err1 = bpf_msg_apply_bytes(msg, *bytes);
+       bytes = bpf_map_lookup_elem(&sock_cork_bytes, &zero);
+       if (bytes)
+               err2 = bpf_msg_cork_bytes(msg, *bytes);
+       len1 = (__u64)msg->data_end - (__u64)msg->data;
+       start = bpf_map_lookup_elem(&sock_pull_bytes, &zero);
+       end = bpf_map_lookup_elem(&sock_pull_bytes, &one);
+       if (start && end) {
+               int err;
+
+               /* Both pointers were checked just above, so they can be
+                * dereferenced directly.
+                */
+               bpf_printk("sk_msg2: pull(%i:%i)\n", *start, *end);
+               err = bpf_msg_pull_data(msg, *start, *end, 0);
+               if (err)
+                       bpf_printk("sk_msg2: pull_data err %i\n",
+                                  err);
+               len2 = (__u64)msg->data_end - (__u64)msg->data;
+               bpf_printk("sk_msg2: length update %i->%i\n",
+                          len1, len2);
+       }
+       bpf_printk("sk_msg2: data length %i err1 %i err2 %i\n",
+                  len1, err1, err2);
+       return SK_PASS;
+}
+
+/* sk_msg3: apply whatever apply/cork/pull settings are found in the
+ * control maps (helper return codes are ignored), then redirect the
+ * message to the socket stored at key 0 of sock_map_redir.
+ */
+SEC("sk_msg3")
+int bpf_prog6(struct sk_msg_md *msg)
+{
+       int key0 = 0, key1 = 1;
+       int *apply, *cork, *pull_start, *pull_end;
+
+       apply = bpf_map_lookup_elem(&sock_apply_bytes, &key0);
+       if (apply)
+               bpf_msg_apply_bytes(msg, *apply);
+
+       cork = bpf_map_lookup_elem(&sock_cork_bytes, &key0);
+       if (cork)
+               bpf_msg_cork_bytes(msg, *cork);
+
+       pull_start = bpf_map_lookup_elem(&sock_pull_bytes, &key0);
+       pull_end = bpf_map_lookup_elem(&sock_pull_bytes, &key1);
+       if (pull_start && pull_end)
+               bpf_msg_pull_data(msg, *pull_start, *pull_end, 0);
+
+       return bpf_msg_redirect_map(msg, &sock_map_redir, key0, 0);
+}
+
+/* sk_msg4: noisy variant of the redirect program. Applies the
+ * apply/cork/pull settings found in the control maps, traces the helper
+ * results and the message length before and after the pull, then
+ * redirects the message to the socket stored at key 0 of sock_map_redir.
+ */
+SEC("sk_msg4")
+int bpf_prog7(struct sk_msg_md *msg)
+{
+       int err1 = 0, err2 = 0, zero = 0, one = 1;
+       int *bytes, *start, *end, len1, len2;
+
+       bytes = bpf_map_lookup_elem(&sock_apply_bytes, &zero);
+       if (bytes)
+               err1 = bpf_msg_apply_bytes(msg, *bytes);
+       bytes = bpf_map_lookup_elem(&sock_cork_bytes, &zero);
+       if (bytes)
+               err2 = bpf_msg_cork_bytes(msg, *bytes);
+       len1 = (__u64)msg->data_end - (__u64)msg->data;
+       start = bpf_map_lookup_elem(&sock_pull_bytes, &zero);
+       end = bpf_map_lookup_elem(&sock_pull_bytes, &one);
+       if (start && end) {
+               int err;
+
+               /* Both pointers were checked just above, so they can be
+                * dereferenced directly. Trace lines carry this program's
+                * own section name so they can be told apart from sk_msg2.
+                */
+               bpf_printk("sk_msg4: pull(%i:%i)\n", *start, *end);
+               err = bpf_msg_pull_data(msg, *start, *end, 0);
+               if (err)
+                       bpf_printk("sk_msg4: pull_data err %i\n",
+                                  err);
+               len2 = (__u64)msg->data_end - (__u64)msg->data;
+               bpf_printk("sk_msg4: length update %i->%i\n",
+                          len1, len2);
+       }
+       bpf_printk("sk_msg4: redirect(%iB) err1=%i err2=%i\n",
+                  len1, err1, err2);
+       return bpf_msg_redirect_map(msg, &sock_map_redir, zero, 0);
+}
+
+/* sk_msg5: pass the message only when an apply-bytes value is found in
+ * sock_apply_bytes and bpf_msg_apply_bytes() accepts it; drop otherwise.
+ */
+SEC("sk_msg5")
+int bpf_prog8(struct sk_msg_md *msg)
+{
+       int *bytes, zero = 0;
+
+       bytes = bpf_map_lookup_elem(&sock_apply_bytes, &zero);
+       if (!bytes)
+               return SK_DROP;
+
+       if (bpf_msg_apply_bytes(msg, *bytes))
+               return SK_DROP;
+
+       return SK_PASS;
+}
+/* sk_msg6: when a cork value is found in sock_cork_bytes and the visible
+ * data is shorter than it, ask for the message to be corked; drop only if
+ * bpf_msg_cork_bytes() fails. Everything else passes.
+ */
+SEC("sk_msg6")
+int bpf_prog9(struct sk_msg_md *msg)
+{
+       void *msg_end = (void *)(long) msg->data_end;
+       void *msg_start = (void *)(long) msg->data;
+       int key = 0, *cork;
+
+       cork = bpf_map_lookup_elem(&sock_cork_bytes, &key);
+       if (!cork)
+               return SK_PASS;
+
+       /* Enough data is already visible: nothing to cork. */
+       if (((__u64)msg_end - (__u64)msg_start) >= *cork)
+               return SK_PASS;
+
+       if (bpf_msg_cork_bytes(msg, *cork))
+               return SK_DROP;
+
+       return SK_PASS;
+}
+
+/* sk_msg7: apply whatever apply/cork/pull settings are found in the
+ * control maps (helper return codes are ignored), then drop the message.
+ */
+SEC("sk_msg7")
+int bpf_prog10(struct sk_msg_md *msg)
+{
+       int key0 = 0, key1 = 1;
+       int *apply, *cork, *pull_start, *pull_end;
+
+       apply = bpf_map_lookup_elem(&sock_apply_bytes, &key0);
+       if (apply)
+               bpf_msg_apply_bytes(msg, *apply);
+
+       cork = bpf_map_lookup_elem(&sock_cork_bytes, &key0);
+       if (cork)
+               bpf_msg_cork_bytes(msg, *cork);
+
+       pull_start = bpf_map_lookup_elem(&sock_pull_bytes, &key0);
+       pull_end = bpf_map_lookup_elem(&sock_pull_bytes, &key1);
+       if (pull_start && pull_end)
+               bpf_msg_pull_data(msg, *pull_start, *pull_end, 0);
+
+       return SK_DROP;
+}
+
+
 char _license[] SEC("license") = "GPL";
diff --git a/samples/sockmap/sockmap_test.sh b/samples/sockmap/sockmap_test.sh
new file mode 100755 (executable)
index 0000000..6d8cc40
--- /dev/null
@@ -0,0 +1,450 @@
+#Test a bunch of positive cases to verify basic functionality
+for prog in "--txmsg" "--txmsg_redir" "--txmsg_drop"; do
+for t in "sendmsg" "sendpage"; do
+for r in 1 10 100; do
+       for i in 1 10 100; do
+               for l in 1 10 100; do
+                       TEST="./sockmap --cgroup /mnt/cgroup2/ -t $t -r $r -i $i -l $l $prog"
+                       echo $TEST
+                       $TEST
+                       sleep 2
+               done
+       done
+done
+done
+done
+
+#Test max iov
+t="sendmsg"
+r=1
+i=1024
+l=1
+prog="--txmsg"
+
+TEST="./sockmap --cgroup /mnt/cgroup2/ -t $t -r $r -i $i -l $l $prog"
+echo $TEST
+$TEST
+sleep 2
+prog="--txmsg_redir"
+TEST="./sockmap --cgroup /mnt/cgroup2/ -t $t -r $r -i $i -l $l $prog"
+echo $TEST
+$TEST
+
+# Test max iov with 1k send
+
+t="sendmsg"
+r=1
+i=1024
+l=1024
+prog="--txmsg"
+
+TEST="./sockmap --cgroup /mnt/cgroup2/ -t $t -r $r -i $i -l $l $prog"
+echo $TEST
+$TEST
+sleep 2
+prog="--txmsg_redir"
+TEST="./sockmap --cgroup /mnt/cgroup2/ -t $t -r $r -i $i -l $l $prog"
+echo $TEST
+$TEST
+sleep 2
+
+# Test apply with 1B
+r=1
+i=1024
+l=1024
+prog="--txmsg_apply 1"
+
+for t in "sendmsg" "sendpage"; do
+       TEST="./sockmap --cgroup /mnt/cgroup2/ -t $t -r $r -i $i -l $l $prog"
+       echo $TEST
+       $TEST
+       sleep 2
+done
+
+# Test apply with larger value than send
+r=1
+i=8
+l=1024
+prog="--txmsg_apply 2048"
+
+for t in "sendmsg" "sendpage"; do
+       TEST="./sockmap --cgroup /mnt/cgroup2/ -t $t -r $r -i $i -l $l $prog"
+       echo $TEST
+       $TEST
+       sleep 2
+done
+
+# Test apply with apply that never reaches limit
+r=1024
+i=1
+l=1
+prog="--txmsg_apply 2048"
+
+for t in "sendmsg" "sendpage"; do
+       TEST="./sockmap --cgroup /mnt/cgroup2/ -t $t -r $r -i $i -l $l $prog"
+       echo $TEST
+       $TEST
+       sleep 2
+done
+
+# Test apply and redirect with 1B
+r=1
+i=1024
+l=1024
+prog="--txmsg_redir --txmsg_apply 1"
+
+for t in "sendmsg" "sendpage"; do
+       TEST="./sockmap --cgroup /mnt/cgroup2/ -t $t -r $r -i $i -l $l $prog"
+       echo $TEST
+       $TEST
+       sleep 2
+done
+
+# Test apply and redirect with larger value than send
+r=1
+i=8
+l=1024
+prog="--txmsg_redir --txmsg_apply 2048"
+
+for t in "sendmsg" "sendpage"; do
+       TEST="./sockmap --cgroup /mnt/cgroup2/ -t $t -r $r -i $i -l $l $prog"
+       echo $TEST
+       $TEST
+       sleep 2
+done
+
+# Test apply and redirect with apply that never reaches limit
+r=1024
+i=1
+l=1
+prog="--txmsg_apply 2048"
+
+for t in "sendmsg" "sendpage"; do
+       TEST="./sockmap --cgroup /mnt/cgroup2/ -t $t -r $r -i $i -l $l $prog"
+       echo $TEST
+       $TEST
+       sleep 2
+done
+
+# Test cork with 1B not really useful but test it anyways
+r=1
+i=1024
+l=1024
+prog="--txmsg_cork 1"
+
+for t in "sendpage" "sendmsg"; do
+       TEST="./sockmap --cgroup /mnt/cgroup2/ -t $t -r $r -i $i -l $l $prog"
+       echo $TEST
+       $TEST
+       sleep 2
+done
+
+# Test cork with a more reasonable 100B
+r=1
+i=1000
+l=1000
+prog="--txmsg_cork 100"
+
+for t in "sendpage" "sendmsg"; do
+       TEST="./sockmap --cgroup /mnt/cgroup2/ -t $t -r $r -i $i -l $l $prog"
+       echo $TEST
+       $TEST
+       sleep 2
+done
+
+# Test cork with larger value than send
+r=1
+i=8
+l=1024
+prog="--txmsg_cork 2048"
+
+for t in "sendpage" "sendmsg"; do
+       TEST="./sockmap --cgroup /mnt/cgroup2/ -t $t -r $r -i $i -l $l $prog"
+       echo $TEST
+       $TEST
+       sleep 2
+done
+
+# Test cork with cork that never reaches limit
+r=1024
+i=1
+l=1
+prog="--txmsg_cork 2048"
+
+for t in "sendpage" "sendmsg"; do
+       TEST="./sockmap --cgroup /mnt/cgroup2/ -t $t -r $r -i $i -l $l $prog"
+       echo $TEST
+       $TEST
+       sleep 2
+done
+
+r=1
+i=1024
+l=1024
+prog="--txmsg_redir --txmsg_cork 1"
+
+for t in "sendpage" "sendmsg"; do
+       TEST="./sockmap --cgroup /mnt/cgroup2/ -t $t -r $r -i $i -l $l $prog"
+       echo $TEST
+       $TEST
+       sleep 2
+done
+
+# Test cork and redirect with a more reasonable 100B
+r=1
+i=1000
+l=1000
+prog="--txmsg_redir --txmsg_cork 100"
+
+for t in "sendpage" "sendmsg"; do
+       TEST="./sockmap --cgroup /mnt/cgroup2/ -t $t -r $r -i $i -l $l $prog"
+       echo $TEST
+       $TEST
+       sleep 2
+done
+
+# Test cork and redirect with larger value than send
+r=1
+i=8
+l=1024
+prog="--txmsg_redir --txmsg_cork 2048"
+
+for t in "sendpage" "sendmsg"; do
+       TEST="./sockmap --cgroup /mnt/cgroup2/ -t $t -r $r -i $i -l $l $prog"
+       echo $TEST
+       $TEST
+       sleep 2
+done
+
+# Test cork with cork that never reaches limit
+r=1024
+i=1
+l=1
+prog="--txmsg_cork 2048"
+
+for t in "sendpage" "sendmsg"; do
+       TEST="./sockmap --cgroup /mnt/cgroup2/ -t $t -r $r -i $i -l $l $prog"
+       echo $TEST
+       $TEST
+       sleep 2
+done
+
+
+# mix and match cork and apply not really useful but valid programs
+
+# Test apply < cork
+r=100
+i=1
+l=5
+prog="--txmsg_apply 10 --txmsg_cork 100"
+for t in "sendpage" "sendmsg"; do
+       TEST="./sockmap --cgroup /mnt/cgroup2/ -t $t -r $r -i $i -l $l $prog"
+       echo $TEST
+       $TEST
+       sleep 2
+done
+
+# Try again with larger sizes so we hit overflow case
+r=100
+i=1000
+l=2048
+prog="--txmsg_apply 4096 --txmsg_cork 8096"
+for t in "sendpage" "sendmsg"; do
+       TEST="./sockmap --cgroup /mnt/cgroup2/ -t $t -r $r -i $i -l $l $prog"
+       echo $TEST
+       $TEST
+       sleep 2
+done
+
+# Test apply > cork
+r=100
+i=1
+l=5
+prog="--txmsg_apply 100 --txmsg_cork 10"
+for t in "sendpage" "sendmsg"; do
+       TEST="./sockmap --cgroup /mnt/cgroup2/ -t $t -r $r -i $i -l $l $prog"
+       echo $TEST
+       $TEST
+       sleep 2
+done
+
+# Again with larger sizes so we hit overflow cases
+r=100
+i=1000
+l=2048
+prog="--txmsg_apply 8096 --txmsg_cork 4096"
+for t in "sendpage" "sendmsg"; do
+       TEST="./sockmap --cgroup /mnt/cgroup2/ -t $t -r $r -i $i -l $l $prog"
+       echo $TEST
+       $TEST
+       sleep 2
+done
+
+
+# Test apply = cork
+r=100
+i=1
+l=5
+prog="--txmsg_apply 10 --txmsg_cork 10"
+for t in "sendpage" "sendmsg"; do
+       TEST="./sockmap --cgroup /mnt/cgroup2/ -t $t -r $r -i $i -l $l $prog"
+       echo $TEST
+       $TEST
+       sleep 2
+done
+
+r=100
+i=1000
+l=2048
+prog="--txmsg_apply 4096 --txmsg_cork 4096"
+for t in "sendpage" "sendmsg"; do
+       TEST="./sockmap --cgroup /mnt/cgroup2/ -t $t -r $r -i $i -l $l $prog"
+       echo $TEST
+       $TEST
+       sleep 2
+done
+
+# Test apply < cork, with redirect
+r=100
+i=1
+l=5
+prog="--txmsg_redir --txmsg_apply 10 --txmsg_cork 100"
+for t in "sendpage" "sendmsg"; do
+       TEST="./sockmap --cgroup /mnt/cgroup2/ -t $t -r $r -i $i -l $l $prog"
+       echo $TEST
+       $TEST
+       sleep 2
+done
+
+# Try again with redirect and larger sizes so we hit the overflow case
+r=100
+i=1000
+l=2048
+prog="--txmsg_redir --txmsg_apply 4096 --txmsg_cork 8096"
+for t in "sendpage" "sendmsg"; do
+       TEST="./sockmap --cgroup /mnt/cgroup2/ -t $t -r $r -i $i -l $l $prog"
+       echo $TEST
+       $TEST
+       sleep 2
+done
+
+# Test apply > cork, with redirect
+r=100
+i=1
+l=5
+prog="--txmsg_redir --txmsg_apply 100 --txmsg_cork 10"
+for t in "sendpage" "sendmsg"; do
+       TEST="./sockmap --cgroup /mnt/cgroup2/ -t $t -r $r -i $i -l $l $prog"
+       echo $TEST
+       $TEST
+       sleep 2
+done
+
+# Again with redirect and larger sizes so we hit the overflow cases
+r=100
+i=1000
+l=2048
+prog="--txmsg_redir --txmsg_apply 8096 --txmsg_cork 4096"
+for t in "sendpage" "sendmsg"; do
+       TEST="./sockmap --cgroup /mnt/cgroup2/ -t $t -r $r -i $i -l $l $prog"
+       echo $TEST
+       $TEST
+       sleep 2
+done
+
+
+# Test apply = cork, with redirect
+r=100
+i=1
+l=5
+prog="--txmsg_redir --txmsg_apply 10 --txmsg_cork 10"
+for t in "sendpage" "sendmsg"; do
+       TEST="./sockmap --cgroup /mnt/cgroup2/ -t $t -r $r -i $i -l $l $prog"
+       echo $TEST
+       $TEST
+       sleep 2
+done
+
+r=100
+i=1000
+l=2048
+prog="--txmsg_redir --txmsg_apply 4096 --txmsg_cork 4096"
+for t in "sendpage" "sendmsg"; do
+       TEST="./sockmap --cgroup /mnt/cgroup2/ -t $t -r $r -i $i -l $l $prog"
+       echo $TEST
+       $TEST
+       sleep 2
+done
+
+# Tests for bpf_msg_pull_data()
+for i in `seq 99 100 1600`; do
+       TEST="./sockmap --cgroup /mnt/cgroup2/ -t sendpage -r 16 -i 1 -l 100 \
+               --txmsg --txmsg_start 0 --txmsg_end $i --txmsg_cork 1600"
+       echo $TEST
+       $TEST
+       sleep 2
+done
+
+for i in `seq 199 100 1600`; do
+       TEST="./sockmap --cgroup /mnt/cgroup2/ -t sendpage -r 16 -i 1 -l 100 \
+               --txmsg --txmsg_start 100 --txmsg_end $i --txmsg_cork 1600"
+       echo $TEST
+       $TEST
+       sleep 2
+done
+
+TEST="./sockmap --cgroup /mnt/cgroup2/ -t sendpage -r 16 -i 1 -l 100 \
+       --txmsg --txmsg_start 1500 --txmsg_end 1600 --txmsg_cork 1600"
+echo $TEST
+$TEST
+sleep 2
+
+TEST="./sockmap --cgroup /mnt/cgroup2/ -t sendpage -r 16 -i 1 -l 100 \
+       --txmsg --txmsg_start 1111 --txmsg_end 1112 --txmsg_cork 1600"
+echo $TEST
+$TEST
+sleep 2
+
+TEST="./sockmap --cgroup /mnt/cgroup2/ -t sendpage -r 16 -i 1 -l 100 \
+       --txmsg --txmsg_start 1111 --txmsg_end 0 --txmsg_cork 1600"
+echo $TEST
+$TEST
+sleep 2
+
+TEST="./sockmap --cgroup /mnt/cgroup2/ -t sendpage -r 16 -i 1 -l 100 \
+       --txmsg --txmsg_start 0 --txmsg_end 1601 --txmsg_cork 1600"
+echo $TEST
+$TEST
+sleep 2
+
+TEST="./sockmap --cgroup /mnt/cgroup2/ -t sendpage -r 16 -i 1 -l 100 \
+       --txmsg --txmsg_start 0 --txmsg_end 1601 --txmsg_cork 1602"
+echo $TEST
+$TEST
+sleep 2
+
+# Run through gamut again with start and end
+for prog in "--txmsg" "--txmsg_redir" "--txmsg_drop"; do
+for t in "sendmsg" "sendpage"; do
+for r in 1 10 100; do
+       for i in 1 10 100; do
+               for l in 1 10 100; do
+                       TEST="./sockmap --cgroup /mnt/cgroup2/ -t $t -r $r -i $i -l $l $prog --txmsg_start 1 --txmsg_end 2"
+                       echo $TEST
+                       $TEST
+                       sleep 2
+               done
+       done
+done
+done
+done
+
+# Some specific tests to cover specific code paths
+./sockmap --cgroup /mnt/cgroup2/ -t sendpage \
+       -r 5 -i 1 -l 1 --txmsg_redir --txmsg_cork 5 --txmsg_apply 3
+./sockmap --cgroup /mnt/cgroup2/ -t sendmsg \
+       -r 5 -i 1 -l 1 --txmsg_redir --txmsg_cork 5 --txmsg_apply 3
+./sockmap --cgroup /mnt/cgroup2/ -t sendpage \
+       -r 5 -i 1 -l 1 --txmsg_redir --txmsg_cork 5 --txmsg_apply 5
+./sockmap --cgroup /mnt/cgroup2/ -t sendmsg \
+       -r 5 -i 1 -l 1 --txmsg_redir --txmsg_cork 5 --txmsg_apply 5
index 95a54a89a5322a44ec17d727f78c8ec795c8631c..07aa237221d12c46bf3f07e5cb17cb7aebe9a4c3 100644 (file)
@@ -29,6 +29,7 @@
 #include <sys/time.h>
 #include <sys/resource.h>
 #include <sys/types.h>
+#include <sys/sendfile.h>
 
 #include <linux/netlink.h>
 #include <linux/socket.h>
@@ -54,6 +55,16 @@ void running_handler(int a);
 /* global sockets */
 int s1, s2, c1, c2, p1, p2;
 
+/* Command-line selected sk_msg test settings. The first five are set to 1
+ * directly by getopt_long() through the flag pointers in long_options.
+ * NOTE(review): the last four presumably receive the values of the
+ * --txmsg_apply/--txmsg_cork/--txmsg_start/--txmsg_end options; confirm
+ * in the option parsing code.
+ */
+int txmsg_pass;
+int txmsg_noisy;
+int txmsg_redir;
+int txmsg_redir_noisy;
+int txmsg_drop;
+int txmsg_apply;
+int txmsg_cork;
+int txmsg_start;
+int txmsg_end;
+
 static const struct option long_options[] = {
        {"help",        no_argument,            NULL, 'h' },
        {"cgroup",      required_argument,      NULL, 'c' },
@@ -62,6 +73,16 @@ static const struct option long_options[] = {
        {"iov_count",   required_argument,      NULL, 'i' },
        {"length",      required_argument,      NULL, 'l' },
        {"test",        required_argument,      NULL, 't' },
+       {"data_test",   no_argument,            NULL, 'd' },
+       {"txmsg",               no_argument,    &txmsg_pass,  1  },
+       {"txmsg_noisy",         no_argument,    &txmsg_noisy, 1  },
+       {"txmsg_redir",         no_argument,    &txmsg_redir, 1  },
+       {"txmsg_redir_noisy",   no_argument,    &txmsg_redir_noisy, 1},
+       {"txmsg_drop",          no_argument,    &txmsg_drop, 1 },
+       {"txmsg_apply", required_argument,      NULL, 'a'},
+       {"txmsg_cork",  required_argument,      NULL, 'k'},
+       {"txmsg_start", required_argument,      NULL, 's'},
+       {"txmsg_end",   required_argument,      NULL, 'e'},
        {0, 0, NULL, 0 }
 };
 
@@ -195,19 +216,71 @@ struct msg_stats {
        struct timespec end;
 };
 
+struct sockmap_options { /* per-run settings filled in by main() and handed to the test helpers */
+       int verbose;            /* set by -v; forever_ping_pong prints progress when non-zero */
+       bool base;              /* sendmsg_test receives on p1 when true, on p2 otherwise */
+       bool sendpage;          /* tx child uses msg_loop_sendpage; rx child forces iov_count to 1 */
+       bool data_test;         /* set by -d; tx fills buffers with a byte pattern, rx verifies it */
+       bool drop_expected;     /* set when txmsg_drop is given; sends are expected to fail */
+};
+
+/*
+ * Transmit side of the sendpage tests.
+ *
+ * Writes iov_length * cnt bytes of an incrementing (wrapping) byte pattern
+ * to the scratch file ".sendpage_tst.tmp", reopens it read-only and pushes
+ * it into @fd with @cnt sendfile() calls of @iov_length bytes each.  The
+ * start/end timestamps and the number of bytes sent are recorded in @s.
+ *
+ * When opt->drop_expected is set every sendfile() call is expected to
+ * fail; a call that succeeds is reported and turned into -EIO.
+ *
+ * Returns 0 on success and a negative value on failure, including any
+ * failure to create, fill or reopen the scratch file.
+ */
+static int msg_loop_sendpage(int fd, int iov_length, int cnt,
+                            struct msg_stats *s,
+                            struct sockmap_options *opt)
+{
+       bool drop = opt->drop_expected;
+       unsigned char k = 0;
+       FILE *file;
+       int i, fp;
+
+       file = fopen(".sendpage_tst.tmp", "w+");
+       if (!file) {
+               perror("create file for sendpage");
+               return -EIO;
+       }
+       for (i = 0; i < iov_length * cnt; i++, k++) {
+               if (fwrite(&k, sizeof(char), 1, file) != 1) {
+                       perror("write file for sendpage");
+                       fclose(file);
+                       return -EIO;
+               }
+       }
+       /* fclose() flushes the buffered pattern; a failure here means the
+        * scratch file may be incomplete, so do not go on to send it.
+        */
+       if (fclose(file)) {
+               perror("close file for sendpage");
+               return -EIO;
+       }
+
+       fp = open(".sendpage_tst.tmp", O_RDONLY);
+       if (fp < 0) {
+               perror("open file for sendpage");
+               return -EIO;
+       }
+
+       clock_gettime(CLOCK_MONOTONIC, &s->start);
+       for (i = 0; i < cnt; i++) {
+               /* NULL offset: sendfile() reads from, and advances, the
+                * file offset of fp, so each call sends the next chunk.
+                */
+               int sent = sendfile(fd, fp, NULL, iov_length);
+
+               if (!drop && sent < 0) {
+                       perror("send loop error:");
+                       close(fp);
+                       return sent;
+               } else if (drop && sent >= 0) {
+                       printf("sendpage loop error expected: %i\n", sent);
+                       close(fp);
+                       return -EIO;
+               }
+
+               if (sent > 0)
+                       s->bytes_sent += sent;
+       }
+       clock_gettime(CLOCK_MONOTONIC, &s->end);
+       close(fp);
+       return 0;
+}
+
 static int msg_loop(int fd, int iov_count, int iov_length, int cnt,
-                   struct msg_stats *s, bool tx)
+                   struct msg_stats *s, bool tx,
+                   struct sockmap_options *opt)
 {
        struct msghdr msg = {0};
        int err, i, flags = MSG_NOSIGNAL;
        struct iovec *iov;
+       unsigned char k;
+       bool data_test = opt->data_test;
+       bool drop = opt->drop_expected;
 
        iov = calloc(iov_count, sizeof(struct iovec));
        if (!iov)
                return errno;
 
+       k = 0;
        for (i = 0; i < iov_count; i++) {
-               char *d = calloc(iov_length, sizeof(char));
+               unsigned char *d = calloc(iov_length, sizeof(char));
 
                if (!d) {
                        fprintf(stderr, "iov_count %i/%i OOM\n", i, iov_count);
@@ -215,21 +288,34 @@ static int msg_loop(int fd, int iov_count, int iov_length, int cnt,
                }
                iov[i].iov_base = d;
                iov[i].iov_len = iov_length;
+
+               if (data_test && tx) {
+                       int j;
+
+                       for (j = 0; j < iov_length; j++)
+                               d[j] = k++;
+               }
        }
 
        msg.msg_iov = iov;
        msg.msg_iovlen = iov_count;
+       k = 0;
 
        if (tx) {
                clock_gettime(CLOCK_MONOTONIC, &s->start);
                for (i = 0; i < cnt; i++) {
                        int sent = sendmsg(fd, &msg, flags);
 
-                       if (sent < 0) {
+                       if (!drop && sent < 0) {
                                perror("send loop error:");
                                goto out_errno;
+                       } else if (drop && sent >= 0) {
+                               printf("send loop error expected: %i\n", sent);
+                               errno = -EIO;
+                               goto out_errno;
                        }
-                       s->bytes_sent += sent;
+                       if (sent > 0)
+                               s->bytes_sent += sent;
                }
                clock_gettime(CLOCK_MONOTONIC, &s->end);
        } else {
@@ -272,6 +358,26 @@ static int msg_loop(int fd, int iov_count, int iov_length, int cnt,
                        }
 
                        s->bytes_recvd += recv;
+
+                       if (data_test) {
+                               int j;
+
+                               for (i = 0; i < msg.msg_iovlen; i++) {
+                                       unsigned char *d = iov[i].iov_base;
+
+                                       for (j = 0;
+                                            j < iov[i].iov_len && recv; j++) {
+                                               if (d[j] != k++) {
+                                                       errno = -EIO;
+                                                       fprintf(stderr,
+                                                               "detected data corruption @iov[%i]:%i %02x != %02x, %02x ?= %02x\n",
+                                                               i, j, d[j], k - 1, d[j+1], k + 1);
+                                                       goto out_errno;
+                                               }
+                                               recv--;
+                                       }
+                               }
+                       }
                }
                clock_gettime(CLOCK_MONOTONIC, &s->end);
        }
@@ -300,7 +406,7 @@ static inline float recvdBps(struct msg_stats s)
 }
 
 static int sendmsg_test(int iov_count, int iov_buf, int cnt,
-                       int verbose, bool base)
+                       struct sockmap_options *opt)
 {
        float sent_Bps = 0, recvd_Bps = 0;
        int rx_fd, txpid, rxpid, err = 0;
@@ -309,14 +415,20 @@ static int sendmsg_test(int iov_count, int iov_buf, int cnt,
 
        errno = 0;
 
-       if (base)
+       if (opt->base)
                rx_fd = p1;
        else
                rx_fd = p2;
 
        rxpid = fork();
        if (rxpid == 0) {
-               err = msg_loop(rx_fd, iov_count, iov_buf, cnt, &s, false);
+               if (opt->drop_expected)
+                       exit(1);
+
+               if (opt->sendpage)
+                       iov_count = 1;
+               err = msg_loop(rx_fd, iov_count, iov_buf,
+                              cnt, &s, false, opt);
                if (err)
                        fprintf(stderr,
                                "msg_loop_rx: iov_count %i iov_buf %i cnt %i err %i\n",
@@ -339,7 +451,12 @@ static int sendmsg_test(int iov_count, int iov_buf, int cnt,
 
        txpid = fork();
        if (txpid == 0) {
-               err = msg_loop(c1, iov_count, iov_buf, cnt, &s, true);
+               if (opt->sendpage)
+                       err = msg_loop_sendpage(c1, iov_buf, cnt, &s, opt);
+               else
+                       err = msg_loop(c1, iov_count, iov_buf,
+                                      cnt, &s, true, opt);
+
                if (err)
                        fprintf(stderr,
                                "msg_loop_tx: iov_count %i iov_buf %i cnt %i err %i\n",
@@ -364,7 +481,7 @@ static int sendmsg_test(int iov_count, int iov_buf, int cnt,
        return err;
 }
 
-static int forever_ping_pong(int rate, int verbose)
+static int forever_ping_pong(int rate, struct sockmap_options *opt)
 {
        struct timeval timeout;
        char buf[1024] = {0};
@@ -429,7 +546,7 @@ static int forever_ping_pong(int rate, int verbose)
                if (rate)
                        sleep(rate);
 
-               if (verbose) {
+               if (opt->verbose) {
                        printf(".");
                        fflush(stdout);
 
@@ -443,20 +560,34 @@ enum {
        PING_PONG,
        SENDMSG,
        BASE,
+       BASE_SENDPAGE,
+       SENDPAGE,
 };
 
 int main(int argc, char **argv)
 {
-       int iov_count = 1, length = 1024, rate = 1, verbose = 0;
+       int iov_count = 1, length = 1024, rate = 1, tx_prog_fd;
        struct rlimit r = {10 * 1024 * 1024, RLIM_INFINITY};
        int opt, longindex, err, cg_fd = 0;
+       struct sockmap_options options = {0};
        int test = PING_PONG;
        char filename[256];
 
-       while ((opt = getopt_long(argc, argv, "hvc:r:i:l:t:",
+       while ((opt = getopt_long(argc, argv, ":dhvc:r:i:l:t:",
                                  long_options, &longindex)) != -1) {
                switch (opt) {
-               /* Cgroup configuration */
+               case 's':
+                       txmsg_start = atoi(optarg);
+                       break;
+               case 'e':
+                       txmsg_end = atoi(optarg);
+                       break;
+               case 'a':
+                       txmsg_apply = atoi(optarg);
+                       break;
+               case 'k':
+                       txmsg_cork = atoi(optarg);
+                       break;
                case 'c':
                        cg_fd = open(optarg, O_DIRECTORY, O_RDONLY);
                        if (cg_fd < 0) {
@@ -470,7 +601,7 @@ int main(int argc, char **argv)
                        rate = atoi(optarg);
                        break;
                case 'v':
-                       verbose = 1;
+                       options.verbose = 1;
                        break;
                case 'i':
                        iov_count = atoi(optarg);
@@ -478,6 +609,9 @@ int main(int argc, char **argv)
                case 'l':
                        length = atoi(optarg);
                        break;
+               case 'd':
+                       options.data_test = true;
+                       break;
                case 't':
                        if (strcmp(optarg, "ping") == 0) {
                                test = PING_PONG;
@@ -485,11 +619,17 @@ int main(int argc, char **argv)
                                test = SENDMSG;
                        } else if (strcmp(optarg, "base") == 0) {
                                test = BASE;
+                       } else if (strcmp(optarg, "base_sendpage") == 0) {
+                               test = BASE_SENDPAGE;
+                       } else if (strcmp(optarg, "sendpage") == 0) {
+                               test = SENDPAGE;
                        } else {
                                usage(argv);
                                return -1;
                        }
                        break;
+               case 0:
+                       break;
                case 'h':
                default:
                        usage(argv);
@@ -515,16 +655,16 @@ int main(int argc, char **argv)
        /* catch SIGINT */
        signal(SIGINT, running_handler);
 
-       /* If base test skip BPF setup */
-       if (test == BASE)
-               goto run;
-
        if (load_bpf_file(filename)) {
                fprintf(stderr, "load_bpf_file: (%s) %s\n",
                        filename, strerror(errno));
                return 1;
        }
 
+       /* If base test skip BPF setup */
+       if (test == BASE || test == BASE_SENDPAGE)
+               goto run;
+
        /* Attach programs to sockmap */
        err = bpf_prog_attach(prog_fd[0], map_fd[0],
                                BPF_SK_SKB_STREAM_PARSER, 0);
@@ -557,13 +697,126 @@ run:
                goto out;
        }
 
-       if (test == PING_PONG)
-               err = forever_ping_pong(rate, verbose);
-       else if (test == SENDMSG)
-               err = sendmsg_test(iov_count, length, rate, verbose, false);
-       else if (test == BASE)
-               err = sendmsg_test(iov_count, length, rate, verbose, true);
+       /* Attach txmsg program to sockmap */
+       if (txmsg_pass)
+               tx_prog_fd = prog_fd[3];
+       else if (txmsg_noisy)
+               tx_prog_fd = prog_fd[4];
+       else if (txmsg_redir)
+               tx_prog_fd = prog_fd[5];
+       else if (txmsg_redir_noisy)
+               tx_prog_fd = prog_fd[6];
+       else if (txmsg_drop)
+               tx_prog_fd = prog_fd[9];
+       /* apply and cork must be last */
+       else if (txmsg_apply)
+               tx_prog_fd = prog_fd[7];
+       else if (txmsg_cork)
+               tx_prog_fd = prog_fd[8];
        else
+               tx_prog_fd = 0;
+
+       if (tx_prog_fd) {
+               int redir_fd, i = 0;
+
+               err = bpf_prog_attach(tx_prog_fd,
+                                     map_fd[1], BPF_SK_MSG_VERDICT, 0);
+               if (err) {
+                       fprintf(stderr,
+                               "ERROR: bpf_prog_attach (txmsg): %d (%s)\n",
+                               err, strerror(errno));
+                       return err;
+               }
+
+               err = bpf_map_update_elem(map_fd[1], &i, &c1, BPF_ANY);
+               if (err) {
+                       fprintf(stderr,
+                               "ERROR: bpf_map_update_elem (txmsg):  %d (%s\n",
+                               err, strerror(errno));
+                       return err;
+               }
+
+               if (txmsg_redir || txmsg_redir_noisy)
+                       redir_fd = c2;
+               else
+                       redir_fd = c1;
+
+               err = bpf_map_update_elem(map_fd[2], &i, &redir_fd, BPF_ANY);
+               if (err) {
+                       fprintf(stderr,
+                               "ERROR: bpf_map_update_elem (txmsg):  %d (%s\n",
+                               err, strerror(errno));
+                       return err;
+               }
+
+               if (txmsg_apply) {
+                       err = bpf_map_update_elem(map_fd[3],
+                                                 &i, &txmsg_apply, BPF_ANY);
+                       if (err) {
+                               fprintf(stderr,
+                                       "ERROR: bpf_map_update_elem (apply_bytes):  %d (%s\n",
+                                       err, strerror(errno));
+                               return err;
+                       }
+               }
+
+               if (txmsg_cork) {
+                       err = bpf_map_update_elem(map_fd[4],
+                                                 &i, &txmsg_cork, BPF_ANY);
+                       if (err) {
+                               fprintf(stderr,
+                                       "ERROR: bpf_map_update_elem (cork_bytes):  %d (%s\n",
+                                       err, strerror(errno));
+                               return err;
+                       }
+               }
+
+               if (txmsg_start) {
+                       err = bpf_map_update_elem(map_fd[5],
+                                                 &i, &txmsg_start, BPF_ANY);
+                       if (err) {
+                               fprintf(stderr,
+                                       "ERROR: bpf_map_update_elem (txmsg_start):  %d (%s)\n",
+                                       err, strerror(errno));
+                               return err;
+                       }
+               }
+
+               if (txmsg_end) {
+                       i = 1;
+                       err = bpf_map_update_elem(map_fd[5],
+                                                 &i, &txmsg_end, BPF_ANY);
+                       if (err) {
+                               fprintf(stderr,
+                                       "ERROR: bpf_map_update_elem (txmsg_end):  %d (%s)\n",
+                                       err, strerror(errno));
+                               return err;
+                       }
+               }
+       }
+
+       if (txmsg_drop)
+               options.drop_expected = true;
+
+       if (test == PING_PONG)
+               err = forever_ping_pong(rate, &options);
+       else if (test == SENDMSG) {
+               options.base = false;
+               options.sendpage = false;
+               err = sendmsg_test(iov_count, length, rate, &options);
+       } else if (test == SENDPAGE) {
+               options.base = false;
+               options.sendpage = true;
+               err = sendmsg_test(iov_count, length, rate, &options);
+       } else if (test == BASE) {
+               options.base = true;
+               options.sendpage = false;
+               err = sendmsg_test(iov_count, length, rate, &options);
+       } else if (test == BASE_SENDPAGE) {
+               options.base = true;
+               options.sendpage = true;
+               err = sendmsg_test(iov_count, length, rate, &options);
+       } else
                fprintf(stderr, "unknown test\n");
 out:
        bpf_prog_detach2(prog_fd[2], cg_fd, BPF_CGROUP_SOCK_OPS);
index c8ec0ae16bf03af4c40f08b06dcea2b7e24e4b9e..1ea545965ee36c3a8dca16fd0e34f19aa4894e60 100644 (file)
@@ -1,19 +1,28 @@
 # SPDX-License-Identifier: GPL-2.0
-prefix = /usr
+include ../scripts/Makefile.include
+
+prefix ?= /usr/local
 
 CC = gcc
 LEX = flex
 YACC = bison
 MAKE = make
+INSTALL ?= install
 
 CFLAGS += -Wall -O2
-CFLAGS += -D__EXPORTED_HEADERS__ -I../../include/uapi -I../../include
+CFLAGS += -D__EXPORTED_HEADERS__ -I$(srctree)/include/uapi -I$(srctree)/include
 
 ifeq ($(srctree),)
 srctree := $(patsubst %/,%,$(dir $(CURDIR)))
 srctree := $(patsubst %/,%,$(dir $(srctree)))
 endif
 
+ifeq ($(V),1)
+  Q =
+else
+  Q = @
+endif
+
 FEATURE_USER = .bpf
 FEATURE_TESTS = libbfd disassembler-four-args
 FEATURE_DISPLAY = libbfd disassembler-four-args
@@ -38,40 +47,59 @@ ifeq ($(feature-disassembler-four-args), 1)
 CFLAGS += -DDISASM_FOUR_ARGS_SIGNATURE
 endif
 
-%.yacc.c: %.y
-       $(YACC) -o $@ -d $<
+$(OUTPUT)%.yacc.c: $(srctree)/tools/bpf/%.y
+       $(QUIET_BISON)$(YACC) -o $@ -d $<
 
-%.lex.c: %.l
-       $(LEX) -o $@ $<
+$(OUTPUT)%.lex.c: $(srctree)/tools/bpf/%.l
+       $(QUIET_FLEX)$(LEX) -o $@ $<
 
-all: bpf_jit_disasm bpf_dbg bpf_asm bpftool
+$(OUTPUT)%.o: $(srctree)/tools/bpf/%.c
+       $(QUIET_CC)$(COMPILE.c) -o $@ $<
 
-bpf_jit_disasm : CFLAGS += -DPACKAGE='bpf_jit_disasm'
-bpf_jit_disasm : LDLIBS = -lopcodes -lbfd -ldl
-bpf_jit_disasm : bpf_jit_disasm.o
+$(OUTPUT)%.yacc.o: $(OUTPUT)%.yacc.c
+       $(QUIET_CC)$(COMPILE.c) -o $@ $<
+$(OUTPUT)%.lex.o: $(OUTPUT)%.lex.c
+       $(QUIET_CC)$(COMPILE.c) -o $@ $<
 
-bpf_dbg : LDLIBS = -lreadline
-bpf_dbg : bpf_dbg.o
+PROGS = $(OUTPUT)bpf_jit_disasm $(OUTPUT)bpf_dbg $(OUTPUT)bpf_asm
 
-bpf_asm : LDLIBS =
-bpf_asm : bpf_asm.o bpf_exp.yacc.o bpf_exp.lex.o
-bpf_exp.lex.o : bpf_exp.yacc.c
+all: $(PROGS) bpftool
 
-clean: bpftool_clean
-       rm -rf *.o bpf_jit_disasm bpf_dbg bpf_asm bpf_exp.yacc.* bpf_exp.lex.*
+$(OUTPUT)bpf_jit_disasm: CFLAGS += -DPACKAGE='bpf_jit_disasm'
+$(OUTPUT)bpf_jit_disasm: $(OUTPUT)bpf_jit_disasm.o
+       $(QUIET_LINK)$(CC) $(CFLAGS) -o $@ $^ -lopcodes -lbfd -ldl
 
-install: bpftool_install
-       install bpf_jit_disasm $(prefix)/bin/bpf_jit_disasm
-       install bpf_dbg $(prefix)/bin/bpf_dbg
-       install bpf_asm $(prefix)/bin/bpf_asm
+$(OUTPUT)bpf_dbg: $(OUTPUT)bpf_dbg.o
+       $(QUIET_LINK)$(CC) $(CFLAGS) -o $@ $^ -lreadline
+
+$(OUTPUT)bpf_asm: $(OUTPUT)bpf_asm.o $(OUTPUT)bpf_exp.yacc.o $(OUTPUT)bpf_exp.lex.o
+       $(QUIET_LINK)$(CC) $(CFLAGS) -o $@ $^
+
+$(OUTPUT)bpf_exp.lex.c: $(OUTPUT)bpf_exp.yacc.c
+
+clean: bpftool_clean
+       $(call QUIET_CLEAN, bpf-progs)
+       $(Q)rm -rf $(OUTPUT)*.o $(OUTPUT)bpf_jit_disasm $(OUTPUT)bpf_dbg \
+              $(OUTPUT)bpf_asm $(OUTPUT)bpf_exp.yacc.* $(OUTPUT)bpf_exp.lex.*
+       $(call QUIET_CLEAN, core-gen)
+       $(Q)rm -f $(OUTPUT)FEATURE-DUMP.bpf
+
+install: $(PROGS) bpftool_install
+       $(call QUIET_INSTALL, bpf_jit_disasm)
+       $(Q)$(INSTALL) -m 0755 -d $(DESTDIR)$(prefix)/bin
+       $(Q)$(INSTALL) $(OUTPUT)bpf_jit_disasm $(DESTDIR)$(prefix)/bin/bpf_jit_disasm
+       $(call QUIET_INSTALL, bpf_dbg)
+       $(Q)$(INSTALL) $(OUTPUT)bpf_dbg $(DESTDIR)$(prefix)/bin/bpf_dbg
+       $(call QUIET_INSTALL, bpf_asm)
+       $(Q)$(INSTALL) $(OUTPUT)bpf_asm $(DESTDIR)$(prefix)/bin/bpf_asm
 
 bpftool:
-       $(MAKE) -C bpftool
+       $(call descend,bpftool)
 
 bpftool_install:
-       $(MAKE) -C bpftool install
+       $(call descend,bpftool,install)
 
 bpftool_clean:
-       $(MAKE) -C bpftool clean
+       $(call descend,bpftool,clean)
 
-.PHONY: bpftool FORCE
+.PHONY: all install clean bpftool bpftool_install bpftool_clean
index 26901ec87361ed1b999d863e7e4c8ed6b7380123..4e69782c4a793f0860e6532aac5b6aaaecc873e0 100644 (file)
@@ -38,7 +38,7 @@ bash_compdir ?= /usr/share/bash-completion/completions
 CC = gcc
 
 CFLAGS += -O2
-CFLAGS += -W -Wall -Wextra -Wno-unused-parameter -Wshadow
+CFLAGS += -W -Wall -Wextra -Wno-unused-parameter -Wshadow -Wno-missing-field-initializers
 CFLAGS += -DPACKAGE='"bpftool"' -D__EXPORTED_HEADERS__ -I$(srctree)/tools/include/uapi -I$(srctree)/tools/include -I$(srctree)/tools/lib/bpf -I$(srctree)/kernel/bpf/
 CFLAGS += -DBPFTOOL_VERSION='"$(BPFTOOL_VERSION)"'
 LIBS = -lelf -lbfd -lopcodes $(LIBBPF)
@@ -70,7 +70,7 @@ ifeq ($(feature-disassembler-four-args), 1)
 CFLAGS += -DDISASM_FOUR_ARGS_SIGNATURE
 endif
 
-include $(wildcard *.d)
+include $(wildcard $(OUTPUT)*.d)
 
 all: $(OUTPUT)bpftool
 
@@ -89,6 +89,8 @@ $(OUTPUT)%.o: %.c
 clean: $(LIBBPF)-clean
        $(call QUIET_CLEAN, bpftool)
        $(Q)$(RM) $(OUTPUT)bpftool $(OUTPUT)*.o $(OUTPUT)*.d
+       $(call QUIET_CLEAN, core-gen)
+       $(Q)$(RM) $(OUTPUT)FEATURE-DUMP.bpftool
 
 install: $(OUTPUT)bpftool
        $(call QUIET_INSTALL, bpftool)
index 51c935d38ae2bdfa1bb0bd325e38bfba8ffe1d70..b34affa7ef2de40efa5e00388b0e806ff66c47b6 100644 (file)
@@ -49,7 +49,7 @@ struct dump_data {
        unsigned long address_call_base;
        struct kernel_sym *sym_mapping;
        __u32 sym_count;
-       char scratch_buff[SYM_MAX_NAME];
+       char scratch_buff[SYM_MAX_NAME + 8];
 };
 
 void kernel_syms_load(struct dump_data *dd);
index db6bdc3751268351da3126f57566639fce355b12..d245c41213ac20850a1b9b58f1925b827899ff04 100644 (file)
@@ -133,6 +133,7 @@ enum bpf_prog_type {
        BPF_PROG_TYPE_SOCK_OPS,
        BPF_PROG_TYPE_SK_SKB,
        BPF_PROG_TYPE_CGROUP_DEVICE,
+       BPF_PROG_TYPE_SK_MSG,
 };
 
 enum bpf_attach_type {
@@ -143,6 +144,7 @@ enum bpf_attach_type {
        BPF_SK_SKB_STREAM_PARSER,
        BPF_SK_SKB_STREAM_VERDICT,
        BPF_CGROUP_DEVICE,
+       BPF_SK_MSG_VERDICT,
        __MAX_BPF_ATTACH_TYPE
 };
 
@@ -231,6 +233,28 @@ enum bpf_attach_type {
 #define BPF_F_RDONLY           (1U << 3)
 #define BPF_F_WRONLY           (1U << 4)
 
+/* Flag for stack_map, store build_id+offset instead of pointer */
+#define BPF_F_STACK_BUILD_ID   (1U << 5)
+
+enum bpf_stack_build_id_status {
+       /* user space need an empty entry to identify end of a trace */
+       BPF_STACK_BUILD_ID_EMPTY = 0,
+       /* with valid build_id and offset */
+       BPF_STACK_BUILD_ID_VALID = 1,
+       /* couldn't get build_id, fallback to ip */
+       BPF_STACK_BUILD_ID_IP = 2,
+};
+
+#define BPF_BUILD_ID_SIZE 20
+struct bpf_stack_build_id { /* one stack-trace entry when BPF_F_STACK_BUILD_ID is used */
+       __s32           status; /* presumably one of enum bpf_stack_build_id_status - confirm */
+       unsigned char   build_id[BPF_BUILD_ID_SIZE]; /* BPF_BUILD_ID_SIZE raw build_id bytes */
+       union {
+               __u64   offset; /* paired with build_id for BPF_STACK_BUILD_ID_VALID */
+               __u64   ip;     /* fallback for BPF_STACK_BUILD_ID_IP */
+       };
+};
+
 union bpf_attr {
        struct { /* anonymous struct used by BPF_MAP_CREATE command */
                __u32   map_type;       /* one of enum bpf_map_type */
@@ -696,6 +720,15 @@ union bpf_attr {
  * int bpf_override_return(pt_regs, rc)
  *     @pt_regs: pointer to struct pt_regs
  *     @rc: the return value to set
+ *
+ * int bpf_msg_redirect_map(map, key, flags)
+ *     Redirect msg to a sock in map using key as a lookup key for the
+ *     sock in map.
+ *     @map: pointer to sockmap
+ *     @key: key to lookup sock in map
+ *     @flags: reserved for future use
+ *     Return: SK_PASS
+ *
  */
 #define __BPF_FUNC_MAPPER(FN)          \
        FN(unspec),                     \
@@ -757,7 +790,11 @@ union bpf_attr {
        FN(perf_prog_read_value),       \
        FN(getsockopt),                 \
        FN(override_return),            \
-       FN(sock_ops_cb_flags_set),
+       FN(sock_ops_cb_flags_set),      \
+       FN(msg_redirect_map),           \
+       FN(msg_apply_bytes),            \
+       FN(msg_cork_bytes),             \
+       FN(msg_pull_data),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
@@ -919,6 +956,14 @@ enum sk_action {
        SK_PASS,
 };
 
+/* user accessible metadata for SK_MSG packet hook, new fields must
+ * be added to the end of this structure
+ */
+struct sk_msg_md {
+       void *data;
+       void *data_end;
+};
+
 #define BPF_TAG_SIZE   8
 
 struct bpf_prog_info {
index 5bbbf285af74a0afb01ae4dcaba9a7cdbb3a3e93..64a8fc38418610b8f80a0476e75a8024bc2167fa 100644 (file)
@@ -1857,6 +1857,7 @@ static const struct {
        BPF_PROG_SEC("lwt_xmit",        BPF_PROG_TYPE_LWT_XMIT),
        BPF_PROG_SEC("sockops",         BPF_PROG_TYPE_SOCK_OPS),
        BPF_PROG_SEC("sk_skb",          BPF_PROG_TYPE_SK_SKB),
+       BPF_PROG_SEC("sk_msg",          BPF_PROG_TYPE_SK_MSG),
 };
 #undef BPF_PROG_SEC
 
index 8567a858b789340598f7aa3f7a8ce553b5bdea09..f35fb02bdf568a91432dc3f45fccc9cadc2fa1b7 100644 (file)
@@ -13,6 +13,14 @@ endif
 CFLAGS += -Wall -O2 -I$(APIDIR) -I$(LIBDIR) -I$(GENDIR) $(GENFLAGS) -I../../../include
 LDLIBS += -lcap -lelf -lrt -lpthread
 
+TEST_CUSTOM_PROGS = $(OUTPUT)/urandom_read
+all: $(TEST_CUSTOM_PROGS)
+
+$(TEST_CUSTOM_PROGS): urandom_read
+
+urandom_read: urandom_read.c
+       $(CC) -o $(TEST_CUSTOM_PROGS) -static $<
+
 # Order correspond to 'make run_tests' order
 TEST_GEN_PROGS = test_verifier test_tag test_maps test_lru_map test_lpm_map test_progs \
        test_align test_verifier_log test_dev_cgroup test_tcpbpf_user
@@ -21,7 +29,8 @@ TEST_GEN_FILES = test_pkt_access.o test_xdp.o test_l4lb.o test_tcp_estats.o test
        test_pkt_md_access.o test_xdp_redirect.o test_xdp_meta.o sockmap_parse_prog.o     \
        sockmap_verdict_prog.o dev_cgroup.o sample_ret0.o test_tracepoint.o \
        test_l4lb_noinline.o test_xdp_noinline.o test_stacktrace_map.o \
-       sample_map_ret0.o test_tcpbpf_kern.o
+       sample_map_ret0.o test_tcpbpf_kern.o test_stacktrace_build_id.o \
+       sockmap_tcp_msg_prog.o
 
 # Order correspond to 'make run_tests' order
 TEST_PROGS := test_kmod.sh \
@@ -74,3 +83,5 @@ $(OUTPUT)/%.o: %.c
        $(CLANG) $(CLANG_FLAGS) \
                 -O2 -target bpf -emit-llvm -c $< -o - |      \
        $(LLC) -march=bpf -mcpu=$(CPU) -filetype=obj -o $@
+
+EXTRA_CLEAN := $(TEST_CUSTOM_PROGS)
index dde2c11d7771078071ccca4c1e9f2a85014e0443..7cae376d8d0c76c556dee7e6ce3c2a2de448a16e 100644 (file)
@@ -86,6 +86,14 @@ static int (*bpf_perf_prog_read_value)(void *ctx, void *buf,
        (void *) BPF_FUNC_perf_prog_read_value;
 static int (*bpf_override_return)(void *ctx, unsigned long rc) =
        (void *) BPF_FUNC_override_return;
+static int (*bpf_msg_redirect_map)(void *ctx, void *map, int key, int flags) =
+       (void *) BPF_FUNC_msg_redirect_map;
+static int (*bpf_msg_apply_bytes)(void *ctx, int len) =
+       (void *) BPF_FUNC_msg_apply_bytes;
+static int (*bpf_msg_cork_bytes)(void *ctx, int len) =
+       (void *) BPF_FUNC_msg_cork_bytes;
+static int (*bpf_msg_pull_data)(void *ctx, int start, int end, int flags) =
+       (void *) BPF_FUNC_msg_pull_data;
 
 /* llvm builtin functions that eBPF C program may use to
  * emit BPF_LD_ABS and BPF_LD_IND instructions
@@ -123,6 +131,8 @@ static int (*bpf_skb_under_cgroup)(void *ctx, void *map, int index) =
        (void *) BPF_FUNC_skb_under_cgroup;
 static int (*bpf_skb_change_head)(void *, int len, int flags) =
        (void *) BPF_FUNC_skb_change_head;
+static int (*bpf_skb_pull_data)(void *, int len) =
+       (void *) BPF_FUNC_skb_pull_data;
 
 /* Scan the ARCH passed in from ARCH env variable (see Makefile) */
 #if defined(__TARGET_ARCH_x86)
index a1dec2b6d9c5b28618996fe6e8312df814d2b4cf..0f92858f6226a37feb921eeea494902a2520d8bc 100644 (file)
@@ -20,14 +20,25 @@ int bpf_prog1(struct __sk_buff *skb)
        __u32 lport = skb->local_port;
        __u32 rport = skb->remote_port;
        __u8 *d = data;
+       __u32 len = (__u32) data_end - (__u32) data;
+       int err;
 
-       if (data + 10 > data_end)
-               return skb->len;
+       if (data + 10 > data_end) {
+               err = bpf_skb_pull_data(skb, 10);
+               if (err)
+                       return SK_DROP;
+
+               data_end = (void *)(long)skb->data_end;
+               data = (void *)(long)skb->data;
+               if (data + 10 > data_end)
+                       return SK_DROP;
+       }
 
        /* This write/read is a bit pointless but tests the verifier and
         * strparser handler for read/write pkt data and access into sk
         * fields.
         */
+       d = data;
        d[7] = 1;
        return skb->len;
 }
diff --git a/tools/testing/selftests/bpf/sockmap_tcp_msg_prog.c b/tools/testing/selftests/bpf/sockmap_tcp_msg_prog.c
new file mode 100644 (file)
index 0000000..12a7b5c
--- /dev/null
@@ -0,0 +1,33 @@
+#include <linux/bpf.h>
+#include "bpf_helpers.h"
+#include "bpf_util.h"
+#include "bpf_endian.h"
+
+int _version SEC("version") = 1;
+
+#define bpf_printk(fmt, ...)   /* bpf_trace_printk() wrapper taking a literal format */ \
+({                                                             \
+              char ____fmt[] = fmt; /* copy the format into a local array */ \
+              bpf_trace_printk(____fmt, sizeof(____fmt),       \
+                               ##__VA_ARGS__);                 \
+})
+
+SEC("sk_msg1") /* msg program: drop when under 8 bytes are visible, else trace and pass */
+int bpf_prog1(struct sk_msg_md *msg)
+{
+       void *data_end = (void *)(long) msg->data_end;
+       void *data = (void *)(long) msg->data;
+
+       char *d;
+
+       if (data + 8 > data_end) /* fewer than 8 bytes between data and data_end */
+               return SK_DROP;
+
+       bpf_printk("data length %i\n", (__u64)msg->data_end - (__u64)msg->data);
+       d = (char *)data;
+       bpf_printk("hello sendmsg hook %i %i\n", d[0], d[1]); /* traces the first two bytes */
+
+       return SK_PASS;
+}
+
+char _license[] SEC("license") = "GPL";
index d7bea972cb21bf09c84b38938df9951a0eedf5fc..2ce7634a4012a19c6b35c3bdf35d4518c5ae5c46 100644 (file)
@@ -26,6 +26,13 @@ struct bpf_map_def SEC("maps") sock_map_tx = {
        .max_entries = 20,
 };
 
+struct bpf_map_def SEC("maps") sock_map_msg = { /* looked up by name "sock_map_msg" in test_sockmap() */
+       .type = BPF_MAP_TYPE_SOCKMAP,
+       .key_size = sizeof(int),        /* int-sized key */
+       .value_size = sizeof(int),      /* int-sized value */
+       .max_entries = 20,
+};
+
 struct bpf_map_def SEC("maps") sock_map_break = {
        .type = BPF_MAP_TYPE_ARRAY,
        .key_size = sizeof(int),
index 1238733c5b33bceb33f6e59b74e03ba4137d93df..6c253343a6f96e3d5eb9fbd67d0ad8a4eb7f2bf6 100644 (file)
@@ -464,15 +464,17 @@ static void test_devmap(int task, void *data)
 #include <linux/err.h>
 #define SOCKMAP_PARSE_PROG "./sockmap_parse_prog.o"
 #define SOCKMAP_VERDICT_PROG "./sockmap_verdict_prog.o"
+#define SOCKMAP_TCP_MSG_PROG "./sockmap_tcp_msg_prog.o"
 static void test_sockmap(int tasks, void *data)
 {
-       int one = 1, map_fd_rx = 0, map_fd_tx = 0, map_fd_break, s, sc, rc;
-       struct bpf_map *bpf_map_rx, *bpf_map_tx, *bpf_map_break;
+       struct bpf_map *bpf_map_rx, *bpf_map_tx, *bpf_map_msg, *bpf_map_break;
+       int map_fd_msg = 0, map_fd_rx = 0, map_fd_tx = 0, map_fd_break;
        int ports[] = {50200, 50201, 50202, 50204};
        int err, i, fd, udp, sfd[6] = {0xdeadbeef};
        u8 buf[20] = {0x0, 0x5, 0x3, 0x2, 0x1, 0x0};
-       int parse_prog, verdict_prog;
+       int parse_prog, verdict_prog, msg_prog;
        struct sockaddr_in addr;
+       int one = 1, s, sc, rc;
        struct bpf_object *obj;
        struct timeval to;
        __u32 key, value;
@@ -584,6 +586,12 @@ static void test_sockmap(int tasks, void *data)
                goto out_sockmap;
        }
 
+       err = bpf_prog_attach(-1, fd, BPF_SK_MSG_VERDICT, 0);
+       if (!err) {
+               printf("Failed invalid msg verdict prog attach\n");
+               goto out_sockmap;
+       }
+
        err = bpf_prog_attach(-1, fd, __MAX_BPF_ATTACH_TYPE, 0);
        if (!err) {
                printf("Failed unknown prog attach\n");
@@ -602,6 +610,12 @@ static void test_sockmap(int tasks, void *data)
                goto out_sockmap;
        }
 
+       err = bpf_prog_detach(fd, BPF_SK_MSG_VERDICT);
+       if (err) {
+               printf("Failed empty msg verdict prog detach\n");
+               goto out_sockmap;
+       }
+
        err = bpf_prog_detach(fd, __MAX_BPF_ATTACH_TYPE);
        if (!err) {
                printf("Detach invalid prog successful\n");
@@ -616,6 +630,13 @@ static void test_sockmap(int tasks, void *data)
                goto out_sockmap;
        }
 
+       err = bpf_prog_load(SOCKMAP_TCP_MSG_PROG,
+                           BPF_PROG_TYPE_SK_MSG, &obj, &msg_prog);
+       if (err) {
+               printf("Failed to load SK_SKB msg prog\n");
+               goto out_sockmap;
+       }
+
        err = bpf_prog_load(SOCKMAP_VERDICT_PROG,
                            BPF_PROG_TYPE_SK_SKB, &obj, &verdict_prog);
        if (err) {
@@ -631,7 +652,7 @@ static void test_sockmap(int tasks, void *data)
 
        map_fd_rx = bpf_map__fd(bpf_map_rx);
        if (map_fd_rx < 0) {
-               printf("Failed to get map fd\n");
+               printf("Failed to get map rx fd\n");
                goto out_sockmap;
        }
 
@@ -647,6 +668,18 @@ static void test_sockmap(int tasks, void *data)
                goto out_sockmap;
        }
 
+       bpf_map_msg = bpf_object__find_map_by_name(obj, "sock_map_msg");
+       if (IS_ERR(bpf_map_msg)) {
+               printf("Failed to load map msg from msg_verdict prog\n");
+               goto out_sockmap;
+       }
+
+       map_fd_msg = bpf_map__fd(bpf_map_msg);
+       if (map_fd_msg < 0) {
+               printf("Failed to get map msg fd\n");
+               goto out_sockmap;
+       }
+
        bpf_map_break = bpf_object__find_map_by_name(obj, "sock_map_break");
        if (IS_ERR(bpf_map_break)) {
                printf("Failed to load map tx from verdict prog\n");
@@ -680,6 +713,12 @@ static void test_sockmap(int tasks, void *data)
                goto out_sockmap;
        }
 
+       err = bpf_prog_attach(msg_prog, map_fd_msg, BPF_SK_MSG_VERDICT, 0);
+       if (err) {
+               printf("Failed msg verdict bpf prog attach\n");
+               goto out_sockmap;
+       }
+
        err = bpf_prog_attach(verdict_prog, map_fd_rx,
                              __MAX_BPF_ATTACH_TYPE, 0);
        if (!err) {
@@ -719,6 +758,14 @@ static void test_sockmap(int tasks, void *data)
                }
        }
 
+       /* Put sfd[2] (sending fd below) into msg map to test sendmsg bpf */
+       i = 0;
+       err = bpf_map_update_elem(map_fd_msg, &i, &sfd[2], BPF_ANY);
+       if (err) {
+               printf("Failed map_fd_msg update sockmap %i\n", err);
+               goto out_sockmap;
+       }
+
        /* Test map send/recv */
        for (i = 0; i < 2; i++) {
                buf[0] = i;
index 27ad5404389e32a8e5d41abaf01f25b70f55dd1f..e9df48b306df6a2cdde861a6571a4759bd9158f9 100644 (file)
@@ -841,7 +841,8 @@ static void test_tp_attach_query(void)
 static int compare_map_keys(int map1_fd, int map2_fd)
 {
        __u32 key, next_key;
-       char val_buf[PERF_MAX_STACK_DEPTH * sizeof(__u64)];
+       char val_buf[PERF_MAX_STACK_DEPTH *
+                    sizeof(struct bpf_stack_build_id)];
        int err;
 
        err = bpf_map_get_next_key(map1_fd, NULL, &key);
@@ -964,6 +965,166 @@ out:
        return;
 }
 
+/*
+ * Copy the first "Build ID" line that readelf prints for ./urandom_read
+ * into build_id as a NUL-terminated string.
+ *
+ * @build_id: destination buffer
+ * @size:     capacity of build_id in bytes, terminator included
+ *
+ * Returns 0 on success, or -1 when size is 0, the pipeline cannot be
+ * started, or it produces no output.  A line longer than size - 1
+ * characters is truncated.
+ */
+static int extract_build_id(char *build_id, size_t size)
+{
+       char *line = NULL;
+       size_t cap = 0;
+       ssize_t nread;
+       size_t n;
+       FILE *fp;
+
+       /* no room even for the terminator */
+       if (size == 0)
+               return -1;
+
+       fp = popen("readelf -n ./urandom_read | grep 'Build ID'", "r");
+       if (fp == NULL)
+               return -1;
+
+       nread = getline(&line, &cap, fp);
+       /* a stream opened with popen() has to be released with pclose() */
+       pclose(fp);
+       if (nread == -1) {
+               /* getline() may have allocated a buffer even on failure */
+               free(line);
+               return -1;
+       }
+
+       /*
+        * Copy the characters actually read (not the allocation size
+        * getline() reports in cap) and keep the terminator in bounds.
+        */
+       n = (size_t)nread;
+       if (n > size - 1)
+               n = size - 1;
+       memcpy(build_id, line, n);
+       build_id[n] = '\0';
+       free(line);
+       return 0;
+}
+
+/*
+ * Selftest for a stack map whose entries carry build IDs.
+ *
+ * Loads ./test_stacktrace_build_id.o as a tracepoint program, attaches it
+ * to a perf event opened on the random/urandom_read tracepoint, runs dd and
+ * ./urandom_read, and then checks that:
+ *   - stackid_hmap and stackmap hold the same set of keys, and
+ *   - at least one valid stackmap entry has a build ID that appears in the
+ *     "Build ID" line readelf prints for ./urandom_read.
+ *
+ * Results are reported through CHECK(); nothing is returned.
+ */
+static void test_stacktrace_build_id(void)
+{
+       int control_map_fd, stackid_hmap_fd, stackmap_fd;
+       const char *file = "./test_stacktrace_build_id.o";
+       int bytes, efd, err, pmu_fd, prog_fd;
+       struct perf_event_attr attr = {};
+       /* NOTE(review): duration is unused here; presumably read by CHECK() */
+       __u32 key, previous_key, val, duration = 0;
+       struct bpf_object *obj;
+       char buf[256];
+       int i, j;
+       struct bpf_stack_build_id id_offs[PERF_MAX_STACK_DEPTH];
+       int build_id_matches = 0;
+
+       err = bpf_prog_load(file, BPF_PROG_TYPE_TRACEPOINT, &obj, &prog_fd);
+       if (CHECK(err, "prog_load", "err %d errno %d\n", err, errno))
+               goto out;
+
+       /* Get the ID for the random/urandom_read tracepoint */
+       snprintf(buf, sizeof(buf),
+                "/sys/kernel/debug/tracing/events/random/urandom_read/id");
+       efd = open(buf, O_RDONLY, 0);
+       if (CHECK(efd < 0, "open", "err %d errno %d\n", efd, errno))
+               goto close_prog;
+
+       bytes = read(efd, buf, sizeof(buf));
+       close(efd);
+       if (CHECK(bytes <= 0 || bytes >= sizeof(buf),
+                 "read", "bytes %d errno %d\n", bytes, errno))
+               goto close_prog;
+       /* read() does not terminate the data; bytes < sizeof(buf) here */
+       buf[bytes] = '\0';
+
+       /* Open the perf event and attach bpf program */
+       attr.config = strtol(buf, NULL, 0);
+       attr.type = PERF_TYPE_TRACEPOINT;
+       attr.sample_type = PERF_SAMPLE_RAW | PERF_SAMPLE_CALLCHAIN;
+       attr.sample_period = 1;
+       attr.wakeup_events = 1;
+       pmu_fd = syscall(__NR_perf_event_open, &attr, -1 /* pid */,
+                        0 /* cpu 0 */, -1 /* group id */,
+                        0 /* flags */);
+       if (CHECK(pmu_fd < 0, "perf_event_open", "err %d errno %d\n",
+                 pmu_fd, errno))
+               goto close_prog;
+
+       err = ioctl(pmu_fd, PERF_EVENT_IOC_ENABLE, 0);
+       if (CHECK(err, "perf_event_ioc_enable", "err %d errno %d\n",
+                 err, errno))
+               goto close_pmu;
+
+       err = ioctl(pmu_fd, PERF_EVENT_IOC_SET_BPF, prog_fd);
+       if (CHECK(err, "perf_event_ioc_set_bpf", "err %d errno %d\n",
+                 err, errno))
+               goto disable_pmu;
+
+       /* find map fds; on failure report the fd that was returned */
+       control_map_fd = bpf_find_map(__func__, obj, "control_map");
+       if (CHECK(control_map_fd < 0, "bpf_find_map control_map",
+                 "err %d errno %d\n", control_map_fd, errno))
+               goto disable_pmu;
+
+       stackid_hmap_fd = bpf_find_map(__func__, obj, "stackid_hmap");
+       if (CHECK(stackid_hmap_fd < 0, "bpf_find_map stackid_hmap",
+                 "err %d errno %d\n", stackid_hmap_fd, errno))
+               goto disable_pmu;
+
+       stackmap_fd = bpf_find_map(__func__, obj, "stackmap");
+       if (CHECK(stackmap_fd < 0, "bpf_find_map stackmap", "err %d errno %d\n",
+                 stackmap_fd, errno))
+               goto disable_pmu;
+
+       /* generate reads of /dev/urandom, from dd and from ./urandom_read */
+       assert(system("dd if=/dev/urandom of=/dev/zero count=4 2> /dev/null")
+              == 0);
+       assert(system("./urandom_read if=/dev/urandom of=/dev/zero count=4 2> /dev/null") == 0);
+       /* disable stack trace collection */
+       key = 0;
+       val = 1;
+       bpf_map_update_elem(control_map_fd, &key, &val, 0);
+
+       /* for every element in stackid_hmap, we can find a corresponding one
+        * in stackmap, and vice versa.
+        */
+       err = compare_map_keys(stackid_hmap_fd, stackmap_fd);
+       if (CHECK(err, "compare_map_keys stackid_hmap vs. stackmap",
+                 "err %d errno %d\n", err, errno))
+               goto disable_pmu;
+
+       err = compare_map_keys(stackmap_fd, stackid_hmap_fd);
+       if (CHECK(err, "compare_map_keys stackmap vs. stackid_hmap",
+                 "err %d errno %d\n", err, errno))
+               goto disable_pmu;
+
+       /* keep one byte back so the terminating NUL always lands inside buf */
+       err = extract_build_id(buf, sizeof(buf) - 1);
+
+       if (CHECK(err, "get build_id with readelf",
+                 "err %d errno %d\n", err, errno))
+               goto disable_pmu;
+
+       err = bpf_map_get_next_key(stackmap_fd, NULL, &key);
+       if (CHECK(err, "get_next_key from stackmap",
+                 "err %d, errno %d\n", err, errno))
+               goto disable_pmu;
+
+       /* walk every stack in the map and look for the expected build ID */
+       do {
+               char build_id[64];
+
+               err = bpf_map_lookup_elem(stackmap_fd, &key, id_offs);
+               if (CHECK(err, "lookup_elem from stackmap",
+                         "err %d, errno %d\n", err, errno))
+                       goto disable_pmu;
+               for (i = 0; i < PERF_MAX_STACK_DEPTH; ++i) {
+                       if (id_offs[i].status == BPF_STACK_BUILD_ID_VALID &&
+                           id_offs[i].offset != 0) {
+                               /* hex-encode 20 bytes: 40 chars plus NUL
+                                * NOTE(review): 20 is presumably the size of
+                                * the build_id field; confirm
+                                */
+                               for (j = 0; j < 20; ++j)
+                                       sprintf(build_id + 2 * j, "%02x",
+                                               id_offs[i].build_id[j] & 0xff);
+                               if (strstr(buf, build_id) != NULL)
+                                       build_id_matches = 1;
+                       }
+               }
+               previous_key = key;
+       } while (bpf_map_get_next_key(stackmap_fd, &previous_key, &key) == 0);
+
+       CHECK(build_id_matches < 1, "build id match",
+             "Didn't find expected build ID from the map\n");
+
+disable_pmu:
+       ioctl(pmu_fd, PERF_EVENT_IOC_DISABLE);
+
+close_pmu:
+       close(pmu_fd);
+
+close_prog:
+       bpf_object__close(obj);
+
+out:
+       return;
+}
+
 int main(void)
 {
        test_pkt_access();
@@ -976,6 +1137,7 @@ int main(void)
        test_obj_name();
        test_tp_attach_query();
        test_stacktrace_map();
+       test_stacktrace_build_id();
 
        printf("Summary: %d PASSED, %d FAILED\n", pass_cnt, error_cnt);
        return error_cnt ? EXIT_FAILURE : EXIT_SUCCESS;
diff --git a/tools/testing/selftests/bpf/test_stacktrace_build_id.c b/tools/testing/selftests/bpf/test_stacktrace_build_id.c
new file mode 100644 (file)
index 0000000..b755bd7
--- /dev/null
@@ -0,0 +1,60 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2018 Facebook
+
+#include <linux/bpf.h>
+#include "bpf_helpers.h"
+
+#ifndef PERF_MAX_STACK_DEPTH
+#define PERF_MAX_STACK_DEPTH         127
+#endif
+
+struct bpf_map_def SEC("maps") control_map = {
+       .type = BPF_MAP_TYPE_ARRAY,
+       .key_size = sizeof(__u32),
+       .value_size = sizeof(__u32), /* non-zero makes oncpu() return early */
+       .max_entries = 1, /* oncpu() only looks up key 0 */
+};
+
+struct bpf_map_def SEC("maps") stackid_hmap = {
+       .type = BPF_MAP_TYPE_HASH,
+       .key_size = sizeof(__u32), /* id returned by bpf_get_stackid() */
+       .value_size = sizeof(__u32), /* oncpu() always stores 0 */
+       .max_entries = 10000,
+};
+
+struct bpf_map_def SEC("maps") stackmap = {
+       .type = BPF_MAP_TYPE_STACK_TRACE,
+       .key_size = sizeof(__u32),
+       .value_size = sizeof(struct bpf_stack_build_id)
+               * PERF_MAX_STACK_DEPTH, /* one record per stack entry */
+       .max_entries = 128,
+       .map_flags = BPF_F_STACK_BUILD_ID, /* store build id + offset */
+};
+
+/* Argument layout passed to oncpu(), taken from
+ * /sys/kernel/debug/tracing/events/random/urandom_read/format
+ */
+struct random_urandom_args {
+       unsigned long long pad; /* presumably the common trace header; confirm */
+       int got_bits;
+       int pool_left;
+       int input_left;
+};
+
+/*
+ * Handler for the random/urandom_read tracepoint.
+ *
+ * Unless slot 0 of control_map holds a non-zero value, capture the current
+ * user stack into stackmap and record the resulting stack id as a key in
+ * stackid_hmap.  Always returns 0.
+ */
+SEC("tracepoint/random/urandom_read")
+int oncpu(struct random_urandom_args *args)
+{
+       __u32 ctl_slot = 0;
+       __u32 zero = 0;
+       __u32 stack_id;
+       __u32 *stop_flag;
+
+       /* a non-zero control value means collection has been switched off */
+       stop_flag = bpf_map_lookup_elem(&control_map, &ctl_slot);
+       if (stop_flag && *stop_flag)
+               return 0;
+
+       /* negative results from bpf_get_stackid() are not recorded */
+       stack_id = bpf_get_stackid(args, &stackmap, BPF_F_USER_STACK);
+       if ((int)stack_id < 0)
+               return 0;
+
+       /* stackmap and stackid_hmap are expected to end up the same size */
+       bpf_map_update_elem(&stackid_hmap, &stack_id, &zero, 0);
+       return 0;
+}
+
+char _license[] SEC("license") = "GPL"; /* license string, tagged for the "license" section */
+__u32 _version SEC("version") = 1; /* ignored by tracepoints, required by libbpf.a */
index 86d7ff491b6fbff5824d10bea0f79e4a3bcd6748..3e7718b1a9ae49c176a407d9f14ad31704255701 100644 (file)
@@ -1596,6 +1596,60 @@ static struct bpf_test tests[] = {
                .result = ACCEPT,
                .prog_type = BPF_PROG_TYPE_SK_SKB,
        },
+       {
+               "direct packet read for SK_MSG",
+               .insns = {
+                       BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_1,
+                                   offsetof(struct sk_msg_md, data)),
+                       BPF_LDX_MEM(BPF_DW, BPF_REG_3, BPF_REG_1,
+                                   offsetof(struct sk_msg_md, data_end)),
+                       BPF_MOV64_REG(BPF_REG_0, BPF_REG_2),
+                       BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 8),
+                       BPF_JMP_REG(BPF_JGT, BPF_REG_0, BPF_REG_3, 1),
+                       BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_2, 0),
+                       BPF_MOV64_IMM(BPF_REG_0, 0),
+                       BPF_EXIT_INSN(),
+               },
+               .result = ACCEPT,
+               .prog_type = BPF_PROG_TYPE_SK_MSG,
+       },
+       {
+               "direct packet write for SK_MSG",
+               .insns = {
+                       BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_1,
+                                   offsetof(struct sk_msg_md, data)),
+                       BPF_LDX_MEM(BPF_DW, BPF_REG_3, BPF_REG_1,
+                                   offsetof(struct sk_msg_md, data_end)),
+                       BPF_MOV64_REG(BPF_REG_0, BPF_REG_2),
+                       BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 8),
+                       BPF_JMP_REG(BPF_JGT, BPF_REG_0, BPF_REG_3, 1),
+                       BPF_STX_MEM(BPF_B, BPF_REG_2, BPF_REG_2, 0),
+                       BPF_MOV64_IMM(BPF_REG_0, 0),
+                       BPF_EXIT_INSN(),
+               },
+               .result = ACCEPT,
+               .prog_type = BPF_PROG_TYPE_SK_MSG,
+       },
+       {
+               "overlapping checks for direct packet access SK_MSG",
+               .insns = {
+                       BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_1,
+                                   offsetof(struct sk_msg_md, data)),
+                       BPF_LDX_MEM(BPF_DW, BPF_REG_3, BPF_REG_1,
+                                   offsetof(struct sk_msg_md, data_end)),
+                       BPF_MOV64_REG(BPF_REG_0, BPF_REG_2),
+                       BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 8),
+                       BPF_JMP_REG(BPF_JGT, BPF_REG_0, BPF_REG_3, 4),
+                       BPF_MOV64_REG(BPF_REG_1, BPF_REG_2),
+                       BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 6),
+                       BPF_JMP_REG(BPF_JGT, BPF_REG_1, BPF_REG_3, 1),
+                       BPF_LDX_MEM(BPF_H, BPF_REG_0, BPF_REG_2, 6),
+                       BPF_MOV64_IMM(BPF_REG_0, 0),
+                       BPF_EXIT_INSN(),
+               },
+               .result = ACCEPT,
+               .prog_type = BPF_PROG_TYPE_SK_MSG,
+       },
        {
                "check skb->mark is not writeable by sockets",
                .insns = {
diff --git a/tools/testing/selftests/bpf/urandom_read.c b/tools/testing/selftests/bpf/urandom_read.c
new file mode 100644 (file)
index 0000000..4acfdeb
--- /dev/null
@@ -0,0 +1,22 @@
+#include <stdio.h>
+#include <unistd.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <stdlib.h>
+
+#define BUF_SIZE 256
+/*
+ * Read BUF_SIZE bytes from /dev/urandom four times.
+ *
+ * Returns 0 when every read succeeds, and 1 when the device cannot be
+ * opened or a read fails, so a caller that checks the exit status notices
+ * that the reads did not happen.
+ */
+int main(void)
+{
+       int fd = open("/dev/urandom", O_RDONLY);
+       int i;
+       int ret = 0;
+       char buf[BUF_SIZE];
+
+       if (fd < 0)
+               return 1;
+       for (i = 0; i < 4; ++i) {
+               /* a failed read must not be reported as success */
+               if (read(fd, buf, BUF_SIZE) < 0) {
+                       ret = 1;
+                       break;
+               }
+       }
+
+       close(fd);
+       return ret;
+}