a5d8cad18ead411a02ff2899c39f300f3a57456a
[muen/linux.git] / net / ipv4 / ip_tunnel.c
1 /*
2  * Copyright (c) 2013 Nicira, Inc.
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of version 2 of the GNU General Public
6  * License as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope that it will be useful, but
9  * WITHOUT ANY WARRANTY; without even the implied warranty of
10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11  * General Public License for more details.
12  *
13  * You should have received a copy of the GNU General Public License
14  * along with this program; if not, write to the Free Software
15  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
16  * 02110-1301, USA
17  */
18
19 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
20
21 #include <linux/capability.h>
22 #include <linux/module.h>
23 #include <linux/types.h>
24 #include <linux/kernel.h>
25 #include <linux/slab.h>
26 #include <linux/uaccess.h>
27 #include <linux/skbuff.h>
28 #include <linux/netdevice.h>
29 #include <linux/in.h>
30 #include <linux/tcp.h>
31 #include <linux/udp.h>
32 #include <linux/if_arp.h>
33 #include <linux/init.h>
34 #include <linux/in6.h>
35 #include <linux/inetdevice.h>
36 #include <linux/igmp.h>
37 #include <linux/netfilter_ipv4.h>
38 #include <linux/etherdevice.h>
39 #include <linux/if_ether.h>
40 #include <linux/if_vlan.h>
41 #include <linux/rculist.h>
42 #include <linux/err.h>
43
44 #include <net/sock.h>
45 #include <net/ip.h>
46 #include <net/icmp.h>
47 #include <net/protocol.h>
48 #include <net/ip_tunnels.h>
49 #include <net/arp.h>
50 #include <net/checksum.h>
51 #include <net/dsfield.h>
52 #include <net/inet_ecn.h>
53 #include <net/xfrm.h>
54 #include <net/net_namespace.h>
55 #include <net/netns/generic.h>
56 #include <net/rtnetlink.h>
57 #include <net/udp.h>
58 #include <net/dst_metadata.h>
59
60 #if IS_ENABLED(CONFIG_IPV6)
61 #include <net/ipv6.h>
62 #include <net/ip6_fib.h>
63 #include <net/ip6_route.h>
64 #endif
65
66 static unsigned int ip_tunnel_hash(__be32 key, __be32 remote)
67 {
68         return hash_32((__force u32)key ^ (__force u32)remote,
69                          IP_TNL_HASH_BITS);
70 }
71
72 static bool ip_tunnel_key_match(const struct ip_tunnel_parm *p,
73                                 __be16 flags, __be32 key)
74 {
75         if (p->i_flags & TUNNEL_KEY) {
76                 if (flags & TUNNEL_KEY)
77                         return key == p->i_key;
78                 else
79                         /* key expected, none present */
80                         return false;
81         } else
82                 return !(flags & TUNNEL_KEY);
83 }
84
/* Given src, dst and key, find the appropriate tunnel for input.
 *
 * Fallback tunnel: no source, no destination, no key, no options.
 *
 * Tunnel hash table:
 * We require an exact key match, i.e. if a key is present in the packet
 * it will match only a tunnel with the same key; if it is not present,
 * it will match only a keyless tunnel.
 *
 * All keyless packets that do not match a configured keyless tunnel
 * will match the fallback tunnel.
 *
 * The search runs in passes of decreasing specificity.  In every pass a
 * tunnel is only considered if its device is IFF_UP; a tunnel whose
 * parms.link equals @link is returned immediately, any other match is
 * merely remembered in 'cand'.  If no pass yields a link-exact match,
 * the result is, in order of preference: 'cand', the collect_md tunnel,
 * the fallback tunnel device, or NULL.
 *
 * The lists are walked with hlist_for_each_entry_rcu().
 */
struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn,
                                   int link, __be16 flags,
                                   __be32 remote, __be32 local,
                                   __be32 key)
{
        unsigned int hash;
        struct ip_tunnel *t, *cand = NULL;
        struct hlist_head *head;

        hash = ip_tunnel_hash(key, remote);
        head = &itn->tunnels[hash];

        /* Pass 1: both local and remote address match exactly.
         * Note: here every non-link-exact match overwrites 'cand'.
         */
        hlist_for_each_entry_rcu(t, head, hash_node) {
                if (local != t->parms.iph.saddr ||
                    remote != t->parms.iph.daddr ||
                    !(t->dev->flags & IFF_UP))
                        continue;

                if (!ip_tunnel_key_match(&t->parms, flags, key))
                        continue;

                if (t->parms.link == link)
                        return t;
                else
                        cand = t;
        }

        /* Pass 2: remote matches, tunnel has no local address (saddr == 0).
         * From here on only the first candidate found is kept.
         */
        hlist_for_each_entry_rcu(t, head, hash_node) {
                if (remote != t->parms.iph.daddr ||
                    t->parms.iph.saddr != 0 ||
                    !(t->dev->flags & IFF_UP))
                        continue;

                if (!ip_tunnel_key_match(&t->parms, flags, key))
                        continue;

                if (t->parms.link == link)
                        return t;
                else if (!cand)
                        cand = t;
        }

        /* The remaining passes use the bucket for "no remote address". */
        hash = ip_tunnel_hash(key, 0);
        head = &itn->tunnels[hash];

        /* Pass 3: either local matches a tunnel without remote address,
         * or local is a multicast address equal to the tunnel's daddr.
         */
        hlist_for_each_entry_rcu(t, head, hash_node) {
                if ((local != t->parms.iph.saddr || t->parms.iph.daddr != 0) &&
                    (local != t->parms.iph.daddr || !ipv4_is_multicast(local)))
                        continue;

                if (!(t->dev->flags & IFF_UP))
                        continue;

                if (!ip_tunnel_key_match(&t->parms, flags, key))
                        continue;

                if (t->parms.link == link)
                        return t;
                else if (!cand)
                        cand = t;
        }

        /* Callers passing TUNNEL_NO_KEY skip the key-only pass. */
        if (flags & TUNNEL_NO_KEY)
                goto skip_key_lookup;

        /* Pass 4: tunnel has neither local nor remote address; only the
         * key is compared (directly against i_key, without consulting
         * the TUNNEL_KEY flags).
         */
        hlist_for_each_entry_rcu(t, head, hash_node) {
                if (t->parms.i_key != key ||
                    t->parms.iph.saddr != 0 ||
                    t->parms.iph.daddr != 0 ||
                    !(t->dev->flags & IFF_UP))
                        continue;

                if (t->parms.link == link)
                        return t;
                else if (!cand)
                        cand = t;
        }

skip_key_lookup:
        /* No link-exact match: settle for a match on another link. */
        if (cand)
                return cand;

        /* Next choice: the collect_md tunnel, if one is set and up. */
        t = rcu_dereference(itn->collect_md_tun);
        if (t && t->dev->flags & IFF_UP)
                return t;

        /* Last resort: the fallback tunnel device, if present and up. */
        if (itn->fb_tunnel_dev && itn->fb_tunnel_dev->flags & IFF_UP)
                return netdev_priv(itn->fb_tunnel_dev);

        return NULL;
}
EXPORT_SYMBOL_GPL(ip_tunnel_lookup);
188
189 static struct hlist_head *ip_bucket(struct ip_tunnel_net *itn,
190                                     struct ip_tunnel_parm *parms)
191 {
192         unsigned int h;
193         __be32 remote;
194         __be32 i_key = parms->i_key;
195
196         if (parms->iph.daddr && !ipv4_is_multicast(parms->iph.daddr))
197                 remote = parms->iph.daddr;
198         else
199                 remote = 0;
200
201         if (!(parms->i_flags & TUNNEL_KEY) && (parms->i_flags & VTI_ISVTI))
202                 i_key = 0;
203
204         h = ip_tunnel_hash(i_key, remote);
205         return &itn->tunnels[h];
206 }
207
208 static void ip_tunnel_add(struct ip_tunnel_net *itn, struct ip_tunnel *t)
209 {
210         struct hlist_head *head = ip_bucket(itn, &t->parms);
211
212         if (t->collect_md)
213                 rcu_assign_pointer(itn->collect_md_tun, t);
214         hlist_add_head_rcu(&t->hash_node, head);
215 }
216
217 static void ip_tunnel_del(struct ip_tunnel_net *itn, struct ip_tunnel *t)
218 {
219         if (t->collect_md)
220                 rcu_assign_pointer(itn->collect_md_tun, NULL);
221         hlist_del_init_rcu(&t->hash_node);
222 }
223
224 static struct ip_tunnel *ip_tunnel_find(struct ip_tunnel_net *itn,
225                                         struct ip_tunnel_parm *parms,
226                                         int type)
227 {
228         __be32 remote = parms->iph.daddr;
229         __be32 local = parms->iph.saddr;
230         __be32 key = parms->i_key;
231         __be16 flags = parms->i_flags;
232         int link = parms->link;
233         struct ip_tunnel *t = NULL;
234         struct hlist_head *head = ip_bucket(itn, parms);
235
236         hlist_for_each_entry_rcu(t, head, hash_node) {
237                 if (local == t->parms.iph.saddr &&
238                     remote == t->parms.iph.daddr &&
239                     link == t->parms.link &&
240                     type == t->dev->type &&
241                     ip_tunnel_key_match(&t->parms, flags, key))
242                         break;
243         }
244         return t;
245 }
246
247 static struct net_device *__ip_tunnel_create(struct net *net,
248                                              const struct rtnl_link_ops *ops,
249                                              struct ip_tunnel_parm *parms)
250 {
251         int err;
252         struct ip_tunnel *tunnel;
253         struct net_device *dev;
254         char name[IFNAMSIZ];
255
256         err = -E2BIG;
257         if (parms->name[0]) {
258                 if (!dev_valid_name(parms->name))
259                         goto failed;
260                 strlcpy(name, parms->name, IFNAMSIZ);
261         } else {
262                 if (strlen(ops->kind) > (IFNAMSIZ - 3))
263                         goto failed;
264                 strcpy(name, ops->kind);
265                 strcat(name, "%d");
266         }
267
268         ASSERT_RTNL();
269         dev = alloc_netdev(ops->priv_size, name, NET_NAME_UNKNOWN, ops->setup);
270         if (!dev) {
271                 err = -ENOMEM;
272                 goto failed;
273         }
274         dev_net_set(dev, net);
275
276         dev->rtnl_link_ops = ops;
277
278         tunnel = netdev_priv(dev);
279         tunnel->parms = *parms;
280         tunnel->net = net;
281
282         err = register_netdevice(dev);
283         if (err)
284                 goto failed_free;
285
286         return dev;
287
288 failed_free:
289         free_netdev(dev);
290 failed:
291         return ERR_PTR(err);
292 }
293
294 static int ip_tunnel_bind_dev(struct net_device *dev)
295 {
296         struct net_device *tdev = NULL;
297         struct ip_tunnel *tunnel = netdev_priv(dev);
298         const struct iphdr *iph;
299         int hlen = LL_MAX_HEADER;
300         int mtu = ETH_DATA_LEN;
301         int t_hlen = tunnel->hlen + sizeof(struct iphdr);
302
303         iph = &tunnel->parms.iph;
304
305         /* Guess output device to choose reasonable mtu and needed_headroom */
306         if (iph->daddr) {
307                 struct flowi4 fl4;
308                 struct rtable *rt;
309
310                 ip_tunnel_init_flow(&fl4, iph->protocol, iph->daddr,
311                                     iph->saddr, tunnel->parms.o_key,
312                                     RT_TOS(iph->tos), tunnel->parms.link,
313                                     tunnel->fwmark, 0);
314                 rt = ip_route_output_key(tunnel->net, &fl4);
315
316                 if (!IS_ERR(rt)) {
317                         tdev = rt->dst.dev;
318                         ip_rt_put(rt);
319                 }
320                 if (dev->type != ARPHRD_ETHER)
321                         dev->flags |= IFF_POINTOPOINT;
322
323                 dst_cache_reset(&tunnel->dst_cache);
324         }
325
326         if (!tdev && tunnel->parms.link)
327                 tdev = __dev_get_by_index(tunnel->net, tunnel->parms.link);
328
329         if (tdev) {
330                 hlen = tdev->hard_header_len + tdev->needed_headroom;
331                 mtu = min(tdev->mtu, IP_MAX_MTU);
332         }
333
334         dev->needed_headroom = t_hlen + hlen;
335         mtu -= (dev->hard_header_len + t_hlen);
336
337         if (mtu < IPV4_MIN_MTU)
338                 mtu = IPV4_MIN_MTU;
339
340         return mtu;
341 }
342
343 static struct ip_tunnel *ip_tunnel_create(struct net *net,
344                                           struct ip_tunnel_net *itn,
345                                           struct ip_tunnel_parm *parms)
346 {
347         struct ip_tunnel *nt;
348         struct net_device *dev;
349         int t_hlen;
350         int mtu;
351         int err;
352
353         dev = __ip_tunnel_create(net, itn->rtnl_link_ops, parms);
354         if (IS_ERR(dev))
355                 return ERR_CAST(dev);
356
357         mtu = ip_tunnel_bind_dev(dev);
358         err = dev_set_mtu(dev, mtu);
359         if (err)
360                 goto err_dev_set_mtu;
361
362         nt = netdev_priv(dev);
363         t_hlen = nt->hlen + sizeof(struct iphdr);
364         dev->min_mtu = ETH_MIN_MTU;
365         dev->max_mtu = IP_MAX_MTU - dev->hard_header_len - t_hlen;
366         ip_tunnel_add(itn, nt);
367         return nt;
368
369 err_dev_set_mtu:
370         unregister_netdevice(dev);
371         return ERR_PTR(err);
372 }
373
/* Common receive path for a packet that was matched to @tunnel.
 *
 * @tunnel:        tunnel the packet belongs to
 * @skb:           the packet; it is always consumed (either passed to
 *                 gro_cells_receive() or freed)
 * @tpi:           parsed tunnel header info; its flags and seq are used
 * @tun_dst:       optional metadata dst; attached to the skb on success,
 *                 released on drop
 * @log_ecn_error: log (rate-limited) when ECN decapsulation reports an
 *                 error
 *
 * Always returns 0, whether the packet was delivered or dropped.
 */
int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb,
                  const struct tnl_ptk_info *tpi, struct metadata_dst *tun_dst,
                  bool log_ecn_error)
{
        struct pcpu_sw_netstats *tstats;
        const struct iphdr *iph = ip_hdr(skb);
        int err;

#ifdef CONFIG_NET_IPGRE_BROADCAST
        /* Multicast outer destination: count it and mark the packet. */
        if (ipv4_is_multicast(iph->daddr)) {
                tunnel->dev->stats.multicast++;
                skb->pkt_type = PACKET_BROADCAST;
        }
#endif

        /* Checksum presence must agree between packet and tunnel config. */
        if ((!(tpi->flags&TUNNEL_CSUM) &&  (tunnel->parms.i_flags&TUNNEL_CSUM)) ||
             ((tpi->flags&TUNNEL_CSUM) && !(tunnel->parms.i_flags&TUNNEL_CSUM))) {
                tunnel->dev->stats.rx_crc_errors++;
                tunnel->dev->stats.rx_errors++;
                goto drop;
        }

        /* Tunnel expects sequence numbers: drop packets without one, or
         * whose number lies behind the next expected one (signed 32-bit
         * difference; not enforced while i_seqno is still 0).
         */
        if (tunnel->parms.i_flags&TUNNEL_SEQ) {
                if (!(tpi->flags&TUNNEL_SEQ) ||
                    (tunnel->i_seqno && (s32)(ntohl(tpi->seq) - tunnel->i_seqno) < 0)) {
                        tunnel->dev->stats.rx_fifo_errors++;
                        tunnel->dev->stats.rx_errors++;
                        goto drop;
                }
                tunnel->i_seqno = ntohl(tpi->seq) + 1;
        }

        skb_reset_network_header(skb);

        /* 'iph' still refers to the outer header fetched on entry.  Any
         * non-zero result may be logged; only results > 1 drop the packet.
         */
        err = IP_ECN_decapsulate(iph, skb);
        if (unlikely(err)) {
                if (log_ecn_error)
                        net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n",
                                        &iph->saddr, iph->tos);
                if (err > 1) {
                        ++tunnel->dev->stats.rx_frame_errors;
                        ++tunnel->dev->stats.rx_errors;
                        goto drop;
                }
        }

        /* Account the packet in this CPU's software stats. */
        tstats = this_cpu_ptr(tunnel->dev->tstats);
        u64_stats_update_begin(&tstats->syncp);
        tstats->rx_packets++;
        tstats->rx_bytes += skb->len;
        u64_stats_update_end(&tstats->syncp);

        /* Second argument is true when the tunnel's net differs from the
         * net of its device.
         */
        skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(tunnel->dev)));

        if (tunnel->dev->type == ARPHRD_ETHER) {
                skb->protocol = eth_type_trans(skb, tunnel->dev);
                skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
        } else {
                skb->dev = tunnel->dev;
        }

        if (tun_dst)
                skb_dst_set(skb, (struct dst_entry *)tun_dst);

        gro_cells_receive(&tunnel->gro_cells, skb);
        return 0;

drop:
        /* The metadata dst was never attached to the skb; release it here. */
        if (tun_dst)
                dst_release((struct dst_entry *)tun_dst);
        kfree_skb(skb);
        return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_rcv);
448
449 int ip_tunnel_encap_add_ops(const struct ip_tunnel_encap_ops *ops,
450                             unsigned int num)
451 {
452         if (num >= MAX_IPTUN_ENCAP_OPS)
453                 return -ERANGE;
454
455         return !cmpxchg((const struct ip_tunnel_encap_ops **)
456                         &iptun_encaps[num],
457                         NULL, ops) ? 0 : -1;
458 }
459 EXPORT_SYMBOL(ip_tunnel_encap_add_ops);
460
461 int ip_tunnel_encap_del_ops(const struct ip_tunnel_encap_ops *ops,
462                             unsigned int num)
463 {
464         int ret;
465
466         if (num >= MAX_IPTUN_ENCAP_OPS)
467                 return -ERANGE;
468
469         ret = (cmpxchg((const struct ip_tunnel_encap_ops **)
470                        &iptun_encaps[num],
471                        ops, NULL) == ops) ? 0 : -1;
472
473         synchronize_net();
474
475         return ret;
476 }
477 EXPORT_SYMBOL(ip_tunnel_encap_del_ops);
478
479 int ip_tunnel_encap_setup(struct ip_tunnel *t,
480                           struct ip_tunnel_encap *ipencap)
481 {
482         int hlen;
483
484         memset(&t->encap, 0, sizeof(t->encap));
485
486         hlen = ip_encap_hlen(ipencap);
487         if (hlen < 0)
488                 return hlen;
489
490         t->encap.type = ipencap->type;
491         t->encap.sport = ipencap->sport;
492         t->encap.dport = ipencap->dport;
493         t->encap.flags = ipencap->flags;
494
495         t->encap_hlen = hlen;
496         t->hlen = t->encap_hlen + t->tun_hlen;
497
498         return 0;
499 }
500 EXPORT_SYMBOL_GPL(ip_tunnel_encap_setup);
501
/* Compute the MTU available to the inner packet, propagate it to the
 * skb's dst, and signal "packet too big" to the sender when needed.
 *
 * @dev:         tunnel device
 * @skb:         packet about to be transmitted
 * @rt:          route chosen for the outer packet
 * @df:          non-zero if the outer packet will carry DF
 * @inner_iph:   inner header, read as an IPv4 header for ETH_P_IP
 * @tunnel_hlen: tunnel header length; only used when @md is true,
 *               otherwise tunnel->hlen is used instead
 * @dst:         outer destination; only used when @md is true (IPv6 case)
 * @md:          true when called from the metadata-based transmit path
 *
 * Returns 0 if the packet may be sent, or -E2BIG after an ICMP/ICMPv6
 * "too big" error has been sent for it.
 */
static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb,
                            struct rtable *rt, __be16 df,
                            const struct iphdr *inner_iph,
                            int tunnel_hlen, __be32 dst, bool md)
{
        struct ip_tunnel *tunnel = netdev_priv(dev);
        int pkt_size;
        int mtu;

        tunnel_hlen = md ? tunnel_hlen : tunnel->hlen;
        pkt_size = skb->len - tunnel_hlen - dev->hard_header_len;

        /* With DF the limit is the outer route's MTU minus all headers we
         * add; without DF use the MTU of the skb's own dst if it has a
         * valid one, else the device MTU.
         */
        if (df)
                mtu = dst_mtu(&rt->dst) - dev->hard_header_len
                                        - sizeof(struct iphdr) - tunnel_hlen;
        else
                mtu = skb_valid_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;

        if (skb_valid_dst(skb))
                skb_dst_update_pmtu(skb, mtu);

        if (skb->protocol == htons(ETH_P_IP)) {
                /* Non-GSO inner IPv4 packet with DF set that does not fit:
                 * report ICMP "fragmentation needed" with the usable MTU.
                 */
                if (!skb_is_gso(skb) &&
                    (inner_iph->frag_off & htons(IP_DF)) &&
                    mtu < pkt_size) {
                        memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
                        icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
                        return -E2BIG;
                }
        }
#if IS_ENABLED(CONFIG_IPV6)
        else if (skb->protocol == htons(ETH_P_IPV6)) {
                struct rt6_info *rt6;
                __be32 daddr;

                rt6 = skb_valid_dst(skb) ? (struct rt6_info *)skb_dst(skb) :
                                           NULL;
                daddr = md ? dst : tunnel->parms.iph.daddr;

                /* Lower the MTU metric on the inner IPv6 route, but only
                 * when the new value is smaller than the current one and
                 * not below IPV6_MIN_MTU, and either the tunnel has a
                 * non-multicast destination or the route is a /128.
                 */
                if (rt6 && mtu < dst_mtu(skb_dst(skb)) &&
                           mtu >= IPV6_MIN_MTU) {
                        if ((daddr && !ipv4_is_multicast(daddr)) ||
                            rt6->rt6i_dst.plen == 128) {
                                rt6->rt6i_flags |= RTF_MODIFIED;
                                dst_metric_set(skb_dst(skb), RTAX_MTU, mtu);
                        }
                }

                /* Non-GSO packet that does not fit: send "packet too big". */
                if (!skb_is_gso(skb) && mtu >= IPV6_MIN_MTU &&
                                        mtu < pkt_size) {
                        icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
                        return -E2BIG;
                }
        }
#endif
        return 0;
}
559
/* Transmit @skb through tunnel device @dev using the per-packet tunnel
 * metadata attached to the skb (skb_tunnel_info()).
 *
 * @skb:         packet to send; always consumed
 * @dev:         tunnel device
 * @proto:       IP protocol number for the outer header
 * @tunnel_hlen: tunnel header length, passed on to tnl_update_pmtu()
 *
 * On failure the skb is freed and dev->stats.tx_errors (or tx_dropped
 * when making headroom fails) is incremented.
 */
void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
                       u8 proto, int tunnel_hlen)
{
        struct ip_tunnel *tunnel = netdev_priv(dev);
        u32 headroom = sizeof(struct iphdr);
        struct ip_tunnel_info *tun_info;
        const struct ip_tunnel_key *key;
        const struct iphdr *inner_iph;
        struct rtable *rt = NULL;
        struct flowi4 fl4;
        __be16 df = 0;
        u8 tos, ttl;
        bool use_cache;

        /* Require IPv4 TX tunnel metadata on the packet. */
        tun_info = skb_tunnel_info(skb);
        if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) ||
                     ip_tunnel_info_af(tun_info) != AF_INET))
                goto tx_error;
        key = &tun_info->key;
        memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
        inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
        /* A key TOS of exactly 1 means: take the TOS from the inner header. */
        tos = key->tos;
        if (tos == 1) {
                if (skb->protocol == htons(ETH_P_IP))
                        tos = inner_iph->tos;
                else if (skb->protocol == htons(ETH_P_IPV6))
                        tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
        }
        ip_tunnel_init_flow(&fl4, proto, key->u.ipv4.dst, key->u.ipv4.src,
                            tunnel_id_to_key32(key->tun_id), RT_TOS(tos),
                            0, skb->mark, skb_get_hash(skb));
        /* Tunnels configured with an encapsulation are rejected here. */
        if (tunnel->encap.type != TUNNEL_ENCAP_NONE)
                goto tx_error;

        /* Try the metadata's dst cache first, then do a route lookup. */
        use_cache = ip_tunnel_dst_cache_usable(skb, tun_info);
        if (use_cache)
                rt = dst_cache_get_ip4(&tun_info->dst_cache, &fl4.saddr);
        if (!rt) {
                rt = ip_route_output_key(tunnel->net, &fl4);
                if (IS_ERR(rt)) {
                        dev->stats.tx_carrier_errors++;
                        goto tx_error;
                }
                if (use_cache)
                        dst_cache_set_ip4(&tun_info->dst_cache, &rt->dst,
                                          fl4.saddr);
        }
        /* The route leads back into this very tunnel device. */
        if (rt->dst.dev == dev) {
                ip_rt_put(rt);
                dev->stats.collisions++;
                goto tx_error;
        }

        if (key->tun_flags & TUNNEL_DONT_FRAGMENT)
                df = htons(IP_DF);
        if (tnl_update_pmtu(dev, skb, rt, df, inner_iph, tunnel_hlen,
                            key->u.ipv4.dst, true)) {
                ip_rt_put(rt);
                goto tx_error;
        }

        tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
        /* TTL 0 in the key: inherit from the inner header, or use the
         * route's hop limit for non-IP payloads.
         */
        ttl = key->ttl;
        if (ttl == 0) {
                if (skb->protocol == htons(ETH_P_IP))
                        ttl = inner_iph->ttl;
                else if (skb->protocol == htons(ETH_P_IPV6))
                        ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
                else
                        ttl = ip4_dst_hoplimit(&rt->dst);
        }

        /* If the metadata did not request DF, copy it from an inner IPv4
         * header.
         */
        if (!df && skb->protocol == htons(ETH_P_IP))
                df = inner_iph->frag_off & htons(IP_DF);

        /* Grow the device's advertised headroom if this path needs more. */
        headroom += LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len;
        if (headroom > dev->needed_headroom)
                dev->needed_headroom = headroom;

        if (skb_cow_head(skb, dev->needed_headroom)) {
                ip_rt_put(rt);
                goto tx_dropped;
        }
        /* No ip_rt_put() after this call, unlike on the error paths. */
        iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, proto, tos, ttl,
                      df, !net_eq(tunnel->net, dev_net(dev)));
        return;
tx_error:
        dev->stats.tx_errors++;
        goto kfree;
tx_dropped:
        dev->stats.tx_dropped++;
kfree:
        kfree_skb(skb);
}
EXPORT_SYMBOL_GPL(ip_md_tunnel_xmit);
655
/* Transmit @skb through tunnel device @dev using the outer header
 * template @tnl_params.
 *
 * @skb:        packet to send; always consumed
 * @dev:        tunnel device
 * @tnl_params: outer IPv4 header template (daddr, saddr, tos, ttl,
 *              frag_off are read)
 * @protocol:   IP protocol number for the outer header
 *
 * If the template has no destination (NBMA tunnel), the destination is
 * taken from the skb's tunnel metadata, from the inner IPv4 route's
 * next hop, or from an IPv4-compatible IPv6 neighbour/destination
 * address.
 *
 * On failure the skb is freed and dev->stats.tx_errors (or tx_dropped
 * when making headroom fails) is incremented.
 */
void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
                    const struct iphdr *tnl_params, u8 protocol)
{
        struct ip_tunnel *tunnel = netdev_priv(dev);
        struct ip_tunnel_info *tun_info = NULL;
        const struct iphdr *inner_iph;
        unsigned int max_headroom;      /* The extra header space needed */
        struct rtable *rt = NULL;               /* Route to the other host */
        bool use_cache = false;
        struct flowi4 fl4;
        bool md = false;
        bool connected;
        u8 tos, ttl;
        __be32 dst;
        __be16 df;

        inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
        /* 'connected' gates use of a dst cache further down. */
        connected = (tunnel->parms.iph.daddr != 0);

        memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));

        dst = tnl_params->daddr;
        if (dst == 0) {
                /* NBMA tunnel */

                if (!skb_dst(skb)) {
                        dev->stats.tx_fifo_errors++;
                        goto tx_error;
                }

                /* Prefer a destination supplied via IPv4 TX tunnel metadata. */
                tun_info = skb_tunnel_info(skb);
                if (tun_info && (tun_info->mode & IP_TUNNEL_INFO_TX) &&
                    ip_tunnel_info_af(tun_info) == AF_INET &&
                    tun_info->key.u.ipv4.dst) {
                        dst = tun_info->key.u.ipv4.dst;
                        md = true;
                        connected = true;
                }
                /* Inner IPv4: use the next hop of the packet's own route. */
                else if (skb->protocol == htons(ETH_P_IP)) {
                        rt = skb_rtable(skb);
                        dst = rt_nexthop(rt, inner_iph->daddr);
                }
#if IS_ENABLED(CONFIG_IPV6)
                /* Inner IPv6: the neighbour key (or, if that is the
                 * unspecified address, the packet's destination) must be
                 * IPv4-compatible; its last 32 bits become the outer
                 * destination.
                 */
                else if (skb->protocol == htons(ETH_P_IPV6)) {
                        const struct in6_addr *addr6;
                        struct neighbour *neigh;
                        bool do_tx_error_icmp;
                        int addr_type;

                        neigh = dst_neigh_lookup(skb_dst(skb),
                                                 &ipv6_hdr(skb)->daddr);
                        if (!neigh)
                                goto tx_error;

                        addr6 = (const struct in6_addr *)&neigh->primary_key;
                        addr_type = ipv6_addr_type(addr6);

                        if (addr_type == IPV6_ADDR_ANY) {
                                addr6 = &ipv6_hdr(skb)->daddr;
                                addr_type = ipv6_addr_type(addr6);
                        }

                        if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
                                do_tx_error_icmp = true;
                        else {
                                do_tx_error_icmp = false;
                                dst = addr6->s6_addr32[3];
                        }
                        /* Drop the neighbour reference before bailing out. */
                        neigh_release(neigh);
                        if (do_tx_error_icmp)
                                goto tx_error_icmp;
                }
#endif
                else
                        goto tx_error;

                if (!md)
                        connected = false;
        }

        /* Low TOS bit set in the template: inherit TOS from the inner
         * header.  In that case the route depends on the packet, so the
         * tunnel is treated as not connected (no dst cache).
         */
        tos = tnl_params->tos;
        if (tos & 0x1) {
                tos &= ~0x1;
                if (skb->protocol == htons(ETH_P_IP)) {
                        tos = inner_iph->tos;
                        connected = false;
                } else if (skb->protocol == htons(ETH_P_IPV6)) {
                        tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
                        connected = false;
                }
        }

        ip_tunnel_init_flow(&fl4, protocol, dst, tnl_params->saddr,
                            tunnel->parms.o_key, RT_TOS(tos), tunnel->parms.link,
                            tunnel->fwmark, skb_get_hash(skb));

        /* &protocol is passed by pointer, so ip_tunnel_encap() may
         * presumably rewrite it (TODO confirm against ip_tunnels.h).
         */
        if (ip_tunnel_encap(skb, tunnel, &protocol, &fl4) < 0)
                goto tx_error;

        /* Route cache: the metadata's cache for metadata-driven packets,
         * the tunnel's own cache for connected tunnels, none otherwise
         * (this also discards any rt borrowed from the skb above).
         */
        if (connected && md) {
                use_cache = ip_tunnel_dst_cache_usable(skb, tun_info);
                if (use_cache)
                        rt = dst_cache_get_ip4(&tun_info->dst_cache,
                                               &fl4.saddr);
        } else {
                rt = connected ? dst_cache_get_ip4(&tunnel->dst_cache,
                                                &fl4.saddr) : NULL;
        }

        if (!rt) {
                rt = ip_route_output_key(tunnel->net, &fl4);

                if (IS_ERR(rt)) {
                        dev->stats.tx_carrier_errors++;
                        goto tx_error;
                }
                if (use_cache)
                        dst_cache_set_ip4(&tun_info->dst_cache, &rt->dst,
                                          fl4.saddr);
                else if (!md && connected)
                        dst_cache_set_ip4(&tunnel->dst_cache, &rt->dst,
                                          fl4.saddr);
        }

        /* The route leads back into this very tunnel device. */
        if (rt->dst.dev == dev) {
                ip_rt_put(rt);
                dev->stats.collisions++;
                goto tx_error;
        }

        if (tnl_update_pmtu(dev, skb, rt, tnl_params->frag_off, inner_iph,
                            0, 0, false)) {
                ip_rt_put(rt);
                goto tx_error;
        }

        /* While errors are pending and recent (within IPTUNNEL_ERR_TIMEO
         * of err_time), report a link failure for this packet and consume
         * one count; once they are stale, forget them.
         */
        if (tunnel->err_count > 0) {
                if (time_before(jiffies,
                                tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
                        tunnel->err_count--;

                        dst_link_failure(skb);
                } else
                        tunnel->err_count = 0;
        }

        tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
        /* TTL 0 in the template: inherit from the inner header, or use
         * the route's hop limit for other payloads.
         */
        ttl = tnl_params->ttl;
        if (ttl == 0) {
                if (skb->protocol == htons(ETH_P_IP))
                        ttl = inner_iph->ttl;
#if IS_ENABLED(CONFIG_IPV6)
                else if (skb->protocol == htons(ETH_P_IPV6))
                        ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
#endif
                else
                        ttl = ip4_dst_hoplimit(&rt->dst);
        }

        /* Outer DF: from the template, plus the inner IPv4 DF bit unless
         * the tunnel is configured to ignore it.
         */
        df = tnl_params->frag_off;
        if (skb->protocol == htons(ETH_P_IP) && !tunnel->ignore_df)
                df |= (inner_iph->frag_off&htons(IP_DF));

        /* Grow the device's advertised headroom if this path needs more. */
        max_headroom = LL_RESERVED_SPACE(rt->dst.dev) + sizeof(struct iphdr)
                        + rt->dst.header_len + ip_encap_hlen(&tunnel->encap);
        if (max_headroom > dev->needed_headroom)
                dev->needed_headroom = max_headroom;

        if (skb_cow_head(skb, dev->needed_headroom)) {
                ip_rt_put(rt);
                dev->stats.tx_dropped++;
                kfree_skb(skb);
                return;
        }

        /* No ip_rt_put() after this call, unlike on the error paths. */
        iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, protocol, tos, ttl,
                      df, !net_eq(tunnel->net, dev_net(dev)));
        return;

#if IS_ENABLED(CONFIG_IPV6)
tx_error_icmp:
        dst_link_failure(skb);
#endif
tx_error:
        dev->stats.tx_errors++;
        kfree_skb(skb);
}
EXPORT_SYMBOL_GPL(ip_tunnel_xmit);
844
845 static void ip_tunnel_update(struct ip_tunnel_net *itn,
846                              struct ip_tunnel *t,
847                              struct net_device *dev,
848                              struct ip_tunnel_parm *p,
849                              bool set_mtu,
850                              __u32 fwmark)
851 {
852         ip_tunnel_del(itn, t);
853         t->parms.iph.saddr = p->iph.saddr;
854         t->parms.iph.daddr = p->iph.daddr;
855         t->parms.i_key = p->i_key;
856         t->parms.o_key = p->o_key;
857         if (dev->type != ARPHRD_ETHER) {
858                 memcpy(dev->dev_addr, &p->iph.saddr, 4);
859                 memcpy(dev->broadcast, &p->iph.daddr, 4);
860         }
861         ip_tunnel_add(itn, t);
862
863         t->parms.iph.ttl = p->iph.ttl;
864         t->parms.iph.tos = p->iph.tos;
865         t->parms.iph.frag_off = p->iph.frag_off;
866
867         if (t->parms.link != p->link || t->fwmark != fwmark) {
868                 int mtu;
869
870                 t->parms.link = p->link;
871                 t->fwmark = fwmark;
872                 mtu = ip_tunnel_bind_dev(dev);
873                 if (set_mtu)
874                         dev->mtu = mtu;
875         }
876         dst_cache_reset(&t->dst_cache);
877         netdev_state_change(dev);
878 }
879
880 int ip_tunnel_ioctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd)
881 {
882         int err = 0;
883         struct ip_tunnel *t = netdev_priv(dev);
884         struct net *net = t->net;
885         struct ip_tunnel_net *itn = net_generic(net, t->ip_tnl_net_id);
886
887         switch (cmd) {
888         case SIOCGETTUNNEL:
889                 if (dev == itn->fb_tunnel_dev) {
890                         t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
891                         if (!t)
892                                 t = netdev_priv(dev);
893                 }
894                 memcpy(p, &t->parms, sizeof(*p));
895                 break;
896
897         case SIOCADDTUNNEL:
898         case SIOCCHGTUNNEL:
899                 err = -EPERM;
900                 if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
901                         goto done;
902                 if (p->iph.ttl)
903                         p->iph.frag_off |= htons(IP_DF);
904                 if (!(p->i_flags & VTI_ISVTI)) {
905                         if (!(p->i_flags & TUNNEL_KEY))
906                                 p->i_key = 0;
907                         if (!(p->o_flags & TUNNEL_KEY))
908                                 p->o_key = 0;
909                 }
910
911                 t = ip_tunnel_find(itn, p, itn->type);
912
913                 if (cmd == SIOCADDTUNNEL) {
914                         if (!t) {
915                                 t = ip_tunnel_create(net, itn, p);
916                                 err = PTR_ERR_OR_ZERO(t);
917                                 break;
918                         }
919
920                         err = -EEXIST;
921                         break;
922                 }
923                 if (dev != itn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
924                         if (t) {
925                                 if (t->dev != dev) {
926                                         err = -EEXIST;
927                                         break;
928                                 }
929                         } else {
930                                 unsigned int nflags = 0;
931
932                                 if (ipv4_is_multicast(p->iph.daddr))
933                                         nflags = IFF_BROADCAST;
934                                 else if (p->iph.daddr)
935                                         nflags = IFF_POINTOPOINT;
936
937                                 if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
938                                         err = -EINVAL;
939                                         break;
940                                 }
941
942                                 t = netdev_priv(dev);
943                         }
944                 }
945
946                 if (t) {
947                         err = 0;
948                         ip_tunnel_update(itn, t, dev, p, true, 0);
949                 } else {
950                         err = -ENOENT;
951                 }
952                 break;
953
954         case SIOCDELTUNNEL:
955                 err = -EPERM;
956                 if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
957                         goto done;
958
959                 if (dev == itn->fb_tunnel_dev) {
960                         err = -ENOENT;
961                         t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
962                         if (!t)
963                                 goto done;
964                         err = -EPERM;
965                         if (t == netdev_priv(itn->fb_tunnel_dev))
966                                 goto done;
967                         dev = t->dev;
968                 }
969                 unregister_netdevice(dev);
970                 err = 0;
971                 break;
972
973         default:
974                 err = -EINVAL;
975         }
976
977 done:
978         return err;
979 }
980 EXPORT_SYMBOL_GPL(ip_tunnel_ioctl);
981
982 int __ip_tunnel_change_mtu(struct net_device *dev, int new_mtu, bool strict)
983 {
984         struct ip_tunnel *tunnel = netdev_priv(dev);
985         int t_hlen = tunnel->hlen + sizeof(struct iphdr);
986         int max_mtu = IP_MAX_MTU - dev->hard_header_len - t_hlen;
987
988         if (new_mtu < ETH_MIN_MTU)
989                 return -EINVAL;
990
991         if (new_mtu > max_mtu) {
992                 if (strict)
993                         return -EINVAL;
994
995                 new_mtu = max_mtu;
996         }
997
998         dev->mtu = new_mtu;
999         return 0;
1000 }
1001 EXPORT_SYMBOL_GPL(__ip_tunnel_change_mtu);
1002
/* Strict variant of __ip_tunnel_change_mtu(): an out-of-range @new_mtu is
 * rejected rather than adjusted.
 */
int ip_tunnel_change_mtu(struct net_device *dev, int new_mtu)
{
        const bool strict = true;

        return __ip_tunnel_change_mtu(dev, new_mtu, strict);
}
EXPORT_SYMBOL_GPL(ip_tunnel_change_mtu);
1008
1009 static void ip_tunnel_dev_free(struct net_device *dev)
1010 {
1011         struct ip_tunnel *tunnel = netdev_priv(dev);
1012
1013         gro_cells_destroy(&tunnel->gro_cells);
1014         dst_cache_destroy(&tunnel->dst_cache);
1015         free_percpu(dev->tstats);
1016 }
1017
1018 void ip_tunnel_dellink(struct net_device *dev, struct list_head *head)
1019 {
1020         struct ip_tunnel *tunnel = netdev_priv(dev);
1021         struct ip_tunnel_net *itn;
1022
1023         itn = net_generic(tunnel->net, tunnel->ip_tnl_net_id);
1024
1025         if (itn->fb_tunnel_dev != dev) {
1026                 ip_tunnel_del(itn, netdev_priv(dev));
1027                 unregister_netdevice_queue(dev, head);
1028         }
1029 }
1030 EXPORT_SYMBOL_GPL(ip_tunnel_dellink);
1031
1032 struct net *ip_tunnel_get_link_net(const struct net_device *dev)
1033 {
1034         struct ip_tunnel *tunnel = netdev_priv(dev);
1035
1036         return tunnel->net;
1037 }
1038 EXPORT_SYMBOL(ip_tunnel_get_link_net);
1039
1040 int ip_tunnel_get_iflink(const struct net_device *dev)
1041 {
1042         struct ip_tunnel *tunnel = netdev_priv(dev);
1043
1044         return tunnel->parms.link;
1045 }
1046 EXPORT_SYMBOL(ip_tunnel_get_iflink);
1047
1048 int ip_tunnel_init_net(struct net *net, unsigned int ip_tnl_net_id,
1049                                   struct rtnl_link_ops *ops, char *devname)
1050 {
1051         struct ip_tunnel_net *itn = net_generic(net, ip_tnl_net_id);
1052         struct ip_tunnel_parm parms;
1053         unsigned int i;
1054
1055         itn->rtnl_link_ops = ops;
1056         for (i = 0; i < IP_TNL_HASH_SIZE; i++)
1057                 INIT_HLIST_HEAD(&itn->tunnels[i]);
1058
1059         if (!ops || !net_has_fallback_tunnels(net)) {
1060                 struct ip_tunnel_net *it_init_net;
1061
1062                 it_init_net = net_generic(&init_net, ip_tnl_net_id);
1063                 itn->type = it_init_net->type;
1064                 itn->fb_tunnel_dev = NULL;
1065                 return 0;
1066         }
1067
1068         memset(&parms, 0, sizeof(parms));
1069         if (devname)
1070                 strlcpy(parms.name, devname, IFNAMSIZ);
1071
1072         rtnl_lock();
1073         itn->fb_tunnel_dev = __ip_tunnel_create(net, ops, &parms);
1074         /* FB netdevice is special: we have one, and only one per netns.
1075          * Allowing to move it to another netns is clearly unsafe.
1076          */
1077         if (!IS_ERR(itn->fb_tunnel_dev)) {
1078                 itn->fb_tunnel_dev->features |= NETIF_F_NETNS_LOCAL;
1079                 itn->fb_tunnel_dev->mtu = ip_tunnel_bind_dev(itn->fb_tunnel_dev);
1080                 ip_tunnel_add(itn, netdev_priv(itn->fb_tunnel_dev));
1081                 itn->type = itn->fb_tunnel_dev->type;
1082         }
1083         rtnl_unlock();
1084
1085         return PTR_ERR_OR_ZERO(itn->fb_tunnel_dev);
1086 }
1087 EXPORT_SYMBOL_GPL(ip_tunnel_init_net);
1088
1089 static void ip_tunnel_destroy(struct net *net, struct ip_tunnel_net *itn,
1090                               struct list_head *head,
1091                               struct rtnl_link_ops *ops)
1092 {
1093         struct net_device *dev, *aux;
1094         int h;
1095
1096         for_each_netdev_safe(net, dev, aux)
1097                 if (dev->rtnl_link_ops == ops)
1098                         unregister_netdevice_queue(dev, head);
1099
1100         for (h = 0; h < IP_TNL_HASH_SIZE; h++) {
1101                 struct ip_tunnel *t;
1102                 struct hlist_node *n;
1103                 struct hlist_head *thead = &itn->tunnels[h];
1104
1105                 hlist_for_each_entry_safe(t, n, thead, hash_node)
1106                         /* If dev is in the same netns, it has already
1107                          * been added to the list by the previous loop.
1108                          */
1109                         if (!net_eq(dev_net(t->dev), net))
1110                                 unregister_netdevice_queue(t->dev, head);
1111         }
1112 }
1113
1114 void ip_tunnel_delete_nets(struct list_head *net_list, unsigned int id,
1115                            struct rtnl_link_ops *ops)
1116 {
1117         struct ip_tunnel_net *itn;
1118         struct net *net;
1119         LIST_HEAD(list);
1120
1121         rtnl_lock();
1122         list_for_each_entry(net, net_list, exit_list) {
1123                 itn = net_generic(net, id);
1124                 ip_tunnel_destroy(net, itn, &list, ops);
1125         }
1126         unregister_netdevice_many(&list);
1127         rtnl_unlock();
1128 }
1129 EXPORT_SYMBOL_GPL(ip_tunnel_delete_nets);
1130
1131 int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[],
1132                       struct ip_tunnel_parm *p, __u32 fwmark)
1133 {
1134         struct ip_tunnel *nt;
1135         struct net *net = dev_net(dev);
1136         struct ip_tunnel_net *itn;
1137         int mtu;
1138         int err;
1139
1140         nt = netdev_priv(dev);
1141         itn = net_generic(net, nt->ip_tnl_net_id);
1142
1143         if (nt->collect_md) {
1144                 if (rtnl_dereference(itn->collect_md_tun))
1145                         return -EEXIST;
1146         } else {
1147                 if (ip_tunnel_find(itn, p, dev->type))
1148                         return -EEXIST;
1149         }
1150
1151         nt->net = net;
1152         nt->parms = *p;
1153         nt->fwmark = fwmark;
1154         err = register_netdevice(dev);
1155         if (err)
1156                 goto err_register_netdevice;
1157
1158         if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
1159                 eth_hw_addr_random(dev);
1160
1161         mtu = ip_tunnel_bind_dev(dev);
1162         if (tb[IFLA_MTU]) {
1163                 unsigned int max = IP_MAX_MTU - dev->hard_header_len - nt->hlen;
1164
1165                 mtu = clamp(dev->mtu, (unsigned int)ETH_MIN_MTU,
1166                             (unsigned int)(max - sizeof(struct iphdr)));
1167         }
1168
1169         err = dev_set_mtu(dev, mtu);
1170         if (err)
1171                 goto err_dev_set_mtu;
1172
1173         ip_tunnel_add(itn, nt);
1174         return 0;
1175
1176 err_dev_set_mtu:
1177         unregister_netdevice(dev);
1178 err_register_netdevice:
1179         return err;
1180 }
1181 EXPORT_SYMBOL_GPL(ip_tunnel_newlink);
1182
1183 int ip_tunnel_changelink(struct net_device *dev, struct nlattr *tb[],
1184                          struct ip_tunnel_parm *p, __u32 fwmark)
1185 {
1186         struct ip_tunnel *t;
1187         struct ip_tunnel *tunnel = netdev_priv(dev);
1188         struct net *net = tunnel->net;
1189         struct ip_tunnel_net *itn = net_generic(net, tunnel->ip_tnl_net_id);
1190
1191         if (dev == itn->fb_tunnel_dev)
1192                 return -EINVAL;
1193
1194         t = ip_tunnel_find(itn, p, dev->type);
1195
1196         if (t) {
1197                 if (t->dev != dev)
1198                         return -EEXIST;
1199         } else {
1200                 t = tunnel;
1201
1202                 if (dev->type != ARPHRD_ETHER) {
1203                         unsigned int nflags = 0;
1204
1205                         if (ipv4_is_multicast(p->iph.daddr))
1206                                 nflags = IFF_BROADCAST;
1207                         else if (p->iph.daddr)
1208                                 nflags = IFF_POINTOPOINT;
1209
1210                         if ((dev->flags ^ nflags) &
1211                             (IFF_POINTOPOINT | IFF_BROADCAST))
1212                                 return -EINVAL;
1213                 }
1214         }
1215
1216         ip_tunnel_update(itn, t, dev, p, !tb[IFLA_MTU], fwmark);
1217         return 0;
1218 }
1219 EXPORT_SYMBOL_GPL(ip_tunnel_changelink);
1220
1221 int ip_tunnel_init(struct net_device *dev)
1222 {
1223         struct ip_tunnel *tunnel = netdev_priv(dev);
1224         struct iphdr *iph = &tunnel->parms.iph;
1225         int err;
1226
1227         dev->needs_free_netdev = true;
1228         dev->priv_destructor = ip_tunnel_dev_free;
1229         dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
1230         if (!dev->tstats)
1231                 return -ENOMEM;
1232
1233         err = dst_cache_init(&tunnel->dst_cache, GFP_KERNEL);
1234         if (err) {
1235                 free_percpu(dev->tstats);
1236                 return err;
1237         }
1238
1239         err = gro_cells_init(&tunnel->gro_cells, dev);
1240         if (err) {
1241                 dst_cache_destroy(&tunnel->dst_cache);
1242                 free_percpu(dev->tstats);
1243                 return err;
1244         }
1245
1246         tunnel->dev = dev;
1247         tunnel->net = dev_net(dev);
1248         strcpy(tunnel->parms.name, dev->name);
1249         iph->version            = 4;
1250         iph->ihl                = 5;
1251
1252         if (tunnel->collect_md) {
1253                 dev->features |= NETIF_F_NETNS_LOCAL;
1254                 netif_keep_dst(dev);
1255         }
1256         return 0;
1257 }
1258 EXPORT_SYMBOL_GPL(ip_tunnel_init);
1259
1260 void ip_tunnel_uninit(struct net_device *dev)
1261 {
1262         struct ip_tunnel *tunnel = netdev_priv(dev);
1263         struct net *net = tunnel->net;
1264         struct ip_tunnel_net *itn;
1265
1266         itn = net_generic(net, tunnel->ip_tnl_net_id);
1267         /* fb_tunnel_dev will be unregisted in net-exit call. */
1268         if (itn->fb_tunnel_dev != dev)
1269                 ip_tunnel_del(itn, netdev_priv(dev));
1270
1271         dst_cache_reset(&tunnel->dst_cache);
1272 }
1273 EXPORT_SYMBOL_GPL(ip_tunnel_uninit);
1274
1275 /* Do least required initialization, rest of init is done in tunnel_init call */
1276 void ip_tunnel_setup(struct net_device *dev, unsigned int net_id)
1277 {
1278         struct ip_tunnel *tunnel = netdev_priv(dev);
1279         tunnel->ip_tnl_net_id = net_id;
1280 }
1281 EXPORT_SYMBOL_GPL(ip_tunnel_setup);
1282
1283 MODULE_LICENSE("GPL");