/*
 *	Linux INET6 implementation
 *
 *	Pedro Roque		<roque@di.fc.ul.pt>
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 *
 *	YOSHIFUJI Hideaki @USAGI
 *		reworked default router selection.
 *		- respect outgoing interface
 *		- select from (probably) reachable routers (i.e.
 *		  routers in REACHABLE, STALE, DELAY or PROBE states).
 *		- always select the same router if it is (probably)
 *		  reachable.  otherwise, round-robin the list.
 *	Fixed routing subtrees.
 */
#define pr_fmt(fmt) "IPv6: " fmt

#include <linux/capability.h>
#include <linux/errno.h>
#include <linux/export.h>
#include <linux/types.h>
#include <linux/times.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/route.h>
#include <linux/netdevice.h>
#include <linux/in6.h>
#include <linux/mroute6.h>
#include <linux/init.h>
#include <linux/if_arp.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/nsproxy.h>
#include <linux/slab.h>
#include <linux/jhash.h>
#include <net/net_namespace.h>
#include <net/ip6_fib.h>
#include <net/ip6_route.h>
#include <net/ndisc.h>
#include <net/addrconf.h>
#include <linux/rtnetlink.h>
#include <net/dst_metadata.h>
#include <net/netevent.h>
#include <net/netlink.h>
#include <net/nexthop.h>
#include <net/lwtunnel.h>
#include <net/ip_tunnels.h>
#include <net/l3mdev.h>
#include <linux/uaccess.h>
#include <linux/sysctl.h>

static int ip6_rt_type_to_error(u8 fib6_type);

#define CREATE_TRACE_POINTS
#include <trace/events/fib6.h>
EXPORT_TRACEPOINT_SYMBOL_GPL(fib6_table_lookup);
#undef CREATE_TRACE_POINTS
enum rt6_nud_state {
	RT6_NUD_FAIL_HARD = -3,
	RT6_NUD_FAIL_PROBE = -2,
	RT6_NUD_FAIL_DO_RR = -1,
	RT6_NUD_SUCCEED = 1,
};
static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int	 ip6_default_advmss(const struct dst_entry *dst);
static unsigned int	 ip6_mtu(const struct dst_entry *dst);
static struct dst_entry *ip6_negative_advice(struct dst_entry *);
static void		ip6_dst_destroy(struct dst_entry *);
static void		ip6_dst_ifdown(struct dst_entry *,
				       struct net_device *dev, int how);
static int		 ip6_dst_gc(struct dst_ops *ops);

static int		ip6_pkt_discard(struct sk_buff *skb);
static int		ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
static int		ip6_pkt_prohibit(struct sk_buff *skb);
static int		ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb);
static void		ip6_link_failure(struct sk_buff *skb);
static void		ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
					   struct sk_buff *skb, u32 mtu);
static void		rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
					struct sk_buff *skb);
static int rt6_score_route(struct fib6_info *rt, int oif, int strict);
static size_t rt6_nlmsg_size(struct fib6_info *rt);
static int rt6_fill_node(struct net *net, struct sk_buff *skb,
			 struct fib6_info *rt, struct dst_entry *dst,
			 struct in6_addr *dest, struct in6_addr *src,
			 int iif, int type, u32 portid, u32 seq,
			 unsigned int flags);
static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt,
					   struct in6_addr *daddr,
					   struct in6_addr *saddr);

#ifdef CONFIG_IPV6_ROUTE_INFO
static struct fib6_info *rt6_add_route_info(struct net *net,
					    const struct in6_addr *prefix, int prefixlen,
					    const struct in6_addr *gwaddr,
					    struct net_device *dev,
					    unsigned int pref);
static struct fib6_info *rt6_get_route_info(struct net *net,
					    const struct in6_addr *prefix, int prefixlen,
					    const struct in6_addr *gwaddr,
					    struct net_device *dev);
#endif

struct uncached_list {
	spinlock_t		lock;
	struct list_head	head;
};

static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);
void rt6_uncached_list_add(struct rt6_info *rt)
{
	struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);

	rt->rt6i_uncached_list = ul;

	spin_lock_bh(&ul->lock);
	list_add_tail(&rt->rt6i_uncached, &ul->head);
	spin_unlock_bh(&ul->lock);
}

void rt6_uncached_list_del(struct rt6_info *rt)
{
	if (!list_empty(&rt->rt6i_uncached)) {
		struct uncached_list *ul = rt->rt6i_uncached_list;
		struct net *net = dev_net(rt->dst.dev);

		spin_lock_bh(&ul->lock);
		list_del(&rt->rt6i_uncached);
		atomic_dec(&net->ipv6.rt6_stats->fib_rt_uncache);
		spin_unlock_bh(&ul->lock);
	}
}

static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
{
	struct net_device *loopback_dev = net->loopback_dev;
	int cpu;

	if (dev == loopback_dev)
		return;

	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
		struct rt6_info *rt;

		spin_lock_bh(&ul->lock);
		list_for_each_entry(rt, &ul->head, rt6i_uncached) {
			struct inet6_dev *rt_idev = rt->rt6i_idev;
			struct net_device *rt_dev = rt->dst.dev;

			if (rt_idev->dev == dev) {
				rt->rt6i_idev = in6_dev_get(loopback_dev);
				in6_dev_put(rt_idev);
			}

			if (rt_dev == dev) {
				rt->dst.dev = loopback_dev;
				dev_hold(rt->dst.dev);
				dev_put(rt_dev);
			}
		}
		spin_unlock_bh(&ul->lock);
	}
}
static inline const void *choose_neigh_daddr(const struct in6_addr *p,
					     struct sk_buff *skb,
					     const void *daddr)
{
	if (!ipv6_addr_any(p))
		return (const void *) p;
	else if (skb)
		return &ipv6_hdr(skb)->daddr;
	return daddr;
}

struct neighbour *ip6_neigh_lookup(const struct in6_addr *gw,
				   struct net_device *dev,
				   struct sk_buff *skb,
				   const void *daddr)
{
	struct neighbour *n;

	daddr = choose_neigh_daddr(gw, skb, daddr);
	n = __ipv6_neigh_lookup(dev, daddr);
	if (n)
		return n;
	return neigh_create(&nd_tbl, daddr, dev);
}

static struct neighbour *ip6_dst_neigh_lookup(const struct dst_entry *dst,
					      struct sk_buff *skb,
					      const void *daddr)
{
	const struct rt6_info *rt = container_of(dst, struct rt6_info, dst);

	return ip6_neigh_lookup(&rt->rt6i_gateway, dst->dev, skb, daddr);
}

static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr)
{
	struct net_device *dev = dst->dev;
	struct rt6_info *rt = (struct rt6_info *)dst;

	daddr = choose_neigh_daddr(&rt->rt6i_gateway, NULL, daddr);
	if (!daddr)
		return;
	if (dev->flags & (IFF_NOARP | IFF_LOOPBACK))
		return;
	if (ipv6_addr_is_multicast((const struct in6_addr *)daddr))
		return;
	__ipv6_confirm_neigh(dev, daddr);
}
static struct dst_ops ip6_dst_ops_template = {
	.family			= AF_INET6,
	.gc			= ip6_dst_gc,
	.gc_thresh		= 1024,
	.check			= ip6_dst_check,
	.default_advmss		= ip6_default_advmss,
	.mtu			= ip6_mtu,
	.cow_metrics		= dst_cow_metrics_generic,
	.destroy		= ip6_dst_destroy,
	.ifdown			= ip6_dst_ifdown,
	.negative_advice	= ip6_negative_advice,
	.link_failure		= ip6_link_failure,
	.update_pmtu		= ip6_rt_update_pmtu,
	.redirect		= rt6_do_redirect,
	.local_out		= __ip6_local_out,
	.neigh_lookup		= ip6_dst_neigh_lookup,
	.confirm_neigh		= ip6_confirm_neigh,
};

static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
{
	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);

	return mtu ? : dst->dev->mtu;
}

static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
					 struct sk_buff *skb, u32 mtu)
{
}

static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
				      struct sk_buff *skb)
{
}

static struct dst_ops ip6_dst_blackhole_ops = {
	.family			= AF_INET6,
	.destroy		= ip6_dst_destroy,
	.check			= ip6_dst_check,
	.mtu			= ip6_blackhole_mtu,
	.default_advmss		= ip6_default_advmss,
	.update_pmtu		= ip6_rt_blackhole_update_pmtu,
	.redirect		= ip6_rt_blackhole_redirect,
	.cow_metrics		= dst_cow_metrics_generic,
	.neigh_lookup		= ip6_dst_neigh_lookup,
};

static const u32 ip6_template_metrics[RTAX_MAX] = {
	[RTAX_HOPLIMIT - 1] = 0,
};

static const struct fib6_info fib6_null_entry_template = {
	.fib6_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.fib6_protocol	= RTPROT_KERNEL,
	.fib6_metric	= ~(u32)0,
	.fib6_ref	= ATOMIC_INIT(1),
	.fib6_type	= RTN_UNREACHABLE,
	.fib6_metrics	= (struct dst_metrics *)&dst_default_metrics,
};

static const struct rt6_info ip6_null_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -ENETUNREACH,
		.input		= ip6_pkt_discard,
		.output		= ip6_pkt_discard_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};

#ifdef CONFIG_IPV6_MULTIPLE_TABLES

static const struct rt6_info ip6_prohibit_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.input		= ip6_pkt_prohibit,
		.output		= ip6_pkt_prohibit_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};

static const struct rt6_info ip6_blk_hole_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.input		= dst_discard,
		.output		= dst_discard_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};

#endif
static void rt6_info_init(struct rt6_info *rt)
{
	struct dst_entry *dst = &rt->dst;

	memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
	INIT_LIST_HEAD(&rt->rt6i_uncached);
}

/* allocate dst with ip6_dst_ops */
struct rt6_info *ip6_dst_alloc(struct net *net, struct net_device *dev,
			       int flags)
{
	struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
					1, DST_OBSOLETE_FORCE_CHK, flags);

	if (rt) {
		rt6_info_init(rt);
		atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
	}

	return rt;
}
EXPORT_SYMBOL(ip6_dst_alloc);

static void ip6_dst_destroy(struct dst_entry *dst)
{
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct fib6_info *from;
	struct inet6_dev *idev;

	dst_destroy_metrics_generic(dst);
	rt6_uncached_list_del(rt);

	idev = rt->rt6i_idev;
	if (idev) {
		rt->rt6i_idev = NULL;
		in6_dev_put(idev);
	}

	rcu_read_lock();
	from = rcu_dereference(rt->from);
	rcu_assign_pointer(rt->from, NULL);
	fib6_info_release(from);
	rcu_read_unlock();
}

static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
			   int how)
{
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct inet6_dev *idev = rt->rt6i_idev;
	struct net_device *loopback_dev =
		dev_net(dev)->loopback_dev;

	if (idev && idev->dev != loopback_dev) {
		struct inet6_dev *loopback_idev = in6_dev_get(loopback_dev);

		if (loopback_idev) {
			rt->rt6i_idev = loopback_idev;
			in6_dev_put(idev);
		}
	}
}

static bool __rt6_check_expired(const struct rt6_info *rt)
{
	if (rt->rt6i_flags & RTF_EXPIRES)
		return time_after(jiffies, rt->dst.expires);
	else
		return false;
}

static bool rt6_check_expired(const struct rt6_info *rt)
{
	struct fib6_info *from;

	from = rcu_dereference(rt->from);

	if (rt->rt6i_flags & RTF_EXPIRES) {
		if (time_after(jiffies, rt->dst.expires))
			return true;
	} else if (from) {
		return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK ||
			fib6_check_expired(from);
	}
	return false;
}
struct fib6_info *fib6_multipath_select(const struct net *net,
					struct fib6_info *match,
					struct flowi6 *fl6, int oif,
					const struct sk_buff *skb,
					int strict)
{
	struct fib6_info *sibling, *next_sibling;

	/* We might have already computed the hash for ICMPv6 errors. In such
	 * case it will always be non-zero. Otherwise now is the time to do it.
	 */
	if (!fl6->mp_hash)
		fl6->mp_hash = rt6_multipath_hash(net, fl6, skb, NULL);

	if (fl6->mp_hash <= atomic_read(&match->fib6_nh.nh_upper_bound))
		return match;

	list_for_each_entry_safe(sibling, next_sibling, &match->fib6_siblings,
				 fib6_siblings) {
		int nh_upper_bound;

		nh_upper_bound = atomic_read(&sibling->fib6_nh.nh_upper_bound);
		if (fl6->mp_hash > nh_upper_bound)
			continue;
		if (rt6_score_route(sibling, oif, strict) < 0)
			break;
		match = sibling;
		break;
	}

	return match;
}
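/* Illustrative sketch (not part of the kernel source): the hash-threshold
 * selection used above.  Each sibling nexthop owns an inclusive upper bound
 * in the 31-bit hash space; a flow hashes to the first nexthop whose bound
 * it does not exceed.  The struct and function names here are hypothetical.
 */
struct mp_nexthop_example {
	u32 upper_bound;		/* inclusive bound in hash space */
};

static int example_pick_nexthop(const struct mp_nexthop_example *nh,
				int num, u32 hash)
{
	int i;

	for (i = 0; i < num; i++)
		if (hash <= nh[i].upper_bound)
			return i;
	return num - 1;			/* defensive fallback */
}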
/*
 *	Route lookup. rcu_read_lock() should be held.
 */

static inline struct fib6_info *rt6_device_match(struct net *net,
						 struct fib6_info *rt,
						 const struct in6_addr *saddr,
						 int oif,
						 int flags)
{
	struct fib6_info *sprt;

	if (!oif && ipv6_addr_any(saddr) &&
	    !(rt->fib6_nh.nh_flags & RTNH_F_DEAD))
		return rt;

	for (sprt = rt; sprt; sprt = rcu_dereference(sprt->fib6_next)) {
		const struct net_device *dev = sprt->fib6_nh.nh_dev;

		if (sprt->fib6_nh.nh_flags & RTNH_F_DEAD)
			continue;

		if (oif) {
			if (dev->ifindex == oif)
				return sprt;
		} else {
			if (ipv6_chk_addr(net, saddr, dev,
					  flags & RT6_LOOKUP_F_IFACE))
				return sprt;
		}
	}

	if (oif && flags & RT6_LOOKUP_F_IFACE)
		return net->ipv6.fib6_null_entry;

	return rt->fib6_nh.nh_flags & RTNH_F_DEAD ? net->ipv6.fib6_null_entry : rt;
}
#ifdef CONFIG_IPV6_ROUTER_PREF
struct __rt6_probe_work {
	struct work_struct work;
	struct in6_addr target;
	struct net_device *dev;
};

static void rt6_probe_deferred(struct work_struct *w)
{
	struct in6_addr mcaddr;
	struct __rt6_probe_work *work =
		container_of(w, struct __rt6_probe_work, work);

	addrconf_addr_solict_mult(&work->target, &mcaddr);
	ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0);
	dev_put(work->dev);
	kfree(work);
}

static void rt6_probe(struct fib6_info *rt)
{
	struct __rt6_probe_work *work;
	const struct in6_addr *nh_gw;
	struct neighbour *neigh;
	struct net_device *dev;

	/*
	 * Okay, this does not seem to be appropriate
	 * for now, however, we need to check if it
	 * is really so; aka Router Reachability Probing.
	 *
	 * Router Reachability Probe MUST be rate-limited
	 * to no more than one per minute.
	 */
	if (!rt || !(rt->fib6_flags & RTF_GATEWAY))
		return;

	nh_gw = &rt->fib6_nh.nh_gw;
	dev = rt->fib6_nh.nh_dev;
	rcu_read_lock_bh();
	neigh = __ipv6_neigh_lookup_noref(dev, nh_gw);
	if (neigh) {
		struct inet6_dev *idev;

		if (neigh->nud_state & NUD_VALID)
			goto out;

		idev = __in6_dev_get(dev);
		work = NULL;
		write_lock(&neigh->lock);
		if (!(neigh->nud_state & NUD_VALID) &&
		    time_after(jiffies,
			       neigh->updated + idev->cnf.rtr_probe_interval)) {
			work = kmalloc(sizeof(*work), GFP_ATOMIC);
			if (work)
				__neigh_set_probe_once(neigh);
		}
		write_unlock(&neigh->lock);
	} else {
		work = kmalloc(sizeof(*work), GFP_ATOMIC);
	}

	if (work) {
		INIT_WORK(&work->work, rt6_probe_deferred);
		work->target = *nh_gw;
		dev_hold(dev);
		work->dev = dev;
		schedule_work(&work->work);
	}

out:
	rcu_read_unlock_bh();
}
#else
static inline void rt6_probe(struct fib6_info *rt)
{
}
#endif

/*
 * Default Router Selection (RFC 2461 6.3.6)
 */
static inline int rt6_check_dev(struct fib6_info *rt, int oif)
{
	const struct net_device *dev = rt->fib6_nh.nh_dev;

	if (!oif || dev->ifindex == oif)
		return 2;
	return 0;
}

static inline enum rt6_nud_state rt6_check_neigh(struct fib6_info *rt)
{
	enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;
	struct neighbour *neigh;

	if (rt->fib6_flags & RTF_NONEXTHOP ||
	    !(rt->fib6_flags & RTF_GATEWAY))
		return RT6_NUD_SUCCEED;

	rcu_read_lock_bh();
	neigh = __ipv6_neigh_lookup_noref(rt->fib6_nh.nh_dev,
					  &rt->fib6_nh.nh_gw);
	if (neigh) {
		read_lock(&neigh->lock);
		if (neigh->nud_state & NUD_VALID)
			ret = RT6_NUD_SUCCEED;
#ifdef CONFIG_IPV6_ROUTER_PREF
		else if (!(neigh->nud_state & NUD_FAILED))
			ret = RT6_NUD_SUCCEED;
		else
			ret = RT6_NUD_FAIL_PROBE;
#endif
		read_unlock(&neigh->lock);
	} else {
		ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
		      RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
	}
	rcu_read_unlock_bh();

	return ret;
}
static int rt6_score_route(struct fib6_info *rt, int oif, int strict)
{
	int m;

	m = rt6_check_dev(rt, oif);
	if (!m && (strict & RT6_LOOKUP_F_IFACE))
		return RT6_NUD_FAIL_HARD;
#ifdef CONFIG_IPV6_ROUTER_PREF
	m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->fib6_flags)) << 2;
#endif
	if (strict & RT6_LOOKUP_F_REACHABLE) {
		int n = rt6_check_neigh(rt);

		if (n < 0)
			return n;
	}
	return m;
}

/* called with rcu_read_lock held */
static inline bool fib6_ignore_linkdown(const struct fib6_info *f6i)
{
	const struct net_device *dev = fib6_info_nh_dev(f6i);
	bool rc = false;

	if (dev) {
		const struct inet6_dev *idev = __in6_dev_get(dev);

		rc = !!idev->cnf.ignore_routes_with_linkdown;
	}

	return rc;
}
static struct fib6_info *find_match(struct fib6_info *rt, int oif, int strict,
				    int *mpri, struct fib6_info *match,
				    bool *do_rr)
{
	int m;
	bool match_do_rr = false;

	if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
		goto out;

	if (fib6_ignore_linkdown(rt) &&
	    rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN &&
	    !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))
		goto out;

	if (fib6_check_expired(rt))
		goto out;

	m = rt6_score_route(rt, oif, strict);
	if (m == RT6_NUD_FAIL_DO_RR) {
		match_do_rr = true;
		m = 0; /* lowest valid score */
	} else if (m == RT6_NUD_FAIL_HARD) {
		goto out;
	}

	if (strict & RT6_LOOKUP_F_REACHABLE)
		rt6_probe(rt);

	/* note that m can be RT6_NUD_FAIL_PROBE at this point */
	if (m > *mpri) {
		*do_rr = match_do_rr;
		*mpri = m;
		match = rt;
	}
out:
	return match;
}

static struct fib6_info *find_rr_leaf(struct fib6_node *fn,
				      struct fib6_info *leaf,
				      struct fib6_info *rr_head,
				      u32 metric, int oif, int strict,
				      bool *do_rr)
{
	struct fib6_info *rt, *match, *cont;
	int mpri = -1;

	match = NULL;
	cont = NULL;
	for (rt = rr_head; rt; rt = rcu_dereference(rt->fib6_next)) {
		if (rt->fib6_metric != metric) {
			cont = rt;
			break;
		}
		match = find_match(rt, oif, strict, &mpri, match, do_rr);
	}

	for (rt = leaf; rt && rt != rr_head;
	     rt = rcu_dereference(rt->fib6_next)) {
		if (rt->fib6_metric != metric) {
			cont = rt;
			break;
		}
		match = find_match(rt, oif, strict, &mpri, match, do_rr);
	}

	if (match || !cont)
		return match;

	for (rt = cont; rt; rt = rcu_dereference(rt->fib6_next))
		match = find_match(rt, oif, strict, &mpri, match, do_rr);

	return match;
}
static struct fib6_info *rt6_select(struct net *net, struct fib6_node *fn,
				    int oif, int strict)
{
	struct fib6_info *leaf = rcu_dereference(fn->leaf);
	struct fib6_info *match, *rt0;
	bool do_rr = false;
	int key_plen;

	if (!leaf || leaf == net->ipv6.fib6_null_entry)
		return net->ipv6.fib6_null_entry;

	rt0 = rcu_dereference(fn->rr_ptr);
	if (!rt0)
		rt0 = leaf;

	/* Double check to make sure fn is not an intermediate node
	 * and fn->leaf does not point to its child's leaf
	 * (This might happen if all routes under fn are deleted from
	 * the tree and fib6_repair_tree() is called on the node.)
	 */
	key_plen = rt0->fib6_dst.plen;
#ifdef CONFIG_IPV6_SUBTREES
	if (rt0->fib6_src.plen)
		key_plen = rt0->fib6_src.plen;
#endif
	if (fn->fn_bit != key_plen)
		return net->ipv6.fib6_null_entry;

	match = find_rr_leaf(fn, leaf, rt0, rt0->fib6_metric, oif, strict,
			     &do_rr);

	if (do_rr) {
		struct fib6_info *next = rcu_dereference(rt0->fib6_next);

		/* no entries matched; do round-robin */
		if (!next || next->fib6_metric != rt0->fib6_metric)
			next = leaf;

		if (next != rt0) {
			spin_lock_bh(&leaf->fib6_table->tb6_lock);
			/* make sure next is not being deleted from the tree */
			if (next->fib6_node)
				rcu_assign_pointer(fn->rr_ptr, next);
			spin_unlock_bh(&leaf->fib6_table->tb6_lock);
		}
	}

	return match ? match : net->ipv6.fib6_null_entry;
}
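/* Illustrative sketch (not part of the kernel source): find_rr_leaf() above
 * visits the equal-metric siblings as a circular list -- from rr_head to the
 * end, then from the head of the leaf list back to rr_head -- so every
 * candidate is scored once no matter where the round-robin pointer sits.
 * A flat-array model of that visit order, with hypothetical names:
 */
static void example_rr_scan(int rr_head, int nroutes)
{
	int i;

	for (i = 0; i < nroutes; i++) {
		int idx = (rr_head + i) % nroutes;

		/* find_match() would score route 'idx' here */
		(void)idx;
	}
}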
static bool rt6_is_gw_or_nonexthop(const struct fib6_info *rt)
{
	return (rt->fib6_flags & (RTF_NONEXTHOP | RTF_GATEWAY));
}
#ifdef CONFIG_IPV6_ROUTE_INFO
int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
		  const struct in6_addr *gwaddr)
{
	struct net *net = dev_net(dev);
	struct route_info *rinfo = (struct route_info *) opt;
	struct in6_addr prefix_buf, *prefix;
	unsigned int pref;
	unsigned long lifetime;
	struct fib6_info *rt;

	if (len < sizeof(struct route_info)) {
		return -EINVAL;
	}

	/* Sanity check for prefix_len and length */
	if (rinfo->length > 3) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 128) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 64) {
		if (rinfo->length < 2) {
			return -EINVAL;
		}
	} else if (rinfo->prefix_len > 0) {
		if (rinfo->length < 1) {
			return -EINVAL;
		}
	}

	pref = rinfo->route_pref;
	if (pref == ICMPV6_ROUTER_PREF_INVALID)
		return -EINVAL;

	lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);

	if (rinfo->length == 3)
		prefix = (struct in6_addr *)rinfo->prefix;
	else {
		/* this function is safe */
		ipv6_addr_prefix(&prefix_buf,
				 (struct in6_addr *)rinfo->prefix,
				 rinfo->prefix_len);
		prefix = &prefix_buf;
	}

	if (rinfo->prefix_len == 0)
		rt = rt6_get_dflt_router(net, gwaddr, dev);
	else
		rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
					gwaddr, dev);

	if (rt && !lifetime) {
		ip6_del_rt(net, rt);
		rt = NULL;
	}

	if (!rt && lifetime)
		rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
					dev, pref);
	else if (rt)
		rt->fib6_flags = RTF_ROUTEINFO |
				 (rt->fib6_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);

	if (rt) {
		if (!addrconf_finite_timeout(lifetime))
			fib6_clean_expires(rt);
		else
			fib6_set_expires(rt, jiffies + HZ * lifetime);

		fib6_info_release(rt);
	}
	return 0;
}
#endif
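/* Illustrative sketch (not part of the kernel source): the sanity checks in
 * rt6_route_rcv() gathered into one predicate.  Per RFC 4191 the Route
 * Information Option length is in units of 8 octets (1, 2 or 3), carrying
 * 0, 8 or 16 prefix octets; the bounds below mirror the checks above.
 */
static bool example_rinfo_sane(u8 length, u8 prefix_len)
{
	if (length > 3 || prefix_len > 128)
		return false;
	if (prefix_len > 64)
		return length >= 2;
	if (prefix_len > 0)
		return length >= 1;
	return true;
}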
/*
 *	Misc support functions
 */

/* called with rcu_lock held */
static struct net_device *ip6_rt_get_dev_rcu(struct fib6_info *rt)
{
	struct net_device *dev = rt->fib6_nh.nh_dev;

	if (rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) {
		/* for copies of local routes, dst->dev needs to be the
		 * device if it is a master device, the master device if
		 * device is enslaved, and the loopback as the default
		 */
		if (netif_is_l3_slave(dev) &&
		    !rt6_need_strict(&rt->fib6_dst.addr))
			dev = l3mdev_master_dev_rcu(dev);
		else if (!netif_is_l3_master(dev))
			dev = dev_net(dev)->loopback_dev;
		/* last case is netif_is_l3_master(dev) is true in which
		 * case we want dev returned to be dev
		 */
	}

	return dev;
}

static const int fib6_prop[RTN_MAX + 1] = {
	[RTN_UNSPEC]	= 0,
	[RTN_UNICAST]	= 0,
	[RTN_LOCAL]	= 0,
	[RTN_BROADCAST]	= 0,
	[RTN_ANYCAST]	= 0,
	[RTN_MULTICAST]	= 0,
	[RTN_BLACKHOLE]	= -EINVAL,
	[RTN_UNREACHABLE] = -EHOSTUNREACH,
	[RTN_PROHIBIT]	= -EACCES,
	[RTN_THROW]	= -EAGAIN,
	[RTN_NAT]	= -EINVAL,
	[RTN_XRESOLVE]	= -EINVAL,
};

static int ip6_rt_type_to_error(u8 fib6_type)
{
	return fib6_prop[fib6_type];
}

static unsigned short fib6_info_dst_flags(struct fib6_info *rt)
{
	unsigned short flags = 0;

	if (rt->dst_nocount)
		flags |= DST_NOCOUNT;
	if (rt->dst_nopolicy)
		flags |= DST_NOPOLICY;
	if (rt->dst_host)
		flags |= DST_HOST;

	return flags;
}

static void ip6_rt_init_dst_reject(struct rt6_info *rt, struct fib6_info *ort)
{
	rt->dst.error = ip6_rt_type_to_error(ort->fib6_type);

	switch (ort->fib6_type) {
	case RTN_BLACKHOLE:
		rt->dst.output = dst_discard_out;
		rt->dst.input = dst_discard;
		break;
	case RTN_PROHIBIT:
		rt->dst.output = ip6_pkt_prohibit_out;
		rt->dst.input = ip6_pkt_prohibit;
		break;
	case RTN_THROW:
	case RTN_UNREACHABLE:
	default:
		rt->dst.output = ip6_pkt_discard_out;
		rt->dst.input = ip6_pkt_discard;
		break;
	}
}
static void ip6_rt_init_dst(struct rt6_info *rt, struct fib6_info *ort)
{
	rt->dst.flags |= fib6_info_dst_flags(ort);

	if (ort->fib6_flags & RTF_REJECT) {
		ip6_rt_init_dst_reject(rt, ort);
		return;
	}

	rt->dst.output = ip6_output;

	if (ort->fib6_type == RTN_LOCAL) {
		rt->dst.input = ip6_input;
	} else if (ipv6_addr_type(&ort->fib6_dst.addr) & IPV6_ADDR_MULTICAST) {
		rt->dst.input = ip6_mc_input;
	} else {
		rt->dst.input = ip6_forward;
	}

	if (ort->fib6_nh.nh_lwtstate) {
		rt->dst.lwtstate = lwtstate_get(ort->fib6_nh.nh_lwtstate);
		lwtunnel_set_redirect(&rt->dst);
	}

	rt->dst.lastuse = jiffies;
}

static void rt6_set_from(struct rt6_info *rt, struct fib6_info *from)
{
	rt->rt6i_flags &= ~RTF_EXPIRES;
	fib6_info_hold(from);
	rcu_assign_pointer(rt->from, from);
	dst_init_metrics(&rt->dst, from->fib6_metrics->metrics, true);
	if (from->fib6_metrics != &dst_default_metrics) {
		rt->dst._metrics |= DST_METRICS_REFCOUNTED;
		refcount_inc(&from->fib6_metrics->refcnt);
	}
}

static void ip6_rt_copy_init(struct rt6_info *rt, struct fib6_info *ort)
{
	struct net_device *dev = fib6_info_nh_dev(ort);

	ip6_rt_init_dst(rt, ort);

	rt->rt6i_dst = ort->fib6_dst;
	rt->rt6i_idev = dev ? in6_dev_get(dev) : NULL;
	rt->rt6i_gateway = ort->fib6_nh.nh_gw;
	rt->rt6i_flags = ort->fib6_flags;
	rt6_set_from(rt, ort);
#ifdef CONFIG_IPV6_SUBTREES
	rt->rt6i_src = ort->fib6_src;
#endif
	rt->rt6i_prefsrc = ort->fib6_prefsrc;
	rt->dst.lwtstate = lwtstate_get(ort->fib6_nh.nh_lwtstate);
}

static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
					struct in6_addr *saddr)
{
	struct fib6_node *pn, *sn;

	while (1) {
		if (fn->fn_flags & RTN_TL_ROOT)
			return NULL;
		pn = rcu_dereference(fn->parent);
		sn = FIB6_SUBTREE(pn);
		if (sn && sn != fn)
			fn = fib6_node_lookup(sn, NULL, saddr);
		else
			fn = pn;
		if (fn->fn_flags & RTN_RTINFO)
			return fn;
	}
}

static bool ip6_hold_safe(struct net *net, struct rt6_info **prt,
			  bool null_fallback)
{
	struct rt6_info *rt = *prt;

	if (dst_hold_safe(&rt->dst))
		return true;
	if (null_fallback) {
		rt = net->ipv6.ip6_null_entry;
		dst_hold(&rt->dst);
	} else {
		rt = NULL;
	}
	*prt = rt;
	return false;
}

/* called with rcu_lock held */
static struct rt6_info *ip6_create_rt_rcu(struct fib6_info *rt)
{
	unsigned short flags = fib6_info_dst_flags(rt);
	struct net_device *dev = rt->fib6_nh.nh_dev;
	struct rt6_info *nrt;

	nrt = ip6_dst_alloc(dev_net(dev), dev, flags);
	if (nrt)
		ip6_rt_copy_init(nrt, rt);

	return nrt;
}
static struct rt6_info *ip6_pol_route_lookup(struct net *net,
					     struct fib6_table *table,
					     struct flowi6 *fl6,
					     const struct sk_buff *skb,
					     int flags)
{
	struct fib6_info *f6i;
	struct fib6_node *fn;
	struct rt6_info *rt;

	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
		flags &= ~RT6_LOOKUP_F_IFACE;

	rcu_read_lock();
	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
restart:
	f6i = rcu_dereference(fn->leaf);
	if (!f6i) {
		f6i = net->ipv6.fib6_null_entry;
	} else {
		f6i = rt6_device_match(net, f6i, &fl6->saddr,
				       fl6->flowi6_oif, flags);
		if (f6i->fib6_nsiblings && fl6->flowi6_oif == 0)
			f6i = fib6_multipath_select(net, f6i, fl6,
						    fl6->flowi6_oif, skb,
						    flags);
	}
	if (f6i == net->ipv6.fib6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto restart;
	}

	trace_fib6_table_lookup(net, f6i, table, fl6);

	/* Search through exception table */
	rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr);
	if (rt) {
		if (ip6_hold_safe(net, &rt, true))
			dst_use_noref(&rt->dst, jiffies);
	} else if (f6i == net->ipv6.fib6_null_entry) {
		rt = net->ipv6.ip6_null_entry;
		dst_hold(&rt->dst);
	} else {
		rt = ip6_create_rt_rcu(f6i);
		if (!rt) {
			rt = net->ipv6.ip6_null_entry;
			dst_hold(&rt->dst);
		}
	}

	rcu_read_unlock();

	return rt;
}
struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
				   const struct sk_buff *skb, int flags)
{
	return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_lookup);
}
EXPORT_SYMBOL_GPL(ip6_route_lookup);

struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
			    const struct in6_addr *saddr, int oif,
			    const struct sk_buff *skb, int strict)
{
	struct flowi6 fl6 = {
		.flowi6_oif = oif,
		.daddr = *daddr,
	};
	struct dst_entry *dst;
	int flags = strict ? RT6_LOOKUP_F_IFACE : 0;

	if (saddr) {
		memcpy(&fl6.saddr, saddr, sizeof(*saddr));
		flags |= RT6_LOOKUP_F_HAS_SADDR;
	}

	dst = fib6_rule_lookup(net, &fl6, skb, flags, ip6_pol_route_lookup);
	if (dst->error == 0)
		return (struct rt6_info *) dst;

	dst_release(dst);

	return NULL;
}
EXPORT_SYMBOL(rt6_lookup);
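/* Illustrative sketch (not part of the kernel source): a typical rt6_lookup()
 * caller.  NULL means no usable route; a non-NULL result carries a dst
 * reference that must be dropped with ip6_rt_put().
 */
static bool example_has_route(struct net *net, const struct in6_addr *daddr)
{
	struct rt6_info *rt;

	rt = rt6_lookup(net, daddr, NULL, 0, NULL, 0);
	if (!rt)
		return false;
	ip6_rt_put(rt);
	return true;
}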
/* ip6_ins_rt is called with FREE table->tb6_lock.
 * It takes a new route entry; if the addition fails for any reason, the
 * route is released.
 * Caller must hold dst before calling it.
 */

static int __ip6_ins_rt(struct fib6_info *rt, struct nl_info *info,
			struct netlink_ext_ack *extack)
{
	int err;
	struct fib6_table *table;

	table = rt->fib6_table;
	spin_lock_bh(&table->tb6_lock);
	err = fib6_add(&table->tb6_root, rt, info, extack);
	spin_unlock_bh(&table->tb6_lock);

	return err;
}

int ip6_ins_rt(struct net *net, struct fib6_info *rt)
{
	struct nl_info info = { .nl_net = net, };

	return __ip6_ins_rt(rt, &info, NULL);
}
static struct rt6_info *ip6_rt_cache_alloc(struct fib6_info *ort,
					   const struct in6_addr *daddr,
					   const struct in6_addr *saddr)
{
	struct net_device *dev;
	struct rt6_info *rt;

	rcu_read_lock();
	dev = ip6_rt_get_dev_rcu(ort);
	rt = ip6_dst_alloc(dev_net(dev), dev, 0);
	rcu_read_unlock();
	if (!rt)
		return NULL;

	ip6_rt_copy_init(rt, ort);
	rt->rt6i_flags |= RTF_CACHE;
	rt->dst.flags |= DST_HOST;
	rt->rt6i_dst.addr = *daddr;
	rt->rt6i_dst.plen = 128;

	if (!rt6_is_gw_or_nonexthop(ort)) {
		if (ort->fib6_dst.plen != 128 &&
		    ipv6_addr_equal(&ort->fib6_dst.addr, daddr))
			rt->rt6i_flags |= RTF_ANYCAST;
#ifdef CONFIG_IPV6_SUBTREES
		if (rt->rt6i_src.plen && saddr) {
			rt->rt6i_src.addr = *saddr;
			rt->rt6i_src.plen = 128;
		}
#endif
	}

	return rt;
}

static struct rt6_info *ip6_rt_pcpu_alloc(struct fib6_info *rt)
{
	unsigned short flags = fib6_info_dst_flags(rt);
	struct net_device *dev;
	struct rt6_info *pcpu_rt;

	rcu_read_lock();
	dev = ip6_rt_get_dev_rcu(rt);
	pcpu_rt = ip6_dst_alloc(dev_net(dev), dev, flags);
	rcu_read_unlock();
	if (!pcpu_rt)
		return NULL;
	ip6_rt_copy_init(pcpu_rt, rt);
	pcpu_rt->rt6i_flags |= RTF_PCPU;
	return pcpu_rt;
}

/* It should be called with rcu_read_lock() acquired */
static struct rt6_info *rt6_get_pcpu_route(struct fib6_info *rt)
{
	struct rt6_info *pcpu_rt, **p;

	p = this_cpu_ptr(rt->rt6i_pcpu);
	pcpu_rt = *p;

	if (pcpu_rt)
		ip6_hold_safe(NULL, &pcpu_rt, false);

	return pcpu_rt;
}

static struct rt6_info *rt6_make_pcpu_route(struct net *net,
					    struct fib6_info *rt)
{
	struct rt6_info *pcpu_rt, *prev, **p;

	pcpu_rt = ip6_rt_pcpu_alloc(rt);
	if (!pcpu_rt) {
		dst_hold(&net->ipv6.ip6_null_entry->dst);
		return net->ipv6.ip6_null_entry;
	}

	dst_hold(&pcpu_rt->dst);
	p = this_cpu_ptr(rt->rt6i_pcpu);
	prev = cmpxchg(p, NULL, pcpu_rt);
	BUG_ON(prev);

	return pcpu_rt;
}
/* exception hash table implementation
 */
static DEFINE_SPINLOCK(rt6_exception_lock);

/* Remove rt6_ex from hash table and free the memory
 * Caller must hold rt6_exception_lock
 */
static void rt6_remove_exception(struct rt6_exception_bucket *bucket,
				 struct rt6_exception *rt6_ex)
{
	struct net *net;

	if (!bucket || !rt6_ex)
		return;

	net = dev_net(rt6_ex->rt6i->dst.dev);
	hlist_del_rcu(&rt6_ex->hlist);
	dst_release(&rt6_ex->rt6i->dst);
	kfree_rcu(rt6_ex, rcu);
	WARN_ON_ONCE(!bucket->depth);
	bucket->depth--;
	net->ipv6.rt6_stats->fib_rt_cache--;
}

/* Remove oldest rt6_ex in bucket and free the memory
 * Caller must hold rt6_exception_lock
 */
static void rt6_exception_remove_oldest(struct rt6_exception_bucket *bucket)
{
	struct rt6_exception *rt6_ex, *oldest = NULL;

	if (!bucket)
		return;

	hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
		if (!oldest || time_before(rt6_ex->stamp, oldest->stamp))
			oldest = rt6_ex;
	}
	rt6_remove_exception(bucket, oldest);
}

static u32 rt6_exception_hash(const struct in6_addr *dst,
			      const struct in6_addr *src)
{
	static u32 seed __read_mostly;
	u32 val;

	net_get_random_once(&seed, sizeof(seed));
	val = jhash(dst, sizeof(*dst), seed);

#ifdef CONFIG_IPV6_SUBTREES
	if (src)
		val = jhash(src, sizeof(*src), val);
#endif
	return hash_32(val, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT);
}
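/* Illustrative sketch (not part of the kernel source): how the hash above
 * indexes the per-route exception table.  FIB6_EXCEPTION_BUCKET_SIZE is
 * 1 << FIB6_EXCEPTION_BUCKET_SIZE_SHIFT, so hash_32() already folds the
 * jhash value into a valid bucket slot, exactly as the
 * __rt6_find_exception_* helpers below do with '*bucket += hval'.
 */
static struct rt6_exception_bucket *
example_exception_bucket(struct rt6_exception_bucket *base,
			 const struct in6_addr *daddr,
			 const struct in6_addr *saddr)
{
	return base + rt6_exception_hash(daddr, saddr);
}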
/* Helper function to find the cached rt in the hash table
 * and update bucket pointer to point to the bucket for this
 * (daddr, saddr) pair
 * Caller must hold rt6_exception_lock
 */
static struct rt6_exception *
__rt6_find_exception_spinlock(struct rt6_exception_bucket **bucket,
			      const struct in6_addr *daddr,
			      const struct in6_addr *saddr)
{
	struct rt6_exception *rt6_ex;
	u32 hval;

	if (!(*bucket) || !daddr)
		return NULL;

	hval = rt6_exception_hash(daddr, saddr);
	*bucket += hval;

	hlist_for_each_entry(rt6_ex, &(*bucket)->chain, hlist) {
		struct rt6_info *rt6 = rt6_ex->rt6i;
		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);

#ifdef CONFIG_IPV6_SUBTREES
		if (matched && saddr)
			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
#endif
		if (matched)
			return rt6_ex;
	}
	return NULL;
}

/* Helper function to find the cached rt in the hash table
 * and update bucket pointer to point to the bucket for this
 * (daddr, saddr) pair
 * Caller must hold rcu_read_lock()
 */
static struct rt6_exception *
__rt6_find_exception_rcu(struct rt6_exception_bucket **bucket,
			 const struct in6_addr *daddr,
			 const struct in6_addr *saddr)
{
	struct rt6_exception *rt6_ex;
	u32 hval;

	WARN_ON_ONCE(!rcu_read_lock_held());

	if (!(*bucket) || !daddr)
		return NULL;

	hval = rt6_exception_hash(daddr, saddr);
	*bucket += hval;

	hlist_for_each_entry_rcu(rt6_ex, &(*bucket)->chain, hlist) {
		struct rt6_info *rt6 = rt6_ex->rt6i;
		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);

#ifdef CONFIG_IPV6_SUBTREES
		if (matched && saddr)
			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
#endif
		if (matched)
			return rt6_ex;
	}
	return NULL;
}

static unsigned int fib6_mtu(const struct fib6_info *rt)
{
	unsigned int mtu;

	if (rt->fib6_pmtu) {
		mtu = rt->fib6_pmtu;
	} else {
		struct net_device *dev = fib6_info_nh_dev(rt);
		struct inet6_dev *idev;

		rcu_read_lock();
		idev = __in6_dev_get(dev);
		mtu = idev->cnf.mtu6;
		rcu_read_unlock();
	}

	mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);

	return mtu - lwtunnel_headroom(rt->fib6_nh.nh_lwtstate, mtu);
}
static int rt6_insert_exception(struct rt6_info *nrt,
				struct fib6_info *ort)
{
	struct net *net = dev_net(nrt->dst.dev);
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	int err = 0;

	spin_lock_bh(&rt6_exception_lock);

	if (ort->exception_bucket_flushed) {
		err = -EINVAL;
		goto out;
	}

	bucket = rcu_dereference_protected(ort->rt6i_exception_bucket,
					lockdep_is_held(&rt6_exception_lock));
	if (!bucket) {
		bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket),
				 GFP_ATOMIC);
		if (!bucket) {
			err = -ENOMEM;
			goto out;
		}
		rcu_assign_pointer(ort->rt6i_exception_bucket, bucket);
	}

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates ort is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (ort->fib6_src.plen)
		src_key = &nrt->rt6i_src.addr;
#endif

	/* Update rt6i_prefsrc as it could be changed
	 * in rt6_remove_prefsrc()
	 */
	nrt->rt6i_prefsrc = ort->fib6_prefsrc;
	/* rt6_mtu_change() might lower mtu on ort.
	 * Only insert this exception route if its mtu
	 * is less than ort's mtu value.
	 */
	if (dst_metric_raw(&nrt->dst, RTAX_MTU) >= fib6_mtu(ort)) {
		err = -EINVAL;
		goto out;
	}

	rt6_ex = __rt6_find_exception_spinlock(&bucket, &nrt->rt6i_dst.addr,
					       src_key);
	if (rt6_ex)
		rt6_remove_exception(bucket, rt6_ex);

	rt6_ex = kzalloc(sizeof(*rt6_ex), GFP_ATOMIC);
	if (!rt6_ex) {
		err = -ENOMEM;
		goto out;
	}
	rt6_ex->rt6i = nrt;
	rt6_ex->stamp = jiffies;
	hlist_add_head_rcu(&rt6_ex->hlist, &bucket->chain);
	bucket->depth++;
	net->ipv6.rt6_stats->fib_rt_cache++;

	if (bucket->depth > FIB6_MAX_DEPTH)
		rt6_exception_remove_oldest(bucket);

out:
	spin_unlock_bh(&rt6_exception_lock);

	/* Update fn->fn_sernum to invalidate all cached dst */
	if (!err) {
		spin_lock_bh(&ort->fib6_table->tb6_lock);
		fib6_update_sernum(net, ort);
		spin_unlock_bh(&ort->fib6_table->tb6_lock);
		fib6_force_start_gc(net);
	}

	return err;
}
void rt6_flush_exceptions(struct fib6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	spin_lock_bh(&rt6_exception_lock);
	/* Prevent rt6_insert_exception() from recreating the bucket list */
	rt->exception_bucket_flushed = 1;

	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
					   lockdep_is_held(&rt6_exception_lock));
	if (!bucket)
		goto out;

	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
		hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist)
			rt6_remove_exception(bucket, rt6_ex);
		WARN_ON_ONCE(bucket->depth);
		bucket++;
	}

out:
	spin_unlock_bh(&rt6_exception_lock);
}

/* Find cached rt in the hash table inside passed in rt
 * Caller has to hold rcu_read_lock()
 */
static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt,
					   struct in6_addr *daddr,
					   struct in6_addr *saddr)
{
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	struct rt6_info *res = NULL;

	bucket = rcu_dereference(rt->rt6i_exception_bucket);

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates rt is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (rt->fib6_src.plen)
		src_key = saddr;
#endif
	rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);

	if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
		res = rt6_ex->rt6i;

	return res;
}
/* Remove the passed in cached rt from the hash table that contains it */
static int rt6_remove_exception_rt(struct rt6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	struct fib6_info *from;
	int err;

	from = rcu_dereference(rt->from);
	if (!from ||
	    !(rt->rt6i_flags & RTF_CACHE))
		return -EINVAL;

	if (!rcu_access_pointer(from->rt6i_exception_bucket))
		return -ENOENT;

	spin_lock_bh(&rt6_exception_lock);
	bucket = rcu_dereference_protected(from->rt6i_exception_bucket,
					   lockdep_is_held(&rt6_exception_lock));
#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates 'from' is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (from->fib6_src.plen)
		src_key = &rt->rt6i_src.addr;
#endif
	rt6_ex = __rt6_find_exception_spinlock(&bucket,
					       &rt->rt6i_dst.addr,
					       src_key);
	if (rt6_ex) {
		rt6_remove_exception(bucket, rt6_ex);
		err = 0;
	} else {
		err = -ENOENT;
	}

	spin_unlock_bh(&rt6_exception_lock);
	return err;
}

/* Find rt6_ex which contains the passed in rt cache and
 * refresh its stamp
 */
static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct fib6_info *from = rt->from;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;

	if (!from ||
	    !(rt->rt6i_flags & RTF_CACHE))
		return;

	rcu_read_lock();
	bucket = rcu_dereference(from->rt6i_exception_bucket);

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates 'from' is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (from->fib6_src.plen)
		src_key = &rt->rt6i_src.addr;
#endif
	rt6_ex = __rt6_find_exception_rcu(&bucket,
					  &rt->rt6i_dst.addr,
					  src_key);
	if (rt6_ex)
		rt6_ex->stamp = jiffies;

	rcu_read_unlock();
}
static void rt6_exceptions_remove_prefsrc(struct fib6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	int i;

	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
					   lockdep_is_held(&rt6_exception_lock));

	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
				rt6_ex->rt6i->rt6i_prefsrc.plen = 0;
			}
			bucket++;
		}
	}
}

static bool rt6_mtu_change_route_allowed(struct inet6_dev *idev,
					 struct rt6_info *rt, int mtu)
{
	/* If the new MTU is lower than the route PMTU, this new MTU will be the
	 * lowest MTU in the path: always allow updating the route PMTU to
	 * reflect PMTU decreases.
	 *
	 * If the new MTU is higher, and the route PMTU is equal to the local
	 * MTU, this means the old MTU is the lowest in the path, so allow
	 * updating it: if other nodes now have lower MTUs, PMTU discovery will
	 * handle this.
	 */

	if (dst_mtu(&rt->dst) >= mtu)
		return true;

	if (dst_mtu(&rt->dst) == idev->cnf.mtu6)
		return true;

	return false;
}
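/* Worked example for the rules above (values illustrative): with a route
 * PMTU of 1500 and idev->cnf.mtu6 of 1500, lowering the link MTU to 1400 is
 * allowed (1500 >= 1400: the new MTU becomes the path floor), and raising it
 * to 9000 is also allowed (route PMTU == local MTU, so the old local MTU was
 * the floor and rediscovery can take over).  With a route PMTU of 1280 and
 * mtu6 of 1500, a change to 1400 is rejected: another hop, not this link,
 * is the bottleneck.
 */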
static void rt6_exceptions_update_pmtu(struct inet6_dev *idev,
				       struct fib6_info *rt, int mtu)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	int i;

	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
					   lockdep_is_held(&rt6_exception_lock));
	if (!bucket)
		return;

	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
		hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
			struct rt6_info *entry = rt6_ex->rt6i;

			/* For RTF_CACHE with rt6i_pmtu == 0 (i.e. a redirected
			 * route), the metrics of its rt->from have already
			 * been updated.
			 */
			if (dst_metric_raw(&entry->dst, RTAX_MTU) &&
			    rt6_mtu_change_route_allowed(idev, entry, mtu))
				dst_metric_set(&entry->dst, RTAX_MTU, mtu);
		}
		bucket++;
	}
}

#define RTF_CACHE_GATEWAY	(RTF_GATEWAY | RTF_CACHE)

static void rt6_exceptions_clean_tohost(struct fib6_info *rt,
					struct in6_addr *gateway)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	if (!rcu_access_pointer(rt->rt6i_exception_bucket))
		return;

	spin_lock_bh(&rt6_exception_lock);
	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
					   lockdep_is_held(&rt6_exception_lock));
	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			hlist_for_each_entry_safe(rt6_ex, tmp,
						  &bucket->chain, hlist) {
				struct rt6_info *entry = rt6_ex->rt6i;

				if ((entry->rt6i_flags & RTF_CACHE_GATEWAY) ==
				    RTF_CACHE_GATEWAY &&
				    ipv6_addr_equal(gateway,
						    &entry->rt6i_gateway)) {
					rt6_remove_exception(bucket, rt6_ex);
				}
			}
			bucket++;
		}
	}

	spin_unlock_bh(&rt6_exception_lock);
}
static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket,
				      struct rt6_exception *rt6_ex,
				      struct fib6_gc_args *gc_args,
				      unsigned long now)
{
	struct rt6_info *rt = rt6_ex->rt6i;

	/* we are pruning and obsoleting aged-out and non gateway exceptions
	 * even if others still hold references to them, so that on next
	 * dst_check() such references can be dropped.
	 * EXPIRES exceptions - e.g. pmtu-generated ones are pruned when
	 * expired, independently from their aging, as per RFC 8201 section 4
	 */
	if (!(rt->rt6i_flags & RTF_EXPIRES)) {
		if (time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) {
			RT6_TRACE("aging clone %p\n", rt);
			rt6_remove_exception(bucket, rt6_ex);
			return;
		}
	} else if (time_after(jiffies, rt->dst.expires)) {
		RT6_TRACE("purging expired route %p\n", rt);
		rt6_remove_exception(bucket, rt6_ex);
		return;
	}

	if (rt->rt6i_flags & RTF_GATEWAY) {
		struct neighbour *neigh;
		__u8 neigh_flags = 0;

		neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
		if (neigh)
			neigh_flags = neigh->flags;

		if (!(neigh_flags & NTF_ROUTER)) {
			RT6_TRACE("purging route %p via non-router but gateway\n",
				  rt);
			rt6_remove_exception(bucket, rt6_ex);
			return;
		}
	}

	gc_args->more++;
}

void rt6_age_exceptions(struct fib6_info *rt,
			struct fib6_gc_args *gc_args,
			unsigned long now)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	if (!rcu_access_pointer(rt->rt6i_exception_bucket))
		return;

	rcu_read_lock_bh();
	spin_lock(&rt6_exception_lock);
	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
					   lockdep_is_held(&rt6_exception_lock));

	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			hlist_for_each_entry_safe(rt6_ex, tmp,
						  &bucket->chain, hlist) {
				rt6_age_examine_exception(bucket, rt6_ex,
							  gc_args, now);
			}
			bucket++;
		}
	}
	spin_unlock(&rt6_exception_lock);
	rcu_read_unlock_bh();
}
/* must be called with rcu lock held */
struct fib6_info *fib6_table_lookup(struct net *net, struct fib6_table *table,
				    int oif, struct flowi6 *fl6, int strict)
{
	struct fib6_node *fn, *saved_fn;
	struct fib6_info *f6i;

	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
	saved_fn = fn;

	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
		oif = 0;

redo_rt6_select:
	f6i = rt6_select(net, fn, oif, strict);
	if (f6i == net->ipv6.fib6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto redo_rt6_select;
		else if (strict & RT6_LOOKUP_F_REACHABLE) {
			/* also consider unreachable route */
			strict &= ~RT6_LOOKUP_F_REACHABLE;
			fn = saved_fn;
			goto redo_rt6_select;
		}
	}

	trace_fib6_table_lookup(net, f6i, table, fl6);

	return f6i;
}
struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
			       int oif, struct flowi6 *fl6,
			       const struct sk_buff *skb, int flags)
{
	struct fib6_info *f6i;
	struct rt6_info *rt;
	int strict = 0;

	strict |= flags & RT6_LOOKUP_F_IFACE;
	strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
	if (net->ipv6.devconf_all->forwarding == 0)
		strict |= RT6_LOOKUP_F_REACHABLE;

	rcu_read_lock();

	f6i = fib6_table_lookup(net, table, oif, fl6, strict);
	if (f6i->fib6_nsiblings)
		f6i = fib6_multipath_select(net, f6i, fl6, oif, skb, strict);

	if (f6i == net->ipv6.fib6_null_entry) {
		rt = net->ipv6.ip6_null_entry;
		rcu_read_unlock();
		dst_hold(&rt->dst);
		return rt;
	}

	/* Search through exception table */
	rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr);
	if (rt) {
		if (ip6_hold_safe(net, &rt, true))
			dst_use_noref(&rt->dst, jiffies);

		rcu_read_unlock();
		return rt;
	} else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
			    !(f6i->fib6_flags & RTF_GATEWAY))) {
		/* Create a RTF_CACHE clone which will not be
		 * owned by the fib6 tree.  It is for the special case where
		 * the daddr in the skb during the neighbor look-up is different
		 * from the fl6->daddr used to look-up route here.
		 */
		struct rt6_info *uncached_rt;

		uncached_rt = ip6_rt_cache_alloc(f6i, &fl6->daddr, NULL);

		rcu_read_unlock();

		if (uncached_rt) {
			/* Uncached_rt's refcnt is taken during ip6_rt_cache_alloc()
			 * No need for another dst_hold()
			 */
			rt6_uncached_list_add(uncached_rt);
			atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
		} else {
			uncached_rt = net->ipv6.ip6_null_entry;
			dst_hold(&uncached_rt->dst);
		}

		return uncached_rt;
	} else {
		/* Get a percpu copy */
		struct rt6_info *pcpu_rt;

		local_bh_disable();
		pcpu_rt = rt6_get_pcpu_route(f6i);

		if (!pcpu_rt)
			pcpu_rt = rt6_make_pcpu_route(net, f6i);

		local_bh_enable();
		rcu_read_unlock();

		return pcpu_rt;
	}
}
EXPORT_SYMBOL_GPL(ip6_pol_route);
static struct rt6_info *ip6_pol_route_input(struct net *net,
					    struct fib6_table *table,
					    struct flowi6 *fl6,
					    const struct sk_buff *skb,
					    int flags)
{
	return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, skb, flags);
}

struct dst_entry *ip6_route_input_lookup(struct net *net,
					 struct net_device *dev,
					 struct flowi6 *fl6,
					 const struct sk_buff *skb,
					 int flags)
{
	if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
		flags |= RT6_LOOKUP_F_IFACE;

	return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_input);
}
EXPORT_SYMBOL_GPL(ip6_route_input_lookup);

static void ip6_multipath_l3_keys(const struct sk_buff *skb,
				  struct flow_keys *keys,
				  struct flow_keys *flkeys)
{
	const struct ipv6hdr *outer_iph = ipv6_hdr(skb);
	const struct ipv6hdr *key_iph = outer_iph;
	struct flow_keys *_flkeys = flkeys;
	const struct ipv6hdr *inner_iph;
	const struct icmp6hdr *icmph;
	struct ipv6hdr _inner_iph;
	struct icmp6hdr _icmph;

	if (likely(outer_iph->nexthdr != IPPROTO_ICMPV6))
		goto out;

	icmph = skb_header_pointer(skb, skb_transport_offset(skb),
				   sizeof(_icmph), &_icmph);
	if (!icmph)
		goto out;

	if (icmph->icmp6_type != ICMPV6_DEST_UNREACH &&
	    icmph->icmp6_type != ICMPV6_PKT_TOOBIG &&
	    icmph->icmp6_type != ICMPV6_TIME_EXCEED &&
	    icmph->icmp6_type != ICMPV6_PARAMPROB)
		goto out;

	inner_iph = skb_header_pointer(skb,
				       skb_transport_offset(skb) + sizeof(*icmph),
				       sizeof(_inner_iph), &_inner_iph);
	if (!inner_iph)
		goto out;

	key_iph = inner_iph;
	_flkeys = NULL;
out:
	if (_flkeys) {
		keys->addrs.v6addrs.src = _flkeys->addrs.v6addrs.src;
		keys->addrs.v6addrs.dst = _flkeys->addrs.v6addrs.dst;
		keys->tags.flow_label = _flkeys->tags.flow_label;
		keys->basic.ip_proto = _flkeys->basic.ip_proto;
	} else {
		keys->addrs.v6addrs.src = key_iph->saddr;
		keys->addrs.v6addrs.dst = key_iph->daddr;
		keys->tags.flow_label = ip6_flowlabel(key_iph);
		keys->basic.ip_proto = key_iph->nexthdr;
	}
}
/* if skb is set it will be used and fl6 can be NULL */
u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6,
		       const struct sk_buff *skb, struct flow_keys *flkeys)
{
	struct flow_keys hash_keys;
	u32 mhash;

	switch (ip6_multipath_hash_policy(net)) {
	case 0:
		memset(&hash_keys, 0, sizeof(hash_keys));
		hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
		if (skb) {
			ip6_multipath_l3_keys(skb, &hash_keys, flkeys);
		} else {
			hash_keys.addrs.v6addrs.src = fl6->saddr;
			hash_keys.addrs.v6addrs.dst = fl6->daddr;
			hash_keys.tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6);
			hash_keys.basic.ip_proto = fl6->flowi6_proto;
		}
		break;
	case 1:
		if (skb) {
			unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
			struct flow_keys keys;

			/* short-circuit if we already have L4 hash present */
			if (skb->l4_hash)
				return skb_get_hash_raw(skb) >> 1;

			memset(&hash_keys, 0, sizeof(hash_keys));

			if (!flkeys) {
				skb_flow_dissect_flow_keys(skb, &keys, flag);
				flkeys = &keys;
			}
			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
			hash_keys.addrs.v6addrs.src = flkeys->addrs.v6addrs.src;
			hash_keys.addrs.v6addrs.dst = flkeys->addrs.v6addrs.dst;
			hash_keys.ports.src = flkeys->ports.src;
			hash_keys.ports.dst = flkeys->ports.dst;
			hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
		} else {
			memset(&hash_keys, 0, sizeof(hash_keys));
			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
			hash_keys.addrs.v6addrs.src = fl6->saddr;
			hash_keys.addrs.v6addrs.dst = fl6->daddr;
			hash_keys.ports.src = fl6->fl6_sport;
			hash_keys.ports.dst = fl6->fl6_dport;
			hash_keys.basic.ip_proto = fl6->flowi6_proto;
		}
		break;
	}
	mhash = flow_hash_from_keys(&hash_keys);

	return mhash >> 1;
}
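/* Illustrative sketch (not part of the kernel source): the L3 policy above
 * hashes only (saddr, daddr, flow label, next header), the L4 policy adds
 * the transport ports, and the result is folded to 31 bits so it can never
 * exceed a sibling's upper bound.  A toy stand-in for flow_hash_from_keys():
 */
static u32 example_l3_hash(const struct in6_addr *saddr,
			   const struct in6_addr *daddr,
			   u32 flowlabel, u8 proto)
{
	u32 h = flowlabel ^ proto;

	h ^= saddr->s6_addr32[0] ^ saddr->s6_addr32[3];
	h ^= daddr->s6_addr32[0] ^ daddr->s6_addr32[3];
	return h >> 1;			/* keep the hash in 31 bits */
}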
void ip6_route_input(struct sk_buff *skb)
{
	const struct ipv6hdr *iph = ipv6_hdr(skb);
	struct net *net = dev_net(skb->dev);
	int flags = RT6_LOOKUP_F_HAS_SADDR;
	struct ip_tunnel_info *tun_info;
	struct flowi6 fl6 = {
		.flowi6_iif = skb->dev->ifindex,
		.daddr = iph->daddr,
		.saddr = iph->saddr,
		.flowlabel = ip6_flowinfo(iph),
		.flowi6_mark = skb->mark,
		.flowi6_proto = iph->nexthdr,
	};
	struct flow_keys *flkeys = NULL, _flkeys;

	tun_info = skb_tunnel_info(skb);
	if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
		fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;

	if (fib6_rules_early_flow_dissect(net, skb, &fl6, &_flkeys))
		flkeys = &_flkeys;

	if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6))
		fl6.mp_hash = rt6_multipath_hash(net, &fl6, skb, flkeys);

	skb_dst_drop(skb);
	skb_dst_set(skb,
		    ip6_route_input_lookup(net, skb->dev, &fl6, skb, flags));
}
static struct rt6_info *ip6_pol_route_output(struct net *net,
					     struct fib6_table *table,
					     struct flowi6 *fl6,
					     const struct sk_buff *skb,
					     int flags)
{
	return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, skb, flags);
}

struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
					 struct flowi6 *fl6, int flags)
{
	bool any_src;

	if (rt6_need_strict(&fl6->daddr)) {
		struct dst_entry *dst;

		dst = l3mdev_link_scope_lookup(net, fl6);
		if (dst)
			return dst;
	}

	fl6->flowi6_iif = LOOPBACK_IFINDEX;

	any_src = ipv6_addr_any(&fl6->saddr);
	if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) ||
	    (fl6->flowi6_oif && any_src))
		flags |= RT6_LOOKUP_F_IFACE;

	if (!any_src)
		flags |= RT6_LOOKUP_F_HAS_SADDR;
	else if (sk)
		flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);

	return fib6_rule_lookup(net, fl6, NULL, flags, ip6_pol_route_output);
}
EXPORT_SYMBOL_GPL(ip6_route_output_flags);
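/* Illustrative sketch (not part of the kernel source): the usual caller
 * pattern for ip6_route_output_flags() via the ip6_route_output() wrapper --
 * build a flowi6, look up the dst, check ->error, release the reference.
 */
static int example_output_lookup(struct net *net, const struct in6_addr *daddr)
{
	struct flowi6 fl6 = {};
	struct dst_entry *dst;
	int err;

	fl6.daddr = *daddr;
	dst = ip6_route_output(net, NULL, &fl6);
	err = dst->error;
	dst_release(dst);
	return err;
}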
struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
{
	struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
	struct net_device *loopback_dev = net->loopback_dev;
	struct dst_entry *new = NULL;

	rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev, 1,
		       DST_OBSOLETE_DEAD, 0);
	if (rt) {
		rt6_info_init(rt);
		atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);

		new = &rt->dst;
		new->input = dst_discard;
		new->output = dst_discard_out;

		dst_copy_metrics(new, &ort->dst);

		rt->rt6i_idev = in6_dev_get(loopback_dev);
		rt->rt6i_gateway = ort->rt6i_gateway;
		rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU;

		memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
#ifdef CONFIG_IPV6_SUBTREES
		memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
#endif
	}

	dst_release(dst_orig);
	return new ? new : ERR_PTR(-ENOMEM);
}
/*
 *	Destination cache support functions
 */

static bool fib6_check(struct fib6_info *f6i, u32 cookie)
{
	u32 rt_cookie = 0;

	if (!fib6_get_cookie_safe(f6i, &rt_cookie) || rt_cookie != cookie)
		return false;

	if (fib6_check_expired(f6i))
		return false;

	return true;
}

static struct dst_entry *rt6_check(struct rt6_info *rt,
				   struct fib6_info *from,
				   u32 cookie)
{
	u32 rt_cookie = 0;

	if ((from && !fib6_get_cookie_safe(from, &rt_cookie)) ||
	    rt_cookie != cookie)
		return NULL;

	if (rt6_check_expired(rt))
		return NULL;

	return &rt->dst;
}

static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt,
					    struct fib6_info *from,
					    u32 cookie)
{
	if (!__rt6_check_expired(rt) &&
	    rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
	    fib6_check(from, cookie))
		return &rt->dst;
	else
		return NULL;
}

static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
{
	struct dst_entry *dst_ret;
	struct fib6_info *from;
	struct rt6_info *rt;

	rt = container_of(dst, struct rt6_info, dst);

	rcu_read_lock();

	/* All IPV6 dsts are created with ->obsolete set to the value
	 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
	 * into this function always.
	 */

	from = rcu_dereference(rt->from);

	if (from && (rt->rt6i_flags & RTF_PCPU ||
	    unlikely(!list_empty(&rt->rt6i_uncached))))
		dst_ret = rt6_dst_from_check(rt, from, cookie);
	else
		dst_ret = rt6_check(rt, from, cookie);

	rcu_read_unlock();

	return dst_ret;
}
static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
{
	struct rt6_info *rt = (struct rt6_info *) dst;

	if (rt) {
		if (rt->rt6i_flags & RTF_CACHE) {
			rcu_read_lock();
			if (rt6_check_expired(rt)) {
				rt6_remove_exception_rt(rt);
				dst = NULL;
			}
			rcu_read_unlock();
		} else {
			dst_release(dst);
			dst = NULL;
		}
	}
	return dst;
}

static void ip6_link_failure(struct sk_buff *skb)
{
	struct rt6_info *rt;

	icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);

	rt = (struct rt6_info *) skb_dst(skb);
	if (rt) {
		rcu_read_lock();
		if (rt->rt6i_flags & RTF_CACHE) {
			if (dst_hold_safe(&rt->dst))
				rt6_remove_exception_rt(rt);
		} else {
			struct fib6_info *from;
			struct fib6_node *fn;

			from = rcu_dereference(rt->from);
			if (from) {
				fn = rcu_dereference(from->fib6_node);
				if (fn && (rt->rt6i_flags & RTF_DEFAULT))
					fn->fn_sernum = -1;
			}
		}
		rcu_read_unlock();
	}
}

static void rt6_update_expires(struct rt6_info *rt0, int timeout)
{
	if (!(rt0->rt6i_flags & RTF_EXPIRES)) {
		struct fib6_info *from;

		rcu_read_lock();
		from = rcu_dereference(rt0->from);
		if (from)
			rt0->dst.expires = from->expires;
		rcu_read_unlock();
	}

	dst_set_expires(&rt0->dst, timeout);
	rt0->rt6i_flags |= RTF_EXPIRES;
}
static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
{
	struct net *net = dev_net(rt->dst.dev);

	dst_metric_set(&rt->dst, RTAX_MTU, mtu);
	rt->rt6i_flags |= RTF_MODIFIED;
	rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
}

static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt)
{
	bool from_set;

	rcu_read_lock();
	from_set = !!rcu_dereference(rt->from);
	rcu_read_unlock();

	return !(rt->rt6i_flags & RTF_CACHE) &&
		(rt->rt6i_flags & RTF_PCPU || from_set);
}

static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
				 const struct ipv6hdr *iph, u32 mtu)
{
	const struct in6_addr *daddr, *saddr;
	struct rt6_info *rt6 = (struct rt6_info *)dst;

	if (dst_metric_locked(dst, RTAX_MTU))
		return;

	if (iph) {
		daddr = &iph->daddr;
		saddr = &iph->saddr;
	} else if (sk) {
		daddr = &sk->sk_v6_daddr;
		saddr = &inet6_sk(sk)->saddr;
	} else {
		daddr = NULL;
		saddr = NULL;
	}
	dst_confirm_neigh(dst, daddr);
	mtu = max_t(u32, mtu, IPV6_MIN_MTU);
	if (mtu >= dst_mtu(dst))
		return;

	if (!rt6_cache_allowed_for_pmtu(rt6)) {
		rt6_do_update_pmtu(rt6, mtu);
		/* update rt6_ex->stamp for cache */
		if (rt6->rt6i_flags & RTF_CACHE)
			rt6_update_exception_stamp_rt(rt6);
	} else if (daddr) {
		struct fib6_info *from;
		struct rt6_info *nrt6;

		rcu_read_lock();
		from = rcu_dereference(rt6->from);
		nrt6 = ip6_rt_cache_alloc(from, daddr, saddr);
		if (nrt6) {
			rt6_do_update_pmtu(nrt6, mtu);
			if (rt6_insert_exception(nrt6, from))
				dst_release_immediate(&nrt6->dst);
		}
		rcu_read_unlock();
	}
}
static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
			       struct sk_buff *skb, u32 mtu)
{
	__ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu);
}

void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
		     int oif, u32 mark, kuid_t uid)
{
	const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
	struct dst_entry *dst;
	struct flowi6 fl6;

	memset(&fl6, 0, sizeof(fl6));
	fl6.flowi6_oif = oif;
	fl6.flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark);
	fl6.daddr = iph->daddr;
	fl6.saddr = iph->saddr;
	fl6.flowlabel = ip6_flowinfo(iph);
	fl6.flowi6_uid = uid;

	dst = ip6_route_output(net, NULL, &fl6);
	if (!dst->error)
		__ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu));
	dst_release(dst);
}
EXPORT_SYMBOL_GPL(ip6_update_pmtu);

void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
{
	struct dst_entry *dst;

	ip6_update_pmtu(skb, sock_net(sk), mtu,
			sk->sk_bound_dev_if, sk->sk_mark, sk->sk_uid);

	dst = __sk_dst_get(sk);
	if (!dst || !dst->obsolete ||
	    dst->ops->check(dst, inet6_sk(sk)->dst_cookie))
		return;

	bh_lock_sock(sk);
	if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr))
		ip6_datagram_dst_update(sk, false);
	bh_unlock_sock(sk);
}
EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);

void ip6_sk_dst_store_flow(struct sock *sk, struct dst_entry *dst,
			   const struct flowi6 *fl6)
{
#ifdef CONFIG_IPV6_SUBTREES
	struct ipv6_pinfo *np = inet6_sk(sk);
#endif

	ip6_dst_store(sk, dst,
		      ipv6_addr_equal(&fl6->daddr, &sk->sk_v6_daddr) ?
		      &sk->sk_v6_daddr : NULL,
#ifdef CONFIG_IPV6_SUBTREES
		      ipv6_addr_equal(&fl6->saddr, &np->saddr) ?
		      &np->saddr :
#endif
		      NULL);
}

/* Handle redirects */
struct ip6rd_flowi {
	struct flowi6 fl6;
	struct in6_addr gateway;
};

static struct rt6_info *__ip6_route_redirect(struct net *net,
					     struct fib6_table *table,
					     struct flowi6 *fl6,
					     const struct sk_buff *skb,
					     int flags)
{
	struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
	struct rt6_info *ret = NULL, *rt_cache;
	struct fib6_info *rt;
	struct fib6_node *fn;

	/* Get the "current" route for this destination and
	 * check if the redirect has come from the appropriate router.
	 *
	 * RFC 4861 specifies that redirects should only be
	 * accepted if they come from the nexthop to the target.
	 * Due to the way the routes are chosen, this notion
	 * is a bit fuzzy and one might need to check all possible
	 * routes.
	 */

	rcu_read_lock();
	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
restart:
	for_each_fib6_node_rt_rcu(fn) {
		if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
			continue;
		if (fib6_check_expired(rt))
			continue;
		if (rt->fib6_flags & RTF_REJECT)
			break;
		if (!(rt->fib6_flags & RTF_GATEWAY))
			continue;
		if (fl6->flowi6_oif != rt->fib6_nh.nh_dev->ifindex)
			continue;
		/* rt_cache's gateway might be different from its 'parent'
		 * in the case of an ip redirect.
		 * So we keep searching in the exception table if the gateway
		 * is different.
		 */
		if (!ipv6_addr_equal(&rdfl->gateway, &rt->fib6_nh.nh_gw)) {
			rt_cache = rt6_find_cached_rt(rt,
						      &fl6->daddr,
						      &fl6->saddr);
			if (rt_cache &&
			    ipv6_addr_equal(&rdfl->gateway,
					    &rt_cache->rt6i_gateway)) {
				ret = rt_cache;
				break;
			}
			continue;
		}
		break;
	}

	if (!rt)
		rt = net->ipv6.fib6_null_entry;
	else if (rt->fib6_flags & RTF_REJECT) {
		ret = net->ipv6.ip6_null_entry;
		goto out;
	}

	if (rt == net->ipv6.fib6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto restart;
	}

out:
	if (ret)
		dst_hold(&ret->dst);
	else
		ret = ip6_create_rt_rcu(rt);

	rcu_read_unlock();

	trace_fib6_table_lookup(net, rt, table, fl6);
	return ret;
}
2499 static struct dst_entry *ip6_route_redirect(struct net *net,
2500 const struct flowi6 *fl6,
2501 const struct sk_buff *skb,
2502 const struct in6_addr *gateway)
2504 int flags = RT6_LOOKUP_F_HAS_SADDR;
2505 struct ip6rd_flowi rdfl;
2508 rdfl.gateway = *gateway;
2510 return fib6_rule_lookup(net, &rdfl.fl6, skb,
2511 flags, __ip6_route_redirect);
2514 void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark,
2517 const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
2518 struct dst_entry *dst;
2521 memset(&fl6, 0, sizeof(fl6));
2522 fl6.flowi6_iif = LOOPBACK_IFINDEX;
2523 fl6.flowi6_oif = oif;
2524 fl6.flowi6_mark = mark;
2525 fl6.daddr = iph->daddr;
2526 fl6.saddr = iph->saddr;
2527 fl6.flowlabel = ip6_flowinfo(iph);
2528 fl6.flowi6_uid = uid;
2530 dst = ip6_route_redirect(net, &fl6, skb, &ipv6_hdr(skb)->saddr);
2531 rt6_do_redirect(dst, NULL, skb);
2534 EXPORT_SYMBOL_GPL(ip6_redirect);
void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif,
			    u32 mark)
{
	const struct ipv6hdr *iph = ipv6_hdr(skb);
	const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb);
	struct dst_entry *dst;
	struct flowi6 fl6;

	memset(&fl6, 0, sizeof(fl6));
	fl6.flowi6_iif = LOOPBACK_IFINDEX;
	fl6.flowi6_oif = oif;
	fl6.flowi6_mark = mark;
	fl6.daddr = msg->dest;
	fl6.saddr = iph->daddr;
	fl6.flowi6_uid = sock_net_uid(net, NULL);

	dst = ip6_route_redirect(net, &fl6, skb, &iph->saddr);
	rt6_do_redirect(dst, NULL, skb);
	dst_release(dst);
}
void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
{
	ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark,
		     sk->sk_uid);
}
EXPORT_SYMBOL_GPL(ip6_sk_redirect);
static unsigned int ip6_default_advmss(const struct dst_entry *dst)
{
	struct net_device *dev = dst->dev;
	unsigned int mtu = dst_mtu(dst);
	struct net *net = dev_net(dev);

	mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);

	if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
		mtu = net->ipv6.sysctl.ip6_rt_min_advmss;

	/*
	 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
	 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
	 * IPV6_MAXPLEN is also valid and means: "any MSS,
	 * rely only on pmtu discovery"
	 */
	if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
		mtu = IPV6_MAXPLEN;
	return mtu;
}
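/* Worked example for ip6_default_advmss() above: with a standard
 * Ethernet MTU of 1500, the advertised MSS is
 * 1500 - 40 (IPv6 header) - 20 (TCP header) = 1440 bytes, assuming
 * the ip6_rt_min_advmss sysctl does not raise it.
 */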
static unsigned int ip6_mtu(const struct dst_entry *dst)
{
	struct inet6_dev *idev;
	unsigned int mtu;

	mtu = dst_metric_raw(dst, RTAX_MTU);
	if (mtu)
		goto out;

	mtu = IPV6_MIN_MTU;

	rcu_read_lock();
	idev = __in6_dev_get(dst->dev);
	if (idev)
		mtu = idev->cnf.mtu6;
	rcu_read_unlock();

out:
	mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);

	return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
}
/* MTU selection:
 * 1. mtu on route is locked - use it
 * 2. mtu from nexthop exception
 * 3. mtu from egress device
 *
 * based on ip6_dst_mtu_forward and exception logic of
 * rt6_find_cached_rt; called with rcu_read_lock
 */
u32 ip6_mtu_from_fib6(struct fib6_info *f6i, struct in6_addr *daddr,
		      struct in6_addr *saddr)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct in6_addr *src_key;
	struct inet6_dev *idev;
	u32 mtu = 0;

	if (unlikely(fib6_metric_locked(f6i, RTAX_MTU))) {
		mtu = f6i->fib6_pmtu;
		if (mtu)
			goto out;
	}

	src_key = NULL;
#ifdef CONFIG_IPV6_SUBTREES
	if (f6i->fib6_src.plen)
		src_key = saddr;
#endif

	bucket = rcu_dereference(f6i->rt6i_exception_bucket);
	rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);
	if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
		mtu = dst_metric_raw(&rt6_ex->rt6i->dst, RTAX_MTU);

	if (likely(!mtu)) {
		struct net_device *dev = fib6_info_nh_dev(f6i);

		mtu = IPV6_MIN_MTU;
		idev = __in6_dev_get(dev);
		if (idev && idev->cnf.mtu6 > mtu)
			mtu = idev->cnf.mtu6;
	}

	mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
out:
	return mtu - lwtunnel_headroom(fib6_info_nh_lwt(f6i), mtu);
}
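/* Illustration of the selection order in ip6_mtu_from_fib6(): a route
 * with a locked MTU of 1400 always yields 1400; otherwise a matching
 * nexthop exception (e.g. a 1280-byte PMTU learned via ICMPv6 Packet
 * Too Big) takes precedence over the egress device's mtu6.
 */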
struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
				  struct flowi6 *fl6)
{
	struct dst_entry *dst;
	struct rt6_info *rt;
	struct inet6_dev *idev = in6_dev_get(dev);
	struct net *net = dev_net(dev);

	if (unlikely(!idev))
		return ERR_PTR(-ENODEV);

	rt = ip6_dst_alloc(net, dev, 0);
	if (unlikely(!rt)) {
		in6_dev_put(idev);
		dst = ERR_PTR(-ENOMEM);
		goto out;
	}

	rt->dst.flags |= DST_HOST;
	rt->dst.input = ip6_input;
	rt->dst.output = ip6_output;
	rt->rt6i_gateway = fl6->daddr;
	rt->rt6i_dst.addr = fl6->daddr;
	rt->rt6i_dst.plen = 128;
	rt->rt6i_idev = idev;
	dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);

	/* Add this dst into uncached_list so that rt6_disable_ip() can
	 * do proper release of the net_device
	 */
	rt6_uncached_list_add(rt);
	atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);

	dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);

out:
	return dst;
}
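/* The dst returned above is deliberately uncached and not tied to any
 * FIB entry; ndisc uses it to transmit Neighbour Discovery packets on
 * a given device regardless of routing table state.
 */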
static int ip6_dst_gc(struct dst_ops *ops)
{
	struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
	int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
	int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
	int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
	int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
	unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
	int entries;

	entries = dst_entries_get_fast(ops);
	if (time_after(rt_last_gc + rt_min_interval, jiffies) &&
	    entries <= rt_max_size)
		goto out;

	net->ipv6.ip6_rt_gc_expire++;
	fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true);
	entries = dst_entries_get_slow(ops);
	if (entries < ops->gc_thresh)
		net->ipv6.ip6_rt_gc_expire = rt_gc_timeout >> 1;
out:
	net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire >> rt_elasticity;
	return entries > rt_max_size;
}
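/* GC pressure feedback: ip6_rt_gc_expire grows by one each time GC is
 * forced, making fib6_run_gc() progressively more aggressive; it is
 * reset to half of ip6_rt_gc_timeout once entries fall below
 * gc_thresh, and decays by a 2^-elasticity fraction on every call.
 */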
static int ip6_convert_metrics(struct net *net, struct fib6_info *rt,
			       struct fib6_config *cfg)
{
	struct dst_metrics *p;

	if (!cfg->fc_mx)
		return 0;

	p = kzalloc(sizeof(*rt->fib6_metrics), GFP_KERNEL);
	if (unlikely(!p))
		return -ENOMEM;

	refcount_set(&p->refcnt, 1);
	rt->fib6_metrics = p;

	return ip_metrics_convert(net, cfg->fc_mx, cfg->fc_mx_len, p->metrics);
}
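/* When no RTA_METRICS attribute was supplied (fc_mx is NULL) the route
 * keeps the shared, read-only default metrics installed at allocation
 * time; a private dst_metrics block is only allocated for routes that
 * actually carry user-supplied metrics.
 */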
static struct rt6_info *ip6_nh_lookup_table(struct net *net,
					    struct fib6_config *cfg,
					    const struct in6_addr *gw_addr,
					    u32 tbid, int flags)
{
	struct flowi6 fl6 = {
		.flowi6_oif = cfg->fc_ifindex,
		.daddr = *gw_addr,
		.saddr = cfg->fc_prefsrc,
	};
	struct fib6_table *table;
	struct rt6_info *rt;

	table = fib6_get_table(net, tbid);
	if (!table)
		return NULL;

	if (!ipv6_addr_any(&cfg->fc_prefsrc))
		flags |= RT6_LOOKUP_F_HAS_SADDR;

	flags |= RT6_LOOKUP_F_IGNORE_LINKSTATE;
	rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, NULL, flags);

	/* if table lookup failed, fall back to full lookup */
	if (rt == net->ipv6.ip6_null_entry) {
		ip6_rt_put(rt);
		rt = NULL;
	}

	return rt;
}
static int ip6_route_check_nh_onlink(struct net *net,
				     struct fib6_config *cfg,
				     const struct net_device *dev,
				     struct netlink_ext_ack *extack)
{
	u32 tbid = l3mdev_fib_table(dev) ? : RT_TABLE_MAIN;
	const struct in6_addr *gw_addr = &cfg->fc_gateway;
	u32 flags = RTF_LOCAL | RTF_ANYCAST | RTF_REJECT;
	struct rt6_info *grt;
	int err;

	err = 0;
	grt = ip6_nh_lookup_table(net, cfg, gw_addr, tbid, 0);
	if (grt) {
		if (!grt->dst.error &&
		    (grt->rt6i_flags & flags || dev != grt->dst.dev)) {
			NL_SET_ERR_MSG(extack,
				       "Nexthop has invalid gateway or device mismatch");
			err = -EINVAL;
		}

		ip6_rt_put(grt);
	}

	return err;
}
static int ip6_route_check_nh(struct net *net,
			      struct fib6_config *cfg,
			      struct net_device **_dev,
			      struct inet6_dev **idev)
{
	const struct in6_addr *gw_addr = &cfg->fc_gateway;
	struct net_device *dev = _dev ? *_dev : NULL;
	struct rt6_info *grt = NULL;
	int err = -EHOSTUNREACH;

	if (cfg->fc_table) {
		int flags = RT6_LOOKUP_F_IFACE;

		grt = ip6_nh_lookup_table(net, cfg, gw_addr,
					  cfg->fc_table, flags);
		if (grt) {
			if (grt->rt6i_flags & RTF_GATEWAY ||
			    (dev && dev != grt->dst.dev)) {
				ip6_rt_put(grt);
				grt = NULL;
			}
		}
	}

	if (!grt)
		grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, NULL, 1);

	if (!grt)
		goto out;

	if (dev) {
		if (dev != grt->dst.dev) {
			ip6_rt_put(grt);
			goto out;
		}
	} else {
		*_dev = dev = grt->dst.dev;
		*idev = grt->rt6i_idev;
		dev_hold(dev);
		in6_dev_hold(grt->rt6i_idev);
	}

	if (!(grt->rt6i_flags & RTF_GATEWAY))
		err = 0;

	ip6_rt_put(grt);

out:
	return err;
}
static int ip6_validate_gw(struct net *net, struct fib6_config *cfg,
			   struct net_device **_dev, struct inet6_dev **idev,
			   struct netlink_ext_ack *extack)
{
	const struct in6_addr *gw_addr = &cfg->fc_gateway;
	int gwa_type = ipv6_addr_type(gw_addr);
	bool skip_dev = gwa_type & IPV6_ADDR_LINKLOCAL ? false : true;
	const struct net_device *dev = *_dev;
	bool need_addr_check = !dev;
	int err = -EINVAL;

	/* if gw_addr is local we will fail to detect this in case
	 * address is still TENTATIVE (DAD in progress). rt6_lookup()
	 * will return already-added prefix route via interface that
	 * prefix route was assigned to, which might be non-loopback.
	 */
	if (dev &&
	    ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
		NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
		goto out;
	}

	if (gwa_type != (IPV6_ADDR_LINKLOCAL | IPV6_ADDR_UNICAST)) {
		/* IPv6 strictly inhibits using non-link-local
		 * addresses as nexthop address.
		 * Otherwise, a router will not be able to send redirects.
		 * It is very good, but in some (rare!) circumstances
		 * (SIT, PtP, NBMA NOARP links) it is handy to allow
		 * some exceptions. --ANK
		 * We allow IPv4-mapped nexthops to support RFC4798-type
		 * addressing
		 */
		if (!(gwa_type & (IPV6_ADDR_UNICAST | IPV6_ADDR_MAPPED))) {
			NL_SET_ERR_MSG(extack, "Invalid gateway address");
			goto out;
		}

		if (cfg->fc_flags & RTNH_F_ONLINK)
			err = ip6_route_check_nh_onlink(net, cfg, dev, extack);
		else
			err = ip6_route_check_nh(net, cfg, _dev, idev);

		if (err)
			goto out;
	}

	/* reload in case device was changed */
	dev = *_dev;

	err = -EINVAL;
	if (!dev) {
		NL_SET_ERR_MSG(extack, "Egress device not specified");
		goto out;
	} else if (dev->flags & IFF_LOOPBACK) {
		NL_SET_ERR_MSG(extack,
			       "Egress device can not be loopback device for this route");
		goto out;
	}

	/* if we did not check gw_addr above, do so now that the
	 * egress device has been resolved.
	 */
	if (need_addr_check &&
	    ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
		NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
		goto out;
	}

	err = 0;
out:
	return err;
}
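/* For illustration: the RTNH_F_ONLINK path above corresponds to
 * userspace requests such as
 *	ip -6 route add 2001:db8::/64 via 2001:db8:1::1 dev eth0 onlink
 * where the gateway is trusted to be on-link without a covering
 * connected route; without "onlink" the gateway must instead resolve
 * through the FIB via ip6_route_check_nh().
 */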
static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg,
					       gfp_t gfp_flags,
					       struct netlink_ext_ack *extack)
{
	struct net *net = cfg->fc_nlinfo.nl_net;
	struct fib6_info *rt = NULL;
	struct net_device *dev = NULL;
	struct inet6_dev *idev = NULL;
	struct fib6_table *table;
	int addr_type;
	int err = -EINVAL;

	/* RTF_PCPU is an internal flag; can not be set by userspace */
	if (cfg->fc_flags & RTF_PCPU) {
		NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU");
		goto out;
	}

	/* RTF_CACHE is an internal flag; can not be set by userspace */
	if (cfg->fc_flags & RTF_CACHE) {
		NL_SET_ERR_MSG(extack, "Userspace can not set RTF_CACHE");
		goto out;
	}

	if (cfg->fc_type > RTN_MAX) {
		NL_SET_ERR_MSG(extack, "Invalid route type");
		goto out;
	}

	if (cfg->fc_dst_len > 128) {
		NL_SET_ERR_MSG(extack, "Invalid prefix length");
		goto out;
	}
	if (cfg->fc_src_len > 128) {
		NL_SET_ERR_MSG(extack, "Invalid source address length");
		goto out;
	}
#ifndef CONFIG_IPV6_SUBTREES
	if (cfg->fc_src_len) {
		NL_SET_ERR_MSG(extack,
			       "Specifying source address requires IPV6_SUBTREES to be enabled");
		goto out;
	}
#endif
	if (cfg->fc_ifindex) {
		err = -ENODEV;
		dev = dev_get_by_index(net, cfg->fc_ifindex);
		if (!dev)
			goto out;
		idev = in6_dev_get(dev);
		if (!idev)
			goto out;
	}

	if (cfg->fc_metric == 0)
		cfg->fc_metric = IP6_RT_PRIO_USER;

	if (cfg->fc_flags & RTNH_F_ONLINK) {
		if (!dev) {
			NL_SET_ERR_MSG(extack,
				       "Nexthop device required for onlink");
			err = -ENODEV;
			goto out;
		}

		if (!(dev->flags & IFF_UP)) {
			NL_SET_ERR_MSG(extack, "Nexthop device is not up");
			err = -ENETDOWN;
			goto out;
		}
	}

	err = -ENOBUFS;
	if (cfg->fc_nlinfo.nlh &&
	    !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
		table = fib6_get_table(net, cfg->fc_table);
		if (!table) {
			pr_warn("NLM_F_CREATE should be specified when creating new route\n");
			table = fib6_new_table(net, cfg->fc_table);
		}
	} else {
		table = fib6_new_table(net, cfg->fc_table);
	}

	if (!table)
		goto out;

	err = -ENOMEM;
	rt = fib6_info_alloc(gfp_flags);
	if (!rt)
		goto out;
	if (cfg->fc_flags & RTF_ADDRCONF)
		rt->dst_nocount = true;

	err = ip6_convert_metrics(net, rt, cfg);
	if (err < 0)
		goto out;

	if (cfg->fc_flags & RTF_EXPIRES)
		fib6_set_expires(rt, jiffies +
				clock_t_to_jiffies(cfg->fc_expires));
	else
		fib6_clean_expires(rt);

	if (cfg->fc_protocol == RTPROT_UNSPEC)
		cfg->fc_protocol = RTPROT_BOOT;
	rt->fib6_protocol = cfg->fc_protocol;

	addr_type = ipv6_addr_type(&cfg->fc_dst);

	if (cfg->fc_encap) {
		struct lwtunnel_state *lwtstate;

		err = lwtunnel_build_state(cfg->fc_encap_type,
					   cfg->fc_encap, AF_INET6, cfg,
					   &lwtstate, extack);
		if (err)
			goto out;
		rt->fib6_nh.nh_lwtstate = lwtstate_get(lwtstate);
	}

	ipv6_addr_prefix(&rt->fib6_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
	rt->fib6_dst.plen = cfg->fc_dst_len;
	if (rt->fib6_dst.plen == 128)
		rt->dst_host = true;

#ifdef CONFIG_IPV6_SUBTREES
	ipv6_addr_prefix(&rt->fib6_src.addr, &cfg->fc_src, cfg->fc_src_len);
	rt->fib6_src.plen = cfg->fc_src_len;
#endif

	rt->fib6_metric = cfg->fc_metric;
	rt->fib6_nh.nh_weight = 1;

	rt->fib6_type = cfg->fc_type;

	/* We cannot add true routes via loopback here, they would
	 * result in kernel looping; promote them to reject routes
	 */
	if ((cfg->fc_flags & RTF_REJECT) ||
	    (dev && (dev->flags & IFF_LOOPBACK) &&
	     !(addr_type & IPV6_ADDR_LOOPBACK) &&
	     !(cfg->fc_flags & RTF_LOCAL))) {
		/* hold loopback dev/idev if we haven't done so. */
		if (dev != net->loopback_dev) {
			if (dev) {
				dev_put(dev);
				in6_dev_put(idev);
			}
			dev = net->loopback_dev;
			dev_hold(dev);
			idev = in6_dev_get(dev);
			if (!idev) {
				err = -ENODEV;
				goto out;
			}
		}
		rt->fib6_flags = RTF_REJECT|RTF_NONEXTHOP;
		goto install_route;
	}
	if (cfg->fc_flags & RTF_GATEWAY) {
		err = ip6_validate_gw(net, cfg, &dev, &idev, extack);
		if (err)
			goto out;

		rt->fib6_nh.nh_gw = cfg->fc_gateway;
	}

	err = -ENODEV;
	if (!dev)
		goto out;

	if (idev->cnf.disable_ipv6) {
		NL_SET_ERR_MSG(extack, "IPv6 is disabled on nexthop device");
		err = -EACCES;
		goto out;
	}

	if (!(dev->flags & IFF_UP)) {
		NL_SET_ERR_MSG(extack, "Nexthop device is not up");
		err = -ENETDOWN;
		goto out;
	}

	if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
		if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
			NL_SET_ERR_MSG(extack, "Invalid source address");
			err = -EINVAL;
			goto out;
		}
		rt->fib6_prefsrc.addr = cfg->fc_prefsrc;
		rt->fib6_prefsrc.plen = 128;
	} else
		rt->fib6_prefsrc.plen = 0;

	rt->fib6_flags = cfg->fc_flags;

install_route:
	if (!(rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) &&
	    !netif_carrier_ok(dev))
		rt->fib6_nh.nh_flags |= RTNH_F_LINKDOWN;
	rt->fib6_nh.nh_flags |= (cfg->fc_flags & RTNH_F_ONLINK);
	rt->fib6_nh.nh_dev = dev;
	rt->fib6_table = table;

	cfg->fc_nlinfo.nl_net = dev_net(dev);

	if (idev)
		in6_dev_put(idev);

	return rt;
out:
	if (dev)
		dev_put(dev);
	if (idev)
		in6_dev_put(idev);

	fib6_info_release(rt);
	return ERR_PTR(err);
}
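/* ip6_route_info_create() only builds and validates the fib6_info;
 * inserting it into the FIB is left to callers such as ip6_route_add()
 * below, which lets multipath code create every hop before linking
 * any of them.
 */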
int ip6_route_add(struct fib6_config *cfg, gfp_t gfp_flags,
		  struct netlink_ext_ack *extack)
{
	struct fib6_info *rt;
	int err;

	rt = ip6_route_info_create(cfg, gfp_flags, extack);
	if (IS_ERR(rt))
		return PTR_ERR(rt);

	err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, extack);
	fib6_info_release(rt);

	return err;
}
static int __ip6_del_rt(struct fib6_info *rt, struct nl_info *info)
{
	struct net *net = info->nl_net;
	struct fib6_table *table;
	int err;

	if (rt == net->ipv6.fib6_null_entry) {
		err = -ENOENT;
		goto out;
	}

	table = rt->fib6_table;
	spin_lock_bh(&table->tb6_lock);
	err = fib6_del(rt, info);
	spin_unlock_bh(&table->tb6_lock);

out:
	fib6_info_release(rt);
	return err;
}

int ip6_del_rt(struct net *net, struct fib6_info *rt)
{
	struct nl_info info = { .nl_net = net };

	return __ip6_del_rt(rt, &info);
}
static int __ip6_del_rt_siblings(struct fib6_info *rt, struct fib6_config *cfg)
{
	struct nl_info *info = &cfg->fc_nlinfo;
	struct net *net = info->nl_net;
	struct sk_buff *skb = NULL;
	struct fib6_table *table;
	int err = -ENOENT;

	if (rt == net->ipv6.fib6_null_entry)
		goto out_put;
	table = rt->fib6_table;
	spin_lock_bh(&table->tb6_lock);

	if (rt->fib6_nsiblings && cfg->fc_delete_all_nh) {
		struct fib6_info *sibling, *next_sibling;

		/* prefer to send a single notification with all hops */
		skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
		if (skb) {
			u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;

			if (rt6_fill_node(net, skb, rt, NULL,
					  NULL, NULL, 0, RTM_DELROUTE,
					  info->portid, seq, 0) < 0) {
				kfree_skb(skb);
				skb = NULL;
			} else
				info->skip_notify = 1;
		}

		list_for_each_entry_safe(sibling, next_sibling,
					 &rt->fib6_siblings,
					 fib6_siblings) {
			err = fib6_del(sibling, info);
			if (err)
				goto out_unlock;
		}
	}

	err = fib6_del(rt, info);
out_unlock:
	spin_unlock_bh(&table->tb6_lock);
out_put:
	fib6_info_release(rt);

	if (skb) {
		rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
			    info->nlh, gfp_any());
	}
	return err;
}
static int ip6_del_cached_rt(struct rt6_info *rt, struct fib6_config *cfg)
{
	int rc = -ESRCH;

	if (cfg->fc_ifindex && rt->dst.dev->ifindex != cfg->fc_ifindex)
		goto out;

	if (cfg->fc_flags & RTF_GATEWAY &&
	    !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
		goto out;
	if (dst_hold_safe(&rt->dst))
		rc = rt6_remove_exception_rt(rt);
out:
	return rc;
}
static int ip6_route_del(struct fib6_config *cfg,
			 struct netlink_ext_ack *extack)
{
	struct rt6_info *rt_cache;
	struct fib6_table *table;
	struct fib6_info *rt;
	struct fib6_node *fn;
	int err = -ESRCH;

	table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
	if (!table) {
		NL_SET_ERR_MSG(extack, "FIB table does not exist");
		return err;
	}

	rcu_read_lock();

	fn = fib6_locate(&table->tb6_root,
			 &cfg->fc_dst, cfg->fc_dst_len,
			 &cfg->fc_src, cfg->fc_src_len,
			 !(cfg->fc_flags & RTF_CACHE));

	if (fn) {
		for_each_fib6_node_rt_rcu(fn) {
			if (cfg->fc_flags & RTF_CACHE) {
				int rc;

				rt_cache = rt6_find_cached_rt(rt, &cfg->fc_dst,
							      &cfg->fc_src);
				if (rt_cache) {
					rc = ip6_del_cached_rt(rt_cache, cfg);
					if (rc != -ESRCH) {
						rcu_read_unlock();
						return rc;
					}
				}
				continue;
			}
			if (cfg->fc_ifindex &&
			    (!rt->fib6_nh.nh_dev ||
			     rt->fib6_nh.nh_dev->ifindex != cfg->fc_ifindex))
				continue;
			if (cfg->fc_flags & RTF_GATEWAY &&
			    !ipv6_addr_equal(&cfg->fc_gateway, &rt->fib6_nh.nh_gw))
				continue;
			if (cfg->fc_metric && cfg->fc_metric != rt->fib6_metric)
				continue;
			if (cfg->fc_protocol && cfg->fc_protocol != rt->fib6_protocol)
				continue;
			fib6_info_hold(rt);
			rcu_read_unlock();

			/* if gateway was specified only delete the one hop */
			if (cfg->fc_flags & RTF_GATEWAY)
				return __ip6_del_rt(rt, &cfg->fc_nlinfo);

			return __ip6_del_rt_siblings(rt, cfg);
		}
	}
	rcu_read_unlock();

	return err;
}
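/* Deletion granularity: when a gateway is given only the matching hop
 * of a multipath route is removed; otherwise the whole sibling group
 * goes. Roughly, "ip -6 route del <prefix> via <gw>" versus
 * "ip -6 route del <prefix>".
 */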
static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
{
	struct netevent_redirect netevent;
	struct rt6_info *rt, *nrt = NULL;
	struct ndisc_options ndopts;
	struct inet6_dev *in6_dev;
	struct neighbour *neigh;
	struct fib6_info *from;
	struct rd_msg *msg;
	int optlen, on_link;
	u8 *lladdr;

	optlen = skb_tail_pointer(skb) - skb_transport_header(skb);
	optlen -= sizeof(*msg);

	if (optlen < 0) {
		net_dbg_ratelimited("rt6_do_redirect: packet too short\n");
		return;
	}

	msg = (struct rd_msg *)icmp6_hdr(skb);

	if (ipv6_addr_is_multicast(&msg->dest)) {
		net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n");
		return;
	}

	on_link = 0;
	if (ipv6_addr_equal(&msg->dest, &msg->target)) {
		on_link = 1;
	} else if (ipv6_addr_type(&msg->target) !=
		   (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
		net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n");
		return;
	}

	in6_dev = __in6_dev_get(skb->dev);
	if (!in6_dev)
		return;
	if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects)
		return;

	/* RFC2461 8.1:
	 * The IP source address of the Redirect MUST be the same as the current
	 * first-hop router for the specified ICMP Destination Address.
	 */

	if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) {
		net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
		return;
	}

	lladdr = NULL;
	if (ndopts.nd_opts_tgt_lladdr) {
		lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr,
					     skb->dev);
		if (!lladdr) {
			net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n");
			return;
		}
	}

	rt = (struct rt6_info *) dst;
	if (rt->rt6i_flags & RTF_REJECT) {
		net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
		return;
	}

	/* Redirect received -> path was valid.
	 * Look, redirects are sent only in response to data packets,
	 * so that this nexthop apparently is reachable. --ANK
	 */
	dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr);

	neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1);
	if (!neigh)
		return;

	/*
	 * We have finally decided to accept it.
	 */

	ndisc_update(skb->dev, neigh, lladdr, NUD_STALE,
		     NEIGH_UPDATE_F_WEAK_OVERRIDE|
		     NEIGH_UPDATE_F_OVERRIDE|
		     (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
				     NEIGH_UPDATE_F_ISROUTER)),
		     NDISC_REDIRECT, &ndopts);

	rcu_read_lock();
	from = rcu_dereference(rt->from);
	fib6_info_hold(from);
	rcu_read_unlock();

	nrt = ip6_rt_cache_alloc(from, &msg->dest, NULL);
	if (!nrt)
		goto out;

	nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
	if (on_link)
		nrt->rt6i_flags &= ~RTF_GATEWAY;

	nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;

	/* No need to remove rt from the exception table if rt is
	 * a cached route because rt6_insert_exception() will
	 * take care of it
	 */
	if (rt6_insert_exception(nrt, from)) {
		dst_release_immediate(&nrt->dst);
		goto out;
	}

	netevent.old = &rt->dst;
	netevent.new = &nrt->dst;
	netevent.daddr = &msg->dest;
	netevent.neigh = neigh;
	call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);

out:
	fib6_info_release(from);
	neigh_release(neigh);
}
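/* In summary: rt6_do_redirect() validates the redirect per RFC 4861
 * (ND options, link-local unicast target, message from the current
 * first hop), updates the neighbour cache, and records the new first
 * hop as an RTF_CACHE exception route cloned from the matched
 * fib6_info.
 */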
#ifdef CONFIG_IPV6_ROUTE_INFO
static struct fib6_info *rt6_get_route_info(struct net *net,
					    const struct in6_addr *prefix, int prefixlen,
					    const struct in6_addr *gwaddr,
					    struct net_device *dev)
{
	u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
	int ifindex = dev->ifindex;
	struct fib6_node *fn;
	struct fib6_info *rt = NULL;
	struct fib6_table *table;

	table = fib6_get_table(net, tb_id);
	if (!table)
		return NULL;

	rcu_read_lock();
	fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0, true);
	if (!fn)
		goto out;

	for_each_fib6_node_rt_rcu(fn) {
		if (rt->fib6_nh.nh_dev->ifindex != ifindex)
			continue;
		if ((rt->fib6_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
			continue;
		if (!ipv6_addr_equal(&rt->fib6_nh.nh_gw, gwaddr))