Merge branch 'master' of git://git.kernel.org/pub/scm/linux/kernel/git/klassert/ipsec
[muen/linux.git] / net / ipv6 / route.c
1 /*
2  *      Linux INET6 implementation
3  *      FIB front-end.
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>
7  *
8  *      This program is free software; you can redistribute it and/or
9  *      modify it under the terms of the GNU General Public License
10  *      as published by the Free Software Foundation; either version
11  *      2 of the License, or (at your option) any later version.
12  */
13
14 /*      Changes:
15  *
16  *      YOSHIFUJI Hideaki @USAGI
17  *              reworked default router selection.
18  *              - respect outgoing interface
19  *              - select from (probably) reachable routers (i.e.
20  *              routers in REACHABLE, STALE, DELAY or PROBE states).
21  *              - always select the same router if it is (probably)
22  *              reachable.  otherwise, round-robin the list.
23  *      Ville Nuorvala
24  *              Fixed routing subtrees.
25  */
26
27 #define pr_fmt(fmt) "IPv6: " fmt
28
29 #include <linux/capability.h>
30 #include <linux/errno.h>
31 #include <linux/export.h>
32 #include <linux/types.h>
33 #include <linux/times.h>
34 #include <linux/socket.h>
35 #include <linux/sockios.h>
36 #include <linux/net.h>
37 #include <linux/route.h>
38 #include <linux/netdevice.h>
39 #include <linux/in6.h>
40 #include <linux/mroute6.h>
41 #include <linux/init.h>
42 #include <linux/if_arp.h>
43 #include <linux/proc_fs.h>
44 #include <linux/seq_file.h>
45 #include <linux/nsproxy.h>
46 #include <linux/slab.h>
47 #include <linux/jhash.h>
48 #include <net/net_namespace.h>
49 #include <net/snmp.h>
50 #include <net/ipv6.h>
51 #include <net/ip6_fib.h>
52 #include <net/ip6_route.h>
53 #include <net/ndisc.h>
54 #include <net/addrconf.h>
55 #include <net/tcp.h>
56 #include <linux/rtnetlink.h>
57 #include <net/dst.h>
58 #include <net/dst_metadata.h>
59 #include <net/xfrm.h>
60 #include <net/netevent.h>
61 #include <net/netlink.h>
62 #include <net/nexthop.h>
63 #include <net/lwtunnel.h>
64 #include <net/ip_tunnels.h>
65 #include <net/l3mdev.h>
66 #include <trace/events/fib6.h>
67
68 #include <linux/uaccess.h>
69
70 #ifdef CONFIG_SYSCTL
71 #include <linux/sysctl.h>
72 #endif
73
/* Result of a neighbour reachability check (rt6_check_neigh()).
 * Negative values are failures of increasing severity; rt6_score_route()
 * and find_match() compare against them to decide whether a route is
 * usable, needs a probe, or should trigger round-robin selection.
 */
enum rt6_nud_state {
	RT6_NUD_FAIL_HARD = -3,		/* route must not be selected */
	RT6_NUD_FAIL_PROBE = -2,	/* neighbour failed; probe it */
	RT6_NUD_FAIL_DO_RR = -1,	/* no entry; round-robin to next */
	RT6_NUD_SUCCEED = 1		/* neighbour (probably) reachable */
};
80
81 static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort);
82 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
83 static unsigned int      ip6_default_advmss(const struct dst_entry *dst);
84 static unsigned int      ip6_mtu(const struct dst_entry *dst);
85 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
86 static void             ip6_dst_destroy(struct dst_entry *);
87 static void             ip6_dst_ifdown(struct dst_entry *,
88                                        struct net_device *dev, int how);
89 static int               ip6_dst_gc(struct dst_ops *ops);
90
91 static int              ip6_pkt_discard(struct sk_buff *skb);
92 static int              ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
93 static int              ip6_pkt_prohibit(struct sk_buff *skb);
94 static int              ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb);
95 static void             ip6_link_failure(struct sk_buff *skb);
96 static void             ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
97                                            struct sk_buff *skb, u32 mtu);
98 static void             rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
99                                         struct sk_buff *skb);
100 static void             rt6_dst_from_metrics_check(struct rt6_info *rt);
101 static int rt6_score_route(struct rt6_info *rt, int oif, int strict);
102 static size_t rt6_nlmsg_size(struct rt6_info *rt);
103 static int rt6_fill_node(struct net *net,
104                          struct sk_buff *skb, struct rt6_info *rt,
105                          struct in6_addr *dst, struct in6_addr *src,
106                          int iif, int type, u32 portid, u32 seq,
107                          unsigned int flags);
108 static struct rt6_info *rt6_find_cached_rt(struct rt6_info *rt,
109                                            struct in6_addr *daddr,
110                                            struct in6_addr *saddr);
111
112 #ifdef CONFIG_IPV6_ROUTE_INFO
113 static struct rt6_info *rt6_add_route_info(struct net *net,
114                                            const struct in6_addr *prefix, int prefixlen,
115                                            const struct in6_addr *gwaddr,
116                                            struct net_device *dev,
117                                            unsigned int pref);
118 static struct rt6_info *rt6_get_route_info(struct net *net,
119                                            const struct in6_addr *prefix, int prefixlen,
120                                            const struct in6_addr *gwaddr,
121                                            struct net_device *dev);
122 #endif
123
/* Per-cpu anchor for rt6_info entries that are not in the FIB tree
 * (see rt6_uncached_list_add/del below); @lock protects @head.
 */
struct uncached_list {
	spinlock_t		lock;
	struct list_head	head;
};
128
129 static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);
130
131 void rt6_uncached_list_add(struct rt6_info *rt)
132 {
133         struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);
134
135         rt->rt6i_uncached_list = ul;
136
137         spin_lock_bh(&ul->lock);
138         list_add_tail(&rt->rt6i_uncached, &ul->head);
139         spin_unlock_bh(&ul->lock);
140 }
141
142 void rt6_uncached_list_del(struct rt6_info *rt)
143 {
144         if (!list_empty(&rt->rt6i_uncached)) {
145                 struct uncached_list *ul = rt->rt6i_uncached_list;
146                 struct net *net = dev_net(rt->dst.dev);
147
148                 spin_lock_bh(&ul->lock);
149                 list_del(&rt->rt6i_uncached);
150                 atomic_dec(&net->ipv6.rt6_stats->fib_rt_uncache);
151                 spin_unlock_bh(&ul->lock);
152         }
153 }
154
/* Device teardown: re-point every uncached route that still references
 * @dev at the namespace loopback device, so @dev's refcounts can drop
 * to zero.  Walks each CPU's list under that list's lock.
 */
static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
{
	struct net_device *loopback_dev = net->loopback_dev;
	int cpu;

	/* Loopback itself never goes away this way; nothing to migrate. */
	if (dev == loopback_dev)
		return;

	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
		struct rt6_info *rt;

		spin_lock_bh(&ul->lock);
		list_for_each_entry(rt, &ul->head, rt6i_uncached) {
			struct inet6_dev *rt_idev = rt->rt6i_idev;
			struct net_device *rt_dev = rt->dst.dev;

			/* Swap the inet6_dev reference over to loopback. */
			if (rt_idev->dev == dev) {
				rt->rt6i_idev = in6_dev_get(loopback_dev);
				in6_dev_put(rt_idev);
			}

			/* Likewise for the dst's device reference: hold the
			 * new device before releasing the old one.
			 */
			if (rt_dev == dev) {
				rt->dst.dev = loopback_dev;
				dev_hold(rt->dst.dev);
				dev_put(rt_dev);
			}
		}
		spin_unlock_bh(&ul->lock);
	}
}
186
/* Per-cpu clones (RTF_PCPU) do not own metrics; write through the
 * parent ("from") route so all clones stay consistent.
 */
static u32 *rt6_pcpu_cow_metrics(struct rt6_info *rt)
{
	return dst_metrics_write_ptr(&rt->from->dst);
}
191
192 static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old)
193 {
194         struct rt6_info *rt = (struct rt6_info *)dst;
195
196         if (rt->rt6i_flags & RTF_PCPU)
197                 return rt6_pcpu_cow_metrics(rt);
198         else if (rt->rt6i_flags & RTF_CACHE)
199                 return NULL;
200         else
201                 return dst_cow_metrics_generic(dst, old);
202 }
203
204 static inline const void *choose_neigh_daddr(struct rt6_info *rt,
205                                              struct sk_buff *skb,
206                                              const void *daddr)
207 {
208         struct in6_addr *p = &rt->rt6i_gateway;
209
210         if (!ipv6_addr_any(p))
211                 return (const void *) p;
212         else if (skb)
213                 return &ipv6_hdr(skb)->daddr;
214         return daddr;
215 }
216
217 static struct neighbour *ip6_neigh_lookup(const struct dst_entry *dst,
218                                           struct sk_buff *skb,
219                                           const void *daddr)
220 {
221         struct rt6_info *rt = (struct rt6_info *) dst;
222         struct neighbour *n;
223
224         daddr = choose_neigh_daddr(rt, skb, daddr);
225         n = __ipv6_neigh_lookup(dst->dev, daddr);
226         if (n)
227                 return n;
228         return neigh_create(&nd_tbl, daddr, dst->dev);
229 }
230
231 static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr)
232 {
233         struct net_device *dev = dst->dev;
234         struct rt6_info *rt = (struct rt6_info *)dst;
235
236         daddr = choose_neigh_daddr(rt, NULL, daddr);
237         if (!daddr)
238                 return;
239         if (dev->flags & (IFF_NOARP | IFF_LOOPBACK))
240                 return;
241         if (ipv6_addr_is_multicast((const struct in6_addr *)daddr))
242                 return;
243         __ipv6_confirm_neigh(dev, daddr);
244 }
245
/* dst_ops template for ordinary IPv6 routes; copied into each network
 * namespace (net->ipv6.ip6_dst_ops) at namespace init.
 */
static struct dst_ops ip6_dst_ops_template = {
	.family                 =       AF_INET6,
	.gc                     =       ip6_dst_gc,
	.gc_thresh              =       1024,
	.check                  =       ip6_dst_check,
	.default_advmss         =       ip6_default_advmss,
	.mtu                    =       ip6_mtu,
	.cow_metrics            =       ipv6_cow_metrics,
	.destroy                =       ip6_dst_destroy,
	.ifdown                 =       ip6_dst_ifdown,
	.negative_advice        =       ip6_negative_advice,
	.link_failure           =       ip6_link_failure,
	.update_pmtu            =       ip6_rt_update_pmtu,
	.redirect               =       rt6_do_redirect,
	.local_out              =       __ip6_local_out,
	.neigh_lookup           =       ip6_neigh_lookup,
	.confirm_neigh          =       ip6_confirm_neigh,
};
264
265 static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
266 {
267         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
268
269         return mtu ? : dst->dev->mtu;
270 }
271
/* Intentionally a no-op: blackhole dsts ignore PMTU updates. */
static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
					 struct sk_buff *skb, u32 mtu)
{
}
276
/* Intentionally a no-op: blackhole dsts ignore redirects. */
static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
				      struct sk_buff *skb)
{
}
281
/* dst_ops for blackhole routes: mostly shares the regular IPv6 hooks
 * but with no-op PMTU/redirect handlers and its own MTU fallback.
 */
static struct dst_ops ip6_dst_blackhole_ops = {
	.family                 =       AF_INET6,
	.destroy                =       ip6_dst_destroy,
	.check                  =       ip6_dst_check,
	.mtu                    =       ip6_blackhole_mtu,
	.default_advmss         =       ip6_default_advmss,
	.update_pmtu            =       ip6_rt_blackhole_update_pmtu,
	.redirect               =       ip6_rt_blackhole_redirect,
	.cow_metrics            =       dst_cow_metrics_generic,
	.neigh_lookup           =       ip6_neigh_lookup,
};
293
/* Metrics used by the reject/blackhole template entries below;
 * a hop limit of 0 leaves the metric effectively unset.
 */
static const u32 ip6_template_metrics[RTAX_MAX] = {
	[RTAX_HOPLIMIT - 1] = 0,
};
297
/* Template for the per-namespace "null" route: packets matching it are
 * discarded with -ENETUNREACH (no route to host).
 */
static const struct rt6_info ip6_null_entry_template = {
	.dst = {
		.__refcnt       = ATOMIC_INIT(1),
		.__use          = 1,
		.obsolete       = DST_OBSOLETE_FORCE_CHK,
		.error          = -ENETUNREACH,
		.input          = ip6_pkt_discard,
		.output         = ip6_pkt_discard_out,
	},
	.rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
	.rt6i_protocol  = RTPROT_KERNEL,
	.rt6i_metric    = ~(u32) 0,	/* worst possible metric */
	.rt6i_ref       = ATOMIC_INIT(1),
};
312
313 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
314
/* Template for the "prohibit" route (policy routing): packets matching
 * it are rejected with -EACCES (administratively prohibited).
 */
static const struct rt6_info ip6_prohibit_entry_template = {
	.dst = {
		.__refcnt       = ATOMIC_INIT(1),
		.__use          = 1,
		.obsolete       = DST_OBSOLETE_FORCE_CHK,
		.error          = -EACCES,
		.input          = ip6_pkt_prohibit,
		.output         = ip6_pkt_prohibit_out,
	},
	.rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
	.rt6i_protocol  = RTPROT_KERNEL,
	.rt6i_metric    = ~(u32) 0,	/* worst possible metric */
	.rt6i_ref       = ATOMIC_INIT(1),
};
329
/* Template for the "blackhole" route (policy routing): packets matching
 * it are silently discarded (-EINVAL, generic dst_discard handlers).
 */
static const struct rt6_info ip6_blk_hole_entry_template = {
	.dst = {
		.__refcnt       = ATOMIC_INIT(1),
		.__use          = 1,
		.obsolete       = DST_OBSOLETE_FORCE_CHK,
		.error          = -EINVAL,
		.input          = dst_discard,
		.output         = dst_discard_out,
	},
	.rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
	.rt6i_protocol  = RTPROT_KERNEL,
	.rt6i_metric    = ~(u32) 0,	/* worst possible metric */
	.rt6i_ref       = ATOMIC_INIT(1),
};
344
345 #endif
346
/* Initialize the rt6_info-specific part of a freshly allocated route. */
static void rt6_info_init(struct rt6_info *rt)
{
	struct dst_entry *dst = &rt->dst;

	/* Zero everything that follows the embedded dst_entry; dst_alloc()
	 * already set up the dst itself.  Relies on the dst being the
	 * first member of struct rt6_info.
	 */
	memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
	INIT_LIST_HEAD(&rt->rt6i_siblings);
	INIT_LIST_HEAD(&rt->rt6i_uncached);
}
355
356 /* allocate dst with ip6_dst_ops */
357 static struct rt6_info *__ip6_dst_alloc(struct net *net,
358                                         struct net_device *dev,
359                                         int flags)
360 {
361         struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
362                                         1, DST_OBSOLETE_FORCE_CHK, flags);
363
364         if (rt) {
365                 rt6_info_init(rt);
366                 atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
367         }
368
369         return rt;
370 }
371
372 struct rt6_info *ip6_dst_alloc(struct net *net,
373                                struct net_device *dev,
374                                int flags)
375 {
376         struct rt6_info *rt = __ip6_dst_alloc(net, dev, flags);
377
378         if (rt) {
379                 rt->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, GFP_ATOMIC);
380                 if (!rt->rt6i_pcpu) {
381                         dst_release_immediate(&rt->dst);
382                         return NULL;
383                 }
384         }
385
386         return rt;
387 }
388 EXPORT_SYMBOL(ip6_dst_alloc);
389
/* dst_ops->destroy hook: final teardown of a rt6_info once its refcount
 * hits zero.  Releases metrics, the per-cpu clone array, list linkage,
 * the inet6_dev reference, any exception bucket, and the reference on
 * the parent ("from") route.
 */
static void ip6_dst_destroy(struct dst_entry *dst)
{
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct rt6_exception_bucket *bucket;
	struct rt6_info *from = rt->from;
	struct inet6_dev *idev;

	dst_destroy_metrics_generic(dst);
	free_percpu(rt->rt6i_pcpu);
	rt6_uncached_list_del(rt);

	idev = rt->rt6i_idev;
	if (idev) {
		rt->rt6i_idev = NULL;
		in6_dev_put(idev);
	}
	/* No readers remain at destroy time, so a locked (protected)
	 * dereference of the exception bucket is safe here.
	 */
	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket, 1);
	if (bucket) {
		rt->rt6i_exception_bucket = NULL;
		kfree(bucket);
	}

	rt->from = NULL;
	/* dst_release() tolerates a NULL dst, so this is safe even when
	 * the route had no parent ("from" == NULL).
	 */
	dst_release(&from->dst);
}
415
416 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
417                            int how)
418 {
419         struct rt6_info *rt = (struct rt6_info *)dst;
420         struct inet6_dev *idev = rt->rt6i_idev;
421         struct net_device *loopback_dev =
422                 dev_net(dev)->loopback_dev;
423
424         if (idev && idev->dev != loopback_dev) {
425                 struct inet6_dev *loopback_idev = in6_dev_get(loopback_dev);
426                 if (loopback_idev) {
427                         rt->rt6i_idev = loopback_idev;
428                         in6_dev_put(idev);
429                 }
430         }
431 }
432
433 static bool __rt6_check_expired(const struct rt6_info *rt)
434 {
435         if (rt->rt6i_flags & RTF_EXPIRES)
436                 return time_after(jiffies, rt->dst.expires);
437         else
438                 return false;
439 }
440
441 static bool rt6_check_expired(const struct rt6_info *rt)
442 {
443         if (rt->rt6i_flags & RTF_EXPIRES) {
444                 if (time_after(jiffies, rt->dst.expires))
445                         return true;
446         } else if (rt->from) {
447                 return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK ||
448                         rt6_check_expired(rt->from);
449         }
450         return false;
451 }
452
/* Pick one sibling of a multipath route based on the flow hash.
 * Each sibling owns a hash interval bounded by rt6i_nh_upper_bound;
 * the first sibling whose bound covers the hash wins.
 */
static struct rt6_info *rt6_multipath_select(struct rt6_info *match,
					     struct flowi6 *fl6, int oif,
					     int strict)
{
	struct rt6_info *sibling, *next_sibling;

	/* We might have already computed the hash for ICMPv6 errors. In such
	 * case it will always be non-zero. Otherwise now is the time to do it.
	 */
	if (!fl6->mp_hash)
		fl6->mp_hash = rt6_multipath_hash(fl6, NULL);

	if (fl6->mp_hash <= atomic_read(&match->rt6i_nh_upper_bound))
		return match;

	list_for_each_entry_safe(sibling, next_sibling, &match->rt6i_siblings,
				 rt6i_siblings) {
		if (fl6->mp_hash > atomic_read(&sibling->rt6i_nh_upper_bound))
			continue;
		/* Bound matched but route scored unusable: give up and
		 * keep the original match rather than trying further
		 * siblings (their intervals cannot match this hash).
		 */
		if (rt6_score_route(sibling, oif, strict) < 0)
			break;
		match = sibling;
		break;
	}

	return match;
}
480
481 /*
482  *      Route lookup. rcu_read_lock() should be held.
483  */
484
/* Walk the routes hanging off a fib6 node and pick the one matching the
 * requested output interface (@oif) and/or source address (@saddr).
 * Caller must hold rcu_read_lock().  Returns the null entry when a
 * strict interface match (RT6_LOOKUP_F_IFACE) cannot be satisfied.
 */
static inline struct rt6_info *rt6_device_match(struct net *net,
						    struct rt6_info *rt,
						    const struct in6_addr *saddr,
						    int oif,
						    int flags)
{
	struct rt6_info *local = NULL;
	struct rt6_info *sprt;

	/* No constraints at all: the first live route will do. */
	if (!oif && ipv6_addr_any(saddr) && !(rt->rt6i_nh_flags & RTNH_F_DEAD))
		return rt;

	for (sprt = rt; sprt; sprt = rcu_dereference(sprt->rt6_next)) {
		struct net_device *dev = sprt->dst.dev;

		if (sprt->rt6i_nh_flags & RTNH_F_DEAD)
			continue;

		if (oif) {
			if (dev->ifindex == oif)
				return sprt;
			if (dev->flags & IFF_LOOPBACK) {
				/* Loopback routes are only a fallback;
				 * remember the best candidate in @local.
				 */
				if (!sprt->rt6i_idev ||
				    sprt->rt6i_idev->dev->ifindex != oif) {
					if (flags & RT6_LOOKUP_F_IFACE)
						continue;
					/* Prefer a loopback route whose idev
					 * actually matches @oif over one that
					 * does not.
					 */
					if (local &&
					    local->rt6i_idev->dev->ifindex == oif)
						continue;
				}
				local = sprt;
			}
		} else {
			/* No oif: match by source address ownership. */
			if (ipv6_chk_addr(net, saddr, dev,
					  flags & RT6_LOOKUP_F_IFACE))
				return sprt;
		}
	}

	if (oif) {
		if (local)
			return local;

		if (flags & RT6_LOOKUP_F_IFACE)
			return net->ipv6.ip6_null_entry;
	}

	return rt->rt6i_nh_flags & RTNH_F_DEAD ? net->ipv6.ip6_null_entry : rt;
}
534
535 #ifdef CONFIG_IPV6_ROUTER_PREF
/* Deferred-work payload for rt6_probe(): which gateway (@target) to
 * solicit on which device (@dev, held until the work runs).
 */
struct __rt6_probe_work {
	struct work_struct work;
	struct in6_addr target;
	struct net_device *dev;
};
541
542 static void rt6_probe_deferred(struct work_struct *w)
543 {
544         struct in6_addr mcaddr;
545         struct __rt6_probe_work *work =
546                 container_of(w, struct __rt6_probe_work, work);
547
548         addrconf_addr_solict_mult(&work->target, &mcaddr);
549         ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0);
550         dev_put(work->dev);
551         kfree(work);
552 }
553
/* Kick off an asynchronous reachability probe of a gateway route's
 * next hop when its neighbour entry is missing or stale.  The actual
 * NS transmission is deferred to a workqueue (rt6_probe_deferred) since
 * we may be called in contexts where sending is not allowed.
 */
static void rt6_probe(struct rt6_info *rt)
{
	struct __rt6_probe_work *work;
	struct neighbour *neigh;
	/*
	 * Okay, this does not seem to be appropriate
	 * for now, however, we need to check if it
	 * is really so; aka Router Reachability Probing.
	 *
	 * Router Reachability Probe MUST be rate-limited
	 * to no more than one per minute.
	 */
	if (!rt || !(rt->rt6i_flags & RTF_GATEWAY))
		return;
	rcu_read_lock_bh();
	neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
	if (neigh) {
		if (neigh->nud_state & NUD_VALID)
			goto out;

		work = NULL;
		write_lock(&neigh->lock);
		/* Re-check state under the lock and rate-limit by the
		 * configured probe interval; __neigh_set_probe_once()
		 * advances neigh->updated so concurrent callers back off.
		 */
		if (!(neigh->nud_state & NUD_VALID) &&
		    time_after(jiffies,
			       neigh->updated +
			       rt->rt6i_idev->cnf.rtr_probe_interval)) {
			work = kmalloc(sizeof(*work), GFP_ATOMIC);
			if (work)
				__neigh_set_probe_once(neigh);
		}
		write_unlock(&neigh->lock);
	} else {
		/* No neighbour entry at all: always worth probing. */
		work = kmalloc(sizeof(*work), GFP_ATOMIC);
	}

	if (work) {
		INIT_WORK(&work->work, rt6_probe_deferred);
		work->target = rt->rt6i_gateway;
		/* Hold the device until rt6_probe_deferred() runs. */
		dev_hold(rt->dst.dev);
		work->dev = rt->dst.dev;
		schedule_work(&work->work);
	}

out:
	rcu_read_unlock_bh();
}
#else
/* Without CONFIG_IPV6_ROUTER_PREF, router probing is compiled out. */
static inline void rt6_probe(struct rt6_info *rt)
{
}
#endif
605
606 /*
607  * Default Router Selection (RFC 2461 6.3.6)
608  */
609 static inline int rt6_check_dev(struct rt6_info *rt, int oif)
610 {
611         struct net_device *dev = rt->dst.dev;
612         if (!oif || dev->ifindex == oif)
613                 return 2;
614         if ((dev->flags & IFF_LOOPBACK) &&
615             rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
616                 return 1;
617         return 0;
618 }
619
/* Evaluate next-hop neighbour reachability for route selection.
 * Non-gateway routes always succeed.  With router preference support,
 * any state other than NUD_FAILED is treated as good enough; without
 * it, a missing entry triggers round-robin (RT6_NUD_FAIL_DO_RR).
 */
static inline enum rt6_nud_state rt6_check_neigh(struct rt6_info *rt)
{
	struct neighbour *neigh;
	enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;

	if (rt->rt6i_flags & RTF_NONEXTHOP ||
	    !(rt->rt6i_flags & RTF_GATEWAY))
		return RT6_NUD_SUCCEED;

	rcu_read_lock_bh();
	neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
	if (neigh) {
		read_lock(&neigh->lock);
		if (neigh->nud_state & NUD_VALID)
			ret = RT6_NUD_SUCCEED;
#ifdef CONFIG_IPV6_ROUTER_PREF
		else if (!(neigh->nud_state & NUD_FAILED))
			ret = RT6_NUD_SUCCEED;
		else
			ret = RT6_NUD_FAIL_PROBE;
#endif
		read_unlock(&neigh->lock);
	} else {
		ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
		      RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
	}
	rcu_read_unlock_bh();

	return ret;
}
650
/* Compute a selection score for @rt: device match in the low bits,
 * router preference (if configured) shifted above it.  Returns a
 * negative rt6_nud_state value when the route is unusable under the
 * given @strict flags.
 */
static int rt6_score_route(struct rt6_info *rt, int oif,
			   int strict)
{
	int m;

	m = rt6_check_dev(rt, oif);
	if (!m && (strict & RT6_LOOKUP_F_IFACE))
		return RT6_NUD_FAIL_HARD;
#ifdef CONFIG_IPV6_ROUTER_PREF
	/* Router preference outranks the device score (bits 0-1). */
	m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
#endif
	if (strict & RT6_LOOKUP_F_REACHABLE) {
		int n = rt6_check_neigh(rt);
		if (n < 0)
			return n;
	}
	return m;
}
669
/* Compare @rt against the current best @match and return whichever
 * scores higher; tracks the running best score in *mpri and whether
 * round-robin should be triggered in *do_rr.  Dead, link-down (when
 * configured to be ignored) and expired routes are skipped.
 */
static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
				   int *mpri, struct rt6_info *match,
				   bool *do_rr)
{
	int m;
	bool match_do_rr = false;
	struct inet6_dev *idev = rt->rt6i_idev;

	if (rt->rt6i_nh_flags & RTNH_F_DEAD)
		goto out;

	if (idev->cnf.ignore_routes_with_linkdown &&
	    rt->rt6i_nh_flags & RTNH_F_LINKDOWN &&
	    !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))
		goto out;

	if (rt6_check_expired(rt))
		goto out;

	m = rt6_score_route(rt, oif, strict);
	if (m == RT6_NUD_FAIL_DO_RR) {
		match_do_rr = true;
		m = 0; /* lowest valid score */
	} else if (m == RT6_NUD_FAIL_HARD) {
		goto out;
	}

	if (strict & RT6_LOOKUP_F_REACHABLE)
		rt6_probe(rt);

	/* note that m can be RT6_NUD_FAIL_PROBE at this point */
	if (m > *mpri) {
		*do_rr = match_do_rr;
		*mpri = m;
		match = rt;
	}
out:
	return match;
}
709
/* Find the best route at the given @metric among the node's routes,
 * starting the round-robin walk at @rr_head.  The list is scanned in
 * two arcs (rr_head..end, then leaf..rr_head); routes with a different
 * metric stop each arc, and are only considered (from @cont onward) if
 * no route at the preferred metric matched.
 */
static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
				     struct rt6_info *leaf,
				     struct rt6_info *rr_head,
				     u32 metric, int oif, int strict,
				     bool *do_rr)
{
	struct rt6_info *rt, *match, *cont;
	int mpri = -1;

	match = NULL;
	cont = NULL;
	for (rt = rr_head; rt; rt = rcu_dereference(rt->rt6_next)) {
		if (rt->rt6i_metric != metric) {
			cont = rt;
			break;
		}

		match = find_match(rt, oif, strict, &mpri, match, do_rr);
	}

	for (rt = leaf; rt && rt != rr_head;
	     rt = rcu_dereference(rt->rt6_next)) {
		if (rt->rt6i_metric != metric) {
			cont = rt;
			break;
		}

		match = find_match(rt, oif, strict, &mpri, match, do_rr);
	}

	if (match || !cont)
		return match;

	/* Nothing at the preferred metric: fall back to worse metrics. */
	for (rt = cont; rt; rt = rcu_dereference(rt->rt6_next))
		match = find_match(rt, oif, strict, &mpri, match, do_rr);

	return match;
}
748
/* Select the route to use from fib6 node @fn, honouring round-robin
 * state (fn->rr_ptr) and advancing it when find_rr_leaf() requests it.
 * Caller must hold rcu_read_lock().  Returns the null entry when the
 * node has no usable route.
 */
static struct rt6_info *rt6_select(struct net *net, struct fib6_node *fn,
				   int oif, int strict)
{
	struct rt6_info *leaf = rcu_dereference(fn->leaf);
	struct rt6_info *match, *rt0;
	bool do_rr = false;
	int key_plen;

	if (!leaf || leaf == net->ipv6.ip6_null_entry)
		return net->ipv6.ip6_null_entry;

	rt0 = rcu_dereference(fn->rr_ptr);
	if (!rt0)
		rt0 = leaf;

	/* Double check to make sure fn is not an intermediate node
	 * and fn->leaf does not points to its child's leaf
	 * (This might happen if all routes under fn are deleted from
	 * the tree and fib6_repair_tree() is called on the node.)
	 */
	key_plen = rt0->rt6i_dst.plen;
#ifdef CONFIG_IPV6_SUBTREES
	if (rt0->rt6i_src.plen)
		key_plen = rt0->rt6i_src.plen;
#endif
	if (fn->fn_bit != key_plen)
		return net->ipv6.ip6_null_entry;

	match = find_rr_leaf(fn, leaf, rt0, rt0->rt6i_metric, oif, strict,
			     &do_rr);

	if (do_rr) {
		struct rt6_info *next = rcu_dereference(rt0->rt6_next);

		/* no entries matched; do round-robin */
		if (!next || next->rt6i_metric != rt0->rt6i_metric)
			next = leaf;

		if (next != rt0) {
			/* Writers of fn->rr_ptr serialize on the table
			 * lock; re-check next is still in the tree.
			 */
			spin_lock_bh(&leaf->rt6i_table->tb6_lock);
			/* make sure next is not being deleted from the tree */
			if (next->rt6i_node)
				rcu_assign_pointer(fn->rr_ptr, next);
			spin_unlock_bh(&leaf->rt6i_table->tb6_lock);
		}
	}

	return match ? match : net->ipv6.ip6_null_entry;
}
798
799 static bool rt6_is_gw_or_nonexthop(const struct rt6_info *rt)
800 {
801         return (rt->rt6i_flags & (RTF_NONEXTHOP | RTF_GATEWAY));
802 }
803
#ifdef CONFIG_IPV6_ROUTE_INFO
/* Process a Route Information option received in a Router Advertisement
 * (RFC 4191): validate it, then add, refresh, or delete the matching
 * route depending on the advertised lifetime.  Returns 0 on success or
 * -EINVAL for a malformed option.
 */
int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
		  const struct in6_addr *gwaddr)
{
	struct net *net = dev_net(dev);
	struct route_info *rinfo = (struct route_info *) opt;
	struct in6_addr prefix_buf, *prefix;
	unsigned int pref;
	unsigned long lifetime;
	struct rt6_info *rt;

	if (len < sizeof(struct route_info)) {
		return -EINVAL;
	}

	/* Sanity check for prefix_len and length */
	if (rinfo->length > 3) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 128) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 64) {
		/* Prefixes longer than 64 bits need a 2-unit option. */
		if (rinfo->length < 2) {
			return -EINVAL;
		}
	} else if (rinfo->prefix_len > 0) {
		if (rinfo->length < 1) {
			return -EINVAL;
		}
	}

	pref = rinfo->route_pref;
	if (pref == ICMPV6_ROUTER_PREF_INVALID)
		return -EINVAL;

	lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);

	if (rinfo->length == 3)
		prefix = (struct in6_addr *)rinfo->prefix;
	else {
		/* this function is safe */
		ipv6_addr_prefix(&prefix_buf,
				 (struct in6_addr *)rinfo->prefix,
				 rinfo->prefix_len);
		prefix = &prefix_buf;
	}

	/* A zero-length prefix means the default router entry. */
	if (rinfo->prefix_len == 0)
		rt = rt6_get_dflt_router(gwaddr, dev);
	else
		rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
					gwaddr, dev);

	/* Zero lifetime withdraws an existing route. */
	if (rt && !lifetime) {
		ip6_del_rt(rt);
		rt = NULL;
	}

	if (!rt && lifetime)
		rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
					dev, pref);
	else if (rt)
		rt->rt6i_flags = RTF_ROUTEINFO |
				 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);

	if (rt) {
		if (!addrconf_finite_timeout(lifetime))
			rt6_clean_expires(rt);
		else
			rt6_set_expires(rt, jiffies + HZ * lifetime);

		ip6_rt_put(rt);
	}
	return 0;
}
#endif
879
880 static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
881                                         struct in6_addr *saddr)
882 {
883         struct fib6_node *pn, *sn;
884         while (1) {
885                 if (fn->fn_flags & RTN_TL_ROOT)
886                         return NULL;
887                 pn = rcu_dereference(fn->parent);
888                 sn = FIB6_SUBTREE(pn);
889                 if (sn && sn != fn)
890                         fn = fib6_lookup(sn, NULL, saddr);
891                 else
892                         fn = pn;
893                 if (fn->fn_flags & RTN_RTINFO)
894                         return fn;
895         }
896 }
897
898 static bool ip6_hold_safe(struct net *net, struct rt6_info **prt,
899                           bool null_fallback)
900 {
901         struct rt6_info *rt = *prt;
902
903         if (dst_hold_safe(&rt->dst))
904                 return true;
905         if (null_fallback) {
906                 rt = net->ipv6.ip6_null_entry;
907                 dst_hold(&rt->dst);
908         } else {
909                 rt = NULL;
910         }
911         *prt = rt;
912         return false;
913 }
914
/* Core FIB lookup for the non-cloning path: walk @table for
 * fl6->daddr/saddr, apply device matching and multipath selection,
 * then prefer a matching cached exception (RTF_CACHE) route.
 * Returns a held route; ip6_null_entry when nothing matches.
 */
static struct rt6_info *ip6_pol_route_lookup(struct net *net,
					     struct fib6_table *table,
					     struct flowi6 *fl6, int flags)
{
	struct rt6_info *rt, *rt_cache;
	struct fib6_node *fn;

	rcu_read_lock();
	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
restart:
	rt = rcu_dereference(fn->leaf);
	if (!rt) {
		rt = net->ipv6.ip6_null_entry;
	} else {
		rt = rt6_device_match(net, rt, &fl6->saddr,
				      fl6->flowi6_oif, flags);
		/* No output device pinned: let the multipath hash choose
		 * among the sibling routes.
		 */
		if (rt->rt6i_nsiblings && fl6->flowi6_oif == 0)
			rt = rt6_multipath_select(rt, fl6,
						  fl6->flowi6_oif, flags);
	}
	if (rt == net->ipv6.ip6_null_entry) {
		/* No match at this node: retry at a less specific one */
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto restart;
	}
	/* Search through exception table */
	rt_cache = rt6_find_cached_rt(rt, &fl6->daddr, &fl6->saddr);
	if (rt_cache)
		rt = rt_cache;

	/* Reference taken: refresh last-use time without an extra ref */
	if (ip6_hold_safe(net, &rt, true))
		dst_use_noref(&rt->dst, jiffies);

	rcu_read_unlock();

	trace_fib6_table_lookup(net, rt, table, fl6);

	return rt;

}
955
/* Public lookup entry point: resolve @fl6 through the policy-routing
 * rules using the non-cloning lookup above.  Returns a held dst.
 */
struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
				    int flags)
{
	return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_lookup);
}
EXPORT_SYMBOL_GPL(ip6_route_lookup);
962
963 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
964                             const struct in6_addr *saddr, int oif, int strict)
965 {
966         struct flowi6 fl6 = {
967                 .flowi6_oif = oif,
968                 .daddr = *daddr,
969         };
970         struct dst_entry *dst;
971         int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
972
973         if (saddr) {
974                 memcpy(&fl6.saddr, saddr, sizeof(*saddr));
975                 flags |= RT6_LOOKUP_F_HAS_SADDR;
976         }
977
978         dst = fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_lookup);
979         if (dst->error == 0)
980                 return (struct rt6_info *) dst;
981
982         dst_release(dst);
983
984         return NULL;
985 }
986 EXPORT_SYMBOL(rt6_lookup);
987
/* ip6_ins_rt is called with FREE table->tb6_lock.
 * It takes a new route entry; if the addition fails for any reason,
 * the route is released.
 * Caller must hold dst before calling it.
 */
993
994 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info,
995                         struct mx6_config *mxc,
996                         struct netlink_ext_ack *extack)
997 {
998         int err;
999         struct fib6_table *table;
1000
1001         table = rt->rt6i_table;
1002         spin_lock_bh(&table->tb6_lock);
1003         err = fib6_add(&table->tb6_root, rt, info, mxc, extack);
1004         spin_unlock_bh(&table->tb6_lock);
1005
1006         return err;
1007 }
1008
1009 int ip6_ins_rt(struct rt6_info *rt)
1010 {
1011         struct nl_info info = { .nl_net = dev_net(rt->dst.dev), };
1012         struct mx6_config mxc = { .mx = NULL, };
1013
1014         /* Hold dst to account for the reference from the fib6 tree */
1015         dst_hold(&rt->dst);
1016         return __ip6_ins_rt(rt, &info, &mxc, NULL);
1017 }
1018
1019 /* called with rcu_lock held */
1020 static struct net_device *ip6_rt_get_dev_rcu(struct rt6_info *rt)
1021 {
1022         struct net_device *dev = rt->dst.dev;
1023
1024         if (rt->rt6i_flags & (RTF_LOCAL | RTF_ANYCAST)) {
1025                 /* for copies of local routes, dst->dev needs to be the
1026                  * device if it is a master device, the master device if
1027                  * device is enslaved, and the loopback as the default
1028                  */
1029                 if (netif_is_l3_slave(dev) &&
1030                     !rt6_need_strict(&rt->rt6i_dst.addr))
1031                         dev = l3mdev_master_dev_rcu(dev);
1032                 else if (!netif_is_l3_master(dev))
1033                         dev = dev_net(dev)->loopback_dev;
1034                 /* last case is netif_is_l3_master(dev) is true in which
1035                  * case we want dev returned to be dev
1036                  */
1037         }
1038
1039         return dev;
1040 }
1041
/* Create an RTF_CACHE host-route clone of @ort for (@daddr, @saddr).
 * The clone is not owned by the fib6 tree.  Returns NULL on allocation
 * failure.
 */
static struct rt6_info *ip6_rt_cache_alloc(struct rt6_info *ort,
					   const struct in6_addr *daddr,
					   const struct in6_addr *saddr)
{
	struct net_device *dev;
	struct rt6_info *rt;

	/*
	 *	Clone the route.
	 */

	/* Clones are made from the original fib entry: strip any
	 * cache/pcpu copy back to its parent first.
	 */
	if (ort->rt6i_flags & (RTF_CACHE | RTF_PCPU))
		ort = ort->from;

	rcu_read_lock();
	dev = ip6_rt_get_dev_rcu(ort);
	rt = __ip6_dst_alloc(dev_net(dev), dev, 0);
	rcu_read_unlock();
	if (!rt)
		return NULL;

	ip6_rt_copy_init(rt, ort);
	rt->rt6i_flags |= RTF_CACHE;
	rt->rt6i_metric = 0;
	rt->dst.flags |= DST_HOST;
	/* The clone is a /128 host route for the looked-up destination */
	rt->rt6i_dst.addr = *daddr;
	rt->rt6i_dst.plen = 128;

	if (!rt6_is_gw_or_nonexthop(ort)) {
		if (ort->rt6i_dst.plen != 128 &&
		    ipv6_addr_equal(&ort->rt6i_dst.addr, daddr))
			rt->rt6i_flags |= RTF_ANYCAST;
#ifdef CONFIG_IPV6_SUBTREES
		if (rt->rt6i_src.plen && saddr) {
			rt->rt6i_src.addr = *saddr;
			rt->rt6i_src.plen = 128;
		}
#endif
	}

	return rt;
}
1084
1085 static struct rt6_info *ip6_rt_pcpu_alloc(struct rt6_info *rt)
1086 {
1087         struct net_device *dev;
1088         struct rt6_info *pcpu_rt;
1089
1090         rcu_read_lock();
1091         dev = ip6_rt_get_dev_rcu(rt);
1092         pcpu_rt = __ip6_dst_alloc(dev_net(dev), dev, rt->dst.flags);
1093         rcu_read_unlock();
1094         if (!pcpu_rt)
1095                 return NULL;
1096         ip6_rt_copy_init(pcpu_rt, rt);
1097         pcpu_rt->rt6i_protocol = rt->rt6i_protocol;
1098         pcpu_rt->rt6i_flags |= RTF_PCPU;
1099         return pcpu_rt;
1100 }
1101
1102 /* It should be called with rcu_read_lock() acquired */
1103 static struct rt6_info *rt6_get_pcpu_route(struct rt6_info *rt)
1104 {
1105         struct rt6_info *pcpu_rt, **p;
1106
1107         p = this_cpu_ptr(rt->rt6i_pcpu);
1108         pcpu_rt = *p;
1109
1110         if (pcpu_rt && ip6_hold_safe(NULL, &pcpu_rt, false))
1111                 rt6_dst_from_metrics_check(pcpu_rt);
1112
1113         return pcpu_rt;
1114 }
1115
/* Allocate a per-cpu copy of @rt and publish it in this cpu's slot.
 * Returns the new entry with an extra reference held, or a held
 * ip6_null_entry if allocation fails.
 */
static struct rt6_info *rt6_make_pcpu_route(struct rt6_info *rt)
{
	struct rt6_info *pcpu_rt, *prev, **p;

	pcpu_rt = ip6_rt_pcpu_alloc(rt);
	if (!pcpu_rt) {
		struct net *net = dev_net(rt->dst.dev);

		dst_hold(&net->ipv6.ip6_null_entry->dst);
		return net->ipv6.ip6_null_entry;
	}

	/* Hold before publishing so the slot always owns a reference */
	dst_hold(&pcpu_rt->dst);
	p = this_cpu_ptr(rt->rt6i_pcpu);
	/* NOTE(review): the slot is expected to be empty here — assumes
	 * callers serialize per-cpu slot creation; confirm against the
	 * rt6_get_pcpu_route() call sites.
	 */
	prev = cmpxchg(p, NULL, pcpu_rt);
	BUG_ON(prev);

	rt6_dst_from_metrics_check(pcpu_rt);
	return pcpu_rt;
}
1136
/* Exception (RTF_CACHE) hash table implementation.
 * rt6_exception_lock serializes all writers of every route's
 * rt6i_exception_bucket table; readers traverse the chains under RCU.
 */
static DEFINE_SPINLOCK(rt6_exception_lock);
1140
/* Remove rt6_ex from hash table and free the memory
 * Caller must hold rt6_exception_lock
 */
static void rt6_remove_exception(struct rt6_exception_bucket *bucket,
				 struct rt6_exception *rt6_ex)
{
	struct net *net;

	if (!bucket || !rt6_ex)
		return;

	net = dev_net(rt6_ex->rt6i->dst.dev);
	/* Detach from the fib node before dropping the tree reference */
	rt6_ex->rt6i->rt6i_node = NULL;
	hlist_del_rcu(&rt6_ex->hlist);
	rt6_release(rt6_ex->rt6i);
	/* RCU readers may still be walking the chain: defer the free */
	kfree_rcu(rt6_ex, rcu);
	WARN_ON_ONCE(!bucket->depth);
	bucket->depth--;
	net->ipv6.rt6_stats->fib_rt_cache--;
}
1161
1162 /* Remove oldest rt6_ex in bucket and free the memory
1163  * Caller must hold rt6_exception_lock
1164  */
1165 static void rt6_exception_remove_oldest(struct rt6_exception_bucket *bucket)
1166 {
1167         struct rt6_exception *rt6_ex, *oldest = NULL;
1168
1169         if (!bucket)
1170                 return;
1171
1172         hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1173                 if (!oldest || time_before(rt6_ex->stamp, oldest->stamp))
1174                         oldest = rt6_ex;
1175         }
1176         rt6_remove_exception(bucket, oldest);
1177 }
1178
1179 static u32 rt6_exception_hash(const struct in6_addr *dst,
1180                               const struct in6_addr *src)
1181 {
1182         static u32 seed __read_mostly;
1183         u32 val;
1184
1185         net_get_random_once(&seed, sizeof(seed));
1186         val = jhash(dst, sizeof(*dst), seed);
1187
1188 #ifdef CONFIG_IPV6_SUBTREES
1189         if (src)
1190                 val = jhash(src, sizeof(*src), val);
1191 #endif
1192         return hash_32(val, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT);
1193 }
1194
1195 /* Helper function to find the cached rt in the hash table
1196  * and update bucket pointer to point to the bucket for this
1197  * (daddr, saddr) pair
1198  * Caller must hold rt6_exception_lock
1199  */
1200 static struct rt6_exception *
1201 __rt6_find_exception_spinlock(struct rt6_exception_bucket **bucket,
1202                               const struct in6_addr *daddr,
1203                               const struct in6_addr *saddr)
1204 {
1205         struct rt6_exception *rt6_ex;
1206         u32 hval;
1207
1208         if (!(*bucket) || !daddr)
1209                 return NULL;
1210
1211         hval = rt6_exception_hash(daddr, saddr);
1212         *bucket += hval;
1213
1214         hlist_for_each_entry(rt6_ex, &(*bucket)->chain, hlist) {
1215                 struct rt6_info *rt6 = rt6_ex->rt6i;
1216                 bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);
1217
1218 #ifdef CONFIG_IPV6_SUBTREES
1219                 if (matched && saddr)
1220                         matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
1221 #endif
1222                 if (matched)
1223                         return rt6_ex;
1224         }
1225         return NULL;
1226 }
1227
1228 /* Helper function to find the cached rt in the hash table
1229  * and update bucket pointer to point to the bucket for this
1230  * (daddr, saddr) pair
1231  * Caller must hold rcu_read_lock()
1232  */
1233 static struct rt6_exception *
1234 __rt6_find_exception_rcu(struct rt6_exception_bucket **bucket,
1235                          const struct in6_addr *daddr,
1236                          const struct in6_addr *saddr)
1237 {
1238         struct rt6_exception *rt6_ex;
1239         u32 hval;
1240
1241         WARN_ON_ONCE(!rcu_read_lock_held());
1242
1243         if (!(*bucket) || !daddr)
1244                 return NULL;
1245
1246         hval = rt6_exception_hash(daddr, saddr);
1247         *bucket += hval;
1248
1249         hlist_for_each_entry_rcu(rt6_ex, &(*bucket)->chain, hlist) {
1250                 struct rt6_info *rt6 = rt6_ex->rt6i;
1251                 bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);
1252
1253 #ifdef CONFIG_IPV6_SUBTREES
1254                 if (matched && saddr)
1255                         matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
1256 #endif
1257                 if (matched)
1258                         return rt6_ex;
1259         }
1260         return NULL;
1261 }
1262
/* Insert the RTF_CACHE route @nrt as an exception of the fib entry
 * @ort, replacing any existing exception for the same (dst, src) key.
 * Returns 0 on success; -EINVAL when the bucket has been flushed or
 * the new PMTU is not smaller than @ort's MTU; -ENOMEM on allocation
 * failure.  On success the fib node's sernum is bumped so cached dsts
 * get revalidated.
 */
static int rt6_insert_exception(struct rt6_info *nrt,
				struct rt6_info *ort)
{
	struct net *net = dev_net(ort->dst.dev);
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	int err = 0;

	/* ort can't be a cache or pcpu route */
	if (ort->rt6i_flags & (RTF_CACHE | RTF_PCPU))
		ort = ort->from;
	WARN_ON_ONCE(ort->rt6i_flags & (RTF_CACHE | RTF_PCPU));

	spin_lock_bh(&rt6_exception_lock);

	/* rt6_flush_exceptions() set this flag: don't recreate buckets */
	if (ort->exception_bucket_flushed) {
		err = -EINVAL;
		goto out;
	}

	bucket = rcu_dereference_protected(ort->rt6i_exception_bucket,
					lockdep_is_held(&rt6_exception_lock));
	if (!bucket) {
		bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket),
				 GFP_ATOMIC);
		if (!bucket) {
			err = -ENOMEM;
			goto out;
		}
		rcu_assign_pointer(ort->rt6i_exception_bucket, bucket);
	}

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates ort is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (ort->rt6i_src.plen)
		src_key = &nrt->rt6i_src.addr;
#endif

	/* Update rt6i_prefsrc as it could be changed
	 * in rt6_remove_prefsrc()
	 */
	nrt->rt6i_prefsrc = ort->rt6i_prefsrc;
	/* rt6_mtu_change() might lower mtu on ort.
	 * Only insert this exception route if its mtu
	 * is less than ort's mtu value.
	 */
	if (nrt->rt6i_pmtu >= dst_mtu(&ort->dst)) {
		err = -EINVAL;
		goto out;
	}

	/* Replace any existing exception for the same (dst, src) key */
	rt6_ex = __rt6_find_exception_spinlock(&bucket, &nrt->rt6i_dst.addr,
					       src_key);
	if (rt6_ex)
		rt6_remove_exception(bucket, rt6_ex);

	rt6_ex = kzalloc(sizeof(*rt6_ex), GFP_ATOMIC);
	if (!rt6_ex) {
		err = -ENOMEM;
		goto out;
	}
	rt6_ex->rt6i = nrt;
	rt6_ex->stamp = jiffies;
	atomic_inc(&nrt->rt6i_ref);
	nrt->rt6i_node = ort->rt6i_node;
	hlist_add_head_rcu(&rt6_ex->hlist, &bucket->chain);
	bucket->depth++;
	net->ipv6.rt6_stats->fib_rt_cache++;

	/* Bucket overflow: evict the least-recently-stamped entry */
	if (bucket->depth > FIB6_MAX_DEPTH)
		rt6_exception_remove_oldest(bucket);

out:
	spin_unlock_bh(&rt6_exception_lock);

	/* Update fn->fn_sernum to invalidate all cached dst */
	if (!err) {
		spin_lock_bh(&ort->rt6i_table->tb6_lock);
		fib6_update_sernum(ort);
		spin_unlock_bh(&ort->rt6i_table->tb6_lock);
		fib6_force_start_gc(net);
	}

	return err;
}
1354
/* Delete every exception route hanging off @rt and mark the table so
 * rt6_insert_exception() refuses to recreate it.
 */
void rt6_flush_exceptions(struct rt6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	spin_lock_bh(&rt6_exception_lock);
	/* Prevent rt6_insert_exception() to recreate the bucket list */
	rt->exception_bucket_flushed = 1;

	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
				    lockdep_is_held(&rt6_exception_lock));
	if (!bucket)
		goto out;

	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
		hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist)
			rt6_remove_exception(bucket, rt6_ex);
		/* Every entry should be gone once the chain is drained */
		WARN_ON_ONCE(bucket->depth);
		bucket++;
	}

out:
	spin_unlock_bh(&rt6_exception_lock);
}
1381
/* Find cached rt in the hash table inside passed in rt
 * Caller has to hold rcu_read_lock()
 * Returns NULL when no unexpired exception matches (daddr, saddr).
 */
static struct rt6_info *rt6_find_cached_rt(struct rt6_info *rt,
					   struct in6_addr *daddr,
					   struct in6_addr *saddr)
{
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	struct rt6_info *res = NULL;

	bucket = rcu_dereference(rt->rt6i_exception_bucket);

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates rt is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (rt->rt6i_src.plen)
		src_key = saddr;
#endif
	rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);

	/* Expired entries are skipped here, not removed */
	if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
		res = rt6_ex->rt6i;

	return res;
}
1413
/* Remove the passed in cached rt from the hash table that contains it.
 * Returns 0 on success; -EINVAL when @rt is not an RTF_CACHE clone with
 * a parent route; -ENOENT when no matching exception exists.
 */
int rt6_remove_exception_rt(struct rt6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_info *from = rt->from;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	int err;

	if (!from ||
	    !(rt->rt6i_flags & RTF_CACHE))
		return -EINVAL;

	if (!rcu_access_pointer(from->rt6i_exception_bucket))
		return -ENOENT;

	spin_lock_bh(&rt6_exception_lock);
	bucket = rcu_dereference_protected(from->rt6i_exception_bucket,
				    lockdep_is_held(&rt6_exception_lock));
#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates 'from' is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (from->rt6i_src.plen)
		src_key = &rt->rt6i_src.addr;
#endif
	rt6_ex = __rt6_find_exception_spinlock(&bucket,
					       &rt->rt6i_dst.addr,
					       src_key);
	if (rt6_ex) {
		rt6_remove_exception(bucket, rt6_ex);
		err = 0;
	} else {
		err = -ENOENT;
	}

	spin_unlock_bh(&rt6_exception_lock);
	return err;
}
1456
/* Find rt6_ex which contains the passed in rt cache and
 * refresh its stamp
 * (the stamp drives LRU eviction in rt6_exception_remove_oldest()).
 */
static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_info *from = rt->from;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;

	/* Only RTF_CACHE clones with a parent live in an exception table */
	if (!from ||
	    !(rt->rt6i_flags & RTF_CACHE))
		return;

	rcu_read_lock();
	bucket = rcu_dereference(from->rt6i_exception_bucket);

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates 'from' is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (from->rt6i_src.plen)
		src_key = &rt->rt6i_src.addr;
#endif
	rt6_ex = __rt6_find_exception_rcu(&bucket,
					  &rt->rt6i_dst.addr,
					  src_key);
	if (rt6_ex)
		rt6_ex->stamp = jiffies;

	rcu_read_unlock();
}
1492
1493 static void rt6_exceptions_remove_prefsrc(struct rt6_info *rt)
1494 {
1495         struct rt6_exception_bucket *bucket;
1496         struct rt6_exception *rt6_ex;
1497         int i;
1498
1499         bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1500                                         lockdep_is_held(&rt6_exception_lock));
1501
1502         if (bucket) {
1503                 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1504                         hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1505                                 rt6_ex->rt6i->rt6i_prefsrc.plen = 0;
1506                         }
1507                         bucket++;
1508                 }
1509         }
1510 }
1511
1512 static bool rt6_mtu_change_route_allowed(struct inet6_dev *idev,
1513                                          struct rt6_info *rt, int mtu)
1514 {
1515         /* If the new MTU is lower than the route PMTU, this new MTU will be the
1516          * lowest MTU in the path: always allow updating the route PMTU to
1517          * reflect PMTU decreases.
1518          *
1519          * If the new MTU is higher, and the route PMTU is equal to the local
1520          * MTU, this means the old MTU is the lowest in the path, so allow
1521          * updating it: if other nodes now have lower MTUs, PMTU discovery will
1522          * handle this.
1523          */
1524
1525         if (dst_mtu(&rt->dst) >= mtu)
1526                 return true;
1527
1528         if (dst_mtu(&rt->dst) == idev->cnf.mtu6)
1529                 return true;
1530
1531         return false;
1532 }
1533
/* Propagate a device MTU change to @rt's exception routes, updating
 * each entry's PMTU where rt6_mtu_change_route_allowed() permits.
 * Caller must hold rt6_exception_lock.
 */
static void rt6_exceptions_update_pmtu(struct inet6_dev *idev,
				       struct rt6_info *rt, int mtu)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	int i;

	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
					lockdep_is_held(&rt6_exception_lock));

	if (!bucket)
		return;

	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
		hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
			struct rt6_info *entry = rt6_ex->rt6i;

			/* For RTF_CACHE with rt6i_pmtu == 0 (i.e. a redirected
			 * route), the metrics of its rt->dst.from have already
			 * been updated.
			 */
			if (entry->rt6i_pmtu &&
			    rt6_mtu_change_route_allowed(idev, entry, mtu))
				entry->rt6i_pmtu = mtu;
		}
		bucket++;
	}
}
1562
1563 #define RTF_CACHE_GATEWAY       (RTF_GATEWAY | RTF_CACHE)
1564
/* Remove every gateway (RTF_CACHE|RTF_GATEWAY) exception under @rt
 * whose gateway address equals @gateway.
 */
static void rt6_exceptions_clean_tohost(struct rt6_info *rt,
					struct in6_addr *gateway)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	/* Cheap lockless check before taking the writer lock */
	if (!rcu_access_pointer(rt->rt6i_exception_bucket))
		return;

	spin_lock_bh(&rt6_exception_lock);
	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
				     lockdep_is_held(&rt6_exception_lock));

	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			hlist_for_each_entry_safe(rt6_ex, tmp,
						  &bucket->chain, hlist) {
				struct rt6_info *entry = rt6_ex->rt6i;

				if ((entry->rt6i_flags & RTF_CACHE_GATEWAY) ==
				    RTF_CACHE_GATEWAY &&
				    ipv6_addr_equal(gateway,
						    &entry->rt6i_gateway)) {
					rt6_remove_exception(bucket, rt6_ex);
				}
			}
			bucket++;
		}
	}

	spin_unlock_bh(&rt6_exception_lock);
}
1599
/* Decide the fate of one exception entry during GC: remove it when it
 * aged out, expired, or its gateway neighbour is no longer known to be
 * a router; otherwise account for it via gc_args->more.
 * Caller must hold rt6_exception_lock.
 */
static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket,
				      struct rt6_exception *rt6_ex,
				      struct fib6_gc_args *gc_args,
				      unsigned long now)
{
	struct rt6_info *rt = rt6_ex->rt6i;

	/* we are pruning and obsoleting aged-out and non gateway exceptions
	 * even if others have still references to them, so that on next
	 * dst_check() such references can be dropped.
	 * EXPIRES exceptions - e.g. pmtu-generated ones are pruned when
	 * expired, independently from their aging, as per RFC 8201 section 4
	 */
	if (!(rt->rt6i_flags & RTF_EXPIRES)) {
		if (time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) {
			RT6_TRACE("aging clone %p\n", rt);
			rt6_remove_exception(bucket, rt6_ex);
			return;
		}
	} else if (time_after(jiffies, rt->dst.expires)) {
		RT6_TRACE("purging expired route %p\n", rt);
		rt6_remove_exception(bucket, rt6_ex);
		return;
	}

	if (rt->rt6i_flags & RTF_GATEWAY) {
		struct neighbour *neigh;
		__u8 neigh_flags = 0;

		neigh = dst_neigh_lookup(&rt->dst, &rt->rt6i_gateway);
		if (neigh) {
			neigh_flags = neigh->flags;
			neigh_release(neigh);
		}
		/* Gateway exceptions survive only while the neighbour is
		 * still advertised as a router.
		 */
		if (!(neigh_flags & NTF_ROUTER)) {
			RT6_TRACE("purging route %p via non-router but gateway\n",
				  rt);
			rt6_remove_exception(bucket, rt6_ex);
			return;
		}
	}

	gc_args->more++;
}
1644
/* Garbage-collect the exception table of @rt: examine every entry and
 * let rt6_age_examine_exception() prune stale ones.
 */
void rt6_age_exceptions(struct rt6_info *rt,
			struct fib6_gc_args *gc_args,
			unsigned long now)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	/* Cheap lockless check before taking the writer lock */
	if (!rcu_access_pointer(rt->rt6i_exception_bucket))
		return;

	spin_lock_bh(&rt6_exception_lock);
	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
				    lockdep_is_held(&rt6_exception_lock));

	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			hlist_for_each_entry_safe(rt6_ex, tmp,
						  &bucket->chain, hlist) {
				rt6_age_examine_exception(bucket, rt6_ex,
							  gc_args, now);
			}
			bucket++;
		}
	}
	spin_unlock_bh(&rt6_exception_lock);
}
1673
/* Policy-routing lookup in one fib6 table.
 *
 * Walks @table for @fl6 honouring @oif and @flags and returns a
 * dst-held rt6_info the caller must release.  The result is one of:
 *   - the shared ip6_null_entry when nothing matches,
 *   - a cached (RTF_CACHE) exception route,
 *   - a freshly allocated uncached clone (FLOWI_FLAG_KNOWN_NH case), or
 *   - a per-cpu clone of the fib entry (common case).
 */
struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
			       int oif, struct flowi6 *fl6, int flags)
{
	struct fib6_node *fn, *saved_fn;
	struct rt6_info *rt, *rt_cache;
	int strict = 0;

	strict |= flags & RT6_LOOKUP_F_IFACE;
	strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
	/* When forwarding is globally off, behave like a host and prefer
	 * routers believed to be reachable.
	 */
	if (net->ipv6.devconf_all->forwarding == 0)
		strict |= RT6_LOOKUP_F_REACHABLE;

	rcu_read_lock();

	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
	saved_fn = fn;

	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
		oif = 0;

redo_rt6_select:
	rt = rt6_select(net, fn, oif, strict);
	if (rt->rt6i_nsiblings)
		rt = rt6_multipath_select(rt, fl6, oif, strict);
	if (rt == net->ipv6.ip6_null_entry) {
		/* No match in this node: backtrack towards less specific
		 * prefixes; once exhausted, retry from the original node
		 * without the reachability restriction.
		 */
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto redo_rt6_select;
		else if (strict & RT6_LOOKUP_F_REACHABLE) {
			/* also consider unreachable route */
			strict &= ~RT6_LOOKUP_F_REACHABLE;
			fn = saved_fn;
			goto redo_rt6_select;
		}
	}

	/*Search through exception table */
	rt_cache = rt6_find_cached_rt(rt, &fl6->daddr, &fl6->saddr);
	if (rt_cache)
		rt = rt_cache;

	if (rt == net->ipv6.ip6_null_entry) {
		rcu_read_unlock();
		/* The null entry is permanent; a plain hold is enough. */
		dst_hold(&rt->dst);
		trace_fib6_table_lookup(net, rt, table, fl6);
		return rt;
	} else if (rt->rt6i_flags & RTF_CACHE) {
		if (ip6_hold_safe(net, &rt, true)) {
			dst_use_noref(&rt->dst, jiffies);
			rt6_dst_from_metrics_check(rt);
		}
		rcu_read_unlock();
		trace_fib6_table_lookup(net, rt, table, fl6);
		return rt;
	} else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
			    !(rt->rt6i_flags & RTF_GATEWAY))) {
		/* Create a RTF_CACHE clone which will not be
		 * owned by the fib6 tree.  It is for the special case where
		 * the daddr in the skb during the neighbor look-up is different
		 * from the fl6->daddr used to look-up route here.
		 */

		struct rt6_info *uncached_rt;

		if (ip6_hold_safe(net, &rt, true)) {
			dst_use_noref(&rt->dst, jiffies);
		} else {
			/* Hold failed: rt was replaced by ip6_hold_safe()
			 * (NOTE(review): presumably with a held null entry
			 * — confirm against ip6_hold_safe()).
			 */
			rcu_read_unlock();
			uncached_rt = rt;
			goto uncached_rt_out;
		}
		rcu_read_unlock();

		uncached_rt = ip6_rt_cache_alloc(rt, &fl6->daddr, NULL);
		dst_release(&rt->dst);

		if (uncached_rt) {
			/* Uncached_rt's refcnt is taken during ip6_rt_cache_alloc()
			 * No need for another dst_hold()
			 */
			rt6_uncached_list_add(uncached_rt);
			atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
		} else {
			uncached_rt = net->ipv6.ip6_null_entry;
			dst_hold(&uncached_rt->dst);
		}

uncached_rt_out:
		trace_fib6_table_lookup(net, uncached_rt, table, fl6);
		return uncached_rt;

	} else {
		/* Get a percpu copy */

		struct rt6_info *pcpu_rt;

		dst_use_noref(&rt->dst, jiffies);
		local_bh_disable();
		pcpu_rt = rt6_get_pcpu_route(rt);

		if (!pcpu_rt) {
			/* atomic_inc_not_zero() is needed when using rcu */
			if (atomic_inc_not_zero(&rt->rt6i_ref)) {
				/* No dst_hold() on rt is needed because grabbing
				 * rt->rt6i_ref makes sure rt can't be released.
				 */
				pcpu_rt = rt6_make_pcpu_route(rt);
				rt6_release(rt);
			} else {
				/* rt is already removed from tree */
				pcpu_rt = net->ipv6.ip6_null_entry;
				dst_hold(&pcpu_rt->dst);
			}
		}
		local_bh_enable();
		rcu_read_unlock();
		trace_fib6_table_lookup(net, pcpu_rt, table, fl6);
		return pcpu_rt;
	}
}
EXPORT_SYMBOL_GPL(ip6_pol_route);
1795
1796 static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
1797                                             struct flowi6 *fl6, int flags)
1798 {
1799         return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, flags);
1800 }
1801
1802 struct dst_entry *ip6_route_input_lookup(struct net *net,
1803                                          struct net_device *dev,
1804                                          struct flowi6 *fl6, int flags)
1805 {
1806         if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
1807                 flags |= RT6_LOOKUP_F_IFACE;
1808
1809         return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_input);
1810 }
1811 EXPORT_SYMBOL_GPL(ip6_route_input_lookup);
1812
1813 static void ip6_multipath_l3_keys(const struct sk_buff *skb,
1814                                   struct flow_keys *keys)
1815 {
1816         const struct ipv6hdr *outer_iph = ipv6_hdr(skb);
1817         const struct ipv6hdr *key_iph = outer_iph;
1818         const struct ipv6hdr *inner_iph;
1819         const struct icmp6hdr *icmph;
1820         struct ipv6hdr _inner_iph;
1821
1822         if (likely(outer_iph->nexthdr != IPPROTO_ICMPV6))
1823                 goto out;
1824
1825         icmph = icmp6_hdr(skb);
1826         if (icmph->icmp6_type != ICMPV6_DEST_UNREACH &&
1827             icmph->icmp6_type != ICMPV6_PKT_TOOBIG &&
1828             icmph->icmp6_type != ICMPV6_TIME_EXCEED &&
1829             icmph->icmp6_type != ICMPV6_PARAMPROB)
1830                 goto out;
1831
1832         inner_iph = skb_header_pointer(skb,
1833                                        skb_transport_offset(skb) + sizeof(*icmph),
1834                                        sizeof(_inner_iph), &_inner_iph);
1835         if (!inner_iph)
1836                 goto out;
1837
1838         key_iph = inner_iph;
1839 out:
1840         memset(keys, 0, sizeof(*keys));
1841         keys->control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
1842         keys->addrs.v6addrs.src = key_iph->saddr;
1843         keys->addrs.v6addrs.dst = key_iph->daddr;
1844         keys->tags.flow_label = ip6_flowinfo(key_iph);
1845         keys->basic.ip_proto = key_iph->nexthdr;
1846 }
1847
1848 /* if skb is set it will be used and fl6 can be NULL */
1849 u32 rt6_multipath_hash(const struct flowi6 *fl6, const struct sk_buff *skb)
1850 {
1851         struct flow_keys hash_keys;
1852
1853         if (skb) {
1854                 ip6_multipath_l3_keys(skb, &hash_keys);
1855                 return flow_hash_from_keys(&hash_keys) >> 1;
1856         }
1857
1858         return get_hash_from_flowi6(fl6) >> 1;
1859 }
1860
1861 void ip6_route_input(struct sk_buff *skb)
1862 {
1863         const struct ipv6hdr *iph = ipv6_hdr(skb);
1864         struct net *net = dev_net(skb->dev);
1865         int flags = RT6_LOOKUP_F_HAS_SADDR;
1866         struct ip_tunnel_info *tun_info;
1867         struct flowi6 fl6 = {
1868                 .flowi6_iif = skb->dev->ifindex,
1869                 .daddr = iph->daddr,
1870                 .saddr = iph->saddr,
1871                 .flowlabel = ip6_flowinfo(iph),
1872                 .flowi6_mark = skb->mark,
1873                 .flowi6_proto = iph->nexthdr,
1874         };
1875
1876         tun_info = skb_tunnel_info(skb);
1877         if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
1878                 fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;
1879         if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6))
1880                 fl6.mp_hash = rt6_multipath_hash(&fl6, skb);
1881         skb_dst_drop(skb);
1882         skb_dst_set(skb, ip6_route_input_lookup(net, skb->dev, &fl6, flags));
1883 }
1884
1885 static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table,
1886                                              struct flowi6 *fl6, int flags)
1887 {
1888         return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, flags);
1889 }
1890
1891 struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
1892                                          struct flowi6 *fl6, int flags)
1893 {
1894         bool any_src;
1895
1896         if (rt6_need_strict(&fl6->daddr)) {
1897                 struct dst_entry *dst;
1898
1899                 dst = l3mdev_link_scope_lookup(net, fl6);
1900                 if (dst)
1901                         return dst;
1902         }
1903
1904         fl6->flowi6_iif = LOOPBACK_IFINDEX;
1905
1906         any_src = ipv6_addr_any(&fl6->saddr);
1907         if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) ||
1908             (fl6->flowi6_oif && any_src))
1909                 flags |= RT6_LOOKUP_F_IFACE;
1910
1911         if (!any_src)
1912                 flags |= RT6_LOOKUP_F_HAS_SADDR;
1913         else if (sk)
1914                 flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
1915
1916         return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_output);
1917 }
1918 EXPORT_SYMBOL_GPL(ip6_route_output_flags);
1919
1920 struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
1921 {
1922         struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
1923         struct net_device *loopback_dev = net->loopback_dev;
1924         struct dst_entry *new = NULL;
1925
1926         rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev, 1,
1927                        DST_OBSOLETE_DEAD, 0);
1928         if (rt) {
1929                 rt6_info_init(rt);
1930                 atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
1931
1932                 new = &rt->dst;
1933                 new->__use = 1;
1934                 new->input = dst_discard;
1935                 new->output = dst_discard_out;
1936
1937                 dst_copy_metrics(new, &ort->dst);
1938
1939                 rt->rt6i_idev = in6_dev_get(loopback_dev);
1940                 rt->rt6i_gateway = ort->rt6i_gateway;
1941                 rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU;
1942                 rt->rt6i_metric = 0;
1943
1944                 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
1945 #ifdef CONFIG_IPV6_SUBTREES
1946                 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1947 #endif
1948         }
1949
1950         dst_release(dst_orig);
1951         return new ? new : ERR_PTR(-ENOMEM);
1952 }
1953
1954 /*
1955  *      Destination cache support functions
1956  */
1957
1958 static void rt6_dst_from_metrics_check(struct rt6_info *rt)
1959 {
1960         if (rt->from &&
1961             dst_metrics_ptr(&rt->dst) != dst_metrics_ptr(&rt->from->dst))
1962                 dst_init_metrics(&rt->dst, dst_metrics_ptr(&rt->from->dst), true);
1963 }
1964
1965 static struct dst_entry *rt6_check(struct rt6_info *rt, u32 cookie)
1966 {
1967         u32 rt_cookie = 0;
1968
1969         if (!rt6_get_cookie_safe(rt, &rt_cookie) || rt_cookie != cookie)
1970                 return NULL;
1971
1972         if (rt6_check_expired(rt))
1973                 return NULL;
1974
1975         return &rt->dst;
1976 }
1977
1978 static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt, u32 cookie)
1979 {
1980         if (!__rt6_check_expired(rt) &&
1981             rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1982             rt6_check(rt->from, cookie))
1983                 return &rt->dst;
1984         else
1985                 return NULL;
1986 }
1987
1988 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
1989 {
1990         struct rt6_info *rt;
1991
1992         rt = (struct rt6_info *) dst;
1993
1994         /* All IPV6 dsts are created with ->obsolete set to the value
1995          * DST_OBSOLETE_FORCE_CHK which forces validation calls down
1996          * into this function always.
1997          */
1998
1999         rt6_dst_from_metrics_check(rt);
2000
2001         if (rt->rt6i_flags & RTF_PCPU ||
2002             (unlikely(!list_empty(&rt->rt6i_uncached)) && rt->from))
2003                 return rt6_dst_from_check(rt, cookie);
2004         else
2005                 return rt6_check(rt, cookie);
2006 }
2007
/* dst_ops->negative_advice callback: an upper layer (e.g. TCP) hit
 * trouble using this route.  Returns the dst the caller should keep
 * (NULL means "drop it and look up again").
 */
static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
{
	struct rt6_info *rt = (struct rt6_info *) dst;

	if (rt) {
		if (rt->rt6i_flags & RTF_CACHE) {
			if (rt6_check_expired(rt)) {
				/* NOTE(review): no dst_release() here —
				 * ip6_del_rt() appears to take over the
				 * caller's reference; confirm.
				 */
				ip6_del_rt(rt);
				dst = NULL;
			}
		} else {
			/* Non-cached route: just detach from it. */
			dst_release(dst);
			dst = NULL;
		}
	}
	return dst;
}
2025
/* dst_ops->link_failure callback: neighbour resolution failed on this
 * route.  Report unreachability to the sender and invalidate the
 * route so later lookups do not pick it again.
 */
static void ip6_link_failure(struct sk_buff *skb)
{
	struct rt6_info *rt;

	icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);

	rt = (struct rt6_info *) skb_dst(skb);
	if (rt) {
		if (rt->rt6i_flags & RTF_CACHE) {
			/* Delete the cached clone; dst_hold_safe() guards
			 * against racing with a concurrent free.
			 */
			if (dst_hold_safe(&rt->dst))
				ip6_del_rt(rt);
		} else {
			struct fib6_node *fn;

			/* Poison the node's serial number so that
			 * cookie-based validation (rt6_check()) fails for
			 * dsts derived from this default route.
			 */
			rcu_read_lock();
			fn = rcu_dereference(rt->rt6i_node);
			if (fn && (rt->rt6i_flags & RTF_DEFAULT))
				fn->fn_sernum = -1;
			rcu_read_unlock();
		}
	}
}
2048
2049 static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
2050 {
2051         struct net *net = dev_net(rt->dst.dev);
2052
2053         rt->rt6i_flags |= RTF_MODIFIED;
2054         rt->rt6i_pmtu = mtu;
2055         rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
2056 }
2057
2058 static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt)
2059 {
2060         return !(rt->rt6i_flags & RTF_CACHE) &&
2061                 (rt->rt6i_flags & RTF_PCPU ||
2062                  rcu_access_pointer(rt->rt6i_node));
2063 }
2064
/* Core PMTU update.  @iph and @sk (either may be NULL) supply the
 * addresses used to confirm the neighbour and to key a cached clone.
 * The MTU is clamped to at least IPV6_MIN_MTU and only ever lowered.
 */
static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
				 const struct ipv6hdr *iph, u32 mtu)
{
	const struct in6_addr *daddr, *saddr;
	struct rt6_info *rt6 = (struct rt6_info *)dst;

	/* Local routes never carry a path MTU. */
	if (rt6->rt6i_flags & RTF_LOCAL)
		return;

	/* An administratively locked MTU must not be changed. */
	if (dst_metric_locked(dst, RTAX_MTU))
		return;

	/* Addresses: prefer the packet header, fall back to the socket. */
	if (iph) {
		daddr = &iph->daddr;
		saddr = &iph->saddr;
	} else if (sk) {
		daddr = &sk->sk_v6_daddr;
		saddr = &inet6_sk(sk)->saddr;
	} else {
		daddr = NULL;
		saddr = NULL;
	}
	dst_confirm_neigh(dst, daddr);
	mtu = max_t(u32, mtu, IPV6_MIN_MTU);
	if (mtu >= dst_mtu(dst))
		return;

	if (!rt6_cache_allowed_for_pmtu(rt6)) {
		/* Not suitable for a new exception entry (already a
		 * cached clone, or detached from the tree): update in
		 * place.
		 */
		rt6_do_update_pmtu(rt6, mtu);
		/* update rt6_ex->stamp for cache */
		if (rt6->rt6i_flags & RTF_CACHE)
			rt6_update_exception_stamp_rt(rt6);
	} else if (daddr) {
		/* Shared fib route: record the PMTU in a cached clone
		 * inserted into the parent's exception table.
		 */
		struct rt6_info *nrt6;

		nrt6 = ip6_rt_cache_alloc(rt6, daddr, saddr);
		if (nrt6) {
			rt6_do_update_pmtu(nrt6, mtu);
			if (rt6_insert_exception(nrt6, rt6))
				dst_release_immediate(&nrt6->dst);
		}
	}
}
2108
2109 static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
2110                                struct sk_buff *skb, u32 mtu)
2111 {
2112         __ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu);
2113 }
2114
2115 void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
2116                      int oif, u32 mark, kuid_t uid)
2117 {
2118         const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
2119         struct dst_entry *dst;
2120         struct flowi6 fl6;
2121
2122         memset(&fl6, 0, sizeof(fl6));
2123         fl6.flowi6_oif = oif;
2124         fl6.flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark);
2125         fl6.daddr = iph->daddr;
2126         fl6.saddr = iph->saddr;
2127         fl6.flowlabel = ip6_flowinfo(iph);
2128         fl6.flowi6_uid = uid;
2129
2130         dst = ip6_route_output(net, NULL, &fl6);
2131         if (!dst->error)
2132                 __ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu));
2133         dst_release(dst);
2134 }
2135 EXPORT_SYMBOL_GPL(ip6_update_pmtu);
2136
/* Socket-context PMTU update: apply the new @mtu to the route used by
 * @sk, then refresh the socket's cached dst if it became invalid.
 */
void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
{
	struct dst_entry *dst;

	ip6_update_pmtu(skb, sock_net(sk), mtu,
			sk->sk_bound_dev_if, sk->sk_mark, sk->sk_uid);

	/* Nothing more to do if the cached route still validates. */
	dst = __sk_dst_get(sk);
	if (!dst || !dst->obsolete ||
	    dst->ops->check(dst, inet6_sk(sk)->dst_cookie))
		return;

	/* Re-route the socket unless it is owned by user context or is
	 * really an IPv4-mapped connection.
	 */
	bh_lock_sock(sk);
	if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr))
		ip6_datagram_dst_update(sk, false);
	bh_unlock_sock(sk);
}
EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);
2155
/* Handle redirects */

/* Flow descriptor extended with the redirecting router's address so
 * __ip6_route_redirect() can validate the redirect source.  It is
 * passed where a plain struct flowi6 is expected, so fl6 must remain
 * the first member.
 */
struct ip6rd_flowi {
	struct flowi6 fl6;
	struct in6_addr gateway;
};
2161
/* Table lookup used when processing an ICMPv6 redirect (called via
 * fib6_rule_lookup() with a struct ip6rd_flowi).
 *
 * Get the "current" route for this destination and
 * check if the redirect has come from appropriate router.
 *
 * RFC 4861 specifies that redirects should only be
 * accepted if they come from the nexthop to the target.
 * Due to the way the routes are chosen, this notion
 * is a bit fuzzy and one might need to check all possible
 * routes.
 */
static struct rt6_info *__ip6_route_redirect(struct net *net,
					     struct fib6_table *table,
					     struct flowi6 *fl6,
					     int flags)
{
	struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
	struct rt6_info *rt, *rt_cache;
	struct fib6_node *fn;

	rcu_read_lock();
	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
restart:
	for_each_fib6_node_rt_rcu(fn) {
		/* Skip dead, expired and non-gateway routes, and those
		 * not going out the interface the redirect arrived on.
		 */
		if (rt->rt6i_nh_flags & RTNH_F_DEAD)
			continue;
		if (rt6_check_expired(rt))
			continue;
		if (rt->dst.error)
			break;
		if (!(rt->rt6i_flags & RTF_GATEWAY))
			continue;
		if (fl6->flowi6_oif != rt->dst.dev->ifindex)
			continue;
		/* rt_cache's gateway might be different from its 'parent'
		 * in the case of an ip redirect.
		 * So we keep searching in the exception table if the gateway
		 * is different.
		 */
		if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway)) {
			rt_cache = rt6_find_cached_rt(rt,
						      &fl6->daddr,
						      &fl6->saddr);
			if (rt_cache &&
			    ipv6_addr_equal(&rdfl->gateway,
					    &rt_cache->rt6i_gateway)) {
				rt = rt_cache;
				break;
			}
			continue;
		}
		break;
	}

	if (!rt)
		rt = net->ipv6.ip6_null_entry;
	else if (rt->dst.error) {
		rt = net->ipv6.ip6_null_entry;
		goto out;
	}

	/* Nothing matched in this node: climb towards less specific
	 * prefixes and retry.
	 */
	if (rt == net->ipv6.ip6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto restart;
	}

out:
	/* Take a reference before leaving the RCU section.
	 * NOTE(review): ip6_hold_safe() takes &rt and presumably
	 * substitutes a held fallback when the hold fails — confirm.
	 */
	ip6_hold_safe(net, &rt, true);

	rcu_read_unlock();

	trace_fib6_table_lookup(net, rt, table, fl6);
	return rt;
};
2236
2237 static struct dst_entry *ip6_route_redirect(struct net *net,
2238                                         const struct flowi6 *fl6,
2239                                         const struct in6_addr *gateway)
2240 {
2241         int flags = RT6_LOOKUP_F_HAS_SADDR;
2242         struct ip6rd_flowi rdfl;
2243
2244         rdfl.fl6 = *fl6;
2245         rdfl.gateway = *gateway;
2246
2247         return fib6_rule_lookup(net, &rdfl.fl6,
2248                                 flags, __ip6_route_redirect);
2249 }
2250
2251 void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark,
2252                   kuid_t uid)
2253 {
2254         const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
2255         struct dst_entry *dst;
2256         struct flowi6 fl6;
2257
2258         memset(&fl6, 0, sizeof(fl6));
2259         fl6.flowi6_iif = LOOPBACK_IFINDEX;
2260         fl6.flowi6_oif = oif;
2261         fl6.flowi6_mark = mark;
2262         fl6.daddr = iph->daddr;
2263         fl6.saddr = iph->saddr;
2264         fl6.flowlabel = ip6_flowinfo(iph);
2265         fl6.flowi6_uid = uid;
2266
2267         dst = ip6_route_redirect(net, &fl6, &ipv6_hdr(skb)->saddr);
2268         rt6_do_redirect(dst, NULL, skb);
2269         dst_release(dst);
2270 }
2271 EXPORT_SYMBOL_GPL(ip6_redirect);
2272
2273 void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif,
2274                             u32 mark)
2275 {
2276         const struct ipv6hdr *iph = ipv6_hdr(skb);
2277         const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb);
2278         struct dst_entry *dst;
2279         struct flowi6 fl6;
2280
2281         memset(&fl6, 0, sizeof(fl6));
2282         fl6.flowi6_iif = LOOPBACK_IFINDEX;
2283         fl6.flowi6_oif = oif;
2284         fl6.flowi6_mark = mark;
2285         fl6.daddr = msg->dest;
2286         fl6.saddr = iph->daddr;
2287         fl6.flowi6_uid = sock_net_uid(net, NULL);
2288
2289         dst = ip6_route_redirect(net, &fl6, &iph->saddr);
2290         rt6_do_redirect(dst, NULL, skb);
2291         dst_release(dst);
2292 }
2293
2294 void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
2295 {
2296         ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark,
2297                      sk->sk_uid);
2298 }
2299 EXPORT_SYMBOL_GPL(ip6_sk_redirect);
2300
2301 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
2302 {
2303         struct net_device *dev = dst->dev;
2304         unsigned int mtu = dst_mtu(dst);
2305         struct net *net = dev_net(dev);
2306
2307         mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
2308
2309         if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
2310                 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
2311
2312         /*
2313          * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
2314          * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
2315          * IPV6_MAXPLEN is also valid and means: "any MSS,
2316          * rely only on pmtu discovery"
2317          */
2318         if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
2319                 mtu = IPV6_MAXPLEN;
2320         return mtu;
2321 }
2322
2323 static unsigned int ip6_mtu(const struct dst_entry *dst)
2324 {
2325         const struct rt6_info *rt = (const struct rt6_info *)dst;
2326         unsigned int mtu = rt->rt6i_pmtu;
2327         struct inet6_dev *idev;
2328
2329         if (mtu)
2330                 goto out;
2331
2332         mtu = dst_metric_raw(dst, RTAX_MTU);
2333         if (mtu)
2334                 goto out;
2335
2336         mtu = IPV6_MIN_MTU;
2337
2338         rcu_read_lock();
2339         idev = __in6_dev_get(dst->dev);
2340         if (idev)
2341                 mtu = idev->cnf.mtu6;
2342         rcu_read_unlock();
2343
2344 out:
2345         mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
2346
2347         return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
2348 }
2349
/* Allocate a standalone (uncached) route for sending an ICMPv6 packet
 * to fl6->daddr via @dev.  The route is not inserted in any fib table;
 * it is put on the uncached list so device teardown can still find it.
 * Returns an ERR_PTR on failure.
 */
struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
				  struct flowi6 *fl6)
{
	struct dst_entry *dst;
	struct rt6_info *rt;
	struct inet6_dev *idev = in6_dev_get(dev);
	struct net *net = dev_net(dev);

	if (unlikely(!idev))
		return ERR_PTR(-ENODEV);

	rt = ip6_dst_alloc(net, dev, 0);
	if (unlikely(!rt)) {
		/* Drop the idev reference taken above. */
		in6_dev_put(idev);
		dst = ERR_PTR(-ENOMEM);
		goto out;
	}

	/* Host route to the destination; idev ref ownership moves to rt. */
	rt->dst.flags |= DST_HOST;
	rt->dst.input = ip6_input;
	rt->dst.output  = ip6_output;
	rt->rt6i_gateway  = fl6->daddr;
	rt->rt6i_dst.addr = fl6->daddr;
	rt->rt6i_dst.plen = 128;
	rt->rt6i_idev     = idev;
	dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);

	/* Add this dst into uncached_list so that rt6_disable_ip() can
	 * do proper release of the net_device
	 */
	rt6_uncached_list_add(rt);
	atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);

	/* Let IPsec transform the route if a matching policy exists. */
	dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);

out:
	return dst;
}
2388
/* dst_ops->gc callback, invoked under allocation pressure.  Runs the
 * fib6 garbage collector with an adaptive expiry horizon and returns
 * non-zero while the table is still over ip6_rt_max_size.
 */
static int ip6_dst_gc(struct dst_ops *ops)
{
	struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
	int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
	int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
	int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
	int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
	unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
	int entries;

	entries = dst_entries_get_fast(ops);
	/* Rate limit: skip if we collected recently and are below the
	 * hard cap.
	 */
	if (time_after(rt_last_gc + rt_min_interval, jiffies) &&
	    entries <= rt_max_size)
		goto out;

	/* Each consecutive run widens the expiry horizon; it is reset
	 * once the entry count drops below the threshold.
	 */
	net->ipv6.ip6_rt_gc_expire++;
	fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true);
	entries = dst_entries_get_slow(ops);
	if (entries < ops->gc_thresh)
		net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
out:
	/* Exponential decay of the horizon while pressure is low. */
	net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
	return entries > rt_max_size;
}
2413
2414 static int ip6_convert_metrics(struct mx6_config *mxc,
2415                                const struct fib6_config *cfg)
2416 {
2417         struct net *net = cfg->fc_nlinfo.nl_net;
2418         bool ecn_ca = false;
2419         struct nlattr *nla;
2420         int remaining;
2421         u32 *mp;
2422
2423         if (!cfg->fc_mx)
2424                 return 0;
2425
2426         mp = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL);
2427         if (unlikely(!mp))
2428                 return -ENOMEM;
2429
2430         nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
2431                 int type = nla_type(nla);
2432                 u32 val;
2433
2434                 if (!type)
2435                         continue;
2436                 if (unlikely(type > RTAX_MAX))
2437                         goto err;
2438
2439                 if (type == RTAX_CC_ALGO) {
2440                         char tmp[TCP_CA_NAME_MAX];
2441
2442                         nla_strlcpy(tmp, nla, sizeof(tmp));
2443                         val = tcp_ca_get_key_by_name(net, tmp, &ecn_ca);
2444                         if (val == TCP_CA_UNSPEC)
2445                                 goto err;
2446                 } else {
2447                         val = nla_get_u32(nla);
2448                 }
2449                 if (type == RTAX_HOPLIMIT && val > 255)
2450                         val = 255;
2451                 if (type == RTAX_FEATURES && (val & ~RTAX_FEATURE_MASK))
2452                         goto err;
2453
2454                 mp[type - 1] = val;
2455                 __set_bit(type - 1, mxc->mx_valid);
2456         }
2457
2458         if (ecn_ca) {
2459                 __set_bit(RTAX_FEATURES - 1, mxc->mx_valid);
2460                 mp[RTAX_FEATURES - 1] |= DST_FEATURE_ECN_CA;
2461         }
2462
2463         mxc->mx = mp;
2464         return 0;
2465  err:
2466         kfree(mp);
2467         return -EINVAL;
2468 }
2469
2470 static struct rt6_info *ip6_nh_lookup_table(struct net *net,
2471                                             struct fib6_config *cfg,
2472                                             const struct in6_addr *gw_addr,
2473                                             u32 tbid, int flags)
2474 {
2475         struct flowi6 fl6 = {
2476                 .flowi6_oif = cfg->fc_ifindex,
2477                 .daddr = *gw_addr,
2478                 .saddr = cfg->fc_prefsrc,
2479         };
2480         struct fib6_table *table;
2481         struct rt6_info *rt;
2482
2483         table = fib6_get_table(net, tbid);
2484         if (!table)
2485                 return NULL;
2486
2487         if (!ipv6_addr_any(&cfg->fc_prefsrc))
2488                 flags |= RT6_LOOKUP_F_HAS_SADDR;
2489
2490         flags |= RT6_LOOKUP_F_IGNORE_LINKSTATE;
2491         rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, flags);
2492
2493         /* if table lookup failed, fall back to full lookup */
2494         if (rt == net->ipv6.ip6_null_entry) {
2495                 ip6_rt_put(rt);
2496                 rt = NULL;
2497         }
2498
2499         return rt;
2500 }
2501
2502 static int ip6_route_check_nh_onlink(struct net *net,
2503                                      struct fib6_config *cfg,
2504                                      struct net_device *dev,
2505                                      struct netlink_ext_ack *extack)
2506 {
2507         u32 tbid = l3mdev_fib_table(dev) ? : RT_TABLE_MAIN;
2508         const struct in6_addr *gw_addr = &cfg->fc_gateway;
2509         u32 flags = RTF_LOCAL | RTF_ANYCAST | RTF_REJECT;
2510         struct rt6_info *grt;
2511         int err;
2512
2513         err = 0;
2514         grt = ip6_nh_lookup_table(net, cfg, gw_addr, tbid, 0);
2515         if (grt) {
2516                 if (!grt->dst.error &&
2517                     (grt->rt6i_flags & flags || dev != grt->dst.dev)) {
2518                         NL_SET_ERR_MSG(extack,
2519                                        "Nexthop has invalid gateway or device mismatch");
2520                         err = -EINVAL;
2521                 }
2522
2523                 ip6_rt_put(grt);
2524         }
2525
2526         return err;
2527 }
2528
/* Validate a new route's gateway (non-onlink case) by resolving it.
 * Returns 0 when @gw_addr resolves to a non-gateway route on a usable
 * device; -EHOSTUNREACH otherwise.  When *@_dev was NULL and the
 * lookup succeeds, *@_dev and *@idev are filled in with references
 * the caller must drop.
 */
static int ip6_route_check_nh(struct net *net,
			      struct fib6_config *cfg,
			      struct net_device **_dev,
			      struct inet6_dev **idev)
{
	const struct in6_addr *gw_addr = &cfg->fc_gateway;
	struct net_device *dev = _dev ? *_dev : NULL;
	struct rt6_info *grt = NULL;
	int err = -EHOSTUNREACH;

	/* First try a lookup confined to the route's own table... */
	if (cfg->fc_table) {
		int flags = RT6_LOOKUP_F_IFACE;

		grt = ip6_nh_lookup_table(net, cfg, gw_addr,
					  cfg->fc_table, flags);
		if (grt) {
			/* A result that itself needs a gateway, or uses
			 * a different device, does not qualify.
			 */
			if (grt->rt6i_flags & RTF_GATEWAY ||
			    (dev && dev != grt->dst.dev)) {
				ip6_rt_put(grt);
				grt = NULL;
			}
		}
	}

	/* ...then fall back to a full routing lookup. */
	if (!grt)
		grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, 1);

	if (!grt)
		goto out;

	if (dev) {
		if (dev != grt->dst.dev) {
			ip6_rt_put(grt);
			goto out;
		}
	} else {
		/* Adopt the device the gateway resolves through; both
		 * references are handed to the caller.
		 */
		*_dev = dev = grt->dst.dev;
		*idev = grt->rt6i_idev;
		dev_hold(dev);
		in6_dev_hold(grt->rt6i_idev);
	}

	if (!(grt->rt6i_flags & RTF_GATEWAY))
		err = 0;

	ip6_rt_put(grt);

out:
	return err;
}
2579
/* Build (but do not insert) an rt6_info from a fib6_config.
 * Validates the config, resolves the egress device/idev and table,
 * and fills in the dst input/output handlers.  On success the returned
 * route holds references on its device and idev; on failure an
 * ERR_PTR is returned and every reference taken here is dropped.
 */
static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg,
					      struct netlink_ext_ack *extack)
{
	struct net *net = cfg->fc_nlinfo.nl_net;
	struct rt6_info *rt = NULL;
	struct net_device *dev = NULL;
	struct inet6_dev *idev = NULL;
	struct fib6_table *table;
	int addr_type;
	int err = -EINVAL;

	/* RTF_PCPU is an internal flag; can not be set by userspace */
	if (cfg->fc_flags & RTF_PCPU) {
		NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU");
		goto out;
	}

	/* RTF_CACHE is an internal flag; can not be set by userspace */
	if (cfg->fc_flags & RTF_CACHE) {
		NL_SET_ERR_MSG(extack, "Userspace can not set RTF_CACHE");
		goto out;
	}

	if (cfg->fc_dst_len > 128) {
		NL_SET_ERR_MSG(extack, "Invalid prefix length");
		goto out;
	}
	if (cfg->fc_src_len > 128) {
		NL_SET_ERR_MSG(extack, "Invalid source address length");
		goto out;
	}
#ifndef CONFIG_IPV6_SUBTREES
	if (cfg->fc_src_len) {
		NL_SET_ERR_MSG(extack,
			       "Specifying source address requires IPV6_SUBTREES to be enabled");
		goto out;
	}
#endif
	/* resolve the requested egress interface, taking dev/idev refs */
	if (cfg->fc_ifindex) {
		err = -ENODEV;
		dev = dev_get_by_index(net, cfg->fc_ifindex);
		if (!dev)
			goto out;
		idev = in6_dev_get(dev);
		if (!idev)
			goto out;
	}

	if (cfg->fc_metric == 0)
		cfg->fc_metric = IP6_RT_PRIO_USER;

	/* onlink nexthops require an explicit, up device */
	if (cfg->fc_flags & RTNH_F_ONLINK) {
		if (!dev) {
			NL_SET_ERR_MSG(extack,
				       "Nexthop device required for onlink");
			err = -ENODEV;
			goto out;
		}

		if (!(dev->flags & IFF_UP)) {
			NL_SET_ERR_MSG(extack, "Nexthop device is not up");
			err = -ENETDOWN;
			goto out;
		}
	}

	/* pick the FIB table; without NLM_F_CREATE we only warn when the
	 * table does not exist yet, for backwards compatibility
	 */
	err = -ENOBUFS;
	if (cfg->fc_nlinfo.nlh &&
	    !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
		table = fib6_get_table(net, cfg->fc_table);
		if (!table) {
			pr_warn("NLM_F_CREATE should be specified when creating new route\n");
			table = fib6_new_table(net, cfg->fc_table);
		}
	} else {
		table = fib6_new_table(net, cfg->fc_table);
	}

	if (!table)
		goto out;

	rt = ip6_dst_alloc(net, NULL,
			   (cfg->fc_flags & RTF_ADDRCONF) ? 0 : DST_NOCOUNT);

	if (!rt) {
		err = -ENOMEM;
		goto out;
	}

	if (cfg->fc_flags & RTF_EXPIRES)
		rt6_set_expires(rt, jiffies +
				clock_t_to_jiffies(cfg->fc_expires));
	else
		rt6_clean_expires(rt);

	if (cfg->fc_protocol == RTPROT_UNSPEC)
		cfg->fc_protocol = RTPROT_BOOT;
	rt->rt6i_protocol = cfg->fc_protocol;

	addr_type = ipv6_addr_type(&cfg->fc_dst);

	/* choose the input handler from the destination's type */
	if (addr_type & IPV6_ADDR_MULTICAST)
		rt->dst.input = ip6_mc_input;
	else if (cfg->fc_flags & RTF_LOCAL)
		rt->dst.input = ip6_input;
	else
		rt->dst.input = ip6_forward;

	rt->dst.output = ip6_output;

	/* lightweight tunnel encapsulation may redirect input/output */
	if (cfg->fc_encap) {
		struct lwtunnel_state *lwtstate;

		err = lwtunnel_build_state(cfg->fc_encap_type,
					   cfg->fc_encap, AF_INET6, cfg,
					   &lwtstate, extack);
		if (err)
			goto out;
		rt->dst.lwtstate = lwtstate_get(lwtstate);
		if (lwtunnel_output_redirect(rt->dst.lwtstate)) {
			rt->dst.lwtstate->orig_output = rt->dst.output;
			rt->dst.output = lwtunnel_output;
		}
		if (lwtunnel_input_redirect(rt->dst.lwtstate)) {
			rt->dst.lwtstate->orig_input = rt->dst.input;
			rt->dst.input = lwtunnel_input;
		}
	}

	ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
	rt->rt6i_dst.plen = cfg->fc_dst_len;
	if (rt->rt6i_dst.plen == 128)
		rt->dst.flags |= DST_HOST;

#ifdef CONFIG_IPV6_SUBTREES
	ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
	rt->rt6i_src.plen = cfg->fc_src_len;
#endif

	rt->rt6i_metric = cfg->fc_metric;
	rt->rt6i_nh_weight = 1;

	/* We cannot add true routes via loopback here,
	   they would result in kernel looping; promote them to reject routes
	 */
	if ((cfg->fc_flags & RTF_REJECT) ||
	    (dev && (dev->flags & IFF_LOOPBACK) &&
	     !(addr_type & IPV6_ADDR_LOOPBACK) &&
	     !(cfg->fc_flags & RTF_LOCAL))) {
		/* hold loopback dev/idev if we haven't done so. */
		if (dev != net->loopback_dev) {
			if (dev) {
				dev_put(dev);
				in6_dev_put(idev);
			}
			dev = net->loopback_dev;
			dev_hold(dev);
			idev = in6_dev_get(dev);
			if (!idev) {
				err = -ENODEV;
				goto out;
			}
		}
		rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
		/* map the route type to the dst error/handlers used when
		 * a packet matches this reject route
		 */
		switch (cfg->fc_type) {
		case RTN_BLACKHOLE:
			rt->dst.error = -EINVAL;
			rt->dst.output = dst_discard_out;
			rt->dst.input = dst_discard;
			break;
		case RTN_PROHIBIT:
			rt->dst.error = -EACCES;
			rt->dst.output = ip6_pkt_prohibit_out;
			rt->dst.input = ip6_pkt_prohibit;
			break;
		case RTN_THROW:
		case RTN_UNREACHABLE:
		default:
			rt->dst.error = (cfg->fc_type == RTN_THROW) ? -EAGAIN
					: (cfg->fc_type == RTN_UNREACHABLE)
					? -EHOSTUNREACH : -ENETUNREACH;
			rt->dst.output = ip6_pkt_discard_out;
			rt->dst.input = ip6_pkt_discard;
			break;
		}
		goto install_route;
	}

	if (cfg->fc_flags & RTF_GATEWAY) {
		const struct in6_addr *gw_addr;
		int gwa_type;

		gw_addr = &cfg->fc_gateway;
		gwa_type = ipv6_addr_type(gw_addr);

		/* if gw_addr is local we will fail to detect this in case
		 * address is still TENTATIVE (DAD in progress). rt6_lookup()
		 * will return already-added prefix route via interface that
		 * prefix route was assigned to, which might be non-loopback.
		 */
		err = -EINVAL;
		if (ipv6_chk_addr_and_flags(net, gw_addr,
					    gwa_type & IPV6_ADDR_LINKLOCAL ?
					    dev : NULL, 0, 0)) {
			NL_SET_ERR_MSG(extack, "Invalid gateway address");
			goto out;
		}
		rt->rt6i_gateway = *gw_addr;

		if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
			/* IPv6 strictly inhibits using not link-local
			   addresses as nexthop address.
			   Otherwise, router will not able to send redirects.
			   It is very good, but in some (rare!) circumstances
			   (SIT, PtP, NBMA NOARP links) it is handy to allow
			   some exceptions. --ANK
			   We allow IPv4-mapped nexthops to support RFC4798-type
			   addressing
			 */
			if (!(gwa_type & (IPV6_ADDR_UNICAST |
					  IPV6_ADDR_MAPPED))) {
				NL_SET_ERR_MSG(extack,
					       "Invalid gateway address");
				goto out;
			}

			/* onlink nexthops are validated against the device's
			 * table; others may also resolve the device for us
			 */
			if (cfg->fc_flags & RTNH_F_ONLINK) {
				err = ip6_route_check_nh_onlink(net, cfg, dev,
								extack);
			} else {
				err = ip6_route_check_nh(net, cfg, &dev, &idev);
			}
			if (err)
				goto out;
		}
		err = -EINVAL;
		if (!dev) {
			NL_SET_ERR_MSG(extack, "Egress device not specified");
			goto out;
		} else if (dev->flags & IFF_LOOPBACK) {
			NL_SET_ERR_MSG(extack,
				       "Egress device can not be loopback device for this route");
			goto out;
		}
	}

	err = -ENODEV;
	if (!dev)
		goto out;

	if (!(dev->flags & IFF_UP)) {
		NL_SET_ERR_MSG(extack, "Nexthop device is not up");
		err = -ENETDOWN;
		goto out;
	}

	/* a preferred source address must be configured on the device */
	if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
		if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
			NL_SET_ERR_MSG(extack, "Invalid source address");
			err = -EINVAL;
			goto out;
		}
		rt->rt6i_prefsrc.addr = cfg->fc_prefsrc;
		rt->rt6i_prefsrc.plen = 128;
	} else
		rt->rt6i_prefsrc.plen = 0;

	rt->rt6i_flags = cfg->fc_flags;

install_route:
	/* mark linkdown now so dumps reflect carrier state at add time */
	if (!(rt->rt6i_flags & (RTF_LOCAL | RTF_ANYCAST)) &&
	    !netif_carrier_ok(dev))
		rt->rt6i_nh_flags |= RTNH_F_LINKDOWN;
	rt->rt6i_nh_flags |= (cfg->fc_flags & RTNH_F_ONLINK);
	/* route takes ownership of the dev/idev references held here */
	rt->dst.dev = dev;
	rt->rt6i_idev = idev;
	rt->rt6i_table = table;

	cfg->fc_nlinfo.nl_net = dev_net(dev);

	return rt;
out:
	if (dev)
		dev_put(dev);
	if (idev)
		in6_dev_put(idev);
	if (rt)
		dst_release_immediate(&rt->dst);

	return ERR_PTR(err);
}
2871
2872 int ip6_route_add(struct fib6_config *cfg,
2873                   struct netlink_ext_ack *extack)
2874 {
2875         struct mx6_config mxc = { .mx = NULL, };
2876         struct rt6_info *rt;
2877         int err;
2878
2879         rt = ip6_route_info_create(cfg, extack);
2880         if (IS_ERR(rt)) {
2881                 err = PTR_ERR(rt);
2882                 rt = NULL;
2883                 goto out;
2884         }
2885
2886         err = ip6_convert_metrics(&mxc, cfg);
2887         if (err)
2888                 goto out;
2889
2890         err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, &mxc, extack);
2891
2892         kfree(mxc.mx);
2893
2894         return err;
2895 out:
2896         if (rt)
2897                 dst_release_immediate(&rt->dst);
2898
2899         return err;
2900 }
2901
2902 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
2903 {
2904         int err;
2905         struct fib6_table *table;
2906         struct net *net = dev_net(rt->dst.dev);
2907
2908         if (rt == net->ipv6.ip6_null_entry) {
2909                 err = -ENOENT;
2910                 goto out;
2911         }
2912
2913         table = rt->rt6i_table;
2914         spin_lock_bh(&table->tb6_lock);
2915         err = fib6_del(rt, info);
2916         spin_unlock_bh(&table->tb6_lock);
2917
2918 out:
2919         ip6_rt_put(rt);
2920         return err;
2921 }
2922
2923 int ip6_del_rt(struct rt6_info *rt)
2924 {
2925         struct nl_info info = {
2926                 .nl_net = dev_net(rt->dst.dev),
2927         };
2928         return __ip6_del_rt(rt, &info);
2929 }
2930
/* Delete a multipath route: remove @rt and, when fc_delete_all_nh is
 * set, all of its siblings in one pass under the table lock.  A single
 * RTM_DELROUTE notification covering every hop is sent when possible;
 * otherwise per-hop notifications from fib6_del() are used.
 * Consumes the caller's reference on @rt.
 */
static int __ip6_del_rt_siblings(struct rt6_info *rt, struct fib6_config *cfg)
{
	struct nl_info *info = &cfg->fc_nlinfo;
	struct net *net = info->nl_net;
	struct sk_buff *skb = NULL;
	struct fib6_table *table;
	int err = -ENOENT;

	if (rt == net->ipv6.ip6_null_entry)
		goto out_put;
	table = rt->rt6i_table;
	spin_lock_bh(&table->tb6_lock);

	if (rt->rt6i_nsiblings && cfg->fc_delete_all_nh) {
		struct rt6_info *sibling, *next_sibling;

		/* prefer to send a single notification with all hops */
		skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
		if (skb) {
			u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;

			if (rt6_fill_node(net, skb, rt,
					  NULL, NULL, 0, RTM_DELROUTE,
					  info->portid, seq, 0) < 0) {
				kfree_skb(skb);
				skb = NULL;
			} else
				/* suppress per-hop notifications below */
				info->skip_notify = 1;
		}

		list_for_each_entry_safe(sibling, next_sibling,
					 &rt->rt6i_siblings,
					 rt6i_siblings) {
			err = fib6_del(sibling, info);
			if (err)
				goto out_unlock;
		}
	}

	err = fib6_del(rt, info);
out_unlock:
	spin_unlock_bh(&table->tb6_lock);
out_put:
	ip6_rt_put(rt);

	/* send the combined notification outside the table lock */
	if (skb) {
		rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
			    info->nlh, gfp_any());
	}
	return err;
}
2982
/* Find and delete the route described by @cfg.  Walks the matching
 * fib6 node under RCU, filters candidates by ifindex, gateway, metric
 * and protocol, then takes a reference before dropping RCU to do the
 * actual deletion.  Returns -ESRCH when nothing matches.
 */
static int ip6_route_del(struct fib6_config *cfg,
			 struct netlink_ext_ack *extack)
{
	struct rt6_info *rt, *rt_cache;
	struct fib6_table *table;
	struct fib6_node *fn;
	int err = -ESRCH;

	table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
	if (!table) {
		NL_SET_ERR_MSG(extack, "FIB table does not exist");
		return err;
	}

	rcu_read_lock();

	fn = fib6_locate(&table->tb6_root,
			 &cfg->fc_dst, cfg->fc_dst_len,
			 &cfg->fc_src, cfg->fc_src_len,
			 !(cfg->fc_flags & RTF_CACHE));

	if (fn) {
		for_each_fib6_node_rt_rcu(fn) {
			/* RTF_CACHE targets a cached exception route */
			if (cfg->fc_flags & RTF_CACHE) {
				rt_cache = rt6_find_cached_rt(rt, &cfg->fc_dst,
							      &cfg->fc_src);
				if (!rt_cache)
					continue;
				rt = rt_cache;
			}
			if (cfg->fc_ifindex &&
			    (!rt->dst.dev ||
			     rt->dst.dev->ifindex != cfg->fc_ifindex))
				continue;
			if (cfg->fc_flags & RTF_GATEWAY &&
			    !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
				continue;
			if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
				continue;
			if (cfg->fc_protocol && cfg->fc_protocol != rt->rt6i_protocol)
				continue;
			/* pin the route before leaving the RCU section */
			if (!dst_hold_safe(&rt->dst))
				break;
			rcu_read_unlock();

			/* if gateway was specified only delete the one hop */
			if (cfg->fc_flags & RTF_GATEWAY)
				return __ip6_del_rt(rt, &cfg->fc_nlinfo);

			return __ip6_del_rt_siblings(rt, cfg);
		}
	}
	rcu_read_unlock();

	return err;
}
3039
/* Process a received ICMPv6 Redirect (RFC 4861 section 8): validate
 * the message, update the neighbour cache for the new first hop, and
 * install a cached route exception pointing at the redirect target.
 */
static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
{
	struct netevent_redirect netevent;
	struct rt6_info *rt, *nrt = NULL;
	struct ndisc_options ndopts;
	struct inet6_dev *in6_dev;
	struct neighbour *neigh;
	struct rd_msg *msg;
	int optlen, on_link;
	u8 *lladdr;

	optlen = skb_tail_pointer(skb) - skb_transport_header(skb);
	optlen -= sizeof(*msg);

	if (optlen < 0) {
		net_dbg_ratelimited("rt6_do_redirect: packet too short\n");
		return;
	}

	msg = (struct rd_msg *)icmp6_hdr(skb);

	if (ipv6_addr_is_multicast(&msg->dest)) {
		net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n");
		return;
	}

	/* target == dest means the destination itself is on-link;
	 * otherwise the target must be a link-local router address
	 */
	on_link = 0;
	if (ipv6_addr_equal(&msg->dest, &msg->target)) {
		on_link = 1;
	} else if (ipv6_addr_type(&msg->target) !=
		   (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
		net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n");
		return;
	}

	in6_dev = __in6_dev_get(skb->dev);
	if (!in6_dev)
		return;
	/* routers ignore redirects, as do interfaces configured not to
	 * accept them
	 */
	if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects)
		return;

	/* RFC2461 8.1:
	 *	The IP source address of the Redirect MUST be the same as the current
	 *	first-hop router for the specified ICMP Destination Address.
	 */

	if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) {
		net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
		return;
	}

	lladdr = NULL;
	if (ndopts.nd_opts_tgt_lladdr) {
		lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr,
					     skb->dev);
		if (!lladdr) {
			net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n");
			return;
		}
	}

	rt = (struct rt6_info *) dst;
	if (rt->rt6i_flags & RTF_REJECT) {
		net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
		return;
	}

	/* Redirect received -> path was valid.
	 * Look, redirects are sent only in response to data packets,
	 * so that this nexthop apparently is reachable. --ANK
	 */
	dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr);

	neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1);
	if (!neigh)
		return;

	/*
	 *	We have finally decided to accept it.
	 */

	ndisc_update(skb->dev, neigh, lladdr, NUD_STALE,
		     NEIGH_UPDATE_F_WEAK_OVERRIDE|
		     NEIGH_UPDATE_F_OVERRIDE|
		     (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
				     NEIGH_UPDATE_F_ISROUTER)),
		     NDISC_REDIRECT, &ndopts);

	/* clone a cached (exception) route toward the redirect target */
	nrt = ip6_rt_cache_alloc(rt, &msg->dest, NULL);
	if (!nrt)
		goto out;

	nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
	if (on_link)
		nrt->rt6i_flags &= ~RTF_GATEWAY;

	nrt->rt6i_protocol = RTPROT_REDIRECT;
	nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;

	/* No need to remove rt from the exception table if rt is
	 * a cached route because rt6_insert_exception() will
	 * takes care of it
	 */
	if (rt6_insert_exception(nrt, rt)) {
		dst_release_immediate(&nrt->dst);
		goto out;
	}

	netevent.old = &rt->dst;
	netevent.new = &nrt->dst;
	netevent.daddr = &msg->dest;
	netevent.neigh = neigh;
	call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);

out:
	neigh_release(neigh);
}
3157
3158 /*
3159  *      Misc support functions
3160  */
3161
/* Link @rt to its parent route @from: take a reference on @from and
 * share its metrics.  @from must not itself have a parent.
 */
static void rt6_set_from(struct rt6_info *rt, struct rt6_info *from)
{
	BUG_ON(from->from);

	rt->rt6i_flags &= ~RTF_EXPIRES;
	dst_hold(&from->dst);
	rt->from = from;
	/* metrics are read-only and shared with the parent */
	dst_init_metrics(&rt->dst, dst_metrics_ptr(&from->dst), true);
}
3171
/* Initialize @rt as a copy of @ort, taking references on the idev,
 * the parent route (via rt6_set_from) and the lwtunnel state.
 */
static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort)
{
	rt->dst.input = ort->dst.input;
	rt->dst.output = ort->dst.output;
	rt->rt6i_dst = ort->rt6i_dst;
	rt->dst.error = ort->dst.error;
	rt->rt6i_idev = ort->rt6i_idev;
	if (rt->rt6i_idev)
		in6_dev_hold(rt->rt6i_idev);
	rt->dst.lastuse = jiffies;
	rt->rt6i_gateway = ort->rt6i_gateway;
	rt->rt6i_flags = ort->rt6i_flags;
	rt6_set_from(rt, ort);
	rt->rt6i_metric = ort->rt6i_metric;
#ifdef CONFIG_IPV6_SUBTREES
	rt->rt6i_src = ort->rt6i_src;
#endif
	rt->rt6i_prefsrc = ort->rt6i_prefsrc;
	rt->rt6i_table = ort->rt6i_table;
	rt->dst.lwtstate = lwtstate_get(ort->dst.lwtstate);
}
3193
3194 #ifdef CONFIG_IPV6_ROUTE_INFO
/* Look up an RA route-information route for @prefix/@prefixlen that
 * points at @gwaddr through @dev.  Returns the route with a reference
 * held, or NULL if not found.
 */
static struct rt6_info *rt6_get_route_info(struct net *net,
					   const struct in6_addr *prefix, int prefixlen,
					   const struct in6_addr *gwaddr,
					   struct net_device *dev)
{
	u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
	int ifindex = dev->ifindex;
	struct fib6_node *fn;
	struct rt6_info *rt = NULL;
	struct fib6_table *table;

	table = fib6_get_table(net, tb_id);
	if (!table)
		return NULL;

	rcu_read_lock();
	fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0, true);
	if (!fn)
		goto out;

	/* iterator binds the rt variable declared above */
	for_each_fib6_node_rt_rcu(fn) {
		if (rt->dst.dev->ifindex != ifindex)
			continue;
		if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
			continue;
		if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
			continue;
		ip6_hold_safe(NULL, &rt, false);
		break;
	}
out:
	rcu_read_unlock();
	return rt;
}
3229
3230 static struct rt6_info *rt6_add_route_info(struct net *net,
3231                                            const struct in6_addr *prefix, int prefixlen,
3232                                            const struct in6_addr *gwaddr,
3233                                            struct net_device *dev,
3234                                            unsigned int pref)
3235 {
3236         struct fib6_config cfg = {
3237                 .fc_metric      = IP6_RT_PRIO_USER,
3238                 .fc_ifindex     = dev->ifindex,
3239                 .fc_dst_len     = prefixlen,
3240                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
3241                                   RTF_UP | RTF_PREF(pref),
3242                 .fc_protocol = RTPROT_RA,
3243                 .fc_nlinfo.portid = 0,
3244                 .fc_nlinfo.nlh = NULL,
3245                 .fc_nlinfo.nl_net = net,
3246         };
3247
3248         cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO,
3249         cfg.fc_dst = *prefix;
3250         cfg.fc_gateway = *gwaddr;
3251
3252         /* We should treat it as a default route if prefix length is 0. */
3253         if (!prefixlen)
3254                 cfg.fc_flags |= RTF_DEFAULT;
3255
3256         ip6_route_add(&cfg, NULL);
3257
3258         return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev);
3259 }
3260 #endif
3261
/* Find the RA-learned default route via gateway @addr on @dev.
 * Returns the route with a reference held, or NULL if not found.
 */
struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_device *dev)
{
	u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT;
	struct rt6_info *rt;
	struct fib6_table *table;

	table = fib6_get_table(dev_net(dev), tb_id);
	if (!table)
		return NULL;

	rcu_read_lock();
	for_each_fib6_node_rt_rcu(&table->tb6_root) {
		if (dev == rt->dst.dev &&
		    ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
		    ipv6_addr_equal(&rt->rt6i_gateway, addr))
			break;
	}
	/* rt is NULL when the loop ran to completion without a match */
	if (rt)
		ip6_hold_safe(NULL, &rt, false);
	rcu_read_unlock();
	return rt;
}
3284
/* Install an RA-learned default route via @gwaddr on @dev and mark
 * the table as holding a default router.  Returns the inserted route
 * with a reference held, or NULL on failure.
 */
struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr,
				     struct net_device *dev,
				     unsigned int pref)
{
	struct fib6_config cfg = {
		.fc_table	= l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT,
		.fc_metric	= IP6_RT_PRIO_USER,
		.fc_ifindex	= dev->ifindex,
		.fc_flags	= RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
				  RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
		.fc_protocol = RTPROT_RA,
		.fc_nlinfo.portid = 0,
		.fc_nlinfo.nlh = NULL,
		.fc_nlinfo.nl_net = dev_net(dev),
	};

	cfg.fc_gateway = *gwaddr;

	if (!ip6_route_add(&cfg, NULL)) {
		struct fib6_table *table;

		/* flag lets rt6_purge_dflt_routers skip clean tables */
		table = fib6_get_table(dev_net(dev), cfg.fc_table);
		if (table)
			table->flags |= RT6_TABLE_HAS_DFLT_ROUTER;
	}

	return rt6_get_dflt_router(gwaddr, dev);
}
3313
/* Remove all RA-learned default routes from @table, except on
 * interfaces with accept_ra == 2 (accept RA even when forwarding).
 * Each deletion drops the RCU lock, so the walk restarts from the
 * top after every removal.
 */
static void __rt6_purge_dflt_routers(struct fib6_table *table)
{
	struct rt6_info *rt;

restart:
	rcu_read_lock();
	for_each_fib6_node_rt_rcu(&table->tb6_root) {
		if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
		    (!rt->rt6i_idev || rt->rt6i_idev->cnf.accept_ra != 2)) {
			/* take a ref before leaving RCU; skip routes
			 * already going away (hold failed)
			 */
			if (dst_hold_safe(&rt->dst)) {
				rcu_read_unlock();
				ip6_del_rt(rt);
			} else {
				rcu_read_unlock();
			}
			goto restart;
		}
	}
	rcu_read_unlock();

	table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER;
}
3336
/* Purge RA-learned default routers from every FIB table in @net that
 * is flagged as containing one.
 */
void rt6_purge_dflt_routers(struct net *net)
{
	struct fib6_table *table;
	struct hlist_head *head;
	unsigned int h;

	rcu_read_lock();

	for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
		head = &net->ipv6.fib_table_hash[h];
		hlist_for_each_entry_rcu(table, head, tb6_hlist) {
			if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER)
				__rt6_purge_dflt_routers(table);
		}
	}

	rcu_read_unlock();
}
3355
3356 static void rtmsg_to_fib6_config(struct net *net,
3357                                  struct in6_rtmsg *rtmsg,
3358                                  struct fib6_config *cfg)
3359 {
3360         memset(cfg, 0, sizeof(*cfg));
3361
3362         cfg->fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ?
3363                          : RT6_TABLE_MAIN;
3364         cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
3365         cfg->fc_metric = rtmsg->rtmsg_metric;
3366         cfg->fc_expires = rtmsg->rtmsg_info;
3367         cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
3368         cfg->fc_src_len = rtmsg->rtmsg_src_len;
3369         cfg->fc_flags = rtmsg->rtmsg_flags;
3370
3371         cfg->fc_nlinfo.nl_net = net;
3372
3373         cfg->fc_dst = rtmsg->rtmsg_dst;
3374         cfg->fc_src = rtmsg->rtmsg_src;
3375         cfg->fc_gateway = rtmsg->rtmsg_gateway;
3376 }
3377
3378 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
3379 {
3380         struct fib6_config cfg;
3381         struct in6_rtmsg rtmsg;
3382         int err;
3383
3384         switch (cmd) {
3385         case SIOCADDRT:         /* Add a route */
3386         case SIOCDELRT:         /* Delete a route */
3387                 if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
3388                         return -EPERM;
3389                 err = copy_from_user(&rtmsg, arg,
3390                                      sizeof(struct in6_rtmsg));
3391                 if (err)
3392                         return -EFAULT;
3393
3394                 rtmsg_to_fib6_config(net, &rtmsg, &cfg);
3395
3396                 rtnl_lock();
3397                 switch (cmd) {
3398                 case SIOCADDRT:
3399                         err = ip6_route_add(&cfg, NULL);
3400                         break;
3401                 case SIOCDELRT:
3402                         err = ip6_route_del(&cfg, NULL);
3403                         break;
3404                 default:
3405                         err = -EINVAL;
3406                 }
3407                 rtnl_unlock();
3408
3409                 return err;
3410         }
3411
3412         return -EINVAL;
3413 }
3414
3415 /*
3416  *      Drop the packet on the floor
3417  */
3418
3419 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
3420 {
3421         int type;
3422         struct dst_entry *dst = skb_dst(skb);
3423         switch (ipstats_mib_noroutes) {
3424         case IPSTATS_MIB_INNOROUTES:
3425                 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
3426                 if (type == IPV6_ADDR_ANY) {
3427                         IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
3428                                       IPSTATS_MIB_INADDRERRORS);
3429                         break;
3430                 }
3431                 /* FALLTHROUGH */
3432         case IPSTATS_MIB_OUTNOROUTES:
3433                 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
3434                               ipstats_mib_noroutes);
3435                 break;
3436         }
3437         icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
3438         kfree_skb(skb);
3439         return 0;
3440 }
3441
/* dst input handler for blackhole/unreachable routes: drop the inbound
 * packet, counting it as IPSTATS_MIB_INNOROUTES and replying with an
 * ICMPv6 "no route" destination-unreachable.
 */
static int ip6_pkt_discard(struct sk_buff *skb)
{
	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
}
3446