net: ipv4: don't let PMTU updates increase route MTU
net/ipv4/route.c
1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              ROUTE - implementation of the IP router.
7  *
8  * Authors:     Ross Biro
9  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
11  *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
12  *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
13  *
14  * Fixes:
15  *              Alan Cox        :       Verify area fixes.
16  *              Alan Cox        :       cli() protects routing changes
17  *              Rui Oliveira    :       ICMP routing table updates
18  *              (rco@di.uminho.pt)      Routing table insertion and update
19  *              Linus Torvalds  :       Rewrote bits to be sensible
20  *              Alan Cox        :       Added BSD route gw semantics
21  *              Alan Cox        :       Super /proc >4K
22  *              Alan Cox        :       MTU in route table
23  *              Alan Cox        :       MSS actually. Also added the window
24  *                                      clamper.
25  *              Sam Lantinga    :       Fixed route matching in rt_del()
26  *              Alan Cox        :       Routing cache support.
27  *              Alan Cox        :       Removed compatibility cruft.
28  *              Alan Cox        :       RTF_REJECT support.
29  *              Alan Cox        :       TCP irtt support.
30  *              Jonathan Naylor :       Added Metric support.
31  *      Miquel van Smoorenburg  :       BSD API fixes.
32  *      Miquel van Smoorenburg  :       Metrics.
33  *              Alan Cox        :       Use __u32 properly
34  *              Alan Cox        :       Aligned routing errors more closely with BSD;
35  *                                      our system is still very different.
36  *              Alan Cox        :       Faster /proc handling
37  *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
38  *                                      routing caches and better behaviour.
39  *
40  *              Olaf Erb        :       irtt wasn't being copied right.
41  *              Bjorn Ekwall    :       Kerneld route support.
42  *              Alan Cox        :       Multicast fixed (I hope)
43  *              Pavel Krauz     :       Limited broadcast fixed
44  *              Mike McLagan    :       Routing by source
45  *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
46  *                                      route.c and rewritten from scratch.
47  *              Andi Kleen      :       Load-limit warning messages.
48  *      Vitaly E. Lavrov        :       Transparent proxy revived after a year in coma.
49  *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
50  *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
51  *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
52  *              Marc Boucher    :       routing by fwmark
53  *      Robert Olsson           :       Added rt_cache statistics
54  *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
55  *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
56  *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
57  *      Ilia Sotnikov           :       Removed TOS from hash calculations
58  *
59  *              This program is free software; you can redistribute it and/or
60  *              modify it under the terms of the GNU General Public License
61  *              as published by the Free Software Foundation; either version
62  *              2 of the License, or (at your option) any later version.
63  */
64
65 #define pr_fmt(fmt) "IPv4: " fmt
66
67 #include <linux/module.h>
68 #include <linux/uaccess.h>
69 #include <linux/bitops.h>
70 #include <linux/types.h>
71 #include <linux/kernel.h>
72 #include <linux/mm.h>
73 #include <linux/string.h>
74 #include <linux/socket.h>
75 #include <linux/sockios.h>
76 #include <linux/errno.h>
77 #include <linux/in.h>
78 #include <linux/inet.h>
79 #include <linux/netdevice.h>
80 #include <linux/proc_fs.h>
81 #include <linux/init.h>
82 #include <linux/skbuff.h>
83 #include <linux/inetdevice.h>
84 #include <linux/igmp.h>
85 #include <linux/pkt_sched.h>
86 #include <linux/mroute.h>
87 #include <linux/netfilter_ipv4.h>
88 #include <linux/random.h>
89 #include <linux/rcupdate.h>
90 #include <linux/times.h>
91 #include <linux/slab.h>
92 #include <linux/jhash.h>
93 #include <net/dst.h>
94 #include <net/dst_metadata.h>
95 #include <net/net_namespace.h>
96 #include <net/protocol.h>
97 #include <net/ip.h>
98 #include <net/route.h>
99 #include <net/inetpeer.h>
100 #include <net/sock.h>
101 #include <net/ip_fib.h>
102 #include <net/arp.h>
103 #include <net/tcp.h>
104 #include <net/icmp.h>
105 #include <net/xfrm.h>
106 #include <net/lwtunnel.h>
107 #include <net/netevent.h>
108 #include <net/rtnetlink.h>
109 #ifdef CONFIG_SYSCTL
110 #include <linux/sysctl.h>
111 #endif
112 #include <net/secure_seq.h>
113 #include <net/ip_tunnels.h>
114 #include <net/l3mdev.h>
115
116 #include "fib_lookup.h"
117
118 #define RT_FL_TOS(oldflp4) \
119         ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
120
121 #define RT_GC_TIMEOUT (300*HZ)
122
123 static int ip_rt_max_size;
124 static int ip_rt_redirect_number __read_mostly  = 9;
125 static int ip_rt_redirect_load __read_mostly    = HZ / 50;
126 static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
127 static int ip_rt_error_cost __read_mostly       = HZ;
128 static int ip_rt_error_burst __read_mostly      = 5 * HZ;
129 static int ip_rt_mtu_expires __read_mostly      = 10 * 60 * HZ;
130 static u32 ip_rt_min_pmtu __read_mostly         = 512 + 20 + 20;
131 static int ip_rt_min_advmss __read_mostly       = 256;
132
133 static int ip_rt_gc_timeout __read_mostly       = RT_GC_TIMEOUT;
134
135 /*
136  *      Interface to generic destination cache.
137  */
138
139 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
140 static unsigned int      ipv4_default_advmss(const struct dst_entry *dst);
141 static unsigned int      ipv4_mtu(const struct dst_entry *dst);
142 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
143 static void              ipv4_link_failure(struct sk_buff *skb);
144 static void              ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
145                                            struct sk_buff *skb, u32 mtu);
146 static void              ip_do_redirect(struct dst_entry *dst, struct sock *sk,
147                                         struct sk_buff *skb);
148 static void             ipv4_dst_destroy(struct dst_entry *dst);
149
150 static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
151 {
152         WARN_ON(1);
153         return NULL;
154 }
155
156 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
157                                            struct sk_buff *skb,
158                                            const void *daddr);
159 static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr);
160
161 static struct dst_ops ipv4_dst_ops = {
162         .family =               AF_INET,
163         .check =                ipv4_dst_check,
164         .default_advmss =       ipv4_default_advmss,
165         .mtu =                  ipv4_mtu,
166         .cow_metrics =          ipv4_cow_metrics,
167         .destroy =              ipv4_dst_destroy,
168         .negative_advice =      ipv4_negative_advice,
169         .link_failure =         ipv4_link_failure,
170         .update_pmtu =          ip_rt_update_pmtu,
171         .redirect =             ip_do_redirect,
172         .local_out =            __ip_local_out,
173         .neigh_lookup =         ipv4_neigh_lookup,
174         .confirm_neigh =        ipv4_confirm_neigh,
175 };
176
177 #define ECN_OR_COST(class)      TC_PRIO_##class
178
179 const __u8 ip_tos2prio[16] = {
180         TC_PRIO_BESTEFFORT,
181         ECN_OR_COST(BESTEFFORT),
182         TC_PRIO_BESTEFFORT,
183         ECN_OR_COST(BESTEFFORT),
184         TC_PRIO_BULK,
185         ECN_OR_COST(BULK),
186         TC_PRIO_BULK,
187         ECN_OR_COST(BULK),
188         TC_PRIO_INTERACTIVE,
189         ECN_OR_COST(INTERACTIVE),
190         TC_PRIO_INTERACTIVE,
191         ECN_OR_COST(INTERACTIVE),
192         TC_PRIO_INTERACTIVE_BULK,
193         ECN_OR_COST(INTERACTIVE_BULK),
194         TC_PRIO_INTERACTIVE_BULK,
195         ECN_OR_COST(INTERACTIVE_BULK)
196 };
197 EXPORT_SYMBOL(ip_tos2prio);
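/* Usage note: rt_tos2priority() in include/net/route.h indexes this
 * table with the four RFC 1349 TOS bits, i.e. (tos & IPTOS_TOS_MASK) >> 1.
 * For example, IPTOS_LOWDELAY (0x10) yields index 8, TC_PRIO_INTERACTIVE,
 * and IPTOS_THROUGHPUT (0x08) yields index 4, TC_PRIO_BULK.
 */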
198
199 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
200 #define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field)
201
202 #ifdef CONFIG_PROC_FS
203 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
204 {
205         if (*pos)
206                 return NULL;
207         return SEQ_START_TOKEN;
208 }
209
210 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
211 {
212         ++*pos;
213         return NULL;
214 }
215
216 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
217 {
218 }
219
220 static int rt_cache_seq_show(struct seq_file *seq, void *v)
221 {
222         if (v == SEQ_START_TOKEN)
223                 seq_printf(seq, "%-127s\n",
224                            "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
225                            "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
226                            "HHUptod\tSpecDst");
227         return 0;
228 }
229
230 static const struct seq_operations rt_cache_seq_ops = {
231         .start  = rt_cache_seq_start,
232         .next   = rt_cache_seq_next,
233         .stop   = rt_cache_seq_stop,
234         .show   = rt_cache_seq_show,
235 };
236
237 static int rt_cache_seq_open(struct inode *inode, struct file *file)
238 {
239         return seq_open(file, &rt_cache_seq_ops);
240 }
241
242 static const struct file_operations rt_cache_seq_fops = {
243         .open    = rt_cache_seq_open,
244         .read    = seq_read,
245         .llseek  = seq_lseek,
246         .release = seq_release,
247 };
248
249
250 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
251 {
252         int cpu;
253
254         if (*pos == 0)
255                 return SEQ_START_TOKEN;
256
257         for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
258                 if (!cpu_possible(cpu))
259                         continue;
260                 *pos = cpu+1;
261                 return &per_cpu(rt_cache_stat, cpu);
262         }
263         return NULL;
264 }
265
266 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
267 {
268         int cpu;
269
270         for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
271                 if (!cpu_possible(cpu))
272                         continue;
273                 *pos = cpu+1;
274                 return &per_cpu(rt_cache_stat, cpu);
275         }
276         return NULL;
277
278 }
279
280 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
281 {
282
283 }
284
285 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
286 {
287         struct rt_cache_stat *st = v;
288
289         if (v == SEQ_START_TOKEN) {
290                 seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
291                 return 0;
292         }
293
294         seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
295                    " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
296                    dst_entries_get_slow(&ipv4_dst_ops),
297                    0, /* st->in_hit */
298                    st->in_slow_tot,
299                    st->in_slow_mc,
300                    st->in_no_route,
301                    st->in_brd,
302                    st->in_martian_dst,
303                    st->in_martian_src,
304
305                    0, /* st->out_hit */
306                    st->out_slow_tot,
307                    st->out_slow_mc,
308
309                    0, /* st->gc_total */
310                    0, /* st->gc_ignored */
311                    0, /* st->gc_goal_miss */
312                    0, /* st->gc_dst_overflow */
313                    0, /* st->in_hlist_search */
314                    0  /* st->out_hlist_search */
315                 );
316         return 0;
317 }
318
319 static const struct seq_operations rt_cpu_seq_ops = {
320         .start  = rt_cpu_seq_start,
321         .next   = rt_cpu_seq_next,
322         .stop   = rt_cpu_seq_stop,
323         .show   = rt_cpu_seq_show,
324 };
325
326
327 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
328 {
329         return seq_open(file, &rt_cpu_seq_ops);
330 }
331
332 static const struct file_operations rt_cpu_seq_fops = {
333         .open    = rt_cpu_seq_open,
334         .read    = seq_read,
335         .llseek  = seq_lseek,
336         .release = seq_release,
337 };
338
339 #ifdef CONFIG_IP_ROUTE_CLASSID
340 static int rt_acct_proc_show(struct seq_file *m, void *v)
341 {
342         struct ip_rt_acct *dst, *src;
343         unsigned int i, j;
344
345         dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
346         if (!dst)
347                 return -ENOMEM;
348
349         for_each_possible_cpu(i) {
350                 src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
351                 for (j = 0; j < 256; j++) {
352                         dst[j].o_bytes   += src[j].o_bytes;
353                         dst[j].o_packets += src[j].o_packets;
354                         dst[j].i_bytes   += src[j].i_bytes;
355                         dst[j].i_packets += src[j].i_packets;
356                 }
357         }
358
359         seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
360         kfree(dst);
361         return 0;
362 }
363 #endif
364
365 static int __net_init ip_rt_do_proc_init(struct net *net)
366 {
367         struct proc_dir_entry *pde;
368
369         pde = proc_create("rt_cache", 0444, net->proc_net,
370                           &rt_cache_seq_fops);
371         if (!pde)
372                 goto err1;
373
374         pde = proc_create("rt_cache", 0444,
375                           net->proc_net_stat, &rt_cpu_seq_fops);
376         if (!pde)
377                 goto err2;
378
379 #ifdef CONFIG_IP_ROUTE_CLASSID
380         pde = proc_create_single("rt_acct", 0, net->proc_net,
381                         rt_acct_proc_show);
382         if (!pde)
383                 goto err3;
384 #endif
385         return 0;
386
387 #ifdef CONFIG_IP_ROUTE_CLASSID
388 err3:
389         remove_proc_entry("rt_cache", net->proc_net_stat);
390 #endif
391 err2:
392         remove_proc_entry("rt_cache", net->proc_net);
393 err1:
394         return -ENOMEM;
395 }
396
397 static void __net_exit ip_rt_do_proc_exit(struct net *net)
398 {
399         remove_proc_entry("rt_cache", net->proc_net_stat);
400         remove_proc_entry("rt_cache", net->proc_net);
401 #ifdef CONFIG_IP_ROUTE_CLASSID
402         remove_proc_entry("rt_acct", net->proc_net);
403 #endif
404 }
405
406 static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
407         .init = ip_rt_do_proc_init,
408         .exit = ip_rt_do_proc_exit,
409 };
410
411 static int __init ip_rt_proc_init(void)
412 {
413         return register_pernet_subsys(&ip_rt_proc_ops);
414 }
415
416 #else
417 static inline int ip_rt_proc_init(void)
418 {
419         return 0;
420 }
421 #endif /* CONFIG_PROC_FS */
422
423 static inline bool rt_is_expired(const struct rtable *rth)
424 {
425         return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev));
426 }
427
428 void rt_cache_flush(struct net *net)
429 {
430         rt_genid_bump_ipv4(net);
431 }
432
433 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
434                                            struct sk_buff *skb,
435                                            const void *daddr)
436 {
437         struct net_device *dev = dst->dev;
438         const __be32 *pkey = daddr;
439         const struct rtable *rt;
440         struct neighbour *n;
441
442         rt = (const struct rtable *) dst;
443         if (rt->rt_gateway)
444                 pkey = (const __be32 *) &rt->rt_gateway;
445         else if (skb)
446                 pkey = &ip_hdr(skb)->daddr;
447
448         n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
449         if (n)
450                 return n;
451         return neigh_create(&arp_tbl, pkey, dev);
452 }
453
454 static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr)
455 {
456         struct net_device *dev = dst->dev;
457         const __be32 *pkey = daddr;
458         const struct rtable *rt;
459
460         rt = (const struct rtable *)dst;
461         if (rt->rt_gateway)
462                 pkey = (const __be32 *)&rt->rt_gateway;
463         else if (!daddr ||
464                  (rt->rt_flags &
465                   (RTCF_MULTICAST | RTCF_BROADCAST | RTCF_LOCAL)))
466                 return;
467
468         __ipv4_confirm_neigh(dev, *(__force u32 *)pkey);
469 }
470
471 #define IP_IDENTS_SZ 2048u
472
473 static atomic_t *ip_idents __read_mostly;
474 static u32 *ip_tstamps __read_mostly;
475
476 /* In order to protect privacy, we add a perturbation to identifiers
477  * if one generator is seldom used. This makes it hard for an attacker
478  * to infer how many packets were sent between two points in time.
479  */
480 u32 ip_idents_reserve(u32 hash, int segs)
481 {
482         u32 *p_tstamp = ip_tstamps + hash % IP_IDENTS_SZ;
483         atomic_t *p_id = ip_idents + hash % IP_IDENTS_SZ;
484         u32 old = READ_ONCE(*p_tstamp);
485         u32 now = (u32)jiffies;
486         u32 new, delta = 0;
487
488         if (old != now && cmpxchg(p_tstamp, old, now) == old)
489                 delta = prandom_u32_max(now - old);
490
491         /* Do not use atomic_add_return() as it makes UBSAN unhappy */
492         do {
493                 old = (u32)atomic_read(p_id);
494                 new = old + delta + segs;
495         } while (atomic_cmpxchg(p_id, old, new) != old);
496
497         return new - segs;
498 }
499 EXPORT_SYMBOL(ip_idents_reserve);
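/* Worked example for ip_idents_reserve() above: "hash" selects one of
 * IP_IDENTS_SZ (2048) shared generators.  If that generator was last
 * touched on an earlier jiffy, a random delta in [0, now - old) is mixed
 * in first, so an observer sampling the IP-ID cannot count how many
 * packets were sent in between.  The cmpxchg loop then atomically
 * reserves "segs" consecutive IDs and returns the first of them.
 */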
500
501 void __ip_select_ident(struct net *net, struct iphdr *iph, int segs)
502 {
503         static u32 ip_idents_hashrnd __read_mostly;
504         u32 hash, id;
505
506         net_get_random_once(&ip_idents_hashrnd, sizeof(ip_idents_hashrnd));
507
508         hash = jhash_3words((__force u32)iph->daddr,
509                             (__force u32)iph->saddr,
510                             iph->protocol ^ net_hash_mix(net),
511                             ip_idents_hashrnd);
512         id = ip_idents_reserve(hash, segs);
513         iph->id = htons(id);
514 }
515 EXPORT_SYMBOL(__ip_select_ident);
516
517 static void __build_flow_key(const struct net *net, struct flowi4 *fl4,
518                              const struct sock *sk,
519                              const struct iphdr *iph,
520                              int oif, u8 tos,
521                              u8 prot, u32 mark, int flow_flags)
522 {
523         if (sk) {
524                 const struct inet_sock *inet = inet_sk(sk);
525
526                 oif = sk->sk_bound_dev_if;
527                 mark = sk->sk_mark;
528                 tos = RT_CONN_FLAGS(sk);
529                 prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
530         }
531         flowi4_init_output(fl4, oif, mark, tos,
532                            RT_SCOPE_UNIVERSE, prot,
533                            flow_flags,
534                            iph->daddr, iph->saddr, 0, 0,
535                            sock_net_uid(net, sk));
536 }
537
538 static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
539                                const struct sock *sk)
540 {
541         const struct net *net = dev_net(skb->dev);
542         const struct iphdr *iph = ip_hdr(skb);
543         int oif = skb->dev->ifindex;
544         u8 tos = RT_TOS(iph->tos);
545         u8 prot = iph->protocol;
546         u32 mark = skb->mark;
547
548         __build_flow_key(net, fl4, sk, iph, oif, tos, prot, mark, 0);
549 }
550
551 static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
552 {
553         const struct inet_sock *inet = inet_sk(sk);
554         const struct ip_options_rcu *inet_opt;
555         __be32 daddr = inet->inet_daddr;
556
557         rcu_read_lock();
558         inet_opt = rcu_dereference(inet->inet_opt);
559         if (inet_opt && inet_opt->opt.srr)
560                 daddr = inet_opt->opt.faddr;
561         flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
562                            RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
563                            inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
564                            inet_sk_flowi_flags(sk),
565                            daddr, inet->inet_saddr, 0, 0, sk->sk_uid);
566         rcu_read_unlock();
567 }
568
569 static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
570                                  const struct sk_buff *skb)
571 {
572         if (skb)
573                 build_skb_flow_key(fl4, skb, sk);
574         else
575                 build_sk_flow_key(fl4, sk);
576 }
577
578 static DEFINE_SPINLOCK(fnhe_lock);
579
580 static void fnhe_flush_routes(struct fib_nh_exception *fnhe)
581 {
582         struct rtable *rt;
583
584         rt = rcu_dereference(fnhe->fnhe_rth_input);
585         if (rt) {
586                 RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL);
587                 dst_dev_put(&rt->dst);
588                 dst_release(&rt->dst);
589         }
590         rt = rcu_dereference(fnhe->fnhe_rth_output);
591         if (rt) {
592                 RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL);
593                 dst_dev_put(&rt->dst);
594                 dst_release(&rt->dst);
595         }
596 }
597
598 static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
599 {
600         struct fib_nh_exception *fnhe, *oldest;
601
602         oldest = rcu_dereference(hash->chain);
603         for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
604              fnhe = rcu_dereference(fnhe->fnhe_next)) {
605                 if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
606                         oldest = fnhe;
607         }
608         fnhe_flush_routes(oldest);
609         return oldest;
610 }
611
612 static inline u32 fnhe_hashfun(__be32 daddr)
613 {
614         static u32 fnhe_hashrnd __read_mostly;
615         u32 hval;
616
617         net_get_random_once(&fnhe_hashrnd, sizeof(fnhe_hashrnd));
618         hval = jhash_1word((__force u32) daddr, fnhe_hashrnd);
619         return hash_32(hval, FNHE_HASH_SHIFT);
620 }
621
622 static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe)
623 {
624         rt->rt_pmtu = fnhe->fnhe_pmtu;
625         rt->rt_mtu_locked = fnhe->fnhe_mtu_locked;
626         rt->dst.expires = fnhe->fnhe_expires;
627
628         if (fnhe->fnhe_gw) {
629                 rt->rt_flags |= RTCF_REDIRECTED;
630                 rt->rt_gateway = fnhe->fnhe_gw;
631                 rt->rt_uses_gateway = 1;
632         }
633 }
634
635 static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
636                                   u32 pmtu, bool lock, unsigned long expires)
637 {
638         struct fnhe_hash_bucket *hash;
639         struct fib_nh_exception *fnhe;
640         struct rtable *rt;
641         u32 genid, hval;
642         unsigned int i;
643         int depth;
644
645         genid = fnhe_genid(dev_net(nh->nh_dev));
646         hval = fnhe_hashfun(daddr);
647
648         spin_lock_bh(&fnhe_lock);
649
650         hash = rcu_dereference(nh->nh_exceptions);
651         if (!hash) {
652                 hash = kcalloc(FNHE_HASH_SIZE, sizeof(*hash), GFP_ATOMIC);
653                 if (!hash)
654                         goto out_unlock;
655                 rcu_assign_pointer(nh->nh_exceptions, hash);
656         }
657
658         hash += hval;
659
660         depth = 0;
661         for (fnhe = rcu_dereference(hash->chain); fnhe;
662              fnhe = rcu_dereference(fnhe->fnhe_next)) {
663                 if (fnhe->fnhe_daddr == daddr)
664                         break;
665                 depth++;
666         }
667
668         if (fnhe) {
669                 if (fnhe->fnhe_genid != genid)
670                         fnhe->fnhe_genid = genid;
671                 if (gw)
672                         fnhe->fnhe_gw = gw;
673                 if (pmtu) {
674                         fnhe->fnhe_pmtu = pmtu;
675                         fnhe->fnhe_mtu_locked = lock;
676                 }
677                 fnhe->fnhe_expires = max(1UL, expires);
678                 /* Update all cached dsts too */
679                 rt = rcu_dereference(fnhe->fnhe_rth_input);
680                 if (rt)
681                         fill_route_from_fnhe(rt, fnhe);
682                 rt = rcu_dereference(fnhe->fnhe_rth_output);
683                 if (rt)
684                         fill_route_from_fnhe(rt, fnhe);
685         } else {
686                 if (depth > FNHE_RECLAIM_DEPTH)
687                         fnhe = fnhe_oldest(hash);
688                 else {
689                         fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
690                         if (!fnhe)
691                                 goto out_unlock;
692
693                         fnhe->fnhe_next = hash->chain;
694                         rcu_assign_pointer(hash->chain, fnhe);
695                 }
696                 fnhe->fnhe_genid = genid;
697                 fnhe->fnhe_daddr = daddr;
698                 fnhe->fnhe_gw = gw;
699                 fnhe->fnhe_pmtu = pmtu;
700                 fnhe->fnhe_mtu_locked = lock;
701                 fnhe->fnhe_expires = max(1UL, expires);
702
703                 /* Exception created; mark the nexthop's cached routes
704                  * stale, so anyone caching them rechecks whether this
705                  * exception applies.
706                  */
707                 rt = rcu_dereference(nh->nh_rth_input);
708                 if (rt)
709                         rt->dst.obsolete = DST_OBSOLETE_KILL;
710
711                 for_each_possible_cpu(i) {
712                         struct rtable __rcu **prt;
713                         prt = per_cpu_ptr(nh->nh_pcpu_rth_output, i);
714                         rt = rcu_dereference(*prt);
715                         if (rt)
716                                 rt->dst.obsolete = DST_OBSOLETE_KILL;
717                 }
718         }
719
720         fnhe->fnhe_stamp = jiffies;
721
722 out_unlock:
723         spin_unlock_bh(&fnhe_lock);
724 }
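/* Flow of update_or_create_fnhe() above: an existing exception for daddr
 * is refreshed in place, together with any routes already cached on it.
 * For a new destination, a chain deeper than FNHE_RECLAIM_DEPTH recycles
 * the entry with the oldest fnhe_stamp instead of allocating, and the
 * nexthop's cached input route and per-CPU output routes are marked
 * DST_OBSOLETE_KILL so their users recheck against the new exception.
 */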
725
726 static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
727                              bool kill_route)
728 {
729         __be32 new_gw = icmp_hdr(skb)->un.gateway;
730         __be32 old_gw = ip_hdr(skb)->saddr;
731         struct net_device *dev = skb->dev;
732         struct in_device *in_dev;
733         struct fib_result res;
734         struct neighbour *n;
735         struct net *net;
736
737         switch (icmp_hdr(skb)->code & 7) {
738         case ICMP_REDIR_NET:
739         case ICMP_REDIR_NETTOS:
740         case ICMP_REDIR_HOST:
741         case ICMP_REDIR_HOSTTOS:
742                 break;
743
744         default:
745                 return;
746         }
747
748         if (rt->rt_gateway != old_gw)
749                 return;
750
751         in_dev = __in_dev_get_rcu(dev);
752         if (!in_dev)
753                 return;
754
755         net = dev_net(dev);
756         if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
757             ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
758             ipv4_is_zeronet(new_gw))
759                 goto reject_redirect;
760
761         if (!IN_DEV_SHARED_MEDIA(in_dev)) {
762                 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
763                         goto reject_redirect;
764                 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
765                         goto reject_redirect;
766         } else {
767                 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
768                         goto reject_redirect;
769         }
770
771         n = __ipv4_neigh_lookup(rt->dst.dev, new_gw);
772         if (!n)
773                 n = neigh_create(&arp_tbl, &new_gw, rt->dst.dev);
774         if (!IS_ERR(n)) {
775                 if (!(n->nud_state & NUD_VALID)) {
776                         neigh_event_send(n, NULL);
777                 } else {
778                         if (fib_lookup(net, fl4, &res, 0) == 0) {
779                                 struct fib_nh *nh = &FIB_RES_NH(res);
780
781                                 update_or_create_fnhe(nh, fl4->daddr, new_gw,
782                                                 0, false,
783                                                 jiffies + ip_rt_gc_timeout);
784                         }
785                         if (kill_route)
786                                 rt->dst.obsolete = DST_OBSOLETE_KILL;
787                         call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
788                 }
789                 neigh_release(n);
790         }
791         return;
792
793 reject_redirect:
794 #ifdef CONFIG_IP_ROUTE_VERBOSE
795         if (IN_DEV_LOG_MARTIANS(in_dev)) {
796                 const struct iphdr *iph = (const struct iphdr *) skb->data;
797                 __be32 daddr = iph->daddr;
798                 __be32 saddr = iph->saddr;
799
800                 net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
801                                      "  Advised path = %pI4 -> %pI4\n",
802                                      &old_gw, dev->name, &new_gw,
803                                      &saddr, &daddr);
804         }
805 #endif
806         ;
807 }
808
809 static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
810 {
811         struct rtable *rt;
812         struct flowi4 fl4;
813         const struct iphdr *iph = (const struct iphdr *) skb->data;
814         struct net *net = dev_net(skb->dev);
815         int oif = skb->dev->ifindex;
816         u8 tos = RT_TOS(iph->tos);
817         u8 prot = iph->protocol;
818         u32 mark = skb->mark;
819
820         rt = (struct rtable *) dst;
821
822         __build_flow_key(net, &fl4, sk, iph, oif, tos, prot, mark, 0);
823         __ip_do_redirect(rt, skb, &fl4, true);
824 }
825
826 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
827 {
828         struct rtable *rt = (struct rtable *)dst;
829         struct dst_entry *ret = dst;
830
831         if (rt) {
832                 if (dst->obsolete > 0) {
833                         ip_rt_put(rt);
834                         ret = NULL;
835                 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
836                            rt->dst.expires) {
837                         ip_rt_put(rt);
838                         ret = NULL;
839                 }
840         }
841         return ret;
842 }
843
844 /*
845  * Algorithm:
846  *         with exponential backoff, then we stop sending them altogether,
847  *         with exponential backoff, then we stop sending them at all,
848  *         assuming that the host ignores our redirects.
849  *      2. If we did not see packets requiring redirects
850  *         during ip_rt_redirect_silence, we assume that the host
851  *         forgot the redirected route and start sending redirects again.
852  *
853  * This algorithm is much cheaper and more intelligent than dumb load limiting
854  * in icmp.c.
855  *
856  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
857  * and "frag. need" (breaks PMTU discovery) in icmp.c.
858  */
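/* Worked example with the defaults above (ip_rt_redirect_number = 9,
 * ip_rt_redirect_load = HZ/50, ip_rt_redirect_silence = (HZ/50) << 10):
 * the first redirect goes out immediately; redirect k is next allowed
 * (HZ/50) << k jiffies after the previous one, roughly 40 ms, 80 ms, ...
 * up to ~5 s before the ninth and last.  About 20 s without packets
 * needing a redirect resets the backoff.
 */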
859
860 void ip_rt_send_redirect(struct sk_buff *skb)
861 {
862         struct rtable *rt = skb_rtable(skb);
863         struct in_device *in_dev;
864         struct inet_peer *peer;
865         struct net *net;
866         int log_martians;
867         int vif;
868
869         rcu_read_lock();
870         in_dev = __in_dev_get_rcu(rt->dst.dev);
871         if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
872                 rcu_read_unlock();
873                 return;
874         }
875         log_martians = IN_DEV_LOG_MARTIANS(in_dev);
876         vif = l3mdev_master_ifindex_rcu(rt->dst.dev);
877         rcu_read_unlock();
878
879         net = dev_net(rt->dst.dev);
880         peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, vif, 1);
881         if (!peer) {
882                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
883                           rt_nexthop(rt, ip_hdr(skb)->daddr));
884                 return;
885         }
886
887         /* No redirected packets during ip_rt_redirect_silence;
888          * reset the algorithm.
889          */
890         if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
891                 peer->rate_tokens = 0;
892
893         /* Too many ignored redirects; do not send anything and set
894          * peer->rate_last to the last seen redirected packet.
895          */
896         if (peer->rate_tokens >= ip_rt_redirect_number) {
897                 peer->rate_last = jiffies;
898                 goto out_put_peer;
899         }
900
901         /* Check for load limit; set rate_last to the latest sent
902          * redirect.
903          */
904         if (peer->rate_tokens == 0 ||
905             time_after(jiffies,
906                        (peer->rate_last +
907                         (ip_rt_redirect_load << peer->rate_tokens)))) {
908                 __be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);
909
910                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
911                 peer->rate_last = jiffies;
912                 ++peer->rate_tokens;
913 #ifdef CONFIG_IP_ROUTE_VERBOSE
914                 if (log_martians &&
915                     peer->rate_tokens == ip_rt_redirect_number)
916                         net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
917                                              &ip_hdr(skb)->saddr, inet_iif(skb),
918                                              &ip_hdr(skb)->daddr, &gw);
919 #endif
920         }
921 out_put_peer:
922         inet_putpeer(peer);
923 }
924
925 static int ip_error(struct sk_buff *skb)
926 {
927         struct rtable *rt = skb_rtable(skb);
928         struct net_device *dev = skb->dev;
929         struct in_device *in_dev;
930         struct inet_peer *peer;
931         unsigned long now;
932         struct net *net;
933         bool send;
934         int code;
935
936         if (netif_is_l3_master(skb->dev)) {
937                 dev = __dev_get_by_index(dev_net(skb->dev), IPCB(skb)->iif);
938                 if (!dev)
939                         goto out;
940         }
941
942         in_dev = __in_dev_get_rcu(dev);
943
944         /* IP on this device is disabled. */
945         if (!in_dev)
946                 goto out;
947
948         net = dev_net(rt->dst.dev);
949         if (!IN_DEV_FORWARD(in_dev)) {
950                 switch (rt->dst.error) {
951                 case EHOSTUNREACH:
952                         __IP_INC_STATS(net, IPSTATS_MIB_INADDRERRORS);
953                         break;
954
955                 case ENETUNREACH:
956                         __IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
957                         break;
958                 }
959                 goto out;
960         }
961
962         switch (rt->dst.error) {
963         case EINVAL:
964         default:
965                 goto out;
966         case EHOSTUNREACH:
967                 code = ICMP_HOST_UNREACH;
968                 break;
969         case ENETUNREACH:
970                 code = ICMP_NET_UNREACH;
971                 __IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
972                 break;
973         case EACCES:
974                 code = ICMP_PKT_FILTERED;
975                 break;
976         }
977
978         peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr,
979                                l3mdev_master_ifindex(skb->dev), 1);
980
981         send = true;
982         if (peer) {
983                 now = jiffies;
984                 peer->rate_tokens += now - peer->rate_last;
985                 if (peer->rate_tokens > ip_rt_error_burst)
986                         peer->rate_tokens = ip_rt_error_burst;
987                 peer->rate_last = now;
988                 if (peer->rate_tokens >= ip_rt_error_cost)
989                         peer->rate_tokens -= ip_rt_error_cost;
990                 else
991                         send = false;
992                 inet_putpeer(peer);
993         }
994         if (send)
995                 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
996
997 out:    kfree_skb(skb);
998         return 0;
999 }
1000
1001 static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
1002 {
1003         struct dst_entry *dst = &rt->dst;
1004         u32 old_mtu = ipv4_mtu(dst);
1005         struct fib_result res;
1006         bool lock = false;
1007
1008         if (ip_mtu_locked(dst))
1009                 return;
1010
1011         if (old_mtu < mtu)
1012                 return;
1013
1014         if (mtu < ip_rt_min_pmtu) {
1015                 lock = true;
1016                 mtu = min(old_mtu, ip_rt_min_pmtu);
1017         }
1018
1019         if (rt->rt_pmtu == mtu && !lock &&
1020             time_before(jiffies, dst->expires - ip_rt_mtu_expires / 2))
1021                 return;
1022
1023         rcu_read_lock();
1024         if (fib_lookup(dev_net(dst->dev), fl4, &res, 0) == 0) {
1025                 struct fib_nh *nh = &FIB_RES_NH(res);
1026
1027                 update_or_create_fnhe(nh, fl4->daddr, 0, mtu, lock,
1028                                       jiffies + ip_rt_mtu_expires);
1029         }
1030         rcu_read_unlock();
1031 }
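/* Behaviour of __ip_rt_update_pmtu() above, for a route whose current
 * MTU is 1500: an ICMP "fragmentation needed" quoting 1400 installs a
 * 1400-byte nexthop exception for ip_rt_mtu_expires (10 min); one
 * quoting 1600 is ignored, since a PMTU update may never raise the MTU;
 * one quoting 300 is clamped to ip_rt_min_pmtu (552) and the MTU is
 * locked against further PMTU updates.
 */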
1032
1033 static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
1034                               struct sk_buff *skb, u32 mtu)
1035 {
1036         struct rtable *rt = (struct rtable *) dst;
1037         struct flowi4 fl4;
1038
1039         ip_rt_build_flow_key(&fl4, sk, skb);
1040         __ip_rt_update_pmtu(rt, &fl4, mtu);
1041 }
1042
1043 void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
1044                       int oif, u32 mark, u8 protocol, int flow_flags)
1045 {
1046         const struct iphdr *iph = (const struct iphdr *) skb->data;
1047         struct flowi4 fl4;
1048         struct rtable *rt;
1049
1050         if (!mark)
1051                 mark = IP4_REPLY_MARK(net, skb->mark);
1052
1053         __build_flow_key(net, &fl4, NULL, iph, oif,
1054                          RT_TOS(iph->tos), protocol, mark, flow_flags);
1055         rt = __ip_route_output_key(net, &fl4);
1056         if (!IS_ERR(rt)) {
1057                 __ip_rt_update_pmtu(rt, &fl4, mtu);
1058                 ip_rt_put(rt);
1059         }
1060 }
1061 EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
1062
1063 static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1064 {
1065         const struct iphdr *iph = (const struct iphdr *) skb->data;
1066         struct flowi4 fl4;
1067         struct rtable *rt;
1068
1069         __build_flow_key(sock_net(sk), &fl4, sk, iph, 0, 0, 0, 0, 0);
1070
1071         if (!fl4.flowi4_mark)
1072                 fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark);
1073
1074         rt = __ip_route_output_key(sock_net(sk), &fl4);
1075         if (!IS_ERR(rt)) {
1076                 __ip_rt_update_pmtu(rt, &fl4, mtu);
1077                 ip_rt_put(rt);
1078         }
1079 }
1080
1081 void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1082 {
1083         const struct iphdr *iph = (const struct iphdr *) skb->data;
1084         struct flowi4 fl4;
1085         struct rtable *rt;
1086         struct dst_entry *odst = NULL;
1087         bool new = false;
1088         struct net *net = sock_net(sk);
1089
1090         bh_lock_sock(sk);
1091
1092         if (!ip_sk_accept_pmtu(sk))
1093                 goto out;
1094
1095         odst = sk_dst_get(sk);
1096
1097         if (sock_owned_by_user(sk) || !odst) {
1098                 __ipv4_sk_update_pmtu(skb, sk, mtu);
1099                 goto out;
1100         }
1101
1102         __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
1103
1104         rt = (struct rtable *)odst;
1105         if (odst->obsolete && !odst->ops->check(odst, 0)) {
1106                 rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1107                 if (IS_ERR(rt))
1108                         goto out;
1109
1110                 new = true;
1111         }
1112
1113         __ip_rt_update_pmtu((struct rtable *) xfrm_dst_path(&rt->dst), &fl4, mtu);
1114
1115         if (!dst_check(&rt->dst, 0)) {
1116                 if (new)
1117                         dst_release(&rt->dst);
1118
1119                 rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1120                 if (IS_ERR(rt))
1121                         goto out;
1122
1123                 new = true;
1124         }
1125
1126         if (new)
1127                 sk_dst_set(sk, &rt->dst);
1128
1129 out:
1130         bh_unlock_sock(sk);
1131         dst_release(odst);
1132 }
1133 EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
1134
1135 void ipv4_redirect(struct sk_buff *skb, struct net *net,
1136                    int oif, u32 mark, u8 protocol, int flow_flags)
1137 {
1138         const struct iphdr *iph = (const struct iphdr *) skb->data;
1139         struct flowi4 fl4;
1140         struct rtable *rt;
1141
1142         __build_flow_key(net, &fl4, NULL, iph, oif,
1143                          RT_TOS(iph->tos), protocol, mark, flow_flags);
1144         rt = __ip_route_output_key(net, &fl4);
1145         if (!IS_ERR(rt)) {
1146                 __ip_do_redirect(rt, skb, &fl4, false);
1147                 ip_rt_put(rt);
1148         }
1149 }
1150 EXPORT_SYMBOL_GPL(ipv4_redirect);
1151
1152 void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
1153 {
1154         const struct iphdr *iph = (const struct iphdr *) skb->data;
1155         struct flowi4 fl4;
1156         struct rtable *rt;
1157         struct net *net = sock_net(sk);
1158
1159         __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
1160         rt = __ip_route_output_key(net, &fl4);
1161         if (!IS_ERR(rt)) {
1162                 __ip_do_redirect(rt, skb, &fl4, false);
1163                 ip_rt_put(rt);
1164         }
1165 }
1166 EXPORT_SYMBOL_GPL(ipv4_sk_redirect);
1167
1168 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1169 {
1170         struct rtable *rt = (struct rtable *) dst;
1171
1172         /* All IPv4 dsts are created with ->obsolete set to
1173          * DST_OBSOLETE_FORCE_CHK, which always forces validation calls
1174          * down into this function.
1175          *
1176          * When a PMTU/redirect information update invalidates a route,
1177          * this is indicated by setting obsolete to DST_OBSOLETE_KILL or
1178          * DST_OBSOLETE_DEAD by dst_free().
1179          */
1180         if (dst->obsolete != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt))
1181                 return NULL;
1182         return dst;
1183 }
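/* Usage note for ipv4_dst_check() above: dst_check() on any cached IPv4
 * route lands here because every dst starts life as
 * DST_OBSOLETE_FORCE_CHK.  A NULL return tells the holder to drop its
 * reference and redo the route lookup.
 */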
1184
1185 static void ipv4_link_failure(struct sk_buff *skb)
1186 {
1187         struct rtable *rt;
1188
1189         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1190
1191         rt = skb_rtable(skb);
1192         if (rt)
1193                 dst_set_expires(&rt->dst, 0);
1194 }
1195
1196 static int ip_rt_bug(struct net *net, struct sock *sk, struct sk_buff *skb)
1197 {
1198         pr_debug("%s: %pI4 -> %pI4, %s\n",
1199                  __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1200                  skb->dev ? skb->dev->name : "?");
1201         kfree_skb(skb);
1202         WARN_ON(1);
1203         return 0;
1204 }
1205
1206 /*
1207    We do not cache the source address of the outgoing interface,
1208    because it is used only by the IP RR, TS and SRR options,
1209    so it is out of the fast path.
1210
1211    BTW remember: "addr" is allowed to be unaligned
1212    in IP options!
1213  */
1214
1215 void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1216 {
1217         __be32 src;
1218
1219         if (rt_is_output_route(rt))
1220                 src = ip_hdr(skb)->saddr;
1221         else {
1222                 struct fib_result res;
1223                 struct flowi4 fl4;
1224                 struct iphdr *iph;
1225
1226                 iph = ip_hdr(skb);
1227
1228                 memset(&fl4, 0, sizeof(fl4));
1229                 fl4.daddr = iph->daddr;
1230                 fl4.saddr = iph->saddr;
1231                 fl4.flowi4_tos = RT_TOS(iph->tos);
1232                 fl4.flowi4_oif = rt->dst.dev->ifindex;
1233                 fl4.flowi4_iif = skb->dev->ifindex;
1234                 fl4.flowi4_mark = skb->mark;
1235
1236                 rcu_read_lock();
1237                 if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res, 0) == 0)
1238                         src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
1239                 else
1240                         src = inet_select_addr(rt->dst.dev,
1241                                                rt_nexthop(rt, iph->daddr),
1242                                                RT_SCOPE_UNIVERSE);
1243                 rcu_read_unlock();
1244         }
1245         memcpy(addr, &src, 4);
1246 }
1247
1248 #ifdef CONFIG_IP_ROUTE_CLASSID
1249 static void set_class_tag(struct rtable *rt, u32 tag)
1250 {
1251         if (!(rt->dst.tclassid & 0xFFFF))
1252                 rt->dst.tclassid |= tag & 0xFFFF;
1253         if (!(rt->dst.tclassid & 0xFFFF0000))
1254                 rt->dst.tclassid |= tag & 0xFFFF0000;
1255 }
1256 #endif
1257
1258 static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1259 {
1260         unsigned int header_size = sizeof(struct tcphdr) + sizeof(struct iphdr);
1261         unsigned int advmss = max_t(unsigned int, ipv4_mtu(dst) - header_size,
1262                                     ip_rt_min_advmss);
1263
1264         return min(advmss, IPV4_MAX_PMTU - header_size);
1265 }
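/* Example for ipv4_default_advmss() above: with a 1500-byte path MTU the
 * advertised MSS is 1500 - 40 = 1460 bytes (40 bytes of IPv4 + TCP
 * headers), never below ip_rt_min_advmss (256) and never above
 * IPV4_MAX_PMTU - 40.
 */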
1266
1267 static unsigned int ipv4_mtu(const struct dst_entry *dst)
1268 {
1269         const struct rtable *rt = (const struct rtable *) dst;
1270         unsigned int mtu = rt->rt_pmtu;
1271
1272         if (!mtu || time_after_eq(jiffies, rt->dst.expires))
1273                 mtu = dst_metric_raw(dst, RTAX_MTU);
1274
1275         if (mtu)
1276                 return mtu;
1277
1278         mtu = READ_ONCE(dst->dev->mtu);
1279
1280         if (unlikely(ip_mtu_locked(dst))) {
1281                 if (rt->rt_uses_gateway && mtu > 576)
1282                         mtu = 576;
1283         }
1284
1285         mtu = min_t(unsigned int, mtu, IP_MAX_MTU);
1286
1287         return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
1288 }
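/* Resolution order in ipv4_mtu() above: an unexpired learned PMTU wins,
 * then an explicit RTAX_MTU route metric, then the egress device MTU; in
 * the last case a locked MTU on a gatewayed route is capped at 576, the
 * historical default for non-local destinations.  Any lwtunnel
 * encapsulation headroom is subtracted at the end.
 */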
1289
1290 static void ip_del_fnhe(struct fib_nh *nh, __be32 daddr)
1291 {
1292         struct fnhe_hash_bucket *hash;
1293         struct fib_nh_exception *fnhe, __rcu **fnhe_p;
1294         u32 hval = fnhe_hashfun(daddr);
1295
1296         spin_lock_bh(&fnhe_lock);
1297
1298         hash = rcu_dereference_protected(nh->nh_exceptions,
1299                                          lockdep_is_held(&fnhe_lock));
1300         hash += hval;
1301
1302         fnhe_p = &hash->chain;
1303         fnhe = rcu_dereference_protected(*fnhe_p, lockdep_is_held(&fnhe_lock));
1304         while (fnhe) {
1305                 if (fnhe->fnhe_daddr == daddr) {
1306                         rcu_assign_pointer(*fnhe_p, rcu_dereference_protected(
1307                                 fnhe->fnhe_next, lockdep_is_held(&fnhe_lock)));
1308                         fnhe_flush_routes(fnhe);
1309                         kfree_rcu(fnhe, rcu);
1310                         break;
1311                 }
1312                 fnhe_p = &fnhe->fnhe_next;
1313                 fnhe = rcu_dereference_protected(fnhe->fnhe_next,
1314                                                  lockdep_is_held(&fnhe_lock));
1315         }
1316
1317         spin_unlock_bh(&fnhe_lock);
1318 }
1319
1320 static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
1321 {
1322         struct fnhe_hash_bucket *hash = rcu_dereference(nh->nh_exceptions);
1323         struct fib_nh_exception *fnhe;
1324         u32 hval;
1325
1326         if (!hash)
1327                 return NULL;
1328
1329         hval = fnhe_hashfun(daddr);
1330
1331         for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
1332              fnhe = rcu_dereference(fnhe->fnhe_next)) {
1333                 if (fnhe->fnhe_daddr == daddr) {
1334                         if (fnhe->fnhe_expires &&
1335                             time_after(jiffies, fnhe->fnhe_expires)) {
1336                                 ip_del_fnhe(nh, daddr);
1337                                 break;
1338                         }
1339                         return fnhe;
1340                 }
1341         }
1342         return NULL;
1343 }
1344
1345 /* MTU selection:
1346  * 1. mtu on route is locked - use it
1347  * 2. mtu from nexthop exception
1348  * 3. mtu from egress device
1349  */
1350
1351 u32 ip_mtu_from_fib_result(struct fib_result *res, __be32 daddr)
1352 {
1353         struct fib_info *fi = res->fi;
1354         struct fib_nh *nh = &fi->fib_nh[res->nh_sel];
1355         struct net_device *dev = nh->nh_dev;
1356         u32 mtu = 0;
1357
1358         if (dev_net(dev)->ipv4.sysctl_ip_fwd_use_pmtu ||
1359             fi->fib_metrics->metrics[RTAX_LOCK - 1] & (1 << RTAX_MTU))
1360                 mtu = fi->fib_mtu;
1361
1362         if (likely(!mtu)) {
1363                 struct fib_nh_exception *fnhe;
1364
1365                 fnhe = find_exception(nh, daddr);
1366                 if (fnhe && !time_after_eq(jiffies, fnhe->fnhe_expires))
1367                         mtu = fnhe->fnhe_pmtu;
1368         }
1369
1370         if (likely(!mtu))
1371                 mtu = min(READ_ONCE(dev->mtu), IP_MAX_MTU);
1372
1373         return mtu - lwtunnel_headroom(nh->nh_lwtstate, mtu);
1374 }
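/* Note on step 1 above: in ip_mtu_from_fib_result() the route's own MTU
 * metric is only consulted when net.ipv4.ip_forward_use_pmtu is enabled
 * or the metric is locked; otherwise an unexpired nexthop exception
 * (step 2) and finally the device MTU (step 3) decide, again minus any
 * lwtunnel headroom.
 */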
1375
1376 static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
1377                               __be32 daddr, const bool do_cache)
1378 {
1379         bool ret = false;
1380
1381         spin_lock_bh(&fnhe_lock);
1382
1383         if (daddr == fnhe->fnhe_daddr) {
1384                 struct rtable __rcu **porig;
1385                 struct rtable *orig;
1386                 int genid = fnhe_genid(dev_net(rt->dst.dev));
1387
1388                 if (rt_is_input_route(rt))
1389                         porig = &fnhe->fnhe_rth_input;
1390                 else
1391                         porig = &fnhe->fnhe_rth_output;
1392                 orig = rcu_dereference(*porig);
1393
1394                 if (fnhe->fnhe_genid != genid) {
1395                         fnhe->fnhe_genid = genid;
1396                         fnhe->fnhe_gw = 0;
1397                         fnhe->fnhe_pmtu = 0;
1398                         fnhe->fnhe_expires = 0;
1399                         fnhe->fnhe_mtu_locked = false;
1400                         fnhe_flush_routes(fnhe);
1401                         orig = NULL;
1402                 }
1403                 fill_route_from_fnhe(rt, fnhe);
1404                 if (!rt->rt_gateway)
1405                         rt->rt_gateway = daddr;
1406
1407                 if (do_cache) {
1408                         dst_hold(&rt->dst);
1409                         rcu_assign_pointer(*porig, rt);
1410                         if (orig) {
1411                                 dst_dev_put(&orig->dst);
1412                                 dst_release(&orig->dst);
1413                         }
1414                         ret = true;
1415                 }
1416
1417                 fnhe->fnhe_stamp = jiffies;
1418         }
1419         spin_unlock_bh(&fnhe_lock);
1420
1421         return ret;
1422 }
1423
1424 static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt)
1425 {
1426         struct rtable *orig, *prev, **p;
1427         bool ret = true;
1428
1429         if (rt_is_input_route(rt)) {
1430                 p = (struct rtable **)&nh->nh_rth_input;
1431         } else {
1432                 p = (struct rtable **)raw_cpu_ptr(nh->nh_pcpu_rth_output);
1433         }
1434         orig = *p;
1435
1436         /* hold dst before doing cmpxchg() to avoid race condition
1437          * on this dst
1438          */
1439         dst_hold(&rt->dst);
1440         prev = cmpxchg(p, orig, rt);
1441         if (prev == orig) {
1442                 if (orig) {
1443                         dst_dev_put(&orig->dst);
1444                         dst_release(&orig->dst);
1445                 }
1446         } else {
1447                 dst_release(&rt->dst);
1448                 ret = false;
1449         }
1450
1451         return ret;
1452 }
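/* Design note for rt_cache_route() above: the dst_hold() taken before
 * cmpxchg() means the cache slot always owns a full reference.  If
 * another CPU wins the race (prev != orig), the speculative hold is
 * dropped and the false return tells the caller this route stays
 * uncached.
 */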
1453
1454 struct uncached_list {
1455         spinlock_t              lock;
1456         struct list_head        head;
1457 };
1458
1459 static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt_uncached_list);
1460
1461 void rt_add_uncached_list(struct rtable *rt)
1462 {
1463         struct uncached_list *ul = raw_cpu_ptr(&rt_uncached_list);
1464
1465         rt->rt_uncached_list = ul;
1466
1467         spin_lock_bh(&ul->lock);
1468         list_add_tail(&rt->rt_uncached, &ul->head);
1469         spin_unlock_bh(&ul->lock);
1470 }
1471
1472 void rt_del_uncached_list(struct rtable *rt)
1473 {
1474         if (!list_empty(&rt->rt_uncached)) {
1475                 struct uncached_list *ul = rt->rt_uncached_list;
1476
1477                 spin_lock_bh(&ul->lock);
1478                 list_del(&rt->rt_uncached);
1479                 spin_unlock_bh(&ul->lock);
1480         }
1481 }
1482
1483 static void ipv4_dst_destroy(struct dst_entry *dst)
1484 {
1485         struct dst_metrics *p = (struct dst_metrics *)DST_METRICS_PTR(dst);
1486         struct rtable *rt = (struct rtable *)dst;
1487
1488         if (p != &dst_default_metrics && refcount_dec_and_test(&p->refcnt))
1489                 kfree(p);
1490
1491         rt_del_uncached_list(rt);
1492 }
1493
1494 void rt_flush_dev(struct net_device *dev)
1495 {
1496         struct net *net = dev_net(dev);
1497         struct rtable *rt;
1498         int cpu;
1499
1500         for_each_possible_cpu(cpu) {
1501                 struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
1502
1503                 spin_lock_bh(&ul->lock);
1504                 list_for_each_entry(rt, &ul->head, rt_uncached) {
1505                         if (rt->dst.dev != dev)
1506                                 continue;
1507                         rt->dst.dev = net->loopback_dev;
1508                         dev_hold(rt->dst.dev);
1509                         dev_put(dev);
1510                 }
1511                 spin_unlock_bh(&ul->lock);
1512         }
1513 }
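/* rt_flush_dev() above runs when a device is going away: uncached
 * routes cannot be reached through the FIB, so their dst.dev is
 * repointed at the loopback device, keeping the device reference valid
 * until each dst is finally released.
 */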
1514
1515 static bool rt_cache_valid(const struct rtable *rt)
1516 {
1517         return  rt &&
1518                 rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1519                 !rt_is_expired(rt);
1520 }
1521
1522 static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
1523                            const struct fib_result *res,
1524                            struct fib_nh_exception *fnhe,
1525                            struct fib_info *fi, u16 type, u32 itag,
1526                            const bool do_cache)
1527 {
1528         bool cached = false;
1529
1530         if (fi) {
1531                 struct fib_nh *nh = &FIB_RES_NH(*res);
1532
1533                 if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK) {
1534                         rt->rt_gateway = nh->nh_gw;
1535                         rt->rt_uses_gateway = 1;
1536                 }
1537                 dst_init_metrics(&rt->dst, fi->fib_metrics->metrics, true);
1538                 if (fi->fib_metrics != &dst_default_metrics) {
1539                         rt->dst._metrics |= DST_METRICS_REFCOUNTED;
1540                         refcount_inc(&fi->fib_metrics->refcnt);
1541                 }
1542 #ifdef CONFIG_IP_ROUTE_CLASSID
1543                 rt->dst.tclassid = nh->nh_tclassid;
1544 #endif
1545                 rt->dst.lwtstate = lwtstate_get(nh->nh_lwtstate);
1546                 if (unlikely(fnhe))
1547                         cached = rt_bind_exception(rt, fnhe, daddr, do_cache);
1548                 else if (do_cache)
1549                         cached = rt_cache_route(nh, rt);
1550                 if (unlikely(!cached)) {
1551                         /* Routes we intend to cache in the nexthop exception or
1552                          * FIB nexthop have the DST_NOCACHE bit clear.
1553                          * However, if we are unsuccessful at storing this
1554                          * route into the cache, we really need to set it.
1555                          */
1556                         if (!rt->rt_gateway)
1557                                 rt->rt_gateway = daddr;
1558                         rt_add_uncached_list(rt);
1559                 }
1560         } else
1561                 rt_add_uncached_list(rt);
1562
1563 #ifdef CONFIG_IP_ROUTE_CLASSID
1564 #ifdef CONFIG_IP_MULTIPLE_TABLES
1565         set_class_tag(rt, res->tclassid);
1566 #endif
1567         set_class_tag(rt, itag);
1568 #endif
1569 }
1570
1571 struct rtable *rt_dst_alloc(struct net_device *dev,
1572                             unsigned int flags, u16 type,
1573                             bool nopolicy, bool noxfrm, bool will_cache)
1574 {
1575         struct rtable *rt;
1576
1577         rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1578                        (will_cache ? 0 : DST_HOST) |
1579                        (nopolicy ? DST_NOPOLICY : 0) |
1580                        (noxfrm ? DST_NOXFRM : 0));
1581
1582         if (rt) {
1583                 rt->rt_genid = rt_genid_ipv4(dev_net(dev));
1584                 rt->rt_flags = flags;
1585                 rt->rt_type = type;
1586                 rt->rt_is_input = 0;
1587                 rt->rt_iif = 0;
1588                 rt->rt_pmtu = 0;
1589                 rt->rt_mtu_locked = 0;
1590                 rt->rt_gateway = 0;
1591                 rt->rt_uses_gateway = 0;
1592                 INIT_LIST_HEAD(&rt->rt_uncached);
1593
1594                 rt->dst.output = ip_output;
1595                 if (flags & RTCF_LOCAL)
1596                         rt->dst.input = ip_local_deliver;
1597         }
1598
1599         return rt;
1600 }
1601 EXPORT_SYMBOL(rt_dst_alloc);
1602
1603 /* called in rcu_read_lock() section */
1604 int ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1605                           u8 tos, struct net_device *dev,
1606                           struct in_device *in_dev, u32 *itag)
1607 {
1608         int err;
1609
1610         /* Primary sanity checks. */
1611         if (!in_dev)
1612                 return -EINVAL;
1613
1614         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1615             skb->protocol != htons(ETH_P_IP))
1616                 return -EINVAL;
1617
1618         if (ipv4_is_loopback(saddr) && !IN_DEV_ROUTE_LOCALNET(in_dev))
1619                 return -EINVAL;
1620
1621         if (ipv4_is_zeronet(saddr)) {
1622                 if (!ipv4_is_local_multicast(daddr))
1623                         return -EINVAL;
1624         } else {
1625                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1626                                           in_dev, itag);
1627                 if (err < 0)
1628                         return err;
1629         }
1630         return 0;
1631 }
1632
1633 /* called in rcu_read_lock() section */
1634 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1635                              u8 tos, struct net_device *dev, int our)
1636 {
1637         struct in_device *in_dev = __in_dev_get_rcu(dev);
1638         unsigned int flags = RTCF_MULTICAST;
1639         struct rtable *rth;
1640         u32 itag = 0;
1641         int err;
1642
1643         err = ip_mc_validate_source(skb, daddr, saddr, tos, dev, in_dev, &itag);
1644         if (err)
1645                 return err;
1646
1647         if (our)
1648                 flags |= RTCF_LOCAL;
1649
1650         rth = rt_dst_alloc(dev_net(dev)->loopback_dev, flags, RTN_MULTICAST,
1651                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
1652         if (!rth)
1653                 return -ENOBUFS;
1654
1655 #ifdef CONFIG_IP_ROUTE_CLASSID
1656         rth->dst.tclassid = itag;
1657 #endif
1658         rth->dst.output = ip_rt_bug;
1659         rth->rt_is_input = 1;
1660
1661 #ifdef CONFIG_IP_MROUTE
1662         if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1663                 rth->dst.input = ip_mr_input;
1664 #endif
1665         RT_CACHE_STAT_INC(in_slow_mc);
1666
1667         skb_dst_set(skb, &rth->dst);
1668         return 0;
1669 }
1670
1671
1672 static void ip_handle_martian_source(struct net_device *dev,
1673                                      struct in_device *in_dev,
1674                                      struct sk_buff *skb,
1675                                      __be32 daddr,
1676                                      __be32 saddr)
1677 {
1678         RT_CACHE_STAT_INC(in_martian_src);
1679 #ifdef CONFIG_IP_ROUTE_VERBOSE
1680         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1681                 /*
1682                  *      RFC1812 recommendation: if the source is martian,
1683                  *      the only hint we have is the MAC header.
1684                  */
1685                 pr_warn("martian source %pI4 from %pI4, on dev %s\n",
1686                         &daddr, &saddr, dev->name);
1687                 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1688                         print_hex_dump(KERN_WARNING, "ll header: ",
1689                                        DUMP_PREFIX_OFFSET, 16, 1,
1690                                        skb_mac_header(skb),
1691                                        dev->hard_header_len, true);
1692                 }
1693         }
1694 #endif
1695 }
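/* Martian-source logging is gated per device by the log_martians devconf;
 * a typical way to enable it system-wide (illustrative, from userspace):
 *
 *   sysctl -w net.ipv4.conf.all.log_martians=1
 */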
1696
1697 /* called in rcu_read_lock() section */
1698 static int __mkroute_input(struct sk_buff *skb,
1699                            const struct fib_result *res,
1700                            struct in_device *in_dev,
1701                            __be32 daddr, __be32 saddr, u32 tos)
1702 {
1703         struct fib_nh_exception *fnhe;
1704         struct rtable *rth;
1705         int err;
1706         struct in_device *out_dev;
1707         bool do_cache;
1708         u32 itag = 0;
1709
1710         /* get a working reference to the output device */
1711         out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
1712         if (!out_dev) {
1713                 net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
1714                 return -EINVAL;
1715         }
1716
1717         err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
1718                                   in_dev->dev, in_dev, &itag);
1719         if (err < 0) {
1720                 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1721                                          saddr);
1722
1723                 goto cleanup;
1724         }
1725
1726         do_cache = res->fi && !itag;
1727         if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
1728             skb->protocol == htons(ETH_P_IP) &&
1729             (IN_DEV_SHARED_MEDIA(out_dev) ||
1730              inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1731                 IPCB(skb)->flags |= IPSKB_DOREDIRECT;
1732
1733         if (skb->protocol != htons(ETH_P_IP)) {
1734                 /* Not IP (i.e. ARP). Do not create a route if it is
1735                  * invalid for proxy arp. DNAT routes are always valid.
1736                  *
1737                  * The proxy arp feature has been extended to allow ARP
1738                  * replies back out the same interface, to support
1739                  * Private VLAN switch technologies. See arp.c.
1740                  */
1741                 if (out_dev == in_dev &&
1742                     IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
1743                         err = -EINVAL;
1744                         goto cleanup;
1745                 }
1746         }
1747
1748         fnhe = find_exception(&FIB_RES_NH(*res), daddr);
1749         if (do_cache) {
1750                 if (fnhe)
1751                         rth = rcu_dereference(fnhe->fnhe_rth_input);
1752                 else
1753                         rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
1754                 if (rt_cache_valid(rth)) {
1755                         skb_dst_set_noref(skb, &rth->dst);
1756                         goto out;
1757                 }
1758         }
1759
1760         rth = rt_dst_alloc(out_dev->dev, 0, res->type,
1761                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
1762                            IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache);
1763         if (!rth) {
1764                 err = -ENOBUFS;
1765                 goto cleanup;
1766         }
1767
1768         rth->rt_is_input = 1;
1769         RT_CACHE_STAT_INC(in_slow_tot);
1770
1771         rth->dst.input = ip_forward;
1772
1773         rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag,
1774                        do_cache);
1775         lwtunnel_set_redirect(&rth->dst);
1776         skb_dst_set(skb, &rth->dst);
1777 out:
1778         err = 0;
1779  cleanup:
1780         return err;
1781 }
1782
1783 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1784 /* To make ICMP packets follow the right flow, the multipath hash is
1785  * calculated from the inner IP addresses.
1786  */
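/* A hedged sketch of why the inner header matters: an ICMP error (say
 * ICMP_DEST_UNREACH) generated by an intermediate router embeds the header
 * of the packet that triggered it. Hashing the outer header would key on
 * the reporting router's own address and could pick an unrelated multipath
 * leg; hashing the embedded addresses keeps the error on the same leg as
 * the flow it refers to.
 */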
1787 static void ip_multipath_l3_keys(const struct sk_buff *skb,
1788                                  struct flow_keys *hash_keys)
1789 {
1790         const struct iphdr *outer_iph = ip_hdr(skb);
1791         const struct iphdr *key_iph = outer_iph;
1792         const struct iphdr *inner_iph;
1793         const struct icmphdr *icmph;
1794         struct iphdr _inner_iph;
1795         struct icmphdr _icmph;
1796
1797         if (likely(outer_iph->protocol != IPPROTO_ICMP))
1798                 goto out;
1799
1800         if (unlikely((outer_iph->frag_off & htons(IP_OFFSET)) != 0))
1801                 goto out;
1802
1803         icmph = skb_header_pointer(skb, outer_iph->ihl * 4, sizeof(_icmph),
1804                                    &_icmph);
1805         if (!icmph)
1806                 goto out;
1807
1808         if (icmph->type != ICMP_DEST_UNREACH &&
1809             icmph->type != ICMP_REDIRECT &&
1810             icmph->type != ICMP_TIME_EXCEEDED &&
1811             icmph->type != ICMP_PARAMETERPROB)
1812                 goto out;
1813
1814         inner_iph = skb_header_pointer(skb,
1815                                        outer_iph->ihl * 4 + sizeof(_icmph),
1816                                        sizeof(_inner_iph), &_inner_iph);
1817         if (!inner_iph)
1818                 goto out;
1819
1820         key_iph = inner_iph;
1821 out:
1822         hash_keys->addrs.v4addrs.src = key_iph->saddr;
1823         hash_keys->addrs.v4addrs.dst = key_iph->daddr;
1824 }
1825
1826 /* if skb is set, it will be used and fl4 can be NULL */
1827 int fib_multipath_hash(const struct net *net, const struct flowi4 *fl4,
1828                        const struct sk_buff *skb, struct flow_keys *flkeys)
1829 {
1830         struct flow_keys hash_keys;
1831         u32 mhash;
1832
1833         switch (net->ipv4.sysctl_fib_multipath_hash_policy) {
1834         case 0:
1835                 memset(&hash_keys, 0, sizeof(hash_keys));
1836                 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1837                 if (skb) {
1838                         ip_multipath_l3_keys(skb, &hash_keys);
1839                 } else {
1840                         hash_keys.addrs.v4addrs.src = fl4->saddr;
1841                         hash_keys.addrs.v4addrs.dst = fl4->daddr;
1842                 }
1843                 break;
1844         case 1:
1845                 /* skb is currently provided only when forwarding */
1846                 if (skb) {
1847                         unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
1848                         struct flow_keys keys;
1849
1850                         /* short-circuit if we already have L4 hash present */
1851                         if (skb->l4_hash)
1852                                 return skb_get_hash_raw(skb) >> 1;
1853
1854                         memset(&hash_keys, 0, sizeof(hash_keys));
1855
1856                         if (!flkeys) {
1857                                 skb_flow_dissect_flow_keys(skb, &keys, flag);
1858                                 flkeys = &keys;
1859                         }
1860
1861                         hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1862                         hash_keys.addrs.v4addrs.src = flkeys->addrs.v4addrs.src;
1863                         hash_keys.addrs.v4addrs.dst = flkeys->addrs.v4addrs.dst;
1864                         hash_keys.ports.src = flkeys->ports.src;
1865                         hash_keys.ports.dst = flkeys->ports.dst;
1866                         hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
1867                 } else {
1868                         memset(&hash_keys, 0, sizeof(hash_keys));
1869                         hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1870                         hash_keys.addrs.v4addrs.src = fl4->saddr;
1871                         hash_keys.addrs.v4addrs.dst = fl4->daddr;
1872                         hash_keys.ports.src = fl4->fl4_sport;
1873                         hash_keys.ports.dst = fl4->fl4_dport;
1874                         hash_keys.basic.ip_proto = fl4->flowi4_proto;
1875                 }
1876                 break;
1877         }
1878         mhash = flow_hash_from_keys(&hash_keys);
1879
1880         return mhash >> 1;
1881 }
1882 #endif /* CONFIG_IP_ROUTE_MULTIPATH */
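/* A minimal userspace sketch of selecting between the two policies handled
 * above (illustrative; assumes the sysctl is exposed on the system):
 *
 *   # 0: hash on the L3 source/destination addresses (the default)
 *   sysctl -w net.ipv4.fib_multipath_hash_policy=0
 *   # 1: hash on the L4 five-tuple, dissected from the skb when forwarding
 *   sysctl -w net.ipv4.fib_multipath_hash_policy=1
 */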
1883
1884 static int ip_mkroute_input(struct sk_buff *skb,
1885                             struct fib_result *res,
1886                             struct in_device *in_dev,
1887                             __be32 daddr, __be32 saddr, u32 tos,
1888                             struct flow_keys *hkeys)
1889 {
1890 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1891         if (res->fi && res->fi->fib_nhs > 1) {
1892                 int h = fib_multipath_hash(res->fi->fib_net, NULL, skb, hkeys);
1893
1894                 fib_select_multipath(res, h);
1895         }
1896 #endif
1897
1898         /* create a routing cache entry */
1899         return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
1900 }
1901
1902 /*
1903  *      NOTE. We drop all packets that have local source
1904  *      addresses, because every properly looped-back packet
1905  *      must already have the correct destination attached by the output routine.
1906  *
1907  *      This approach solves two big problems:
1908  *      1. Non-simplex devices are handled properly.
1909  *      2. IP spoofing attempts are filtered with a 100% guarantee.
1910  *      Called with rcu_read_lock().
1911  */
1912
1913 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1914                                u8 tos, struct net_device *dev,
1915                                struct fib_result *res)
1916 {
1917         struct in_device *in_dev = __in_dev_get_rcu(dev);
1918         struct flow_keys *flkeys = NULL, _flkeys;
1919         struct net    *net = dev_net(dev);
1920         struct ip_tunnel_info *tun_info;
1921         int             err = -EINVAL;
1922         unsigned int    flags = 0;
1923         u32             itag = 0;
1924         struct rtable   *rth;
1925         struct flowi4   fl4;
1926         bool do_cache;
1927
1928         /* IP on this device is disabled. */
1929
1930         if (!in_dev)
1931                 goto out;
1932
1933         /* Check for the weirdest martians, which cannot be detected
1934            by fib_lookup.
1935          */
1936
1937         tun_info = skb_tunnel_info(skb);
1938         if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
1939                 fl4.flowi4_tun_key.tun_id = tun_info->key.tun_id;
1940         else
1941                 fl4.flowi4_tun_key.tun_id = 0;
1942         skb_dst_drop(skb);
1943
1944         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
1945                 goto martian_source;
1946
1947         res->fi = NULL;
1948         res->table = NULL;
1949         if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
1950                 goto brd_input;
1951
1952         /* Accept zero addresses only to limited broadcast;
1953          * I do not even know whether to fix this or not. Waiting for complaints :-)
1954          */
1955         if (ipv4_is_zeronet(saddr))
1956                 goto martian_source;
1957
1958         if (ipv4_is_zeronet(daddr))
1959                 goto martian_destination;
1960
1961         /* The following code tries to avoid calling IN_DEV_NET_ROUTE_LOCALNET()
1962          * and calls it at most once when daddr and/or saddr are loopback addresses.
1963          */
1964         if (ipv4_is_loopback(daddr)) {
1965                 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
1966                         goto martian_destination;
1967         } else if (ipv4_is_loopback(saddr)) {
1968                 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
1969                         goto martian_source;
1970         }
1971
1972         /*
1973          *      Now we are ready to route packet.
1974          */
1975         fl4.flowi4_oif = 0;
1976         fl4.flowi4_iif = dev->ifindex;
1977         fl4.flowi4_mark = skb->mark;
1978         fl4.flowi4_tos = tos;
1979         fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
1980         fl4.flowi4_flags = 0;
1981         fl4.daddr = daddr;
1982         fl4.saddr = saddr;
1983         fl4.flowi4_uid = sock_net_uid(net, NULL);
1984
1985         if (fib4_rules_early_flow_dissect(net, skb, &fl4, &_flkeys)) {
1986                 flkeys = &_flkeys;
1987         } else {
1988                 fl4.flowi4_proto = 0;
1989                 fl4.fl4_sport = 0;
1990                 fl4.fl4_dport = 0;
1991         }
1992
1993         err = fib_lookup(net, &fl4, res, 0);
1994         if (err != 0) {
1995                 if (!IN_DEV_FORWARD(in_dev))
1996                         err = -EHOSTUNREACH;
1997                 goto no_route;
1998         }
1999
2000         if (res->type == RTN_BROADCAST) {
2001                 if (IN_DEV_BFORWARD(in_dev))
2002                         goto make_route;
2003                 goto brd_input;
2004         }
2005
2006         if (res->type == RTN_LOCAL) {
2007                 err = fib_validate_source(skb, saddr, daddr, tos,
2008                                           0, dev, in_dev, &itag);
2009                 if (err < 0)
2010                         goto martian_source;
2011                 goto local_input;
2012         }
2013
2014         if (!IN_DEV_FORWARD(in_dev)) {
2015                 err = -EHOSTUNREACH;
2016                 goto no_route;
2017         }
2018         if (res->type != RTN_UNICAST)
2019                 goto martian_destination;
2020
2021 make_route:
2022         err = ip_mkroute_input(skb, res, in_dev, daddr, saddr, tos, flkeys);
2023 out:    return err;
2024
2025 brd_input:
2026         if (skb->protocol != htons(ETH_P_IP))
2027                 goto e_inval;
2028
2029         if (!ipv4_is_zeronet(saddr)) {
2030                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
2031                                           in_dev, &itag);
2032                 if (err < 0)
2033                         goto martian_source;
2034         }
2035         flags |= RTCF_BROADCAST;
2036         res->type = RTN_BROADCAST;
2037         RT_CACHE_STAT_INC(in_brd);
2038
2039 local_input:
2040         do_cache = false;
2041         if (res->fi) {
2042                 if (!itag) {
2043                         rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
2044                         if (rt_cache_valid(rth)) {
2045                                 skb_dst_set_noref(skb, &rth->dst);
2046                                 err = 0;
2047                                 goto out;
2048                         }
2049                         do_cache = true;
2050                 }
2051         }
2052
2053         rth = rt_dst_alloc(l3mdev_master_dev_rcu(dev) ? : net->loopback_dev,
2054                            flags | RTCF_LOCAL, res->type,
2055                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache);
2056         if (!rth)
2057                 goto e_nobufs;
2058
2059         rth->dst.output = ip_rt_bug;
2060 #ifdef CONFIG_IP_ROUTE_CLASSID
2061         rth->dst.tclassid = itag;
2062 #endif
2063         rth->rt_is_input = 1;
2064
2065         RT_CACHE_STAT_INC(in_slow_tot);
2066         if (res->type == RTN_UNREACHABLE) {
2067                 rth->dst.input = ip_error;
2068                 rth->dst.error = -err;
2069                 rth->rt_flags   &= ~RTCF_LOCAL;
2070         }
2071
2072         if (do_cache) {
2073                 struct fib_nh *nh = &FIB_RES_NH(*res);
2074
2075                 rth->dst.lwtstate = lwtstate_get(nh->nh_lwtstate);
2076                 if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
2077                         WARN_ON(rth->dst.input == lwtunnel_input);
2078                         rth->dst.lwtstate->orig_input = rth->dst.input;
2079                         rth->dst.input = lwtunnel_input;
2080                 }
2081
2082                 if (unlikely(!rt_cache_route(nh, rth)))
2083                         rt_add_uncached_list(rth);
2084         }
2085         skb_dst_set(skb, &rth->dst);
2086         err = 0;
2087         goto out;
2088
2089 no_route:
2090         RT_CACHE_STAT_INC(in_no_route);
2091         res->type = RTN_UNREACHABLE;
2092         res->fi = NULL;
2093         res->table = NULL;
2094         goto local_input;
2095
2096         /*
2097          *      Do not cache martian addresses: they should be logged (RFC1812)
2098          */
2099 martian_destination:
2100         RT_CACHE_STAT_INC(in_martian_dst);
2101 #ifdef CONFIG_IP_ROUTE_VERBOSE
2102         if (IN_DEV_LOG_MARTIANS(in_dev))
2103                 net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
2104                                      &daddr, &saddr, dev->name);
2105 #endif
2106
2107 e_inval:
2108         err = -EINVAL;
2109         goto out;
2110
2111 e_nobufs:
2112         err = -ENOBUFS;
2113         goto out;
2114
2115 martian_source:
2116         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2117         goto out;
2118 }
2119
2120 int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2121                          u8 tos, struct net_device *dev)
2122 {
2123         struct fib_result res;
2124         int err;
2125
2126         tos &= IPTOS_RT_MASK;
2127         rcu_read_lock();
2128         err = ip_route_input_rcu(skb, daddr, saddr, tos, dev, &res);
2129         rcu_read_unlock();
2130
2131         return err;
2132 }
2133 EXPORT_SYMBOL(ip_route_input_noref);
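/* A hedged caller-side sketch of this entry point, mirroring how the input
 * path (e.g. ip_rcv_finish()) typically drives it; error handling elided:
 *
 *      const struct iphdr *iph = ip_hdr(skb);
 *      int err = ip_route_input_noref(skb, iph->daddr, iph->saddr,
 *                                     iph->tos, skb->dev);
 *      if (err)
 *              goto drop;      // martian, no route, or -ENOBUFS
 *      // skb now holds a noref dst; dst_input(skb) dispatches it
 */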
2134
2135 /* called with rcu_read_lock held */
2136 int ip_route_input_rcu(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2137                        u8 tos, struct net_device *dev, struct fib_result *res)
2138 {
2139         /* Multicast recognition logic was moved from the route cache to here.
2140            The problem was that too many Ethernet cards have broken/missing
2141            hardware multicast filters :-( As a result, a host on a multicast
2142            network acquires a lot of useless route cache entries, e.g. for
2143            SDR messages from all over the world. Now we try to get rid of them.
2144            Really, provided the software IP multicast filter is organized
2145            reasonably (at least, hashed), this does not result in a slowdown
2146            compared with route cache reject entries.
2147            Note that multicast routers are not affected, because a
2148            route cache entry is created eventually.
2149          */
2150         if (ipv4_is_multicast(daddr)) {
2151                 struct in_device *in_dev = __in_dev_get_rcu(dev);
2152                 int our = 0;
2153                 int err = -EINVAL;
2154
2155                 if (in_dev)
2156                         our = ip_check_mc_rcu(in_dev, daddr, saddr,
2157                                               ip_hdr(skb)->protocol);
2158
2159                 /* check l3 master if no match yet */
2160                 if ((!in_dev || !our) && netif_is_l3_slave(dev)) {
2161                         struct in_device *l3_in_dev;
2162
2163                         l3_in_dev = __in_dev_get_rcu(skb->dev);
2164                         if (l3_in_dev)
2165                                 our = ip_check_mc_rcu(l3_in_dev, daddr, saddr,
2166                                                       ip_hdr(skb)->protocol);
2167                 }
2168
2169                 if (our
2170 #ifdef CONFIG_IP_MROUTE
2171                         ||
2172                     (!ipv4_is_local_multicast(daddr) &&
2173                      IN_DEV_MFORWARD(in_dev))
2174 #endif
2175                    ) {
2176                         err = ip_route_input_mc(skb, daddr, saddr,
2177                                                 tos, dev, our);
2178                 }
2179                 return err;
2180         }
2181
2182         return ip_route_input_slow(skb, daddr, saddr, tos, dev, res);
2183 }
2184
2185 /* called with rcu_read_lock() */
2186 static struct rtable *__mkroute_output(const struct fib_result *res,
2187                                        const struct flowi4 *fl4, int orig_oif,
2188                                        struct net_device *dev_out,
2189                                        unsigned int flags)
2190 {
2191         struct fib_info *fi = res->fi;
2192         struct fib_nh_exception *fnhe;
2193         struct in_device *in_dev;
2194         u16 type = res->type;
2195         struct rtable *rth;
2196         bool do_cache;
2197
2198         in_dev = __in_dev_get_rcu(dev_out);
2199         if (!in_dev)
2200                 return ERR_PTR(-EINVAL);
2201
2202         if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
2203                 if (ipv4_is_loopback(fl4->saddr) &&
2204                     !(dev_out->flags & IFF_LOOPBACK) &&
2205                     !netif_is_l3_master(dev_out))
2206                         return ERR_PTR(-EINVAL);
2207
2208         if (ipv4_is_lbcast(fl4->daddr))
2209                 type = RTN_BROADCAST;
2210         else if (ipv4_is_multicast(fl4->daddr))
2211                 type = RTN_MULTICAST;
2212         else if (ipv4_is_zeronet(fl4->daddr))
2213                 return ERR_PTR(-EINVAL);
2214
2215         if (dev_out->flags & IFF_LOOPBACK)
2216                 flags |= RTCF_LOCAL;
2217
2218         do_cache = true;
2219         if (type == RTN_BROADCAST) {
2220                 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2221                 fi = NULL;
2222         } else if (type == RTN_MULTICAST) {
2223                 flags |= RTCF_MULTICAST | RTCF_LOCAL;
2224                 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2225                                      fl4->flowi4_proto))
2226                         flags &= ~RTCF_LOCAL;
2227                 else
2228                         do_cache = false;
2229                 /* If a multicast route does not exist, use
2230                  * the default one, but do not use a gateway in this case.
2231                  * Yes, it is a hack.
2232                  */
2233                 if (fi && res->prefixlen < 4)
2234                         fi = NULL;
2235         } else if ((type == RTN_LOCAL) && (orig_oif != 0) &&
2236                    (orig_oif != dev_out->ifindex)) {
2237                 /* For local routes that require a particular output interface
2238                  * we do not want to cache the result.  Caching the result
2239                  * causes incorrect behaviour when there are multiple source
2240                  * addresses on the interface; the end result is that if the
2241                  * intended recipient is waiting on that interface for the
2242                  * packet, it won't be received, because it will be delivered on
2243                  * the loopback interface and the IP_PKTINFO ipi_ifindex will
2244                  * be set to the loopback interface as well.
2245                  */
2246                 do_cache = false;
2247         }
2248
2249         fnhe = NULL;
2250         do_cache &= fi != NULL;
2251         if (fi) {
2252                 struct rtable __rcu **prth;
2253                 struct fib_nh *nh = &FIB_RES_NH(*res);
2254
2255                 fnhe = find_exception(nh, fl4->daddr);
2256                 if (!do_cache)
2257                         goto add;
2258                 if (fnhe) {
2259                         prth = &fnhe->fnhe_rth_output;
2260                 } else {
2261                         if (unlikely(fl4->flowi4_flags &
2262                                      FLOWI_FLAG_KNOWN_NH &&
2263                                      !(nh->nh_gw &&
2264                                        nh->nh_scope == RT_SCOPE_LINK))) {
2265                                 do_cache = false;
2266                                 goto add;
2267                         }
2268                         prth = raw_cpu_ptr(nh->nh_pcpu_rth_output);
2269                 }
2270                 rth = rcu_dereference(*prth);
2271                 if (rt_cache_valid(rth) && dst_hold_safe(&rth->dst))
2272                         return rth;
2273         }
2274
2275 add:
2276         rth = rt_dst_alloc(dev_out, flags, type,
2277                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
2278                            IN_DEV_CONF_GET(in_dev, NOXFRM),
2279                            do_cache);
2280         if (!rth)
2281                 return ERR_PTR(-ENOBUFS);
2282
2283         rth->rt_iif = orig_oif;
2284
2285         RT_CACHE_STAT_INC(out_slow_tot);
2286
2287         if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2288                 if (flags & RTCF_LOCAL &&
2289                     !(dev_out->flags & IFF_LOOPBACK)) {
2290                         rth->dst.output = ip_mc_output;
2291                         RT_CACHE_STAT_INC(out_slow_mc);
2292                 }
2293 #ifdef CONFIG_IP_MROUTE
2294                 if (type == RTN_MULTICAST) {
2295                         if (IN_DEV_MFORWARD(in_dev) &&
2296                             !ipv4_is_local_multicast(fl4->daddr)) {
2297                                 rth->dst.input = ip_mr_input;
2298                                 rth->dst.output = ip_mc_output;
2299                         }
2300                 }
2301 #endif
2302         }
2303
2304         rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0, do_cache);
2305         lwtunnel_set_redirect(&rth->dst);
2306
2307         return rth;
2308 }
2309
2310 /*
2311  * Major route resolver routine.
2312  */
2313
2314 struct rtable *ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
2315                                         const struct sk_buff *skb)
2316 {
2317         __u8 tos = RT_FL_TOS(fl4);
2318         struct fib_result res = {
2319                 .type           = RTN_UNSPEC,
2320                 .fi             = NULL,
2321                 .table          = NULL,
2322                 .tclassid       = 0,
2323         };
2324         struct rtable *rth;
2325
2326         fl4->flowi4_iif = LOOPBACK_IFINDEX;
2327         fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2328         fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2329                          RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
2330
2331         rcu_read_lock();
2332         rth = ip_route_output_key_hash_rcu(net, fl4, &res, skb);
2333         rcu_read_unlock();
2334
2335         return rth;
2336 }
2337 EXPORT_SYMBOL_GPL(ip_route_output_key_hash);
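/* A hedged caller-side sketch of an output lookup that lands in the
 * resolver above (via the ip_route_output_key() wrapper); the flowi4
 * values are illustrative and error handling is minimal:
 *
 *      struct flowi4 fl4 = {
 *              .daddr          = daddr,
 *              .saddr          = saddr,
 *              .flowi4_tos     = RT_TOS(tos),
 *              .flowi4_oif     = oif,
 *      };
 *      struct rtable *rt = ip_route_output_key(net, &fl4);
 *      if (IS_ERR(rt))
 *              return PTR_ERR(rt);
 *      ...use rt->dst...
 *      ip_rt_put(rt);
 */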
2338
2339 struct rtable *ip_route_output_key_hash_rcu(struct net *net, struct flowi4 *fl4,
2340                                             struct fib_result *res,
2341                                             const struct sk_buff *skb)
2342 {
2343         struct net_device *dev_out = NULL;
2344         int orig_oif = fl4->flowi4_oif;
2345         unsigned int flags = 0;
2346         struct rtable *rth;
2347         int err = -ENETUNREACH;
2348
2349         if (fl4->saddr) {
2350                 rth = ERR_PTR(-EINVAL);
2351                 if (ipv4_is_multicast(fl4->saddr) ||
2352                     ipv4_is_lbcast(fl4->saddr) ||
2353                     ipv4_is_zeronet(fl4->saddr))
2354                         goto out;
2355
2356                 /* I removed the check for oif == dev_out->oif here.
2357                    It was wrong for two reasons:
2358                    1. ip_dev_find(net, saddr) can return the wrong iface if
2359                       saddr is assigned to multiple interfaces.
2360                    2. Moreover, we are allowed to send packets with the saddr
2361                       of another iface. --ANK
2362                  */
2363
2364                 if (fl4->flowi4_oif == 0 &&
2365                     (ipv4_is_multicast(fl4->daddr) ||
2366                      ipv4_is_lbcast(fl4->daddr))) {
2367                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2368                         dev_out = __ip_dev_find(net, fl4->saddr, false);
2369                         if (!dev_out)
2370                                 goto out;
2371
2372                         /* Special hack: the user can direct multicasts
2373                            and limited broadcast via the necessary interface
2374                            without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2375                            This hack is not just for fun, it allows
2376                            vic, vat and friends to work.
2377                            They bind a socket to loopback, set ttl to zero
2378                            and expect that it will work.
2379                            From the viewpoint of the routing cache they are
2380                            broken: we are not allowed to build a multicast
2381                            path with a loopback source addr (the routing cache
2382                            cannot know that ttl is zero, so that the packet
2383                            will not leave this host and the route is valid).
2384                            Luckily, this hack is a good workaround.
2385                          */
2386
2387                         fl4->flowi4_oif = dev_out->ifindex;
2388                         goto make_route;
2389                 }
2390
2391                 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2392                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2393                         if (!__ip_dev_find(net, fl4->saddr, false))
2394                                 goto out;
2395                 }
2396         }
2397
2398
2399         if (fl4->flowi4_oif) {
2400                 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2401                 rth = ERR_PTR(-ENODEV);
2402                 if (!dev_out)
2403                         goto out;
2404
2405                 /* RACE: Check return value of inet_select_addr instead. */
2406                 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2407                         rth = ERR_PTR(-ENETUNREACH);
2408                         goto out;
2409                 }
2410                 if (ipv4_is_local_multicast(fl4->daddr) ||
2411                     ipv4_is_lbcast(fl4->daddr) ||
2412                     fl4->flowi4_proto == IPPROTO_IGMP) {
2413                         if (!fl4->saddr)
2414                                 fl4->saddr = inet_select_addr(dev_out, 0,
2415                                                               RT_SCOPE_LINK);
2416                         goto make_route;
2417                 }
2418                 if (!fl4->saddr) {
2419                         if (ipv4_is_multicast(fl4->daddr))
2420                                 fl4->saddr = inet_select_addr(dev_out, 0,
2421                                                               fl4->flowi4_scope);
2422                         else if (!fl4->daddr)
2423                                 fl4->saddr = inet_select_addr(dev_out, 0,
2424                                                               RT_SCOPE_HOST);
2425                 }
2426         }
2427
2428         if (!fl4->daddr) {
2429                 fl4->daddr = fl4->saddr;
2430                 if (!fl4->daddr)
2431                         fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2432                 dev_out = net->loopback_dev;
2433                 fl4->flowi4_oif = LOOPBACK_IFINDEX;
2434                 res->type = RTN_LOCAL;
2435                 flags |= RTCF_LOCAL;
2436                 goto make_route;
2437         }
2438
2439         err = fib_lookup(net, fl4, res, 0);
2440         if (err) {
2441                 res->fi = NULL;
2442                 res->table = NULL;
2443                 if (fl4->flowi4_oif &&
2444                     (ipv4_is_multicast(fl4->daddr) ||
2445                     !netif_index_is_l3_master(net, fl4->flowi4_oif))) {
2446                         /* Apparently, the routing tables are wrong. Assume
2447                            that the destination is on-link.
2448
2449                            WHY? DW.
2450                            Because we are allowed to send to an iface
2451                            even if it has NO routes and NO assigned
2452                            addresses. When oif is specified, the routing
2453                            tables are looked up with only one purpose:
2454                            to catch whether the destination is gatewayed
2455                            rather than direct. Moreover, if MSG_DONTROUTE is
2456                            set, we send the packet, ignoring both the routing
2457                            tables and the ifaddr state. --ANK
2458
2459
2460                            We could do this even when oif is unknown,
2461                            as IPv6 likely does, but we do not.
2462                          */
2463
2464                         if (fl4->saddr == 0)
2465                                 fl4->saddr = inet_select_addr(dev_out, 0,
2466                                                               RT_SCOPE_LINK);
2467                         res->type = RTN_UNICAST;
2468                         goto make_route;
2469                 }
2470                 rth = ERR_PTR(err);
2471                 goto out;
2472         }
2473
2474         if (res->type == RTN_LOCAL) {
2475                 if (!fl4->saddr) {
2476                         if (res->fi->fib_prefsrc)
2477                                 fl4->saddr = res->fi->fib_prefsrc;
2478                         else
2479                                 fl4->saddr = fl4->daddr;
2480                 }
2481
2482                 /* L3 master device is the loopback for that domain */
2483                 dev_out = l3mdev_master_dev_rcu(FIB_RES_DEV(*res)) ? :
2484                         net->loopback_dev;
2485
2486                 /* make sure orig_oif points to fib result device even
2487                  * though packet rx/tx happens over loopback or l3mdev
2488                  */
2489                 orig_oif = FIB_RES_OIF(*res);
2490
2491                 fl4->flowi4_oif = dev_out->ifindex;
2492                 flags |= RTCF_LOCAL;
2493                 goto make_route;
2494         }
2495
2496         fib_select_path(net, res, fl4, skb);
2497
2498         dev_out = FIB_RES_DEV(*res);
2499         fl4->flowi4_oif = dev_out->ifindex;
2500
2501
2502 make_route:
2503         rth = __mkroute_output(res, fl4, orig_oif, dev_out, flags);
2504
2505 out:
2506         return rth;
2507 }
2508
2509 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2510 {
2511         return NULL;
2512 }
2513
2514 static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2515 {
2516         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2517
2518         return mtu ? : dst->dev->mtu;
2519 }
2520
2521 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
2522                                           struct sk_buff *skb, u32 mtu)
2523 {
2524 }
2525
2526 static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
2527                                        struct sk_buff *skb)
2528 {
2529 }
2530
2531 static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2532                                           unsigned long old)
2533 {
2534         return NULL;
2535 }
2536
2537 static struct dst_ops ipv4_dst_blackhole_ops = {
2538         .family                 =       AF_INET,
2539         .check                  =       ipv4_blackhole_dst_check,
2540         .mtu                    =       ipv4_blackhole_mtu,
2541         .default_advmss         =       ipv4_default_advmss,
2542         .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
2543         .redirect               =       ipv4_rt_blackhole_redirect,
2544         .cow_metrics            =       ipv4_rt_blackhole_cow_metrics,
2545         .neigh_lookup           =       ipv4_neigh_lookup,
2546 };
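/* These ops deliberately render a dst inert: ->check always fails so the
 * entry is never revalidated, ->update_pmtu and ->redirect ignore ICMP
 * feedback, and ->cow_metrics refuses to allocate writable metrics. A
 * socket parked on such a route (see ipv4_blackhole_route() below, used
 * from the xfrm lookup path) silently discards everything it sends.
 */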
2547
2548 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2549 {
2550         struct rtable *ort = (struct rtable *) dst_orig;
2551         struct rtable *rt;
2552
2553         rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_DEAD, 0);
2554         if (rt) {
2555                 struct dst_entry *new = &rt->dst;
2556
2557                 new->__use = 1;
2558                 new->input = dst_discard;
2559                 new->output = dst_discard_out;
2560
2561                 new->dev = net->loopback_dev;
2562                 if (new->dev)
2563                         dev_hold(new->dev);
2564
2565                 rt->rt_is_input = ort->rt_is_input;
2566                 rt->rt_iif = ort->rt_iif;
2567                 rt->rt_pmtu = ort->rt_pmtu;
2568                 rt->rt_mtu_locked = ort->rt_mtu_locked;
2569
2570                 rt->rt_genid = rt_genid_ipv4(net);
2571                 rt->rt_flags = ort->rt_flags;
2572                 rt->rt_type = ort->rt_type;
2573                 rt->rt_gateway = ort->rt_gateway;
2574                 rt->rt_uses_gateway = ort->rt_uses_gateway;
2575
2576                 INIT_LIST_HEAD(&rt->rt_uncached);
2577         }
2578
2579         dst_release(dst_orig);
2580
2581         return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2582 }
2583
2584 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2585                                     const struct sock *sk)
2586 {
2587         struct rtable *rt = __ip_route_output_key(net, flp4);
2588
2589         if (IS_ERR(rt))
2590                 return rt;
2591
2592         if (flp4->flowi4_proto)
2593                 rt = (struct rtable *)xfrm_lookup_route(net, &rt->dst,
2594                                                         flowi4_to_flowi(flp4),
2595                                                         sk, 0);
2596
2597         return rt;
2598 }
2599 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2600
2601 /* called with rcu_read_lock held */
2602 static int rt_fill_info(struct net *net, __be32 dst, __be32 src,
2603                         struct rtable *rt, u32 table_id, struct flowi4 *fl4,
2604                         struct sk_buff *skb, u32 portid, u32 seq)
2605 {
2606         struct rtmsg *r;
2607         struct nlmsghdr *nlh;
2608         unsigned long expires = 0;
2609         u32 error;
2610         u32 metrics[RTAX_MAX];
2611
2612         nlh = nlmsg_put(skb, portid, seq, RTM_NEWROUTE, sizeof(*r), 0);
2613         if (!nlh)
2614                 return -EMSGSIZE;
2615
2616         r = nlmsg_data(nlh);
2617         r->rtm_family    = AF_INET;
2618         r->rtm_dst_len  = 32;
2619         r->rtm_src_len  = 0;
2620         r->rtm_tos      = fl4->flowi4_tos;
2621         r->rtm_table    = table_id < 256 ? table_id : RT_TABLE_COMPAT;
2622         if (nla_put_u32(skb, RTA_TABLE, table_id))
2623                 goto nla_put_failure;
2624         r->rtm_type     = rt->rt_type;
2625         r->rtm_scope    = RT_SCOPE_UNIVERSE;
2626         r->rtm_protocol = RTPROT_UNSPEC;
2627         r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2628         if (rt->rt_flags & RTCF_NOTIFY)
2629                 r->rtm_flags |= RTM_F_NOTIFY;
2630         if (IPCB(skb)->flags & IPSKB_DOREDIRECT)
2631                 r->rtm_flags |= RTCF_DOREDIRECT;
2632
2633         if (nla_put_in_addr(skb, RTA_DST, dst))
2634                 goto nla_put_failure;
2635         if (src) {
2636                 r->rtm_src_len = 32;
2637                 if (nla_put_in_addr(skb, RTA_SRC, src))
2638                         goto nla_put_failure;
2639         }
2640         if (rt->dst.dev &&
2641             nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2642                 goto nla_put_failure;
2643 #ifdef CONFIG_IP_ROUTE_CLASSID
2644         if (rt->dst.tclassid &&
2645             nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2646                 goto nla_put_failure;
2647 #endif
2648         if (!rt_is_input_route(rt) &&
2649             fl4->saddr != src) {
2650                 if (nla_put_in_addr(skb, RTA_PREFSRC, fl4->saddr))
2651                         goto nla_put_failure;
2652         }
2653         if (rt->rt_uses_gateway &&
2654             nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gateway))
2655                 goto nla_put_failure;
2656
2657         expires = rt->dst.expires;
2658         if (expires) {
2659                 unsigned long now = jiffies;
2660
2661                 if (time_before(now, expires))
2662                         expires -= now;
2663                 else
2664                         expires = 0;
2665         }
2666
2667         memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
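        /* Report the PMTU learned for this destination (and its lock bit)
         * only while the exception is still valid; once it expires, the
         * route falls back to the generic metric / device MTU.
         */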
2668         if (rt->rt_pmtu && expires)
2669                 metrics[RTAX_MTU - 1] = rt->rt_pmtu;
2670         if (rt->rt_mtu_locked && expires)
2671                 metrics[RTAX_LOCK - 1] |= BIT(RTAX_MTU);
2672         if (rtnetlink_put_metrics(skb, metrics) < 0)
2673                 goto nla_put_failure;
2674
2675         if (fl4->flowi4_mark &&
2676             nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
2677                 goto nla_put_failure;
2678
2679         if (!uid_eq(fl4->flowi4_uid, INVALID_UID) &&
2680             nla_put_u32(skb, RTA_UID,
2681                         from_kuid_munged(current_user_ns(), fl4->flowi4_uid)))
2682                 goto nla_put_failure;
2683
2684         error = rt->dst.error;
2685
2686         if (rt_is_input_route(rt)) {
2687 #ifdef CONFIG_IP_MROUTE
2688                 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2689                     IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2690                         int err = ipmr_get_route(net, skb,
2691                                                  fl4->saddr, fl4->daddr,
2692                                                  r, portid);
2693
2694                         if (err <= 0) {
2695                                 if (err == 0)
2696                                         return 0;
2697                                 goto nla_put_failure;
2698                         }
2699                 } else
2700 #endif
2701                         if (nla_put_u32(skb, RTA_IIF, fl4->flowi4_iif))
2702                                 goto nla_put_failure;
2703         }
2704
2705         if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
2706                 goto nla_put_failure;
2707
2708         nlmsg_end(skb, nlh);
2709         return 0;
2710
2711 nla_put_failure:
2712         nlmsg_cancel(skb, nlh);
2713         return -EMSGSIZE;
2714 }
2715
2716 static struct sk_buff *inet_rtm_getroute_build_skb(__be32 src, __be32 dst,
2717                                                    u8 ip_proto, __be16 sport,
2718                                                    __be16 dport)
2719 {
2720         struct sk_buff *skb;
2721         struct iphdr *iph;
2722
2723         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2724         if (!skb)
2725                 return NULL;
2726
2727         /* Reserve room for dummy headers; this skb can pass
2728          * through a good chunk of the routing engine.
2729          */
2730         skb_reset_mac_header(skb);
2731         skb_reset_network_header(skb);
2732         skb->protocol = htons(ETH_P_IP);
2733         iph = skb_put(skb, sizeof(struct iphdr));
2734         iph->protocol = ip_proto;
2735         iph->saddr = src;
2736         iph->daddr = dst;
2737         iph->version = 0x4;
2738         iph->frag_off = 0;
2739         iph->ihl = 0x5;
2740         skb_set_transport_header(skb, skb->len);
2741
2742         switch (iph->protocol) {
2743         case IPPROTO_UDP: {
2744                 struct udphdr *udph;
2745
2746                 udph = skb_put_zero(skb, sizeof(struct udphdr));
2747                 udph->source = sport;
2748                 udph->dest = dport;
2749                 udph->len = sizeof(struct udphdr);
2750                 udph->check = 0;
2751                 break;
2752         }
2753         case IPPROTO_TCP: {
2754                 struct tcphdr *tcph;
2755
2756                 tcph = skb_put_zero(skb, sizeof(struct tcphdr));
2757                 tcph->source    = sport;
2758                 tcph->dest      = dport;
2759                 tcph->doff      = sizeof(struct tcphdr) / 4;
2760                 tcph->rst = 1;
2761                 tcph->check = ~tcp_v4_check(sizeof(struct tcphdr),
2762                                             src, dst, 0);
2763                 break;
2764         }
2765         case IPPROTO_ICMP: {
2766                 struct icmphdr *icmph;
2767
2768                 icmph = skb_put_zero(skb, sizeof(struct icmphdr));
2769                 icmph->type = ICMP_ECHO;
2770                 icmph->code = 0;
2771         }
2772         }
2773
2774         return skb;
2775 }
2776
2777 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
2778                              struct netlink_ext_ack *extack)
2779 {
2780         struct net *net = sock_net(in_skb->sk);
2781         struct nlattr *tb[RTA_MAX+1];
2782         u32 table_id = RT_TABLE_MAIN;
2783         __be16 sport = 0, dport = 0;
2784         struct fib_result res = {};
2785         u8 ip_proto = IPPROTO_UDP;
2786         struct rtable *rt = NULL;
2787         struct sk_buff *skb;
2788         struct rtmsg *rtm;
2789         struct flowi4 fl4;
2790         __be32 dst = 0;
2791         __be32 src = 0;
2792         kuid_t uid;
2793         u32 iif;
2794         int err;
2795         int mark;
2796
2797         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy,
2798                           extack);
2799         if (err < 0)
2800                 return err;
2801
2802         rtm = nlmsg_data(nlh);
2803         src = tb[RTA_SRC] ? nla_get_in_addr(tb[RTA_SRC]) : 0;
2804         dst = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0;
2805         iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2806         mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
2807         if (tb[RTA_UID])
2808                 uid = make_kuid(current_user_ns(), nla_get_u32(tb[RTA_UID]));
2809         else
2810                 uid = (iif ? INVALID_UID : current_uid());
2811
2812         if (tb[RTA_IP_PROTO]) {
2813                 err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO],
2814                                                   &ip_proto, extack);
2815                 if (err)
2816                         return err;
2817         }
2818
2819         if (tb[RTA_SPORT])
2820                 sport = nla_get_be16(tb[RTA_SPORT]);
2821
2822         if (tb[RTA_DPORT])
2823                 dport = nla_get_be16(tb[RTA_DPORT]);
2824
2825         skb = inet_rtm_getroute_build_skb(src, dst, ip_proto, sport, dport);
2826         if (!skb)
2827                 return -ENOBUFS;
2828
2829         memset(&fl4, 0, sizeof(fl4));
2830         fl4.daddr = dst;
2831         fl4.saddr = src;
2832         fl4.flowi4_tos = rtm->rtm_tos;
2833         fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
2834         fl4.flowi4_mark = mark;
2835         fl4.flowi4_uid = uid;
2836         if (sport)
2837                 fl4.fl4_sport = sport;
2838         if (dport)
2839                 fl4.fl4_dport = dport;
2840         fl4.flowi4_proto = ip_proto;
2841
2842         rcu_read_lock();
2843
2844         if (iif) {
2845                 struct net_device *dev;
2846
2847                 dev = dev_get_by_index_rcu(net, iif);
2848                 if (!dev) {
2849                         err = -ENODEV;
2850                         goto errout_rcu;
2851                 }
2852
2853                 fl4.flowi4_iif = iif; /* for rt_fill_info */
2854                 skb->dev        = dev;
2855                 skb->mark       = mark;
2856                 err = ip_route_input_rcu(skb, dst, src, rtm->rtm_tos,
2857                                          dev, &res);
2858
2859                 rt = skb_rtable(skb);
2860                 if (err == 0 && rt->dst.error)
2861                         err = -rt->dst.error;
2862         } else {
2863                 fl4.flowi4_iif = LOOPBACK_IFINDEX;
2864                 rt = ip_route_output_key_hash_rcu(net, &fl4, &res, skb);
2865                 err = 0;
2866                 if (IS_ERR(rt))
2867                         err = PTR_ERR(rt);
2868                 else
2869                         skb_dst_set(skb, &rt->dst);
2870         }
2871
2872         if (err)
2873                 goto errout_rcu;
2874
2875         if (rtm->rtm_flags & RTM_F_NOTIFY)
2876                 rt->rt_flags |= RTCF_NOTIFY;
2877
2878         if (rtm->rtm_flags & RTM_F_LOOKUP_TABLE)
2879                 table_id = res.table ? res.table->tb_id : 0;
2880
2881         /* reset skb for netlink reply msg */
2882         skb_trim(skb, 0);
2883         skb_reset_network_header(skb);
2884         skb_reset_transport_header(skb);
2885         skb_reset_mac_header(skb);
2886
2887         if (rtm->rtm_flags & RTM_F_FIB_MATCH) {
2888                 if (!res.fi) {
2889                         err = fib_props[res.type].error;
2890                         if (!err)
2891                                 err = -EHOSTUNREACH;
2892                         goto errout_rcu;
2893                 }
2894                 err = fib_dump_info(skb, NETLINK_CB(in_skb).portid,
2895                                     nlh->nlmsg_seq, RTM_NEWROUTE, table_id,
2896                                     rt->rt_type, res.prefix, res.prefixlen,
2897                                     fl4.flowi4_tos, res.fi, 0);
2898         } else {
2899                 err = rt_fill_info(net, dst, src, rt, table_id, &fl4, skb,
2900                                    NETLINK_CB(in_skb).portid, nlh->nlmsg_seq);
2901         }
2902         if (err < 0)
2903                 goto errout_rcu;
2904
2905         rcu_read_unlock();
2906
2907         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
2908
2909 errout_free:
2910         return err;
2911 errout_rcu:
2912         rcu_read_unlock();
2913         kfree_skb(skb);
2914         goto errout_free;
2915 }
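/* Userspace reaches this handler with an RTM_GETROUTE request; an
 * illustrative invocation that exercises the attributes parsed above:
 *
 *   ip route get 192.0.2.1 from 198.51.100.7 oif eth0 mark 0x1
 *
 * which maps to RTA_DST, RTA_SRC, RTA_OIF and RTA_MARK respectively.
 */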
2916
2917 void ip_rt_multicast_event(struct in_device *in_dev)
2918 {
2919         rt_cache_flush(dev_net(in_dev->dev));
2920 }
2921
2922 #ifdef CONFIG_SYSCTL
2923 static int ip_rt_gc_interval __read_mostly  = 60 * HZ;
2924 static int ip_rt_gc_min_interval __read_mostly  = HZ / 2;
2925 static int ip_rt_gc_elasticity __read_mostly    = 8;
2926 static int ip_min_valid_pmtu __read_mostly      = IPV4_MIN_MTU;
2927
2928 static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write,
2929                                         void __user *buffer,
2930                                         size_t *lenp, loff_t *ppos)
2931 {
2932         struct net *net = (struct net *)__ctl->extra1;
2933
2934         if (write) {
2935                 rt_cache_flush(net);
2936                 fnhe_genid_bump(net);
2937                 return 0;
2938         }
2939
2940         return -EINVAL;
2941 }
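/* The handler above is write-only; any write flushes the route cache and
 * bumps the fnhe genid, invalidating cached exceptions. It is typically
 * exposed as (illustrative):
 *
 *   echo 1 > /proc/sys/net/ipv4/route/flush
 */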
2942
static struct ctl_table ipv4_route_table[] = {
	{
		.procname	= "gc_thresh",
		.data		= &ipv4_dst_ops.gc_thresh,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "max_size",
		.data		= &ip_rt_max_size,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		/* Deprecated. Use gc_min_interval_ms */
		.procname	= "gc_min_interval",
		.data		= &ip_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "gc_min_interval_ms",
		.data		= &ip_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_ms_jiffies,
	},
	{
		.procname	= "gc_timeout",
		.data		= &ip_rt_gc_timeout,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "gc_interval",
		.data		= &ip_rt_gc_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "redirect_load",
		.data		= &ip_rt_redirect_load,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "redirect_number",
		.data		= &ip_rt_redirect_number,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "redirect_silence",
		.data		= &ip_rt_redirect_silence,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "error_cost",
		.data		= &ip_rt_error_cost,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "error_burst",
		.data		= &ip_rt_error_burst,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "gc_elasticity",
		.data		= &ip_rt_gc_elasticity,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "mtu_expires",
		.data		= &ip_rt_mtu_expires,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
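	/* min_pmtu is clamped from below (extra1 = ip_min_valid_pmtu, i.e.
	 * IPV4_MIN_MTU = 68) so a learned path MTU can never be stored
	 * below the IPv4 minimum.
	 */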
	{
		.procname	= "min_pmtu",
		.data		= &ip_rt_min_pmtu,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= &ip_min_valid_pmtu,
	},
	{
		.procname	= "min_adv_mss",
		.data		= &ip_rt_min_advmss,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{ }
};

static struct ctl_table ipv4_route_flush_table[] = {
	{
		.procname	= "flush",
		.maxlen		= sizeof(int),
		.mode		= 0200,
		.proc_handler	= ipv4_sysctl_rtcache_flush,
	},
	{ }
};

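/* Register the per-netns "flush" sysctl.  Namespaces other than init_net
 * get their own copy of the table so extra1 can point at that netns; the
 * entry is hidden (procname = NULL) from unprivileged user namespaces.
 */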
static __net_init int sysctl_route_net_init(struct net *net)
{
	struct ctl_table *tbl;

	tbl = ipv4_route_flush_table;
	if (!net_eq(net, &init_net)) {
		tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
		if (!tbl)
			goto err_dup;

		/* Don't export sysctls to unprivileged users */
		if (net->user_ns != &init_user_ns)
			tbl[0].procname = NULL;
	}
	tbl[0].extra1 = net;

	net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
	if (!net->ipv4.route_hdr)
		goto err_reg;
	return 0;

err_reg:
	if (tbl != ipv4_route_flush_table)
		kfree(tbl);
err_dup:
	return -ENOMEM;
}

static __net_exit void sysctl_route_net_exit(struct net *net)
{
	struct ctl_table *tbl;

	tbl = net->ipv4.route_hdr->ctl_table_arg;
	unregister_net_sysctl_table(net->ipv4.route_hdr);
	BUG_ON(tbl == ipv4_route_flush_table);
	kfree(tbl);
}

static __net_initdata struct pernet_operations sysctl_route_ops = {
	.init = sysctl_route_net_init,
	.exit = sysctl_route_net_exit,
};
#endif

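/* Per-namespace generation counters: bumping rt_genid or fnhe_genid
 * invalidates cached routes and exception entries respectively.
 * dev_addr_genid is seeded from a random value.
 */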
static __net_init int rt_genid_init(struct net *net)
{
	atomic_set(&net->ipv4.rt_genid, 0);
	atomic_set(&net->fnhe_genid, 0);
	atomic_set(&net->ipv4.dev_addr_genid, get_random_int());
	return 0;
}

static __net_initdata struct pernet_operations rt_genid_ops = {
	.init = rt_genid_init,
};

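/* Each namespace gets its own inetpeer base, which tracks long-lived
 * per-remote-host state such as ICMP rate limiting.
 */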
static int __net_init ipv4_inetpeer_init(struct net *net)
{
	struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);

	if (!bp)
		return -ENOMEM;
	inet_peer_base_init(bp);
	net->ipv4.peers = bp;
	return 0;
}

static void __net_exit ipv4_inetpeer_exit(struct net *net)
{
	struct inet_peer_base *bp = net->ipv4.peers;

	net->ipv4.peers = NULL;
	inetpeer_invalidate_tree(bp);
	kfree(bp);
}

static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
	.init	=	ipv4_inetpeer_init,
	.exit	=	ipv4_inetpeer_exit,
};

#ifdef CONFIG_IP_ROUTE_CLASSID
struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
#endif /* CONFIG_IP_ROUTE_CLASSID */

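/* Boot-time initialization of the IPv4 routing layer.  Allocation
 * failures here are unrecoverable, hence the panic() calls.
 */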
int __init ip_rt_init(void)
{
	int cpu;

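	/* ip_idents is a shared array of counters used to generate IP
	 * identifiers; it is seeded with random bytes so the counters do
	 * not start from a predictable state.  ip_tstamps holds the
	 * matching last-use timestamps.
	 */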
	ip_idents = kmalloc_array(IP_IDENTS_SZ, sizeof(*ip_idents),
				  GFP_KERNEL);
	if (!ip_idents)
		panic("IP: failed to allocate ip_idents\n");

	prandom_bytes(ip_idents, IP_IDENTS_SZ * sizeof(*ip_idents));

	ip_tstamps = kcalloc(IP_IDENTS_SZ, sizeof(*ip_tstamps), GFP_KERNEL);
	if (!ip_tstamps)
		panic("IP: failed to allocate ip_tstamps\n");

	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);

		INIT_LIST_HEAD(&ul->head);
		spin_lock_init(&ul->lock);
	}
#ifdef CONFIG_IP_ROUTE_CLASSID
	ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
	if (!ip_rt_acct)
		panic("IP: failed to allocate ip_rt_acct\n");
#endif

	ipv4_dst_ops.kmem_cachep =
		kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);

	ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;

	if (dst_entries_init(&ipv4_dst_ops) < 0)
		panic("IP: failed to allocate ipv4_dst_ops counter\n");

	if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
		panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");

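	/* With the old routing cache gone, dst garbage collection is
	 * effectively disabled: the threshold and size limit are set to
	 * their maximum values.
	 */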
	ipv4_dst_ops.gc_thresh = ~0;
	ip_rt_max_size = INT_MAX;

	devinet_init();
	ip_fib_init();

	if (ip_rt_proc_init())
		pr_err("Unable to create route proc files\n");
#ifdef CONFIG_XFRM
	xfrm_init();
	xfrm4_init();
#endif
	rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL,
		      RTNL_FLAG_DOIT_UNLOCKED);

#ifdef CONFIG_SYSCTL
	register_pernet_subsys(&sysctl_route_ops);
#endif
	register_pernet_subsys(&rt_genid_ops);
	register_pernet_subsys(&ipv4_inetpeer_ops);
	return 0;
}

#ifdef CONFIG_SYSCTL
/*
 * We really need to sanitize the damn ipv4 init order, then all
 * this nonsense will go away.
 */
void __init ip_static_sysctl_init(void)
{
	register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
}
#endif