net/ipv4/route.c (Linux-libre 3.12.18-gnu)
/*
 * INET         An implementation of the TCP/IP protocol suite for the LINUX
 *              operating system.  INET is implemented using the  BSD Socket
 *              interface as the means of communication with the user level.
 *
 *              ROUTE - implementation of the IP router.
 *
 * Authors:     Ross Biro
 *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *              Alan Cox, <gw4pts@gw4pts.ampr.org>
 *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
 *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 * Fixes:
 *              Alan Cox        :       Verify area fixes.
 *              Alan Cox        :       cli() protects routing changes
 *              Rui Oliveira    :       ICMP routing table updates
 *              (rco@di.uminho.pt)      Routing table insertion and update
 *              Linus Torvalds  :       Rewrote bits to be sensible
 *              Alan Cox        :       Added BSD route gw semantics
 *              Alan Cox        :       Super /proc >4K
 *              Alan Cox        :       MTU in route table
 *              Alan Cox        :       MSS actually. Also added the window
 *                                      clamper.
 *              Sam Lantinga    :       Fixed route matching in rt_del()
 *              Alan Cox        :       Routing cache support.
 *              Alan Cox        :       Removed compatibility cruft.
 *              Alan Cox        :       RTF_REJECT support.
 *              Alan Cox        :       TCP irtt support.
 *              Jonathan Naylor :       Added Metric support.
 *      Miquel van Smoorenburg  :       BSD API fixes.
 *      Miquel van Smoorenburg  :       Metrics.
 *              Alan Cox        :       Use __u32 properly
 *              Alan Cox        :       Aligned routing errors more closely
 *                                      with BSD; our system is still very
 *                                      different.
 *              Alan Cox        :       Faster /proc handling
 *      Alexey Kuznetsov        :       Massive rework to support tree based
 *                                      routing, routing caches and better
 *                                      behaviour.
 *
 *              Olaf Erb        :       irtt wasn't being copied right.
 *              Bjorn Ekwall    :       Kerneld route support.
 *              Alan Cox        :       Multicast fixed (I hope)
 *              Pavel Krauz     :       Limited broadcast fixed
 *              Mike McLagan    :       Routing by source
 *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
 *                                      route.c and rewritten from scratch.
 *              Andi Kleen      :       Load-limit warning messages.
 *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
 *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
 *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
 *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
 *              Marc Boucher    :       routing by fwmark
 *      Robert Olsson           :       Added rt_cache statistics
 *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
 *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
 *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
 *      Ilia Sotnikov           :       Removed TOS from hash calculations
 *
 *              This program is free software; you can redistribute it and/or
 *              modify it under the terms of the GNU General Public License
 *              as published by the Free Software Foundation; either version
 *              2 of the License, or (at your option) any later version.
 */

#define pr_fmt(fmt) "IPv4: " fmt

#include <linux/module.h>
#include <asm/uaccess.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/proc_fs.h>
#include <linux/init.h>
#include <linux/skbuff.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/pkt_sched.h>
#include <linux/mroute.h>
#include <linux/netfilter_ipv4.h>
#include <linux/random.h>
#include <linux/rcupdate.h>
#include <linux/times.h>
#include <linux/slab.h>
#include <net/dst.h>
#include <net/net_namespace.h>
#include <net/protocol.h>
#include <net/ip.h>
#include <net/route.h>
#include <net/inetpeer.h>
#include <net/sock.h>
#include <net/ip_fib.h>
#include <net/arp.h>
#include <net/tcp.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/netevent.h>
#include <net/rtnetlink.h>
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#include <linux/kmemleak.h>
#endif
#include <net/secure_seq.h>

#define RT_FL_TOS(oldflp4) \
        ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))

/* The IPv4 datagram length is stored in a 16-bit field (tot_len). */
#define IP_MAX_MTU      0xFFFF

#define RT_GC_TIMEOUT (300*HZ)

static int ip_rt_max_size;
static int ip_rt_redirect_number __read_mostly  = 9;
static int ip_rt_redirect_load __read_mostly    = HZ / 50;
static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
static int ip_rt_error_cost __read_mostly       = HZ;
static int ip_rt_error_burst __read_mostly      = 5 * HZ;
static int ip_rt_mtu_expires __read_mostly      = 10 * 60 * HZ;
static int ip_rt_min_pmtu __read_mostly         = 512 + 20 + 20;
static int ip_rt_min_advmss __read_mostly       = 256;

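/* A worked reading of the defaults above, assuming the common HZ = 1000:
 * the redirect load window is 20 ms, the redirect silence interval is
 * (HZ / 50) << 10 = 20480 jiffies (about 20.5 s), learned PMTU entries
 * expire after 10 minutes, and the PMTU floor is 552 bytes
 * (512 of payload plus 20 of IP and 20 of TCP header).  In mainline these
 * are runtime-tunable under /proc/sys/net/ipv4/route/, via the sysctl
 * table registered elsewhere in this file.
 */
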
/*
 *      Interface to generic destination cache.
 */

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int      ipv4_default_advmss(const struct dst_entry *dst);
static unsigned int      ipv4_mtu(const struct dst_entry *dst);
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
static void              ipv4_link_failure(struct sk_buff *skb);
static void              ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
                                           struct sk_buff *skb, u32 mtu);
static void              ip_do_redirect(struct dst_entry *dst, struct sock *sk,
                                        struct sk_buff *skb);
static void              ipv4_dst_destroy(struct dst_entry *dst);

static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
                            int how)
{
}

static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
{
        WARN_ON(1);
        return NULL;
}

static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
                                           struct sk_buff *skb,
                                           const void *daddr);

static struct dst_ops ipv4_dst_ops = {
        .family =               AF_INET,
        .protocol =             cpu_to_be16(ETH_P_IP),
        .check =                ipv4_dst_check,
        .default_advmss =       ipv4_default_advmss,
        .mtu =                  ipv4_mtu,
        .cow_metrics =          ipv4_cow_metrics,
        .destroy =              ipv4_dst_destroy,
        .ifdown =               ipv4_dst_ifdown,
        .negative_advice =      ipv4_negative_advice,
        .link_failure =         ipv4_link_failure,
        .update_pmtu =          ip_rt_update_pmtu,
        .redirect =             ip_do_redirect,
        .local_out =            __ip_local_out,
        .neigh_lookup =         ipv4_neigh_lookup,
};

#define ECN_OR_COST(class)      TC_PRIO_##class

const __u8 ip_tos2prio[16] = {
        TC_PRIO_BESTEFFORT,
        ECN_OR_COST(BESTEFFORT),
        TC_PRIO_BESTEFFORT,
        ECN_OR_COST(BESTEFFORT),
        TC_PRIO_BULK,
        ECN_OR_COST(BULK),
        TC_PRIO_BULK,
        ECN_OR_COST(BULK),
        TC_PRIO_INTERACTIVE,
        ECN_OR_COST(INTERACTIVE),
        TC_PRIO_INTERACTIVE,
        ECN_OR_COST(INTERACTIVE),
        TC_PRIO_INTERACTIVE_BULK,
        ECN_OR_COST(INTERACTIVE_BULK),
        TC_PRIO_INTERACTIVE_BULK,
        ECN_OR_COST(INTERACTIVE_BULK)
};
EXPORT_SYMBOL(ip_tos2prio);
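
/* The table is indexed by the four TOS bits shifted right by one; callers
 * such as rt_tos2priority() in include/net/route.h do
 * ip_tos2prio[IPTOS_TOS(tos) >> 1].  Odd entries (the ECN_OR_COST() ones)
 * keep the same priority as the preceding even entry, since ECN_OR_COST()
 * simply expands to TC_PRIO_##class here.
 */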

static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
#define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)

#ifdef CONFIG_PROC_FS
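/* The IPv4 routing cache itself was removed (in Linux 3.6), so
 * /proc/net/rt_cache is kept only for compatibility: the iterator below
 * yields no entries and the file now contains just the header line.
 */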
static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
{
        if (*pos)
                return NULL;
        return SEQ_START_TOKEN;
}

static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
        ++*pos;
        return NULL;
}

static void rt_cache_seq_stop(struct seq_file *seq, void *v)
{
}

static int rt_cache_seq_show(struct seq_file *seq, void *v)
{
        if (v == SEQ_START_TOKEN)
                seq_printf(seq, "%-127s\n",
                           "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
                           "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
                           "HHUptod\tSpecDst");
        return 0;
}

static const struct seq_operations rt_cache_seq_ops = {
        .start  = rt_cache_seq_start,
        .next   = rt_cache_seq_next,
        .stop   = rt_cache_seq_stop,
        .show   = rt_cache_seq_show,
};

static int rt_cache_seq_open(struct inode *inode, struct file *file)
{
        return seq_open(file, &rt_cache_seq_ops);
}

static const struct file_operations rt_cache_seq_fops = {
        .owner   = THIS_MODULE,
        .open    = rt_cache_seq_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = seq_release,
};

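/* /proc/net/stat/rt_cache iterator: position 0 is the header token, and
 * position N (N >= 1) maps to the first possible CPU with id >= N - 1.
 * *pos is rewritten to cpu + 1 so that ->next resumes at the following CPU.
 */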
static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
{
        int cpu;

        if (*pos == 0)
                return SEQ_START_TOKEN;

        for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
                if (!cpu_possible(cpu))
                        continue;
                *pos = cpu+1;
                return &per_cpu(rt_cache_stat, cpu);
        }
        return NULL;
}

static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
        int cpu;

        for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
                if (!cpu_possible(cpu))
                        continue;
                *pos = cpu+1;
                return &per_cpu(rt_cache_stat, cpu);
        }
        return NULL;
}

static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{
}

static int rt_cpu_seq_show(struct seq_file *seq, void *v)
{
        struct rt_cache_stat *st = v;

        if (v == SEQ_START_TOKEN) {
                seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
                return 0;
        }

        seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
                   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
                   dst_entries_get_slow(&ipv4_dst_ops),
                   st->in_hit,
                   st->in_slow_tot,
                   st->in_slow_mc,
                   st->in_no_route,
                   st->in_brd,
                   st->in_martian_dst,
                   st->in_martian_src,

                   st->out_hit,
                   st->out_slow_tot,
                   st->out_slow_mc,

                   st->gc_total,
                   st->gc_ignored,
                   st->gc_goal_miss,
                   st->gc_dst_overflow,
                   st->in_hlist_search,
                   st->out_hlist_search
                );
        return 0;
}

static const struct seq_operations rt_cpu_seq_ops = {
        .start  = rt_cpu_seq_start,
        .next   = rt_cpu_seq_next,
        .stop   = rt_cpu_seq_stop,
        .show   = rt_cpu_seq_show,
};

static int rt_cpu_seq_open(struct inode *inode, struct file *file)
{
        return seq_open(file, &rt_cpu_seq_ops);
}

static const struct file_operations rt_cpu_seq_fops = {
        .owner   = THIS_MODULE,
        .open    = rt_cpu_seq_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = seq_release,
};

#ifdef CONFIG_IP_ROUTE_CLASSID
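/* /proc/net/rt_acct: fold the per-CPU ip_rt_acct counters (one entry per
 * routing realm, 256 in total) into a single snapshot and dump it as raw
 * binary; readers are expected to interpret the struct layout themselves.
 */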
static int rt_acct_proc_show(struct seq_file *m, void *v)
{
        struct ip_rt_acct *dst, *src;
        unsigned int i, j;

        dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
        if (!dst)
                return -ENOMEM;

        for_each_possible_cpu(i) {
                src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
                for (j = 0; j < 256; j++) {
                        dst[j].o_bytes   += src[j].o_bytes;
                        dst[j].o_packets += src[j].o_packets;
                        dst[j].i_bytes   += src[j].i_bytes;
                        dst[j].i_packets += src[j].i_packets;
                }
        }

        seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
        kfree(dst);
        return 0;
}

static int rt_acct_proc_open(struct inode *inode, struct file *file)
{
        return single_open(file, rt_acct_proc_show, NULL);
}

static const struct file_operations rt_acct_proc_fops = {
        .owner          = THIS_MODULE,
        .open           = rt_acct_proc_open,
        .read           = seq_read,
        .llseek         = seq_lseek,
        .release        = single_release,
};
#endif

static int __net_init ip_rt_do_proc_init(struct net *net)
{
        struct proc_dir_entry *pde;

        pde = proc_create("rt_cache", S_IRUGO, net->proc_net,
                          &rt_cache_seq_fops);
        if (!pde)
                goto err1;

        pde = proc_create("rt_cache", S_IRUGO,
                          net->proc_net_stat, &rt_cpu_seq_fops);
        if (!pde)
                goto err2;

#ifdef CONFIG_IP_ROUTE_CLASSID
        pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
        if (!pde)
                goto err3;
#endif
        return 0;

#ifdef CONFIG_IP_ROUTE_CLASSID
err3:
        remove_proc_entry("rt_cache", net->proc_net_stat);
#endif
err2:
        remove_proc_entry("rt_cache", net->proc_net);
err1:
        return -ENOMEM;
}

static void __net_exit ip_rt_do_proc_exit(struct net *net)
{
        remove_proc_entry("rt_cache", net->proc_net_stat);
        remove_proc_entry("rt_cache", net->proc_net);
#ifdef CONFIG_IP_ROUTE_CLASSID
        remove_proc_entry("rt_acct", net->proc_net);
#endif
}

static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
        .init = ip_rt_do_proc_init,
        .exit = ip_rt_do_proc_exit,
};

static int __init ip_rt_proc_init(void)
{
        return register_pernet_subsys(&ip_rt_proc_ops);
}

#else
static inline int ip_rt_proc_init(void)
{
        return 0;
}
#endif /* CONFIG_PROC_FS */

static inline bool rt_is_expired(const struct rtable *rth)
{
        return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev));
}

void rt_cache_flush(struct net *net)
{
        rt_genid_bump_ipv4(net);
}

static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
                                           struct sk_buff *skb,
                                           const void *daddr)
{
        struct net_device *dev = dst->dev;
        const __be32 *pkey = daddr;
        const struct rtable *rt;
        struct neighbour *n;

        rt = (const struct rtable *) dst;
        if (rt->rt_gateway)
                pkey = (const __be32 *) &rt->rt_gateway;
        else if (skb)
                pkey = &ip_hdr(skb)->daddr;

        n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
        if (n)
                return n;
        return neigh_create(&arp_tbl, pkey, dev);
}

/*
 * Peer allocation may fail only in serious out-of-memory conditions.  However
 * we can still generate some output.
 * Random ID selection looks a bit dangerous because we have no chance of
 * selecting an ID that is unique within a reasonable period of time.
 * But a broken packet identifier may be better than no packet at all.
 */
static void ip_select_fb_ident(struct iphdr *iph)
{
        static DEFINE_SPINLOCK(ip_fb_id_lock);
        static u32 ip_fallback_id;
        u32 salt;

        spin_lock_bh(&ip_fb_id_lock);
        salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
        iph->id = htons(salt & 0xFFFF);
        ip_fallback_id = salt;
        spin_unlock_bh(&ip_fb_id_lock);
}

void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
{
        struct net *net = dev_net(dst->dev);
        struct inet_peer *peer;

        peer = inet_getpeer_v4(net->ipv4.peers, iph->daddr, 1);
        if (peer) {
                iph->id = htons(inet_getid(peer, more));
                inet_putpeer(peer);
                return;
        }

        ip_select_fb_ident(iph);
}
EXPORT_SYMBOL(__ip_select_ident);

static void __build_flow_key(struct flowi4 *fl4, const struct sock *sk,
                             const struct iphdr *iph,
                             int oif, u8 tos,
                             u8 prot, u32 mark, int flow_flags)
{
        if (sk) {
                const struct inet_sock *inet = inet_sk(sk);

                oif = sk->sk_bound_dev_if;
                mark = sk->sk_mark;
                tos = RT_CONN_FLAGS(sk);
                prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
        }
        flowi4_init_output(fl4, oif, mark, tos,
                           RT_SCOPE_UNIVERSE, prot,
                           flow_flags,
                           iph->daddr, iph->saddr, 0, 0);
}

static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
                               const struct sock *sk)
{
        const struct iphdr *iph = ip_hdr(skb);
        int oif = skb->dev->ifindex;
        u8 tos = RT_TOS(iph->tos);
        u8 prot = iph->protocol;
        u32 mark = skb->mark;

        __build_flow_key(fl4, sk, iph, oif, tos, prot, mark, 0);
}

static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
{
        const struct inet_sock *inet = inet_sk(sk);
        const struct ip_options_rcu *inet_opt;
        __be32 daddr = inet->inet_daddr;

        rcu_read_lock();
        inet_opt = rcu_dereference(inet->inet_opt);
        if (inet_opt && inet_opt->opt.srr)
                daddr = inet_opt->opt.faddr;
        flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
                           RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
                           inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
                           inet_sk_flowi_flags(sk),
                           daddr, inet->inet_saddr, 0, 0);
        rcu_read_unlock();
}

static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
                                 const struct sk_buff *skb)
{
        if (skb)
                build_skb_flow_key(fl4, skb, sk);
        else
                build_sk_flow_key(fl4, sk);
}

static inline void rt_free(struct rtable *rt)
{
        call_rcu(&rt->dst.rcu_head, dst_rcu_free);
}

static DEFINE_SPINLOCK(fnhe_lock);

static void fnhe_flush_routes(struct fib_nh_exception *fnhe)
{
        struct rtable *rt;

        rt = rcu_dereference(fnhe->fnhe_rth_input);
        if (rt) {
                RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL);
                rt_free(rt);
        }
        rt = rcu_dereference(fnhe->fnhe_rth_output);
        if (rt) {
                RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL);
                rt_free(rt);
        }
}

static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
{
        struct fib_nh_exception *fnhe, *oldest;

        oldest = rcu_dereference(hash->chain);
        for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
             fnhe = rcu_dereference(fnhe->fnhe_next)) {
                if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
                        oldest = fnhe;
        }
        fnhe_flush_routes(oldest);
        return oldest;
}

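/* Fold the destination address down to the FNHE hash width.  The shifts
 * assume FNHE_HASH_SIZE is a power of two (2048, i.e. 11 bits wide, at the
 * time of writing), so the final mask selects a valid bucket index.
 */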
static inline u32 fnhe_hashfun(__be32 daddr)
{
        u32 hval;

        hval = (__force u32) daddr;
        hval ^= (hval >> 11) ^ (hval >> 22);

        return hval & (FNHE_HASH_SIZE - 1);
}

static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe)
{
        rt->rt_pmtu = fnhe->fnhe_pmtu;
        rt->dst.expires = fnhe->fnhe_expires;

        if (fnhe->fnhe_gw) {
                rt->rt_flags |= RTCF_REDIRECTED;
                rt->rt_gateway = fnhe->fnhe_gw;
                rt->rt_uses_gateway = 1;
        }
}

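/* Record (or refresh) a per-nexthop exception learned from an ICMP redirect
 * or fragmentation-needed message.  The hash table is allocated lazily on
 * first use, and chains longer than FNHE_RECLAIM_DEPTH are kept bounded by
 * recycling the stalest entry instead of growing without limit.
 */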
static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
                                  u32 pmtu, unsigned long expires)
{
        struct fnhe_hash_bucket *hash;
        struct fib_nh_exception *fnhe;
        struct rtable *rt;
        unsigned int i;
        int depth;
        u32 hval = fnhe_hashfun(daddr);

        spin_lock_bh(&fnhe_lock);

        hash = nh->nh_exceptions;
        if (!hash) {
                hash = kzalloc(FNHE_HASH_SIZE * sizeof(*hash), GFP_ATOMIC);
                if (!hash)
                        goto out_unlock;
                nh->nh_exceptions = hash;
        }

        hash += hval;

        depth = 0;
        for (fnhe = rcu_dereference(hash->chain); fnhe;
             fnhe = rcu_dereference(fnhe->fnhe_next)) {
                if (fnhe->fnhe_daddr == daddr)
                        break;
                depth++;
        }

        if (fnhe) {
                if (gw)
                        fnhe->fnhe_gw = gw;
                if (pmtu) {
                        fnhe->fnhe_pmtu = pmtu;
                        fnhe->fnhe_expires = max(1UL, expires);
                }
                /* Update all cached dsts too */
                rt = rcu_dereference(fnhe->fnhe_rth_input);
                if (rt)
                        fill_route_from_fnhe(rt, fnhe);
                rt = rcu_dereference(fnhe->fnhe_rth_output);
                if (rt)
                        fill_route_from_fnhe(rt, fnhe);
        } else {
                if (depth > FNHE_RECLAIM_DEPTH)
                        fnhe = fnhe_oldest(hash);
                else {
                        fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
                        if (!fnhe)
                                goto out_unlock;

                        fnhe->fnhe_next = hash->chain;
                        rcu_assign_pointer(hash->chain, fnhe);
                }
                fnhe->fnhe_genid = fnhe_genid(dev_net(nh->nh_dev));
                fnhe->fnhe_daddr = daddr;
                fnhe->fnhe_gw = gw;
                fnhe->fnhe_pmtu = pmtu;
                fnhe->fnhe_expires = expires;

                /* Exception created; mark the cached routes for the nexthop
                 * as stale, so that anyone caching them rechecks whether
                 * this exception applies.
                 */
                rt = rcu_dereference(nh->nh_rth_input);
                if (rt)
                        rt->dst.obsolete = DST_OBSOLETE_KILL;

                for_each_possible_cpu(i) {
                        struct rtable __rcu **prt;

                        prt = per_cpu_ptr(nh->nh_pcpu_rth_output, i);
                        rt = rcu_dereference(*prt);
                        if (rt)
                                rt->dst.obsolete = DST_OBSOLETE_KILL;
                }
        }

        fnhe->fnhe_stamp = jiffies;

out_unlock:
        spin_unlock_bh(&fnhe_lock);
}

static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
                             bool kill_route)
{
        __be32 new_gw = icmp_hdr(skb)->un.gateway;
        __be32 old_gw = ip_hdr(skb)->saddr;
        struct net_device *dev = skb->dev;
        struct in_device *in_dev;
        struct fib_result res;
        struct neighbour *n;
        struct net *net;

        switch (icmp_hdr(skb)->code & 7) {
        case ICMP_REDIR_NET:
        case ICMP_REDIR_NETTOS:
        case ICMP_REDIR_HOST:
        case ICMP_REDIR_HOSTTOS:
                break;

        default:
                return;
        }

        if (rt->rt_gateway != old_gw)
                return;

        in_dev = __in_dev_get_rcu(dev);
        if (!in_dev)
                return;

        net = dev_net(dev);
        if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
            ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
            ipv4_is_zeronet(new_gw))
                goto reject_redirect;

        if (!IN_DEV_SHARED_MEDIA(in_dev)) {
                if (!inet_addr_onlink(in_dev, new_gw, old_gw))
                        goto reject_redirect;
                if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
                        goto reject_redirect;
        } else {
                if (inet_addr_type(net, new_gw) != RTN_UNICAST)
                        goto reject_redirect;
        }

        n = ipv4_neigh_lookup(&rt->dst, NULL, &new_gw);
        if (n) {
                if (!(n->nud_state & NUD_VALID)) {
                        neigh_event_send(n, NULL);
                } else {
                        if (fib_lookup(net, fl4, &res) == 0) {
                                struct fib_nh *nh = &FIB_RES_NH(res);

                                update_or_create_fnhe(nh, fl4->daddr, new_gw,
                                                      0, 0);
                        }
                        if (kill_route)
                                rt->dst.obsolete = DST_OBSOLETE_KILL;
                        call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
                }
                neigh_release(n);
        }
        return;

reject_redirect:
#ifdef CONFIG_IP_ROUTE_VERBOSE
        if (IN_DEV_LOG_MARTIANS(in_dev)) {
                const struct iphdr *iph = (const struct iphdr *) skb->data;
                __be32 daddr = iph->daddr;
                __be32 saddr = iph->saddr;

                net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
                                     "  Advised path = %pI4 -> %pI4\n",
                                     &old_gw, dev->name, &new_gw,
                                     &saddr, &daddr);
        }
#endif
        ;
}

static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
{
        struct rtable *rt;
        struct flowi4 fl4;
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        int oif = skb->dev->ifindex;
        u8 tos = RT_TOS(iph->tos);
        u8 prot = iph->protocol;
        u32 mark = skb->mark;

        rt = (struct rtable *) dst;

        __build_flow_key(&fl4, sk, iph, oif, tos, prot, mark, 0);
        __ip_do_redirect(rt, skb, &fl4, true);
}

static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
{
        struct rtable *rt = (struct rtable *)dst;
        struct dst_entry *ret = dst;

        if (rt) {
                if (dst->obsolete > 0) {
                        ip_rt_put(rt);
                        ret = NULL;
                } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
                           rt->dst.expires) {
                        ip_rt_put(rt);
                        ret = NULL;
                }
        }
        return ret;
}

/*
 * Algorithm:
 *      1. The first ip_rt_redirect_number redirects are sent with
 *         exponential backoff, then we stop sending them at all,
 *         assuming that the host ignores our redirects.
 *      2. If we did not see packets requiring redirects during
 *         ip_rt_redirect_silence, we assume that the host forgot the
 *         redirected route and start to send redirects again.
 *
 * This algorithm is much cheaper and more intelligent than dumb load limiting
 * in icmp.c.
 *
 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
 * and "frag. need" (breaks PMTU discovery) in icmp.c.
 */

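/* A worked example with the defaults above, assuming HZ = 1000: redirect
 * n + 1 is sent only after (HZ / 50) << n jiffies have passed since
 * redirect n, so the required gap doubles from 40 ms up to about 5.1 s
 * before the ninth and final one.  After ip_rt_redirect_number redirects
 * we stay quiet until the peer triggers none for ip_rt_redirect_silence
 * (about 20.5 s), at which point rate_tokens resets below.
 */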
void ip_rt_send_redirect(struct sk_buff *skb)
{
        struct rtable *rt = skb_rtable(skb);
        struct in_device *in_dev;
        struct inet_peer *peer;
        struct net *net;
        int log_martians;

        rcu_read_lock();
        in_dev = __in_dev_get_rcu(rt->dst.dev);
        if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
                rcu_read_unlock();
                return;
        }
        log_martians = IN_DEV_LOG_MARTIANS(in_dev);
        rcu_read_unlock();

        net = dev_net(rt->dst.dev);
        peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);
        if (!peer) {
                icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
                          rt_nexthop(rt, ip_hdr(skb)->daddr));
                return;
        }

        /* No redirected packets during ip_rt_redirect_silence;
         * reset the algorithm.
         */
        if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
                peer->rate_tokens = 0;

        /* Too many ignored redirects; do not send anything.
         * Set peer->rate_last to the last seen redirected packet.
         */
        if (peer->rate_tokens >= ip_rt_redirect_number) {
                peer->rate_last = jiffies;
                goto out_put_peer;
        }

        /* Check for load limit; set rate_last to the latest sent
         * redirect.
         */
        if (peer->rate_tokens == 0 ||
            time_after(jiffies,
                       (peer->rate_last +
                        (ip_rt_redirect_load << peer->rate_tokens)))) {
                __be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);

                icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
                peer->rate_last = jiffies;
                ++peer->rate_tokens;
#ifdef CONFIG_IP_ROUTE_VERBOSE
                if (log_martians &&
                    peer->rate_tokens == ip_rt_redirect_number)
                        net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
                                             &ip_hdr(skb)->saddr, inet_iif(skb),
                                             &ip_hdr(skb)->daddr, &gw);
#endif
        }
out_put_peer:
        inet_putpeer(peer);
}

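/* ICMP error generation is limited by a classic token bucket kept in the
 * inet_peer: one token accrues per jiffy up to ip_rt_error_burst (5 * HZ),
 * and each ICMP sent costs ip_rt_error_cost (HZ).  That allows a burst of
 * about five errors per source, then roughly one per second steady state.
 */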
static int ip_error(struct sk_buff *skb)
{
        struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
        struct rtable *rt = skb_rtable(skb);
        struct inet_peer *peer;
        unsigned long now;
        struct net *net;
        bool send;
        int code;

        net = dev_net(rt->dst.dev);
        if (!IN_DEV_FORWARD(in_dev)) {
                switch (rt->dst.error) {
                case EHOSTUNREACH:
                        IP_INC_STATS_BH(net, IPSTATS_MIB_INADDRERRORS);
                        break;

                case ENETUNREACH:
                        IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
                        break;
                }
                goto out;
        }

        switch (rt->dst.error) {
        case EINVAL:
        default:
                goto out;
        case EHOSTUNREACH:
                code = ICMP_HOST_UNREACH;
                break;
        case ENETUNREACH:
                code = ICMP_NET_UNREACH;
                IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
                break;
        case EACCES:
                code = ICMP_PKT_FILTERED;
                break;
        }

        peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);

        send = true;
        if (peer) {
                now = jiffies;
                peer->rate_tokens += now - peer->rate_last;
                if (peer->rate_tokens > ip_rt_error_burst)
                        peer->rate_tokens = ip_rt_error_burst;
                peer->rate_last = now;
                if (peer->rate_tokens >= ip_rt_error_cost)
                        peer->rate_tokens -= ip_rt_error_cost;
                else
                        send = false;
                inet_putpeer(peer);
        }
        if (send)
                icmp_send(skb, ICMP_DEST_UNREACH, code, 0);

out:    kfree_skb(skb);
        return 0;
}

static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
{
        struct dst_entry *dst = &rt->dst;
        struct fib_result res;

        if (dst_metric_locked(dst, RTAX_MTU))
                return;

        if (dst->dev->mtu < mtu)
                return;

        if (mtu < ip_rt_min_pmtu)
                mtu = ip_rt_min_pmtu;

        if (rt->rt_pmtu == mtu &&
            time_before(jiffies, dst->expires - ip_rt_mtu_expires / 2))
                return;

        rcu_read_lock();
        if (fib_lookup(dev_net(dst->dev), fl4, &res) == 0) {
                struct fib_nh *nh = &FIB_RES_NH(res);

                update_or_create_fnhe(nh, fl4->daddr, 0, mtu,
                                      jiffies + ip_rt_mtu_expires);
        }
        rcu_read_unlock();
}

static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
                              struct sk_buff *skb, u32 mtu)
{
        struct rtable *rt = (struct rtable *) dst;
        struct flowi4 fl4;

        ip_rt_build_flow_key(&fl4, sk, skb);
        __ip_rt_update_pmtu(rt, &fl4, mtu);
}

void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
                      int oif, u32 mark, u8 protocol, int flow_flags)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;

        __build_flow_key(&fl4, NULL, iph, oif,
                         RT_TOS(iph->tos), protocol, mark, flow_flags);
        rt = __ip_route_output_key(net, &fl4);
        if (!IS_ERR(rt)) {
                __ip_rt_update_pmtu(rt, &fl4, mtu);
                ip_rt_put(rt);
        }
}
EXPORT_SYMBOL_GPL(ipv4_update_pmtu);

static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;

        __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
        rt = __ip_route_output_key(sock_net(sk), &fl4);
        if (!IS_ERR(rt)) {
                __ip_rt_update_pmtu(rt, &fl4, mtu);
                ip_rt_put(rt);
        }
}

void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;
        struct dst_entry *dst;
        bool new = false;

        bh_lock_sock(sk);
        rt = (struct rtable *) __sk_dst_get(sk);

        if (sock_owned_by_user(sk) || !rt) {
                __ipv4_sk_update_pmtu(skb, sk, mtu);
                goto out;
        }

        __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);

        if (!__sk_dst_check(sk, 0)) {
                rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
                if (IS_ERR(rt))
                        goto out;

                new = true;
        }

        __ip_rt_update_pmtu((struct rtable *) rt->dst.path, &fl4, mtu);

        dst = dst_check(&rt->dst, 0);
        if (!dst) {
                if (new)
                        dst_release(&rt->dst);

                rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
                if (IS_ERR(rt))
                        goto out;

                new = true;
        }

        if (new)
                __sk_dst_set(sk, &rt->dst);

out:
        bh_unlock_sock(sk);
}
EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);

void ipv4_redirect(struct sk_buff *skb, struct net *net,
                   int oif, u32 mark, u8 protocol, int flow_flags)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;

        __build_flow_key(&fl4, NULL, iph, oif,
                         RT_TOS(iph->tos), protocol, mark, flow_flags);
        rt = __ip_route_output_key(net, &fl4);
        if (!IS_ERR(rt)) {
                __ip_do_redirect(rt, skb, &fl4, false);
                ip_rt_put(rt);
        }
}
EXPORT_SYMBOL_GPL(ipv4_redirect);

void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;

        __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
        rt = __ip_route_output_key(sock_net(sk), &fl4);
        if (!IS_ERR(rt)) {
                __ip_do_redirect(rt, skb, &fl4, false);
                ip_rt_put(rt);
        }
}
EXPORT_SYMBOL_GPL(ipv4_sk_redirect);

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
{
        struct rtable *rt = (struct rtable *) dst;

        /* All IPV4 dsts are created with ->obsolete set to the value
         * DST_OBSOLETE_FORCE_CHK which forces validation calls down
         * into this function always.
         *
         * When a PMTU/redirect information update invalidates a route,
         * this is indicated by setting obsolete to DST_OBSOLETE_KILL or
         * DST_OBSOLETE_DEAD by dst_free().
         */
        if (dst->obsolete != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt))
                return NULL;
        return dst;
}

static void ipv4_link_failure(struct sk_buff *skb)
{
        struct rtable *rt;

        icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);

        rt = skb_rtable(skb);
        if (rt)
                dst_set_expires(&rt->dst, 0);
}

static int ip_rt_bug(struct sk_buff *skb)
{
        pr_debug("%s: %pI4 -> %pI4, %s\n",
                 __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
                 skb->dev ? skb->dev->name : "?");
        kfree_skb(skb);
        WARN_ON(1);
        return 0;
}

/*
   We do not cache the source address of the outgoing interface, because it
   is used only by IP RR, TS and SRR options, so it is out of the fast path.

   BTW remember: "addr" is allowed to be unaligned in IP options!
 */

void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
{
        __be32 src;

        if (rt_is_output_route(rt))
                src = ip_hdr(skb)->saddr;
        else {
                struct fib_result res;
                struct flowi4 fl4;
                struct iphdr *iph;

                iph = ip_hdr(skb);

                memset(&fl4, 0, sizeof(fl4));
                fl4.daddr = iph->daddr;
                fl4.saddr = iph->saddr;
                fl4.flowi4_tos = RT_TOS(iph->tos);
                fl4.flowi4_oif = rt->dst.dev->ifindex;
                fl4.flowi4_iif = skb->dev->ifindex;
                fl4.flowi4_mark = skb->mark;

                rcu_read_lock();
                if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
                        src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
                else
                        src = inet_select_addr(rt->dst.dev,
                                               rt_nexthop(rt, iph->daddr),
                                               RT_SCOPE_UNIVERSE);
                rcu_read_unlock();
        }
        memcpy(addr, &src, 4);
}

#ifdef CONFIG_IP_ROUTE_CLASSID
static void set_class_tag(struct rtable *rt, u32 tag)
{
        if (!(rt->dst.tclassid & 0xFFFF))
                rt->dst.tclassid |= tag & 0xFFFF;
        if (!(rt->dst.tclassid & 0xFFFF0000))
                rt->dst.tclassid |= tag & 0xFFFF0000;
}
#endif

static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
{
        unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);

        if (advmss == 0) {
                advmss = max_t(unsigned int, dst->dev->mtu - 40,
                               ip_rt_min_advmss);
                if (advmss > 65535 - 40)
                        advmss = 65535 - 40;
        }
        return advmss;
}

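/* Effective MTU selection order: a still-valid learned PMTU, then an
 * explicit RTAX_MTU metric, then the device MTU.  When the metric is
 * locked and the route goes via a gateway, clamp to 576 bytes, the
 * classic minimum every IPv4 host must be able to reassemble (RFC 791).
 */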
static unsigned int ipv4_mtu(const struct dst_entry *dst)
{
        const struct rtable *rt = (const struct rtable *) dst;
        unsigned int mtu = rt->rt_pmtu;

        if (!mtu || time_after_eq(jiffies, rt->dst.expires))
                mtu = dst_metric_raw(dst, RTAX_MTU);

        if (mtu)
                return mtu;

        mtu = dst->dev->mtu;

        if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
                if (rt->rt_uses_gateway && mtu > 576)
                        mtu = 576;
        }

        return min_t(unsigned int, mtu, IP_MAX_MTU);
}

static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
{
        struct fnhe_hash_bucket *hash = nh->nh_exceptions;
        struct fib_nh_exception *fnhe;
        u32 hval;

        if (!hash)
                return NULL;

        hval = fnhe_hashfun(daddr);

        for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
             fnhe = rcu_dereference(fnhe->fnhe_next)) {
                if (fnhe->fnhe_daddr == daddr)
                        return fnhe;
        }
        return NULL;
}

static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
                              __be32 daddr)
{
        bool ret = false;

        spin_lock_bh(&fnhe_lock);

        if (daddr == fnhe->fnhe_daddr) {
                struct rtable __rcu **porig;
                struct rtable *orig;
                int genid = fnhe_genid(dev_net(rt->dst.dev));

                if (rt_is_input_route(rt))
                        porig = &fnhe->fnhe_rth_input;
                else
                        porig = &fnhe->fnhe_rth_output;
                orig = rcu_dereference(*porig);

                if (fnhe->fnhe_genid != genid) {
                        fnhe->fnhe_genid = genid;
                        fnhe->fnhe_gw = 0;
                        fnhe->fnhe_pmtu = 0;
                        fnhe->fnhe_expires = 0;
                        fnhe_flush_routes(fnhe);
                        orig = NULL;
                }
                fill_route_from_fnhe(rt, fnhe);
                if (!rt->rt_gateway)
                        rt->rt_gateway = daddr;

                if (!(rt->dst.flags & DST_NOCACHE)) {
                        rcu_assign_pointer(*porig, rt);
                        if (orig)
                                rt_free(orig);
                        ret = true;
                }

                fnhe->fnhe_stamp = jiffies;
        }
        spin_unlock_bh(&fnhe_lock);

        return ret;
}

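/* Cache the route in the nexthop (the input slot, or this CPU's output
 * slot) with a lock-free cmpxchg: whichever writer installs its route
 * first wins and frees the previous entry; a loser returns false, and the
 * caller (rt_set_nexthop()) then marks the route DST_NOCACHE and puts it
 * on the uncached list instead.
 */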
static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt)
{
        struct rtable *orig, *prev, **p;
        bool ret = true;

        if (rt_is_input_route(rt)) {
                p = (struct rtable **)&nh->nh_rth_input;
        } else {
                p = (struct rtable **)__this_cpu_ptr(nh->nh_pcpu_rth_output);
        }
        orig = *p;

        prev = cmpxchg(p, orig, rt);
        if (prev == orig) {
                if (orig)
                        rt_free(orig);
        } else
                ret = false;

        return ret;
}

static DEFINE_SPINLOCK(rt_uncached_lock);
static LIST_HEAD(rt_uncached_list);

static void rt_add_uncached_list(struct rtable *rt)
{
        spin_lock_bh(&rt_uncached_lock);
        list_add_tail(&rt->rt_uncached, &rt_uncached_list);
        spin_unlock_bh(&rt_uncached_lock);
}

static void ipv4_dst_destroy(struct dst_entry *dst)
{
        struct rtable *rt = (struct rtable *) dst;

        if (!list_empty(&rt->rt_uncached)) {
                spin_lock_bh(&rt_uncached_lock);
                list_del(&rt->rt_uncached);
                spin_unlock_bh(&rt_uncached_lock);
        }
}

void rt_flush_dev(struct net_device *dev)
{
        if (!list_empty(&rt_uncached_list)) {
                struct net *net = dev_net(dev);
                struct rtable *rt;

                spin_lock_bh(&rt_uncached_lock);
                list_for_each_entry(rt, &rt_uncached_list, rt_uncached) {
                        if (rt->dst.dev != dev)
                                continue;
                        rt->dst.dev = net->loopback_dev;
                        dev_hold(rt->dst.dev);
                        dev_put(dev);
                }
                spin_unlock_bh(&rt_uncached_lock);
        }
}

static bool rt_cache_valid(const struct rtable *rt)
{
        return  rt &&
                rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
                !rt_is_expired(rt);
}

static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
                           const struct fib_result *res,
                           struct fib_nh_exception *fnhe,
                           struct fib_info *fi, u16 type, u32 itag)
{
        bool cached = false;

        if (fi) {
                struct fib_nh *nh = &FIB_RES_NH(*res);

                if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK) {
                        rt->rt_gateway = nh->nh_gw;
                        rt->rt_uses_gateway = 1;
                }
                dst_init_metrics(&rt->dst, fi->fib_metrics, true);
#ifdef CONFIG_IP_ROUTE_CLASSID
                rt->dst.tclassid = nh->nh_tclassid;
#endif
                if (unlikely(fnhe))
                        cached = rt_bind_exception(rt, fnhe, daddr);
                else if (!(rt->dst.flags & DST_NOCACHE))
                        cached = rt_cache_route(nh, rt);
                if (unlikely(!cached)) {
                        /* Routes we intend to cache in a nexthop exception
                         * or FIB nexthop have the DST_NOCACHE bit clear.
                         * However, if we are unsuccessful at storing this
                         * route into the cache, we really do need to set it.
                         */
                        rt->dst.flags |= DST_NOCACHE;
                        if (!rt->rt_gateway)
                                rt->rt_gateway = daddr;
                        rt_add_uncached_list(rt);
                }
        } else
                rt_add_uncached_list(rt);

#ifdef CONFIG_IP_ROUTE_CLASSID
#ifdef CONFIG_IP_MULTIPLE_TABLES
        set_class_tag(rt, res->tclassid);
#endif
        set_class_tag(rt, itag);
#endif
}

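/* All routes start life with DST_OBSOLETE_FORCE_CHK so that every user is
 * funnelled through ipv4_dst_check().  Routes we do not intend to cache
 * also get DST_HOST | DST_NOCACHE up front, which rt_set_nexthop() relies
 * on when deciding whether to store the route in the FIB nexthop.
 */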
static struct rtable *rt_dst_alloc(struct net_device *dev,
                                   bool nopolicy, bool noxfrm, bool will_cache)
{
        return dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
                         (will_cache ? 0 : (DST_HOST | DST_NOCACHE)) |
                         (nopolicy ? DST_NOPOLICY : 0) |
                         (noxfrm ? DST_NOXFRM : 0));
}

/* called in rcu_read_lock() section */
static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
                             u8 tos, struct net_device *dev, int our)
{
        struct rtable *rth;
        struct in_device *in_dev = __in_dev_get_rcu(dev);
        u32 itag = 0;
        int err;

        /* Primary sanity checks. */

        if (in_dev == NULL)
                return -EINVAL;

        if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
            skb->protocol != htons(ETH_P_IP))
                goto e_inval;

        if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
                if (ipv4_is_loopback(saddr))
                        goto e_inval;

        if (ipv4_is_zeronet(saddr)) {
                if (!ipv4_is_local_multicast(daddr))
                        goto e_inval;
        } else {
                err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
                                          in_dev, &itag);
                if (err < 0)
                        goto e_err;
        }
        rth = rt_dst_alloc(dev_net(dev)->loopback_dev,
                           IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
        if (!rth)
                goto e_nobufs;

#ifdef CONFIG_IP_ROUTE_CLASSID
        rth->dst.tclassid = itag;
#endif
        rth->dst.output = ip_rt_bug;

        rth->rt_genid   = rt_genid_ipv4(dev_net(dev));
        rth->rt_flags   = RTCF_MULTICAST;
        rth->rt_type    = RTN_MULTICAST;
        rth->rt_is_input = 1;
        rth->rt_iif     = 0;
        rth->rt_pmtu    = 0;
        rth->rt_gateway = 0;
        rth->rt_uses_gateway = 0;
        INIT_LIST_HEAD(&rth->rt_uncached);
        if (our) {
                rth->dst.input = ip_local_deliver;
                rth->rt_flags |= RTCF_LOCAL;
        }

#ifdef CONFIG_IP_MROUTE
        if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
                rth->dst.input = ip_mr_input;
#endif
        RT_CACHE_STAT_INC(in_slow_mc);

        skb_dst_set(skb, &rth->dst);
        return 0;

e_nobufs:
        return -ENOBUFS;
e_inval:
        return -EINVAL;
e_err:
        return err;
}

static void ip_handle_martian_source(struct net_device *dev,
                                     struct in_device *in_dev,
                                     struct sk_buff *skb,
                                     __be32 daddr,
                                     __be32 saddr)
{
        RT_CACHE_STAT_INC(in_martian_src);
#ifdef CONFIG_IP_ROUTE_VERBOSE
        if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
                /*
                 *      RFC1812 recommendation: if the source is martian,
                 *      the only hint is the MAC header.
                 */
                pr_warn("martian source %pI4 from %pI4, on dev %s\n",
                        &daddr, &saddr, dev->name);
                if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
                        print_hex_dump(KERN_WARNING, "ll header: ",
                                       DUMP_PREFIX_OFFSET, 16, 1,
                                       skb_mac_header(skb),
                                       dev->hard_header_len, true);
                }
        }
#endif
}

1516 /* called in rcu_read_lock() section */
1517 static int __mkroute_input(struct sk_buff *skb,
1518                            const struct fib_result *res,
1519                            struct in_device *in_dev,
1520                            __be32 daddr, __be32 saddr, u32 tos)
1521 {
1522         struct fib_nh_exception *fnhe;
1523         struct rtable *rth;
1524         int err;
1525         struct in_device *out_dev;
1526         unsigned int flags = 0;
1527         bool do_cache;
1528         u32 itag;
1529
1530         /* get a working reference to the output device */
1531         out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
1532         if (out_dev == NULL) {
1533                 net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
1534                 return -EINVAL;
1535         }
1536
1537         err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
1538                                   in_dev->dev, in_dev, &itag);
1539         if (err < 0) {
1540                 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1541                                          saddr);
1542
1543                 goto cleanup;
1544         }

        do_cache = res->fi && !itag;
        if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
            (IN_DEV_SHARED_MEDIA(out_dev) ||
             inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res)))) {
                flags |= RTCF_DOREDIRECT;
                do_cache = false;
        }

        if (skb->protocol != htons(ETH_P_IP)) {
                /* Not IP (i.e. ARP). Do not create a route if it is
                 * invalid for proxy ARP. DNAT routes are always valid.
                 *
                 * The proxy ARP feature has been extended to allow ARP
                 * replies back out the same interface, to support
                 * private VLAN switch technologies. See arp.c.
                 */
                if (out_dev == in_dev &&
                    IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
                        err = -EINVAL;
                        goto cleanup;
                }
        }

        fnhe = find_exception(&FIB_RES_NH(*res), daddr);
        if (do_cache) {
                if (fnhe != NULL)
                        rth = rcu_dereference(fnhe->fnhe_rth_input);
                else
                        rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);

                if (rt_cache_valid(rth)) {
                        skb_dst_set_noref(skb, &rth->dst);
                        goto out;
                }
        }

        rth = rt_dst_alloc(out_dev->dev,
                           IN_DEV_CONF_GET(in_dev, NOPOLICY),
                           IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache);
        if (!rth) {
                err = -ENOBUFS;
                goto cleanup;
        }

        rth->rt_genid = rt_genid_ipv4(dev_net(rth->dst.dev));
        rth->rt_flags = flags;
        rth->rt_type = res->type;
        rth->rt_is_input = 1;
        rth->rt_iif     = 0;
        rth->rt_pmtu    = 0;
        rth->rt_gateway = 0;
        rth->rt_uses_gateway = 0;
        INIT_LIST_HEAD(&rth->rt_uncached);
        RT_CACHE_STAT_INC(in_slow_tot);

        rth->dst.input = ip_forward;
        rth->dst.output = ip_output;

        rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag);
        skb_dst_set(skb, &rth->dst);
out:
        err = 0;
cleanup:
        return err;
}
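
/*
 * A cached input dst is looked up first in a matching per-destination
 * exception (fnhe_rth_input) and only then in the nexthop-wide
 * nh_rth_input slot. A minimal sketch of that precedence, assuming a
 * nexthop *nh already chosen by fib_lookup():
 *
 *      fnhe = find_exception(nh, daddr);
 *      rth = fnhe ? rcu_dereference(fnhe->fnhe_rth_input)
 *                 : rcu_dereference(nh->nh_rth_input);
 *      if (rt_cache_valid(rth))
 *              skb_dst_set_noref(skb, &rth->dst);
 */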

static int ip_mkroute_input(struct sk_buff *skb,
                            struct fib_result *res,
                            const struct flowi4 *fl4,
                            struct in_device *in_dev,
                            __be32 daddr, __be32 saddr, u32 tos)
{
#ifdef CONFIG_IP_ROUTE_MULTIPATH
        if (res->fi && res->fi->fib_nhs > 1)
                fib_select_multipath(res);
#endif

        /* create a routing cache entry */
        return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
}
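
/*
 * The fib_select_multipath() call above only runs when the matched
 * fib_info carries more than one nexthop, e.g. for a route configured
 * as (illustrative iproute2 invocation; addresses are placeholders):
 *
 *      ip route add default \
 *              nexthop via 192.0.2.1 dev eth0 \
 *              nexthop via 198.51.100.1 dev eth1
 */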

/*
 *      NOTE: we drop all packets that have a local source address,
 *      because every properly looped-back packet must already have the
 *      correct destination attached by the output routine.
 *
 *      This approach solves two big problems:
 *      1. Non-simplex devices are handled properly.
 *      2. IP spoofing attempts are filtered with a 100% guarantee.
 *
 *      Called with rcu_read_lock().
 */

static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
                               u8 tos, struct net_device *dev)
{
        struct fib_result res;
        struct in_device *in_dev = __in_dev_get_rcu(dev);
        struct flowi4   fl4;
        unsigned int    flags = 0;
        u32             itag = 0;
        struct rtable   *rth;
        int             err = -EINVAL;
        struct net      *net = dev_net(dev);
        bool do_cache;

        /* IP on this device is disabled. */
        if (!in_dev)
                goto out;

        /* Check for the weirdest martians, which cannot be detected
         * by fib_lookup.
         */
        if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
                goto martian_source;

        res.fi = NULL;
        if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
                goto brd_input;

        /* Accept zero addresses only for limited broadcast;
         * I do not even know whether to fix this or not. Waiting for complaints :-)
         */
        if (ipv4_is_zeronet(saddr))
                goto martian_source;

        if (ipv4_is_zeronet(daddr))
                goto martian_destination;

        /* The following code tries to avoid calling
         * IN_DEV_NET_ROUTE_LOCALNET() more than once, when daddr and/or
         * saddr is a loopback address.
         */
        if (ipv4_is_loopback(daddr)) {
                if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
                        goto martian_destination;
        } else if (ipv4_is_loopback(saddr)) {
                if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
                        goto martian_source;
        }

        /*
         *      Now we are ready to route the packet.
         */
        fl4.flowi4_oif = 0;
        fl4.flowi4_iif = dev->ifindex;
        fl4.flowi4_mark = skb->mark;
        fl4.flowi4_tos = tos;
        fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
        fl4.daddr = daddr;
        fl4.saddr = saddr;
        err = fib_lookup(net, &fl4, &res);
        if (err != 0)
                goto no_route;

        if (res.type == RTN_BROADCAST)
                goto brd_input;

        if (res.type == RTN_LOCAL) {
                err = fib_validate_source(skb, saddr, daddr, tos,
                                          LOOPBACK_IFINDEX,
                                          dev, in_dev, &itag);
                if (err < 0)
                        goto martian_source_keep_err;
                goto local_input;
        }

        if (!IN_DEV_FORWARD(in_dev))
                goto no_route;
        if (res.type != RTN_UNICAST)
                goto martian_destination;

        err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
out:    return err;

brd_input:
        if (skb->protocol != htons(ETH_P_IP))
                goto e_inval;

        if (!ipv4_is_zeronet(saddr)) {
                err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
                                          in_dev, &itag);
                if (err < 0)
                        goto martian_source_keep_err;
        }
        flags |= RTCF_BROADCAST;
        res.type = RTN_BROADCAST;
        RT_CACHE_STAT_INC(in_brd);

local_input:
        do_cache = false;
        if (res.fi) {
                if (!itag) {
                        rth = rcu_dereference(FIB_RES_NH(res).nh_rth_input);
                        if (rt_cache_valid(rth)) {
                                skb_dst_set_noref(skb, &rth->dst);
                                err = 0;
                                goto out;
                        }
                        do_cache = true;
                }
        }

        rth = rt_dst_alloc(net->loopback_dev,
                           IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache);
        if (!rth)
                goto e_nobufs;

        rth->dst.input = ip_local_deliver;
        rth->dst.output = ip_rt_bug;
#ifdef CONFIG_IP_ROUTE_CLASSID
        rth->dst.tclassid = itag;
#endif

        rth->rt_genid = rt_genid_ipv4(net);
        rth->rt_flags   = flags | RTCF_LOCAL;
        rth->rt_type    = res.type;
        rth->rt_is_input = 1;
        rth->rt_iif     = 0;
        rth->rt_pmtu    = 0;
        rth->rt_gateway = 0;
        rth->rt_uses_gateway = 0;
        INIT_LIST_HEAD(&rth->rt_uncached);
        RT_CACHE_STAT_INC(in_slow_tot);
        if (res.type == RTN_UNREACHABLE) {
                rth->dst.input = ip_error;
                rth->dst.error = -err;
                rth->rt_flags &= ~RTCF_LOCAL;
        }
        if (do_cache) {
                if (unlikely(!rt_cache_route(&FIB_RES_NH(res), rth))) {
                        rth->dst.flags |= DST_NOCACHE;
                        rt_add_uncached_list(rth);
                }
        }
        skb_dst_set(skb, &rth->dst);
        err = 0;
        goto out;

no_route:
        RT_CACHE_STAT_INC(in_no_route);
        res.type = RTN_UNREACHABLE;
        if (err == -ESRCH)
                err = -ENETUNREACH;
        goto local_input;

        /*
         *      Do not cache martian addresses: they should be logged (RFC 1812).
         */
martian_destination:
        RT_CACHE_STAT_INC(in_martian_dst);
#ifdef CONFIG_IP_ROUTE_VERBOSE
        if (IN_DEV_LOG_MARTIANS(in_dev))
                net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
                                     &daddr, &saddr, dev->name);
#endif

e_inval:
        err = -EINVAL;
        goto out;

e_nobufs:
        err = -ENOBUFS;
        goto out;

martian_source:
        err = -EINVAL;
martian_source_keep_err:
        ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
        goto out;
}

int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
                         u8 tos, struct net_device *dev)
{
        int res;

        rcu_read_lock();

        /* Multicast recognition logic was moved from the route cache to
           here. The problem was that too many Ethernet cards have
           broken/missing hardware multicast filters :-( As a result, a
           host on a multicast network could acquire a lot of useless
           route cache entries, e.g. for SDR messages from all over the
           world. Now we try to get rid of them. Really, provided the
           software IP multicast filter is organized reasonably (at
           least, hashed), it does not result in a slowdown compared
           with route cache reject entries. Note that multicast routers
           are not affected, because a route cache entry is created
           eventually.
         */
        if (ipv4_is_multicast(daddr)) {
                struct in_device *in_dev = __in_dev_get_rcu(dev);

                if (in_dev) {
                        int our = ip_check_mc_rcu(in_dev, daddr, saddr,
                                                  ip_hdr(skb)->protocol);
                        if (our
#ifdef CONFIG_IP_MROUTE
                                ||
                            (!ipv4_is_local_multicast(daddr) &&
                             IN_DEV_MFORWARD(in_dev))
#endif
                           ) {
                                int res = ip_route_input_mc(skb, daddr, saddr,
                                                            tos, dev, our);
                                rcu_read_unlock();
                                return res;
                        }
                }
                rcu_read_unlock();
                return -EINVAL;
        }
        res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
        rcu_read_unlock();
        return res;
}
EXPORT_SYMBOL(ip_route_input_noref);
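
/*
 * A minimal usage sketch (illustrative only; the error label is
 * hypothetical): receive-path callers typically resolve the route
 * straight from the IP header, along the lines of
 *
 *      const struct iphdr *iph = ip_hdr(skb);
 *      int err = ip_route_input_noref(skb, iph->daddr, iph->saddr,
 *                                     iph->tos, skb->dev);
 *      if (unlikely(err))
 *              goto drop;
 *
 * On success the resolved dst is attached to the skb, possibly without
 * an extra reference (hence "noref"), and later drives
 * ip_local_deliver()/ip_forward() through dst.input.
 */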

/* called with rcu_read_lock() */
static struct rtable *__mkroute_output(const struct fib_result *res,
                                       const struct flowi4 *fl4, int orig_oif,
                                       struct net_device *dev_out,
                                       unsigned int flags)
{
        struct fib_info *fi = res->fi;
        struct fib_nh_exception *fnhe;
        struct in_device *in_dev;
        u16 type = res->type;
        struct rtable *rth;
        bool do_cache;

        in_dev = __in_dev_get_rcu(dev_out);
        if (!in_dev)
                return ERR_PTR(-EINVAL);

        if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
                if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
                        return ERR_PTR(-EINVAL);

        if (ipv4_is_lbcast(fl4->daddr))
                type = RTN_BROADCAST;
        else if (ipv4_is_multicast(fl4->daddr))
                type = RTN_MULTICAST;
        else if (ipv4_is_zeronet(fl4->daddr))
                return ERR_PTR(-EINVAL);

        if (dev_out->flags & IFF_LOOPBACK)
                flags |= RTCF_LOCAL;

        do_cache = true;
        if (type == RTN_BROADCAST) {
                flags |= RTCF_BROADCAST | RTCF_LOCAL;
                fi = NULL;
        } else if (type == RTN_MULTICAST) {
                flags |= RTCF_MULTICAST | RTCF_LOCAL;
                if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
                                     fl4->flowi4_proto))
                        flags &= ~RTCF_LOCAL;
                else
                        do_cache = false;
                /* If a multicast route does not exist, use the default
                 * one, but do not gateway in this case.
                 * Yes, it is a hack.
                 */
                if (fi && res->prefixlen < 4)
                        fi = NULL;
        }

        fnhe = NULL;
        do_cache &= fi != NULL;
        if (do_cache) {
                struct rtable __rcu **prth;
                struct fib_nh *nh = &FIB_RES_NH(*res);

                fnhe = find_exception(nh, fl4->daddr);
                if (fnhe)
                        prth = &fnhe->fnhe_rth_output;
                else {
                        if (unlikely(fl4->flowi4_flags &
                                     FLOWI_FLAG_KNOWN_NH &&
                                     !(nh->nh_gw &&
                                       nh->nh_scope == RT_SCOPE_LINK))) {
                                do_cache = false;
                                goto add;
                        }
                        prth = __this_cpu_ptr(nh->nh_pcpu_rth_output);
                }
                rth = rcu_dereference(*prth);
                if (rt_cache_valid(rth)) {
                        dst_hold(&rth->dst);
                        return rth;
                }
        }

add:
        rth = rt_dst_alloc(dev_out,
                           IN_DEV_CONF_GET(in_dev, NOPOLICY),
                           IN_DEV_CONF_GET(in_dev, NOXFRM),
                           do_cache);
        if (!rth)
                return ERR_PTR(-ENOBUFS);

        rth->dst.output = ip_output;

        rth->rt_genid = rt_genid_ipv4(dev_net(dev_out));
        rth->rt_flags   = flags;
        rth->rt_type    = type;
        rth->rt_is_input = 0;
        rth->rt_iif     = orig_oif ? : 0;
        rth->rt_pmtu    = 0;
        rth->rt_gateway = 0;
        rth->rt_uses_gateway = 0;
        INIT_LIST_HEAD(&rth->rt_uncached);

        RT_CACHE_STAT_INC(out_slow_tot);

        if (flags & RTCF_LOCAL)
                rth->dst.input = ip_local_deliver;
        if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
                if (flags & RTCF_LOCAL &&
                    !(dev_out->flags & IFF_LOOPBACK)) {
                        rth->dst.output = ip_mc_output;
                        RT_CACHE_STAT_INC(out_slow_mc);
                }
#ifdef CONFIG_IP_MROUTE
                if (type == RTN_MULTICAST) {
                        if (IN_DEV_MFORWARD(in_dev) &&
                            !ipv4_is_local_multicast(fl4->daddr)) {
                                rth->dst.input = ip_mr_input;
                                rth->dst.output = ip_mc_output;
                        }
                }
#endif
        }

        rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0);

        return rth;
}
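
/*
 * Note on the FLOWI_FLAG_KNOWN_NH escape in the caching block above:
 * when the caller has pinned its own nexthop and the FIB nexthop is
 * not an on-link gateway, the resulting dst is specific to this one
 * destination, so it is deliberately left uncached (do_cache is
 * cleared) rather than parked in the shared per-CPU slot, where other
 * flows using the same route could pick it up.
 */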

/*
 * Major route resolver routine.
 */

struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)
{
        struct net_device *dev_out = NULL;
        __u8 tos = RT_FL_TOS(fl4);
        unsigned int flags = 0;
        struct fib_result res;
        struct rtable *rth;
        int orig_oif;

        res.tclassid    = 0;
        res.fi          = NULL;
        res.table       = NULL;

        orig_oif = fl4->flowi4_oif;

        fl4->flowi4_iif = LOOPBACK_IFINDEX;
        fl4->flowi4_tos = tos & IPTOS_RT_MASK;
        fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
                         RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);

        rcu_read_lock();
        if (fl4->saddr) {
                rth = ERR_PTR(-EINVAL);
                if (ipv4_is_multicast(fl4->saddr) ||
                    ipv4_is_lbcast(fl4->saddr) ||
                    ipv4_is_zeronet(fl4->saddr))
                        goto out;

                /* I removed the check for oif == dev_out->oif here.
                   It was wrong for two reasons:
                   1. ip_dev_find(net, saddr) can return the wrong iface
                      if saddr is assigned to multiple interfaces.
                   2. Moreover, we are allowed to send packets with the
                      saddr of another iface. --ANK
                 */

                if (fl4->flowi4_oif == 0 &&
                    (ipv4_is_multicast(fl4->daddr) ||
                     ipv4_is_lbcast(fl4->daddr))) {
                        /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
                        dev_out = __ip_dev_find(net, fl4->saddr, false);
                        if (dev_out == NULL)
                                goto out;

                        /* Special hack: the user can direct multicasts
                           and limited broadcast via the necessary
                           interface without fiddling with IP_MULTICAST_IF
                           or IP_PKTINFO. This hack is not just for fun,
                           it allows vic, vat and friends to work. They
                           bind a socket to loopback, set ttl to zero and
                           expect that it will work. From the viewpoint
                           of the routing cache they are broken, because
                           we are not allowed to build a multicast path
                           with a loopback source addr (look, the routing
                           cache cannot know that ttl is zero, so that
                           the packet will not leave this host and the
                           route is valid). Luckily, this hack is a good
                           workaround.
                         */

                        fl4->flowi4_oif = dev_out->ifindex;
                        goto make_route;
                }

                if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
                        /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
                        if (!__ip_dev_find(net, fl4->saddr, false))
                                goto out;
                }
        }

        if (fl4->flowi4_oif) {
                dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
                rth = ERR_PTR(-ENODEV);
                if (dev_out == NULL)
                        goto out;

                /* RACE: Check return value of inet_select_addr instead. */
                if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
                        rth = ERR_PTR(-ENETUNREACH);
                        goto out;
                }
                if (ipv4_is_local_multicast(fl4->daddr) ||
                    ipv4_is_lbcast(fl4->daddr)) {
                        if (!fl4->saddr)
                                fl4->saddr = inet_select_addr(dev_out, 0,
                                                              RT_SCOPE_LINK);
                        goto make_route;
                }
                if (!fl4->saddr) {
                        if (ipv4_is_multicast(fl4->daddr))
                                fl4->saddr = inet_select_addr(dev_out, 0,
                                                              fl4->flowi4_scope);
                        else if (!fl4->daddr)
                                fl4->saddr = inet_select_addr(dev_out, 0,
                                                              RT_SCOPE_HOST);
                }
        }

        if (!fl4->daddr) {
                fl4->daddr = fl4->saddr;
                if (!fl4->daddr)
                        fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
                dev_out = net->loopback_dev;
                fl4->flowi4_oif = LOOPBACK_IFINDEX;
                res.type = RTN_LOCAL;
                flags |= RTCF_LOCAL;
                goto make_route;
        }

        if (fib_lookup(net, fl4, &res)) {
                res.fi = NULL;
                res.table = NULL;
                if (fl4->flowi4_oif) {
                        /* Apparently, the routing tables are wrong.
                           Assume that the destination is on-link.

                           WHY? DW.
                           Because we are allowed to send to an iface
                           even if it has NO routes and NO assigned
                           addresses. When oif is specified, the routing
                           tables are looked up with only one purpose:
                           to catch whether the destination is gatewayed
                           rather than direct. Moreover, if MSG_DONTROUTE
                           is set, we send the packet, ignoring both the
                           routing tables and the ifaddr state. --ANK

                           We could do this even when oif is unknown,
                           as IPv6 likely does, but we do not.
                         */

                        if (fl4->saddr == 0)
                                fl4->saddr = inet_select_addr(dev_out, 0,
                                                              RT_SCOPE_LINK);
                        res.type = RTN_UNICAST;
                        goto make_route;
                }
                rth = ERR_PTR(-ENETUNREACH);
                goto out;
        }

        if (res.type == RTN_LOCAL) {
                if (!fl4->saddr) {
                        if (res.fi->fib_prefsrc)
                                fl4->saddr = res.fi->fib_prefsrc;
                        else
                                fl4->saddr = fl4->daddr;
                }
                dev_out = net->loopback_dev;
                fl4->flowi4_oif = dev_out->ifindex;
                flags |= RTCF_LOCAL;
                goto make_route;
        }

#ifdef CONFIG_IP_ROUTE_MULTIPATH
        if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
                fib_select_multipath(&res);
        else
#endif
        if (!res.prefixlen &&
            res.table->tb_num_default > 1 &&
            res.type == RTN_UNICAST && !fl4->flowi4_oif)
                fib_select_default(&res);

        if (!fl4->saddr)
                fl4->saddr = FIB_RES_PREFSRC(net, res);

        dev_out = FIB_RES_DEV(res);
        fl4->flowi4_oif = dev_out->ifindex;

make_route:
        rth = __mkroute_output(&res, fl4, orig_oif, dev_out, flags);

out:
        rcu_read_unlock();
        return rth;
}
EXPORT_SYMBOL_GPL(__ip_route_output_key);
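
/*
 * A minimal usage sketch (illustrative only; daddr, saddr, oif and tos
 * are placeholders supplied by the caller): output routes are resolved
 * by filling a flow key and checking the result for an error pointer,
 * along the lines of
 *
 *      struct flowi4 fl4 = {
 *              .daddr          = daddr,
 *              .saddr          = saddr,
 *              .flowi4_oif     = oif,
 *              .flowi4_tos     = RT_TOS(tos),
 *      };
 *      struct rtable *rt = __ip_route_output_key(net, &fl4);
 *
 *      if (IS_ERR(rt))
 *              return PTR_ERR(rt);
 *      ...
 *      ip_rt_put(rt);
 *
 * On return, fl4 is updated in place; in particular a selected source
 * address ends up in fl4->saddr.
 */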

static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
{
        return NULL;
}

static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
{
        unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);

        return mtu ? : dst->dev->mtu;
}

static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
                                          struct sk_buff *skb, u32 mtu)
{
}

static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
                                       struct sk_buff *skb)
{
}

static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
                                          unsigned long old)
{
        return NULL;
}

static struct dst_ops ipv4_dst_blackhole_ops = {
        .family                 =       AF_INET,
        .protocol               =       cpu_to_be16(ETH_P_IP),
        .check                  =       ipv4_blackhole_dst_check,
        .mtu                    =       ipv4_blackhole_mtu,
        .default_advmss         =       ipv4_default_advmss,
        .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
        .redirect               =       ipv4_rt_blackhole_redirect,
        .cow_metrics            =       ipv4_rt_blackhole_cow_metrics,
        .neigh_lookup           =       ipv4_neigh_lookup,
};

struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
{
        struct rtable *ort = (struct rtable *) dst_orig;
        struct rtable *rt;

        rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_NONE, 0);
        if (rt) {
                struct dst_entry *new = &rt->dst;

                new->__use = 1;
                new->input = dst_discard;
                new->output = dst_discard;

                new->dev = ort->dst.dev;
                if (new->dev)
                        dev_hold(new->dev);

                rt->rt_is_input = ort->rt_is_input;
                rt->rt_iif = ort->rt_iif;
                rt->rt_pmtu = ort->rt_pmtu;

                rt->rt_genid = rt_genid_ipv4(net);
                rt->rt_flags = ort->rt_flags;
                rt->rt_type = ort->rt_type;
                rt->rt_gateway = ort->rt_gateway;
                rt->rt_uses_gateway = ort->rt_uses_gateway;

                INIT_LIST_HEAD(&rt->rt_uncached);

                dst_free(new);
        }

        dst_release(dst_orig);

        return rt ? &rt->dst : ERR_PTR(-ENOMEM);
}
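
/*
 * ipv4_blackhole_route() clones an existing route into a dst whose
 * input and output handlers simply discard packets and whose metrics
 * can no longer be changed. The xfrm layer uses such dsts to quietly
 * drop traffic for flows it cannot currently transform.
 */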

struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
                                    struct sock *sk)
{
        struct rtable *rt = __ip_route_output_key(net, flp4);

        if (IS_ERR(rt))
                return rt;

        if (flp4->flowi4_proto)
                rt = (struct rtable *) xfrm_lookup(net, &rt->dst,
                                                   flowi4_to_flowi(flp4),
                                                   sk, 0);

        return rt;
}
EXPORT_SYMBOL_GPL(ip_route_output_flow);
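
/*
 * ip_route_output_flow() is the socket-facing wrapper: on top of the
 * plain key lookup it passes the result through xfrm_lookup() whenever
 * a transport protocol is set, so transformations (e.g. IPsec) can
 * substitute their own dst. A sketch of a caller (illustrative
 * placeholders only):
 *
 *      fl4.flowi4_proto = IPPROTO_UDP;
 *      rt = ip_route_output_flow(net, &fl4, sk);
 *      if (IS_ERR(rt))
 *              return PTR_ERR(rt);
 */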

static int rt_fill_info(struct net *net, __be32 dst, __be32 src,
                        struct flowi4 *fl4, struct sk_buff *skb, u32 portid,
                        u32 seq, int event, int nowait, unsigned int flags)
{
        struct rtable *rt = skb_rtable(skb);
        struct rtmsg *r;
        struct nlmsghdr *nlh;
        unsigned long expires = 0;
        u32 error;
        u32 metrics[RTAX_MAX];

        nlh = nlmsg_put(skb, portid, seq, event, sizeof(*r), flags);
        if (nlh == NULL)
                return -EMSGSIZE;

        r = nlmsg_data(nlh);
        r->rtm_family   = AF_INET;
        r->rtm_dst_len  = 32;
        r->rtm_src_len  = 0;
        r->rtm_tos      = fl4->flowi4_tos;
        r->rtm_table    = RT_TABLE_MAIN;
        if (nla_put_u32(skb, RTA_TABLE, RT_TABLE_MAIN))
                goto nla_put_failure;
        r->rtm_type     = rt->rt_type;
        r->rtm_scope    = RT_SCOPE_UNIVERSE;
        r->rtm_protocol = RTPROT_UNSPEC;
        r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
        if (rt->rt_flags & RTCF_NOTIFY)
                r->rtm_flags |= RTM_F_NOTIFY;

        if (nla_put_be32(skb, RTA_DST, dst))
                goto nla_put_failure;
        if (src) {
                r->rtm_src_len = 32;
                if (nla_put_be32(skb, RTA_SRC, src))
                        goto nla_put_failure;
        }
        if (rt->dst.dev &&
            nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
                goto nla_put_failure;
#ifdef CONFIG_IP_ROUTE_CLASSID
        if (rt->dst.tclassid &&
            nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
                goto nla_put_failure;
#endif
        if (!rt_is_input_route(rt) &&
            fl4->saddr != src) {
                if (nla_put_be32(skb, RTA_PREFSRC, fl4->saddr))
                        goto nla_put_failure;
        }
        if (rt->rt_uses_gateway &&
            nla_put_be32(skb, RTA_GATEWAY, rt->rt_gateway))
                goto nla_put_failure;

        expires = rt->dst.expires;
        if (expires) {
                unsigned long now = jiffies;

                if (time_before(now, expires))
                        expires -= now;
                else
                        expires = 0;
        }

        memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
        if (rt->rt_pmtu && expires)
                metrics[RTAX_MTU - 1] = rt->rt_pmtu;
        if (rtnetlink_put_metrics(skb, metrics) < 0)
                goto nla_put_failure;

        if (fl4->flowi4_mark &&
            nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
                goto nla_put_failure;

        error = rt->dst.error;

        if (rt_is_input_route(rt)) {
#ifdef CONFIG_IP_MROUTE
                if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
                    IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
                        int err = ipmr_get_route(net, skb,
                                                 fl4->saddr, fl4->daddr,
                                                 r, nowait);
                        if (err <= 0) {
                                if (!nowait) {
                                        if (err == 0)
                                                return 0;
                                        goto nla_put_failure;
                                } else {
                                        if (err == -EMSGSIZE)
                                                goto nla_put_failure;
                                        error = err;
                                }
                        }
                } else
#endif
                        if (nla_put_u32(skb, RTA_IIF, rt->rt_iif))
                                goto nla_put_failure;
        }

        if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
                goto nla_put_failure;

        return nlmsg_end(skb, nlh);

nla_put_failure:
        nlmsg_cancel(skb, nlh);
        return -EMSGSIZE;
}

static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh)
{
        struct net *net = sock_net(in_skb->sk);
        struct rtmsg *rtm;
        struct nlattr *tb[RTA_MAX+1];
        struct rtable *rt = NULL;
        struct flowi4 fl4;
        __be32 dst = 0;
        __be32 src = 0;
        u32 iif;
        int err;
        int mark;
        struct sk_buff *skb;

        err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
        if (err < 0)
                goto errout;

        rtm = nlmsg_data(nlh);

        skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
        if (skb == NULL) {
                err = -ENOBUFS;
                goto errout;
        }

        /* Reserve room for dummy headers; this skb can pass
           through a good chunk of the routing engine.
         */
        skb_reset_mac_header(skb);
        skb_reset_network_header(skb);

        /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
        ip_hdr(skb)->protocol = IPPROTO_ICMP;
        skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));

        src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
        dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
        iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
        mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;

        memset(&fl4, 0, sizeof(fl4));
        fl4.daddr = dst;
        fl4.saddr = src;
        fl4.flowi4_tos = rtm->rtm_tos;
        fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
        fl4.flowi4_mark = mark;

        if (iif) {
                struct net_device *dev;

                dev = __dev_get_by_index(net, iif);
                if (dev == NULL) {
                        err = -ENODEV;
                        goto errout_free;
                }

                skb->protocol   = htons(ETH_P_IP);
                skb->dev        = dev;
                skb->mark       = mark;
                local_bh_disable();
                err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
                local_bh_enable();

                rt = skb_rtable(skb);
                if (err == 0 && rt->dst.error)
                        err = -rt->dst.error;
        } else {
                rt = ip_route_output_key(net, &fl4);

                err = 0;
                if (IS_ERR(rt))
                        err = PTR_ERR(rt);
        }

        if (err)
                goto errout_free;

        skb_dst_set(skb, &rt->dst);
        if (rtm->rtm_flags & RTM_F_NOTIFY)
                rt->rt_flags |= RTCF_NOTIFY;

        err = rt_fill_info(net, dst, src, &fl4, skb,
                           NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
                           RTM_NEWROUTE, 0, 0);
        if (err <= 0)
                goto errout_free;

        err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
errout:
        return err;

errout_free:
        kfree_skb(skb);
        goto errout;
}
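
/*
 * This RTM_GETROUTE handler is what services "ip route get" queries
 * from userspace, e.g. (illustrative addresses):
 *
 *      ip route get 198.51.100.1
 *      ip route get 198.51.100.1 from 192.0.2.2 iif eth0
 *
 * With an input interface given, the kernel simulates reception of a
 * dummy packet via ip_route_input(); otherwise it performs an output
 * lookup via ip_route_output_key().
 */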

int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
{
        return skb->len;
}

void ip_rt_multicast_event(struct in_device *in_dev)
{
        rt_cache_flush(dev_net(in_dev->dev));
}

#ifdef CONFIG_SYSCTL
static int ip_rt_gc_timeout __read_mostly       = RT_GC_TIMEOUT;
static int ip_rt_gc_interval __read_mostly      = 60 * HZ;
static int ip_rt_gc_min_interval __read_mostly  = HZ / 2;
static int ip_rt_gc_elasticity __read_mostly    = 8;

static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write,
                                     void __user *buffer,
                                     size_t *lenp, loff_t *ppos)
{
        struct net *net = (struct net *)__ctl->extra1;

        if (write) {
                rt_cache_flush(net);
                fnhe_genid_bump(net);
                return 0;
        }

        return -EINVAL;
}
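
/*
 * The handler above backs the write-only sysctl net.ipv4.route.flush
 * registered further below, e.g. (illustrative shell usage):
 *
 *      sysctl -w net.ipv4.route.flush=1
 *
 * Writing any value flushes the cached routes of the writer's namespace
 * and bumps the fnhe genid, so cached exceptions are invalidated too.
 */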

static struct ctl_table ipv4_route_table[] = {
        {
                .procname       = "gc_thresh",
                .data           = &ipv4_dst_ops.gc_thresh,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec,
        },
        {
                .procname       = "max_size",
                .data           = &ip_rt_max_size,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec,
        },
        {
                /* Deprecated. Use gc_min_interval_ms */
                .procname       = "gc_min_interval",
                .data           = &ip_rt_gc_min_interval,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec_jiffies,
        },
        {
                .procname       = "gc_min_interval_ms",
                .data           = &ip_rt_gc_min_interval,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec_ms_jiffies,
        },
        {
                .procname       = "gc_timeout",
                .data           = &ip_rt_gc_timeout,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec_jiffies,
        },
        {
                .procname       = "gc_interval",
                .data           = &ip_rt_gc_interval,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec_jiffies,
        },
        {
                .procname       = "redirect_load",
                .data           = &ip_rt_redirect_load,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec,
        },
        {
                .procname       = "redirect_number",
                .data           = &ip_rt_redirect_number,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec,
        },
        {
                .procname       = "redirect_silence",
                .data           = &ip_rt_redirect_silence,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec,
        },
        {
                .procname       = "error_cost",
                .data           = &ip_rt_error_cost,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec,
        },
        {
                .procname       = "error_burst",
                .data           = &ip_rt_error_burst,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec,
        },
        {
                .procname       = "gc_elasticity",
                .data           = &ip_rt_gc_elasticity,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec,
        },
        {
                .procname       = "mtu_expires",
                .data           = &ip_rt_mtu_expires,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec_jiffies,
        },
        {
                .procname       = "min_pmtu",
                .data           = &ip_rt_min_pmtu,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec,
        },
        {
                .procname       = "min_adv_mss",
                .data           = &ip_rt_min_advmss,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec,
        },
        { }
};
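
/*
 * These knobs appear under /proc/sys/net/ipv4/route/ once the table is
 * registered (see ip_static_sysctl_init() at the bottom of this file),
 * e.g. (illustrative):
 *
 *      cat /proc/sys/net/ipv4/route/gc_thresh
 */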

static struct ctl_table ipv4_route_flush_table[] = {
        {
                .procname       = "flush",
                .maxlen         = sizeof(int),
                .mode           = 0200,
                .proc_handler   = ipv4_sysctl_rtcache_flush,
        },
        { },
};

static __net_init int sysctl_route_net_init(struct net *net)
{
        struct ctl_table *tbl;

        tbl = ipv4_route_flush_table;
        if (!net_eq(net, &init_net)) {
                tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
                if (tbl == NULL)
                        goto err_dup;

                /* Don't export sysctls to unprivileged users */
                if (net->user_ns != &init_user_ns)
                        tbl[0].procname = NULL;
        }
        tbl[0].extra1 = net;

        net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
        if (net->ipv4.route_hdr == NULL)
                goto err_reg;
        return 0;

err_reg:
        if (tbl != ipv4_route_flush_table)
                kfree(tbl);
err_dup:
        return -ENOMEM;
}

static __net_exit void sysctl_route_net_exit(struct net *net)
{
        struct ctl_table *tbl;

        tbl = net->ipv4.route_hdr->ctl_table_arg;
        unregister_net_sysctl_table(net->ipv4.route_hdr);
        BUG_ON(tbl == ipv4_route_flush_table);
        kfree(tbl);
}

static __net_initdata struct pernet_operations sysctl_route_ops = {
        .init = sysctl_route_net_init,
        .exit = sysctl_route_net_exit,
};
#endif

static __net_init int rt_genid_init(struct net *net)
{
        atomic_set(&net->ipv4.rt_genid, 0);
        atomic_set(&net->fnhe_genid, 0);
        get_random_bytes(&net->ipv4.dev_addr_genid,
                         sizeof(net->ipv4.dev_addr_genid));
        return 0;
}

static __net_initdata struct pernet_operations rt_genid_ops = {
        .init = rt_genid_init,
};

static int __net_init ipv4_inetpeer_init(struct net *net)
{
        struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);

        if (!bp)
                return -ENOMEM;
        inet_peer_base_init(bp);
        net->ipv4.peers = bp;
        return 0;
}

static void __net_exit ipv4_inetpeer_exit(struct net *net)
{
        struct inet_peer_base *bp = net->ipv4.peers;

        net->ipv4.peers = NULL;
        inetpeer_invalidate_tree(bp);
        kfree(bp);
}

static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
        .init   =       ipv4_inetpeer_init,
        .exit   =       ipv4_inetpeer_exit,
};

#ifdef CONFIG_IP_ROUTE_CLASSID
struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
#endif /* CONFIG_IP_ROUTE_CLASSID */

int __init ip_rt_init(void)
{
        int rc = 0;

#ifdef CONFIG_IP_ROUTE_CLASSID
        ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
        if (!ip_rt_acct)
                panic("IP: failed to allocate ip_rt_acct\n");
#endif

        ipv4_dst_ops.kmem_cachep =
                kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
                                  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);

        ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;

        if (dst_entries_init(&ipv4_dst_ops) < 0)
                panic("IP: failed to allocate ipv4_dst_ops counter\n");

        if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
                panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");

        ipv4_dst_ops.gc_thresh = ~0;
        ip_rt_max_size = INT_MAX;

        devinet_init();
        ip_fib_init();

        if (ip_rt_proc_init())
                pr_err("Unable to create route proc files\n");
#ifdef CONFIG_XFRM
        xfrm_init();
        xfrm4_init();
#endif
        rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);

#ifdef CONFIG_SYSCTL
        register_pernet_subsys(&sysctl_route_ops);
#endif
        register_pernet_subsys(&rt_genid_ops);
        register_pernet_subsys(&ipv4_inetpeer_ops);
        return rc;
}

#ifdef CONFIG_SYSCTL
/*
 * We really need to sanitize the damn ipv4 init order, then all
 * this nonsense will go away.
 */
void __init ip_static_sysctl_init(void)
{
        register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
}
#endif