net/ipv4/route.c

   1 /*
   2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3  *              operating system.  INET is implemented using the  BSD Socket
   4  *              interface as the means of communication with the user level.
   5  *
   6  *              ROUTE - implementation of the IP router.
   7  *
   8  * Authors:     Ross Biro
   9  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  10  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
  11  *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
  12  *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
  13  *
  14  * Fixes:
  15  *              Alan Cox        :       Verify area fixes.
  16  *              Alan Cox        :       cli() protects routing changes
  17  *              Rui Oliveira    :       ICMP routing table updates
  18  *              (rco@di.uminho.pt)      Routing table insertion and update
  19  *              Linus Torvalds  :       Rewrote bits to be sensible
  20  *              Alan Cox        :       Added BSD route gw semantics
  21  *              Alan Cox        :       Super /proc >4K
  22  *              Alan Cox        :       MTU in route table
  23  *              Alan Cox        :       MSS actually. Also added the window
  24  *                                      clamper.
  25  *              Sam Lantinga    :       Fixed route matching in rt_del()
  26  *              Alan Cox        :       Routing cache support.
  27  *              Alan Cox        :       Removed compatibility cruft.
  28  *              Alan Cox        :       RTF_REJECT support.
  29  *              Alan Cox        :       TCP irtt support.
  30  *              Jonathan Naylor :       Added Metric support.
  31  *      Miquel van Smoorenburg  :       BSD API fixes.
  32  *      Miquel van Smoorenburg  :       Metrics.
  33  *              Alan Cox        :       Use __u32 properly
  34  *              Alan Cox        :       Aligned routing errors more closely with BSD
  35  *                                      our system is still very different.
  36  *              Alan Cox        :       Faster /proc handling
  37  *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
  38  *                                      routing caches and better behaviour.
  39  *
  40  *              Olaf Erb        :       irtt wasn't being copied right.
  41  *              Bjorn Ekwall    :       Kerneld route support.
  42  *              Alan Cox        :       Multicast fixed (I hope)
  43  *              Pavel Krauz     :       Limited broadcast fixed
  44  *              Mike McLagan    :       Routing by source
  45  *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
  46  *                                      route.c and rewritten from scratch.
  47  *              Andi Kleen      :       Load-limit warning messages.
  48  *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
  49  *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
  50  *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
  51  *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
  52  *              Marc Boucher    :       routing by fwmark
  53  *      Robert Olsson           :       Added rt_cache statistics
  54  *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
  55  *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
  56  *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
  57  *      Ilia Sotnikov           :       Removed TOS from hash calculations
  58  *
  59  *              This program is free software; you can redistribute it and/or
  60  *              modify it under the terms of the GNU General Public License
  61  *              as published by the Free Software Foundation; either version
  62  *              2 of the License, or (at your option) any later version.
  63  */
  64
  65 #define pr_fmt(fmt) "IPv4: " fmt
  66
  67 #include <linux/module.h>
  68 #include <asm/uaccess.h>
  69 #include <linux/bitops.h>
  70 #include <linux/types.h>
  71 #include <linux/kernel.h>
  72 #include <linux/mm.h>
  73 #include <linux/string.h>
  74 #include <linux/socket.h>
  75 #include <linux/sockios.h>
  76 #include <linux/errno.h>
  77 #include <linux/in.h>
  78 #include <linux/inet.h>
  79 #include <linux/netdevice.h>
  80 #include <linux/proc_fs.h>
  81 #include <linux/init.h>
  82 #include <linux/skbuff.h>
  83 #include <linux/inetdevice.h>
  84 #include <linux/igmp.h>
  85 #include <linux/pkt_sched.h>
  86 #include <linux/mroute.h>
  87 #include <linux/netfilter_ipv4.h>
  88 #include <linux/random.h>
  89 #include <linux/rcupdate.h>
  90 #include <linux/times.h>
  91 #include <linux/slab.h>
  92 #include <linux/jhash.h>
  93 #include <net/dst.h>
  94 #include <net/dst_metadata.h>
  95 #include <net/net_namespace.h>
  96 #include <net/protocol.h>
  97 #include <net/ip.h>
  98 #include <net/route.h>
  99 #include <net/inetpeer.h>
 100 #include <net/sock.h>
 101 #include <net/ip_fib.h>
 102 #include <net/arp.h>
 103 #include <net/tcp.h>
 104 #include <net/icmp.h>
 105 #include <net/xfrm.h>
 106 #include <net/lwtunnel.h>
 107 #include <net/netevent.h>
 108 #include <net/rtnetlink.h>
 109 #ifdef CONFIG_SYSCTL
 110 #include <linux/sysctl.h>
 111 #include <linux/kmemleak.h>
 112 #endif
 113 #include <net/secure_seq.h>
 114 #include <net/ip_tunnels.h>
 115 #include <net/l3mdev.h>
 116
/*
 * Mask a flow key's flowi4_tos down to the bits selected by IPTOS_RT_MASK
 * plus the RTO_ONLINK flag.
 */
#define RT_FL_TOS(oldflp4) \
        ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))

/* Default value of ip_rt_gc_timeout below: 300 * HZ jiffies. */
#define RT_GC_TIMEOUT (300*HZ)

/*
 * Routing tunables.  Defaults written in terms of HZ are in jiffies.
 * NOTE(review): most of these are presumably exported through sysctl
 * later in the file -- confirm there.
 */
static int ip_rt_max_size;
/* Redirects sent with exponential backoff before giving up (see ip_rt_send_redirect). */
static int ip_rt_redirect_number __read_mostly  = 9;
static int ip_rt_redirect_load __read_mostly    = HZ / 50;
/* Default equals the ip_rt_redirect_load default shifted left by (ip_rt_redirect_number default + 1). */
static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
static int ip_rt_error_cost __read_mostly       = HZ;
static int ip_rt_error_burst __read_mostly      = 5 * HZ;
static int ip_rt_mtu_expires __read_mostly      = 10 * 60 * HZ;
/* presumably 512 bytes of payload plus 20-byte IP and TCP headers -- TODO confirm */
static u32 ip_rt_min_pmtu __read_mostly         = 512 + 20 + 20;
static int ip_rt_min_advmss __read_mostly       = 256;

/* Added to jiffies as the expiry of redirect exceptions in __ip_do_redirect(). */
static int ip_rt_gc_timeout __read_mostly       = RT_GC_TIMEOUT;
 133
 134 /*
 135  *      Interface to generic destination cache.
 136  */
 137
/*
 * Forward declarations of the callbacks that are wired into ipv4_dst_ops
 * below; their definitions appear later in this file.
 */
static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int      ipv4_default_advmss(const struct dst_entry *dst);
static unsigned int      ipv4_mtu(const struct dst_entry *dst);
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
static void              ipv4_link_failure(struct sk_buff *skb);
static void              ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
                                           struct sk_buff *skb, u32 mtu);
static void              ip_do_redirect(struct dst_entry *dst, struct sock *sk,
                                        struct sk_buff *skb);
static void             ipv4_dst_destroy(struct dst_entry *dst);
 148
/*
 * ->cow_metrics callback installed in ipv4_dst_ops.
 *
 * This implementation never provides a writable metrics array: it emits a
 * kernel warning and returns NULL whenever it is called.
 */
static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
{
        WARN_ON(1);
        return NULL;
}
 154
/* Defined further down; declared here so ipv4_dst_ops can reference it. */
static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
                                           struct sk_buff *skb,
                                           const void *daddr);

/*
 * Callback table binding the IPv4 routing code to the generic destination
 * cache.  Every handler except __ip_local_out is a static function of this
 * file.
 */
static struct dst_ops ipv4_dst_ops = {
        .family =               AF_INET,
        .check =                ipv4_dst_check,
        .default_advmss =       ipv4_default_advmss,
        .mtu =                  ipv4_mtu,
        .cow_metrics =          ipv4_cow_metrics,
        .destroy =              ipv4_dst_destroy,
        .negative_advice =      ipv4_negative_advice,
        .link_failure =         ipv4_link_failure,
        .update_pmtu =          ip_rt_update_pmtu,
        .redirect =             ip_do_redirect,
        .local_out =            __ip_local_out,
        .neigh_lookup =         ipv4_neigh_lookup,
};
 173
/* Expands to TC_PRIO_<class>; used for the odd-numbered table slots below. */
#define ECN_OR_COST(class)      TC_PRIO_##class

/*
 * 16-entry table of TC_PRIO_* values.  Because ECN_OR_COST(x) is simply
 * TC_PRIO_x, each odd entry carries the same priority as the even entry
 * just before it.
 * NOTE(review): presumably indexed by TOS bits of the IP header to obtain
 * a packet priority -- confirm against the users of this exported table.
 */
const __u8 ip_tos2prio[16] = {
        TC_PRIO_BESTEFFORT,
        ECN_OR_COST(BESTEFFORT),
        TC_PRIO_BESTEFFORT,
        ECN_OR_COST(BESTEFFORT),
        TC_PRIO_BULK,
        ECN_OR_COST(BULK),
        TC_PRIO_BULK,
        ECN_OR_COST(BULK),
        TC_PRIO_INTERACTIVE,
        ECN_OR_COST(INTERACTIVE),
        TC_PRIO_INTERACTIVE,
        ECN_OR_COST(INTERACTIVE),
        TC_PRIO_INTERACTIVE_BULK,
        ECN_OR_COST(INTERACTIVE_BULK),
        TC_PRIO_INTERACTIVE_BULK,
        ECN_OR_COST(INTERACTIVE_BULK)
};
EXPORT_SYMBOL(ip_tos2prio);
 195
/* Per-CPU routing statistics; reported one CPU per line by rt_cpu_seq_show(). */
static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
/* Bump one counter of the current CPU's rt_cache_stat using raw_cpu_inc(). */
#define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field)
 198
 199 #ifdef CONFIG_PROC_FS
 200 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
 201 {
 202         if (*pos)
 203                 return NULL;
 204         return SEQ_START_TOKEN;
 205 }
 206
 207 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 208 {
 209         ++*pos;
 210         return NULL;
 211 }
 212
/* seq_file ->stop for the "rt_cache" entry: intentionally does nothing. */
static void rt_cache_seq_stop(struct seq_file *seq, void *v)
{
}
 216
 217 static int rt_cache_seq_show(struct seq_file *seq, void *v)
 218 {
 219         if (v == SEQ_START_TOKEN)
 220                 seq_printf(seq, "%-127s\n",
 221                            "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
 222                            "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
 223                            "HHUptod\tSpecDst");
 224         return 0;
 225 }
 226
/* Iterator callbacks for the header-only "rt_cache" seq_file. */
static const struct seq_operations rt_cache_seq_ops = {
        .start  = rt_cache_seq_start,
        .next   = rt_cache_seq_next,
        .stop   = rt_cache_seq_stop,
        .show   = rt_cache_seq_show,
};
 233
/*
 * ->open handler of rt_cache_seq_fops: attach the rt_cache_seq_ops
 * iterator to @file.  Returns whatever seq_open() returns.
 */
static int rt_cache_seq_open(struct inode *inode, struct file *file)
{
        return seq_open(file, &rt_cache_seq_ops);
}
 238
/*
 * File operations registered for "rt_cache" under net->proc_net by
 * ip_rt_do_proc_init(); everything but ->open is a generic seq_file helper.
 */
static const struct file_operations rt_cache_seq_fops = {
        .owner   = THIS_MODULE,
        .open    = rt_cache_seq_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = seq_release,
};
 246
 247
 248 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
 249 {
 250         int cpu;
 251
 252         if (*pos == 0)
 253                 return SEQ_START_TOKEN;
 254
 255         for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
 256                 if (!cpu_possible(cpu))
 257                         continue;
 258                 *pos = cpu+1;
 259                 return &per_cpu(rt_cache_stat, cpu);
 260         }
 261         return NULL;
 262 }
 263
 264 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 265 {
 266         int cpu;
 267
 268         for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
 269                 if (!cpu_possible(cpu))
 270                         continue;
 271                 *pos = cpu+1;
 272                 return &per_cpu(rt_cache_stat, cpu);
 273         }
 274         return NULL;
 275
 276 }
 277
/* seq_file ->stop for the per-CPU statistics file: intentionally does nothing. */
static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{

}
 282
 283 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
 284 {
 285         struct rt_cache_stat *st = v;
 286
 287         if (v == SEQ_START_TOKEN) {
 288                 seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
 289                 return 0;
 290         }
 291
 292         seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
 293                    " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
 294                    dst_entries_get_slow(&ipv4_dst_ops),
 295                    0, /* st->in_hit */
 296                    st->in_slow_tot,
 297                    st->in_slow_mc,
 298                    st->in_no_route,
 299                    st->in_brd,
 300                    st->in_martian_dst,
 301                    st->in_martian_src,
 302
 303                    0, /* st->out_hit */
 304                    st->out_slow_tot,
 305                    st->out_slow_mc,
 306
 307                    0, /* st->gc_total */
 308                    0, /* st->gc_ignored */
 309                    0, /* st->gc_goal_miss */
 310                    0, /* st->gc_dst_overflow */
 311                    0, /* st->in_hlist_search */
 312                    0  /* st->out_hlist_search */
 313                 );
 314         return 0;
 315 }
 316
/* Iterator callbacks for the per-CPU statistics seq_file. */
static const struct seq_operations rt_cpu_seq_ops = {
        .start  = rt_cpu_seq_start,
        .next   = rt_cpu_seq_next,
        .stop   = rt_cpu_seq_stop,
        .show   = rt_cpu_seq_show,
};
 323
 324
/*
 * ->open handler of rt_cpu_seq_fops: attach the rt_cpu_seq_ops iterator
 * to @file.  Returns whatever seq_open() returns.
 */
static int rt_cpu_seq_open(struct inode *inode, struct file *file)
{
        return seq_open(file, &rt_cpu_seq_ops);
}
 329
/*
 * File operations registered for "rt_cache" under net->proc_net_stat by
 * ip_rt_do_proc_init(); everything but ->open is a generic seq_file helper.
 */
static const struct file_operations rt_cpu_seq_fops = {
        .owner   = THIS_MODULE,
        .open    = rt_cpu_seq_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = seq_release,
};
 337
 338 #ifdef CONFIG_IP_ROUTE_CLASSID
 339 static int rt_acct_proc_show(struct seq_file *m, void *v)
 340 {
 341         struct ip_rt_acct *dst, *src;
 342         unsigned int i, j;
 343
 344         dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
 345         if (!dst)
 346                 return -ENOMEM;
 347
 348         for_each_possible_cpu(i) {
 349                 src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
 350                 for (j = 0; j < 256; j++) {
 351                         dst[j].o_bytes   += src[j].o_bytes;
 352                         dst[j].o_packets += src[j].o_packets;
 353                         dst[j].i_bytes   += src[j].i_bytes;
 354                         dst[j].i_packets += src[j].i_packets;
 355                 }
 356         }
 357
 358         seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
 359         kfree(dst);
 360         return 0;
 361 }
 362
/*
 * ->open handler of rt_acct_proc_fops: bind rt_acct_proc_show() to @file
 * through single_open(), passing no private data.
 */
static int rt_acct_proc_open(struct inode *inode, struct file *file)
{
        return single_open(file, rt_acct_proc_show, NULL);
}
 367
/*
 * File operations registered for "rt_acct" under net->proc_net by
 * ip_rt_do_proc_init() when CONFIG_IP_ROUTE_CLASSID is set.
 */
static const struct file_operations rt_acct_proc_fops = {
        .owner          = THIS_MODULE,
        .open           = rt_acct_proc_open,
        .read           = seq_read,
        .llseek         = seq_lseek,
        .release        = single_release,
};
 375 #endif
 376
 377 static int __net_init ip_rt_do_proc_init(struct net *net)
 378 {
 379         struct proc_dir_entry *pde;
 380
 381         pde = proc_create("rt_cache", S_IRUGO, net->proc_net,
 382                           &rt_cache_seq_fops);
 383         if (!pde)
 384                 goto err1;
 385
 386         pde = proc_create("rt_cache", S_IRUGO,
 387                           net->proc_net_stat, &rt_cpu_seq_fops);
 388         if (!pde)
 389                 goto err2;
 390
 391 #ifdef CONFIG_IP_ROUTE_CLASSID
 392         pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
 393         if (!pde)
 394                 goto err3;
 395 #endif
 396         return 0;
 397
 398 #ifdef CONFIG_IP_ROUTE_CLASSID
 399 err3:
 400         remove_proc_entry("rt_cache", net->proc_net_stat);
 401 #endif
 402 err2:
 403         remove_proc_entry("rt_cache", net->proc_net);
 404 err1:
 405         return -ENOMEM;
 406 }
 407
/*
 * Per-namespace exit: remove every proc entry that ip_rt_do_proc_init()
 * creates for @net.
 */
static void __net_exit ip_rt_do_proc_exit(struct net *net)
{
        remove_proc_entry("rt_cache", net->proc_net_stat);
        remove_proc_entry("rt_cache", net->proc_net);
#ifdef CONFIG_IP_ROUTE_CLASSID
        remove_proc_entry("rt_acct", net->proc_net);
#endif
}
 416
/* Per-network-namespace hooks that create and remove the routing proc files. */
static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
        .init = ip_rt_do_proc_init,
        .exit = ip_rt_do_proc_exit,
};
 421
/*
 * Register ip_rt_proc_ops as a pernet subsystem.  Returns the result of
 * register_pernet_subsys().
 */
static int __init ip_rt_proc_init(void)
{
        return register_pernet_subsys(&ip_rt_proc_ops);
}
 426
 427 #else
/* Variant used when CONFIG_PROC_FS is disabled: nothing to register, report success. */
static inline int ip_rt_proc_init(void)
{
        return 0;
}
 432 #endif /* CONFIG_PROC_FS */
 433
 434 static inline bool rt_is_expired(const struct rtable *rth)
 435 {
 436         return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev));
 437 }
 438
/*
 * Invalidate the cached IPv4 routes of @net by bumping the namespace's
 * IPv4 generation id; routes carrying the old id are then reported as
 * expired by rt_is_expired().
 */
void rt_cache_flush(struct net *net)
{
        rt_genid_bump_ipv4(net);
}
 443
 444 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
 445                                            struct sk_buff *skb,
 446                                            const void *daddr)
 447 {
 448         struct net_device *dev = dst->dev;
 449         const __be32 *pkey = daddr;
 450         const struct rtable *rt;
 451         struct neighbour *n;
 452
 453         rt = (const struct rtable *) dst;
 454         if (rt->rt_gateway)
 455                 pkey = (const __be32 *) &rt->rt_gateway;
 456         else if (skb)
 457                 pkey = &ip_hdr(skb)->daddr;
 458
 459         n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
 460         if (n)
 461                 return n;
 462         return neigh_create(&arp_tbl, pkey, dev);
 463 }
 464
 465 #define IP_IDENTS_SZ 2048u
 466
 467 static atomic_t *ip_idents __read_mostly;
 468 static u32 *ip_tstamps __read_mostly;
 469
 470 /* In order to protect privacy, we add a perturbation to identifiers
 471  * if one generator is seldom used. This makes hard for an attacker
 472  * to infer how many packets were sent between two points in time.
 473  */
 474 u32 ip_idents_reserve(u32 hash, int segs)
 475 {
 476         u32 *p_tstamp = ip_tstamps + hash % IP_IDENTS_SZ;
 477         atomic_t *p_id = ip_idents + hash % IP_IDENTS_SZ;
 478         u32 old = ACCESS_ONCE(*p_tstamp);
 479         u32 now = (u32)jiffies;
 480         u32 delta = 0;
 481
 482         if (old != now && cmpxchg(p_tstamp, old, now) == old)
 483                 delta = prandom_u32_max(now - old);
 484
 485         /* If UBSAN reports an error there, please make sure your compiler
 486          * supports -fno-strict-overflow before reporting it that was a bug
 487          * in UBSAN, and it has been fixed in GCC-8.
 488          */
 489         return atomic_add_return(segs + delta, p_id) - segs;
 490 }
 491 EXPORT_SYMBOL(ip_idents_reserve);
 492
 493 void __ip_select_ident(struct net *net, struct iphdr *iph, int segs)
 494 {
 495         u32 hash, id;
 496
 497         /* Note the following code is not safe, but this is okay. */
 498         if (unlikely(siphash_key_is_zero(&net->ipv4.ip_id_key)))
 499                 get_random_bytes(&net->ipv4.ip_id_key,
 500                                  sizeof(net->ipv4.ip_id_key));
 501
 502         hash = siphash_3u32((__force u32)iph->daddr,
 503                             (__force u32)iph->saddr,
 504                             iph->protocol,
 505                             &net->ipv4.ip_id_key);
 506         id = ip_idents_reserve(hash, segs);
 507         iph->id = htons(id);
 508 }
 509 EXPORT_SYMBOL(__ip_select_ident);
 510
 511 static void __build_flow_key(struct flowi4 *fl4, const struct sock *sk,
 512                              const struct iphdr *iph,
 513                              int oif, u8 tos,
 514                              u8 prot, u32 mark, int flow_flags)
 515 {
 516         if (sk) {
 517                 const struct inet_sock *inet = inet_sk(sk);
 518
 519                 oif = sk->sk_bound_dev_if;
 520                 mark = sk->sk_mark;
 521                 tos = RT_CONN_FLAGS(sk);
 522                 prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
 523         }
 524         flowi4_init_output(fl4, oif, mark, tos,
 525                            RT_SCOPE_UNIVERSE, prot,
 526                            flow_flags,
 527                            iph->daddr, iph->saddr, 0, 0);
 528 }
 529
 530 static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
 531                                const struct sock *sk)
 532 {
 533         const struct iphdr *iph = ip_hdr(skb);
 534         int oif = skb->dev->ifindex;
 535         u8 tos = RT_TOS(iph->tos);
 536         u8 prot = iph->protocol;
 537         u32 mark = skb->mark;
 538
 539         __build_flow_key(fl4, sk, iph, oif, tos, prot, mark, 0);
 540 }
 541
 542 static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
 543 {
 544         const struct inet_sock *inet = inet_sk(sk);
 545         const struct ip_options_rcu *inet_opt;
 546         __be32 daddr = inet->inet_daddr;
 547
 548         rcu_read_lock();
 549         inet_opt = rcu_dereference(inet->inet_opt);
 550         if (inet_opt && inet_opt->opt.srr)
 551                 daddr = inet_opt->opt.faddr;
 552         flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
 553                            RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
 554                            inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
 555                            inet_sk_flowi_flags(sk),
 556                            daddr, inet->inet_saddr, 0, 0);
 557         rcu_read_unlock();
 558 }
 559
 560 static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
 561                                  const struct sk_buff *skb)
 562 {
 563         if (skb)
 564                 build_skb_flow_key(fl4, skb, sk);
 565         else
 566                 build_sk_flow_key(fl4, sk);
 567 }
 568
/*
 * Queue @rt for release through call_rcu(), with dst_rcu_free as the
 * callback, instead of freeing it immediately.
 */
static inline void rt_free(struct rtable *rt)
{
        call_rcu(&rt->dst.rcu_head, dst_rcu_free);
}
 573
/* Taken (with BHs disabled) by update_or_create_fnhe() while it modifies next-hop exceptions. */
static DEFINE_SPINLOCK(fnhe_lock);
 575
 576 static void fnhe_flush_routes(struct fib_nh_exception *fnhe)
 577 {
 578         struct rtable *rt;
 579
 580         rt = rcu_dereference(fnhe->fnhe_rth_input);
 581         if (rt) {
 582                 RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL);
 583                 rt_free(rt);
 584         }
 585         rt = rcu_dereference(fnhe->fnhe_rth_output);
 586         if (rt) {
 587                 RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL);
 588                 rt_free(rt);
 589         }
 590 }
 591
 592 static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
 593 {
 594         struct fib_nh_exception *fnhe, *oldest;
 595
 596         oldest = rcu_dereference(hash->chain);
 597         for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
 598              fnhe = rcu_dereference(fnhe->fnhe_next)) {
 599                 if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
 600                         oldest = fnhe;
 601         }
 602         fnhe_flush_routes(oldest);
 603         return oldest;
 604 }
 605
 606 static inline u32 fnhe_hashfun(__be32 daddr)
 607 {
 608         static u32 fnhe_hashrnd __read_mostly;
 609         u32 hval;
 610
 611         net_get_random_once(&fnhe_hashrnd, sizeof(fnhe_hashrnd));
 612         hval = jhash_1word((__force u32) daddr, fnhe_hashrnd);
 613         return hash_32(hval, FNHE_HASH_SHIFT);
 614 }
 615
 616 static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe)
 617 {
 618         rt->rt_pmtu = fnhe->fnhe_pmtu;
 619         rt->rt_mtu_locked = fnhe->fnhe_mtu_locked;
 620         rt->dst.expires = fnhe->fnhe_expires;
 621
 622         if (fnhe->fnhe_gw) {
 623                 rt->rt_flags |= RTCF_REDIRECTED;
 624                 rt->rt_gateway = fnhe->fnhe_gw;
 625                 rt->rt_uses_gateway = 1;
 626         }
 627 }
 628
 629 static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
 630                                   u32 pmtu, bool lock, unsigned long expires)
 631 {
 632         struct fnhe_hash_bucket *hash;
 633         struct fib_nh_exception *fnhe;
 634         struct rtable *rt;
 635         u32 genid, hval;
 636         unsigned int i;
 637         int depth;
 638
 639         genid = fnhe_genid(dev_net(nh->nh_dev));
 640         hval = fnhe_hashfun(daddr);
 641
 642         spin_lock_bh(&fnhe_lock);
 643
 644         hash = rcu_dereference(nh->nh_exceptions);
 645         if (!hash) {
 646                 hash = kzalloc(FNHE_HASH_SIZE * sizeof(*hash), GFP_ATOMIC);
 647                 if (!hash)
 648                         goto out_unlock;
 649                 rcu_assign_pointer(nh->nh_exceptions, hash);
 650         }
 651
 652         hash += hval;
 653
 654         depth = 0;
 655         for (fnhe = rcu_dereference(hash->chain); fnhe;
 656              fnhe = rcu_dereference(fnhe->fnhe_next)) {
 657                 if (fnhe->fnhe_daddr == daddr)
 658                         break;
 659                 depth++;
 660         }
 661
 662         if (fnhe) {
 663                 if (fnhe->fnhe_genid != genid)
 664                         fnhe->fnhe_genid = genid;
 665                 if (gw)
 666                         fnhe->fnhe_gw = gw;
 667                 if (pmtu) {
 668                         fnhe->fnhe_pmtu = pmtu;
 669                         fnhe->fnhe_mtu_locked = lock;
 670                 }
 671                 fnhe->fnhe_expires = max(1UL, expires);
 672                 /* Update all cached dsts too */
 673                 rt = rcu_dereference(fnhe->fnhe_rth_input);
 674                 if (rt)
 675                         fill_route_from_fnhe(rt, fnhe);
 676                 rt = rcu_dereference(fnhe->fnhe_rth_output);
 677                 if (rt)
 678                         fill_route_from_fnhe(rt, fnhe);
 679         } else {
 680                 if (depth > FNHE_RECLAIM_DEPTH)
 681                         fnhe = fnhe_oldest(hash);
 682                 else {
 683                         fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
 684                         if (!fnhe)
 685                                 goto out_unlock;
 686
 687                         fnhe->fnhe_next = hash->chain;
 688                         rcu_assign_pointer(hash->chain, fnhe);
 689                 }
 690                 fnhe->fnhe_genid = genid;
 691                 fnhe->fnhe_daddr = daddr;
 692                 fnhe->fnhe_gw = gw;
 693                 fnhe->fnhe_pmtu = pmtu;
 694                 fnhe->fnhe_mtu_locked = lock;
 695                 fnhe->fnhe_expires = expires;
 696
 697                 /* Exception created; mark the cached routes for the nexthop
 698                  * stale, so anyone caching it rechecks if this exception
 699                  * applies to them.
 700                  */
 701                 rt = rcu_dereference(nh->nh_rth_input);
 702                 if (rt)
 703                         rt->dst.obsolete = DST_OBSOLETE_KILL;
 704
 705                 for_each_possible_cpu(i) {
 706                         struct rtable __rcu **prt;
 707                         prt = per_cpu_ptr(nh->nh_pcpu_rth_output, i);
 708                         rt = rcu_dereference(*prt);
 709                         if (rt)
 710                                 rt->dst.obsolete = DST_OBSOLETE_KILL;
 711                 }
 712         }
 713
 714         fnhe->fnhe_stamp = jiffies;
 715
 716 out_unlock:
 717         spin_unlock_bh(&fnhe_lock);
 718 }
 719
 720 static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
 721                              bool kill_route)
 722 {
 723         __be32 new_gw = icmp_hdr(skb)->un.gateway;
 724         __be32 old_gw = ip_hdr(skb)->saddr;
 725         struct net_device *dev = skb->dev;
 726         struct in_device *in_dev;
 727         struct fib_result res;
 728         struct neighbour *n;
 729         struct net *net;
 730
 731         switch (icmp_hdr(skb)->code & 7) {
 732         case ICMP_REDIR_NET:
 733         case ICMP_REDIR_NETTOS:
 734         case ICMP_REDIR_HOST:
 735         case ICMP_REDIR_HOSTTOS:
 736                 break;
 737
 738         default:
 739                 return;
 740         }
 741
 742         if (rt->rt_gateway != old_gw)
 743                 return;
 744
 745         in_dev = __in_dev_get_rcu(dev);
 746         if (!in_dev)
 747                 return;
 748
 749         net = dev_net(dev);
 750         if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
 751             ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
 752             ipv4_is_zeronet(new_gw))
 753                 goto reject_redirect;
 754
 755         if (!IN_DEV_SHARED_MEDIA(in_dev)) {
 756                 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
 757                         goto reject_redirect;
 758                 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
 759                         goto reject_redirect;
 760         } else {
 761                 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
 762                         goto reject_redirect;
 763         }
 764
 765         n = __ipv4_neigh_lookup(rt->dst.dev, new_gw);
 766         if (!n)
 767                 n = neigh_create(&arp_tbl, &new_gw, rt->dst.dev);
 768         if (!IS_ERR(n)) {
 769                 if (!(n->nud_state & NUD_VALID)) {
 770                         neigh_event_send(n, NULL);
 771                 } else {
 772                         if (fib_lookup(net, fl4, &res, 0) == 0) {
 773                                 struct fib_nh *nh = &FIB_RES_NH(res);
 774
 775                                 update_or_create_fnhe(nh, fl4->daddr, new_gw,
 776                                                 0, false,
 777                                                 jiffies + ip_rt_gc_timeout);
 778                         }
 779                         if (kill_route)
 780                                 rt->dst.obsolete = DST_OBSOLETE_KILL;
 781                         call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
 782                 }
 783                 neigh_release(n);
 784         }
 785         return;
 786
 787 reject_redirect:
 788 #ifdef CONFIG_IP_ROUTE_VERBOSE
 789         if (IN_DEV_LOG_MARTIANS(in_dev)) {
 790                 const struct iphdr *iph = (const struct iphdr *) skb->data;
 791                 __be32 daddr = iph->daddr;
 792                 __be32 saddr = iph->saddr;
 793
 794                 net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
 795                                      "  Advised path = %pI4 -> %pI4\n",
 796                                      &old_gw, dev->name, &new_gw,
 797                                      &saddr, &daddr);
 798         }
 799 #endif
 800         ;
 801 }
 802
 803 static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
 804 {
 805         struct rtable *rt;
 806         struct flowi4 fl4;
 807         const struct iphdr *iph = (const struct iphdr *) skb->data;
 808         int oif = skb->dev->ifindex;
 809         u8 tos = RT_TOS(iph->tos);
 810         u8 prot = iph->protocol;
 811         u32 mark = skb->mark;
 812
 813         rt = (struct rtable *) dst;
 814
 815         __build_flow_key(&fl4, sk, iph, oif, tos, prot, mark, 0);
 816         __ip_do_redirect(rt, skb, &fl4, true);
 817 }
 818
 /*
  * Decide whether the caller should keep using dst.
  *
  * Returns dst unchanged when it is NULL or still considered usable.
  * Otherwise calls ip_rt_put() on the route and returns NULL.
  */
 819 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
 820 {
 821         struct rtable *rt = (struct rtable *)dst;
 822
 823         if (!rt)
 824                 return dst;
 825
 826         /* Keep a route that is not obsolete (> 0), does not have
 827          * RTCF_REDIRECTED set and carries no expiry time.
 828          */
 829         if (dst->obsolete <= 0 && !(rt->rt_flags & RTCF_REDIRECTED) &&
 830             !rt->dst.expires)
 831                 return dst;
 832
 833         ip_rt_put(rt);
 834         return NULL;
 835 }
 836
 837 /*
 838  * Algorithm:
 839  *      1. The first ip_rt_redirect_number redirects are sent
 840  *         with exponential backoff, then we stop sending them at all,
 841  *         assuming that the host ignores our redirects.
 842  *      2. If we did not see packets requiring redirects
 843  *         during ip_rt_redirect_silence, we assume that the host
 844  *         forgot the redirected route and start sending redirects again.
 845  *
 846  * This algorithm is much cheaper and more intelligent than dumb load limiting
 847  * in icmp.c.
 848  *
 849  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
 850  * and "frag. need" (breaks PMTU discovery) in icmp.c.
 851  */
 852
 /*
  * Send an ICMP host redirect in response to skb, throttled per sender
  * through the inet_peer entry looked up for the packet's source address.
  *
  * Nothing is sent when the route's device has no in_device or has
  * IN_DEV_TX_REDIRECTS off.  When no inet_peer entry can be obtained the
  * redirect is sent without any throttling.
  */
 853 void ip_rt_send_redirect(struct sk_buff *skb)
 854 {
 855         struct rtable *rt = skb_rtable(skb);
 856         struct in_device *in_dev;
 857         struct inet_peer *peer;
 858         struct net *net;
 859         int log_martians;
 860         int vif;
 861
 862         rcu_read_lock();
 863         in_dev = __in_dev_get_rcu(rt->dst.dev);
 864         if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
 865                 rcu_read_unlock();
 866                 return;
 867         }
             /* in_dev is not used after the unlock below; copy out what is
              * needed from it while still inside the RCU section.
              */
 868         log_martians = IN_DEV_LOG_MARTIANS(in_dev);
 869         vif = l3mdev_master_ifindex_rcu(rt->dst.dev);
 870         rcu_read_unlock();
 871
 872         net = dev_net(rt->dst.dev);
 873         peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, vif, 1);
 874         if (!peer) {
                     /* No peer entry to keep state in: send unthrottled. */
 875                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
 876                           rt_nexthop(rt, ip_hdr(skb)->daddr));
 877                 return;
 878         }
 879
 880         /* No redirected packets during ip_rt_redirect_silence;
 881          * reset the algorithm.
 882          */
 883         if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence)) {
 884                 peer->rate_tokens = 0;
 885                 peer->n_redirects = 0;
 886         }
 887
 888         /* Too many ignored redirects; do not send anything and
 889          * set peer->rate_last to the time of the last seen redirected packet.
 890          */
 891         if (peer->n_redirects >= ip_rt_redirect_number) {
 892                 peer->rate_last = jiffies;
 893                 goto out_put_peer;
 894         }
 895
 896         /* Check for load limit; set rate_last to the latest sent
 897          * redirect.  The required gap since rate_last is
 898          * ip_rt_redirect_load << n_redirects, i.e. it doubles with every
              * redirect already sent; the very first one is sent at once.
              */
 899         if (peer->n_redirects == 0 ||
 900             time_after(jiffies,
 901                        (peer->rate_last +
 902                         (ip_rt_redirect_load << peer->n_redirects)))) {
 903                 __be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);
 904
 905                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
 906                 peer->rate_last = jiffies;
 907                 ++peer->n_redirects;
 908 #ifdef CONFIG_IP_ROUTE_VERBOSE
                     /* Warn once, when the redirect budget has just been used up. */
 909                 if (log_martians &&
 910                     peer->n_redirects == ip_rt_redirect_number)
 911                         net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
 912                                              &ip_hdr(skb)->saddr, inet_iif(skb),
 913                                              &ip_hdr(skb)->daddr, &gw);
 914 #endif
 915         }
 916 out_put_peer:
 917         inet_putpeer(peer);
 918 }
 919
 /*
  * Handle a packet whose route carries an error code in rt->dst.error:
  * update the MIB counters, possibly answer with an ICMP destination
  * unreachable, and free the skb.
  *
  * ICMP replies are throttled per source address with a token bucket kept
  * in the inet_peer entry; when no entry is available the reply is sent
  * unthrottled.  Always returns 0.
  */
 920 static int ip_error(struct sk_buff *skb)
 921 {
 922         struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
 923         struct rtable *rt = skb_rtable(skb);
 924         struct inet_peer *peer;
 925         unsigned long now;
 926         struct net *net;
 927         bool send;
 928         int code;
 929
 930         /* IP on this device is disabled. */
 931         if (!in_dev)
 932                 goto out;
 933
 934         net = dev_net(rt->dst.dev);
             /* Forwarding is off on the receiving device: only account the
              * error in the MIB, never send an ICMP error.
              */
 935         if (!IN_DEV_FORWARD(in_dev)) {
 936                 switch (rt->dst.error) {
 937                 case EHOSTUNREACH:
 938                         IP_INC_STATS_BH(net, IPSTATS_MIB_INADDRERRORS);
 939                         break;
 940
 941                 case ENETUNREACH:
 942                         IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
 943                         break;
 944                 }
 945                 goto out;
 946         }
 947
             /* Map the route error to an ICMP unreachable code; EINVAL and
              * any error not listed here drop the packet silently.
              */
 948         switch (rt->dst.error) {
 949         case EINVAL:
 950         default:
 951                 goto out;
 952         case EHOSTUNREACH:
 953                 code = ICMP_HOST_UNREACH;
 954                 break;
 955         case ENETUNREACH:
 956                 code = ICMP_NET_UNREACH;
 957                 IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
 958                 break;
 959         case EACCES:
 960                 code = ICMP_PKT_FILTERED;
 961                 break;
 962         }
 963
 964         peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr,
 965                                l3mdev_master_ifindex(skb->dev), 1);
 966
 967         send = true;
 968         if (peer) {
                     /* Token bucket: one token accrues per jiffy since
                      * rate_last, capped at ip_rt_error_burst; every ICMP
                      * error sent costs ip_rt_error_cost tokens.
                      */
 969                 now = jiffies;
 970                 peer->rate_tokens += now - peer->rate_last;
 971                 if (peer->rate_tokens > ip_rt_error_burst)
 972                         peer->rate_tokens = ip_rt_error_burst;
 973                 peer->rate_last = now;
 974                 if (peer->rate_tokens >= ip_rt_error_cost)
 975                         peer->rate_tokens -= ip_rt_error_cost;
 976                 else
 977                         send = false;
 978                 inet_putpeer(peer);
 979         }
 980         if (send)
 981                 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
 982
 983 out:    kfree_skb(skb);
 984         return 0;
 985 }
 986
 /*
  * Record a learned path MTU for fl4->daddr as a nexthop exception.
  *
  * Nothing is recorded when the MTU is locked on the route, when mtu is
  * larger than the route's current ipv4_mtu(), or when the same value is
  * already stored (unlocked) and more than half of ip_rt_mtu_expires is
  * still left before it expires.
  *
  * A value below ip_rt_min_pmtu is replaced by
  * min(old_mtu, ip_rt_min_pmtu) and stored with the lock flag set.
  */
 987 static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
 988 {
 989         struct dst_entry *dst = &rt->dst;
 990         u32 old_mtu = ipv4_mtu(dst);
 991         struct fib_result res;
 992         bool lock = false;
 993
 994         if (ip_mtu_locked(dst))
 995                 return;
 996
             /* Only ever lower the MTU from here. */
 997         if (old_mtu < mtu)
 998                 return;
 999
1000         if (mtu < ip_rt_min_pmtu) {
1001                 lock = true;
1002                 mtu = min(old_mtu, ip_rt_min_pmtu);
1003         }
1004
             /* Same value, not being locked, and still in the first half of
              * its lifetime: no need to refresh the exception.
              */
1005         if (rt->rt_pmtu == mtu && !lock &&
1006             time_before(jiffies, dst->expires - ip_rt_mtu_expires / 2))
1007                 return;
1008
1009         rcu_read_lock();
             /* The exception is attached to the nexthop that a fresh FIB
              * lookup for this flow returns; a failed lookup records nothing.
              */
1010         if (fib_lookup(dev_net(dst->dev), fl4, &res, 0) == 0) {
1011                 struct fib_nh *nh = &FIB_RES_NH(res);
1012
1013                 update_or_create_fnhe(nh, fl4->daddr, 0, mtu, lock,
1014                                       jiffies + ip_rt_mtu_expires);
1015         }
1016         rcu_read_unlock();
1017 }
1018
/*
 * PMTU update entry point that takes a dst: builds the flow key with
 * ip_rt_build_flow_key() and passes the route behind dst to
 * __ip_rt_update_pmtu().
 */
1019 static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
1020                               struct sk_buff *skb, u32 mtu)
1021 {
1022         struct flowi4 key;
1023
1024         /* The key is built from whatever sk and skb provide. */
1025         ip_rt_build_flow_key(&key, sk, skb);
1026         __ip_rt_update_pmtu((struct rtable *) dst, &key, mtu);
1027 }
1028
/*
 * Apply a path MTU update for the flow described by the IP header at
 * skb->data, without a socket.
 *
 * A zero mark is replaced by IP4_REPLY_MARK(net, skb->mark).  The route is
 * found with an output lookup; if that lookup fails nothing is updated.
 */
1029 void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
1030                       int oif, u32 mark, u8 protocol, int flow_flags)
1031 {
1032         const struct iphdr *hdr = (const struct iphdr *) skb->data;
1033         struct rtable *route;
1034         struct flowi4 key;
1035
1036         if (mark == 0)
1037                 mark = IP4_REPLY_MARK(net, skb->mark);
1038
1039         __build_flow_key(&key, NULL, hdr, oif,
1040                          RT_TOS(hdr->tos), protocol, mark, flow_flags);
1041         route = __ip_route_output_key(net, &key);
1042         if (IS_ERR(route))
1043                 return;
1044         __ip_rt_update_pmtu(route, &key, mtu);
1045         ip_rt_put(route);
1046 }
1047 EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
1048
/*
 * Apply a path MTU update through a fresh output lookup keyed on sk and
 * the IP header at skb->data; the route cached on the socket is not
 * touched.  If the key ends up with a zero mark, it is replaced by
 * IP4_REPLY_MARK() of the skb mark.  A failed lookup updates nothing.
 */
1049 static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1050 {
1051         const struct iphdr *hdr = (const struct iphdr *) skb->data;
1052         struct rtable *route;
1053         struct flowi4 key;
1054
1055         __build_flow_key(&key, sk, hdr, 0, 0, 0, 0, 0);
1056         if (key.flowi4_mark == 0)
1057                 key.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark);
1058
1059         route = __ip_route_output_key(sock_net(sk), &key);
1060         if (IS_ERR(route))
1061                 return;
1062
1063         __ip_rt_update_pmtu(route, &key, mtu);
1064         ip_rt_put(route);
1065 }
1066
/*
 * Apply a path MTU update on behalf of socket sk.
 *
 * Runs with bh_lock_sock() held.  Does nothing if ip_sk_accept_pmtu(sk)
 * is false.  When the socket is owned by user context or has no cached
 * dst, the update goes through a fresh lookup (__ipv4_sk_update_pmtu())
 * and the socket's dst is left alone.  Otherwise the update is applied to
 * the cached route and, if that route is found to be invalid before or
 * after the update, a newly looked-up route is installed on the socket.
 */
1067 void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1068 {
1069         const struct iphdr *iph = (const struct iphdr *) skb->data;
1070         struct flowi4 fl4;
1071         struct rtable *rt;
1072         struct dst_entry *odst = NULL;
1073         bool new = false;
1074
1075         bh_lock_sock(sk);
1076
1077         if (!ip_sk_accept_pmtu(sk))
1078                 goto out;
1079
             /* Whatever sk_dst_get() returns is handed to dst_release() at out:. */
1080         odst = sk_dst_get(sk);
1081
1082         if (sock_owned_by_user(sk) || !odst) {
1083                 __ipv4_sk_update_pmtu(skb, sk, mtu);
1084                 goto out;
1085         }
1086
1087         __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
1088
1089         rt = (struct rtable *)odst;
             /* Cached route no longer passes ->check(): look up a new one
              * and remember (new) that it must be installed on the socket.
              */
1090         if (odst->obsolete && !odst->ops->check(odst, 0)) {
1091                 rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1092                 if (IS_ERR(rt))
1093                         goto out;
1094
1095                 new = true;
1096         }
1097
1098         __ip_rt_update_pmtu((struct rtable *) rt->dst.path, &fl4, mtu);
1099
             /* The route may fail dst_check() after the update above; if so,
              * look it up once more, dropping the route obtained in the
              * previous lookup if there was one.
              */
1100         if (!dst_check(&rt->dst, 0)) {
1101                 if (new)
1102                         dst_release(&rt->dst);
1103
1104                 rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1105                 if (IS_ERR(rt))
1106                         goto out;
1107
1108                 new = true;
1109         }
1110
1111         if (new)
1112                 sk_dst_set(sk, &rt->dst);
1113
1114 out:
1115         bh_unlock_sock(sk);
1116         dst_release(odst);
1117 }
1118 EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
1119
/*
 * Process a redirect for the flow described by the IP header at
 * skb->data, without a socket.  The route is found with an output lookup
 * and handed to __ip_do_redirect() with its last argument false; a failed
 * lookup does nothing.
 */
1120 void ipv4_redirect(struct sk_buff *skb, struct net *net,
1121                    int oif, u32 mark, u8 protocol, int flow_flags)
1122 {
1123         const struct iphdr *hdr = (const struct iphdr *) skb->data;
1124         struct rtable *route;
1125         struct flowi4 key;
1126
1127         __build_flow_key(&key, NULL, hdr, oif,
1128                          RT_TOS(hdr->tos), protocol, mark, flow_flags);
1129         route = __ip_route_output_key(net, &key);
1130         if (IS_ERR(route))
1131                 return;
1132         __ip_do_redirect(route, skb, &key, false);
1133         ip_rt_put(route);
1134 }
1135 EXPORT_SYMBOL_GPL(ipv4_redirect);
1136
/*
 * Socket flavour of ipv4_redirect(): the flow key is built from sk and the
 * IP header at skb->data, and the lookup is done in sock_net(sk).
 */
1137 void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
1138 {
1139         const struct iphdr *hdr = (const struct iphdr *) skb->data;
1140         struct rtable *route;
1141         struct flowi4 key;
1142
1143         __build_flow_key(&key, sk, hdr, 0, 0, 0, 0, 0);
1144         route = __ip_route_output_key(sock_net(sk), &key);
1145         if (IS_ERR(route))
1146                 return;
1147         __ip_do_redirect(route, skb, &key, false);
1148         ip_rt_put(route);
1149 }
1150 EXPORT_SYMBOL_GPL(ipv4_sk_redirect);
1151
/*
 * Validate a cached IPv4 dst.  Returns dst while it may still be used and
 * NULL once it must be looked up again.  cookie is not used.
 */
1152 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1153 {
1154         struct rtable *rt = (struct rtable *) dst;
1155         bool usable;
1156
1157         /* All IPV4 dsts are created with ->obsolete set to the value
1158          * DST_OBSOLETE_FORCE_CHK, so validation always ends up here.
1159          *
1160          * Any other value (DST_OBSOLETE_KILL after a PMTU/redirect
1161          * update, or DST_OBSOLETE_DEAD set by dst_free()) means the
1162          * route must not be used any more; neither may an expired one.
1163          */
1164         usable = dst->obsolete == DST_OBSOLETE_FORCE_CHK &&
1165                  !rt_is_expired(rt);
1166         return usable ? dst : NULL;
1167 }
1168
/*
 * Send an ICMP_DEST_UNREACH/ICMP_HOST_UNREACH for skb after re-parsing
 * its IP options into a local struct ip_options.
 *
 * Returns silently when the IPv4 header cannot be pulled, has a version
 * other than 4 or an ihl below 5, or when the options fail to compile.
 *
 * The netns handed to __ip_options_compile() is taken from skb->dev when
 * that is set and from the attached route's device otherwise, so that a
 * NULL skb->dev is not dereferenced.
 */
1169 static void ipv4_send_dest_unreach(struct sk_buff *skb)
1170 {
1171         struct net_device *dev;
1172         struct ip_options opt;
1173         int res;
1174
1175         /* IPCB may be stale by now, so the options are recompiled below.
1176          * First make sure there is a reasonable ipv4 header to parse.
1177          */
1178         if (!pskb_network_may_pull(skb, sizeof(struct iphdr)) ||
1179             ip_hdr(skb)->version != 4 || ip_hdr(skb)->ihl < 5)
1180                 return;
1181
1182         memset(&opt, 0, sizeof(opt));
1183         if (ip_hdr(skb)->ihl > 5) {
1184                 if (!pskb_network_may_pull(skb, ip_hdr(skb)->ihl * 4))
1185                         return;
1186                 opt.optlen = ip_hdr(skb)->ihl * 4 - sizeof(struct iphdr);
1187                 rcu_read_lock();
1188                 dev = skb->dev ? skb->dev : skb_rtable(skb)->dst.dev;
1189                 res = __ip_options_compile(dev_net(dev), &opt, skb, NULL);
1190                 rcu_read_unlock();
1191                 if (res)
1192                         return;
1193         }
1194         __icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0, &opt);
1195 }
1196
/*
 * Link failure handling for skb: send a destination-unreachable via
 * ipv4_send_dest_unreach(), then, if the skb has a route attached, call
 * dst_set_expires() on it with a timeout of 0.
 */
1197 static void ipv4_link_failure(struct sk_buff *skb)
1198 {
1199         struct rtable *route;
1200
1201         ipv4_send_dest_unreach(skb);
1202         route = skb_rtable(skb);
1203         if (!route)
1204                 return;
1205         dst_set_expires(&route->dst, 0);
1206 }
1207
/*
 * Handler for a path that is not supposed to be reached (within this
 * section it is installed as dst.output of multicast input routes):
 * logs the addresses and device at debug level, frees the skb and
 * triggers WARN_ON(1).  Always returns 0; net and sk are not used.
 */
1208 static int ip_rt_bug(struct net *net, struct sock *sk, struct sk_buff *skb)
1209 {
1210         pr_debug("%s: %pI4 -> %pI4, %s\n",
1211                  __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1212                  skb->dev ? skb->dev->name : "?");
1213         kfree_skb(skb);
1214         WARN_ON(1);
1215         return 0;
1216 }
1217
1218 /*
1219    We do not cache the source address of the outgoing interface,
1220    because it is used only by the IP RR, TS and SRR options,
1221    so it is out of the fast path.
1222
1223    BTW remember: "addr" is allowed to be unaligned
1224    in IP options!

        ip_rt_get_source() therefore stores the 4-byte result with memcpy().
        For an output route the address is the packet's own saddr.  Otherwise
        a FIB lookup is done for the packet and the preferred source of the
        result is used; if the lookup fails, inet_select_addr() picks an
        address on the route's device for the nexthop.
1225  */
1226
1227 void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1228 {
1229         __be32 src;
1230
1231         if (rt_is_output_route(rt))
1232                 src = ip_hdr(skb)->saddr;
1233         else {
1234                 struct fib_result res;
1235                 struct flowi4 fl4;
1236                 struct iphdr *iph;
1237
1238                 iph = ip_hdr(skb);
1239
                     /* Flow key: packet addresses and TOS, out via the route's
                      * device, in via the device the skb arrived on.
                      */
1240                 memset(&fl4, 0, sizeof(fl4));
1241                 fl4.daddr = iph->daddr;
1242                 fl4.saddr = iph->saddr;
1243                 fl4.flowi4_tos = RT_TOS(iph->tos);
1244                 fl4.flowi4_oif = rt->dst.dev->ifindex;
1245                 fl4.flowi4_iif = skb->dev->ifindex;
1246                 fl4.flowi4_mark = skb->mark;
1247
1248                 rcu_read_lock();
1249                 if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res, 0) == 0)
1250                         src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
1251                 else
1252                         src = inet_select_addr(rt->dst.dev,
1253                                                rt_nexthop(rt, iph->daddr),
1254                                                RT_SCOPE_UNIVERSE);
1255                 rcu_read_unlock();
1256         }
             /* addr may be unaligned, so no direct __be32 store. */
1257         memcpy(addr, &src, 4);
1258 }
1259
1260 #ifdef CONFIG_IP_ROUTE_CLASSID
/*
 * Merge tag into rt->dst.tclassid one 16-bit half at a time: a half of
 * tclassid that is already non-zero is left untouched.
 */
1261 static void set_class_tag(struct rtable *rt, u32 tag)
1262 {
1263         if ((rt->dst.tclassid & 0xFFFF) == 0)
1264                 rt->dst.tclassid |= (tag & 0xFFFF);
1265         if ((rt->dst.tclassid & 0xFFFF0000) == 0)
1266                 rt->dst.tclassid |= (tag & 0xFFFF0000);
1267 }
1268 #endif
1269
/*
 * Default advertised MSS for dst: the RTAX_ADVMSS metric when it is set,
 * otherwise a value derived from the device MTU.
 */
1270 static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1271 {
1272         unsigned int mss = dst_metric_raw(dst, RTAX_ADVMSS);
1273
1274         /* An explicit metric is returned as is. */
1275         if (mss)
1276                 return mss;
1277
1278         /* Device MTU - 40 (presumably IP + TCP headers), floor and cap. */
1279         mss = max_t(unsigned int, dst->dev->mtu - 40, ip_rt_min_advmss);
1280         return min_t(unsigned int, mss, 65535 - 40);
1281 }
1282
/*
 * Effective MTU of dst, in order of preference: the learned rt_pmtu while
 * it has not expired, the RTAX_MTU metric, then the device MTU (limited
 * to IP_MAX_MTU).
 */
1283 static unsigned int ipv4_mtu(const struct dst_entry *dst)
1284 {
1285         const struct rtable *rt = (const struct rtable *) dst;
1286         unsigned int mtu = rt->rt_pmtu;
1287
1288         /* No learned PMTU, or it has expired: use the RTAX_MTU metric. */
1289         if (mtu == 0 || time_after_eq(jiffies, rt->dst.expires))
1290                 mtu = dst_metric_raw(dst, RTAX_MTU);
1291
1292         if (mtu != 0)
1293                 return mtu;
1294
1295         mtu = READ_ONCE(dst->dev->mtu);
1296
1297         /* A locked MTU on a route through a gateway is held to 576. */
1298         if (unlikely(ip_mtu_locked(dst)) && rt->rt_uses_gateway && mtu > 576)
1299                 mtu = 576;
1300
1301         return min_t(unsigned int, mtu, IP_MAX_MTU);
1302 }
1303
/*
 * Look up the nexthop exception recorded on nh for daddr.
 *
 * Returns the matching entry, or NULL when nh has no exception table or
 * the hash chain holds no entry for daddr.  All pointers are read with
 * rcu_dereference().
 */
1304 static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
1305 {
1306         struct fnhe_hash_bucket *buckets = rcu_dereference(nh->nh_exceptions);
1307         struct fib_nh_exception *entry;
1308
1309         if (!buckets)
1310                 return NULL;
1311
1312         /* Walk the chain of the bucket that daddr hashes to. */
1313         entry = rcu_dereference(buckets[fnhe_hashfun(daddr)].chain);
1314         while (entry) {
1315                 if (entry->fnhe_daddr == daddr)
1316                         break;
1317                 entry = rcu_dereference(entry->fnhe_next);
1318         }
1319
1320         return entry;
1321 }
1322
/*
 * Apply nexthop exception fnhe to route rt and, unless rt is DST_NOCACHE,
 * store rt in the exception's input or output slot.  Runs under fnhe_lock.
 *
 * Returns true only when rt was stored in the exception.  Returns false
 * when fnhe->fnhe_daddr does not match daddr (nothing is done in that
 * case) or when rt has DST_NOCACHE set.
 */
1323 static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
1324                               __be32 daddr)
1325 {
1326         bool ret = false;
1327
1328         spin_lock_bh(&fnhe_lock);
1329
1330         if (daddr == fnhe->fnhe_daddr) {
1331                 struct rtable __rcu **porig;
1332                 struct rtable *orig;
1333                 int genid = fnhe_genid(dev_net(rt->dst.dev));
1334
1335                 if (rt_is_input_route(rt))
1336                         porig = &fnhe->fnhe_rth_input;
1337                 else
1338                         porig = &fnhe->fnhe_rth_output;
1339                 orig = rcu_dereference(*porig);
1340
                     /* Exception belongs to an older generation: wipe its
                      * gateway/PMTU/expiry and the routes it holds.  orig was
                      * among those routes, so it must not be freed again.
                      */
1341                 if (fnhe->fnhe_genid != genid) {
1342                         fnhe->fnhe_genid = genid;
1343                         fnhe->fnhe_gw = 0;
1344                         fnhe->fnhe_pmtu = 0;
1345                         fnhe->fnhe_expires = 0;
1346                         fnhe_flush_routes(fnhe);
1347                         orig = NULL;
1348                 }
1349                 fill_route_from_fnhe(rt, fnhe);
1350                 if (!rt->rt_gateway)
1351                         rt->rt_gateway = daddr;
1352
1353                 if (!(rt->dst.flags & DST_NOCACHE)) {
                             /* Publish rt in the slot and free its predecessor. */
1354                         rcu_assign_pointer(*porig, rt);
1355                         if (orig)
1356                                 rt_free(orig);
1357                         ret = true;
1358                 }
1359
1360                 fnhe->fnhe_stamp = jiffies;
1361         }
1362         spin_unlock_bh(&fnhe_lock);
1363
1364         return ret;
1365 }
1366
/*
 * Store rt in the nexthop's cache slot: nh_rth_input for an input route,
 * this CPU's nh_pcpu_rth_output entry otherwise.
 *
 * The slot is replaced with cmpxchg().  Returns true when rt was stored,
 * in which case the route previously held there (if any) is passed to
 * rt_free().  Returns false, leaving the slot alone, when the slot changed
 * between the read and the cmpxchg().
 */
1367 static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt)
1368 {
1369         struct rtable *orig, *prev, **p;
1370         bool ret = true;
1371
1372         if (rt_is_input_route(rt)) {
1373                 p = (struct rtable **)&nh->nh_rth_input;
1374         } else {
1375                 p = (struct rtable **)raw_cpu_ptr(nh->nh_pcpu_rth_output);
1376         }
1377         orig = *p;
1378
1379         prev = cmpxchg(p, orig, rt);
1380         if (prev == orig) {
1381                 if (orig)
1382                         rt_free(orig);
1383         } else
1384                 ret = false;
1385
1386         return ret;
1387 }
1388
/*
 * Per-CPU list of routes that are not held in a nexthop or exception
 * cache slot.  Routes are linked through rt->rt_uncached; lock protects
 * head (every list operation in this file takes it with spin_lock_bh()).
 */
1389 struct uncached_list {
1390         spinlock_t              lock;
1391         struct list_head        head;
1392 };
1393
1394 static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt_uncached_list);
1395
/*
 * Append rt to the uncached list of the CPU this runs on.
 */
1396 static void rt_add_uncached_list(struct rtable *rt)
1397 {
1398         struct uncached_list *list = raw_cpu_ptr(&rt_uncached_list);
1399
1400         /* Remember the owning list so ipv4_dst_destroy() can lock it. */
1401         rt->rt_uncached_list = list;
1402         spin_lock_bh(&list->lock);
1403         list_add_tail(&rt->rt_uncached, &list->head);
1404         spin_unlock_bh(&list->lock);
1405 }
1406
/*
 * Release what an IPv4 route holds when its dst is destroyed: a reference
 * on its metrics block and its membership of an uncached list.
 */
1407 static void ipv4_dst_destroy(struct dst_entry *dst)
1408 {
1409         struct dst_metrics *p = (struct dst_metrics *)DST_METRICS_PTR(dst);
1410         struct rtable *rt = (struct rtable *) dst;
1411
             /* dst_default_metrics is never refcounted or freed; any other
              * metrics block is freed when its last reference goes away.
              */
1412         if (p != &dst_default_metrics && atomic_dec_and_test(&p->refcnt))
1413                 kfree(p);
1414
             /* rt_uncached is only non-empty for routes that were put on an
              * uncached list; rt_uncached_list then names that list.
              */
1415         if (!list_empty(&rt->rt_uncached)) {
1416                 struct uncached_list *ul = rt->rt_uncached_list;
1417
1418                 spin_lock_bh(&ul->lock);
1419                 list_del(&rt->rt_uncached);
1420                 spin_unlock_bh(&ul->lock);
1421         }
1422 }
1423
/*
 * Detach all uncached routes from dev: every route on any CPU's uncached
 * list whose dst.dev is dev is re-pointed at the netns loopback device.
 * The device reference moves with it (dev_hold() on loopback, dev_put()
 * on dev).
 */
1424 void rt_flush_dev(struct net_device *dev)
1425 {
1426         struct net *net = dev_net(dev);
1427         struct rtable *rt;
1428         int cpu;
1429
1430         for_each_possible_cpu(cpu) {
1431                 struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
1432
1433                 spin_lock_bh(&ul->lock);
1434                 list_for_each_entry(rt, &ul->head, rt_uncached) {
1435                         if (rt->dst.dev != dev)
1436                                 continue;
1437                         rt->dst.dev = net->loopback_dev;
1438                         dev_hold(rt->dst.dev);
1439                         dev_put(dev);
1440                 }
1441                 spin_unlock_bh(&ul->lock);
1442         }
1443 }
1444
/*
 * A cached route may be reused when it exists, still has obsolete set to
 * DST_OBSOLETE_FORCE_CHK and has not expired.
 */
1445 static bool rt_cache_valid(const struct rtable *rt)
1446 {
1447         if (!rt || rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK)
1448                 return false;
1449         return !rt_is_expired(rt);
1450 }
1451
/*
 * Fill in the nexthop-derived parts of a freshly allocated route and try
 * to cache it.
 *
 * With a fib_info (fi != NULL) the gateway, metrics, class id and
 * lwtunnel state are taken from the nexthop of *res, and the route is
 * stored either in the given exception (fnhe) or in the nexthop's cache
 * slot.  A route that could not be stored, and any route created without
 * a fib_info, goes onto the uncached list instead.
 *
 * type is not used in this function.
 */
1452 static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
1453                            const struct fib_result *res,
1454                            struct fib_nh_exception *fnhe,
1455                            struct fib_info *fi, u16 type, u32 itag)
1456 {
1457         bool cached = false;
1458
1459         if (fi) {
1460                 struct fib_nh *nh = &FIB_RES_NH(*res);
1461
1462                 if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK) {
1463                         rt->rt_gateway = nh->nh_gw;
1464                         rt->rt_uses_gateway = 1;
1465                 }
                     /* Share the fib_info metrics; anything other than the
                      * default metrics block is reference counted.
                      */
1466                 dst_init_metrics(&rt->dst, fi->fib_metrics->metrics, true);
1467                 if (fi->fib_metrics != &dst_default_metrics) {
1468                         rt->dst._metrics |= DST_METRICS_REFCOUNTED;
1469                         atomic_inc(&fi->fib_metrics->refcnt);
1470                 }
1471 #ifdef CONFIG_IP_ROUTE_CLASSID
1472                 rt->dst.tclassid = nh->nh_tclassid;
1473 #endif
1474                 rt->dst.lwtstate = lwtstate_get(nh->nh_lwtstate);
1475                 if (unlikely(fnhe))
1476                         cached = rt_bind_exception(rt, fnhe, daddr);
1477                 else if (!(rt->dst.flags & DST_NOCACHE))
1478                         cached = rt_cache_route(nh, rt);
1479                 if (unlikely(!cached)) {
1480                         /* Routes we intend to cache in nexthop exception or
1481                          * FIB nexthop have the DST_NOCACHE bit clear.
1482                          * However, if we are unsuccessful at storing this
1483                          * route into the cache we really need to set it.
1484                          */
1485                         rt->dst.flags |= DST_NOCACHE;
1486                         if (!rt->rt_gateway)
1487                                 rt->rt_gateway = daddr;
1488                         rt_add_uncached_list(rt);
1489                 }
1490         } else
1491                 rt_add_uncached_list(rt);
1492
1493 #ifdef CONFIG_IP_ROUTE_CLASSID
1494 #ifdef CONFIG_IP_MULTIPLE_TABLES
1495         set_class_tag(rt, res->tclassid);
1496 #endif
1497         set_class_tag(rt, itag);
1498 #endif
1499 }
1500
/*
 * Allocate and initialise an IPv4 route on dev.
 *
 * @dev:        device the dst is allocated for; also supplies the netns
 *              whose IPv4 route generation id is recorded in rt_genid
 * @flags:      stored in rt_flags; RTCF_LOCAL selects ip_local_deliver
 *              as the input handler
 * @type:       stored in rt_type
 * @nopolicy:   sets DST_NOPOLICY
 * @noxfrm:     sets DST_NOXFRM
 * @will_cache: when false the dst is created with DST_HOST | DST_NOCACHE
 *
 * Returns the new route with obsolete set to DST_OBSOLETE_FORCE_CHK and
 * ip_output as the output handler, or NULL if dst_alloc() fails.  The
 * remaining rtable fields start out as zero and rt_uncached as an empty
 * list.
 */
1501 struct rtable *rt_dst_alloc(struct net_device *dev,
1502                             unsigned int flags, u16 type,
1503                             bool nopolicy, bool noxfrm, bool will_cache)
1504 {
1505         struct rtable *rt;
1506
             /* NOTE(review): the literal 1 is presumably the initial
              * reference count taken by dst_alloc() - confirm.
              */
1507         rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1508                        (will_cache ? 0 : (DST_HOST | DST_NOCACHE)) |
1509                        (nopolicy ? DST_NOPOLICY : 0) |
1510                        (noxfrm ? DST_NOXFRM : 0));
1511
1512         if (rt) {
1513                 rt->rt_genid = rt_genid_ipv4(dev_net(dev));
1514                 rt->rt_flags = flags;
1515                 rt->rt_type = type;
1516                 rt->rt_is_input = 0;
1517                 rt->rt_iif = 0;
1518                 rt->rt_pmtu = 0;
1519                 rt->rt_mtu_locked = 0;
1520                 rt->rt_gateway = 0;
1521                 rt->rt_uses_gateway = 0;
1522                 rt->rt_table_id = 0;
1523                 INIT_LIST_HEAD(&rt->rt_uncached);
1524
1525                 rt->dst.output = ip_output;
1526                 if (flags & RTCF_LOCAL)
1527                         rt->dst.input = ip_local_deliver;
1528         }
1529
1530         return rt;
1531 }
1532 EXPORT_SYMBOL(rt_dst_alloc);
1533
1534 /* called in rcu_read_lock() section */
/*
 * Build the input route for a multicast packet received on dev and attach
 * it to skb.
 *
 * The route is allocated on the loopback device and is not cached
 * (will_cache is false).  "our" non-zero adds RTCF_LOCAL to its flags.
 *
 * Returns 0 on success, -EINVAL when dev has no in_device or the packet
 * fails the source/protocol sanity checks, -ENOBUFS when the route cannot
 * be allocated, or the negative value returned by fib_validate_source().
 */
1535 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1536                                 u8 tos, struct net_device *dev, int our)
1537 {
1538         struct rtable *rth;
1539         struct in_device *in_dev = __in_dev_get_rcu(dev);
1540         unsigned int flags = RTCF_MULTICAST;
1541         u32 itag = 0;
1542         int err;
1543
1544         /* Primary sanity checks. */
1545
1546         if (!in_dev)
1547                 return -EINVAL;
1548
1549         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1550             skb->protocol != htons(ETH_P_IP))
1551                 goto e_inval;
1552
1553         if (ipv4_is_loopback(saddr) && !IN_DEV_ROUTE_LOCALNET(in_dev))
1554                 goto e_inval;
1555
             /* A zeronet source is accepted only for local multicast
              * destinations; every other source goes through
              * fib_validate_source().
              */
1556         if (ipv4_is_zeronet(saddr)) {
1557                 if (!ipv4_is_local_multicast(daddr))
1558                         goto e_inval;
1559         } else {
1560                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1561                                           in_dev, &itag);
1562                 if (err < 0)
1563                         goto e_err;
1564         }
1565         if (our)
1566                 flags |= RTCF_LOCAL;
1567
1568         rth = rt_dst_alloc(dev_net(dev)->loopback_dev, flags, RTN_MULTICAST,
1569                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
1570         if (!rth)
1571                 goto e_nobufs;
1572
1573 #ifdef CONFIG_IP_ROUTE_CLASSID
1574         rth->dst.tclassid = itag;
1575 #endif
             /* This is an input route, so its output handler is the bug trap. */
1576         rth->dst.output = ip_rt_bug;
1577         rth->rt_is_input= 1;
1578
1579 #ifdef CONFIG_IP_MROUTE
1580         if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1581                 rth->dst.input = ip_mr_input;
1582 #endif
1583         RT_CACHE_STAT_INC(in_slow_mc);
1584
1585         skb_dst_set(skb, &rth->dst);
1586         return 0;
1587
1588 e_nobufs:
1589         return -ENOBUFS;
1590 e_inval:
1591         return -EINVAL;
1592 e_err:
1593         return err;
1594 }
1595
1596
/*
 * Account a packet with a martian source address and, when
 * CONFIG_IP_ROUTE_VERBOSE is set and the device has martian logging
 * enabled, log it (rate-limited through net_ratelimit()).
 *
 * The log line prints daddr first and saddr after "from"; when the device
 * has a link-layer header and the skb's MAC header is set, that header is
 * hex-dumped as well.
 */
1597 static void ip_handle_martian_source(struct net_device *dev,
1598                                      struct in_device *in_dev,
1599                                      struct sk_buff *skb,
1600                                      __be32 daddr,
1601                                      __be32 saddr)
1602 {
1603         RT_CACHE_STAT_INC(in_martian_src);
1604 #ifdef CONFIG_IP_ROUTE_VERBOSE
1605         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1606                 /*
1607                  *      RFC1812 recommendation, if source is martian,
1608                  *      the only hint is MAC header.
1609                  */
1610                 pr_warn("martian source %pI4 from %pI4, on dev %s\n",
1611                         &daddr, &saddr, dev->name);
1612                 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1613                         print_hex_dump(KERN_WARNING, "ll header: ",
1614                                        DUMP_PREFIX_OFFSET, 16, 1,
1615                                        skb_mac_header(skb),
1616                                        dev->hard_header_len, true);
1617                 }
1618         }
1619 #endif
1620 }
1621
/*
 * Remove the nexthop exception for daddr from nh, if there is one.
 *
 * Runs under fnhe_lock.  The entry is unlinked from its hash chain, the
 * routes it holds are flushed and the entry itself is freed with
 * kfree_rcu().
 *
 * NOTE(review): nh->nh_exceptions is indexed without a NULL check; the
 * caller visible in this file only gets here after find_exception()
 * returned an entry - confirm for any new caller.
 */
1622 static void ip_del_fnhe(struct fib_nh *nh, __be32 daddr)
1623 {
1624         struct fnhe_hash_bucket *hash;
1625         struct fib_nh_exception *fnhe, __rcu **fnhe_p;
1626         u32 hval = fnhe_hashfun(daddr);
1627
1628         spin_lock_bh(&fnhe_lock);
1629
1630         hash = rcu_dereference_protected(nh->nh_exceptions,
1631                                          lockdep_is_held(&fnhe_lock));
1632         hash += hval;
1633
             /* fnhe_p always points at the link that leads to fnhe, so the
              * entry can be unlinked by overwriting *fnhe_p.
              */
1634         fnhe_p = &hash->chain;
1635         fnhe = rcu_dereference_protected(*fnhe_p, lockdep_is_held(&fnhe_lock));
1636         while (fnhe) {
1637                 if (fnhe->fnhe_daddr == daddr) {
1638                         rcu_assign_pointer(*fnhe_p, rcu_dereference_protected(
1639                                 fnhe->fnhe_next, lockdep_is_held(&fnhe_lock)));
1640                         /* set fnhe_daddr to 0 to ensure it won't bind with
1641                          * new dsts in rt_bind_exception().
1642                          */
1643                         fnhe->fnhe_daddr = 0;
1644                         fnhe_flush_routes(fnhe);
1645                         kfree_rcu(fnhe, rcu);
1646                         break;
1647                 }
1648                 fnhe_p = &fnhe->fnhe_next;
1649                 fnhe = rcu_dereference_protected(fnhe->fnhe_next,
1650                                                  lockdep_is_held(&fnhe_lock));
1651         }
1652
1653         spin_unlock_bh(&fnhe_lock);
1654 }
1655
1656 /* called in rcu_read_lock() section */
1657 static int __mkroute_input(struct sk_buff *skb,
1658                            const struct fib_result *res,
1659                            struct in_device *in_dev,
1660                            __be32 daddr, __be32 saddr, u32 tos)
1661 {
1662         struct fib_nh_exception *fnhe;
1663         struct rtable *rth;
1664         int err;
1665         struct in_device *out_dev;
1666         bool do_cache;
1667         u32 itag = 0;
1668
1669         /* get a working reference to the output device */
1670         out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
1671         if (!out_dev) {
1672                 net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
1673                 return -EINVAL;
1674         }
1675
1676         err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
1677                                   in_dev->dev, in_dev, &itag);
1678         if (err < 0) {
1679                 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1680                                          saddr);
1681
1682                 goto cleanup;
1683         }
1684
1685         do_cache = res->fi && !itag;
1686         if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
1687             skb->protocol == htons(ETH_P_IP) &&
1688             (IN_DEV_SHARED_MEDIA(out_dev) ||
1689              inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1690                 IPCB(skb)->flags |= IPSKB_DOREDIRECT;
1691
1692         if (skb->protocol != htons(ETH_P_IP)) {
1693                 /* Not IP (i.e. ARP). Do not create route, if it is
1694                  * invalid for proxy arp. DNAT routes are always valid.
1695                  *
1696                  * Proxy arp feature have been extended to allow, ARP
1697                  * replies back to the same interface, to support
1698                  * Private VLAN switch technologies. See arp.c.
1699                  */
1700                 if (out_dev == in_dev &&
1701                     IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
1702                         err = -EINVAL;
1703                         goto cleanup;
1704                 }
1705         }
1706
1707         fnhe = find_exception(&FIB_RES_NH(*res), daddr);
1708         if (do_cache) {
1709                 if (fnhe) {
1710                         rth = rcu_dereference(fnhe->fnhe_rth_input);
1711                         if (rth && rth->dst.expires &&
1712                             time_after(jiffies, rth->dst.expires)) {
1713                                 ip_del_fnhe(&FIB_RES_NH(*res), daddr);
1714                                 fnhe = NULL;
1715                         } else {
1716                                 goto rt_cache;
1717                         }
1718                 }
1719
1720                 rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
1721
1722 rt_cache:
1723                 if (rt_cache_valid(rth)) {
1724                         skb_dst_set_noref(skb, &rth->dst);
1725                         goto out;
1726                 }
1727         }
1728
1729         rth = rt_dst_alloc(out_dev->dev, 0, res->type,
1730                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
1731                            IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache);
1732         if (!rth) {
1733                 err = -ENOBUFS;
1734                 goto cleanup;
1735         }
1736
1737         rth->rt_is_input = 1;
1738         if (res->table)
1739                 rth->rt_table_id = res->table->tb_id;
1740         RT_CACHE_STAT_INC(in_slow_tot);
1741
1742         rth->dst.input = ip_forward;
1743
1744         rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag);
1745         if (lwtunnel_output_redirect(rth->dst.lwtstate)) {
1746                 rth->dst.lwtstate->orig_output = rth->dst.output;
1747                 rth->dst.output = lwtunnel_output;
1748         }
1749         if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
1750                 rth->dst.lwtstate->orig_input = rth->dst.input;
1751                 rth->dst.input = lwtunnel_input;
1752         }
1753         skb_dst_set(skb, &rth->dst);
1754 out:
1755         err = 0;
1756  cleanup:
1757         return err;
1758 }
1759
1760 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1761
1762 /* To make ICMP packets follow the right flow, the multipath hash is
1763  * calculated from the inner IP addresses in reverse order.
1764  */
1765 static int ip_multipath_icmp_hash(struct sk_buff *skb)
1766 {
1767         const struct iphdr *outer_iph = ip_hdr(skb);
1768         struct icmphdr _icmph;
1769         const struct icmphdr *icmph;
1770         struct iphdr _inner_iph;
1771         const struct iphdr *inner_iph;
1772
1773         if (unlikely((outer_iph->frag_off & htons(IP_OFFSET)) != 0))
1774                 goto standard_hash;
1775
1776         icmph = skb_header_pointer(skb, outer_iph->ihl * 4, sizeof(_icmph),
1777                                    &_icmph);
1778         if (!icmph)
1779                 goto standard_hash;
1780
1781         if (icmph->type != ICMP_DEST_UNREACH &&
1782             icmph->type != ICMP_REDIRECT &&
1783             icmph->type != ICMP_TIME_EXCEEDED &&
1784             icmph->type != ICMP_PARAMETERPROB) {
1785                 goto standard_hash;
1786         }
1787
1788         inner_iph = skb_header_pointer(skb,
1789                                        outer_iph->ihl * 4 + sizeof(_icmph),
1790                                        sizeof(_inner_iph), &_inner_iph);
1791         if (!inner_iph)
1792                 goto standard_hash;
1793
1794         return fib_multipath_hash(inner_iph->daddr, inner_iph->saddr);
1795
1796 standard_hash:
1797         return fib_multipath_hash(outer_iph->saddr, outer_iph->daddr);
1798 }
1799
1800 #endif /* CONFIG_IP_ROUTE_MULTIPATH */
1801
1802 static int ip_mkroute_input(struct sk_buff *skb,
1803                             struct fib_result *res,
1804                             const struct flowi4 *fl4,
1805                             struct in_device *in_dev,
1806                             __be32 daddr, __be32 saddr, u32 tos)
1807 {
1808 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1809         if (res->fi && res->fi->fib_nhs > 1) {
1810                 int h;
1811
1812                 if (unlikely(ip_hdr(skb)->protocol == IPPROTO_ICMP))
1813                         h = ip_multipath_icmp_hash(skb);
1814                 else
1815                         h = fib_multipath_hash(saddr, daddr);
1816                 fib_select_multipath(res, h);
1817         }
1818 #endif
1819
1820         /* create a routing cache entry */
1821         return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
1822 }
1823
1824 /*
1825  *      NOTE. We drop all the packets that has local source
1826  *      addresses, because every properly looped back packet
1827  *      must have correct destination already attached by output routine.
1828  *
1829  *      Such approach solves two big problems:
1830  *      1. Not simplex devices are handled properly.
1831  *      2. IP spoofing attempts are filtered with 100% of guarantee.
1832  *      called with rcu_read_lock()
1833  */
1834
1835 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1836                                u8 tos, struct net_device *dev)
1837 {
1838         struct fib_result res;
1839         struct in_device *in_dev = __in_dev_get_rcu(dev);
1840         struct ip_tunnel_info *tun_info;
1841         struct flowi4   fl4;
1842         unsigned int    flags = 0;
1843         u32             itag = 0;
1844         struct rtable   *rth;
1845         int             err = -EINVAL;
1846         struct net    *net = dev_net(dev);
1847         bool do_cache;
1848
1849         /* IP on this device is disabled. */
1850
1851         if (!in_dev)
1852                 goto out;
1853
1854         /* Check for the most weird martians, which can be not detected
1855            by fib_lookup.
1856          */
1857
1858         tun_info = skb_tunnel_info(skb);
1859         if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
1860                 fl4.flowi4_tun_key.tun_id = tun_info->key.tun_id;
1861         else
1862                 fl4.flowi4_tun_key.tun_id = 0;
1863         skb_dst_drop(skb);
1864
1865         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
1866                 goto martian_source;
1867
1868         res.fi = NULL;
1869         res.table = NULL;
1870         if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
1871                 goto brd_input;
1872
1873         /* Accept zero addresses only to limited broadcast;
1874          * I even do not know to fix it or not. Waiting for complains :-)
1875          */
1876         if (ipv4_is_zeronet(saddr))
1877                 goto martian_source;
1878
1879         if (ipv4_is_zeronet(daddr))
1880                 goto martian_destination;
1881
1882         /* Following code try to avoid calling IN_DEV_NET_ROUTE_LOCALNET(),
1883          * and call it once if daddr or/and saddr are loopback addresses
1884          */
1885         if (ipv4_is_loopback(daddr)) {
1886                 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
1887                         goto martian_destination;
1888         } else if (ipv4_is_loopback(saddr)) {
1889                 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
1890                         goto martian_source;
1891         }
1892
1893         /*
1894          *      Now we are ready to route packet.
1895          */
1896         fl4.flowi4_oif = 0;
1897         fl4.flowi4_iif = l3mdev_fib_oif_rcu(dev);
1898         fl4.flowi4_mark = skb->mark;
1899         fl4.flowi4_tos = tos;
1900         fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
1901         fl4.flowi4_flags = 0;
1902         fl4.daddr = daddr;
1903         fl4.saddr = saddr;
1904         err = fib_lookup(net, &fl4, &res, 0);
1905         if (err != 0) {
1906                 if (!IN_DEV_FORWARD(in_dev))
1907                         err = -EHOSTUNREACH;
1908                 goto no_route;
1909         }
1910
1911         if (res.type == RTN_BROADCAST)
1912                 goto brd_input;
1913
1914         if (res.type == RTN_LOCAL) {
1915                 err = fib_validate_source(skb, saddr, daddr, tos,
1916                                           0, dev, in_dev, &itag);
1917                 if (err < 0)
1918                         goto martian_source;
1919                 goto local_input;
1920         }
1921
1922         if (!IN_DEV_FORWARD(in_dev)) {
1923                 err = -EHOSTUNREACH;
1924                 goto no_route;
1925         }
1926         if (res.type != RTN_UNICAST)
1927                 goto martian_destination;
1928
1929         err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
1930 out:    return err;
1931
1932 brd_input:
1933         if (skb->protocol != htons(ETH_P_IP))
1934                 goto e_inval;
1935
1936         if (!ipv4_is_zeronet(saddr)) {
1937                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1938                                           in_dev, &itag);
1939                 if (err < 0)
1940                         goto martian_source;
1941         }
1942         flags |= RTCF_BROADCAST;
1943         res.type = RTN_BROADCAST;
1944         RT_CACHE_STAT_INC(in_brd);
1945
1946 local_input:
1947         do_cache = false;
1948         if (res.fi) {
1949                 if (!itag) {
1950                         rth = rcu_dereference(FIB_RES_NH(res).nh_rth_input);
1951                         if (rt_cache_valid(rth)) {
1952                                 skb_dst_set_noref(skb, &rth->dst);
1953                                 err = 0;
1954                                 goto out;
1955                         }
1956                         do_cache = true;
1957                 }
1958         }
1959
1960         rth = rt_dst_alloc(net->loopback_dev, flags | RTCF_LOCAL, res.type,
1961                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache);
1962         if (!rth)
1963                 goto e_nobufs;
1964
1965         rth->dst.output= ip_rt_bug;
1966 #ifdef CONFIG_IP_ROUTE_CLASSID
1967         rth->dst.tclassid = itag;
1968 #endif
1969         rth->rt_is_input = 1;
1970         if (res.table)
1971                 rth->rt_table_id = res.table->tb_id;
1972
1973         RT_CACHE_STAT_INC(in_slow_tot);
1974         if (res.type == RTN_UNREACHABLE) {
1975                 rth->dst.input= ip_error;
1976                 rth->dst.error= -err;
1977                 rth->rt_flags   &= ~RTCF_LOCAL;
1978         }
1979         if (do_cache) {
1980                 if (unlikely(!rt_cache_route(&FIB_RES_NH(res), rth))) {
1981                         rth->dst.flags |= DST_NOCACHE;
1982                         rt_add_uncached_list(rth);
1983                 }
1984         }
1985         skb_dst_set(skb, &rth->dst);
1986         err = 0;
1987         goto out;
1988
1989 no_route:
1990         RT_CACHE_STAT_INC(in_no_route);
1991         res.type = RTN_UNREACHABLE;
1992         res.fi = NULL;
1993         res.table = NULL;
1994         goto local_input;
1995
1996         /*
1997          *      Do not cache martian addresses: they should be logged (RFC1812)
1998          */
1999 martian_destination:
2000         RT_CACHE_STAT_INC(in_martian_dst);
2001 #ifdef CONFIG_IP_ROUTE_VERBOSE
2002         if (IN_DEV_LOG_MARTIANS(in_dev))
2003                 net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
2004                                      &daddr, &saddr, dev->name);
2005 #endif
2006
2007 e_inval:
2008         err = -EINVAL;
2009         goto out;
2010
2011 e_nobufs:
2012         err = -ENOBUFS;
2013         goto out;
2014
2015 martian_source:
2016         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2017         goto out;
2018 }
2019
2020 int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2021                          u8 tos, struct net_device *dev)
2022 {
2023         int res;
2024
2025         tos &= IPTOS_RT_MASK;
2026         rcu_read_lock();
2027
2028         /* Multicast recognition logic is moved from route cache to here.
2029            The problem was that too many Ethernet cards have broken/missing
2030            hardware multicast filters :-( As result the host on multicasting
2031            network acquires a lot of useless route cache entries, sort of
2032            SDR messages from all the world. Now we try to get rid of them.
2033            Really, provided software IP multicast filter is organized
2034            reasonably (at least, hashed), it does not result in a slowdown
2035            comparing with route cache reject entries.
2036            Note, that multicast routers are not affected, because
2037            route cache entry is created eventually.
2038          */
2039         if (ipv4_is_multicast(daddr)) {
2040                 struct in_device *in_dev = __in_dev_get_rcu(dev);
2041
2042                 if (in_dev) {
2043                         int our = ip_check_mc_rcu(in_dev, daddr, saddr,
2044                                                   ip_hdr(skb)->protocol);
2045                         if (our
2046 #ifdef CONFIG_IP_MROUTE
2047                                 ||
2048                             (!ipv4_is_local_multicast(daddr) &&
2049                              IN_DEV_MFORWARD(in_dev))
2050 #endif
2051                            ) {
2052                                 int res = ip_route_input_mc(skb, daddr, saddr,
2053                                                             tos, dev, our);
2054                                 rcu_read_unlock();
2055                                 return res;
2056                         }
2057                 }
2058                 rcu_read_unlock();
2059                 return -EINVAL;
2060         }
2061         res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
2062         rcu_read_unlock();
2063         return res;
2064 }
2065 EXPORT_SYMBOL(ip_route_input_noref);
2066
2067 /* called with rcu_read_lock() */
2068 static struct rtable *__mkroute_output(const struct fib_result *res,
2069                                        const struct flowi4 *fl4, int orig_oif,
2070                                        struct net_device *dev_out,
2071                                        unsigned int flags)
2072 {
2073         struct fib_info *fi = res->fi;
2074         struct fib_nh_exception *fnhe;
2075         struct in_device *in_dev;
2076         u16 type = res->type;
2077         struct rtable *rth;
2078         bool do_cache;
2079
2080         in_dev = __in_dev_get_rcu(dev_out);
2081         if (!in_dev)
2082                 return ERR_PTR(-EINVAL);
2083
2084         if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
2085                 if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
2086                         return ERR_PTR(-EINVAL);
2087
2088         if (ipv4_is_lbcast(fl4->daddr))
2089                 type = RTN_BROADCAST;
2090         else if (ipv4_is_multicast(fl4->daddr))
2091                 type = RTN_MULTICAST;
2092         else if (ipv4_is_zeronet(fl4->daddr))
2093                 return ERR_PTR(-EINVAL);
2094
2095         if (dev_out->flags & IFF_LOOPBACK)
2096                 flags |= RTCF_LOCAL;
2097
2098         do_cache = true;
2099         if (type == RTN_BROADCAST) {
2100                 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2101                 fi = NULL;
2102         } else if (type == RTN_MULTICAST) {
2103                 flags |= RTCF_MULTICAST | RTCF_LOCAL;
2104                 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2105                                      fl4->flowi4_proto))
2106                         flags &= ~RTCF_LOCAL;
2107                 else
2108                         do_cache = false;
2109                 /* If multicast route do not exist use
2110                  * default one, but do not gateway in this case.
2111                  * Yes, it is hack.
2112                  */
2113                 if (fi && res->prefixlen < 4)
2114                         fi = NULL;
2115         } else if ((type == RTN_LOCAL) && (orig_oif != 0) &&
2116                    (orig_oif != dev_out->ifindex)) {
2117                 /* For local routes that require a particular output interface
2118                  * we do not want to cache the result.  Caching the result
2119                  * causes incorrect behaviour when there are multiple source
2120                  * addresses on the interface, the end result being that if the
2121                  * intended recipient is waiting on that interface for the
2122                  * packet he won't receive it because it will be delivered on
2123                  * the loopback interface and the IP_PKTINFO ipi_ifindex will
2124                  * be set to the loopback interface as well.
2125                  */
2126                 fi = NULL;
2127         }
2128
2129         fnhe = NULL;
2130         do_cache &= fi != NULL;
2131         if (do_cache) {
2132                 struct rtable __rcu **prth;
2133                 struct fib_nh *nh = &FIB_RES_NH(*res);
2134
2135                 fnhe = find_exception(nh, fl4->daddr);
2136                 if (fnhe) {
2137                         prth = &fnhe->fnhe_rth_output;
2138                         rth = rcu_dereference(*prth);
2139                         if (rth && rth->dst.expires &&
2140                             time_after(jiffies, rth->dst.expires)) {
2141                                 ip_del_fnhe(nh, fl4->daddr);
2142                                 fnhe = NULL;
2143                         } else {
2144                                 goto rt_cache;
2145                         }
2146                 }
2147
2148                 if (unlikely(fl4->flowi4_flags &
2149                              FLOWI_FLAG_KNOWN_NH &&
2150                              !(nh->nh_gw &&
2151                                nh->nh_scope == RT_SCOPE_LINK))) {
2152                         do_cache = false;
2153                         goto add;
2154                 }
2155                 prth = raw_cpu_ptr(nh->nh_pcpu_rth_output);
2156                 rth = rcu_dereference(*prth);
2157
2158 rt_cache:
2159                 if (rt_cache_valid(rth)) {
2160                         dst_hold(&rth->dst);
2161                         return rth;
2162                 }
2163         }
2164
2165 add:
2166         rth = rt_dst_alloc(dev_out, flags, type,
2167                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
2168                            IN_DEV_CONF_GET(in_dev, NOXFRM),
2169                            do_cache);
2170         if (!rth)
2171                 return ERR_PTR(-ENOBUFS);
2172
2173         rth->rt_iif     = orig_oif ? : 0;
2174         if (res->table)
2175                 rth->rt_table_id = res->table->tb_id;
2176
2177         RT_CACHE_STAT_INC(out_slow_tot);
2178
2179         if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2180                 if (flags & RTCF_LOCAL &&
2181                     !(dev_out->flags & IFF_LOOPBACK)) {
2182                         rth->dst.output = ip_mc_output;
2183                         RT_CACHE_STAT_INC(out_slow_mc);
2184                 }
2185 #ifdef CONFIG_IP_MROUTE
2186                 if (type == RTN_MULTICAST) {
2187                         if (IN_DEV_MFORWARD(in_dev) &&
2188                             !ipv4_is_local_multicast(fl4->daddr)) {
2189                                 rth->dst.input = ip_mr_input;
2190                                 rth->dst.output = ip_mc_output;
2191                         }
2192                 }
2193 #endif
2194         }
2195
2196         rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0);
2197         if (lwtunnel_output_redirect(rth->dst.lwtstate))
2198                 rth->dst.output = lwtunnel_output;
2199
2200         return rth;
2201 }
2202
2203 /*
2204  * Major route resolver routine.
2205  */
2206
2207 struct rtable *__ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
2208                                           int mp_hash)
2209 {
2210         struct net_device *dev_out = NULL;
2211         __u8 tos = RT_FL_TOS(fl4);
2212         unsigned int flags = 0;
2213         struct fib_result res;
2214         struct rtable *rth;
2215         int orig_oif;
2216         int err;
2217
2218         res.tclassid    = 0;
2219         res.fi          = NULL;
2220         res.table       = NULL;
2221
2222         orig_oif = fl4->flowi4_oif;
2223
2224         fl4->flowi4_iif = LOOPBACK_IFINDEX;
2225         fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2226         fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2227                          RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
2228
2229         rcu_read_lock();
2230         if (fl4->saddr) {
2231                 if (ipv4_is_multicast(fl4->saddr) ||
2232                     ipv4_is_lbcast(fl4->saddr) ||
2233                     ipv4_is_zeronet(fl4->saddr)) {
2234                         rth = ERR_PTR(-EINVAL);
2235                         goto out;
2236                 }
2237
2238                 rth = ERR_PTR(-ENETUNREACH);
2239
2240                 /* I removed check for oif == dev_out->oif here.
2241                    It was wrong for two reasons:
2242                    1. ip_dev_find(net, saddr) can return wrong iface, if saddr
2243                       is assigned to multiple interfaces.
2244                    2. Moreover, we are allowed to send packets with saddr
2245                       of another iface. --ANK
2246                  */
2247
2248                 if (fl4->flowi4_oif == 0 &&
2249                     (ipv4_is_multicast(fl4->daddr) ||
2250                      ipv4_is_lbcast(fl4->daddr))) {
2251                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2252                         dev_out = __ip_dev_find(net, fl4->saddr, false);
2253                         if (!dev_out)
2254                                 goto out;
2255
2256                         /* Special hack: user can direct multicasts
2257                            and limited broadcast via necessary interface
2258                            without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2259                            This hack is not just for fun, it allows
2260                            vic,vat and friends to work.
2261                            They bind socket to loopback, set ttl to zero
2262                            and expect that it will work.
2263                            From the viewpoint of routing cache they are broken,
2264                            because we are not allowed to build multicast path
2265                            with loopback source addr (look, routing cache
2266                            cannot know, that ttl is zero, so that packet
2267                            will not leave this host and route is valid).
2268                            Luckily, this hack is good workaround.
2269                          */
2270
2271                         fl4->flowi4_oif = dev_out->ifindex;
2272                         goto make_route;
2273                 }
2274
2275                 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2276                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2277                         if (!__ip_dev_find(net, fl4->saddr, false))
2278                                 goto out;
2279                 }
2280         }
2281
2282
2283         if (fl4->flowi4_oif) {
2284                 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2285                 rth = ERR_PTR(-ENODEV);
2286                 if (!dev_out)
2287                         goto out;
2288
2289                 /* RACE: Check return value of inet_select_addr instead. */
2290                 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2291                         rth = ERR_PTR(-ENETUNREACH);
2292                         goto out;
2293                 }
2294                 if (ipv4_is_local_multicast(fl4->daddr) ||
2295                     ipv4_is_lbcast(fl4->daddr) ||
2296                     fl4->flowi4_proto == IPPROTO_IGMP) {
2297                         if (!fl4->saddr)
2298                                 fl4->saddr = inet_select_addr(dev_out, 0,
2299                                                               RT_SCOPE_LINK);
2300                         goto make_route;
2301                 }
2302                 if (!fl4->saddr) {
2303                         if (ipv4_is_multicast(fl4->daddr))
2304                                 fl4->saddr = inet_select_addr(dev_out, 0,
2305                                                               fl4->flowi4_scope);
2306                         else if (!fl4->daddr)
2307                                 fl4->saddr = inet_select_addr(dev_out, 0,
2308                                                               RT_SCOPE_HOST);
2309                 }
2310
2311                 rth = l3mdev_get_rtable(dev_out, fl4);
2312                 if (rth)
2313                         goto out;
2314         }
2315
2316         if (!fl4->daddr) {
2317                 fl4->daddr = fl4->saddr;
2318                 if (!fl4->daddr)
2319                         fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2320                 dev_out = net->loopback_dev;
2321                 fl4->flowi4_oif = LOOPBACK_IFINDEX;
2322                 res.type = RTN_LOCAL;
2323                 flags |= RTCF_LOCAL;
2324                 goto make_route;
2325         }
2326
2327         err = fib_lookup(net, fl4, &res, 0);
2328         if (err) {
2329                 res.fi = NULL;
2330                 res.table = NULL;
2331                 if (fl4->flowi4_oif &&
2332                     !netif_index_is_l3_master(net, fl4->flowi4_oif)) {
2333                         /* Apparently, routing tables are wrong. Assume,
2334                            that the destination is on link.
2335
2336                            WHY? DW.
2337                            Because we are allowed to send to iface
2338                            even if it has NO routes and NO assigned
2339                            addresses. When oif is specified, routing
2340                            tables are looked up with only one purpose:
2341                            to catch if destination is gatewayed, rather than
2342                            direct. Moreover, if MSG_DONTROUTE is set,
2343                            we send packet, ignoring both routing tables
2344                            and ifaddr state. --ANK
2345
2346
2347                            We could make it even if oif is unknown,
2348                            likely IPv6, but we do not.
2349                          */
2350
2351                         if (fl4->saddr == 0)
2352                                 fl4->saddr = inet_select_addr(dev_out, 0,
2353                                                               RT_SCOPE_LINK);
2354                         res.type = RTN_UNICAST;
2355                         goto make_route;
2356                 }
2357                 rth = ERR_PTR(err);
2358                 goto out;
2359         }
2360
2361         if (res.type == RTN_LOCAL) {
2362                 if (!fl4->saddr) {
2363                         if (res.fi->fib_prefsrc)
2364                                 fl4->saddr = res.fi->fib_prefsrc;
2365                         else
2366                                 fl4->saddr = fl4->daddr;
2367                 }
2368                 dev_out = net->loopback_dev;
2369                 fl4->flowi4_oif = dev_out->ifindex;
2370                 flags |= RTCF_LOCAL;
2371                 goto make_route;
2372         }
2373
2374         fib_select_path(net, &res, fl4, mp_hash);
2375
2376         dev_out = FIB_RES_DEV(res);
2377         fl4->flowi4_oif = dev_out->ifindex;
2378
2379
2380 make_route:
2381         rth = __mkroute_output(&res, fl4, orig_oif, dev_out, flags);
2382
2383 out:
2384         rcu_read_unlock();
2385         return rth;
2386 }
2387 EXPORT_SYMBOL_GPL(__ip_route_output_key_hash);
2388
2389 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2390 {
2391         return NULL;
2392 }
2393
2394 static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2395 {
2396         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2397
2398         return mtu ? : dst->dev->mtu;
2399 }
2400
2401 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
2402                                           struct sk_buff *skb, u32 mtu)
2403 {
2404 }
2405
/* .redirect handler of the blackhole dst_ops: intentionally a no-op. */
static void ipv4_rt_blackhole_redirect(struct dst_entry *dst,
				       struct sock *sk,
				       struct sk_buff *skb)
{
	/* nothing to do */
}
2410
2411 static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2412                                           unsigned long old)
2413 {
2414         return NULL;
2415 }
2416
2417 static struct dst_ops ipv4_dst_blackhole_ops = {
2418         .family                 =       AF_INET,
2419         .check                  =       ipv4_blackhole_dst_check,
2420         .mtu                    =       ipv4_blackhole_mtu,
2421         .default_advmss         =       ipv4_default_advmss,
2422         .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
2423         .redirect               =       ipv4_rt_blackhole_redirect,
2424         .cow_metrics            =       ipv4_rt_blackhole_cow_metrics,
2425         .neigh_lookup           =       ipv4_neigh_lookup,
2426 };
2427
2428 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2429 {
2430         struct rtable *ort = (struct rtable *) dst_orig;
2431         struct rtable *rt;
2432
2433         rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_NONE, 0);
2434         if (rt) {
2435                 struct dst_entry *new = &rt->dst;
2436
2437                 new->__use = 1;
2438                 new->input = dst_discard;
2439                 new->output = dst_discard_out;
2440
2441                 new->dev = ort->dst.dev;
2442                 if (new->dev)
2443                         dev_hold(new->dev);
2444
2445                 rt->rt_is_input = ort->rt_is_input;
2446                 rt->rt_iif = ort->rt_iif;
2447                 rt->rt_pmtu = ort->rt_pmtu;
2448                 rt->rt_mtu_locked = ort->rt_mtu_locked;
2449
2450                 rt->rt_genid = rt_genid_ipv4(net);
2451                 rt->rt_flags = ort->rt_flags;
2452                 rt->rt_type = ort->rt_type;
2453                 rt->rt_gateway = ort->rt_gateway;
2454                 rt->rt_uses_gateway = ort->rt_uses_gateway;
2455
2456                 INIT_LIST_HEAD(&rt->rt_uncached);
2457                 dst_free(new);
2458         }
2459
2460         dst_release(dst_orig);
2461
2462         return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2463 }
2464
2465 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2466                                     const struct sock *sk)
2467 {
2468         struct rtable *rt = __ip_route_output_key(net, flp4);
2469
2470         if (IS_ERR(rt))
2471                 return rt;
2472
2473         if (flp4->flowi4_proto)
2474                 rt = (struct rtable *)xfrm_lookup_route(net, &rt->dst,
2475                                                         flowi4_to_flowi(flp4),
2476                                                         sk, 0);
2477
2478         return rt;
2479 }
2480 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2481
/*
 * Append one rtnetlink route message to @skb describing the route that is
 * attached to @skb itself (skb_rtable(skb)).
 *
 * @net:      namespace, used for the multicast-forwarding check
 * @dst/@src: addresses reported as RTA_DST / RTA_SRC (@src only if non-zero)
 * @table_id: reported in RTA_TABLE and, when it is below 256, in rtm_table
 * @fl4:      flow that was looked up; supplies tos, saddr, daddr and mark
 * @portid, @seq, @event, @flags: passed straight to nlmsg_put()
 * @nowait:   forwarded to ipmr_get_route() and selects how its result is
 *            handled (see below)
 *
 * Returns 0 on success.  Returns -EMSGSIZE when the header or any attribute
 * does not fit; in that case the partially built message is cancelled.
 * Note that with !@nowait an ipmr_get_route() result of 0 makes this
 * function return 0 without calling nlmsg_end().
 */
static int rt_fill_info(struct net *net,  __be32 dst, __be32 src, u32 table_id,
                        struct flowi4 *fl4, struct sk_buff *skb, u32 portid,
                        u32 seq, int event, int nowait, unsigned int flags)
{
        struct rtable *rt = skb_rtable(skb);
        struct rtmsg *r;
        struct nlmsghdr *nlh;
        unsigned long expires = 0;
        u32 error;
        u32 metrics[RTAX_MAX];

        nlh = nlmsg_put(skb, portid, seq, event, sizeof(*r), flags);
        if (!nlh)
                return -EMSGSIZE;

        r = nlmsg_data(nlh);
        r->rtm_family    = AF_INET;
        r->rtm_dst_len  = 32;
        r->rtm_src_len  = 0;
        r->rtm_tos      = fl4->flowi4_tos;
        /*
         * Table ids of 256 and above are reported as RT_TABLE_COMPAT in the
         * header; the full id is always carried in RTA_TABLE.
         */
        r->rtm_table    = table_id < 256 ? table_id : RT_TABLE_COMPAT;
        if (nla_put_u32(skb, RTA_TABLE, table_id))
                goto nla_put_failure;
        r->rtm_type     = rt->rt_type;
        r->rtm_scope    = RT_SCOPE_UNIVERSE;
        r->rtm_protocol = RTPROT_UNSPEC;
        /* Keep only the bits above the low 16 of rt_flags, plus RTM_F_CLONED. */
        r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
        if (rt->rt_flags & RTCF_NOTIFY)
                r->rtm_flags |= RTM_F_NOTIFY;
        if (IPCB(skb)->flags & IPSKB_DOREDIRECT)
                r->rtm_flags |= RTCF_DOREDIRECT;

        if (nla_put_in_addr(skb, RTA_DST, dst))
                goto nla_put_failure;
        if (src) {
                r->rtm_src_len = 32;
                if (nla_put_in_addr(skb, RTA_SRC, src))
                        goto nla_put_failure;
        }
        if (rt->dst.dev &&
            nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
                goto nla_put_failure;
#ifdef CONFIG_IP_ROUTE_CLASSID
        if (rt->dst.tclassid &&
            nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
                goto nla_put_failure;
#endif
        /*
         * For output routes, report the flow's source address as the
         * preferred source when it differs from the requested @src.
         */
        if (!rt_is_input_route(rt) &&
            fl4->saddr != src) {
                if (nla_put_in_addr(skb, RTA_PREFSRC, fl4->saddr))
                        goto nla_put_failure;
        }
        if (rt->rt_uses_gateway &&
            nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gateway))
                goto nla_put_failure;

        /*
         * Turn the absolute expiry (jiffies) into a remaining time; it ends
         * up 0 when there is no expiry or it has already passed.
         */
        expires = rt->dst.expires;
        if (expires) {
                unsigned long now = jiffies;

                if (time_before(now, expires))
                        expires -= now;
                else
                        expires = 0;
        }

        /*
         * Work on a local copy of the metrics so the overrides below do not
         * touch the route.  The learned PMTU and the MTU lock bit are only
         * reported while the entry has not expired.
         */
        memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
        if (rt->rt_pmtu && expires)
                metrics[RTAX_MTU - 1] = rt->rt_pmtu;
        if (rt->rt_mtu_locked && expires)
                metrics[RTAX_LOCK - 1] |= BIT(RTAX_MTU);
        if (rtnetlink_put_metrics(skb, metrics) < 0)
                goto nla_put_failure;

        if (fl4->flowi4_mark &&
            nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
                goto nla_put_failure;

        error = rt->dst.error;

        if (rt_is_input_route(rt)) {
#ifdef CONFIG_IP_MROUTE
                /*
                 * Multicast destination (other than local multicast) with
                 * MC_FORWARDING enabled: let ipmr_get_route() add its part.
                 */
                if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
                    IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
                        int err = ipmr_get_route(net, skb,
                                                 fl4->saddr, fl4->daddr,
                                                 r, nowait, portid);

                        if (err <= 0) {
                                if (!nowait) {
                                        /*
                                         * 0 ends the function successfully
                                         * right here; any negative result is
                                         * reported as -EMSGSIZE via the
                                         * failure path.
                                         */
                                        if (err == 0)
                                                return 0;
                                        goto nla_put_failure;
                                } else {
                                        /*
                                         * Other errors are not fatal: they
                                         * are carried to the cacheinfo
                                         * below instead of rt->dst.error.
                                         */
                                        if (err == -EMSGSIZE)
                                                goto nla_put_failure;
                                        error = err;
                                }
                        }
                } else
#endif
                        /*
                         * With CONFIG_IP_MROUTE this is the else-branch of
                         * the multicast test above; without it, this runs
                         * for every input route.
                         */
                        if (nla_put_u32(skb, RTA_IIF, skb->dev->ifindex))
                                goto nla_put_failure;
        }

        if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
                goto nla_put_failure;

        nlmsg_end(skb, nlh);
        return 0;

nla_put_failure:
        /* Drop the partially built message from the skb. */
        nlmsg_cancel(skb, nlh);
        return -EMSGSIZE;
}
2597
/*
 * Handler for RTM_GETROUTE requests (registered in ip_rt_init()).
 *
 * Parses the request attributes, performs a route lookup - an input lookup
 * through ip_route_input() when RTA_IIF is given, an output lookup through
 * ip_route_output_key() otherwise - and unicasts an RTM_NEWROUTE reply built
 * by rt_fill_info() back to the requesting port.
 *
 * Returns 0 on success or a negative errno.  On every failure after the
 * reply skb has been allocated, the skb is freed here.
 */
static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh)
{
        struct net *net = sock_net(in_skb->sk);
        struct rtmsg *rtm;
        struct nlattr *tb[RTA_MAX+1];
        struct rtable *rt = NULL;
        struct flowi4 fl4;
        __be32 dst = 0;
        __be32 src = 0;
        u32 iif;
        int err;
        int mark;
        struct sk_buff *skb;
        u32 table_id = RT_TABLE_MAIN;

        err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
        if (err < 0)
                goto errout;

        rtm = nlmsg_data(nlh);

        /* This skb is both the dummy packet for the lookup and the reply. */
        skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
        if (!skb) {
                err = -ENOBUFS;
                goto errout;
        }

        /* Reserve room for dummy headers, this skb can pass
           through good chunk of routing engine.
         */
        skb_reset_mac_header(skb);
        skb_reset_network_header(skb);

        /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
        ip_hdr(skb)->protocol = IPPROTO_UDP;
        skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));

        /* Absent attributes default to 0. */
        src = tb[RTA_SRC] ? nla_get_in_addr(tb[RTA_SRC]) : 0;
        dst = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0;
        iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
        mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;

        /*
         * The flow is filled in for both lookup kinds: the output lookup
         * consumes it directly, and rt_fill_info() reads it in either case.
         */
        memset(&fl4, 0, sizeof(fl4));
        fl4.daddr = dst;
        fl4.saddr = src;
        fl4.flowi4_tos = rtm->rtm_tos;
        fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
        fl4.flowi4_mark = mark;

        if (netif_index_is_l3_master(net, fl4.flowi4_oif))
                fl4.flowi4_flags = FLOWI_FLAG_L3MDEV_SRC | FLOWI_FLAG_SKIP_NH_OIF;

        if (iif) {
                /* Input lookup: pretend the dummy skb arrived on @iif. */
                struct net_device *dev;

                dev = __dev_get_by_index(net, iif);
                if (!dev) {
                        err = -ENODEV;
                        goto errout_free;
                }

                skb->protocol   = htons(ETH_P_IP);
                skb->dev        = dev;
                skb->mark       = mark;
                local_bh_disable();
                err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
                local_bh_enable();

                /*
                 * The lookup may succeed yet yield a route that carries an
                 * error; report that error (negated) to the requester.
                 */
                rt = skb_rtable(skb);
                if (err == 0 && rt->dst.error)
                        err = -rt->dst.error;
        } else {
                /* Output lookup: failures come back as an ERR_PTR(). */
                rt = ip_route_output_key(net, &fl4);

                err = 0;
                if (IS_ERR(rt))
                        err = PTR_ERR(rt);
        }

        if (err)
                goto errout_free;

        skb_dst_set(skb, &rt->dst);
        if (rtm->rtm_flags & RTM_F_NOTIFY)
                rt->rt_flags |= RTCF_NOTIFY;

        /* Report the route's own table instead of RT_TABLE_MAIN on request. */
        if (rtm->rtm_flags & RTM_F_LOOKUP_TABLE)
                table_id = rt->rt_table_id;

        err = rt_fill_info(net, dst, src, table_id, &fl4, skb,
                           NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
                           RTM_NEWROUTE, 0, 0);
        if (err < 0)
                goto errout_free;

        /*
         * The skb is not freed on this path - presumably rtnl_unicast()
         * takes ownership of it; confirm against rtnetlink.
         */
        err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
errout:
        return err;

errout_free:
        kfree_skb(skb);
        goto errout;
}
2701
2702 void ip_rt_multicast_event(struct in_device *in_dev)
2703 {
2704         rt_cache_flush(dev_net(in_dev->dev));
2705 }
2706
2707 #ifdef CONFIG_SYSCTL
/*
 * Backing storage for entries of ipv4_route_table that are only reachable
 * through sysctl, hence defined under CONFIG_SYSCTL.
 */
/* "gc_interval"; initial value is 60 seconds expressed in jiffies. */
static int ip_rt_gc_interval __read_mostly  = 60 * HZ;
/* Shared by "gc_min_interval" and "gc_min_interval_ms"; initially HZ / 2. */
static int ip_rt_gc_min_interval __read_mostly  = HZ / 2;
/* "gc_elasticity". */
static int ip_rt_gc_elasticity __read_mostly    = 8;
/* Passed as extra1 of the "min_pmtu" entry (proc_dointvec_minmax). */
static int ip_min_valid_pmtu __read_mostly      = IPV4_MIN_MTU;
2712
2713 static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write,
2714                                         void __user *buffer,
2715                                         size_t *lenp, loff_t *ppos)
2716 {
2717         struct net *net = (struct net *)__ctl->extra1;
2718
2719         if (write) {
2720                 rt_cache_flush(net);
2721                 fnhe_genid_bump(net);
2722                 return 0;
2723         }
2724
2725         return -EINVAL;
2726 }
2727
/*
 * Tunables registered for init_net under "net/ipv4/route" by
 * ip_static_sysctl_init().  Every entry is int-sized with mode 0644; the
 * entries differ in the variable they expose and in the proc handler
 * (plain int, jiffies, ms_jiffies or min/max).
 */
static struct ctl_table ipv4_route_table[] = {
        {
                .procname       = "gc_thresh",
                .data           = &ipv4_dst_ops.gc_thresh,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec,
        },
        {
                .procname       = "max_size",
                .data           = &ip_rt_max_size,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec,
        },
        {
                /*  Deprecated. Use gc_min_interval_ms */

                .procname       = "gc_min_interval",
                .data           = &ip_rt_gc_min_interval,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec_jiffies,
        },
        {
                /*
                 * Same variable as gc_min_interval above; only the proc
                 * handler differs.
                 */
                .procname       = "gc_min_interval_ms",
                .data           = &ip_rt_gc_min_interval,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec_ms_jiffies,
        },
        {
                .procname       = "gc_timeout",
                .data           = &ip_rt_gc_timeout,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec_jiffies,
        },
        {
                .procname       = "gc_interval",
                .data           = &ip_rt_gc_interval,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec_jiffies,
        },
        {
                .procname       = "redirect_load",
                .data           = &ip_rt_redirect_load,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec,
        },
        {
                .procname       = "redirect_number",
                .data           = &ip_rt_redirect_number,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec,
        },
        {
                .procname       = "redirect_silence",
                .data           = &ip_rt_redirect_silence,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec,
        },
        {
                .procname       = "error_cost",
                .data           = &ip_rt_error_cost,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec,
        },
        {
                .procname       = "error_burst",
                .data           = &ip_rt_error_burst,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec,
        },
        {
                .procname       = "gc_elasticity",
                .data           = &ip_rt_gc_elasticity,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec,
        },
        {
                .procname       = "mtu_expires",
                .data           = &ip_rt_mtu_expires,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec_jiffies,
        },
        {
                /*
                 * Only extra1 is set (no extra2) - presumably a lower bound
                 * without an upper one; confirm against
                 * proc_dointvec_minmax.
                 */
                .procname       = "min_pmtu",
                .data           = &ip_rt_min_pmtu,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec_minmax,
                .extra1         = &ip_min_valid_pmtu,
        },
        {
                .procname       = "min_adv_mss",
                .data           = &ip_rt_min_advmss,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec,
        },
        /* Empty terminating entry. */
        { }
};
2839
/*
 * Template for the per-namespace "flush" sysctl.  The entry is write-only
 * (mode 0200) and has no .data: ipv4_sysctl_rtcache_flush() acts on the
 * namespace that sysctl_route_net_init() stores in extra1.  init_net uses
 * this table directly; other namespaces get a kmemdup()'ed copy.
 */
static struct ctl_table ipv4_route_flush_table[] = {
        {
                .procname       = "flush",
                .maxlen         = sizeof(int),
                .mode           = 0200,
                .proc_handler   = ipv4_sysctl_rtcache_flush,
        },
        /* Empty terminating entry. */
        { },
};
2849
2850 static __net_init int sysctl_route_net_init(struct net *net)
2851 {
2852         struct ctl_table *tbl;
2853
2854         tbl = ipv4_route_flush_table;
2855         if (!net_eq(net, &init_net)) {
2856                 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
2857                 if (!tbl)
2858                         goto err_dup;
2859
2860                 /* Don't export sysctls to unprivileged users */
2861                 if (net->user_ns != &init_user_ns)
2862                         tbl[0].procname = NULL;
2863         }
2864         tbl[0].extra1 = net;
2865
2866         net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
2867         if (!net->ipv4.route_hdr)
2868                 goto err_reg;
2869         return 0;
2870
2871 err_reg:
2872         if (tbl != ipv4_route_flush_table)
2873                 kfree(tbl);
2874 err_dup:
2875         return -ENOMEM;
2876 }
2877
2878 static __net_exit void sysctl_route_net_exit(struct net *net)
2879 {
2880         struct ctl_table *tbl;
2881
2882         tbl = net->ipv4.route_hdr->ctl_table_arg;
2883         unregister_net_sysctl_table(net->ipv4.route_hdr);
2884         BUG_ON(tbl == ipv4_route_flush_table);
2885         kfree(tbl);
2886 }
2887
/*
 * Per-namespace hooks for the route flush sysctl; registered from
 * ip_rt_init() when CONFIG_SYSCTL is enabled.
 */
static __net_initdata struct pernet_operations sysctl_route_ops = {
        .init = sysctl_route_net_init,
        .exit = sysctl_route_net_exit,
};
2892 #endif
2893
2894 static __net_init int rt_genid_init(struct net *net)
2895 {
2896         atomic_set(&net->ipv4.rt_genid, 0);
2897         atomic_set(&net->fnhe_genid, 0);
2898         get_random_bytes(&net->ipv4.dev_addr_genid,
2899                          sizeof(net->ipv4.dev_addr_genid));
2900         return 0;
2901 }
2902
/*
 * Per-namespace hook that initialises the generation ids; there is no
 * .exit hook.  Registered from ip_rt_init().
 */
static __net_initdata struct pernet_operations rt_genid_ops = {
        .init = rt_genid_init,
};
2906
2907 static int __net_init ipv4_inetpeer_init(struct net *net)
2908 {
2909         struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
2910
2911         if (!bp)
2912                 return -ENOMEM;
2913         inet_peer_base_init(bp);
2914         net->ipv4.peers = bp;
2915         return 0;
2916 }
2917
2918 static void __net_exit ipv4_inetpeer_exit(struct net *net)
2919 {
2920         struct inet_peer_base *bp = net->ipv4.peers;
2921
2922         net->ipv4.peers = NULL;
2923         inetpeer_invalidate_tree(bp);
2924         kfree(bp);
2925 }
2926
/*
 * Per-namespace hooks that create and destroy net->ipv4.peers; registered
 * from ip_rt_init().
 */
static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
        .init   =       ipv4_inetpeer_init,
        .exit   =       ipv4_inetpeer_exit,
};
2931
2932 #ifdef CONFIG_IP_ROUTE_CLASSID
/*
 * Per-cpu accounting area; ip_rt_init() allocates room for 256
 * struct ip_rt_acct entries per cpu.
 */
struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
2934 #endif /* CONFIG_IP_ROUTE_CLASSID */
2935
/*
 * Boot-time initialisation of the IPv4 routing code.
 *
 * Allocates the IP ident/timestamp arrays, the per-cpu uncached-route lists,
 * the optional classid accounting area and the dst slab cache, then brings
 * up devinet, the FIB, the proc files and (if configured) xfrm, and finally
 * registers the RTM_GETROUTE handler and the per-namespace operations.
 *
 * Allocation failures panic.  The return values of register_pernet_subsys()
 * are not checked and rc is never changed, so this always returns 0.
 */
int __init ip_rt_init(void)
{
        int rc = 0;
        int cpu;

        ip_idents = kmalloc(IP_IDENTS_SZ * sizeof(*ip_idents), GFP_KERNEL);
        if (!ip_idents)
                panic("IP: failed to allocate ip_idents\n");

        /* The ident array starts out with random contents ... */
        prandom_bytes(ip_idents, IP_IDENTS_SZ * sizeof(*ip_idents));

        /* ... while the timestamp array is allocated with kcalloc(). */
        ip_tstamps = kcalloc(IP_IDENTS_SZ, sizeof(*ip_tstamps), GFP_KERNEL);
        if (!ip_tstamps)
                panic("IP: failed to allocate ip_tstamps\n");

        for_each_possible_cpu(cpu) {
                struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);

                INIT_LIST_HEAD(&ul->head);
                spin_lock_init(&ul->lock);
        }
#ifdef CONFIG_IP_ROUTE_CLASSID
        ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
        if (!ip_rt_acct)
                panic("IP: failed to allocate ip_rt_acct\n");
#endif

        /*
         * No NULL check here: the cache is created with SLAB_PANIC,
         * presumably making a failed creation fatal - confirm against slab.
         */
        ipv4_dst_ops.kmem_cachep =
                kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
                                  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);

        /* Blackhole routes are allocated from the same slab cache. */
        ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;

        if (dst_entries_init(&ipv4_dst_ops) < 0)
                panic("IP: failed to allocate ipv4_dst_ops counter\n");

        if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
                panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");

        /* Both limits start at their maximum (all bits set / INT_MAX). */
        ipv4_dst_ops.gc_thresh = ~0;
        ip_rt_max_size = INT_MAX;

        devinet_init();
        ip_fib_init();

        /* Missing proc files are reported but not treated as fatal. */
        if (ip_rt_proc_init())
                pr_err("Unable to create route proc files\n");
#ifdef CONFIG_XFRM
        xfrm_init();
        xfrm4_init();
#endif
        rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);

#ifdef CONFIG_SYSCTL
        register_pernet_subsys(&sysctl_route_ops);
#endif
        register_pernet_subsys(&rt_genid_ops);
        register_pernet_subsys(&ipv4_inetpeer_ops);
        return rc;
}
2996
2997 #ifdef CONFIG_SYSCTL
/*
 * Register ipv4_route_table for init_net under "net/ipv4/route".  This is
 * a separate __init entry point rather than part of ip_rt_init().
 *
 * We really need to sanitize the damn ipv4 init order, then all
 * this nonsense will go away.
 */
void __init ip_static_sysctl_init(void)
{
        /* The returned header is not kept; the table is never unregistered here. */
        register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
}
3006 #endif