/*
 * Copyright (c) 2013 Nicira, Inc.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of version 2 of the GNU General Public
 * License as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
 * 02110-1301, USA
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/capability.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <linux/in.h>
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/if_arp.h>
#include <linux/mroute.h>
#include <linux/init.h>
#include <linux/in6.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/netfilter_ipv4.h>
#include <linux/etherdevice.h>
#include <linux/if_ether.h>
#include <linux/if_vlan.h>
#include <linux/rculist.h>
#include <linux/err.h>

#include <net/sock.h>
#include <net/ip.h>
#include <net/icmp.h>
#include <net/protocol.h>
#include <net/ip_tunnels.h>
#include <net/arp.h>
#include <net/checksum.h>
#include <net/dsfield.h>
#include <net/inet_ecn.h>
#include <net/xfrm.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>
#include <net/rtnetlink.h>
#include <net/udp.h>

#if IS_ENABLED(CONFIG_IPV6)
#include <net/ipv6.h>
#include <net/ip6_fib.h>
#include <net/ip6_route.h>
#endif

static unsigned int ip_tunnel_hash(__be32 key, __be32 remote)
{
        return hash_32((__force u32)key ^ (__force u32)remote,
                         IP_TNL_HASH_BITS);
}

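/* A tunnel configured with TUNNEL_KEY only accepts packets that carry
 * the same key; a keyless tunnel only accepts packets with no key.
 */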
static bool ip_tunnel_key_match(const struct ip_tunnel_parm *p,
                                __be16 flags, __be32 key)
{
        if (p->i_flags & TUNNEL_KEY) {
                if (flags & TUNNEL_KEY)
                        return key == p->i_key;
                else
                        /* key expected, none present */
                        return false;
        } else
                return !(flags & TUNNEL_KEY);
}

/* Fallback tunnel: no source, no destination, no key, no options

   Tunnel hash table:
   We require an exact key match, i.e. if a key is present in the packet
   it will match only a tunnel with the same key; if no key is present,
   it will match only a keyless tunnel.

   All keyless packets that do not match a configured keyless tunnel
   will match the fallback tunnel.
   Given src, dst and key, find the appropriate tunnel for input.
*/
struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn,
                                   int link, __be16 flags,
                                   __be32 remote, __be32 local,
                                   __be32 key)
{
        unsigned int hash;
        struct ip_tunnel *t, *cand = NULL;
        struct hlist_head *head;

        hash = ip_tunnel_hash(key, remote);
        head = &itn->tunnels[hash];

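        /* Pass 1: tunnels whose source and destination both match. */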
        hlist_for_each_entry_rcu(t, head, hash_node) {
                if (local != t->parms.iph.saddr ||
                    remote != t->parms.iph.daddr ||
                    !(t->dev->flags & IFF_UP))
                        continue;

                if (!ip_tunnel_key_match(&t->parms, flags, key))
                        continue;

                if (t->parms.link == link)
                        return t;
                else
                        cand = t;
        }

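        /* Pass 2: destination matches, tunnel source is a wildcard. */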
        hlist_for_each_entry_rcu(t, head, hash_node) {
                if (remote != t->parms.iph.daddr ||
                    t->parms.iph.saddr != 0 ||
                    !(t->dev->flags & IFF_UP))
                        continue;

                if (!ip_tunnel_key_match(&t->parms, flags, key))
                        continue;

                if (t->parms.link == link)
                        return t;
                else if (!cand)
                        cand = t;
        }

        hash = ip_tunnel_hash(key, 0);
        head = &itn->tunnels[hash];

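        /* Pass 3 (hashed with remote == 0): tunnels whose source is the
         * local address and whose destination is a wildcard, or whose
         * destination is the (multicast) local address.
         */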
        hlist_for_each_entry_rcu(t, head, hash_node) {
                if ((local != t->parms.iph.saddr || t->parms.iph.daddr != 0) &&
                    (local != t->parms.iph.daddr || !ipv4_is_multicast(local)))
                        continue;

                if (!(t->dev->flags & IFF_UP))
                        continue;

                if (!ip_tunnel_key_match(&t->parms, flags, key))
                        continue;

                if (t->parms.link == link)
                        return t;
                else if (!cand)
                        cand = t;
        }

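        /* Pass 4: fully wildcard tunnels; the key must still match
         * unless the caller passed TUNNEL_NO_KEY.
         */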
        hlist_for_each_entry_rcu(t, head, hash_node) {
                if ((!(flags & TUNNEL_NO_KEY) && t->parms.i_key != key) ||
                    t->parms.iph.saddr != 0 ||
                    t->parms.iph.daddr != 0 ||
                    !(t->dev->flags & IFF_UP))
                        continue;

                if (t->parms.link == link)
                        return t;
                else if (!cand)
                        cand = t;
        }

        if (cand)
                return cand;

        t = rcu_dereference(itn->collect_md_tun);
        if (t)
                return t;

        if (itn->fb_tunnel_dev && itn->fb_tunnel_dev->flags & IFF_UP)
                return netdev_priv(itn->fb_tunnel_dev);

        return NULL;
}
EXPORT_SYMBOL_GPL(ip_tunnel_lookup);

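/* Select the hash bucket for a tunnel: wildcard and multicast
 * destinations hash as zero, and VTI tunnels without TUNNEL_KEY
 * ignore i_key, matching the lookup passes above.
 */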
static struct hlist_head *ip_bucket(struct ip_tunnel_net *itn,
                                    struct ip_tunnel_parm *parms)
{
        unsigned int h;
        __be32 remote;
        __be32 i_key = parms->i_key;

        if (parms->iph.daddr && !ipv4_is_multicast(parms->iph.daddr))
                remote = parms->iph.daddr;
        else
                remote = 0;

        if (!(parms->i_flags & TUNNEL_KEY) && (parms->i_flags & VTI_ISVTI))
                i_key = 0;

        h = ip_tunnel_hash(i_key, remote);
        return &itn->tunnels[h];
}

static void ip_tunnel_add(struct ip_tunnel_net *itn, struct ip_tunnel *t)
{
        struct hlist_head *head = ip_bucket(itn, &t->parms);

        if (t->collect_md)
                rcu_assign_pointer(itn->collect_md_tun, t);
        hlist_add_head_rcu(&t->hash_node, head);
}

static void ip_tunnel_del(struct ip_tunnel_net *itn, struct ip_tunnel *t)
{
        if (t->collect_md)
                rcu_assign_pointer(itn->collect_md_tun, NULL);
        hlist_del_init_rcu(&t->hash_node);
}

static struct ip_tunnel *ip_tunnel_find(struct ip_tunnel_net *itn,
                                        struct ip_tunnel_parm *parms,
                                        int type)
{
        __be32 remote = parms->iph.daddr;
        __be32 local = parms->iph.saddr;
        __be32 key = parms->i_key;
        __be16 flags = parms->i_flags;
        int link = parms->link;
        struct ip_tunnel *t = NULL;
        struct hlist_head *head = ip_bucket(itn, parms);

        hlist_for_each_entry_rcu(t, head, hash_node) {
                if (local == t->parms.iph.saddr &&
                    remote == t->parms.iph.daddr &&
                    link == t->parms.link &&
                    type == t->dev->type &&
                    ip_tunnel_key_match(&t->parms, flags, key))
                        break;
        }
        return t;
}

static struct net_device *__ip_tunnel_create(struct net *net,
                                             const struct rtnl_link_ops *ops,
                                             struct ip_tunnel_parm *parms)
{
        int err;
        struct ip_tunnel *tunnel;
        struct net_device *dev;
        char name[IFNAMSIZ];

        err = -E2BIG;
        if (parms->name[0]) {
                if (!dev_valid_name(parms->name))
                        goto failed;
                strlcpy(name, parms->name, IFNAMSIZ);
        } else {
                if (strlen(ops->kind) > (IFNAMSIZ - 3))
                        goto failed;
                strcpy(name, ops->kind);
                strcat(name, "%d");
        }

        ASSERT_RTNL();
        dev = alloc_netdev(ops->priv_size, name, NET_NAME_UNKNOWN, ops->setup);
        if (!dev) {
                err = -ENOMEM;
                goto failed;
        }
        dev_net_set(dev, net);

        dev->rtnl_link_ops = ops;

        tunnel = netdev_priv(dev);
        tunnel->parms = *parms;
        tunnel->net = net;

        err = register_netdevice(dev);
        if (err)
                goto failed_free;

        return dev;

failed_free:
        free_netdev(dev);
failed:
        return ERR_PTR(err);
}

static inline void init_tunnel_flow(struct flowi4 *fl4,
                                    int proto,
                                    __be32 daddr, __be32 saddr,
                                    __be32 key, __u8 tos, int oif)
{
        memset(fl4, 0, sizeof(*fl4));
        fl4->flowi4_oif = oif;
        fl4->daddr = daddr;
        fl4->saddr = saddr;
        fl4->flowi4_tos = tos;
        fl4->flowi4_proto = proto;
        fl4->fl4_gre_key = key;
}

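/* Route toward the tunnel destination to guess the underlying device,
 * then derive needed_headroom and a suitable MTU from it.  Returns the
 * MTU the tunnel device should use.
 */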
static int ip_tunnel_bind_dev(struct net_device *dev)
{
        struct net_device *tdev = NULL;
        struct ip_tunnel *tunnel = netdev_priv(dev);
        const struct iphdr *iph;
        int hlen = LL_MAX_HEADER;
        int mtu = ETH_DATA_LEN;
        int t_hlen = tunnel->hlen + sizeof(struct iphdr);

        iph = &tunnel->parms.iph;

        /* Guess output device to choose reasonable mtu and needed_headroom */
        if (iph->daddr) {
                struct flowi4 fl4;
                struct rtable *rt;

                init_tunnel_flow(&fl4, iph->protocol, iph->daddr,
                                 iph->saddr, tunnel->parms.o_key,
                                 RT_TOS(iph->tos), tunnel->parms.link);
                rt = ip_route_output_key(tunnel->net, &fl4);

                if (!IS_ERR(rt)) {
                        tdev = rt->dst.dev;
                        ip_rt_put(rt);
                }
                if (dev->type != ARPHRD_ETHER)
                        dev->flags |= IFF_POINTOPOINT;

                dst_cache_reset(&tunnel->dst_cache);
        }

        if (!tdev && tunnel->parms.link)
                tdev = __dev_get_by_index(tunnel->net, tunnel->parms.link);

        if (tdev) {
                hlen = tdev->hard_header_len + tdev->needed_headroom;
                mtu = tdev->mtu;
        }

        dev->needed_headroom = t_hlen + hlen;
        mtu -= (dev->hard_header_len + t_hlen);

        if (mtu < IPV4_MIN_MTU)
                mtu = IPV4_MIN_MTU;

        return mtu;
}

static struct ip_tunnel *ip_tunnel_create(struct net *net,
                                          struct ip_tunnel_net *itn,
                                          struct ip_tunnel_parm *parms)
{
        struct ip_tunnel *nt;
        struct net_device *dev;

        BUG_ON(!itn->fb_tunnel_dev);
        dev = __ip_tunnel_create(net, itn->fb_tunnel_dev->rtnl_link_ops, parms);
        if (IS_ERR(dev))
                return ERR_CAST(dev);

        dev->mtu = ip_tunnel_bind_dev(dev);

        nt = netdev_priv(dev);
        ip_tunnel_add(itn, nt);
        return nt;
}

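/* Common receive path for IP tunnel drivers: check the checksum and
 * sequence-number expectations recorded in the tunnel's i_flags,
 * decapsulate ECN, update per-CPU stats and hand the skb to GRO cells.
 */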
int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb,
                  const struct tnl_ptk_info *tpi, struct metadata_dst *tun_dst,
                  bool log_ecn_error)
{
        struct pcpu_sw_netstats *tstats;
        const struct iphdr *iph = ip_hdr(skb);
        int err;

#ifdef CONFIG_NET_IPGRE_BROADCAST
        if (ipv4_is_multicast(iph->daddr)) {
                tunnel->dev->stats.multicast++;
                skb->pkt_type = PACKET_BROADCAST;
        }
#endif

        if ((!(tpi->flags&TUNNEL_CSUM) &&  (tunnel->parms.i_flags&TUNNEL_CSUM)) ||
             ((tpi->flags&TUNNEL_CSUM) && !(tunnel->parms.i_flags&TUNNEL_CSUM))) {
                tunnel->dev->stats.rx_crc_errors++;
                tunnel->dev->stats.rx_errors++;
                goto drop;
        }

        if (tunnel->parms.i_flags&TUNNEL_SEQ) {
                if (!(tpi->flags&TUNNEL_SEQ) ||
                    (tunnel->i_seqno && (s32)(ntohl(tpi->seq) - tunnel->i_seqno) < 0)) {
                        tunnel->dev->stats.rx_fifo_errors++;
                        tunnel->dev->stats.rx_errors++;
                        goto drop;
                }
                tunnel->i_seqno = ntohl(tpi->seq) + 1;
        }

        skb_reset_network_header(skb);

        err = IP_ECN_decapsulate(iph, skb);
        if (unlikely(err)) {
                if (log_ecn_error)
                        net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n",
                                        &iph->saddr, iph->tos);
                if (err > 1) {
                        ++tunnel->dev->stats.rx_frame_errors;
                        ++tunnel->dev->stats.rx_errors;
                        goto drop;
                }
        }

        tstats = this_cpu_ptr(tunnel->dev->tstats);
        u64_stats_update_begin(&tstats->syncp);
        tstats->rx_packets++;
        tstats->rx_bytes += skb->len;
        u64_stats_update_end(&tstats->syncp);

        skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(tunnel->dev)));

        if (tunnel->dev->type == ARPHRD_ETHER) {
                skb->protocol = eth_type_trans(skb, tunnel->dev);
                skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
        } else {
                skb->dev = tunnel->dev;
        }

        if (tun_dst)
                skb_dst_set(skb, (struct dst_entry *)tun_dst);

        gro_cells_receive(&tunnel->gro_cells, skb);
        return 0;

drop:
        kfree_skb(skb);
        return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_rcv);

static int ip_encap_hlen(struct ip_tunnel_encap *e)
{
        const struct ip_tunnel_encap_ops *ops;
        int hlen = -EINVAL;

        if (e->type == TUNNEL_ENCAP_NONE)
                return 0;

        if (e->type >= MAX_IPTUN_ENCAP_OPS)
                return -EINVAL;

        rcu_read_lock();
        ops = rcu_dereference(iptun_encaps[e->type]);
        if (likely(ops && ops->encap_hlen))
                hlen = ops->encap_hlen(e);
        rcu_read_unlock();

        return hlen;
}

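/* Registry of encapsulation handlers (e.g. FOU/GUE), indexed by encap
 * type.  Slots are claimed and released atomically via cmpxchg().
 */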
const struct ip_tunnel_encap_ops __rcu *
                iptun_encaps[MAX_IPTUN_ENCAP_OPS] __read_mostly;

int ip_tunnel_encap_add_ops(const struct ip_tunnel_encap_ops *ops,
                            unsigned int num)
{
        if (num >= MAX_IPTUN_ENCAP_OPS)
                return -ERANGE;

        return !cmpxchg((const struct ip_tunnel_encap_ops **)
                        &iptun_encaps[num],
                        NULL, ops) ? 0 : -1;
}
EXPORT_SYMBOL(ip_tunnel_encap_add_ops);

int ip_tunnel_encap_del_ops(const struct ip_tunnel_encap_ops *ops,
                            unsigned int num)
{
        int ret;

        if (num >= MAX_IPTUN_ENCAP_OPS)
                return -ERANGE;

        ret = (cmpxchg((const struct ip_tunnel_encap_ops **)
                       &iptun_encaps[num],
                       ops, NULL) == ops) ? 0 : -1;

        synchronize_net();

        return ret;
}
EXPORT_SYMBOL(ip_tunnel_encap_del_ops);

int ip_tunnel_encap_setup(struct ip_tunnel *t,
                          struct ip_tunnel_encap *ipencap)
{
        int hlen;

        memset(&t->encap, 0, sizeof(t->encap));

        hlen = ip_encap_hlen(ipencap);
        if (hlen < 0)
                return hlen;

        t->encap.type = ipencap->type;
        t->encap.sport = ipencap->sport;
        t->encap.dport = ipencap->dport;
        t->encap.flags = ipencap->flags;

        t->encap_hlen = hlen;
        t->hlen = t->encap_hlen + t->tun_hlen;

        return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_encap_setup);

int ip_tunnel_encap(struct sk_buff *skb, struct ip_tunnel *t,
                    u8 *protocol, struct flowi4 *fl4)
{
        const struct ip_tunnel_encap_ops *ops;
        int ret = -EINVAL;

        if (t->encap.type == TUNNEL_ENCAP_NONE)
                return 0;

        if (t->encap.type >= MAX_IPTUN_ENCAP_OPS)
                return -EINVAL;

        rcu_read_lock();
        ops = rcu_dereference(iptun_encaps[t->encap.type]);
        if (likely(ops && ops->build_header))
                ret = ops->build_header(skb, &t->encap, protocol, fl4);
        rcu_read_unlock();

        return ret;
}
EXPORT_SYMBOL(ip_tunnel_encap);

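/* Propagate the usable path MTU to the inner dst and, if the inner
 * IPv4 packet has DF set (or for IPv6 packets above the minimum MTU),
 * signal a fragmentation-needed/packet-too-big error back to the
 * sender when the packet no longer fits.
 */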
static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb,
                            struct rtable *rt, __be16 df,
                            const struct iphdr *inner_iph)
{
        struct ip_tunnel *tunnel = netdev_priv(dev);
        int pkt_size = skb->len - tunnel->hlen - dev->hard_header_len;
        int mtu;

        if (df)
                mtu = dst_mtu(&rt->dst) - dev->hard_header_len
                                        - sizeof(struct iphdr) - tunnel->hlen;
        else
                mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;

        if (skb_dst(skb))
                skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu);

        if (skb->protocol == htons(ETH_P_IP)) {
                if (!skb_is_gso(skb) &&
                    (inner_iph->frag_off & htons(IP_DF)) &&
                    mtu < pkt_size) {
                        memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
                        icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
                        return -E2BIG;
                }
        }
#if IS_ENABLED(CONFIG_IPV6)
        else if (skb->protocol == htons(ETH_P_IPV6)) {
                struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb);

                if (rt6 && mtu < dst_mtu(skb_dst(skb)) &&
                           mtu >= IPV6_MIN_MTU) {
                        if ((tunnel->parms.iph.daddr &&
                            !ipv4_is_multicast(tunnel->parms.iph.daddr)) ||
                            rt6->rt6i_dst.plen == 128) {
                                rt6->rt6i_flags |= RTF_MODIFIED;
                                dst_metric_set(skb_dst(skb), RTAX_MTU, mtu);
                        }
                }

                if (!skb_is_gso(skb) && mtu >= IPV6_MIN_MTU &&
                                        mtu < pkt_size) {
                        icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
                        return -E2BIG;
                }
        }
#endif
        return 0;
}

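/* Common transmit path for IP tunnel drivers: resolve the outer
 * destination (including NBMA tunnels with no fixed daddr), route the
 * outer packet, enforce PMTU, pick ToS/TTL/DF, and hand the result to
 * iptunnel_xmit().
 */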
void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
                    const struct iphdr *tnl_params, u8 protocol)
{
        struct ip_tunnel *tunnel = netdev_priv(dev);
        unsigned int inner_nhdr_len = 0;
        const struct iphdr *inner_iph;
        struct flowi4 fl4;
        u8     tos, ttl;
        __be16 df;
        struct rtable *rt;              /* Route to the other host */
        unsigned int max_headroom;      /* The extra header space needed */
        __be32 dst;
        int err;
        bool connected;

        /* ensure we can access the inner net header, for several users below */
        if (skb->protocol == htons(ETH_P_IP))
                inner_nhdr_len = sizeof(struct iphdr);
        else if (skb->protocol == htons(ETH_P_IPV6))
                inner_nhdr_len = sizeof(struct ipv6hdr);
        if (unlikely(!pskb_may_pull(skb, inner_nhdr_len)))
                goto tx_error;

        inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
        connected = (tunnel->parms.iph.daddr != 0);

        memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));

        dst = tnl_params->daddr;
        if (dst == 0) {
                /* NBMA tunnel */

                if (!skb_dst(skb)) {
                        dev->stats.tx_fifo_errors++;
                        goto tx_error;
                }

                if (skb->protocol == htons(ETH_P_IP)) {
                        rt = skb_rtable(skb);
                        dst = rt_nexthop(rt, inner_iph->daddr);
                }
#if IS_ENABLED(CONFIG_IPV6)
                else if (skb->protocol == htons(ETH_P_IPV6)) {
                        const struct in6_addr *addr6;
                        struct neighbour *neigh;
                        bool do_tx_error_icmp;
                        int addr_type;

                        neigh = dst_neigh_lookup(skb_dst(skb),
                                                 &ipv6_hdr(skb)->daddr);
                        if (!neigh)
                                goto tx_error;

                        addr6 = (const struct in6_addr *)&neigh->primary_key;
                        addr_type = ipv6_addr_type(addr6);

                        if (addr_type == IPV6_ADDR_ANY) {
                                addr6 = &ipv6_hdr(skb)->daddr;
                                addr_type = ipv6_addr_type(addr6);
                        }

                        if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
                                do_tx_error_icmp = true;
                        else {
                                do_tx_error_icmp = false;
                                dst = addr6->s6_addr32[3];
                        }
                        neigh_release(neigh);
                        if (do_tx_error_icmp)
                                goto tx_error_icmp;
                }
#endif
                else
                        goto tx_error;

                connected = false;
        }

        tos = tnl_params->tos;
        if (tos & 0x1) {
                tos &= ~0x1;
                if (skb->protocol == htons(ETH_P_IP)) {
                        tos = inner_iph->tos;
                        connected = false;
                } else if (skb->protocol == htons(ETH_P_IPV6)) {
                        tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
                        connected = false;
                }
        }

        init_tunnel_flow(&fl4, protocol, dst, tnl_params->saddr,
                         tunnel->parms.o_key, RT_TOS(tos), tunnel->parms.link);

        if (ip_tunnel_encap(skb, tunnel, &protocol, &fl4) < 0)
                goto tx_error;

        rt = connected ? dst_cache_get_ip4(&tunnel->dst_cache, &fl4.saddr) :
                         NULL;

        if (!rt) {
                rt = ip_route_output_key(tunnel->net, &fl4);

                if (IS_ERR(rt)) {
                        dev->stats.tx_carrier_errors++;
                        goto tx_error;
                }
                if (connected)
                        dst_cache_set_ip4(&tunnel->dst_cache, &rt->dst,
                                          fl4.saddr);
        }

        if (rt->dst.dev == dev) {
                ip_rt_put(rt);
                dev->stats.collisions++;
                goto tx_error;
        }

        if (tnl_update_pmtu(dev, skb, rt, tnl_params->frag_off, inner_iph)) {
                ip_rt_put(rt);
                goto tx_error;
        }

        if (tunnel->err_count > 0) {
                if (time_before(jiffies,
                                tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
                        tunnel->err_count--;

                        dst_link_failure(skb);
                } else
                        tunnel->err_count = 0;
        }

        tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
        ttl = tnl_params->ttl;
        if (ttl == 0) {
                if (skb->protocol == htons(ETH_P_IP))
                        ttl = inner_iph->ttl;
#if IS_ENABLED(CONFIG_IPV6)
                else if (skb->protocol == htons(ETH_P_IPV6))
                        ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
#endif
                else
                        ttl = ip4_dst_hoplimit(&rt->dst);
        }

        df = tnl_params->frag_off;
        if (skb->protocol == htons(ETH_P_IP))
                df |= (inner_iph->frag_off&htons(IP_DF));

        max_headroom = LL_RESERVED_SPACE(rt->dst.dev) + sizeof(struct iphdr)
                        + rt->dst.header_len + ip_encap_hlen(&tunnel->encap);
        if (max_headroom > dev->needed_headroom)
                dev->needed_headroom = max_headroom;

        if (skb_cow_head(skb, dev->needed_headroom)) {
                ip_rt_put(rt);
                dev->stats.tx_dropped++;
                kfree_skb(skb);
                return;
        }

        err = iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, protocol,
                            tos, ttl, df, !net_eq(tunnel->net, dev_net(dev)));
        iptunnel_xmit_stats(err, &dev->stats, dev->tstats);

        return;

#if IS_ENABLED(CONFIG_IPV6)
tx_error_icmp:
        dst_link_failure(skb);
#endif
tx_error:
        dev->stats.tx_errors++;
        kfree_skb(skb);
}
EXPORT_SYMBOL_GPL(ip_tunnel_xmit);

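/* Apply new parameters to an existing tunnel.  The tunnel must be
 * removed and re-added because the addresses and key select its hash
 * bucket.
 */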
static void ip_tunnel_update(struct ip_tunnel_net *itn,
                             struct ip_tunnel *t,
                             struct net_device *dev,
                             struct ip_tunnel_parm *p,
                             bool set_mtu)
{
        ip_tunnel_del(itn, t);
        t->parms.iph.saddr = p->iph.saddr;
        t->parms.iph.daddr = p->iph.daddr;
        t->parms.i_key = p->i_key;
        t->parms.o_key = p->o_key;
        if (dev->type != ARPHRD_ETHER) {
                memcpy(dev->dev_addr, &p->iph.saddr, 4);
                memcpy(dev->broadcast, &p->iph.daddr, 4);
        }
        ip_tunnel_add(itn, t);

        t->parms.iph.ttl = p->iph.ttl;
        t->parms.iph.tos = p->iph.tos;
        t->parms.iph.frag_off = p->iph.frag_off;

        if (t->parms.link != p->link) {
                int mtu;

                t->parms.link = p->link;
                mtu = ip_tunnel_bind_dev(dev);
                if (set_mtu)
                        dev->mtu = mtu;
        }
        dst_cache_reset(&t->dst_cache);
        netdev_state_change(dev);
}

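/* Back end for the SIOCGETTUNNEL/SIOCADDTUNNEL/SIOCCHGTUNNEL/
 * SIOCDELTUNNEL ioctls.  Add, change and delete require CAP_NET_ADMIN
 * in the tunnel's user namespace.
 */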
int ip_tunnel_ioctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd)
{
        int err = 0;
        struct ip_tunnel *t = netdev_priv(dev);
        struct net *net = t->net;
        struct ip_tunnel_net *itn = net_generic(net, t->ip_tnl_net_id);

        BUG_ON(!itn->fb_tunnel_dev);
        switch (cmd) {
        case SIOCGETTUNNEL:
                if (dev == itn->fb_tunnel_dev) {
                        t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
                        if (!t)
                                t = netdev_priv(dev);
                }
                memcpy(p, &t->parms, sizeof(*p));
                break;

        case SIOCADDTUNNEL:
        case SIOCCHGTUNNEL:
                err = -EPERM;
                if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
                        goto done;
                if (p->iph.ttl)
                        p->iph.frag_off |= htons(IP_DF);
                if (!(p->i_flags & VTI_ISVTI)) {
                        if (!(p->i_flags & TUNNEL_KEY))
                                p->i_key = 0;
                        if (!(p->o_flags & TUNNEL_KEY))
                                p->o_key = 0;
                }

                t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);

                if (cmd == SIOCADDTUNNEL) {
                        if (!t) {
                                t = ip_tunnel_create(net, itn, p);
                                err = PTR_ERR_OR_ZERO(t);
                                break;
                        }

                        err = -EEXIST;
                        break;
                }
                if (dev != itn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
                        if (t) {
                                if (t->dev != dev) {
                                        err = -EEXIST;
                                        break;
                                }
                        } else {
                                unsigned int nflags = 0;

                                if (ipv4_is_multicast(p->iph.daddr))
                                        nflags = IFF_BROADCAST;
                                else if (p->iph.daddr)
                                        nflags = IFF_POINTOPOINT;

                                if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
                                        err = -EINVAL;
                                        break;
                                }

                                t = netdev_priv(dev);
                        }
                }

                if (t) {
                        err = 0;
                        ip_tunnel_update(itn, t, dev, p, true);
                } else {
                        err = -ENOENT;
                }
                break;

        case SIOCDELTUNNEL:
                err = -EPERM;
                if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
                        goto done;

                if (dev == itn->fb_tunnel_dev) {
                        err = -ENOENT;
                        t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
                        if (!t)
                                goto done;
                        err = -EPERM;
                        if (t == netdev_priv(itn->fb_tunnel_dev))
                                goto done;
                        dev = t->dev;
                }
                unregister_netdevice(dev);
                err = 0;
                break;

        default:
                err = -EINVAL;
        }

done:
        return err;
}
EXPORT_SYMBOL_GPL(ip_tunnel_ioctl);

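/* 0xFFF8 is the largest IP datagram length that is a multiple of the
 * 8-byte fragment-offset unit; subtracting the link and tunnel header
 * sizes leaves the largest inner MTU the tunnel can carry.
 */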
int __ip_tunnel_change_mtu(struct net_device *dev, int new_mtu, bool strict)
{
        struct ip_tunnel *tunnel = netdev_priv(dev);
        int t_hlen = tunnel->hlen + sizeof(struct iphdr);
        int max_mtu = 0xFFF8 - dev->hard_header_len - t_hlen;

        if (new_mtu < 68)
                return -EINVAL;

        if (new_mtu > max_mtu) {
                if (strict)
                        return -EINVAL;

                new_mtu = max_mtu;
        }

        dev->mtu = new_mtu;
        return 0;
}
EXPORT_SYMBOL_GPL(__ip_tunnel_change_mtu);

int ip_tunnel_change_mtu(struct net_device *dev, int new_mtu)
{
        return __ip_tunnel_change_mtu(dev, new_mtu, true);
}
EXPORT_SYMBOL_GPL(ip_tunnel_change_mtu);

static void ip_tunnel_dev_free(struct net_device *dev)
{
        struct ip_tunnel *tunnel = netdev_priv(dev);

        gro_cells_destroy(&tunnel->gro_cells);
        dst_cache_destroy(&tunnel->dst_cache);
        free_percpu(dev->tstats);
        free_netdev(dev);
}

void ip_tunnel_dellink(struct net_device *dev, struct list_head *head)
{
        struct ip_tunnel *tunnel = netdev_priv(dev);
        struct ip_tunnel_net *itn;

        itn = net_generic(tunnel->net, tunnel->ip_tnl_net_id);

        if (itn->fb_tunnel_dev != dev) {
                ip_tunnel_del(itn, netdev_priv(dev));
                unregister_netdevice_queue(dev, head);
        }
}
EXPORT_SYMBOL_GPL(ip_tunnel_dellink);

struct net *ip_tunnel_get_link_net(const struct net_device *dev)
{
        struct ip_tunnel *tunnel = netdev_priv(dev);

        return tunnel->net;
}
EXPORT_SYMBOL(ip_tunnel_get_link_net);

int ip_tunnel_get_iflink(const struct net_device *dev)
{
        struct ip_tunnel *tunnel = netdev_priv(dev);

        return tunnel->parms.link;
}
EXPORT_SYMBOL(ip_tunnel_get_iflink);

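/* Per-netns setup: initialize the hash table and, when @ops is given,
 * create the fallback tunnel device for this tunnel type.
 */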
int ip_tunnel_init_net(struct net *net, int ip_tnl_net_id,
                                  struct rtnl_link_ops *ops, char *devname)
{
        struct ip_tunnel_net *itn = net_generic(net, ip_tnl_net_id);
        struct ip_tunnel_parm parms;
        unsigned int i;

        for (i = 0; i < IP_TNL_HASH_SIZE; i++)
                INIT_HLIST_HEAD(&itn->tunnels[i]);

        if (!ops) {
                itn->fb_tunnel_dev = NULL;
                return 0;
        }

        memset(&parms, 0, sizeof(parms));
        if (devname)
                strlcpy(parms.name, devname, IFNAMSIZ);

        rtnl_lock();
        itn->fb_tunnel_dev = __ip_tunnel_create(net, ops, &parms);
        /* FB netdevice is special: we have one, and only one per netns.
         * Allowing it to be moved to another netns is clearly unsafe.
         */
        if (!IS_ERR(itn->fb_tunnel_dev)) {
                itn->fb_tunnel_dev->features |= NETIF_F_NETNS_LOCAL;
                itn->fb_tunnel_dev->mtu = ip_tunnel_bind_dev(itn->fb_tunnel_dev);
                ip_tunnel_add(itn, netdev_priv(itn->fb_tunnel_dev));
        }
        rtnl_unlock();

        return PTR_ERR_OR_ZERO(itn->fb_tunnel_dev);
}
EXPORT_SYMBOL_GPL(ip_tunnel_init_net);

static void ip_tunnel_destroy(struct ip_tunnel_net *itn, struct list_head *head,
                              struct rtnl_link_ops *ops)
{
        struct net *net = dev_net(itn->fb_tunnel_dev);
        struct net_device *dev, *aux;
        int h;

        for_each_netdev_safe(net, dev, aux)
                if (dev->rtnl_link_ops == ops)
                        unregister_netdevice_queue(dev, head);

        for (h = 0; h < IP_TNL_HASH_SIZE; h++) {
                struct ip_tunnel *t;
                struct hlist_node *n;
                struct hlist_head *thead = &itn->tunnels[h];

                hlist_for_each_entry_safe(t, n, thead, hash_node)
                        /* If dev is in the same netns, it has already
                         * been added to the list by the previous loop.
                         */
                        if (!net_eq(dev_net(t->dev), net))
                                unregister_netdevice_queue(t->dev, head);
        }
}

void ip_tunnel_delete_net(struct ip_tunnel_net *itn, struct rtnl_link_ops *ops)
{
        LIST_HEAD(list);

        rtnl_lock();
        ip_tunnel_destroy(itn, &list, ops);
        unregister_netdevice_many(&list);
        rtnl_unlock();
}
EXPORT_SYMBOL_GPL(ip_tunnel_delete_net);

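/* rtnl newlink handler shared by the tunnel drivers: reject duplicate
 * tunnels, register the device and insert it into the hash table.
 */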
int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[],
                      struct ip_tunnel_parm *p)
{
        struct ip_tunnel *nt;
        struct net *net = dev_net(dev);
        struct ip_tunnel_net *itn;
        int mtu;
        int err;

        nt = netdev_priv(dev);
        itn = net_generic(net, nt->ip_tnl_net_id);

        if (nt->collect_md) {
                if (rtnl_dereference(itn->collect_md_tun))
                        return -EEXIST;
        } else {
                if (ip_tunnel_find(itn, p, dev->type))
                        return -EEXIST;
        }

        nt->net = net;
        nt->parms = *p;
        err = register_netdevice(dev);
        if (err)
                goto out;

        if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
                eth_hw_addr_random(dev);

        mtu = ip_tunnel_bind_dev(dev);
        if (!tb[IFLA_MTU])
                dev->mtu = mtu;

        ip_tunnel_add(itn, nt);
out:
        return err;
}
EXPORT_SYMBOL_GPL(ip_tunnel_newlink);

int ip_tunnel_changelink(struct net_device *dev, struct nlattr *tb[],
                         struct ip_tunnel_parm *p)
{
        struct ip_tunnel *t;
        struct ip_tunnel *tunnel = netdev_priv(dev);
        struct net *net = tunnel->net;
        struct ip_tunnel_net *itn = net_generic(net, tunnel->ip_tnl_net_id);

        if (dev == itn->fb_tunnel_dev)
                return -EINVAL;

        t = ip_tunnel_find(itn, p, dev->type);

        if (t) {
                if (t->dev != dev)
                        return -EEXIST;
        } else {
                t = tunnel;

                if (dev->type != ARPHRD_ETHER) {
                        unsigned int nflags = 0;

                        if (ipv4_is_multicast(p->iph.daddr))
                                nflags = IFF_BROADCAST;
                        else if (p->iph.daddr)
                                nflags = IFF_POINTOPOINT;

                        if ((dev->flags ^ nflags) &
                            (IFF_POINTOPOINT | IFF_BROADCAST))
                                return -EINVAL;
                }
        }

        ip_tunnel_update(itn, t, dev, p, !tb[IFLA_MTU]);
        return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_changelink);

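/* ndo_init handler shared by the tunnel drivers: allocate per-CPU
 * stats, the dst cache and GRO cells, and prime the outer IPv4 header.
 */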
int ip_tunnel_init(struct net_device *dev)
{
        struct ip_tunnel *tunnel = netdev_priv(dev);
        struct iphdr *iph = &tunnel->parms.iph;
        int err;

        dev->destructor = ip_tunnel_dev_free;
        dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
        if (!dev->tstats)
                return -ENOMEM;

        err = dst_cache_init(&tunnel->dst_cache, GFP_KERNEL);
        if (err) {
                free_percpu(dev->tstats);
                return err;
        }

        err = gro_cells_init(&tunnel->gro_cells, dev);
        if (err) {
                dst_cache_destroy(&tunnel->dst_cache);
                free_percpu(dev->tstats);
                return err;
        }

        tunnel->dev = dev;
        tunnel->net = dev_net(dev);
        strcpy(tunnel->parms.name, dev->name);
        iph->version            = 4;
        iph->ihl                = 5;

        if (tunnel->collect_md)
                netif_keep_dst(dev);
        return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_init);

void ip_tunnel_uninit(struct net_device *dev)
{
        struct ip_tunnel *tunnel = netdev_priv(dev);
        struct net *net = tunnel->net;
        struct ip_tunnel_net *itn;

        itn = net_generic(net, tunnel->ip_tnl_net_id);
        /* fb_tunnel_dev will be unregistered in the net-exit call. */
        if (itn->fb_tunnel_dev != dev)
                ip_tunnel_del(itn, netdev_priv(dev));

        dst_cache_reset(&tunnel->dst_cache);
}
EXPORT_SYMBOL_GPL(ip_tunnel_uninit);

/* Do the least required initialization; the rest is done in the
 * tunnel_init call.
 */
void ip_tunnel_setup(struct net_device *dev, int net_id)
{
        struct ip_tunnel *tunnel = netdev_priv(dev);

        tunnel->ip_tnl_net_id = net_id;
}
EXPORT_SYMBOL_GPL(ip_tunnel_setup);

MODULE_LICENSE("GPL");