// SPDX-License-Identifier: GPL-2.0-only
/*
 * net/ipv4/ip_tunnel.c
 *
 * Copyright (c) 2013 Nicira, Inc.
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/capability.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <linux/in.h>
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/if_arp.h>
#include <linux/init.h>
#include <linux/in6.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/netfilter_ipv4.h>
#include <linux/etherdevice.h>
#include <linux/if_ether.h>
#include <linux/if_vlan.h>
#include <linux/rculist.h>
#include <linux/err.h>

#include <net/sock.h>
#include <net/ip.h>
#include <net/icmp.h>
#include <net/protocol.h>
#include <net/ip_tunnels.h>
#include <net/arp.h>
#include <net/checksum.h>
#include <net/dsfield.h>
#include <net/inet_ecn.h>
#include <net/xfrm.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>
#include <net/rtnetlink.h>
#include <net/udp.h>
#include <net/dst_metadata.h>

#if IS_ENABLED(CONFIG_IPV6)
#include <net/ipv6.h>
#include <net/ip6_fib.h>
#include <net/ip6_route.h>
#endif

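/* Hash tunnels into IP_TNL_HASH_BITS-sized buckets by key and remote
 * address, so a lookup normally touches a single bucket.
 */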
static unsigned int ip_tunnel_hash(__be32 key, __be32 remote)
{
        return hash_32((__force u32)key ^ (__force u32)remote,
                       IP_TNL_HASH_BITS);
}

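/* A tunnel configured with TUNNEL_KEY matches only packets carrying the
 * same key; a keyless tunnel matches only keyless packets.
 */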
static bool ip_tunnel_key_match(const struct ip_tunnel_parm *p,
                                __be16 flags, __be32 key)
{
        if (p->i_flags & TUNNEL_KEY) {
                if (flags & TUNNEL_KEY)
                        return key == p->i_key;
                else
                        /* key expected, none present */
                        return false;
        } else
                return !(flags & TUNNEL_KEY);
}

/* Fallback tunnel: no source, no destination, no key, no options

   Tunnel hash table:
   We require an exact key match i.e. if a key is present in the packet
   it will match only a tunnel with the same key; if it is not present,
   it will match only a keyless tunnel.

   All keyless packets, if not matching a configured keyless tunnel,
   will match the fallback tunnel.
   Given src, dst and key, find the appropriate tunnel for input.
*/
struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn,
                                   int link, __be16 flags,
                                   __be32 remote, __be32 local,
                                   __be32 key)
{
        unsigned int hash;
        struct ip_tunnel *t, *cand = NULL;
        struct hlist_head *head;

        hash = ip_tunnel_hash(key, remote);
        head = &itn->tunnels[hash];

        hlist_for_each_entry_rcu(t, head, hash_node) {
                if (local != t->parms.iph.saddr ||
                    remote != t->parms.iph.daddr ||
                    !(t->dev->flags & IFF_UP))
                        continue;

                if (!ip_tunnel_key_match(&t->parms, flags, key))
                        continue;

                if (t->parms.link == link)
                        return t;
                else
                        cand = t;
        }

        hlist_for_each_entry_rcu(t, head, hash_node) {
                if (remote != t->parms.iph.daddr ||
                    t->parms.iph.saddr != 0 ||
                    !(t->dev->flags & IFF_UP))
                        continue;

                if (!ip_tunnel_key_match(&t->parms, flags, key))
                        continue;

                if (t->parms.link == link)
                        return t;
                else if (!cand)
                        cand = t;
        }

        hash = ip_tunnel_hash(key, 0);
        head = &itn->tunnels[hash];

        hlist_for_each_entry_rcu(t, head, hash_node) {
                if ((local != t->parms.iph.saddr || t->parms.iph.daddr != 0) &&
                    (local != t->parms.iph.daddr || !ipv4_is_multicast(local)))
                        continue;

                if (!(t->dev->flags & IFF_UP))
                        continue;

                if (!ip_tunnel_key_match(&t->parms, flags, key))
                        continue;

                if (t->parms.link == link)
                        return t;
                else if (!cand)
                        cand = t;
        }

        hlist_for_each_entry_rcu(t, head, hash_node) {
                if ((!(flags & TUNNEL_NO_KEY) && t->parms.i_key != key) ||
                    t->parms.iph.saddr != 0 ||
                    t->parms.iph.daddr != 0 ||
                    !(t->dev->flags & IFF_UP))
                        continue;

                if (t->parms.link == link)
                        return t;
                else if (!cand)
                        cand = t;
        }

        if (cand)
                return cand;

        t = rcu_dereference(itn->collect_md_tun);
        if (t && t->dev->flags & IFF_UP)
                return t;

        if (itn->fb_tunnel_dev && itn->fb_tunnel_dev->flags & IFF_UP)
                return netdev_priv(itn->fb_tunnel_dev);

        return NULL;
}
EXPORT_SYMBOL_GPL(ip_tunnel_lookup);

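/* Select the hash bucket for a tunnel from its parameters.  Multicast
 * destinations hash as remote 0, and VTI tunnels without TUNNEL_KEY
 * ignore i_key, mirroring the lookup rules above.
 */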
static struct hlist_head *ip_bucket(struct ip_tunnel_net *itn,
                                    struct ip_tunnel_parm *parms)
{
        unsigned int h;
        __be32 remote;
        __be32 i_key = parms->i_key;

        if (parms->iph.daddr && !ipv4_is_multicast(parms->iph.daddr))
                remote = parms->iph.daddr;
        else
                remote = 0;

        if (!(parms->i_flags & TUNNEL_KEY) && (parms->i_flags & VTI_ISVTI))
                i_key = 0;

        h = ip_tunnel_hash(i_key, remote);
        return &itn->tunnels[h];
}

static void ip_tunnel_add(struct ip_tunnel_net *itn, struct ip_tunnel *t)
{
        struct hlist_head *head = ip_bucket(itn, &t->parms);

        if (t->collect_md)
                rcu_assign_pointer(itn->collect_md_tun, t);
        hlist_add_head_rcu(&t->hash_node, head);
}

static void ip_tunnel_del(struct ip_tunnel_net *itn, struct ip_tunnel *t)
{
        if (t->collect_md)
                rcu_assign_pointer(itn->collect_md_tun, NULL);
        hlist_del_init_rcu(&t->hash_node);
}

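/* Find a tunnel whose parameters (addresses, link, device type and key)
 * exactly match @parms, as opposed to the best-effort packet lookup
 * done by ip_tunnel_lookup().
 */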
static struct ip_tunnel *ip_tunnel_find(struct ip_tunnel_net *itn,
                                        struct ip_tunnel_parm *parms,
                                        int type)
{
        __be32 remote = parms->iph.daddr;
        __be32 local = parms->iph.saddr;
        __be32 key = parms->i_key;
        __be16 flags = parms->i_flags;
        int link = parms->link;
        struct ip_tunnel *t = NULL;
        struct hlist_head *head = ip_bucket(itn, parms);

        hlist_for_each_entry_rcu(t, head, hash_node) {
                if (local == t->parms.iph.saddr &&
                    remote == t->parms.iph.daddr &&
                    link == t->parms.link &&
                    type == t->dev->type &&
                    ip_tunnel_key_match(&t->parms, flags, key))
                        break;
        }
        return t;
}

static struct net_device *__ip_tunnel_create(struct net *net,
                                             const struct rtnl_link_ops *ops,
                                             struct ip_tunnel_parm *parms)
{
        int err;
        struct ip_tunnel *tunnel;
        struct net_device *dev;
        char name[IFNAMSIZ];

        err = -E2BIG;
        if (parms->name[0]) {
                if (!dev_valid_name(parms->name))
                        goto failed;
                strlcpy(name, parms->name, IFNAMSIZ);
        } else {
                if (strlen(ops->kind) > (IFNAMSIZ - 3))
                        goto failed;
                strcpy(name, ops->kind);
                strcat(name, "%d");
        }

        ASSERT_RTNL();
        dev = alloc_netdev(ops->priv_size, name, NET_NAME_UNKNOWN, ops->setup);
        if (!dev) {
                err = -ENOMEM;
                goto failed;
        }
        dev_net_set(dev, net);

        dev->rtnl_link_ops = ops;

        tunnel = netdev_priv(dev);
        tunnel->parms = *parms;
        tunnel->net = net;

        err = register_netdevice(dev);
        if (err)
                goto failed_free;

        return dev;

failed_free:
        free_netdev(dev);
failed:
        return ERR_PTR(err);
}

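/* Route to the tunnel destination (or fall back to the bound link) to
 * guess the underlying device, then derive needed_headroom and return
 * a suitable MTU, clamped to at least IPV4_MIN_MTU.
 */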
static int ip_tunnel_bind_dev(struct net_device *dev)
{
        struct net_device *tdev = NULL;
        struct ip_tunnel *tunnel = netdev_priv(dev);
        const struct iphdr *iph;
        int hlen = LL_MAX_HEADER;
        int mtu = ETH_DATA_LEN;
        int t_hlen = tunnel->hlen + sizeof(struct iphdr);

        iph = &tunnel->parms.iph;

        /* Guess output device to choose reasonable mtu and needed_headroom */
        if (iph->daddr) {
                struct flowi4 fl4;
                struct rtable *rt;

                ip_tunnel_init_flow(&fl4, iph->protocol, iph->daddr,
                                    iph->saddr, tunnel->parms.o_key,
                                    RT_TOS(iph->tos), tunnel->parms.link,
                                    tunnel->fwmark, 0);
                rt = ip_route_output_key(tunnel->net, &fl4);

                if (!IS_ERR(rt)) {
                        tdev = rt->dst.dev;
                        ip_rt_put(rt);
                }
                if (dev->type != ARPHRD_ETHER)
                        dev->flags |= IFF_POINTOPOINT;

                dst_cache_reset(&tunnel->dst_cache);
        }

        if (!tdev && tunnel->parms.link)
                tdev = __dev_get_by_index(tunnel->net, tunnel->parms.link);

        if (tdev) {
                hlen = tdev->hard_header_len + tdev->needed_headroom;
                mtu = min(tdev->mtu, IP_MAX_MTU);
        }

        dev->needed_headroom = t_hlen + hlen;
        mtu -= (dev->hard_header_len + t_hlen);

        if (mtu < IPV4_MIN_MTU)
                mtu = IPV4_MIN_MTU;

        return mtu;
}

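/* Create and register a new tunnel device for @parms and insert it
 * into the per-netns hash table.  Returns the tunnel or an ERR_PTR.
 */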
static struct ip_tunnel *ip_tunnel_create(struct net *net,
                                          struct ip_tunnel_net *itn,
                                          struct ip_tunnel_parm *parms)
{
        struct ip_tunnel *nt;
        struct net_device *dev;
        int t_hlen;
        int mtu;
        int err;

        dev = __ip_tunnel_create(net, itn->rtnl_link_ops, parms);
        if (IS_ERR(dev))
                return ERR_CAST(dev);

        mtu = ip_tunnel_bind_dev(dev);
        err = dev_set_mtu(dev, mtu);
        if (err)
                goto err_dev_set_mtu;

        nt = netdev_priv(dev);
        t_hlen = nt->hlen + sizeof(struct iphdr);
        dev->min_mtu = ETH_MIN_MTU;
        dev->max_mtu = IP_MAX_MTU - dev->hard_header_len - t_hlen;
        ip_tunnel_add(itn, nt);
        return nt;

err_dev_set_mtu:
        unregister_netdevice(dev);
        return ERR_PTR(err);
}

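/* Common receive path for IP tunnels: validate checksum/sequence flags
 * against the tunnel configuration, decapsulate ECN, update stats and
 * hand the packet to GRO.  Consumes @skb (and @tun_dst on the drop
 * path) and always returns 0.
 */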
int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb,
                  const struct tnl_ptk_info *tpi, struct metadata_dst *tun_dst,
                  bool log_ecn_error)
{
        struct pcpu_sw_netstats *tstats;
        const struct iphdr *iph = ip_hdr(skb);
        int err;

#ifdef CONFIG_NET_IPGRE_BROADCAST
        if (ipv4_is_multicast(iph->daddr)) {
                tunnel->dev->stats.multicast++;
                skb->pkt_type = PACKET_BROADCAST;
        }
#endif

        if ((!(tpi->flags&TUNNEL_CSUM) &&  (tunnel->parms.i_flags&TUNNEL_CSUM)) ||
             ((tpi->flags&TUNNEL_CSUM) && !(tunnel->parms.i_flags&TUNNEL_CSUM))) {
                tunnel->dev->stats.rx_crc_errors++;
                tunnel->dev->stats.rx_errors++;
                goto drop;
        }

        if (tunnel->parms.i_flags&TUNNEL_SEQ) {
                if (!(tpi->flags&TUNNEL_SEQ) ||
                    (tunnel->i_seqno && (s32)(ntohl(tpi->seq) - tunnel->i_seqno) < 0)) {
                        tunnel->dev->stats.rx_fifo_errors++;
                        tunnel->dev->stats.rx_errors++;
                        goto drop;
                }
                tunnel->i_seqno = ntohl(tpi->seq) + 1;
        }

        skb_reset_network_header(skb);

        err = IP_ECN_decapsulate(iph, skb);
        if (unlikely(err)) {
                if (log_ecn_error)
                        net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n",
                                        &iph->saddr, iph->tos);
                if (err > 1) {
                        ++tunnel->dev->stats.rx_frame_errors;
                        ++tunnel->dev->stats.rx_errors;
                        goto drop;
                }
        }

        tstats = this_cpu_ptr(tunnel->dev->tstats);
        u64_stats_update_begin(&tstats->syncp);
        tstats->rx_packets++;
        tstats->rx_bytes += skb->len;
        u64_stats_update_end(&tstats->syncp);

        skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(tunnel->dev)));

        if (tunnel->dev->type == ARPHRD_ETHER) {
                skb->protocol = eth_type_trans(skb, tunnel->dev);
                skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
        } else {
                skb->dev = tunnel->dev;
        }

        if (tun_dst)
                skb_dst_set(skb, (struct dst_entry *)tun_dst);

        gro_cells_receive(&tunnel->gro_cells, skb);
        return 0;

drop:
        if (tun_dst)
                dst_release((struct dst_entry *)tun_dst);
        kfree_skb(skb);
        return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_rcv);

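/* Registration of encapsulation handlers (e.g. FOU/GUE) in the global
 * iptun_encaps[] array, one slot per encap type.
 */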
int ip_tunnel_encap_add_ops(const struct ip_tunnel_encap_ops *ops,
                            unsigned int num)
{
        if (num >= MAX_IPTUN_ENCAP_OPS)
                return -ERANGE;

        return !cmpxchg((const struct ip_tunnel_encap_ops **)
                        &iptun_encaps[num],
                        NULL, ops) ? 0 : -1;
}
EXPORT_SYMBOL(ip_tunnel_encap_add_ops);

int ip_tunnel_encap_del_ops(const struct ip_tunnel_encap_ops *ops,
                            unsigned int num)
{
        int ret;

        if (num >= MAX_IPTUN_ENCAP_OPS)
                return -ERANGE;

        ret = (cmpxchg((const struct ip_tunnel_encap_ops **)
                       &iptun_encaps[num],
                       ops, NULL) == ops) ? 0 : -1;

        synchronize_net();

        return ret;
}
EXPORT_SYMBOL(ip_tunnel_encap_del_ops);

int ip_tunnel_encap_setup(struct ip_tunnel *t,
                          struct ip_tunnel_encap *ipencap)
{
        int hlen;

        memset(&t->encap, 0, sizeof(t->encap));

        hlen = ip_encap_hlen(ipencap);
        if (hlen < 0)
                return hlen;

        t->encap.type = ipencap->type;
        t->encap.sport = ipencap->sport;
        t->encap.dport = ipencap->dport;
        t->encap.flags = ipencap->flags;

        t->encap_hlen = hlen;
        t->hlen = t->encap_hlen + t->tun_hlen;

        return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_encap_setup);

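/* Check the packet against the path MTU of the tunnel route, update the
 * inner dst's PMTU and, when the packet does not fit, send the proper
 * ICMP/ICMPv6 "too big" error and return -E2BIG.
 */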
static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb,
                            struct rtable *rt, __be16 df,
                            const struct iphdr *inner_iph,
                            int tunnel_hlen, __be32 dst, bool md)
{
        struct ip_tunnel *tunnel = netdev_priv(dev);
        int pkt_size;
        int mtu;

        tunnel_hlen = md ? tunnel_hlen : tunnel->hlen;
        pkt_size = skb->len - tunnel_hlen - dev->hard_header_len;

        if (df)
                mtu = dst_mtu(&rt->dst) - dev->hard_header_len
                                        - sizeof(struct iphdr) - tunnel_hlen;
        else
                mtu = skb_valid_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;

        if (skb_valid_dst(skb))
                skb_dst_update_pmtu_no_confirm(skb, mtu);

        if (skb->protocol == htons(ETH_P_IP)) {
                if (!skb_is_gso(skb) &&
                    (inner_iph->frag_off & htons(IP_DF)) &&
                    mtu < pkt_size) {
                        memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
                        icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
                        return -E2BIG;
                }
        }
#if IS_ENABLED(CONFIG_IPV6)
        else if (skb->protocol == htons(ETH_P_IPV6)) {
                struct rt6_info *rt6;
                __be32 daddr;

                rt6 = skb_valid_dst(skb) ? (struct rt6_info *)skb_dst(skb) :
                                           NULL;
                daddr = md ? dst : tunnel->parms.iph.daddr;

                if (rt6 && mtu < dst_mtu(skb_dst(skb)) &&
                           mtu >= IPV6_MIN_MTU) {
                        if ((daddr && !ipv4_is_multicast(daddr)) ||
                            rt6->rt6i_dst.plen == 128) {
                                rt6->rt6i_flags |= RTF_MODIFIED;
                                dst_metric_set(skb_dst(skb), RTAX_MTU, mtu);
                        }
                }

                if (!skb_is_gso(skb) && mtu >= IPV6_MIN_MTU &&
                                        mtu < pkt_size) {
                        icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
                        return -E2BIG;
                }
        }
#endif
        return 0;
}

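/* Transmit path for metadata-mode (collect_md) tunnels: the outer
 * addresses, key, TOS and TTL come from the skb's tunnel metadata
 * rather than from the device configuration.
 */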
void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
                       u8 proto, int tunnel_hlen)
{
        struct ip_tunnel *tunnel = netdev_priv(dev);
        u32 headroom = sizeof(struct iphdr);
        struct ip_tunnel_info *tun_info;
        const struct ip_tunnel_key *key;
        const struct iphdr *inner_iph;
        struct rtable *rt = NULL;
        struct flowi4 fl4;
        __be16 df = 0;
        u8 tos, ttl;
        bool use_cache;

        tun_info = skb_tunnel_info(skb);
        if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) ||
                     ip_tunnel_info_af(tun_info) != AF_INET))
                goto tx_error;
        key = &tun_info->key;
        memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
        inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
        tos = key->tos;
        if (tos == 1) {
                if (skb->protocol == htons(ETH_P_IP))
                        tos = inner_iph->tos;
                else if (skb->protocol == htons(ETH_P_IPV6))
                        tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
        }
        ip_tunnel_init_flow(&fl4, proto, key->u.ipv4.dst, key->u.ipv4.src,
                            tunnel_id_to_key32(key->tun_id), RT_TOS(tos),
                            0, skb->mark, skb_get_hash(skb));
        if (tunnel->encap.type != TUNNEL_ENCAP_NONE)
                goto tx_error;

        use_cache = ip_tunnel_dst_cache_usable(skb, tun_info);
        if (use_cache)
                rt = dst_cache_get_ip4(&tun_info->dst_cache, &fl4.saddr);
        if (!rt) {
                rt = ip_route_output_key(tunnel->net, &fl4);
                if (IS_ERR(rt)) {
                        dev->stats.tx_carrier_errors++;
                        goto tx_error;
                }
                if (use_cache)
                        dst_cache_set_ip4(&tun_info->dst_cache, &rt->dst,
                                          fl4.saddr);
        }
        if (rt->dst.dev == dev) {
                ip_rt_put(rt);
                dev->stats.collisions++;
                goto tx_error;
        }

        if (key->tun_flags & TUNNEL_DONT_FRAGMENT)
                df = htons(IP_DF);
        if (tnl_update_pmtu(dev, skb, rt, df, inner_iph, tunnel_hlen,
                            key->u.ipv4.dst, true)) {
                ip_rt_put(rt);
                goto tx_error;
        }

        tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
        ttl = key->ttl;
        if (ttl == 0) {
                if (skb->protocol == htons(ETH_P_IP))
                        ttl = inner_iph->ttl;
                else if (skb->protocol == htons(ETH_P_IPV6))
                        ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
                else
                        ttl = ip4_dst_hoplimit(&rt->dst);
        }

        if (!df && skb->protocol == htons(ETH_P_IP))
                df = inner_iph->frag_off & htons(IP_DF);

        headroom += LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len;
        if (headroom > dev->needed_headroom)
                dev->needed_headroom = headroom;

        if (skb_cow_head(skb, dev->needed_headroom)) {
                ip_rt_put(rt);
                goto tx_dropped;
        }
        iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, proto, tos, ttl,
                      df, !net_eq(tunnel->net, dev_net(dev)));
        return;
tx_error:
        dev->stats.tx_errors++;
        goto kfree;
tx_dropped:
        dev->stats.tx_dropped++;
kfree:
        kfree_skb(skb);
}
EXPORT_SYMBOL_GPL(ip_md_tunnel_xmit);

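/* Transmit path for classically configured tunnels.  For NBMA tunnels
 * (daddr == 0) the destination is taken from tunnel metadata, the inner
 * IPv4 route, or an IPv4-compatible IPv6 neighbour address.
 */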
void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
                    const struct iphdr *tnl_params, u8 protocol)
{
        struct ip_tunnel *tunnel = netdev_priv(dev);
        struct ip_tunnel_info *tun_info = NULL;
        const struct iphdr *inner_iph;
        unsigned int max_headroom;      /* The extra header space needed */
        struct rtable *rt = NULL;               /* Route to the other host */
        bool use_cache = false;
        struct flowi4 fl4;
        bool md = false;
        bool connected;
        u8 tos, ttl;
        __be32 dst;
        __be16 df;

        inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
        connected = (tunnel->parms.iph.daddr != 0);

        memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));

        dst = tnl_params->daddr;
        if (dst == 0) {
                /* NBMA tunnel */

                if (!skb_dst(skb)) {
                        dev->stats.tx_fifo_errors++;
                        goto tx_error;
                }

                tun_info = skb_tunnel_info(skb);
                if (tun_info && (tun_info->mode & IP_TUNNEL_INFO_TX) &&
                    ip_tunnel_info_af(tun_info) == AF_INET &&
                    tun_info->key.u.ipv4.dst) {
                        dst = tun_info->key.u.ipv4.dst;
                        md = true;
                        connected = true;
                }
                else if (skb->protocol == htons(ETH_P_IP)) {
                        rt = skb_rtable(skb);
                        dst = rt_nexthop(rt, inner_iph->daddr);
                }
#if IS_ENABLED(CONFIG_IPV6)
                else if (skb->protocol == htons(ETH_P_IPV6)) {
                        const struct in6_addr *addr6;
                        struct neighbour *neigh;
                        bool do_tx_error_icmp;
                        int addr_type;

                        neigh = dst_neigh_lookup(skb_dst(skb),
                                                 &ipv6_hdr(skb)->daddr);
                        if (!neigh)
                                goto tx_error;

                        addr6 = (const struct in6_addr *)&neigh->primary_key;
                        addr_type = ipv6_addr_type(addr6);

                        if (addr_type == IPV6_ADDR_ANY) {
                                addr6 = &ipv6_hdr(skb)->daddr;
                                addr_type = ipv6_addr_type(addr6);
                        }

                        if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
                                do_tx_error_icmp = true;
                        else {
                                do_tx_error_icmp = false;
                                dst = addr6->s6_addr32[3];
                        }
                        neigh_release(neigh);
                        if (do_tx_error_icmp)
                                goto tx_error_icmp;
                }
#endif
                else
                        goto tx_error;

                if (!md)
                        connected = false;
        }

        tos = tnl_params->tos;
        if (tos & 0x1) {
                tos &= ~0x1;
                if (skb->protocol == htons(ETH_P_IP)) {
                        tos = inner_iph->tos;
                        connected = false;
                } else if (skb->protocol == htons(ETH_P_IPV6)) {
                        tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
                        connected = false;
                }
        }

        ip_tunnel_init_flow(&fl4, protocol, dst, tnl_params->saddr,
                            tunnel->parms.o_key, RT_TOS(tos), tunnel->parms.link,
                            tunnel->fwmark, skb_get_hash(skb));

        if (ip_tunnel_encap(skb, tunnel, &protocol, &fl4) < 0)
                goto tx_error;

        if (connected && md) {
                use_cache = ip_tunnel_dst_cache_usable(skb, tun_info);
                if (use_cache)
                        rt = dst_cache_get_ip4(&tun_info->dst_cache,
                                               &fl4.saddr);
        } else {
                rt = connected ? dst_cache_get_ip4(&tunnel->dst_cache,
                                                &fl4.saddr) : NULL;
        }

        if (!rt) {
                rt = ip_route_output_key(tunnel->net, &fl4);

                if (IS_ERR(rt)) {
                        dev->stats.tx_carrier_errors++;
                        goto tx_error;
                }
                if (use_cache)
                        dst_cache_set_ip4(&tun_info->dst_cache, &rt->dst,
                                          fl4.saddr);
                else if (!md && connected)
                        dst_cache_set_ip4(&tunnel->dst_cache, &rt->dst,
                                          fl4.saddr);
        }

        if (rt->dst.dev == dev) {
                ip_rt_put(rt);
                dev->stats.collisions++;
                goto tx_error;
        }

        if (tnl_update_pmtu(dev, skb, rt, tnl_params->frag_off, inner_iph,
                            0, 0, false)) {
                ip_rt_put(rt);
                goto tx_error;
        }

        if (tunnel->err_count > 0) {
                if (time_before(jiffies,
                                tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
                        tunnel->err_count--;

                        dst_link_failure(skb);
                } else
                        tunnel->err_count = 0;
        }

        tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
        ttl = tnl_params->ttl;
        if (ttl == 0) {
                if (skb->protocol == htons(ETH_P_IP))
                        ttl = inner_iph->ttl;
#if IS_ENABLED(CONFIG_IPV6)
                else if (skb->protocol == htons(ETH_P_IPV6))
                        ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
#endif
                else
                        ttl = ip4_dst_hoplimit(&rt->dst);
        }

        df = tnl_params->frag_off;
        if (skb->protocol == htons(ETH_P_IP) && !tunnel->ignore_df)
                df |= (inner_iph->frag_off&htons(IP_DF));

        max_headroom = LL_RESERVED_SPACE(rt->dst.dev) + sizeof(struct iphdr)
                        + rt->dst.header_len + ip_encap_hlen(&tunnel->encap);
        if (max_headroom > dev->needed_headroom)
                dev->needed_headroom = max_headroom;

        if (skb_cow_head(skb, dev->needed_headroom)) {
                ip_rt_put(rt);
                dev->stats.tx_dropped++;
                kfree_skb(skb);
                return;
        }

        iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, protocol, tos, ttl,
                      df, !net_eq(tunnel->net, dev_net(dev)));
        return;

#if IS_ENABLED(CONFIG_IPV6)
tx_error_icmp:
        dst_link_failure(skb);
#endif
tx_error:
        dev->stats.tx_errors++;
        kfree_skb(skb);
}
EXPORT_SYMBOL_GPL(ip_tunnel_xmit);

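/* Apply new parameters to an existing tunnel, rehashing it since the
 * addresses or key may have changed, and rebind the underlying device
 * if the link or fwmark changed.
 */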
static void ip_tunnel_update(struct ip_tunnel_net *itn,
                             struct ip_tunnel *t,
                             struct net_device *dev,
                             struct ip_tunnel_parm *p,
                             bool set_mtu,
                             __u32 fwmark)
{
        ip_tunnel_del(itn, t);
        t->parms.iph.saddr = p->iph.saddr;
        t->parms.iph.daddr = p->iph.daddr;
        t->parms.i_key = p->i_key;
        t->parms.o_key = p->o_key;
        if (dev->type != ARPHRD_ETHER) {
                memcpy(dev->dev_addr, &p->iph.saddr, 4);
                memcpy(dev->broadcast, &p->iph.daddr, 4);
        }
        ip_tunnel_add(itn, t);

        t->parms.iph.ttl = p->iph.ttl;
        t->parms.iph.tos = p->iph.tos;
        t->parms.iph.frag_off = p->iph.frag_off;

        if (t->parms.link != p->link || t->fwmark != fwmark) {
                int mtu;

                t->parms.link = p->link;
                t->fwmark = fwmark;
                mtu = ip_tunnel_bind_dev(dev);
                if (set_mtu)
                        dev->mtu = mtu;
        }
        dst_cache_reset(&t->dst_cache);
        netdev_state_change(dev);
}

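/* Handler for the legacy SIOC{GET,ADD,CHG,DEL}TUNNEL ioctls.  Add,
 * change and delete require CAP_NET_ADMIN in the tunnel's user ns;
 * operations on the fallback device are redirected to the tunnel
 * matching @p, and the fallback device itself cannot be deleted.
 */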
int ip_tunnel_ioctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd)
{
        int err = 0;
        struct ip_tunnel *t = netdev_priv(dev);
        struct net *net = t->net;
        struct ip_tunnel_net *itn = net_generic(net, t->ip_tnl_net_id);

        switch (cmd) {
        case SIOCGETTUNNEL:
                if (dev == itn->fb_tunnel_dev) {
                        t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
                        if (!t)
                                t = netdev_priv(dev);
                }
                memcpy(p, &t->parms, sizeof(*p));
                break;

        case SIOCADDTUNNEL:
        case SIOCCHGTUNNEL:
                err = -EPERM;
                if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
                        goto done;
                if (p->iph.ttl)
                        p->iph.frag_off |= htons(IP_DF);
                if (!(p->i_flags & VTI_ISVTI)) {
                        if (!(p->i_flags & TUNNEL_KEY))
                                p->i_key = 0;
                        if (!(p->o_flags & TUNNEL_KEY))
                                p->o_key = 0;
                }

                t = ip_tunnel_find(itn, p, itn->type);

                if (cmd == SIOCADDTUNNEL) {
                        if (!t) {
                                t = ip_tunnel_create(net, itn, p);
                                err = PTR_ERR_OR_ZERO(t);
                                break;
                        }

                        err = -EEXIST;
                        break;
                }
                if (dev != itn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
                        if (t) {
                                if (t->dev != dev) {
                                        err = -EEXIST;
                                        break;
                                }
                        } else {
                                unsigned int nflags = 0;

                                if (ipv4_is_multicast(p->iph.daddr))
                                        nflags = IFF_BROADCAST;
                                else if (p->iph.daddr)
                                        nflags = IFF_POINTOPOINT;

                                if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
                                        err = -EINVAL;
                                        break;
                                }

                                t = netdev_priv(dev);
                        }
                }

                if (t) {
                        err = 0;
                        ip_tunnel_update(itn, t, dev, p, true, 0);
                } else {
                        err = -ENOENT;
                }
                break;

        case SIOCDELTUNNEL:
                err = -EPERM;
                if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
                        goto done;

                if (dev == itn->fb_tunnel_dev) {
                        err = -ENOENT;
                        t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
                        if (!t)
                                goto done;
                        err = -EPERM;
                        if (t == netdev_priv(itn->fb_tunnel_dev))
                                goto done;
                        dev = t->dev;
                }
                unregister_netdevice(dev);
                err = 0;
                break;

        default:
                err = -EINVAL;
        }

done:
        return err;
}
EXPORT_SYMBOL_GPL(ip_tunnel_ioctl);

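/* Validate a new MTU against the tunnel overhead.  If @strict, reject
 * an oversized value; otherwise clamp it to the maximum.
 */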
int __ip_tunnel_change_mtu(struct net_device *dev, int new_mtu, bool strict)
{
        struct ip_tunnel *tunnel = netdev_priv(dev);
        int t_hlen = tunnel->hlen + sizeof(struct iphdr);
        int max_mtu = IP_MAX_MTU - dev->hard_header_len - t_hlen;

        if (new_mtu < ETH_MIN_MTU)
                return -EINVAL;

        if (new_mtu > max_mtu) {
                if (strict)
                        return -EINVAL;

                new_mtu = max_mtu;
        }

        dev->mtu = new_mtu;
        return 0;
}
EXPORT_SYMBOL_GPL(__ip_tunnel_change_mtu);

int ip_tunnel_change_mtu(struct net_device *dev, int new_mtu)
{
        return __ip_tunnel_change_mtu(dev, new_mtu, true);
}
EXPORT_SYMBOL_GPL(ip_tunnel_change_mtu);

static void ip_tunnel_dev_free(struct net_device *dev)
{
        struct ip_tunnel *tunnel = netdev_priv(dev);

        gro_cells_destroy(&tunnel->gro_cells);
        dst_cache_destroy(&tunnel->dst_cache);
        free_percpu(dev->tstats);
}

void ip_tunnel_dellink(struct net_device *dev, struct list_head *head)
{
        struct ip_tunnel *tunnel = netdev_priv(dev);
        struct ip_tunnel_net *itn;

        itn = net_generic(tunnel->net, tunnel->ip_tnl_net_id);

        if (itn->fb_tunnel_dev != dev) {
                ip_tunnel_del(itn, netdev_priv(dev));
                unregister_netdevice_queue(dev, head);
        }
}
EXPORT_SYMBOL_GPL(ip_tunnel_dellink);

struct net *ip_tunnel_get_link_net(const struct net_device *dev)
{
        struct ip_tunnel *tunnel = netdev_priv(dev);

        return tunnel->net;
}
EXPORT_SYMBOL(ip_tunnel_get_link_net);

int ip_tunnel_get_iflink(const struct net_device *dev)
{
        struct ip_tunnel *tunnel = netdev_priv(dev);

        return tunnel->parms.link;
}
EXPORT_SYMBOL(ip_tunnel_get_iflink);

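/* Per-netns initialization: set up the hash table and, where fallback
 * tunnels are enabled, create the fallback device.
 */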
int ip_tunnel_init_net(struct net *net, unsigned int ip_tnl_net_id,
                       struct rtnl_link_ops *ops, char *devname)
{
        struct ip_tunnel_net *itn = net_generic(net, ip_tnl_net_id);
        struct ip_tunnel_parm parms;
        unsigned int i;

        itn->rtnl_link_ops = ops;
        for (i = 0; i < IP_TNL_HASH_SIZE; i++)
                INIT_HLIST_HEAD(&itn->tunnels[i]);

        if (!ops || !net_has_fallback_tunnels(net)) {
                struct ip_tunnel_net *it_init_net;

                it_init_net = net_generic(&init_net, ip_tnl_net_id);
                itn->type = it_init_net->type;
                itn->fb_tunnel_dev = NULL;
                return 0;
        }

        memset(&parms, 0, sizeof(parms));
        if (devname)
                strlcpy(parms.name, devname, IFNAMSIZ);

        rtnl_lock();
        itn->fb_tunnel_dev = __ip_tunnel_create(net, ops, &parms);
        /* FB netdevice is special: we have one, and only one per netns.
         * Allowing it to be moved to another netns is clearly unsafe.
         */
        if (!IS_ERR(itn->fb_tunnel_dev)) {
                itn->fb_tunnel_dev->features |= NETIF_F_NETNS_LOCAL;
                itn->fb_tunnel_dev->mtu = ip_tunnel_bind_dev(itn->fb_tunnel_dev);
                ip_tunnel_add(itn, netdev_priv(itn->fb_tunnel_dev));
                itn->type = itn->fb_tunnel_dev->type;
        }
        rtnl_unlock();

        return PTR_ERR_OR_ZERO(itn->fb_tunnel_dev);
}
EXPORT_SYMBOL_GPL(ip_tunnel_init_net);

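/* Queue every tunnel device of this netns (plus tunnels whose device
 * lives in another netns) for unregistration.
 */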
static void ip_tunnel_destroy(struct net *net, struct ip_tunnel_net *itn,
                              struct list_head *head,
                              struct rtnl_link_ops *ops)
{
        struct net_device *dev, *aux;
        int h;

        for_each_netdev_safe(net, dev, aux)
                if (dev->rtnl_link_ops == ops)
                        unregister_netdevice_queue(dev, head);

        for (h = 0; h < IP_TNL_HASH_SIZE; h++) {
                struct ip_tunnel *t;
                struct hlist_node *n;
                struct hlist_head *thead = &itn->tunnels[h];

                hlist_for_each_entry_safe(t, n, thead, hash_node)
                        /* If dev is in the same netns, it has already
                         * been added to the list by the previous loop.
                         */
                        if (!net_eq(dev_net(t->dev), net))
                                unregister_netdevice_queue(t->dev, head);
        }
}

void ip_tunnel_delete_nets(struct list_head *net_list, unsigned int id,
                           struct rtnl_link_ops *ops)
{
        struct ip_tunnel_net *itn;
        struct net *net;
        LIST_HEAD(list);

        rtnl_lock();
        list_for_each_entry(net, net_list, exit_list) {
                itn = net_generic(net, id);
                ip_tunnel_destroy(net, itn, &list, ops);
        }
        unregister_netdevice_many(&list);
        rtnl_unlock();
}
EXPORT_SYMBOL_GPL(ip_tunnel_delete_nets);

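/* rtnl_link_ops newlink helper shared by the tunnel drivers: reject
 * duplicates, register the device, pick an MTU and hash the tunnel in.
 */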
int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[],
                      struct ip_tunnel_parm *p, __u32 fwmark)
{
        struct ip_tunnel *nt;
        struct net *net = dev_net(dev);
        struct ip_tunnel_net *itn;
        int mtu;
        int err;

        nt = netdev_priv(dev);
        itn = net_generic(net, nt->ip_tnl_net_id);

        if (nt->collect_md) {
                if (rtnl_dereference(itn->collect_md_tun))
                        return -EEXIST;
        } else {
                if (ip_tunnel_find(itn, p, dev->type))
                        return -EEXIST;
        }

        nt->net = net;
        nt->parms = *p;
        nt->fwmark = fwmark;
        err = register_netdevice(dev);
        if (err)
                goto err_register_netdevice;

        if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
                eth_hw_addr_random(dev);

        mtu = ip_tunnel_bind_dev(dev);
        if (tb[IFLA_MTU]) {
                unsigned int max = IP_MAX_MTU - dev->hard_header_len - nt->hlen;

                mtu = clamp(dev->mtu, (unsigned int)ETH_MIN_MTU,
                            (unsigned int)(max - sizeof(struct iphdr)));
        }

        err = dev_set_mtu(dev, mtu);
        if (err)
                goto err_dev_set_mtu;

        ip_tunnel_add(itn, nt);
        return 0;

err_dev_set_mtu:
        unregister_netdevice(dev);
err_register_netdevice:
        return err;
}
EXPORT_SYMBOL_GPL(ip_tunnel_newlink);

int ip_tunnel_changelink(struct net_device *dev, struct nlattr *tb[],
                         struct ip_tunnel_parm *p, __u32 fwmark)
{
        struct ip_tunnel *t;
        struct ip_tunnel *tunnel = netdev_priv(dev);
        struct net *net = tunnel->net;
        struct ip_tunnel_net *itn = net_generic(net, tunnel->ip_tnl_net_id);

        if (dev == itn->fb_tunnel_dev)
                return -EINVAL;

        t = ip_tunnel_find(itn, p, dev->type);

        if (t) {
                if (t->dev != dev)
                        return -EEXIST;
        } else {
                t = tunnel;

                if (dev->type != ARPHRD_ETHER) {
                        unsigned int nflags = 0;

                        if (ipv4_is_multicast(p->iph.daddr))
                                nflags = IFF_BROADCAST;
                        else if (p->iph.daddr)
                                nflags = IFF_POINTOPOINT;

                        if ((dev->flags ^ nflags) &
                            (IFF_POINTOPOINT | IFF_BROADCAST))
                                return -EINVAL;
                }
        }

        ip_tunnel_update(itn, t, dev, p, !tb[IFLA_MTU], fwmark);
        return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_changelink);

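/* ndo_init helper shared by the tunnel drivers: allocate per-CPU stats,
 * the dst cache and GRO cells, and prime the outer IP header template.
 */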
int ip_tunnel_init(struct net_device *dev)
{
        struct ip_tunnel *tunnel = netdev_priv(dev);
        struct iphdr *iph = &tunnel->parms.iph;
        int err;

        dev->needs_free_netdev = true;
        dev->priv_destructor = ip_tunnel_dev_free;
        dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
        if (!dev->tstats)
                return -ENOMEM;

        err = dst_cache_init(&tunnel->dst_cache, GFP_KERNEL);
        if (err) {
                free_percpu(dev->tstats);
                return err;
        }

        err = gro_cells_init(&tunnel->gro_cells, dev);
        if (err) {
                dst_cache_destroy(&tunnel->dst_cache);
                free_percpu(dev->tstats);
                return err;
        }

        tunnel->dev = dev;
        tunnel->net = dev_net(dev);
        strcpy(tunnel->parms.name, dev->name);
        iph->version            = 4;
        iph->ihl                = 5;

        if (tunnel->collect_md)
                netif_keep_dst(dev);
        return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_init);

void ip_tunnel_uninit(struct net_device *dev)
{
        struct ip_tunnel *tunnel = netdev_priv(dev);
        struct net *net = tunnel->net;
        struct ip_tunnel_net *itn;

        itn = net_generic(net, tunnel->ip_tnl_net_id);
        /* fb_tunnel_dev will be unregistered in the net-exit call. */
        if (itn->fb_tunnel_dev != dev)
                ip_tunnel_del(itn, netdev_priv(dev));

        dst_cache_reset(&tunnel->dst_cache);
}
EXPORT_SYMBOL_GPL(ip_tunnel_uninit);


/* Do the least required initialization; the rest is done in the tunnel_init call */
void ip_tunnel_setup(struct net_device *dev, unsigned int net_id)
{
        struct ip_tunnel *tunnel = netdev_priv(dev);

        tunnel->ip_tnl_net_id = net_id;
}
EXPORT_SYMBOL_GPL(ip_tunnel_setup);

MODULE_LICENSE("GPL");