// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (c) 2013 Nicira, Inc.
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/capability.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <linux/in.h>
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/if_arp.h>
#include <linux/init.h>
#include <linux/in6.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/netfilter_ipv4.h>
#include <linux/etherdevice.h>
#include <linux/if_ether.h>
#include <linux/if_vlan.h>
#include <linux/rculist.h>
#include <linux/err.h>

#include <net/sock.h>
#include <net/ip.h>
#include <net/icmp.h>
#include <net/protocol.h>
#include <net/ip_tunnels.h>
#include <net/arp.h>
#include <net/checksum.h>
#include <net/dsfield.h>
#include <net/inet_ecn.h>
#include <net/xfrm.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>
#include <net/rtnetlink.h>
#include <net/udp.h>
#include <net/dst_metadata.h>

#if IS_ENABLED(CONFIG_IPV6)
#include <net/ipv6.h>
#include <net/ip6_fib.h>
#include <net/ip6_route.h>
#endif

static unsigned int ip_tunnel_hash(__be32 key, __be32 remote)
{
	return hash_32((__force u32)key ^ (__force u32)remote,
			 IP_TNL_HASH_BITS);
}
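
/* For reference: hash_32() is the kernel's multiplicative hash from
 * include/linux/hash.h. A minimal userspace sketch of the bucket
 * computation, assuming IP_TNL_HASH_BITS == 7 (128 buckets) and the
 * GOLDEN_RATIO_32 constant used by hash_32():
 *
 *	#include <stdint.h>
 *
 *	#define GOLDEN_RATIO_32		0x61C88647u
 *	#define IP_TNL_HASH_BITS	7
 *
 *	static uint32_t tnl_hash(uint32_t key, uint32_t remote)
 *	{
 *		// Multiply-shift hash over key ^ remote, keeping the
 *		// top IP_TNL_HASH_BITS bits as the bucket index.
 *		return ((key ^ remote) * GOLDEN_RATIO_32) >>
 *		       (32 - IP_TNL_HASH_BITS);
 *	}
 *
 * The fallback tunnel (no key, no addresses) lands in bucket
 * tnl_hash(0, 0).
 */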

static bool ip_tunnel_key_match(const struct ip_tunnel_parm *p,
				__be16 flags, __be32 key)
{
	if (p->i_flags & TUNNEL_KEY) {
		if (flags & TUNNEL_KEY)
			return key == p->i_key;
		else
			/* key expected, none present */
			return false;
	} else
		return !(flags & TUNNEL_KEY);
}
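
/* Match matrix implied by the checks above:
 *
 *	tunnel has TUNNEL_KEY?	packet has TUNNEL_KEY?	result
 *	yes			yes			key == p->i_key
 *	yes			no			no match
 *	no			yes			no match
 *	no			no			match
 */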

/* Fallback tunnel: no source, no destination, no key, no options

   Tunnel hash table:
   We require an exact key match, i.e. if a key is present in the packet
   it will match only a tunnel with the same key; if it is not present,
   it will match only a keyless tunnel.

   All keyless packets, if not matched against configured keyless tunnels,
   will match the fallback tunnel.
   Given src, dst and key, find the appropriate tunnel for input.
*/
struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn,
				   int link, __be16 flags,
				   __be32 remote, __be32 local,
				   __be32 key)
{
	unsigned int hash;
	struct ip_tunnel *t, *cand = NULL;
	struct hlist_head *head;

	hash = ip_tunnel_hash(key, remote);
	head = &itn->tunnels[hash];

	hlist_for_each_entry_rcu(t, head, hash_node) {
		if (local != t->parms.iph.saddr ||
		    remote != t->parms.iph.daddr ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (!ip_tunnel_key_match(&t->parms, flags, key))
			continue;

		if (t->parms.link == link)
			return t;
		else
			cand = t;
	}

	hlist_for_each_entry_rcu(t, head, hash_node) {
		if (remote != t->parms.iph.daddr ||
		    t->parms.iph.saddr != 0 ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (!ip_tunnel_key_match(&t->parms, flags, key))
			continue;

		if (t->parms.link == link)
			return t;
		else if (!cand)
			cand = t;
	}

	hash = ip_tunnel_hash(key, 0);
	head = &itn->tunnels[hash];

	hlist_for_each_entry_rcu(t, head, hash_node) {
		if ((local != t->parms.iph.saddr || t->parms.iph.daddr != 0) &&
		    (local != t->parms.iph.daddr || !ipv4_is_multicast(local)))
			continue;

		if (!(t->dev->flags & IFF_UP))
			continue;

		if (!ip_tunnel_key_match(&t->parms, flags, key))
			continue;

		if (t->parms.link == link)
			return t;
		else if (!cand)
			cand = t;
	}

	if (flags & TUNNEL_NO_KEY)
		goto skip_key_lookup;

	hlist_for_each_entry_rcu(t, head, hash_node) {
		if (t->parms.i_key != key ||
		    t->parms.iph.saddr != 0 ||
		    t->parms.iph.daddr != 0 ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (t->parms.link == link)
			return t;
		else if (!cand)
			cand = t;
	}

skip_key_lookup:
	if (cand)
		return cand;

	t = rcu_dereference(itn->collect_md_tun);
	if (t && t->dev->flags & IFF_UP)
		return t;

	if (itn->fb_tunnel_dev && itn->fb_tunnel_dev->flags & IFF_UP)
		return netdev_priv(itn->fb_tunnel_dev);

	return NULL;
}
EXPORT_SYMBOL_GPL(ip_tunnel_lookup);
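
/* Lookup precedence, summarized: the four passes above prefer, in order,
 *
 *	1. an exact (saddr, daddr, key) match,
 *	2. a daddr + key match with a wildcard source,
 *	3. a saddr-only or local-multicast match,
 *	4. a key-only match with wildcard addresses,
 *
 * and within each pass prefer a tunnel bound to the input link, keeping
 * a same-pass tunnel on another link as a candidate. If all passes
 * fail, the collect_md tunnel and then the fallback device catch the
 * packet.
 */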

static struct hlist_head *ip_bucket(struct ip_tunnel_net *itn,
				    struct ip_tunnel_parm *parms)
{
	unsigned int h;
	__be32 remote;
	__be32 i_key = parms->i_key;

	if (parms->iph.daddr && !ipv4_is_multicast(parms->iph.daddr))
		remote = parms->iph.daddr;
	else
		remote = 0;

	if (!(parms->i_flags & TUNNEL_KEY) && (parms->i_flags & VTI_ISVTI))
		i_key = 0;

	h = ip_tunnel_hash(i_key, remote);
	return &itn->tunnels[h];
}

static void ip_tunnel_add(struct ip_tunnel_net *itn, struct ip_tunnel *t)
{
	struct hlist_head *head = ip_bucket(itn, &t->parms);

	if (t->collect_md)
		rcu_assign_pointer(itn->collect_md_tun, t);
	hlist_add_head_rcu(&t->hash_node, head);
}

static void ip_tunnel_del(struct ip_tunnel_net *itn, struct ip_tunnel *t)
{
	if (t->collect_md)
		rcu_assign_pointer(itn->collect_md_tun, NULL);
	hlist_del_init_rcu(&t->hash_node);
}
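
/* Locking: add/del are writer-side operations running under RTNL, while
 * ip_tunnel_lookup() walks the same chains locklessly from the receive
 * path. That is why insertion and removal use the RCU list primitives
 * and publication of collect_md_tun goes through rcu_assign_pointer()
 * paired with rcu_dereference() on the reader side.
 */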

static struct ip_tunnel *ip_tunnel_find(struct ip_tunnel_net *itn,
					struct ip_tunnel_parm *parms,
					int type)
{
	__be32 remote = parms->iph.daddr;
	__be32 local = parms->iph.saddr;
	__be32 key = parms->i_key;
	__be16 flags = parms->i_flags;
	int link = parms->link;
	struct ip_tunnel *t = NULL;
	struct hlist_head *head = ip_bucket(itn, parms);

	hlist_for_each_entry_rcu(t, head, hash_node) {
		if (local == t->parms.iph.saddr &&
		    remote == t->parms.iph.daddr &&
		    link == t->parms.link &&
		    type == t->dev->type &&
		    ip_tunnel_key_match(&t->parms, flags, key))
			break;
	}
	return t;
}

static struct net_device *__ip_tunnel_create(struct net *net,
					     const struct rtnl_link_ops *ops,
					     struct ip_tunnel_parm *parms)
{
	int err;
	struct ip_tunnel *tunnel;
	struct net_device *dev;
	char name[IFNAMSIZ];

	err = -E2BIG;
	if (parms->name[0]) {
		if (!dev_valid_name(parms->name))
			goto failed;
		strlcpy(name, parms->name, IFNAMSIZ);
	} else {
		if (strlen(ops->kind) > (IFNAMSIZ - 3))
			goto failed;
		strcpy(name, ops->kind);
		strcat(name, "%d");
	}

	ASSERT_RTNL();
	dev = alloc_netdev(ops->priv_size, name, NET_NAME_UNKNOWN, ops->setup);
	if (!dev) {
		err = -ENOMEM;
		goto failed;
	}
	dev_net_set(dev, net);

	dev->rtnl_link_ops = ops;

	tunnel = netdev_priv(dev);
	tunnel->parms = *parms;
	tunnel->net = net;

	err = register_netdevice(dev);
	if (err)
		goto failed_free;

	return dev;

failed_free:
	free_netdev(dev);
failed:
	return ERR_PTR(err);
}

static int ip_tunnel_bind_dev(struct net_device *dev)
{
	struct net_device *tdev = NULL;
	struct ip_tunnel *tunnel = netdev_priv(dev);
	const struct iphdr *iph;
	int hlen = LL_MAX_HEADER;
	int mtu = ETH_DATA_LEN;
	int t_hlen = tunnel->hlen + sizeof(struct iphdr);

	iph = &tunnel->parms.iph;

	/* Guess output device to choose reasonable mtu and needed_headroom */
	if (iph->daddr) {
		struct flowi4 fl4;
		struct rtable *rt;

		ip_tunnel_init_flow(&fl4, iph->protocol, iph->daddr,
				    iph->saddr, tunnel->parms.o_key,
				    RT_TOS(iph->tos), tunnel->parms.link,
				    tunnel->fwmark, 0);
		rt = ip_route_output_key(tunnel->net, &fl4);

		if (!IS_ERR(rt)) {
			tdev = rt->dst.dev;
			ip_rt_put(rt);
		}
		if (dev->type != ARPHRD_ETHER)
			dev->flags |= IFF_POINTOPOINT;

		dst_cache_reset(&tunnel->dst_cache);
	}

	if (!tdev && tunnel->parms.link)
		tdev = __dev_get_by_index(tunnel->net, tunnel->parms.link);

	if (tdev) {
		hlen = tdev->hard_header_len + tdev->needed_headroom;
		mtu = min(tdev->mtu, IP_MAX_MTU);
	}

	dev->needed_headroom = t_hlen + hlen;
	mtu -= (dev->hard_header_len + t_hlen);

	if (mtu < IPV4_MIN_MTU)
		mtu = IPV4_MIN_MTU;

	return mtu;
}
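
/* Worked example (illustrative numbers, not from the original source):
 * for a GRE tunnel without key/csum/seq, tunnel->hlen is 4, so
 * t_hlen = 4 + 20 = 24. If the route to the peer resolves to an
 * Ethernet device with a 1500-byte MTU and dev->hard_header_len == 0,
 * the tunnel MTU becomes 1500 - 24 = 1476, the familiar default for
 * ipgre interfaces.
 */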

static struct ip_tunnel *ip_tunnel_create(struct net *net,
					  struct ip_tunnel_net *itn,
					  struct ip_tunnel_parm *parms)
{
	struct ip_tunnel *nt;
	struct net_device *dev;
	int t_hlen;
	int mtu;
	int err;

	dev = __ip_tunnel_create(net, itn->rtnl_link_ops, parms);
	if (IS_ERR(dev))
		return ERR_CAST(dev);

	mtu = ip_tunnel_bind_dev(dev);
	err = dev_set_mtu(dev, mtu);
	if (err)
		goto err_dev_set_mtu;

	nt = netdev_priv(dev);
	t_hlen = nt->hlen + sizeof(struct iphdr);
	dev->min_mtu = ETH_MIN_MTU;
	dev->max_mtu = IP_MAX_MTU - dev->hard_header_len - t_hlen;
	ip_tunnel_add(itn, nt);
	return nt;

err_dev_set_mtu:
	unregister_netdevice(dev);
	return ERR_PTR(err);
}

int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb,
		  const struct tnl_ptk_info *tpi, struct metadata_dst *tun_dst,
		  bool log_ecn_error)
{
	struct pcpu_sw_netstats *tstats;
	const struct iphdr *iph = ip_hdr(skb);
	int err;

#ifdef CONFIG_NET_IPGRE_BROADCAST
	if (ipv4_is_multicast(iph->daddr)) {
		tunnel->dev->stats.multicast++;
		skb->pkt_type = PACKET_BROADCAST;
	}
#endif

	if ((!(tpi->flags&TUNNEL_CSUM) &&  (tunnel->parms.i_flags&TUNNEL_CSUM)) ||
	     ((tpi->flags&TUNNEL_CSUM) && !(tunnel->parms.i_flags&TUNNEL_CSUM))) {
		tunnel->dev->stats.rx_crc_errors++;
		tunnel->dev->stats.rx_errors++;
		goto drop;
	}

	if (tunnel->parms.i_flags&TUNNEL_SEQ) {
		if (!(tpi->flags&TUNNEL_SEQ) ||
		    (tunnel->i_seqno && (s32)(ntohl(tpi->seq) - tunnel->i_seqno) < 0)) {
			tunnel->dev->stats.rx_fifo_errors++;
			tunnel->dev->stats.rx_errors++;
			goto drop;
		}
		tunnel->i_seqno = ntohl(tpi->seq) + 1;
	}

	skb_reset_network_header(skb);

	err = IP_ECN_decapsulate(iph, skb);
	if (unlikely(err)) {
		if (log_ecn_error)
			net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n",
					&iph->saddr, iph->tos);
		if (err > 1) {
			++tunnel->dev->stats.rx_frame_errors;
			++tunnel->dev->stats.rx_errors;
			goto drop;
		}
	}

	tstats = this_cpu_ptr(tunnel->dev->tstats);
	u64_stats_update_begin(&tstats->syncp);
	tstats->rx_packets++;
	tstats->rx_bytes += skb->len;
	u64_stats_update_end(&tstats->syncp);

	skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(tunnel->dev)));

	if (tunnel->dev->type == ARPHRD_ETHER) {
		skb->protocol = eth_type_trans(skb, tunnel->dev);
		skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
	} else {
		skb->dev = tunnel->dev;
	}

	if (tun_dst)
		skb_dst_set(skb, (struct dst_entry *)tun_dst);

	gro_cells_receive(&tunnel->gro_cells, skb);
	return 0;

drop:
	if (tun_dst)
		dst_release((struct dst_entry *)tun_dst);
	kfree_skb(skb);
	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_rcv);
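
/* ECN handling above, spelled out (per RFC 6040 as implemented by
 * IP_ECN_decapsulate()): 0 means the inner header was left alone or had
 * CE propagated into it; 1 means a non-ECT inner packet arrived inside
 * an ECT outer header, which is only logged; values above 1 mean a
 * non-ECT inner packet arrived with outer CE and must be dropped, which
 * is why only err > 1 bumps rx_frame_errors.
 */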

int ip_tunnel_encap_add_ops(const struct ip_tunnel_encap_ops *ops,
			    unsigned int num)
{
	if (num >= MAX_IPTUN_ENCAP_OPS)
		return -ERANGE;

	return !cmpxchg((const struct ip_tunnel_encap_ops **)
			&iptun_encaps[num],
			NULL, ops) ? 0 : -1;
}
EXPORT_SYMBOL(ip_tunnel_encap_add_ops);
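
/* A minimal registration sketch, loosely modeled on what net/ipv4/fou.c
 * does for TUNNEL_ENCAP_FOU (field set abbreviated; see
 * struct ip_tunnel_encap_ops in include/net/ip_tunnels.h for the
 * authoritative layout):
 *
 *	static const struct ip_tunnel_encap_ops my_encap_ops = {
 *		.encap_hlen	= my_encap_hlen,	// extra header bytes
 *		.build_header	= my_build_header,	// prepend encap header
 *	};
 *
 *	// Claim one slot; cmpxchg() makes the claim atomic, so a second
 *	// registration for the same slot fails instead of clobbering it.
 *	err = ip_tunnel_encap_add_ops(&my_encap_ops, TUNNEL_ENCAP_FOU);
 *
 * Unregistration mirrors this via ip_tunnel_encap_del_ops() below, which
 * also calls synchronize_net() so in-flight readers drain first.
 */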

int ip_tunnel_encap_del_ops(const struct ip_tunnel_encap_ops *ops,
			    unsigned int num)
{
	int ret;

	if (num >= MAX_IPTUN_ENCAP_OPS)
		return -ERANGE;

	ret = (cmpxchg((const struct ip_tunnel_encap_ops **)
		       &iptun_encaps[num],
		       ops, NULL) == ops) ? 0 : -1;

	synchronize_net();

	return ret;
}
EXPORT_SYMBOL(ip_tunnel_encap_del_ops);

int ip_tunnel_encap_setup(struct ip_tunnel *t,
			  struct ip_tunnel_encap *ipencap)
{
	int hlen;

	memset(&t->encap, 0, sizeof(t->encap));

	hlen = ip_encap_hlen(ipencap);
	if (hlen < 0)
		return hlen;

	t->encap.type = ipencap->type;
	t->encap.sport = ipencap->sport;
	t->encap.dport = ipencap->dport;
	t->encap.flags = ipencap->flags;

	t->encap_hlen = hlen;
	t->hlen = t->encap_hlen + t->tun_hlen;

	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_encap_setup);

static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb,
			    struct rtable *rt, __be16 df,
			    const struct iphdr *inner_iph,
			    int tunnel_hlen, __be32 dst, bool md)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	int pkt_size;
	int mtu;

	tunnel_hlen = md ? tunnel_hlen : tunnel->hlen;
	pkt_size = skb->len - tunnel_hlen - dev->hard_header_len;

	if (df)
		mtu = dst_mtu(&rt->dst) - dev->hard_header_len
					- sizeof(struct iphdr) - tunnel_hlen;
	else
		mtu = skb_valid_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;

	if (skb_valid_dst(skb))
		skb_dst_update_pmtu(skb, mtu);

	if (skb->protocol == htons(ETH_P_IP)) {
		if (!skb_is_gso(skb) &&
		    (inner_iph->frag_off & htons(IP_DF)) &&
		    mtu < pkt_size) {
			memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
			icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
			return -E2BIG;
		}
	}
#if IS_ENABLED(CONFIG_IPV6)
	else if (skb->protocol == htons(ETH_P_IPV6)) {
		struct rt6_info *rt6;
		__be32 daddr;

		rt6 = skb_valid_dst(skb) ? (struct rt6_info *)skb_dst(skb) :
					   NULL;
		daddr = md ? dst : tunnel->parms.iph.daddr;

		if (rt6 && mtu < dst_mtu(skb_dst(skb)) &&
			   mtu >= IPV6_MIN_MTU) {
			if ((daddr && !ipv4_is_multicast(daddr)) ||
			    rt6->rt6i_dst.plen == 128) {
				rt6->rt6i_flags |= RTF_MODIFIED;
				dst_metric_set(skb_dst(skb), RTAX_MTU, mtu);
			}
		}

		if (!skb_is_gso(skb) && mtu >= IPV6_MIN_MTU &&
					mtu < pkt_size) {
			icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
			return -E2BIG;
		}
	}
#endif
	return 0;
}
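
/* Worked example (illustrative numbers, not from the original source):
 * a 1500-byte inner IPv4 packet with DF set enters a plain GRE tunnel
 * (tunnel->hlen == 4) whose underlay route has a 1500-byte MTU and
 * whose device has hard_header_len == 0. Then
 * mtu = 1500 - 0 - 20 - 4 = 1476 and pkt_size = 1500 - 4 - 0 = 1496;
 * since 1476 < 1496, icmp_send() reports FRAG_NEEDED advertising 1476
 * and the function returns -E2BIG so the frame is not transmitted.
 */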

void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
		       u8 proto, int tunnel_hlen)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	u32 headroom = sizeof(struct iphdr);
	struct ip_tunnel_info *tun_info;
	const struct ip_tunnel_key *key;
	const struct iphdr *inner_iph;
	struct rtable *rt = NULL;
	struct flowi4 fl4;
	__be16 df = 0;
	u8 tos, ttl;
	bool use_cache;

	tun_info = skb_tunnel_info(skb);
	if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) ||
		     ip_tunnel_info_af(tun_info) != AF_INET))
		goto tx_error;
	key = &tun_info->key;
	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
	inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
	tos = key->tos;
	if (tos == 1) {
		if (skb->protocol == htons(ETH_P_IP))
			tos = inner_iph->tos;
		else if (skb->protocol == htons(ETH_P_IPV6))
			tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
	}
	ip_tunnel_init_flow(&fl4, proto, key->u.ipv4.dst, key->u.ipv4.src,
			    tunnel_id_to_key32(key->tun_id), RT_TOS(tos),
			    0, skb->mark, skb_get_hash(skb));
	if (tunnel->encap.type != TUNNEL_ENCAP_NONE)
		goto tx_error;

	use_cache = ip_tunnel_dst_cache_usable(skb, tun_info);
	if (use_cache)
		rt = dst_cache_get_ip4(&tun_info->dst_cache, &fl4.saddr);
	if (!rt) {
		rt = ip_route_output_key(tunnel->net, &fl4);
		if (IS_ERR(rt)) {
			dev->stats.tx_carrier_errors++;
			goto tx_error;
		}
		if (use_cache)
			dst_cache_set_ip4(&tun_info->dst_cache, &rt->dst,
					  fl4.saddr);
	}
	if (rt->dst.dev == dev) {
		ip_rt_put(rt);
		dev->stats.collisions++;
		goto tx_error;
	}

	if (key->tun_flags & TUNNEL_DONT_FRAGMENT)
		df = htons(IP_DF);
	if (tnl_update_pmtu(dev, skb, rt, df, inner_iph, tunnel_hlen,
			    key->u.ipv4.dst, true)) {
		ip_rt_put(rt);
		goto tx_error;
	}

	tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
	ttl = key->ttl;
	if (ttl == 0) {
		if (skb->protocol == htons(ETH_P_IP))
			ttl = inner_iph->ttl;
		else if (skb->protocol == htons(ETH_P_IPV6))
			ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
		else
			ttl = ip4_dst_hoplimit(&rt->dst);
	}

	if (!df && skb->protocol == htons(ETH_P_IP))
		df = inner_iph->frag_off & htons(IP_DF);

	headroom += LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len;
	if (headroom > dev->needed_headroom)
		dev->needed_headroom = headroom;

	if (skb_cow_head(skb, dev->needed_headroom)) {
		ip_rt_put(rt);
		goto tx_dropped;
	}
	iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, proto, tos, ttl,
		      df, !net_eq(tunnel->net, dev_net(dev)));
	return;
tx_error:
	dev->stats.tx_errors++;
	goto kfree;
tx_dropped:
	dev->stats.tx_dropped++;
kfree:
	kfree_skb(skb);
}
EXPORT_SYMBOL_GPL(ip_md_tunnel_xmit);
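
/* Note on the tos == 1 test above: IP tunnels encode "inherit the TOS
 * from the inner packet" as the otherwise-unused value 1 (bit 0 of the
 * TOS byte), so a metadata key carrying tos == 1 copies the inner IPv4
 * TOS or IPv6 traffic class instead of using a fixed value. The classic
 * (non-metadata) path below applies the same convention via tos & 0x1.
 */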

void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
		    const struct iphdr *tnl_params, u8 protocol)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct ip_tunnel_info *tun_info = NULL;
	const struct iphdr *inner_iph;
	unsigned int max_headroom;	/* The extra header space needed */
	struct rtable *rt = NULL;		/* Route to the other host */
	bool use_cache = false;
	struct flowi4 fl4;
	bool md = false;
	bool connected;
	u8 tos, ttl;
	__be32 dst;
	__be16 df;

	inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
	connected = (tunnel->parms.iph.daddr != 0);

	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));

	dst = tnl_params->daddr;
	if (dst == 0) {
		/* NBMA tunnel */

		if (!skb_dst(skb)) {
			dev->stats.tx_fifo_errors++;
			goto tx_error;
		}

		tun_info = skb_tunnel_info(skb);
		if (tun_info && (tun_info->mode & IP_TUNNEL_INFO_TX) &&
		    ip_tunnel_info_af(tun_info) == AF_INET &&
		    tun_info->key.u.ipv4.dst) {
			dst = tun_info->key.u.ipv4.dst;
			md = true;
			connected = true;
		}
		else if (skb->protocol == htons(ETH_P_IP)) {
			rt = skb_rtable(skb);
			dst = rt_nexthop(rt, inner_iph->daddr);
		}
#if IS_ENABLED(CONFIG_IPV6)
		else if (skb->protocol == htons(ETH_P_IPV6)) {
			const struct in6_addr *addr6;
			struct neighbour *neigh;
			bool do_tx_error_icmp;
			int addr_type;

			neigh = dst_neigh_lookup(skb_dst(skb),
						 &ipv6_hdr(skb)->daddr);
			if (!neigh)
				goto tx_error;

			addr6 = (const struct in6_addr *)&neigh->primary_key;
			addr_type = ipv6_addr_type(addr6);

			if (addr_type == IPV6_ADDR_ANY) {
				addr6 = &ipv6_hdr(skb)->daddr;
				addr_type = ipv6_addr_type(addr6);
			}

			if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
				do_tx_error_icmp = true;
			else {
				do_tx_error_icmp = false;
				dst = addr6->s6_addr32[3];
			}
			neigh_release(neigh);
			if (do_tx_error_icmp)
				goto tx_error_icmp;
		}
#endif
		else
			goto tx_error;

		if (!md)
			connected = false;
	}

	tos = tnl_params->tos;
	if (tos & 0x1) {
		tos &= ~0x1;
		if (skb->protocol == htons(ETH_P_IP)) {
			tos = inner_iph->tos;
			connected = false;
		} else if (skb->protocol == htons(ETH_P_IPV6)) {
			tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
			connected = false;
		}
	}

	ip_tunnel_init_flow(&fl4, protocol, dst, tnl_params->saddr,
			    tunnel->parms.o_key, RT_TOS(tos), tunnel->parms.link,
			    tunnel->fwmark, skb_get_hash(skb));

	if (ip_tunnel_encap(skb, tunnel, &protocol, &fl4) < 0)
		goto tx_error;

	if (connected && md) {
		use_cache = ip_tunnel_dst_cache_usable(skb, tun_info);
		if (use_cache)
			rt = dst_cache_get_ip4(&tun_info->dst_cache,
					       &fl4.saddr);
	} else {
		rt = connected ? dst_cache_get_ip4(&tunnel->dst_cache,
						&fl4.saddr) : NULL;
	}

	if (!rt) {
		rt = ip_route_output_key(tunnel->net, &fl4);

		if (IS_ERR(rt)) {
			dev->stats.tx_carrier_errors++;
			goto tx_error;
		}
		if (use_cache)
			dst_cache_set_ip4(&tun_info->dst_cache, &rt->dst,
					  fl4.saddr);
		else if (!md && connected)
			dst_cache_set_ip4(&tunnel->dst_cache, &rt->dst,
					  fl4.saddr);
	}

	if (rt->dst.dev == dev) {
		ip_rt_put(rt);
		dev->stats.collisions++;
		goto tx_error;
	}

	if (tnl_update_pmtu(dev, skb, rt, tnl_params->frag_off, inner_iph,
			    0, 0, false)) {
		ip_rt_put(rt);
		goto tx_error;
	}

	if (tunnel->err_count > 0) {
		if (time_before(jiffies,
				tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
			tunnel->err_count--;

			dst_link_failure(skb);
		} else
			tunnel->err_count = 0;
	}

	tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
	ttl = tnl_params->ttl;
	if (ttl == 0) {
		if (skb->protocol == htons(ETH_P_IP))
			ttl = inner_iph->ttl;
#if IS_ENABLED(CONFIG_IPV6)
		else if (skb->protocol == htons(ETH_P_IPV6))
			ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
#endif
		else
			ttl = ip4_dst_hoplimit(&rt->dst);
	}

	df = tnl_params->frag_off;
	if (skb->protocol == htons(ETH_P_IP) && !tunnel->ignore_df)
		df |= (inner_iph->frag_off&htons(IP_DF));

	max_headroom = LL_RESERVED_SPACE(rt->dst.dev) + sizeof(struct iphdr)
			+ rt->dst.header_len + ip_encap_hlen(&tunnel->encap);
	if (max_headroom > dev->needed_headroom)
		dev->needed_headroom = max_headroom;

	if (skb_cow_head(skb, dev->needed_headroom)) {
		ip_rt_put(rt);
		dev->stats.tx_dropped++;
		kfree_skb(skb);
		return;
	}

	iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, protocol, tos, ttl,
		      df, !net_eq(tunnel->net, dev_net(dev)));
	return;

#if IS_ENABLED(CONFIG_IPV6)
tx_error_icmp:
	dst_link_failure(skb);
#endif
tx_error:
	dev->stats.tx_errors++;
	kfree_skb(skb);
}
EXPORT_SYMBOL_GPL(ip_tunnel_xmit);
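
/* Route caching above, in short: only "connected" transmissions (fixed
 * destination, fixed TOS) may use tunnel->dst_cache, which is why
 * inheriting the TOS or resolving an NBMA destination clears
 * `connected`. Metadata (md) transmissions instead use the per-flow
 * cache that lives in tun_info->dst_cache.
 */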

static void ip_tunnel_update(struct ip_tunnel_net *itn,
			     struct ip_tunnel *t,
			     struct net_device *dev,
			     struct ip_tunnel_parm *p,
			     bool set_mtu,
			     __u32 fwmark)
{
	ip_tunnel_del(itn, t);
	t->parms.iph.saddr = p->iph.saddr;
	t->parms.iph.daddr = p->iph.daddr;
	t->parms.i_key = p->i_key;
	t->parms.o_key = p->o_key;
	if (dev->type != ARPHRD_ETHER) {
		memcpy(dev->dev_addr, &p->iph.saddr, 4);
		memcpy(dev->broadcast, &p->iph.daddr, 4);
	}
	ip_tunnel_add(itn, t);

	t->parms.iph.ttl = p->iph.ttl;
	t->parms.iph.tos = p->iph.tos;
	t->parms.iph.frag_off = p->iph.frag_off;

	if (t->parms.link != p->link || t->fwmark != fwmark) {
		int mtu;

		t->parms.link = p->link;
		t->fwmark = fwmark;
		mtu = ip_tunnel_bind_dev(dev);
		if (set_mtu)
			dev->mtu = mtu;
	}
	dst_cache_reset(&t->dst_cache);
	netdev_state_change(dev);
}

int ip_tunnel_ioctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd)
{
	int err = 0;
	struct ip_tunnel *t = netdev_priv(dev);
	struct net *net = t->net;
	struct ip_tunnel_net *itn = net_generic(net, t->ip_tnl_net_id);

	switch (cmd) {
	case SIOCGETTUNNEL:
		if (dev == itn->fb_tunnel_dev) {
			t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
			if (!t)
				t = netdev_priv(dev);
		}
		memcpy(p, &t->parms, sizeof(*p));
		break;

	case SIOCADDTUNNEL:
	case SIOCCHGTUNNEL:
		err = -EPERM;
		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
			goto done;
		if (p->iph.ttl)
			p->iph.frag_off |= htons(IP_DF);
		if (!(p->i_flags & VTI_ISVTI)) {
			if (!(p->i_flags & TUNNEL_KEY))
				p->i_key = 0;
			if (!(p->o_flags & TUNNEL_KEY))
				p->o_key = 0;
		}

		t = ip_tunnel_find(itn, p, itn->type);

		if (cmd == SIOCADDTUNNEL) {
			if (!t) {
				t = ip_tunnel_create(net, itn, p);
				err = PTR_ERR_OR_ZERO(t);
				break;
			}

			err = -EEXIST;
			break;
		}
		if (dev != itn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
			if (t) {
				if (t->dev != dev) {
					err = -EEXIST;
					break;
				}
			} else {
				unsigned int nflags = 0;

				if (ipv4_is_multicast(p->iph.daddr))
					nflags = IFF_BROADCAST;
				else if (p->iph.daddr)
					nflags = IFF_POINTOPOINT;

				if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
					err = -EINVAL;
					break;
				}

				t = netdev_priv(dev);
			}
		}

		if (t) {
			err = 0;
			ip_tunnel_update(itn, t, dev, p, true, 0);
		} else {
			err = -ENOENT;
		}
		break;

	case SIOCDELTUNNEL:
		err = -EPERM;
		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
			goto done;

		if (dev == itn->fb_tunnel_dev) {
			err = -ENOENT;
			t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
			if (!t)
				goto done;
			err = -EPERM;
			if (t == netdev_priv(itn->fb_tunnel_dev))
				goto done;
			dev = t->dev;
		}
		unregister_netdevice(dev);
		err = 0;
		break;

	default:
		err = -EINVAL;
	}

done:
	return err;
}
EXPORT_SYMBOL_GPL(ip_tunnel_ioctl);
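
/* A minimal userspace sketch of driving this ioctl, assuming a GRE
 * setup (the interface name "gre0" is the fallback device; addresses
 * are documentation-range examples and error handling is trimmed).
 * This is the legacy path used by `ip tunnel`, as opposed to the
 * rtnl_link_ops path handled further below:
 *
 *	#include <string.h>
 *	#include <sys/ioctl.h>
 *	#include <sys/socket.h>
 *	#include <net/if.h>
 *	#include <netinet/in.h>
 *	#include <arpa/inet.h>
 *	#include <linux/if_tunnel.h>
 *
 *	struct ip_tunnel_parm p = { 0 };
 *	struct ifreq ifr = { 0 };
 *	int fd = socket(AF_INET, SOCK_DGRAM, 0);
 *
 *	strncpy(p.name, "gre1", IFNAMSIZ);		// new tunnel name
 *	p.iph.version = 4;
 *	p.iph.ihl = 5;
 *	p.iph.protocol = IPPROTO_GRE;
 *	p.iph.saddr = inet_addr("192.0.2.1");		// local endpoint
 *	p.iph.daddr = inet_addr("198.51.100.2");	// remote endpoint
 *
 *	strncpy(ifr.ifr_name, "gre0", IFNAMSIZ);	// talk to the fallback dev
 *	ifr.ifr_ifru.ifru_data = (void *)&p;
 *	ioctl(fd, SIOCADDTUNNEL, &ifr);			// needs CAP_NET_ADMIN
 */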

int __ip_tunnel_change_mtu(struct net_device *dev, int new_mtu, bool strict)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	int t_hlen = tunnel->hlen + sizeof(struct iphdr);
	int max_mtu = IP_MAX_MTU - dev->hard_header_len - t_hlen;

	if (new_mtu < ETH_MIN_MTU)
		return -EINVAL;

	if (new_mtu > max_mtu) {
		if (strict)
			return -EINVAL;

		new_mtu = max_mtu;
	}

	dev->mtu = new_mtu;
	return 0;
}
EXPORT_SYMBOL_GPL(__ip_tunnel_change_mtu);

int ip_tunnel_change_mtu(struct net_device *dev, int new_mtu)
{
	return __ip_tunnel_change_mtu(dev, new_mtu, true);
}
EXPORT_SYMBOL_GPL(ip_tunnel_change_mtu);
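
/* Example bounds (illustrative): for a plain GRE device t_hlen is 24,
 * so with hard_header_len == 0 the ceiling is 65535 - 24 = 65511. The
 * strict variant used for ndo_change_mtu rejects anything above that,
 * while the netlink-driven setup in ip_tunnel_newlink() below clamps
 * the requested value instead.
 */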

static void ip_tunnel_dev_free(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);

	gro_cells_destroy(&tunnel->gro_cells);
	dst_cache_destroy(&tunnel->dst_cache);
	free_percpu(dev->tstats);
}

void ip_tunnel_dellink(struct net_device *dev, struct list_head *head)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct ip_tunnel_net *itn;

	itn = net_generic(tunnel->net, tunnel->ip_tnl_net_id);

	if (itn->fb_tunnel_dev != dev) {
		ip_tunnel_del(itn, netdev_priv(dev));
		unregister_netdevice_queue(dev, head);
	}
}
EXPORT_SYMBOL_GPL(ip_tunnel_dellink);

struct net *ip_tunnel_get_link_net(const struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);

	return tunnel->net;
}
EXPORT_SYMBOL(ip_tunnel_get_link_net);

int ip_tunnel_get_iflink(const struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);

	return tunnel->parms.link;
}
EXPORT_SYMBOL(ip_tunnel_get_iflink);

int ip_tunnel_init_net(struct net *net, unsigned int ip_tnl_net_id,
				  struct rtnl_link_ops *ops, char *devname)
{
	struct ip_tunnel_net *itn = net_generic(net, ip_tnl_net_id);
	struct ip_tunnel_parm parms;
	unsigned int i;

	itn->rtnl_link_ops = ops;
	for (i = 0; i < IP_TNL_HASH_SIZE; i++)
		INIT_HLIST_HEAD(&itn->tunnels[i]);

	if (!ops || !net_has_fallback_tunnels(net)) {
		struct ip_tunnel_net *it_init_net;

		it_init_net = net_generic(&init_net, ip_tnl_net_id);
		itn->type = it_init_net->type;
		itn->fb_tunnel_dev = NULL;
		return 0;
	}

	memset(&parms, 0, sizeof(parms));
	if (devname)
		strlcpy(parms.name, devname, IFNAMSIZ);

	rtnl_lock();
	itn->fb_tunnel_dev = __ip_tunnel_create(net, ops, &parms);
	/* FB netdevice is special: we have one, and only one per netns.
	 * Allowing it to be moved to another netns is clearly unsafe.
	 */
	if (!IS_ERR(itn->fb_tunnel_dev)) {
		itn->fb_tunnel_dev->features |= NETIF_F_NETNS_LOCAL;
		itn->fb_tunnel_dev->mtu = ip_tunnel_bind_dev(itn->fb_tunnel_dev);
		ip_tunnel_add(itn, netdev_priv(itn->fb_tunnel_dev));
		itn->type = itn->fb_tunnel_dev->type;
	}
	rtnl_unlock();

	return PTR_ERR_OR_ZERO(itn->fb_tunnel_dev);
}
EXPORT_SYMBOL_GPL(ip_tunnel_init_net);
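
/* Usage sketch, loosely following what net/ipv4/ip_gre.c does (names
 * are illustrative; see the real drivers for the authoritative wiring):
 *
 *	static unsigned int my_net_id __read_mostly;
 *
 *	static int __net_init my_init_net(struct net *net)
 *	{
 *		// One hash table plus one fallback device per netns.
 *		return ip_tunnel_init_net(net, my_net_id,
 *					  &my_link_ops, "mytun0");
 *	}
 *
 *	static void __net_exit my_exit_batch_net(struct list_head *list)
 *	{
 *		ip_tunnel_delete_nets(list, my_net_id, &my_link_ops);
 *	}
 *
 *	static struct pernet_operations my_net_ops = {
 *		.init		= my_init_net,
 *		.exit_batch	= my_exit_batch_net,
 *		.id		= &my_net_id,
 *		.size		= sizeof(struct ip_tunnel_net),
 *	};
 */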

static void ip_tunnel_destroy(struct net *net, struct ip_tunnel_net *itn,
			      struct list_head *head,
			      struct rtnl_link_ops *ops)
{
	struct net_device *dev, *aux;
	int h;

	for_each_netdev_safe(net, dev, aux)
		if (dev->rtnl_link_ops == ops)
			unregister_netdevice_queue(dev, head);

	for (h = 0; h < IP_TNL_HASH_SIZE; h++) {
		struct ip_tunnel *t;
		struct hlist_node *n;
		struct hlist_head *thead = &itn->tunnels[h];

		hlist_for_each_entry_safe(t, n, thead, hash_node)
			/* If dev is in the same netns, it has already
			 * been added to the list by the previous loop.
			 */
			if (!net_eq(dev_net(t->dev), net))
				unregister_netdevice_queue(t->dev, head);
	}
}

void ip_tunnel_delete_nets(struct list_head *net_list, unsigned int id,
			   struct rtnl_link_ops *ops)
{
	struct ip_tunnel_net *itn;
	struct net *net;
	LIST_HEAD(list);

	rtnl_lock();
	list_for_each_entry(net, net_list, exit_list) {
		itn = net_generic(net, id);
		ip_tunnel_destroy(net, itn, &list, ops);
	}
	unregister_netdevice_many(&list);
	rtnl_unlock();
}
EXPORT_SYMBOL_GPL(ip_tunnel_delete_nets);

int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[],
		      struct ip_tunnel_parm *p, __u32 fwmark)
{
	struct ip_tunnel *nt;
	struct net *net = dev_net(dev);
	struct ip_tunnel_net *itn;
	int mtu;
	int err;

	nt = netdev_priv(dev);
	itn = net_generic(net, nt->ip_tnl_net_id);

	if (nt->collect_md) {
		if (rtnl_dereference(itn->collect_md_tun))
			return -EEXIST;
	} else {
		if (ip_tunnel_find(itn, p, dev->type))
			return -EEXIST;
	}

	nt->net = net;
	nt->parms = *p;
	nt->fwmark = fwmark;
	err = register_netdevice(dev);
	if (err)
		goto err_register_netdevice;

	if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
		eth_hw_addr_random(dev);

	mtu = ip_tunnel_bind_dev(dev);
	if (tb[IFLA_MTU]) {
		unsigned int max = IP_MAX_MTU - dev->hard_header_len - nt->hlen;

		mtu = clamp(dev->mtu, (unsigned int)ETH_MIN_MTU,
			    (unsigned int)(max - sizeof(struct iphdr)));
	}

	err = dev_set_mtu(dev, mtu);
	if (err)
		goto err_dev_set_mtu;

	ip_tunnel_add(itn, nt);
	return 0;

err_dev_set_mtu:
	unregister_netdevice(dev);
err_register_netdevice:
	return err;
}
EXPORT_SYMBOL_GPL(ip_tunnel_newlink);

int ip_tunnel_changelink(struct net_device *dev, struct nlattr *tb[],
			 struct ip_tunnel_parm *p, __u32 fwmark)
{
	struct ip_tunnel *t;
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct net *net = tunnel->net;
	struct ip_tunnel_net *itn = net_generic(net, tunnel->ip_tnl_net_id);

	if (dev == itn->fb_tunnel_dev)
		return -EINVAL;

	t = ip_tunnel_find(itn, p, dev->type);

	if (t) {
		if (t->dev != dev)
			return -EEXIST;
	} else {
		t = tunnel;

		if (dev->type != ARPHRD_ETHER) {
			unsigned int nflags = 0;

			if (ipv4_is_multicast(p->iph.daddr))
				nflags = IFF_BROADCAST;
			else if (p->iph.daddr)
				nflags = IFF_POINTOPOINT;

			if ((dev->flags ^ nflags) &
			    (IFF_POINTOPOINT | IFF_BROADCAST))
				return -EINVAL;
		}
	}

	ip_tunnel_update(itn, t, dev, p, !tb[IFLA_MTU], fwmark);
	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_changelink);

int ip_tunnel_init(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct iphdr *iph = &tunnel->parms.iph;
	int err;

	dev->needs_free_netdev = true;
	dev->priv_destructor = ip_tunnel_dev_free;
	dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
	if (!dev->tstats)
		return -ENOMEM;

	err = dst_cache_init(&tunnel->dst_cache, GFP_KERNEL);
	if (err) {
		free_percpu(dev->tstats);
		return err;
	}

	err = gro_cells_init(&tunnel->gro_cells, dev);
	if (err) {
		dst_cache_destroy(&tunnel->dst_cache);
		free_percpu(dev->tstats);
		return err;
	}

	tunnel->dev = dev;
	tunnel->net = dev_net(dev);
	strcpy(tunnel->parms.name, dev->name);
	iph->version		= 4;
	iph->ihl		= 5;

	if (tunnel->collect_md) {
		dev->features |= NETIF_F_NETNS_LOCAL;
		netif_keep_dst(dev);
	}
	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_init);
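
/* Driver wiring sketch (abridged from how net/ipv4/ip_gre.c hooks these
 * helpers up; treat the exact field set as illustrative):
 *
 *	static const struct net_device_ops my_netdev_ops = {
 *		.ndo_init	= ip_tunnel_init,
 *		.ndo_uninit	= ip_tunnel_uninit,
 *		.ndo_start_xmit	= my_xmit,	// wraps ip_tunnel_xmit()
 *		.ndo_change_mtu	= ip_tunnel_change_mtu,
 *		.ndo_get_stats64 = ip_tunnel_get_stats64,
 *		.ndo_get_iflink	= ip_tunnel_get_iflink,
 *	};
 */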

void ip_tunnel_uninit(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct net *net = tunnel->net;
	struct ip_tunnel_net *itn;

	itn = net_generic(net, tunnel->ip_tnl_net_id);
	/* fb_tunnel_dev will be unregistered in the net-exit call. */
	if (itn->fb_tunnel_dev != dev)
		ip_tunnel_del(itn, netdev_priv(dev));

	dst_cache_reset(&tunnel->dst_cache);
}
EXPORT_SYMBOL_GPL(ip_tunnel_uninit);

/* Do the least required initialization; the rest of the init is done
 * in the tunnel_init call.
 */
void ip_tunnel_setup(struct net_device *dev, unsigned int net_id)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);

	tunnel->ip_tnl_net_id = net_id;
}
EXPORT_SYMBOL_GPL(ip_tunnel_setup);

MODULE_LICENSE("GPL");