Merge tag 'kbuild-misc-v4.16' of git://git.kernel.org/pub/scm/linux/kernel/git/masahi...
[sfrench/cifs-2.6.git] / net / ipv4 / ip_tunnel.c
1 /*
2  * Copyright (c) 2013 Nicira, Inc.
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of version 2 of the GNU General Public
6  * License as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope that it will be useful, but
9  * WITHOUT ANY WARRANTY; without even the implied warranty of
10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11  * General Public License for more details.
12  *
13  * You should have received a copy of the GNU General Public License
14  * along with this program; if not, write to the Free Software
15  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
16  * 02110-1301, USA
17  */
18
19 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
20
21 #include <linux/capability.h>
22 #include <linux/module.h>
23 #include <linux/types.h>
24 #include <linux/kernel.h>
25 #include <linux/slab.h>
26 #include <linux/uaccess.h>
27 #include <linux/skbuff.h>
28 #include <linux/netdevice.h>
29 #include <linux/in.h>
30 #include <linux/tcp.h>
31 #include <linux/udp.h>
32 #include <linux/if_arp.h>
33 #include <linux/init.h>
34 #include <linux/in6.h>
35 #include <linux/inetdevice.h>
36 #include <linux/igmp.h>
37 #include <linux/netfilter_ipv4.h>
38 #include <linux/etherdevice.h>
39 #include <linux/if_ether.h>
40 #include <linux/if_vlan.h>
41 #include <linux/rculist.h>
42 #include <linux/err.h>
43
44 #include <net/sock.h>
45 #include <net/ip.h>
46 #include <net/icmp.h>
47 #include <net/protocol.h>
48 #include <net/ip_tunnels.h>
49 #include <net/arp.h>
50 #include <net/checksum.h>
51 #include <net/dsfield.h>
52 #include <net/inet_ecn.h>
53 #include <net/xfrm.h>
54 #include <net/net_namespace.h>
55 #include <net/netns/generic.h>
56 #include <net/rtnetlink.h>
57 #include <net/udp.h>
58 #include <net/dst_metadata.h>
59
60 #if IS_ENABLED(CONFIG_IPV6)
61 #include <net/ipv6.h>
62 #include <net/ip6_fib.h>
63 #include <net/ip6_route.h>
64 #endif
65
66 static unsigned int ip_tunnel_hash(__be32 key, __be32 remote)
67 {
68         return hash_32((__force u32)key ^ (__force u32)remote,
69                          IP_TNL_HASH_BITS);
70 }
71
72 static bool ip_tunnel_key_match(const struct ip_tunnel_parm *p,
73                                 __be16 flags, __be32 key)
74 {
75         if (p->i_flags & TUNNEL_KEY) {
76                 if (flags & TUNNEL_KEY)
77                         return key == p->i_key;
78                 else
79                         /* key expected, none present */
80                         return false;
81         } else
82                 return !(flags & TUNNEL_KEY);
83 }
84
/* Fallback tunnel: no source, no destination, no key, no options

   Tunnel hash table:
   We require an exact key match, i.e. if a key is present in the packet
   it will match only a tunnel with the same key; if it is not present,
   it will match only a keyless tunnel.

   All keyless packets that do not match a configured keyless tunnel
   will match the fallback tunnel.
   Given src, dst and key, find the appropriate tunnel for input.
*/
96 struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn,
97                                    int link, __be16 flags,
98                                    __be32 remote, __be32 local,
99                                    __be32 key)
100 {
101         unsigned int hash;
102         struct ip_tunnel *t, *cand = NULL;
103         struct hlist_head *head;
104
105         hash = ip_tunnel_hash(key, remote);
106         head = &itn->tunnels[hash];
107
108         hlist_for_each_entry_rcu(t, head, hash_node) {
109                 if (local != t->parms.iph.saddr ||
110                     remote != t->parms.iph.daddr ||
111                     !(t->dev->flags & IFF_UP))
112                         continue;
113
114                 if (!ip_tunnel_key_match(&t->parms, flags, key))
115                         continue;
116
117                 if (t->parms.link == link)
118                         return t;
119                 else
120                         cand = t;
121         }
122
123         hlist_for_each_entry_rcu(t, head, hash_node) {
124                 if (remote != t->parms.iph.daddr ||
125                     t->parms.iph.saddr != 0 ||
126                     !(t->dev->flags & IFF_UP))
127                         continue;
128
129                 if (!ip_tunnel_key_match(&t->parms, flags, key))
130                         continue;
131
132                 if (t->parms.link == link)
133                         return t;
134                 else if (!cand)
135                         cand = t;
136         }
137
138         hash = ip_tunnel_hash(key, 0);
139         head = &itn->tunnels[hash];
140
141         hlist_for_each_entry_rcu(t, head, hash_node) {
142                 if ((local != t->parms.iph.saddr || t->parms.iph.daddr != 0) &&
143                     (local != t->parms.iph.daddr || !ipv4_is_multicast(local)))
144                         continue;
145
146                 if (!(t->dev->flags & IFF_UP))
147                         continue;
148
149                 if (!ip_tunnel_key_match(&t->parms, flags, key))
150                         continue;
151
152                 if (t->parms.link == link)
153                         return t;
154                 else if (!cand)
155                         cand = t;
156         }
157
158         if (flags & TUNNEL_NO_KEY)
159                 goto skip_key_lookup;
160
161         hlist_for_each_entry_rcu(t, head, hash_node) {
162                 if (t->parms.i_key != key ||
163                     t->parms.iph.saddr != 0 ||
164                     t->parms.iph.daddr != 0 ||
165                     !(t->dev->flags & IFF_UP))
166                         continue;
167
168                 if (t->parms.link == link)
169                         return t;
170                 else if (!cand)
171                         cand = t;
172         }
173
174 skip_key_lookup:
175         if (cand)
176                 return cand;
177
178         t = rcu_dereference(itn->collect_md_tun);
179         if (t && t->dev->flags & IFF_UP)
180                 return t;
181
182         if (itn->fb_tunnel_dev && itn->fb_tunnel_dev->flags & IFF_UP)
183                 return netdev_priv(itn->fb_tunnel_dev);
184
185         return NULL;
186 }
187 EXPORT_SYMBOL_GPL(ip_tunnel_lookup);
188
189 static struct hlist_head *ip_bucket(struct ip_tunnel_net *itn,
190                                     struct ip_tunnel_parm *parms)
191 {
192         unsigned int h;
193         __be32 remote;
194         __be32 i_key = parms->i_key;
195
196         if (parms->iph.daddr && !ipv4_is_multicast(parms->iph.daddr))
197                 remote = parms->iph.daddr;
198         else
199                 remote = 0;
200
201         if (!(parms->i_flags & TUNNEL_KEY) && (parms->i_flags & VTI_ISVTI))
202                 i_key = 0;
203
204         h = ip_tunnel_hash(i_key, remote);
205         return &itn->tunnels[h];
206 }
207
208 static void ip_tunnel_add(struct ip_tunnel_net *itn, struct ip_tunnel *t)
209 {
210         struct hlist_head *head = ip_bucket(itn, &t->parms);
211
212         if (t->collect_md)
213                 rcu_assign_pointer(itn->collect_md_tun, t);
214         hlist_add_head_rcu(&t->hash_node, head);
215 }
216
217 static void ip_tunnel_del(struct ip_tunnel_net *itn, struct ip_tunnel *t)
218 {
219         if (t->collect_md)
220                 rcu_assign_pointer(itn->collect_md_tun, NULL);
221         hlist_del_init_rcu(&t->hash_node);
222 }
223
224 static struct ip_tunnel *ip_tunnel_find(struct ip_tunnel_net *itn,
225                                         struct ip_tunnel_parm *parms,
226                                         int type)
227 {
228         __be32 remote = parms->iph.daddr;
229         __be32 local = parms->iph.saddr;
230         __be32 key = parms->i_key;
231         __be16 flags = parms->i_flags;
232         int link = parms->link;
233         struct ip_tunnel *t = NULL;
234         struct hlist_head *head = ip_bucket(itn, parms);
235
236         hlist_for_each_entry_rcu(t, head, hash_node) {
237                 if (local == t->parms.iph.saddr &&
238                     remote == t->parms.iph.daddr &&
239                     link == t->parms.link &&
240                     type == t->dev->type &&
241                     ip_tunnel_key_match(&t->parms, flags, key))
242                         break;
243         }
244         return t;
245 }
246
247 static struct net_device *__ip_tunnel_create(struct net *net,
248                                              const struct rtnl_link_ops *ops,
249                                              struct ip_tunnel_parm *parms)
250 {
251         int err;
252         struct ip_tunnel *tunnel;
253         struct net_device *dev;
254         char name[IFNAMSIZ];
255
256         if (parms->name[0])
257                 strlcpy(name, parms->name, IFNAMSIZ);
258         else {
259                 if (strlen(ops->kind) > (IFNAMSIZ - 3)) {
260                         err = -E2BIG;
261                         goto failed;
262                 }
263                 strlcpy(name, ops->kind, IFNAMSIZ);
264                 strncat(name, "%d", 2);
265         }
266
267         ASSERT_RTNL();
268         dev = alloc_netdev(ops->priv_size, name, NET_NAME_UNKNOWN, ops->setup);
269         if (!dev) {
270                 err = -ENOMEM;
271                 goto failed;
272         }
273         dev_net_set(dev, net);
274
275         dev->rtnl_link_ops = ops;
276
277         tunnel = netdev_priv(dev);
278         tunnel->parms = *parms;
279         tunnel->net = net;
280
281         err = register_netdevice(dev);
282         if (err)
283                 goto failed_free;
284
285         return dev;
286
287 failed_free:
288         free_netdev(dev);
289 failed:
290         return ERR_PTR(err);
291 }
292
293 static inline void init_tunnel_flow(struct flowi4 *fl4,
294                                     int proto,
295                                     __be32 daddr, __be32 saddr,
296                                     __be32 key, __u8 tos, int oif,
297                                     __u32 mark)
298 {
299         memset(fl4, 0, sizeof(*fl4));
300         fl4->flowi4_oif = oif;
301         fl4->daddr = daddr;
302         fl4->saddr = saddr;
303         fl4->flowi4_tos = tos;
304         fl4->flowi4_proto = proto;
305         fl4->fl4_gre_key = key;
306         fl4->flowi4_mark = mark;
307 }
308
309 static int ip_tunnel_bind_dev(struct net_device *dev)
310 {
311         struct net_device *tdev = NULL;
312         struct ip_tunnel *tunnel = netdev_priv(dev);
313         const struct iphdr *iph;
314         int hlen = LL_MAX_HEADER;
315         int mtu = ETH_DATA_LEN;
316         int t_hlen = tunnel->hlen + sizeof(struct iphdr);
317
318         iph = &tunnel->parms.iph;
319
320         /* Guess output device to choose reasonable mtu and needed_headroom */
321         if (iph->daddr) {
322                 struct flowi4 fl4;
323                 struct rtable *rt;
324
325                 init_tunnel_flow(&fl4, iph->protocol, iph->daddr,
326                                  iph->saddr, tunnel->parms.o_key,
327                                  RT_TOS(iph->tos), tunnel->parms.link,
328                                  tunnel->fwmark);
329                 rt = ip_route_output_key(tunnel->net, &fl4);
330
331                 if (!IS_ERR(rt)) {
332                         tdev = rt->dst.dev;
333                         ip_rt_put(rt);
334                 }
335                 if (dev->type != ARPHRD_ETHER)
336                         dev->flags |= IFF_POINTOPOINT;
337
338                 dst_cache_reset(&tunnel->dst_cache);
339         }
340
341         if (!tdev && tunnel->parms.link)
342                 tdev = __dev_get_by_index(tunnel->net, tunnel->parms.link);
343
344         if (tdev) {
345                 hlen = tdev->hard_header_len + tdev->needed_headroom;
346                 mtu = tdev->mtu;
347         }
348
349         dev->needed_headroom = t_hlen + hlen;
350         mtu -= (dev->hard_header_len + t_hlen);
351
352         if (mtu < IPV4_MIN_MTU)
353                 mtu = IPV4_MIN_MTU;
354
355         return mtu;
356 }
357
358 static struct ip_tunnel *ip_tunnel_create(struct net *net,
359                                           struct ip_tunnel_net *itn,
360                                           struct ip_tunnel_parm *parms)
361 {
362         struct ip_tunnel *nt;
363         struct net_device *dev;
364         int t_hlen;
365
366         BUG_ON(!itn->fb_tunnel_dev);
367         dev = __ip_tunnel_create(net, itn->fb_tunnel_dev->rtnl_link_ops, parms);
368         if (IS_ERR(dev))
369                 return ERR_CAST(dev);
370
371         dev->mtu = ip_tunnel_bind_dev(dev);
372
373         nt = netdev_priv(dev);
374         t_hlen = nt->hlen + sizeof(struct iphdr);
375         dev->min_mtu = ETH_MIN_MTU;
376         dev->max_mtu = 0xFFF8 - dev->hard_header_len - t_hlen;
377         ip_tunnel_add(itn, nt);
378         return nt;
379 }
380
381 int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb,
382                   const struct tnl_ptk_info *tpi, struct metadata_dst *tun_dst,
383                   bool log_ecn_error)
384 {
385         struct pcpu_sw_netstats *tstats;
386         const struct iphdr *iph = ip_hdr(skb);
387         int err;
388
389 #ifdef CONFIG_NET_IPGRE_BROADCAST
390         if (ipv4_is_multicast(iph->daddr)) {
391                 tunnel->dev->stats.multicast++;
392                 skb->pkt_type = PACKET_BROADCAST;
393         }
394 #endif
395
396         if ((!(tpi->flags&TUNNEL_CSUM) &&  (tunnel->parms.i_flags&TUNNEL_CSUM)) ||
397              ((tpi->flags&TUNNEL_CSUM) && !(tunnel->parms.i_flags&TUNNEL_CSUM))) {
398                 tunnel->dev->stats.rx_crc_errors++;
399                 tunnel->dev->stats.rx_errors++;
400                 goto drop;
401         }
402
403         if (tunnel->parms.i_flags&TUNNEL_SEQ) {
404                 if (!(tpi->flags&TUNNEL_SEQ) ||
405                     (tunnel->i_seqno && (s32)(ntohl(tpi->seq) - tunnel->i_seqno) < 0)) {
406                         tunnel->dev->stats.rx_fifo_errors++;
407                         tunnel->dev->stats.rx_errors++;
408                         goto drop;
409                 }
410                 tunnel->i_seqno = ntohl(tpi->seq) + 1;
411         }
412
413         skb_reset_network_header(skb);
414
415         err = IP_ECN_decapsulate(iph, skb);
416         if (unlikely(err)) {
417                 if (log_ecn_error)
418                         net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n",
419                                         &iph->saddr, iph->tos);
420                 if (err > 1) {
421                         ++tunnel->dev->stats.rx_frame_errors;
422                         ++tunnel->dev->stats.rx_errors;
423                         goto drop;
424                 }
425         }
426
427         tstats = this_cpu_ptr(tunnel->dev->tstats);
428         u64_stats_update_begin(&tstats->syncp);
429         tstats->rx_packets++;
430         tstats->rx_bytes += skb->len;
431         u64_stats_update_end(&tstats->syncp);
432
433         skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(tunnel->dev)));
434
435         if (tunnel->dev->type == ARPHRD_ETHER) {
436                 skb->protocol = eth_type_trans(skb, tunnel->dev);
437                 skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
438         } else {
439                 skb->dev = tunnel->dev;
440         }
441
442         if (tun_dst)
443                 skb_dst_set(skb, (struct dst_entry *)tun_dst);
444
445         gro_cells_receive(&tunnel->gro_cells, skb);
446         return 0;
447
448 drop:
449         if (tun_dst)
450                 dst_release((struct dst_entry *)tun_dst);
451         kfree_skb(skb);
452         return 0;
453 }
454 EXPORT_SYMBOL_GPL(ip_tunnel_rcv);
455
456 int ip_tunnel_encap_add_ops(const struct ip_tunnel_encap_ops *ops,
457                             unsigned int num)
458 {
459         if (num >= MAX_IPTUN_ENCAP_OPS)
460                 return -ERANGE;
461
462         return !cmpxchg((const struct ip_tunnel_encap_ops **)
463                         &iptun_encaps[num],
464                         NULL, ops) ? 0 : -1;
465 }
466 EXPORT_SYMBOL(ip_tunnel_encap_add_ops);
467
468 int ip_tunnel_encap_del_ops(const struct ip_tunnel_encap_ops *ops,
469                             unsigned int num)
470 {
471         int ret;
472
473         if (num >= MAX_IPTUN_ENCAP_OPS)
474                 return -ERANGE;
475
476         ret = (cmpxchg((const struct ip_tunnel_encap_ops **)
477                        &iptun_encaps[num],
478                        ops, NULL) == ops) ? 0 : -1;
479
480         synchronize_net();
481
482         return ret;
483 }
484 EXPORT_SYMBOL(ip_tunnel_encap_del_ops);
485
486 int ip_tunnel_encap_setup(struct ip_tunnel *t,
487                           struct ip_tunnel_encap *ipencap)
488 {
489         int hlen;
490
491         memset(&t->encap, 0, sizeof(t->encap));
492
493         hlen = ip_encap_hlen(ipencap);
494         if (hlen < 0)
495                 return hlen;
496
497         t->encap.type = ipencap->type;
498         t->encap.sport = ipencap->sport;
499         t->encap.dport = ipencap->dport;
500         t->encap.flags = ipencap->flags;
501
502         t->encap_hlen = hlen;
503         t->hlen = t->encap_hlen + t->tun_hlen;
504
505         return 0;
506 }
507 EXPORT_SYMBOL_GPL(ip_tunnel_encap_setup);
508
509 static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb,
510                             struct rtable *rt, __be16 df,
511                             const struct iphdr *inner_iph)
512 {
513         struct ip_tunnel *tunnel = netdev_priv(dev);
514         int pkt_size = skb->len - tunnel->hlen - dev->hard_header_len;
515         int mtu;
516
517         if (df)
518                 mtu = dst_mtu(&rt->dst) - dev->hard_header_len
519                                         - sizeof(struct iphdr) - tunnel->hlen;
520         else
521                 mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;
522
523         skb_dst_update_pmtu(skb, mtu);
524
525         if (skb->protocol == htons(ETH_P_IP)) {
526                 if (!skb_is_gso(skb) &&
527                     (inner_iph->frag_off & htons(IP_DF)) &&
528                     mtu < pkt_size) {
529                         memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
530                         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
531                         return -E2BIG;
532                 }
533         }
534 #if IS_ENABLED(CONFIG_IPV6)
535         else if (skb->protocol == htons(ETH_P_IPV6)) {
536                 struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb);
537
538                 if (rt6 && mtu < dst_mtu(skb_dst(skb)) &&
539                            mtu >= IPV6_MIN_MTU) {
540                         if ((tunnel->parms.iph.daddr &&
541                             !ipv4_is_multicast(tunnel->parms.iph.daddr)) ||
542                             rt6->rt6i_dst.plen == 128) {
543                                 rt6->rt6i_flags |= RTF_MODIFIED;
544                                 dst_metric_set(skb_dst(skb), RTAX_MTU, mtu);
545                         }
546                 }
547
548                 if (!skb_is_gso(skb) && mtu >= IPV6_MIN_MTU &&
549                                         mtu < pkt_size) {
550                         icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
551                         return -E2BIG;
552                 }
553         }
554 #endif
555         return 0;
556 }
557
558 void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, u8 proto)
559 {
560         struct ip_tunnel *tunnel = netdev_priv(dev);
561         u32 headroom = sizeof(struct iphdr);
562         struct ip_tunnel_info *tun_info;
563         const struct ip_tunnel_key *key;
564         const struct iphdr *inner_iph;
565         struct rtable *rt;
566         struct flowi4 fl4;
567         __be16 df = 0;
568         u8 tos, ttl;
569
570         tun_info = skb_tunnel_info(skb);
571         if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) ||
572                      ip_tunnel_info_af(tun_info) != AF_INET))
573                 goto tx_error;
574         key = &tun_info->key;
575         memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
576         inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
577         tos = key->tos;
578         if (tos == 1) {
579                 if (skb->protocol == htons(ETH_P_IP))
580                         tos = inner_iph->tos;
581                 else if (skb->protocol == htons(ETH_P_IPV6))
582                         tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
583         }
584         init_tunnel_flow(&fl4, proto, key->u.ipv4.dst, key->u.ipv4.src, 0,
585                          RT_TOS(tos), tunnel->parms.link, tunnel->fwmark);
586         if (tunnel->encap.type != TUNNEL_ENCAP_NONE)
587                 goto tx_error;
588         rt = ip_route_output_key(tunnel->net, &fl4);
589         if (IS_ERR(rt)) {
590                 dev->stats.tx_carrier_errors++;
591                 goto tx_error;
592         }
593         if (rt->dst.dev == dev) {
594                 ip_rt_put(rt);
595                 dev->stats.collisions++;
596                 goto tx_error;
597         }
598         tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
599         ttl = key->ttl;
600         if (ttl == 0) {
601                 if (skb->protocol == htons(ETH_P_IP))
602                         ttl = inner_iph->ttl;
603                 else if (skb->protocol == htons(ETH_P_IPV6))
604                         ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
605                 else
606                         ttl = ip4_dst_hoplimit(&rt->dst);
607         }
608         if (key->tun_flags & TUNNEL_DONT_FRAGMENT)
609                 df = htons(IP_DF);
610         else if (skb->protocol == htons(ETH_P_IP))
611                 df = inner_iph->frag_off & htons(IP_DF);
612         headroom += LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len;
613         if (headroom > dev->needed_headroom)
614                 dev->needed_headroom = headroom;
615
616         if (skb_cow_head(skb, dev->needed_headroom)) {
617                 ip_rt_put(rt);
618                 goto tx_dropped;
619         }
620         iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, proto, tos, ttl,
621                       df, !net_eq(tunnel->net, dev_net(dev)));
622         return;
623 tx_error:
624         dev->stats.tx_errors++;
625         goto kfree;
626 tx_dropped:
627         dev->stats.tx_dropped++;
628 kfree:
629         kfree_skb(skb);
630 }
631 EXPORT_SYMBOL_GPL(ip_md_tunnel_xmit);
632
633 void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
634                     const struct iphdr *tnl_params, u8 protocol)
635 {
636         struct ip_tunnel *tunnel = netdev_priv(dev);
637         const struct iphdr *inner_iph;
638         struct flowi4 fl4;
639         u8     tos, ttl;
640         __be16 df;
641         struct rtable *rt;              /* Route to the other host */
642         unsigned int max_headroom;      /* The extra header space needed */
643         __be32 dst;
644         bool connected;
645
646         inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
647         connected = (tunnel->parms.iph.daddr != 0);
648
649         memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
650
651         dst = tnl_params->daddr;
652         if (dst == 0) {
653                 /* NBMA tunnel */
654
655                 if (!skb_dst(skb)) {
656                         dev->stats.tx_fifo_errors++;
657                         goto tx_error;
658                 }
659
660                 if (skb->protocol == htons(ETH_P_IP)) {
661                         rt = skb_rtable(skb);
662                         dst = rt_nexthop(rt, inner_iph->daddr);
663                 }
664 #if IS_ENABLED(CONFIG_IPV6)
665                 else if (skb->protocol == htons(ETH_P_IPV6)) {
666                         const struct in6_addr *addr6;
667                         struct neighbour *neigh;
668                         bool do_tx_error_icmp;
669                         int addr_type;
670
671                         neigh = dst_neigh_lookup(skb_dst(skb),
672                                                  &ipv6_hdr(skb)->daddr);
673                         if (!neigh)
674                                 goto tx_error;
675
676                         addr6 = (const struct in6_addr *)&neigh->primary_key;
677                         addr_type = ipv6_addr_type(addr6);
678
679                         if (addr_type == IPV6_ADDR_ANY) {
680                                 addr6 = &ipv6_hdr(skb)->daddr;
681                                 addr_type = ipv6_addr_type(addr6);
682                         }
683
684                         if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
685                                 do_tx_error_icmp = true;
686                         else {
687                                 do_tx_error_icmp = false;
688                                 dst = addr6->s6_addr32[3];
689                         }
690                         neigh_release(neigh);
691                         if (do_tx_error_icmp)
692                                 goto tx_error_icmp;
693                 }
694 #endif
695                 else
696                         goto tx_error;
697
698                 connected = false;
699         }
700
701         tos = tnl_params->tos;
702         if (tos & 0x1) {
703                 tos &= ~0x1;
704                 if (skb->protocol == htons(ETH_P_IP)) {
705                         tos = inner_iph->tos;
706                         connected = false;
707                 } else if (skb->protocol == htons(ETH_P_IPV6)) {
708                         tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
709                         connected = false;
710                 }
711         }
712
713         if (tunnel->fwmark) {
714                 init_tunnel_flow(&fl4, protocol, dst, tnl_params->saddr,
715                                  tunnel->parms.o_key, RT_TOS(tos), tunnel->parms.link,
716                                  tunnel->fwmark);
717         }
718         else {
719                 init_tunnel_flow(&fl4, protocol, dst, tnl_params->saddr,
720                                  tunnel->parms.o_key, RT_TOS(tos), tunnel->parms.link,
721                                  skb->mark);
722         }
723
724         if (ip_tunnel_encap(skb, tunnel, &protocol, &fl4) < 0)
725                 goto tx_error;
726
727         rt = connected ? dst_cache_get_ip4(&tunnel->dst_cache, &fl4.saddr) :
728                          NULL;
729
730         if (!rt) {
731                 rt = ip_route_output_key(tunnel->net, &fl4);
732
733                 if (IS_ERR(rt)) {
734                         dev->stats.tx_carrier_errors++;
735                         goto tx_error;
736                 }
737                 if (connected)
738                         dst_cache_set_ip4(&tunnel->dst_cache, &rt->dst,
739                                           fl4.saddr);
740         }
741
742         if (rt->dst.dev == dev) {
743                 ip_rt_put(rt);
744                 dev->stats.collisions++;
745                 goto tx_error;
746         }
747
748         if (tnl_update_pmtu(dev, skb, rt, tnl_params->frag_off, inner_iph)) {
749                 ip_rt_put(rt);
750                 goto tx_error;
751         }
752
753         if (tunnel->err_count > 0) {
754                 if (time_before(jiffies,
755                                 tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
756                         tunnel->err_count--;
757
758                         dst_link_failure(skb);
759                 } else
760                         tunnel->err_count = 0;
761         }
762
763         tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
764         ttl = tnl_params->ttl;
765         if (ttl == 0) {
766                 if (skb->protocol == htons(ETH_P_IP))
767                         ttl = inner_iph->ttl;
768 #if IS_ENABLED(CONFIG_IPV6)
769                 else if (skb->protocol == htons(ETH_P_IPV6))
770                         ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
771 #endif
772                 else
773                         ttl = ip4_dst_hoplimit(&rt->dst);
774         }
775
776         df = tnl_params->frag_off;
777         if (skb->protocol == htons(ETH_P_IP) && !tunnel->ignore_df)
778                 df |= (inner_iph->frag_off&htons(IP_DF));
779
780         max_headroom = LL_RESERVED_SPACE(rt->dst.dev) + sizeof(struct iphdr)
781                         + rt->dst.header_len + ip_encap_hlen(&tunnel->encap);
782         if (max_headroom > dev->needed_headroom)
783                 dev->needed_headroom = max_headroom;
784
785         if (skb_cow_head(skb, dev->needed_headroom)) {
786                 ip_rt_put(rt);
787                 dev->stats.tx_dropped++;
788                 kfree_skb(skb);
789                 return;
790         }
791
792         iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, protocol, tos, ttl,
793                       df, !net_eq(tunnel->net, dev_net(dev)));
794         return;
795
796 #if IS_ENABLED(CONFIG_IPV6)
797 tx_error_icmp:
798         dst_link_failure(skb);
799 #endif
800 tx_error:
801         dev->stats.tx_errors++;
802         kfree_skb(skb);
803 }
804 EXPORT_SYMBOL_GPL(ip_tunnel_xmit);
805
806 static void ip_tunnel_update(struct ip_tunnel_net *itn,
807                              struct ip_tunnel *t,
808                              struct net_device *dev,
809                              struct ip_tunnel_parm *p,
810                              bool set_mtu,
811                              __u32 fwmark)
812 {
813         ip_tunnel_del(itn, t);
814         t->parms.iph.saddr = p->iph.saddr;
815         t->parms.iph.daddr = p->iph.daddr;
816         t->parms.i_key = p->i_key;
817         t->parms.o_key = p->o_key;
818         if (dev->type != ARPHRD_ETHER) {
819                 memcpy(dev->dev_addr, &p->iph.saddr, 4);
820                 memcpy(dev->broadcast, &p->iph.daddr, 4);
821         }
822         ip_tunnel_add(itn, t);
823
824         t->parms.iph.ttl = p->iph.ttl;
825         t->parms.iph.tos = p->iph.tos;
826         t->parms.iph.frag_off = p->iph.frag_off;
827
828         if (t->parms.link != p->link || t->fwmark != fwmark) {
829                 int mtu;
830
831                 t->parms.link = p->link;
832                 t->fwmark = fwmark;
833                 mtu = ip_tunnel_bind_dev(dev);
834                 if (set_mtu)
835                         dev->mtu = mtu;
836         }
837         dst_cache_reset(&t->dst_cache);
838         netdev_state_change(dev);
839 }
840
841 int ip_tunnel_ioctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd)
842 {
843         int err = 0;
844         struct ip_tunnel *t = netdev_priv(dev);
845         struct net *net = t->net;
846         struct ip_tunnel_net *itn = net_generic(net, t->ip_tnl_net_id);
847
848         BUG_ON(!itn->fb_tunnel_dev);
849         switch (cmd) {
850         case SIOCGETTUNNEL:
851                 if (dev == itn->fb_tunnel_dev) {
852                         t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
853                         if (!t)
854                                 t = netdev_priv(dev);
855                 }
856                 memcpy(p, &t->parms, sizeof(*p));
857                 break;
858
859         case SIOCADDTUNNEL:
860         case SIOCCHGTUNNEL:
861                 err = -EPERM;
862                 if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
863                         goto done;
864                 if (p->iph.ttl)
865                         p->iph.frag_off |= htons(IP_DF);
866                 if (!(p->i_flags & VTI_ISVTI)) {
867                         if (!(p->i_flags & TUNNEL_KEY))
868                                 p->i_key = 0;
869                         if (!(p->o_flags & TUNNEL_KEY))
870                                 p->o_key = 0;
871                 }
872
873                 t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
874
875                 if (cmd == SIOCADDTUNNEL) {
876                         if (!t) {
877                                 t = ip_tunnel_create(net, itn, p);
878                                 err = PTR_ERR_OR_ZERO(t);
879                                 break;
880                         }
881
882                         err = -EEXIST;
883                         break;
884                 }
885                 if (dev != itn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
886                         if (t) {
887                                 if (t->dev != dev) {
888                                         err = -EEXIST;
889                                         break;
890                                 }
891                         } else {
892                                 unsigned int nflags = 0;
893
894                                 if (ipv4_is_multicast(p->iph.daddr))
895                                         nflags = IFF_BROADCAST;
896                                 else if (p->iph.daddr)
897                                         nflags = IFF_POINTOPOINT;
898
899                                 if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
900                                         err = -EINVAL;
901                                         break;
902                                 }
903
904                                 t = netdev_priv(dev);
905                         }
906                 }
907
908                 if (t) {
909                         err = 0;
910                         ip_tunnel_update(itn, t, dev, p, true, 0);
911                 } else {
912                         err = -ENOENT;
913                 }
914                 break;
915
916         case SIOCDELTUNNEL:
917                 err = -EPERM;
918                 if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
919                         goto done;
920
921                 if (dev == itn->fb_tunnel_dev) {
922                         err = -ENOENT;
923                         t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
924                         if (!t)
925                                 goto done;
926                         err = -EPERM;
927                         if (t == netdev_priv(itn->fb_tunnel_dev))
928                                 goto done;
929                         dev = t->dev;
930                 }
931                 unregister_netdevice(dev);
932                 err = 0;
933                 break;
934
935         default:
936                 err = -EINVAL;
937         }
938
939 done:
940         return err;
941 }
942 EXPORT_SYMBOL_GPL(ip_tunnel_ioctl);
943
944 int __ip_tunnel_change_mtu(struct net_device *dev, int new_mtu, bool strict)
945 {
946         struct ip_tunnel *tunnel = netdev_priv(dev);
947         int t_hlen = tunnel->hlen + sizeof(struct iphdr);
948         int max_mtu = 0xFFF8 - dev->hard_header_len - t_hlen;
949
950         if (new_mtu < ETH_MIN_MTU)
951                 return -EINVAL;
952
953         if (new_mtu > max_mtu) {
954                 if (strict)
955                         return -EINVAL;
956
957                 new_mtu = max_mtu;
958         }
959
960         dev->mtu = new_mtu;
961         return 0;
962 }
963 EXPORT_SYMBOL_GPL(__ip_tunnel_change_mtu);
964
965 int ip_tunnel_change_mtu(struct net_device *dev, int new_mtu)
966 {
967         return __ip_tunnel_change_mtu(dev, new_mtu, true);
968 }
969 EXPORT_SYMBOL_GPL(ip_tunnel_change_mtu);
970
971 static void ip_tunnel_dev_free(struct net_device *dev)
972 {
973         struct ip_tunnel *tunnel = netdev_priv(dev);
974
975         gro_cells_destroy(&tunnel->gro_cells);
976         dst_cache_destroy(&tunnel->dst_cache);
977         free_percpu(dev->tstats);
978 }
979
980 void ip_tunnel_dellink(struct net_device *dev, struct list_head *head)
981 {
982         struct ip_tunnel *tunnel = netdev_priv(dev);
983         struct ip_tunnel_net *itn;
984
985         itn = net_generic(tunnel->net, tunnel->ip_tnl_net_id);
986
987         if (itn->fb_tunnel_dev != dev) {
988                 ip_tunnel_del(itn, netdev_priv(dev));
989                 unregister_netdevice_queue(dev, head);
990         }
991 }
992 EXPORT_SYMBOL_GPL(ip_tunnel_dellink);
993
994 struct net *ip_tunnel_get_link_net(const struct net_device *dev)
995 {
996         struct ip_tunnel *tunnel = netdev_priv(dev);
997
998         return tunnel->net;
999 }
1000 EXPORT_SYMBOL(ip_tunnel_get_link_net);
1001
1002 int ip_tunnel_get_iflink(const struct net_device *dev)
1003 {
1004         struct ip_tunnel *tunnel = netdev_priv(dev);
1005
1006         return tunnel->parms.link;
1007 }
1008 EXPORT_SYMBOL(ip_tunnel_get_iflink);
1009
1010 int ip_tunnel_init_net(struct net *net, unsigned int ip_tnl_net_id,
1011                                   struct rtnl_link_ops *ops, char *devname)
1012 {
1013         struct ip_tunnel_net *itn = net_generic(net, ip_tnl_net_id);
1014         struct ip_tunnel_parm parms;
1015         unsigned int i;
1016
1017         for (i = 0; i < IP_TNL_HASH_SIZE; i++)
1018                 INIT_HLIST_HEAD(&itn->tunnels[i]);
1019
1020         if (!ops) {
1021                 itn->fb_tunnel_dev = NULL;
1022                 return 0;
1023         }
1024
1025         memset(&parms, 0, sizeof(parms));
1026         if (devname)
1027                 strlcpy(parms.name, devname, IFNAMSIZ);
1028
1029         rtnl_lock();
1030         itn->fb_tunnel_dev = __ip_tunnel_create(net, ops, &parms);
1031         /* FB netdevice is special: we have one, and only one per netns.
1032          * Allowing to move it to another netns is clearly unsafe.
1033          */
1034         if (!IS_ERR(itn->fb_tunnel_dev)) {
1035                 itn->fb_tunnel_dev->features |= NETIF_F_NETNS_LOCAL;
1036                 itn->fb_tunnel_dev->mtu = ip_tunnel_bind_dev(itn->fb_tunnel_dev);
1037                 ip_tunnel_add(itn, netdev_priv(itn->fb_tunnel_dev));
1038         }
1039         rtnl_unlock();
1040
1041         return PTR_ERR_OR_ZERO(itn->fb_tunnel_dev);
1042 }
1043 EXPORT_SYMBOL_GPL(ip_tunnel_init_net);
1044
1045 static void ip_tunnel_destroy(struct ip_tunnel_net *itn, struct list_head *head,
1046                               struct rtnl_link_ops *ops)
1047 {
1048         struct net *net = dev_net(itn->fb_tunnel_dev);
1049         struct net_device *dev, *aux;
1050         int h;
1051
1052         for_each_netdev_safe(net, dev, aux)
1053                 if (dev->rtnl_link_ops == ops)
1054                         unregister_netdevice_queue(dev, head);
1055
1056         for (h = 0; h < IP_TNL_HASH_SIZE; h++) {
1057                 struct ip_tunnel *t;
1058                 struct hlist_node *n;
1059                 struct hlist_head *thead = &itn->tunnels[h];
1060
1061                 hlist_for_each_entry_safe(t, n, thead, hash_node)
1062                         /* If dev is in the same netns, it has already
1063                          * been added to the list by the previous loop.
1064                          */
1065                         if (!net_eq(dev_net(t->dev), net))
1066                                 unregister_netdevice_queue(t->dev, head);
1067         }
1068 }
1069
1070 void ip_tunnel_delete_nets(struct list_head *net_list, unsigned int id,
1071                            struct rtnl_link_ops *ops)
1072 {
1073         struct ip_tunnel_net *itn;
1074         struct net *net;
1075         LIST_HEAD(list);
1076
1077         rtnl_lock();
1078         list_for_each_entry(net, net_list, exit_list) {
1079                 itn = net_generic(net, id);
1080                 ip_tunnel_destroy(itn, &list, ops);
1081         }
1082         unregister_netdevice_many(&list);
1083         rtnl_unlock();
1084 }
1085 EXPORT_SYMBOL_GPL(ip_tunnel_delete_nets);
1086
1087 int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[],
1088                       struct ip_tunnel_parm *p, __u32 fwmark)
1089 {
1090         struct ip_tunnel *nt;
1091         struct net *net = dev_net(dev);
1092         struct ip_tunnel_net *itn;
1093         int mtu;
1094         int err;
1095
1096         nt = netdev_priv(dev);
1097         itn = net_generic(net, nt->ip_tnl_net_id);
1098
1099         if (nt->collect_md) {
1100                 if (rtnl_dereference(itn->collect_md_tun))
1101                         return -EEXIST;
1102         } else {
1103                 if (ip_tunnel_find(itn, p, dev->type))
1104                         return -EEXIST;
1105         }
1106
1107         nt->net = net;
1108         nt->parms = *p;
1109         nt->fwmark = fwmark;
1110         err = register_netdevice(dev);
1111         if (err)
1112                 goto out;
1113
1114         if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
1115                 eth_hw_addr_random(dev);
1116
1117         mtu = ip_tunnel_bind_dev(dev);
1118         if (!tb[IFLA_MTU])
1119                 dev->mtu = mtu;
1120
1121         ip_tunnel_add(itn, nt);
1122 out:
1123         return err;
1124 }
1125 EXPORT_SYMBOL_GPL(ip_tunnel_newlink);
1126
1127 int ip_tunnel_changelink(struct net_device *dev, struct nlattr *tb[],
1128                          struct ip_tunnel_parm *p, __u32 fwmark)
1129 {
1130         struct ip_tunnel *t;
1131         struct ip_tunnel *tunnel = netdev_priv(dev);
1132         struct net *net = tunnel->net;
1133         struct ip_tunnel_net *itn = net_generic(net, tunnel->ip_tnl_net_id);
1134
1135         if (dev == itn->fb_tunnel_dev)
1136                 return -EINVAL;
1137
1138         t = ip_tunnel_find(itn, p, dev->type);
1139
1140         if (t) {
1141                 if (t->dev != dev)
1142                         return -EEXIST;
1143         } else {
1144                 t = tunnel;
1145
1146                 if (dev->type != ARPHRD_ETHER) {
1147                         unsigned int nflags = 0;
1148
1149                         if (ipv4_is_multicast(p->iph.daddr))
1150                                 nflags = IFF_BROADCAST;
1151                         else if (p->iph.daddr)
1152                                 nflags = IFF_POINTOPOINT;
1153
1154                         if ((dev->flags ^ nflags) &
1155                             (IFF_POINTOPOINT | IFF_BROADCAST))
1156                                 return -EINVAL;
1157                 }
1158         }
1159
1160         ip_tunnel_update(itn, t, dev, p, !tb[IFLA_MTU], fwmark);
1161         return 0;
1162 }
1163 EXPORT_SYMBOL_GPL(ip_tunnel_changelink);
1164
1165 int ip_tunnel_init(struct net_device *dev)
1166 {
1167         struct ip_tunnel *tunnel = netdev_priv(dev);
1168         struct iphdr *iph = &tunnel->parms.iph;
1169         int err;
1170
1171         dev->needs_free_netdev = true;
1172         dev->priv_destructor = ip_tunnel_dev_free;
1173         dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
1174         if (!dev->tstats)
1175                 return -ENOMEM;
1176
1177         err = dst_cache_init(&tunnel->dst_cache, GFP_KERNEL);
1178         if (err) {
1179                 free_percpu(dev->tstats);
1180                 return err;
1181         }
1182
1183         err = gro_cells_init(&tunnel->gro_cells, dev);
1184         if (err) {
1185                 dst_cache_destroy(&tunnel->dst_cache);
1186                 free_percpu(dev->tstats);
1187                 return err;
1188         }
1189
1190         tunnel->dev = dev;
1191         tunnel->net = dev_net(dev);
1192         strcpy(tunnel->parms.name, dev->name);
1193         iph->version            = 4;
1194         iph->ihl                = 5;
1195
1196         if (tunnel->collect_md) {
1197                 dev->features |= NETIF_F_NETNS_LOCAL;
1198                 netif_keep_dst(dev);
1199         }
1200         return 0;
1201 }
1202 EXPORT_SYMBOL_GPL(ip_tunnel_init);
1203
1204 void ip_tunnel_uninit(struct net_device *dev)
1205 {
1206         struct ip_tunnel *tunnel = netdev_priv(dev);
1207         struct net *net = tunnel->net;
1208         struct ip_tunnel_net *itn;
1209
1210         itn = net_generic(net, tunnel->ip_tnl_net_id);
1211         /* fb_tunnel_dev will be unregisted in net-exit call. */
1212         if (itn->fb_tunnel_dev != dev)
1213                 ip_tunnel_del(itn, netdev_priv(dev));
1214
1215         dst_cache_reset(&tunnel->dst_cache);
1216 }
1217 EXPORT_SYMBOL_GPL(ip_tunnel_uninit);
1218
1219 /* Do least required initialization, rest of init is done in tunnel_init call */
1220 void ip_tunnel_setup(struct net_device *dev, unsigned int net_id)
1221 {
1222         struct ip_tunnel *tunnel = netdev_priv(dev);
1223         tunnel->ip_tnl_net_id = net_id;
1224 }
1225 EXPORT_SYMBOL_GPL(ip_tunnel_setup);
1226
1227 MODULE_LICENSE("GPL");