net: common metrics init helper for FIB entries
[sfrench/cifs-2.6.git] / net / ipv4 / ip_tunnel.c
1 /*
2  * Copyright (c) 2013 Nicira, Inc.
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of version 2 of the GNU General Public
6  * License as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope that it will be useful, but
9  * WITHOUT ANY WARRANTY; without even the implied warranty of
10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11  * General Public License for more details.
12  *
13  * You should have received a copy of the GNU General Public License
14  * along with this program; if not, write to the Free Software
15  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
16  * 02110-1301, USA
17  */
18
19 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
20
21 #include <linux/capability.h>
22 #include <linux/module.h>
23 #include <linux/types.h>
24 #include <linux/kernel.h>
25 #include <linux/slab.h>
26 #include <linux/uaccess.h>
27 #include <linux/skbuff.h>
28 #include <linux/netdevice.h>
29 #include <linux/in.h>
30 #include <linux/tcp.h>
31 #include <linux/udp.h>
32 #include <linux/if_arp.h>
33 #include <linux/init.h>
34 #include <linux/in6.h>
35 #include <linux/inetdevice.h>
36 #include <linux/igmp.h>
37 #include <linux/netfilter_ipv4.h>
38 #include <linux/etherdevice.h>
39 #include <linux/if_ether.h>
40 #include <linux/if_vlan.h>
41 #include <linux/rculist.h>
42 #include <linux/err.h>
43
44 #include <net/sock.h>
45 #include <net/ip.h>
46 #include <net/icmp.h>
47 #include <net/protocol.h>
48 #include <net/ip_tunnels.h>
49 #include <net/arp.h>
50 #include <net/checksum.h>
51 #include <net/dsfield.h>
52 #include <net/inet_ecn.h>
53 #include <net/xfrm.h>
54 #include <net/net_namespace.h>
55 #include <net/netns/generic.h>
56 #include <net/rtnetlink.h>
57 #include <net/udp.h>
58 #include <net/dst_metadata.h>
59
60 #if IS_ENABLED(CONFIG_IPV6)
61 #include <net/ipv6.h>
62 #include <net/ip6_fib.h>
63 #include <net/ip6_route.h>
64 #endif
65
66 static unsigned int ip_tunnel_hash(__be32 key, __be32 remote)
67 {
68         return hash_32((__force u32)key ^ (__force u32)remote,
69                          IP_TNL_HASH_BITS);
70 }
71
72 static bool ip_tunnel_key_match(const struct ip_tunnel_parm *p,
73                                 __be16 flags, __be32 key)
74 {
75         if (p->i_flags & TUNNEL_KEY) {
76                 if (flags & TUNNEL_KEY)
77                         return key == p->i_key;
78                 else
79                         /* key expected, none present */
80                         return false;
81         } else
82                 return !(flags & TUNNEL_KEY);
83 }
84
/* Fallback tunnel: no source, no destination, no key, no options

   Tunnel hash table:
   We require an exact key match, i.e. if a key is present in the packet
   it will match only a tunnel with the same key; if it is not present,
   it will match only a keyless tunnel.

   All keyless packets, if not matched against configured keyless tunnels,
   will match the fallback tunnel.
   Given src, dst and key, find the appropriate tunnel for input.
*/
/* Find the tunnel that should receive a packet with the given addresses,
 * key and input link.  Four passes, from most to least specific:
 *   1. exact (remote, local) endpoint match,
 *   2. remote-only match (wildcard saddr),
 *   3. local-only or multicast match in the "no remote" bucket,
 *   4. keyed wildcard tunnels (skipped when TUNNEL_NO_KEY is set).
 * A tunnel that matches everything but the link index is remembered as a
 * candidate and used only if no exact-link match is found.  Falls back to
 * the collect_md device and then the per-netns fallback device.
 * Called under RCU read-side protection.
 */
struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn,
				   int link, __be16 flags,
				   __be32 remote, __be32 local,
				   __be32 key)
{
	unsigned int hash;
	struct ip_tunnel *t, *cand = NULL;
	struct hlist_head *head;

	hash = ip_tunnel_hash(key, remote);
	head = &itn->tunnels[hash];

	/* Pass 1: fully specified tunnels — both endpoints match. */
	hlist_for_each_entry_rcu(t, head, hash_node) {
		if (local != t->parms.iph.saddr ||
		    remote != t->parms.iph.daddr ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (!ip_tunnel_key_match(&t->parms, flags, key))
			continue;

		if (t->parms.link == link)
			return t;
		else
			cand = t;
	}

	/* Pass 2: tunnels bound only to the remote address. */
	hlist_for_each_entry_rcu(t, head, hash_node) {
		if (remote != t->parms.iph.daddr ||
		    t->parms.iph.saddr != 0 ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (!ip_tunnel_key_match(&t->parms, flags, key))
			continue;

		if (t->parms.link == link)
			return t;
		else if (!cand)
			cand = t;
	}

	/* Passes 3 and 4 search the bucket for tunnels with no remote. */
	hash = ip_tunnel_hash(key, 0);
	head = &itn->tunnels[hash];

	/* Pass 3: tunnels bound to our local address, or multicast tunnels
	 * whose configured destination equals the (multicast) local address.
	 */
	hlist_for_each_entry_rcu(t, head, hash_node) {
		if ((local != t->parms.iph.saddr || t->parms.iph.daddr != 0) &&
		    (local != t->parms.iph.daddr || !ipv4_is_multicast(local)))
			continue;

		if (!(t->dev->flags & IFF_UP))
			continue;

		if (!ip_tunnel_key_match(&t->parms, flags, key))
			continue;

		if (t->parms.link == link)
			return t;
		else if (!cand)
			cand = t;
	}

	if (flags & TUNNEL_NO_KEY)
		goto skip_key_lookup;

	/* Pass 4: wildcard tunnels matched purely on the key. */
	hlist_for_each_entry_rcu(t, head, hash_node) {
		if (t->parms.i_key != key ||
		    t->parms.iph.saddr != 0 ||
		    t->parms.iph.daddr != 0 ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (t->parms.link == link)
			return t;
		else if (!cand)
			cand = t;
	}

skip_key_lookup:
	if (cand)
		return cand;

	/* No configured tunnel matched: try the collect_md device. */
	t = rcu_dereference(itn->collect_md_tun);
	if (t && t->dev->flags & IFF_UP)
		return t;

	/* Last resort: the per-netns fallback device, if it is up. */
	if (itn->fb_tunnel_dev && itn->fb_tunnel_dev->flags & IFF_UP)
		return netdev_priv(itn->fb_tunnel_dev);

	return NULL;
}
EXPORT_SYMBOL_GPL(ip_tunnel_lookup);
188
189 static struct hlist_head *ip_bucket(struct ip_tunnel_net *itn,
190                                     struct ip_tunnel_parm *parms)
191 {
192         unsigned int h;
193         __be32 remote;
194         __be32 i_key = parms->i_key;
195
196         if (parms->iph.daddr && !ipv4_is_multicast(parms->iph.daddr))
197                 remote = parms->iph.daddr;
198         else
199                 remote = 0;
200
201         if (!(parms->i_flags & TUNNEL_KEY) && (parms->i_flags & VTI_ISVTI))
202                 i_key = 0;
203
204         h = ip_tunnel_hash(i_key, remote);
205         return &itn->tunnels[h];
206 }
207
/* Insert tunnel @t into the per-netns hash table; a collect_md tunnel is
 * additionally published as the netns-wide metadata-collection device.
 */
static void ip_tunnel_add(struct ip_tunnel_net *itn, struct ip_tunnel *t)
{
	struct hlist_head *head = ip_bucket(itn, &t->parms);

	if (t->collect_md)
		rcu_assign_pointer(itn->collect_md_tun, t);
	hlist_add_head_rcu(&t->hash_node, head);
}
216
/* Remove tunnel @t from the hash table, clearing the netns collect_md
 * slot if @t was a collect_md tunnel.  Concurrent RCU readers may still
 * observe @t until a grace period elapses.
 */
static void ip_tunnel_del(struct ip_tunnel_net *itn, struct ip_tunnel *t)
{
	if (t->collect_md)
		rcu_assign_pointer(itn->collect_md_tun, NULL);
	hlist_del_init_rcu(&t->hash_node);
}
223
/* Find a tunnel whose configuration matches @parms exactly (addresses,
 * link, device type and key semantics).  Unlike ip_tunnel_lookup() this
 * compares configuration rather than an incoming packet, and there is no
 * wildcard/fallback matching.  Returns NULL when no such tunnel exists.
 */
static struct ip_tunnel *ip_tunnel_find(struct ip_tunnel_net *itn,
					struct ip_tunnel_parm *parms,
					int type)
{
	__be32 remote = parms->iph.daddr;
	__be32 local = parms->iph.saddr;
	__be32 key = parms->i_key;
	__be16 flags = parms->i_flags;
	int link = parms->link;
	struct ip_tunnel *t = NULL;
	struct hlist_head *head = ip_bucket(itn, parms);

	hlist_for_each_entry_rcu(t, head, hash_node) {
		if (local == t->parms.iph.saddr &&
		    remote == t->parms.iph.daddr &&
		    link == t->parms.link &&
		    type == t->dev->type &&
		    ip_tunnel_key_match(&t->parms, flags, key))
			break;
	}
	/* t is NULL here if the loop ran to completion without a match */
	return t;
}
246
/* Allocate and register a new tunnel net_device.
 * The name comes from @parms if set (and must be a valid ifname),
 * otherwise it is templated from the link ops kind as "<kind>%d".
 * Returns the registered device or an ERR_PTR() on failure.
 */
static struct net_device *__ip_tunnel_create(struct net *net,
					     const struct rtnl_link_ops *ops,
					     struct ip_tunnel_parm *parms)
{
	int err;
	struct ip_tunnel *tunnel;
	struct net_device *dev;
	char name[IFNAMSIZ];

	err = -E2BIG;
	if (parms->name[0]) {
		if (!dev_valid_name(parms->name))
			goto failed;
		strlcpy(name, parms->name, IFNAMSIZ);
	} else {
		/* leave room in the template for "%d" plus the NUL */
		if (strlen(ops->kind) > (IFNAMSIZ - 3))
			goto failed;
		strcpy(name, ops->kind);
		strcat(name, "%d");
	}

	ASSERT_RTNL();
	dev = alloc_netdev(ops->priv_size, name, NET_NAME_UNKNOWN, ops->setup);
	if (!dev) {
		err = -ENOMEM;
		goto failed;
	}
	dev_net_set(dev, net);

	dev->rtnl_link_ops = ops;

	tunnel = netdev_priv(dev);
	tunnel->parms = *parms;
	tunnel->net = net;

	err = register_netdevice(dev);
	if (err)
		goto failed_free;

	return dev;

failed_free:
	free_netdev(dev);
failed:
	return ERR_PTR(err);
}
293
/* Guess the underlying output device for the tunnel and derive the
 * device MTU and needed headroom from it.  Returns the suggested MTU,
 * clamped to at least IPV4_MIN_MTU.
 */
static int ip_tunnel_bind_dev(struct net_device *dev)
{
	struct net_device *tdev = NULL;
	struct ip_tunnel *tunnel = netdev_priv(dev);
	const struct iphdr *iph;
	int hlen = LL_MAX_HEADER;
	int mtu = ETH_DATA_LEN;
	int t_hlen = tunnel->hlen + sizeof(struct iphdr);

	iph = &tunnel->parms.iph;

	/* Guess output device to choose reasonable mtu and needed_headroom */
	if (iph->daddr) {
		struct flowi4 fl4;
		struct rtable *rt;

		ip_tunnel_init_flow(&fl4, iph->protocol, iph->daddr,
				    iph->saddr, tunnel->parms.o_key,
				    RT_TOS(iph->tos), tunnel->parms.link,
				    tunnel->fwmark);
		rt = ip_route_output_key(tunnel->net, &fl4);

		if (!IS_ERR(rt)) {
			tdev = rt->dst.dev;
			ip_rt_put(rt);
		}
		if (dev->type != ARPHRD_ETHER)
			dev->flags |= IFF_POINTOPOINT;

		/* binding parameters may have changed: drop cached routes */
		dst_cache_reset(&tunnel->dst_cache);
	}

	/* No routed device found: fall back to the configured link. */
	if (!tdev && tunnel->parms.link)
		tdev = __dev_get_by_index(tunnel->net, tunnel->parms.link);

	if (tdev) {
		hlen = tdev->hard_header_len + tdev->needed_headroom;
		mtu = min(tdev->mtu, IP_MAX_MTU);
	}

	/* Reserve space for the tunnel header on top of the lower device. */
	dev->needed_headroom = t_hlen + hlen;
	mtu -= (dev->hard_header_len + t_hlen);

	if (mtu < IPV4_MIN_MTU)
		mtu = IPV4_MIN_MTU;

	return mtu;
}
342
/* Create, register and hash a new tunnel device built from @parms.
 * On success the tunnel is inserted into @itn and returned; on failure
 * the half-created device is unregistered and an ERR_PTR() is returned.
 */
static struct ip_tunnel *ip_tunnel_create(struct net *net,
					  struct ip_tunnel_net *itn,
					  struct ip_tunnel_parm *parms)
{
	struct ip_tunnel *nt;
	struct net_device *dev;
	int t_hlen;
	int mtu;
	int err;

	dev = __ip_tunnel_create(net, itn->rtnl_link_ops, parms);
	if (IS_ERR(dev))
		return ERR_CAST(dev);

	mtu = ip_tunnel_bind_dev(dev);
	err = dev_set_mtu(dev, mtu);
	if (err)
		goto err_dev_set_mtu;

	nt = netdev_priv(dev);
	t_hlen = nt->hlen + sizeof(struct iphdr);
	dev->min_mtu = ETH_MIN_MTU;
	/* max_mtu leaves room for the link header and tunnel overhead */
	dev->max_mtu = IP_MAX_MTU - dev->hard_header_len - t_hlen;
	ip_tunnel_add(itn, nt);
	return nt;

err_dev_set_mtu:
	unregister_netdevice(dev);
	return ERR_PTR(err);
}
373
/* Common receive path for decapsulated tunnel packets.
 * Validates checksum/sequence expectations against the tunnel config,
 * decapsulates ECN, updates per-cpu stats and hands the skb to the
 * tunnel's GRO cells.  Consumes the skb (and @tun_dst on the drop
 * path); always returns 0.
 */
int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb,
		  const struct tnl_ptk_info *tpi, struct metadata_dst *tun_dst,
		  bool log_ecn_error)
{
	struct pcpu_sw_netstats *tstats;
	const struct iphdr *iph = ip_hdr(skb);
	int err;

#ifdef CONFIG_NET_IPGRE_BROADCAST
	if (ipv4_is_multicast(iph->daddr)) {
		tunnel->dev->stats.multicast++;
		skb->pkt_type = PACKET_BROADCAST;
	}
#endif

	/* Checksum presence must agree with the tunnel's TUNNEL_CSUM flag. */
	if ((!(tpi->flags&TUNNEL_CSUM) &&  (tunnel->parms.i_flags&TUNNEL_CSUM)) ||
	     ((tpi->flags&TUNNEL_CSUM) && !(tunnel->parms.i_flags&TUNNEL_CSUM))) {
		tunnel->dev->stats.rx_crc_errors++;
		tunnel->dev->stats.rx_errors++;
		goto drop;
	}

	/* Enforce in-order delivery when the tunnel uses sequence numbers. */
	if (tunnel->parms.i_flags&TUNNEL_SEQ) {
		if (!(tpi->flags&TUNNEL_SEQ) ||
		    (tunnel->i_seqno && (s32)(ntohl(tpi->seq) - tunnel->i_seqno) < 0)) {
			tunnel->dev->stats.rx_fifo_errors++;
			tunnel->dev->stats.rx_errors++;
			goto drop;
		}
		tunnel->i_seqno = ntohl(tpi->seq) + 1;
	}

	skb_reset_network_header(skb);

	err = IP_ECN_decapsulate(iph, skb);
	if (unlikely(err)) {
		if (log_ecn_error)
			net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n",
					&iph->saddr, iph->tos);
		if (err > 1) {
			++tunnel->dev->stats.rx_frame_errors;
			++tunnel->dev->stats.rx_errors;
			goto drop;
		}
	}

	tstats = this_cpu_ptr(tunnel->dev->tstats);
	u64_stats_update_begin(&tstats->syncp);
	tstats->rx_packets++;
	tstats->rx_bytes += skb->len;
	u64_stats_update_end(&tstats->syncp);

	/* Scrub state when crossing a netns boundary. */
	skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(tunnel->dev)));

	if (tunnel->dev->type == ARPHRD_ETHER) {
		skb->protocol = eth_type_trans(skb, tunnel->dev);
		skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
	} else {
		skb->dev = tunnel->dev;
	}

	if (tun_dst)
		skb_dst_set(skb, (struct dst_entry *)tun_dst);

	gro_cells_receive(&tunnel->gro_cells, skb);
	return 0;

drop:
	if (tun_dst)
		dst_release((struct dst_entry *)tun_dst);
	kfree_skb(skb);
	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_rcv);
448
449 int ip_tunnel_encap_add_ops(const struct ip_tunnel_encap_ops *ops,
450                             unsigned int num)
451 {
452         if (num >= MAX_IPTUN_ENCAP_OPS)
453                 return -ERANGE;
454
455         return !cmpxchg((const struct ip_tunnel_encap_ops **)
456                         &iptun_encaps[num],
457                         NULL, ops) ? 0 : -1;
458 }
459 EXPORT_SYMBOL(ip_tunnel_encap_add_ops);
460
461 int ip_tunnel_encap_del_ops(const struct ip_tunnel_encap_ops *ops,
462                             unsigned int num)
463 {
464         int ret;
465
466         if (num >= MAX_IPTUN_ENCAP_OPS)
467                 return -ERANGE;
468
469         ret = (cmpxchg((const struct ip_tunnel_encap_ops **)
470                        &iptun_encaps[num],
471                        ops, NULL) == ops) ? 0 : -1;
472
473         synchronize_net();
474
475         return ret;
476 }
477 EXPORT_SYMBOL(ip_tunnel_encap_del_ops);
478
/* Install the encapsulation parameters @ipencap on tunnel @t and
 * recompute the cached encap and total header lengths.  Returns 0 on
 * success or the negative value from ip_encap_hlen() when the encap
 * type's header length cannot be determined.
 */
int ip_tunnel_encap_setup(struct ip_tunnel *t,
			  struct ip_tunnel_encap *ipencap)
{
	int hlen;

	memset(&t->encap, 0, sizeof(t->encap));

	hlen = ip_encap_hlen(ipencap);
	if (hlen < 0)
		return hlen;

	t->encap.type = ipencap->type;
	t->encap.sport = ipencap->sport;
	t->encap.dport = ipencap->dport;
	t->encap.flags = ipencap->flags;

	t->encap_hlen = hlen;
	/* total extra header = encap header + tunnel header */
	t->hlen = t->encap_hlen + t->tun_hlen;

	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_encap_setup);
501
/* Check the packet against the tunnel path MTU and propagate it.
 * Updates the skb dst's PMTU and, for oversized non-GSO packets, sends
 * the appropriate ICMP "fragmentation needed" / ICMPv6 "packet too big"
 * error back to the sender.  Returns 0 if the packet may be sent,
 * -E2BIG when it must not be.
 */
static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb,
			    struct rtable *rt, __be16 df,
			    const struct iphdr *inner_iph)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	int pkt_size = skb->len - tunnel->hlen - dev->hard_header_len;
	int mtu;

	if (df)
		mtu = dst_mtu(&rt->dst) - dev->hard_header_len
					- sizeof(struct iphdr) - tunnel->hlen;
	else
		mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;

	skb_dst_update_pmtu(skb, mtu);

	if (skb->protocol == htons(ETH_P_IP)) {
		if (!skb_is_gso(skb) &&
		    (inner_iph->frag_off & htons(IP_DF)) &&
		    mtu < pkt_size) {
			memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
			icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
			return -E2BIG;
		}
	}
#if IS_ENABLED(CONFIG_IPV6)
	else if (skb->protocol == htons(ETH_P_IPV6)) {
		struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb);

		/* Record the reduced MTU on host routes (or when the tunnel
		 * has a fixed unicast destination) so later lookups see it.
		 */
		if (rt6 && mtu < dst_mtu(skb_dst(skb)) &&
			   mtu >= IPV6_MIN_MTU) {
			if ((tunnel->parms.iph.daddr &&
			    !ipv4_is_multicast(tunnel->parms.iph.daddr)) ||
			    rt6->rt6i_dst.plen == 128) {
				rt6->rt6i_flags |= RTF_MODIFIED;
				dst_metric_set(skb_dst(skb), RTAX_MTU, mtu);
			}
		}

		if (!skb_is_gso(skb) && mtu >= IPV6_MIN_MTU &&
					mtu < pkt_size) {
			icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
			return -E2BIG;
		}
	}
#endif
	return 0;
}
550
/* Transmit path for metadata-based (collect_md / external) tunnels:
 * all encapsulation parameters are taken from the skb's tunnel metadata
 * rather than from the device configuration.  Consumes the skb on every
 * path, updating the device error counters on failure.
 */
void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, u8 proto)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	u32 headroom = sizeof(struct iphdr);
	struct ip_tunnel_info *tun_info;
	const struct ip_tunnel_key *key;
	const struct iphdr *inner_iph;
	struct rtable *rt;
	struct flowi4 fl4;
	__be16 df = 0;
	u8 tos, ttl;

	tun_info = skb_tunnel_info(skb);
	if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) ||
		     ip_tunnel_info_af(tun_info) != AF_INET))
		goto tx_error;
	key = &tun_info->key;
	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
	inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
	tos = key->tos;
	if (tos == 1) {
		/* tos == 1 means "inherit": copy from the inner packet */
		if (skb->protocol == htons(ETH_P_IP))
			tos = inner_iph->tos;
		else if (skb->protocol == htons(ETH_P_IPV6))
			tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
	}
	ip_tunnel_init_flow(&fl4, proto, key->u.ipv4.dst, key->u.ipv4.src, 0,
			    RT_TOS(tos), tunnel->parms.link, tunnel->fwmark);
	/* extra encapsulation layers are not supported on this path */
	if (tunnel->encap.type != TUNNEL_ENCAP_NONE)
		goto tx_error;
	rt = ip_route_output_key(tunnel->net, &fl4);
	if (IS_ERR(rt)) {
		dev->stats.tx_carrier_errors++;
		goto tx_error;
	}
	if (rt->dst.dev == dev) {
		/* route loops back onto the tunnel device itself */
		ip_rt_put(rt);
		dev->stats.collisions++;
		goto tx_error;
	}
	tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
	ttl = key->ttl;
	if (ttl == 0) {
		/* TTL unspecified: inherit from the inner packet */
		if (skb->protocol == htons(ETH_P_IP))
			ttl = inner_iph->ttl;
		else if (skb->protocol == htons(ETH_P_IPV6))
			ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
		else
			ttl = ip4_dst_hoplimit(&rt->dst);
	}
	if (key->tun_flags & TUNNEL_DONT_FRAGMENT)
		df = htons(IP_DF);
	else if (skb->protocol == htons(ETH_P_IP))
		df = inner_iph->frag_off & htons(IP_DF);
	headroom += LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len;
	if (headroom > dev->needed_headroom)
		dev->needed_headroom = headroom;

	if (skb_cow_head(skb, dev->needed_headroom)) {
		ip_rt_put(rt);
		goto tx_dropped;
	}
	iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, proto, tos, ttl,
		      df, !net_eq(tunnel->net, dev_net(dev)));
	return;
tx_error:
	dev->stats.tx_errors++;
	goto kfree;
tx_dropped:
	dev->stats.tx_dropped++;
kfree:
	kfree_skb(skb);
}
EXPORT_SYMBOL_GPL(ip_md_tunnel_xmit);
625
/* Main transmit path for configured (non-collect_md) IP tunnels.
 * Resolves the outer destination (including NBMA tunnels, where it is
 * derived from the inner packet's routing), routes the encapsulated
 * packet, performs PMTU handling and hands off to iptunnel_xmit().
 * Consumes the skb on every path.
 */
void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
		    const struct iphdr *tnl_params, u8 protocol)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	unsigned int inner_nhdr_len = 0;
	const struct iphdr *inner_iph;
	struct flowi4 fl4;
	u8     tos, ttl;
	__be16 df;
	struct rtable *rt;		/* Route to the other host */
	unsigned int max_headroom;	/* The extra header space needed */
	__be32 dst;
	bool connected;

	/* ensure we can access the inner net header, for several users below */
	if (skb->protocol == htons(ETH_P_IP))
		inner_nhdr_len = sizeof(struct iphdr);
	else if (skb->protocol == htons(ETH_P_IPV6))
		inner_nhdr_len = sizeof(struct ipv6hdr);
	if (unlikely(!pskb_may_pull(skb, inner_nhdr_len)))
		goto tx_error;

	inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
	/* "connected" tunnels have a fixed destination, enabling dst caching */
	connected = (tunnel->parms.iph.daddr != 0);

	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));

	dst = tnl_params->daddr;
	if (dst == 0) {
		/* NBMA tunnel: derive the outer destination from the
		 * inner packet's next hop / neighbour entry.
		 */

		if (!skb_dst(skb)) {
			dev->stats.tx_fifo_errors++;
			goto tx_error;
		}

		if (skb->protocol == htons(ETH_P_IP)) {
			rt = skb_rtable(skb);
			dst = rt_nexthop(rt, inner_iph->daddr);
		}
#if IS_ENABLED(CONFIG_IPV6)
		else if (skb->protocol == htons(ETH_P_IPV6)) {
			const struct in6_addr *addr6;
			struct neighbour *neigh;
			bool do_tx_error_icmp;
			int addr_type;

			neigh = dst_neigh_lookup(skb_dst(skb),
						 &ipv6_hdr(skb)->daddr);
			if (!neigh)
				goto tx_error;

			addr6 = (const struct in6_addr *)&neigh->primary_key;
			addr_type = ipv6_addr_type(addr6);

			if (addr_type == IPV6_ADDR_ANY) {
				addr6 = &ipv6_hdr(skb)->daddr;
				addr_type = ipv6_addr_type(addr6);
			}

			/* only v4-compatible v6 addresses carry a usable
			 * IPv4 destination in their low 32 bits
			 */
			if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
				do_tx_error_icmp = true;
			else {
				do_tx_error_icmp = false;
				dst = addr6->s6_addr32[3];
			}
			neigh_release(neigh);
			if (do_tx_error_icmp)
				goto tx_error_icmp;
		}
#endif
		else
			goto tx_error;

		connected = false;
	}

	tos = tnl_params->tos;
	if (tos & 0x1) {
		/* low bit set means "inherit TOS from the inner packet" */
		tos &= ~0x1;
		if (skb->protocol == htons(ETH_P_IP)) {
			tos = inner_iph->tos;
			connected = false;
		} else if (skb->protocol == htons(ETH_P_IPV6)) {
			tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
			connected = false;
		}
	}

	ip_tunnel_init_flow(&fl4, protocol, dst, tnl_params->saddr,
			    tunnel->parms.o_key, RT_TOS(tos), tunnel->parms.link,
			    tunnel->fwmark);

	if (ip_tunnel_encap(skb, tunnel, &protocol, &fl4) < 0)
		goto tx_error;

	/* Only connected tunnels can reuse a cached route. */
	rt = connected ? dst_cache_get_ip4(&tunnel->dst_cache, &fl4.saddr) :
			 NULL;

	if (!rt) {
		rt = ip_route_output_key(tunnel->net, &fl4);

		if (IS_ERR(rt)) {
			dev->stats.tx_carrier_errors++;
			goto tx_error;
		}
		if (connected)
			dst_cache_set_ip4(&tunnel->dst_cache, &rt->dst,
					  fl4.saddr);
	}

	if (rt->dst.dev == dev) {
		/* route loops back onto the tunnel device itself */
		ip_rt_put(rt);
		dev->stats.collisions++;
		goto tx_error;
	}

	if (tnl_update_pmtu(dev, skb, rt, tnl_params->frag_off, inner_iph)) {
		ip_rt_put(rt);
		goto tx_error;
	}

	/* While recent errors persist, report link failure upstream. */
	if (tunnel->err_count > 0) {
		if (time_before(jiffies,
				tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
			tunnel->err_count--;

			dst_link_failure(skb);
		} else
			tunnel->err_count = 0;
	}

	tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
	ttl = tnl_params->ttl;
	if (ttl == 0) {
		/* TTL unspecified: inherit from the inner packet */
		if (skb->protocol == htons(ETH_P_IP))
			ttl = inner_iph->ttl;
#if IS_ENABLED(CONFIG_IPV6)
		else if (skb->protocol == htons(ETH_P_IPV6))
			ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
#endif
		else
			ttl = ip4_dst_hoplimit(&rt->dst);
	}

	df = tnl_params->frag_off;
	if (skb->protocol == htons(ETH_P_IP) && !tunnel->ignore_df)
		df |= (inner_iph->frag_off&htons(IP_DF));

	max_headroom = LL_RESERVED_SPACE(rt->dst.dev) + sizeof(struct iphdr)
			+ rt->dst.header_len + ip_encap_hlen(&tunnel->encap);
	if (max_headroom > dev->needed_headroom)
		dev->needed_headroom = max_headroom;

	if (skb_cow_head(skb, dev->needed_headroom)) {
		ip_rt_put(rt);
		dev->stats.tx_dropped++;
		kfree_skb(skb);
		return;
	}

	iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, protocol, tos, ttl,
		      df, !net_eq(tunnel->net, dev_net(dev)));
	return;

#if IS_ENABLED(CONFIG_IPV6)
tx_error_icmp:
	dst_link_failure(skb);
#endif
tx_error:
	dev->stats.tx_errors++;
	kfree_skb(skb);
}
EXPORT_SYMBOL_GPL(ip_tunnel_xmit);
800
/* Re-configure an existing tunnel from @p: the tunnel is unhashed, its
 * endpoint parameters rewritten, and it is re-inserted into the bucket
 * matching the new parameters.  A link or fwmark change re-binds the
 * underlying device and (when @set_mtu) refreshes the MTU.  Cached
 * routes are always dropped and a netdev state change is announced.
 */
static void ip_tunnel_update(struct ip_tunnel_net *itn,
			     struct ip_tunnel *t,
			     struct net_device *dev,
			     struct ip_tunnel_parm *p,
			     bool set_mtu,
			     __u32 fwmark)
{
	/* remove before mutating the fields the hash depends on */
	ip_tunnel_del(itn, t);
	t->parms.iph.saddr = p->iph.saddr;
	t->parms.iph.daddr = p->iph.daddr;
	t->parms.i_key = p->i_key;
	t->parms.o_key = p->o_key;
	if (dev->type != ARPHRD_ETHER) {
		memcpy(dev->dev_addr, &p->iph.saddr, 4);
		memcpy(dev->broadcast, &p->iph.daddr, 4);
	}
	ip_tunnel_add(itn, t);

	t->parms.iph.ttl = p->iph.ttl;
	t->parms.iph.tos = p->iph.tos;
	t->parms.iph.frag_off = p->iph.frag_off;

	if (t->parms.link != p->link || t->fwmark != fwmark) {
		int mtu;

		t->parms.link = p->link;
		t->fwmark = fwmark;
		mtu = ip_tunnel_bind_dev(dev);
		if (set_mtu)
			dev->mtu = mtu;
	}
	dst_cache_reset(&t->dst_cache);
	netdev_state_change(dev);
}
835
/* ip_tunnel_ioctl - handle the legacy SIOC*TUNNEL ioctls for IPv4 tunnels.
 * @dev: device the ioctl was issued on (possibly the per-netns fallback)
 * @p:   tunnel parameters copied in from user space (copied back for GET)
 * @cmd: SIOCGETTUNNEL, SIOCADDTUNNEL, SIOCCHGTUNNEL or SIOCDELTUNNEL
 *
 * Return: 0 on success, negative errno otherwise.
 */
int ip_tunnel_ioctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd)
{
	int err = 0;
	struct ip_tunnel *t = netdev_priv(dev);
	struct net *net = t->net;
	struct ip_tunnel_net *itn = net_generic(net, t->ip_tnl_net_id);

	switch (cmd) {
	case SIOCGETTUNNEL:
		if (dev == itn->fb_tunnel_dev) {
			/* On the fallback device the lookup key is the
			 * user-supplied parameters; if nothing matches,
			 * report the fallback tunnel itself.
			 */
			t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
			if (!t)
				t = netdev_priv(dev);
		}
		memcpy(p, &t->parms, sizeof(*p));
		break;

	case SIOCADDTUNNEL:
	case SIOCCHGTUNNEL:
		err = -EPERM;
		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
			goto done;
		/* An explicit TTL forces the DF bit on the outer header. */
		if (p->iph.ttl)
			p->iph.frag_off |= htons(IP_DF);
		if (!(p->i_flags & VTI_ISVTI)) {
			/* For non-VTI tunnels the keys are only meaningful
			 * when TUNNEL_KEY is set; normalize them to zero.
			 */
			if (!(p->i_flags & TUNNEL_KEY))
				p->i_key = 0;
			if (!(p->o_flags & TUNNEL_KEY))
				p->o_key = 0;
		}

		t = ip_tunnel_find(itn, p, itn->type);

		if (cmd == SIOCADDTUNNEL) {
			if (!t) {
				t = ip_tunnel_create(net, itn, p);
				err = PTR_ERR_OR_ZERO(t);
				break;
			}

			err = -EEXIST;
			break;
		}
		if (dev != itn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
			if (t) {
				/* New parameters may not collide with a
				 * different existing tunnel device.
				 */
				if (t->dev != dev) {
					err = -EEXIST;
					break;
				}
			} else {
				unsigned int nflags = 0;

				if (ipv4_is_multicast(p->iph.daddr))
					nflags = IFF_BROADCAST;
				else if (p->iph.daddr)
					nflags = IFF_POINTOPOINT;

				/* The new destination must not change the
				 * device's broadcast/point-to-point nature.
				 */
				if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
					err = -EINVAL;
					break;
				}

				t = netdev_priv(dev);
			}
		}

		if (t) {
			err = 0;
			ip_tunnel_update(itn, t, dev, p, true, 0);
		} else {
			err = -ENOENT;
		}
		break;

	case SIOCDELTUNNEL:
		err = -EPERM;
		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
			goto done;

		if (dev == itn->fb_tunnel_dev) {
			/* Deleting via the fallback device: resolve the
			 * target by parameters, but never allow the fallback
			 * tunnel itself to be removed.
			 */
			err = -ENOENT;
			t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
			if (!t)
				goto done;
			err = -EPERM;
			if (t == netdev_priv(itn->fb_tunnel_dev))
				goto done;
			dev = t->dev;
		}
		unregister_netdevice(dev);
		err = 0;
		break;

	default:
		err = -EINVAL;
	}

done:
	return err;
}
EXPORT_SYMBOL_GPL(ip_tunnel_ioctl);
937
938 int __ip_tunnel_change_mtu(struct net_device *dev, int new_mtu, bool strict)
939 {
940         struct ip_tunnel *tunnel = netdev_priv(dev);
941         int t_hlen = tunnel->hlen + sizeof(struct iphdr);
942         int max_mtu = IP_MAX_MTU - dev->hard_header_len - t_hlen;
943
944         if (new_mtu < ETH_MIN_MTU)
945                 return -EINVAL;
946
947         if (new_mtu > max_mtu) {
948                 if (strict)
949                         return -EINVAL;
950
951                 new_mtu = max_mtu;
952         }
953
954         dev->mtu = new_mtu;
955         return 0;
956 }
957 EXPORT_SYMBOL_GPL(__ip_tunnel_change_mtu);
958
959 int ip_tunnel_change_mtu(struct net_device *dev, int new_mtu)
960 {
961         return __ip_tunnel_change_mtu(dev, new_mtu, true);
962 }
963 EXPORT_SYMBOL_GPL(ip_tunnel_change_mtu);
964
965 static void ip_tunnel_dev_free(struct net_device *dev)
966 {
967         struct ip_tunnel *tunnel = netdev_priv(dev);
968
969         gro_cells_destroy(&tunnel->gro_cells);
970         dst_cache_destroy(&tunnel->dst_cache);
971         free_percpu(dev->tstats);
972 }
973
974 void ip_tunnel_dellink(struct net_device *dev, struct list_head *head)
975 {
976         struct ip_tunnel *tunnel = netdev_priv(dev);
977         struct ip_tunnel_net *itn;
978
979         itn = net_generic(tunnel->net, tunnel->ip_tnl_net_id);
980
981         if (itn->fb_tunnel_dev != dev) {
982                 ip_tunnel_del(itn, netdev_priv(dev));
983                 unregister_netdevice_queue(dev, head);
984         }
985 }
986 EXPORT_SYMBOL_GPL(ip_tunnel_dellink);
987
988 struct net *ip_tunnel_get_link_net(const struct net_device *dev)
989 {
990         struct ip_tunnel *tunnel = netdev_priv(dev);
991
992         return tunnel->net;
993 }
994 EXPORT_SYMBOL(ip_tunnel_get_link_net);
995
996 int ip_tunnel_get_iflink(const struct net_device *dev)
997 {
998         struct ip_tunnel *tunnel = netdev_priv(dev);
999
1000         return tunnel->parms.link;
1001 }
1002 EXPORT_SYMBOL(ip_tunnel_get_iflink);
1003
/* ip_tunnel_init_net - per-netns setup for one tunnel type.
 * @net:           the network namespace being initialized
 * @ip_tnl_net_id: generic-netns id under which the ip_tunnel_net lives
 * @ops:           rtnl_link_ops of the tunnel type (may be NULL)
 * @devname:       name for the fallback device, or NULL
 *
 * Initializes the tunnel hash table and, unless fallback tunnels are
 * disabled for @net (or @ops is NULL), creates the per-netns fallback
 * device under RTNL.
 *
 * Return: 0 on success, negative errno otherwise.
 */
int ip_tunnel_init_net(struct net *net, unsigned int ip_tnl_net_id,
				  struct rtnl_link_ops *ops, char *devname)
{
	struct ip_tunnel_net *itn = net_generic(net, ip_tnl_net_id);
	struct ip_tunnel_parm parms;
	unsigned int i;

	itn->rtnl_link_ops = ops;
	for (i = 0; i < IP_TNL_HASH_SIZE; i++)
		INIT_HLIST_HEAD(&itn->tunnels[i]);

	if (!ops || !net_has_fallback_tunnels(net)) {
		struct ip_tunnel_net *it_init_net;

		/* No fallback device here: borrow the device type from
		 * init_net so lookups still key on the right ARPHRD type.
		 */
		it_init_net = net_generic(&init_net, ip_tnl_net_id);
		itn->type = it_init_net->type;
		itn->fb_tunnel_dev = NULL;
		return 0;
	}

	memset(&parms, 0, sizeof(parms));
	if (devname)
		strlcpy(parms.name, devname, IFNAMSIZ);

	rtnl_lock();
	itn->fb_tunnel_dev = __ip_tunnel_create(net, ops, &parms);
	/* FB netdevice is special: we have one, and only one per netns.
	 * Allowing to move it to another netns is clearly unsafe.
	 */
	if (!IS_ERR(itn->fb_tunnel_dev)) {
		itn->fb_tunnel_dev->features |= NETIF_F_NETNS_LOCAL;
		itn->fb_tunnel_dev->mtu = ip_tunnel_bind_dev(itn->fb_tunnel_dev);
		ip_tunnel_add(itn, netdev_priv(itn->fb_tunnel_dev));
		itn->type = itn->fb_tunnel_dev->type;
	}
	rtnl_unlock();

	return PTR_ERR_OR_ZERO(itn->fb_tunnel_dev);
}
EXPORT_SYMBOL_GPL(ip_tunnel_init_net);
1044
1045 static void ip_tunnel_destroy(struct net *net, struct ip_tunnel_net *itn,
1046                               struct list_head *head,
1047                               struct rtnl_link_ops *ops)
1048 {
1049         struct net_device *dev, *aux;
1050         int h;
1051
1052         for_each_netdev_safe(net, dev, aux)
1053                 if (dev->rtnl_link_ops == ops)
1054                         unregister_netdevice_queue(dev, head);
1055
1056         for (h = 0; h < IP_TNL_HASH_SIZE; h++) {
1057                 struct ip_tunnel *t;
1058                 struct hlist_node *n;
1059                 struct hlist_head *thead = &itn->tunnels[h];
1060
1061                 hlist_for_each_entry_safe(t, n, thead, hash_node)
1062                         /* If dev is in the same netns, it has already
1063                          * been added to the list by the previous loop.
1064                          */
1065                         if (!net_eq(dev_net(t->dev), net))
1066                                 unregister_netdevice_queue(t->dev, head);
1067         }
1068 }
1069
1070 void ip_tunnel_delete_nets(struct list_head *net_list, unsigned int id,
1071                            struct rtnl_link_ops *ops)
1072 {
1073         struct ip_tunnel_net *itn;
1074         struct net *net;
1075         LIST_HEAD(list);
1076
1077         rtnl_lock();
1078         list_for_each_entry(net, net_list, exit_list) {
1079                 itn = net_generic(net, id);
1080                 ip_tunnel_destroy(net, itn, &list, ops);
1081         }
1082         unregister_netdevice_many(&list);
1083         rtnl_unlock();
1084 }
1085 EXPORT_SYMBOL_GPL(ip_tunnel_delete_nets);
1086
/* ip_tunnel_newlink - common RTM_NEWLINK handler for IPv4 tunnels.
 * @dev:    the freshly allocated tunnel device (not yet registered)
 * @tb:     netlink attributes (IFLA_*)
 * @p:      parsed tunnel parameters
 * @fwmark: firewall mark for tunnel route lookups
 *
 * Rejects duplicates, registers the device, chooses an MTU and hashes
 * the new tunnel into the per-netns table.
 *
 * Return: 0 on success, negative errno otherwise.
 */
int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[],
		      struct ip_tunnel_parm *p, __u32 fwmark)
{
	struct ip_tunnel *nt;
	struct net *net = dev_net(dev);
	struct ip_tunnel_net *itn;
	int mtu;
	int err;

	nt = netdev_priv(dev);
	itn = net_generic(net, nt->ip_tnl_net_id);

	if (nt->collect_md) {
		/* Only one metadata-collecting tunnel per netns and type. */
		if (rtnl_dereference(itn->collect_md_tun))
			return -EEXIST;
	} else {
		if (ip_tunnel_find(itn, p, dev->type))
			return -EEXIST;
	}

	nt->net = net;
	nt->parms = *p;
	nt->fwmark = fwmark;
	err = register_netdevice(dev);
	if (err)
		goto err_register_netdevice;

	if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
		eth_hw_addr_random(dev);

	mtu = ip_tunnel_bind_dev(dev);
	if (tb[IFLA_MTU]) {
		/* Honour the user-requested MTU but clamp it so the
		 * encapsulated packet still fits within IP_MAX_MTU.
		 * NOTE(review): presumably dev->mtu already carries the
		 * IFLA_MTU value set earlier in the newlink path — confirm.
		 */
		unsigned int max = IP_MAX_MTU - dev->hard_header_len - nt->hlen;

		mtu = clamp(dev->mtu, (unsigned int)ETH_MIN_MTU,
			    (unsigned int)(max - sizeof(struct iphdr)));
	}

	err = dev_set_mtu(dev, mtu);
	if (err)
		goto err_dev_set_mtu;

	ip_tunnel_add(itn, nt);
	return 0;

err_dev_set_mtu:
	unregister_netdevice(dev);
err_register_netdevice:
	return err;
}
EXPORT_SYMBOL_GPL(ip_tunnel_newlink);
1138
/* ip_tunnel_changelink - common RTM_SETLINK handler for IPv4 tunnels.
 * @dev:    the tunnel device being reconfigured
 * @tb:     netlink attributes (IFLA_*)
 * @p:      new tunnel parameters
 * @fwmark: new firewall mark
 *
 * Return: 0 on success, negative errno otherwise.
 */
int ip_tunnel_changelink(struct net_device *dev, struct nlattr *tb[],
			 struct ip_tunnel_parm *p, __u32 fwmark)
{
	struct ip_tunnel *t;
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct net *net = tunnel->net;
	struct ip_tunnel_net *itn = net_generic(net, tunnel->ip_tnl_net_id);

	/* The per-netns fallback device cannot be reconfigured. */
	if (dev == itn->fb_tunnel_dev)
		return -EINVAL;

	t = ip_tunnel_find(itn, p, dev->type);

	if (t) {
		/* New parameters may not collide with another tunnel. */
		if (t->dev != dev)
			return -EEXIST;
	} else {
		t = tunnel;

		if (dev->type != ARPHRD_ETHER) {
			unsigned int nflags = 0;

			if (ipv4_is_multicast(p->iph.daddr))
				nflags = IFF_BROADCAST;
			else if (p->iph.daddr)
				nflags = IFF_POINTOPOINT;

			/* The new destination must not change the device's
			 * broadcast/point-to-point personality.
			 */
			if ((dev->flags ^ nflags) &
			    (IFF_POINTOPOINT | IFF_BROADCAST))
				return -EINVAL;
		}
	}

	/* Only rebind the MTU when user space did not pin one. */
	ip_tunnel_update(itn, t, dev, p, !tb[IFLA_MTU], fwmark);
	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_changelink);
1176
1177 int ip_tunnel_init(struct net_device *dev)
1178 {
1179         struct ip_tunnel *tunnel = netdev_priv(dev);
1180         struct iphdr *iph = &tunnel->parms.iph;
1181         int err;
1182
1183         dev->needs_free_netdev = true;
1184         dev->priv_destructor = ip_tunnel_dev_free;
1185         dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
1186         if (!dev->tstats)
1187                 return -ENOMEM;
1188
1189         err = dst_cache_init(&tunnel->dst_cache, GFP_KERNEL);
1190         if (err) {
1191                 free_percpu(dev->tstats);
1192                 return err;
1193         }
1194
1195         err = gro_cells_init(&tunnel->gro_cells, dev);
1196         if (err) {
1197                 dst_cache_destroy(&tunnel->dst_cache);
1198                 free_percpu(dev->tstats);
1199                 return err;
1200         }
1201
1202         tunnel->dev = dev;
1203         tunnel->net = dev_net(dev);
1204         strcpy(tunnel->parms.name, dev->name);
1205         iph->version            = 4;
1206         iph->ihl                = 5;
1207
1208         if (tunnel->collect_md) {
1209                 dev->features |= NETIF_F_NETNS_LOCAL;
1210                 netif_keep_dst(dev);
1211         }
1212         return 0;
1213 }
1214 EXPORT_SYMBOL_GPL(ip_tunnel_init);
1215
1216 void ip_tunnel_uninit(struct net_device *dev)
1217 {
1218         struct ip_tunnel *tunnel = netdev_priv(dev);
1219         struct net *net = tunnel->net;
1220         struct ip_tunnel_net *itn;
1221
1222         itn = net_generic(net, tunnel->ip_tnl_net_id);
1223         /* fb_tunnel_dev will be unregisted in net-exit call. */
1224         if (itn->fb_tunnel_dev != dev)
1225                 ip_tunnel_del(itn, netdev_priv(dev));
1226
1227         dst_cache_reset(&tunnel->dst_cache);
1228 }
1229 EXPORT_SYMBOL_GPL(ip_tunnel_uninit);
1230
1231 /* Do least required initialization, rest of init is done in tunnel_init call */
1232 void ip_tunnel_setup(struct net_device *dev, unsigned int net_id)
1233 {
1234         struct ip_tunnel *tunnel = netdev_priv(dev);
1235         tunnel->ip_tnl_net_id = net_id;
1236 }
1237 EXPORT_SYMBOL_GPL(ip_tunnel_setup);
1238
1239 MODULE_LICENSE("GPL");