d201ea4440c910302df961a52d3bb87b8c79a071
[sfrench/cifs-2.6.git] / net / bridge / br_netfilter.c
1 /*
2  *      Handle firewalling
3  *      Linux ethernet bridge
4  *
5  *      Authors:
6  *      Lennert Buytenhek               <buytenh@gnu.org>
7  *      Bart De Schuymer                <bdschuym@pandora.be>
8  *
9  *      This program is free software; you can redistribute it and/or
10  *      modify it under the terms of the GNU General Public License
11  *      as published by the Free Software Foundation; either version
12  *      2 of the License, or (at your option) any later version.
13  *
14  *      Lennert dedicates this file to Kerstin Wurdinger.
15  */
16
17 #include <linux/module.h>
18 #include <linux/kernel.h>
19 #include <linux/slab.h>
20 #include <linux/ip.h>
21 #include <linux/netdevice.h>
22 #include <linux/skbuff.h>
23 #include <linux/if_arp.h>
24 #include <linux/if_ether.h>
25 #include <linux/if_vlan.h>
26 #include <linux/if_pppox.h>
27 #include <linux/ppp_defs.h>
28 #include <linux/netfilter_bridge.h>
29 #include <linux/netfilter_ipv4.h>
30 #include <linux/netfilter_ipv6.h>
31 #include <linux/netfilter_arp.h>
32 #include <linux/in_route.h>
33 #include <linux/inetdevice.h>
34
35 #include <net/ip.h>
36 #include <net/ipv6.h>
37 #include <net/route.h>
38 #include <net/netfilter/br_netfilter.h>
39
40 #include <asm/uaccess.h>
41 #include "br_private.h"
42 #ifdef CONFIG_SYSCTL
43 #include <linux/sysctl.h>
44 #endif
45
#ifdef CONFIG_SYSCTL
/* Sysctl table handle. */
static struct ctl_table_header *brnf_sysctl_header;

/* Global switches consulted by the hooks in this file.  The first three
 * default to on; a hook is skipped only when both the global switch and
 * the matching per-bridge nf_call_* flag are off.
 */
static int brnf_call_iptables __read_mostly = 1;
static int brnf_call_ip6tables __read_mostly = 1;
static int brnf_call_arptables __read_mostly = 1;

/* These default to off; objects with static storage start out zeroed. */
static int brnf_filter_vlan_tagged __read_mostly;
static int brnf_filter_pppoe_tagged __read_mostly;
static int brnf_pass_vlan_indev __read_mostly;
#else
/* Without sysctl support the switches are fixed at their defaults. */
#define brnf_call_iptables 1
#define brnf_call_ip6tables 1
#define brnf_call_arptables 1
#define brnf_filter_vlan_tagged 0
#define brnf_filter_pppoe_tagged 0
#define brnf_pass_vlan_indev 0
#endif
62
/* Classify frames by skb->protocol.  These match only when
 * skb_vlan_tag_present() is false for the skb.
 */
#define IS_IP(skb) \
	(skb->protocol == htons(ETH_P_IP) && !skb_vlan_tag_present(skb))

#define IS_IPV6(skb) \
	(skb->protocol == htons(ETH_P_IPV6) && !skb_vlan_tag_present(skb))

#define IS_ARP(skb) \
	(skb->protocol == htons(ETH_P_ARP) && !skb_vlan_tag_present(skb))
71
72 static inline __be16 vlan_proto(const struct sk_buff *skb)
73 {
74         if (skb_vlan_tag_present(skb))
75                 return skb->protocol;
76         else if (skb->protocol == htons(ETH_P_8021Q))
77                 return vlan_eth_hdr(skb)->h_vlan_encapsulated_proto;
78         else
79                 return 0;
80 }
81
/* VLAN-encapsulated variants of the classifiers.  They only ever match
 * when brnf_filter_vlan_tagged is enabled.
 */
#define IS_VLAN_IP(skb) \
	(brnf_filter_vlan_tagged && vlan_proto(skb) == htons(ETH_P_IP))

#define IS_VLAN_IPV6(skb) \
	(brnf_filter_vlan_tagged && vlan_proto(skb) == htons(ETH_P_IPV6))

#define IS_VLAN_ARP(skb) \
	(brnf_filter_vlan_tagged && vlan_proto(skb) == htons(ETH_P_ARP))
93
94 static inline __be16 pppoe_proto(const struct sk_buff *skb)
95 {
96         return *((__be16 *)(skb_mac_header(skb) + ETH_HLEN +
97                             sizeof(struct pppoe_hdr)));
98 }
99
/* PPPoE-encapsulated variants of the classifiers.  They only ever match
 * when brnf_filter_pppoe_tagged is enabled.  pppoe_proto() is evaluated
 * only after skb->protocol has been confirmed to be ETH_P_PPP_SES.
 */
#define IS_PPPOE_IP(skb) \
	(brnf_filter_pppoe_tagged && \
	 skb->protocol == htons(ETH_P_PPP_SES) && \
	 pppoe_proto(skb) == htons(PPP_IP))

#define IS_PPPOE_IPV6(skb) \
	(brnf_filter_pppoe_tagged && \
	 skb->protocol == htons(ETH_P_PPP_SES) && \
	 pppoe_proto(skb) == htons(PPP_IPV6))
109
110 /* largest possible L2 header, see br_nf_dev_queue_xmit() */
111 #define NF_BRIDGE_MAX_MAC_HEADER_LENGTH (PPPOE_SES_HLEN + ETH_HLEN)
112
113 #if IS_ENABLED(CONFIG_NF_DEFRAG_IPV4)
114 struct brnf_frag_data {
115         char mac[NF_BRIDGE_MAX_MAC_HEADER_LENGTH];
116         u8 encap_size;
117         u8 size;
118 };
119
120 static DEFINE_PER_CPU(struct brnf_frag_data, brnf_frag_data_storage);
121 #endif
122
123 static struct nf_bridge_info *nf_bridge_info_get(const struct sk_buff *skb)
124 {
125         return skb->nf_bridge;
126 }
127
128 static void nf_bridge_info_free(struct sk_buff *skb)
129 {
130         if (skb->nf_bridge) {
131                 nf_bridge_put(skb->nf_bridge);
132                 skb->nf_bridge = NULL;
133         }
134 }
135
136 static inline struct rtable *bridge_parent_rtable(const struct net_device *dev)
137 {
138         struct net_bridge_port *port;
139
140         port = br_port_get_rcu(dev);
141         return port ? &port->br->fake_rtable : NULL;
142 }
143
144 static inline struct net_device *bridge_parent(const struct net_device *dev)
145 {
146         struct net_bridge_port *port;
147
148         port = br_port_get_rcu(dev);
149         return port ? port->br->dev : NULL;
150 }
151
152 static inline struct nf_bridge_info *nf_bridge_alloc(struct sk_buff *skb)
153 {
154         skb->nf_bridge = kzalloc(sizeof(struct nf_bridge_info), GFP_ATOMIC);
155         if (likely(skb->nf_bridge))
156                 atomic_set(&(skb->nf_bridge->use), 1);
157
158         return skb->nf_bridge;
159 }
160
161 static inline struct nf_bridge_info *nf_bridge_unshare(struct sk_buff *skb)
162 {
163         struct nf_bridge_info *nf_bridge = skb->nf_bridge;
164
165         if (atomic_read(&nf_bridge->use) > 1) {
166                 struct nf_bridge_info *tmp = nf_bridge_alloc(skb);
167
168                 if (tmp) {
169                         memcpy(tmp, nf_bridge, sizeof(struct nf_bridge_info));
170                         atomic_set(&tmp->use, 1);
171                 }
172                 nf_bridge_put(nf_bridge);
173                 nf_bridge = tmp;
174         }
175         return nf_bridge;
176 }
177
178 static unsigned int nf_bridge_encap_header_len(const struct sk_buff *skb)
179 {
180         switch (skb->protocol) {
181         case __cpu_to_be16(ETH_P_8021Q):
182                 return VLAN_HLEN;
183         case __cpu_to_be16(ETH_P_PPP_SES):
184                 return PPPOE_SES_HLEN;
185         default:
186                 return 0;
187         }
188 }
189
190 static inline void nf_bridge_push_encap_header(struct sk_buff *skb)
191 {
192         unsigned int len = nf_bridge_encap_header_len(skb);
193
194         skb_push(skb, len);
195         skb->network_header -= len;
196 }
197
198 static inline void nf_bridge_pull_encap_header(struct sk_buff *skb)
199 {
200         unsigned int len = nf_bridge_encap_header_len(skb);
201
202         skb_pull(skb, len);
203         skb->network_header += len;
204 }
205
206 static inline void nf_bridge_pull_encap_header_rcsum(struct sk_buff *skb)
207 {
208         unsigned int len = nf_bridge_encap_header_len(skb);
209
210         skb_pull_rcsum(skb, len);
211         skb->network_header += len;
212 }
213
214 /* When handing a packet over to the IP layer
215  * check whether we have a skb that is in the
216  * expected format
217  */
218
219 static int br_validate_ipv4(struct sk_buff *skb)
220 {
221         const struct iphdr *iph;
222         struct net_device *dev = skb->dev;
223         u32 len;
224
225         if (!pskb_may_pull(skb, sizeof(struct iphdr)))
226                 goto inhdr_error;
227
228         iph = ip_hdr(skb);
229
230         /* Basic sanity checks */
231         if (iph->ihl < 5 || iph->version != 4)
232                 goto inhdr_error;
233
234         if (!pskb_may_pull(skb, iph->ihl*4))
235                 goto inhdr_error;
236
237         iph = ip_hdr(skb);
238         if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl)))
239                 goto inhdr_error;
240
241         len = ntohs(iph->tot_len);
242         if (skb->len < len) {
243                 IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INTRUNCATEDPKTS);
244                 goto drop;
245         } else if (len < (iph->ihl*4))
246                 goto inhdr_error;
247
248         if (pskb_trim_rcsum(skb, len)) {
249                 IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INDISCARDS);
250                 goto drop;
251         }
252
253         memset(IPCB(skb), 0, sizeof(struct inet_skb_parm));
254         /* We should really parse IP options here but until
255          * somebody who actually uses IP options complains to
256          * us we'll just silently ignore the options because
257          * we're lazy!
258          */
259         return 0;
260
261 inhdr_error:
262         IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INHDRERRORS);
263 drop:
264         return -1;
265 }
266
267 /* We only check the length. A bridge shouldn't do any hop-by-hop stuff
268  * anyway
269  */
270 static int check_hbh_len(struct sk_buff *skb)
271 {
272         unsigned char *raw = (u8 *)(ipv6_hdr(skb) + 1);
273         u32 pkt_len;
274         const unsigned char *nh = skb_network_header(skb);
275         int off = raw - nh;
276         int len = (raw[1] + 1) << 3;
277
278         if ((raw + len) - skb->data > skb_headlen(skb))
279                 goto bad;
280
281         off += 2;
282         len -= 2;
283
284         while (len > 0) {
285                 int optlen = nh[off + 1] + 2;
286
287                 switch (nh[off]) {
288                 case IPV6_TLV_PAD1:
289                         optlen = 1;
290                         break;
291
292                 case IPV6_TLV_PADN:
293                         break;
294
295                 case IPV6_TLV_JUMBO:
296                         if (nh[off + 1] != 4 || (off & 3) != 2)
297                                 goto bad;
298                         pkt_len = ntohl(*(__be32 *)(nh + off + 2));
299                         if (pkt_len <= IPV6_MAXPLEN ||
300                             ipv6_hdr(skb)->payload_len)
301                                 goto bad;
302                         if (pkt_len > skb->len - sizeof(struct ipv6hdr))
303                                 goto bad;
304                         if (pskb_trim_rcsum(skb,
305                                             pkt_len + sizeof(struct ipv6hdr)))
306                                 goto bad;
307                         nh = skb_network_header(skb);
308                         break;
309                 default:
310                         if (optlen > len)
311                                 goto bad;
312                         break;
313                 }
314                 off += optlen;
315                 len -= optlen;
316         }
317         if (len == 0)
318                 return 0;
319 bad:
320         return -1;
321 }
322
323 static void nf_bridge_update_protocol(struct sk_buff *skb)
324 {
325         switch (skb->nf_bridge->orig_proto) {
326         case BRNF_PROTO_8021Q:
327                 skb->protocol = htons(ETH_P_8021Q);
328                 break;
329         case BRNF_PROTO_PPPOE:
330                 skb->protocol = htons(ETH_P_PPP_SES);
331                 break;
332         case BRNF_PROTO_UNCHANGED:
333                 break;
334         }
335 }
336
337 /* Obtain the correct destination MAC address, while preserving the original
338  * source MAC address. If we already know this address, we just copy it. If we
339  * don't, we use the neighbour framework to find out. In both cases, we make
340  * sure that br_handle_frame_finish() is called afterwards.
341  */
342 static int br_nf_pre_routing_finish_bridge(struct sock *sk, struct sk_buff *skb)
343 {
344         struct neighbour *neigh;
345         struct dst_entry *dst;
346
347         skb->dev = bridge_parent(skb->dev);
348         if (!skb->dev)
349                 goto free_skb;
350         dst = skb_dst(skb);
351         neigh = dst_neigh_lookup_skb(dst, skb);
352         if (neigh) {
353                 struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb);
354                 int ret;
355
356                 if (neigh->hh.hh_len) {
357                         neigh_hh_bridge(&neigh->hh, skb);
358                         skb->dev = nf_bridge->physindev;
359                         ret = br_handle_frame_finish(sk, skb);
360                 } else {
361                         /* the neighbour function below overwrites the complete
362                          * MAC header, so we save the Ethernet source address and
363                          * protocol number.
364                          */
365                         skb_copy_from_linear_data_offset(skb,
366                                                          -(ETH_HLEN-ETH_ALEN),
367                                                          nf_bridge->neigh_header,
368                                                          ETH_HLEN-ETH_ALEN);
369                         /* tell br_dev_xmit to continue with forwarding */
370                         nf_bridge->mask |= BRNF_BRIDGED_DNAT;
371                         /* FIXME Need to refragment */
372                         ret = neigh->output(neigh, skb);
373                 }
374                 neigh_release(neigh);
375                 return ret;
376         }
377 free_skb:
378         kfree_skb(skb);
379         return 0;
380 }
381
382 static bool daddr_was_changed(const struct sk_buff *skb,
383                               const struct nf_bridge_info *nf_bridge)
384 {
385         switch (skb->protocol) {
386         case htons(ETH_P_IP):
387                 return ip_hdr(skb)->daddr != nf_bridge->ipv4_daddr;
388         case htons(ETH_P_IPV6):
389                 return memcmp(&nf_bridge->ipv6_daddr, &ipv6_hdr(skb)->daddr,
390                               sizeof(ipv6_hdr(skb)->daddr)) != 0;
391         default:
392                 return false;
393         }
394 }
395
396 /* PF_BRIDGE/PRE_ROUTING: Undo the changes made for ip6tables
397  * PREROUTING and continue the bridge PRE_ROUTING hook. See comment
398  * for br_nf_pre_routing_finish(), same logic is used here but
399  * equivalent IPv6 function ip6_route_input() called indirectly.
400  */
401 static int br_nf_pre_routing_finish_ipv6(struct sock *sk, struct sk_buff *skb)
402 {
403         struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb);
404         struct rtable *rt;
405         struct net_device *dev = skb->dev;
406         const struct nf_ipv6_ops *v6ops = nf_get_ipv6_ops();
407
408         if (nf_bridge->pkt_otherhost) {
409                 skb->pkt_type = PACKET_OTHERHOST;
410                 nf_bridge->pkt_otherhost = false;
411         }
412         nf_bridge->mask &= ~BRNF_NF_BRIDGE_PREROUTING;
413         if (daddr_was_changed(skb, nf_bridge)) {
414                 skb_dst_drop(skb);
415                 v6ops->route_input(skb);
416
417                 if (skb_dst(skb)->error) {
418                         kfree_skb(skb);
419                         return 0;
420                 }
421
422                 if (skb_dst(skb)->dev == dev) {
423                         skb->dev = nf_bridge->physindev;
424                         nf_bridge_update_protocol(skb);
425                         nf_bridge_push_encap_header(skb);
426                         NF_HOOK_THRESH(NFPROTO_BRIDGE, NF_BR_PRE_ROUTING,
427                                        sk, skb, skb->dev, NULL,
428                                        br_nf_pre_routing_finish_bridge,
429                                        1);
430                         return 0;
431                 }
432                 ether_addr_copy(eth_hdr(skb)->h_dest, dev->dev_addr);
433                 skb->pkt_type = PACKET_HOST;
434         } else {
435                 rt = bridge_parent_rtable(nf_bridge->physindev);
436                 if (!rt) {
437                         kfree_skb(skb);
438                         return 0;
439                 }
440                 skb_dst_set_noref(skb, &rt->dst);
441         }
442
443         skb->dev = nf_bridge->physindev;
444         nf_bridge_update_protocol(skb);
445         nf_bridge_push_encap_header(skb);
446         NF_HOOK_THRESH(NFPROTO_BRIDGE, NF_BR_PRE_ROUTING, sk, skb,
447                        skb->dev, NULL,
448                        br_handle_frame_finish, 1);
449
450         return 0;
451 }
452
453 /* This requires some explaining. If DNAT has taken place,
454  * we will need to fix up the destination Ethernet address.
455  * This is also true when SNAT takes place (for the reply direction).
456  *
457  * There are two cases to consider:
458  * 1. The packet was DNAT'ed to a device in the same bridge
459  *    port group as it was received on. We can still bridge
460  *    the packet.
461  * 2. The packet was DNAT'ed to a different device, either
462  *    a non-bridged device or another bridge port group.
463  *    The packet will need to be routed.
464  *
465  * The correct way of distinguishing between these two cases is to
466  * call ip_route_input() and to look at skb->dst->dev, which is
467  * changed to the destination device if ip_route_input() succeeds.
468  *
469  * Let's first consider the case that ip_route_input() succeeds:
470  *
471  * If the output device equals the logical bridge device the packet
472  * came in on, we can consider this bridging. The corresponding MAC
473  * address will be obtained in br_nf_pre_routing_finish_bridge.
474  * Otherwise, the packet is considered to be routed and we just
475  * change the destination MAC address so that the packet will
476  * later be passed up to the IP stack to be routed. For a redirected
477  * packet, ip_route_input() will give back the localhost as output device,
478  * which differs from the bridge device.
479  *
480  * Let's now consider the case that ip_route_input() fails:
481  *
482  * This can be because the destination address is martian, in which case
483  * the packet will be dropped.
484  * If IP forwarding is disabled, ip_route_input() will fail, while
485  * ip_route_output_key() can return success. The source
486  * address for ip_route_output_key() is set to zero, so ip_route_output_key()
487  * thinks we're handling a locally generated packet and won't care
488  * if IP forwarding is enabled. If the output device equals the logical bridge
489  * device, we proceed as if ip_route_input() succeeded. If it differs from the
490  * logical bridge port or if ip_route_output_key() fails we drop the packet.
491  */
492 static int br_nf_pre_routing_finish(struct sock *sk, struct sk_buff *skb)
493 {
494         struct net_device *dev = skb->dev;
495         struct iphdr *iph = ip_hdr(skb);
496         struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb);
497         struct rtable *rt;
498         int err;
499
500         nf_bridge->frag_max_size = IPCB(skb)->frag_max_size;
501
502         if (nf_bridge->pkt_otherhost) {
503                 skb->pkt_type = PACKET_OTHERHOST;
504                 nf_bridge->pkt_otherhost = false;
505         }
506         nf_bridge->mask &= ~BRNF_NF_BRIDGE_PREROUTING;
507         if (daddr_was_changed(skb, nf_bridge)) {
508                 if ((err = ip_route_input(skb, iph->daddr, iph->saddr, iph->tos, dev))) {
509                         struct in_device *in_dev = __in_dev_get_rcu(dev);
510
511                         /* If err equals -EHOSTUNREACH the error is due to a
512                          * martian destination or due to the fact that
513                          * forwarding is disabled. For most martian packets,
514                          * ip_route_output_key() will fail. It won't fail for 2 types of
515                          * martian destinations: loopback destinations and destination
516                          * 0.0.0.0. In both cases the packet will be dropped because the
517                          * destination is the loopback device and not the bridge. */
518                         if (err != -EHOSTUNREACH || !in_dev || IN_DEV_FORWARD(in_dev))
519                                 goto free_skb;
520
521                         rt = ip_route_output(dev_net(dev), iph->daddr, 0,
522                                              RT_TOS(iph->tos), 0);
523                         if (!IS_ERR(rt)) {
524                                 /* - Bridged-and-DNAT'ed traffic doesn't
525                                  *   require ip_forwarding. */
526                                 if (rt->dst.dev == dev) {
527                                         skb_dst_set(skb, &rt->dst);
528                                         goto bridged_dnat;
529                                 }
530                                 ip_rt_put(rt);
531                         }
532 free_skb:
533                         kfree_skb(skb);
534                         return 0;
535                 } else {
536                         if (skb_dst(skb)->dev == dev) {
537 bridged_dnat:
538                                 skb->dev = nf_bridge->physindev;
539                                 nf_bridge_update_protocol(skb);
540                                 nf_bridge_push_encap_header(skb);
541                                 NF_HOOK_THRESH(NFPROTO_BRIDGE,
542                                                NF_BR_PRE_ROUTING,
543                                                sk, skb, skb->dev, NULL,
544                                                br_nf_pre_routing_finish_bridge,
545                                                1);
546                                 return 0;
547                         }
548                         ether_addr_copy(eth_hdr(skb)->h_dest, dev->dev_addr);
549                         skb->pkt_type = PACKET_HOST;
550                 }
551         } else {
552                 rt = bridge_parent_rtable(nf_bridge->physindev);
553                 if (!rt) {
554                         kfree_skb(skb);
555                         return 0;
556                 }
557                 skb_dst_set_noref(skb, &rt->dst);
558         }
559
560         skb->dev = nf_bridge->physindev;
561         nf_bridge_update_protocol(skb);
562         nf_bridge_push_encap_header(skb);
563         NF_HOOK_THRESH(NFPROTO_BRIDGE, NF_BR_PRE_ROUTING, sk, skb,
564                        skb->dev, NULL,
565                        br_handle_frame_finish, 1);
566
567         return 0;
568 }
569
570 static struct net_device *brnf_get_logical_dev(struct sk_buff *skb, const struct net_device *dev)
571 {
572         struct net_device *vlan, *br;
573
574         br = bridge_parent(dev);
575         if (brnf_pass_vlan_indev == 0 || !skb_vlan_tag_present(skb))
576                 return br;
577
578         vlan = __vlan_find_dev_deep_rcu(br, skb->vlan_proto,
579                                     skb_vlan_tag_get(skb) & VLAN_VID_MASK);
580
581         return vlan ? vlan : br;
582 }
583
584 /* Some common code for IPv4/IPv6 */
585 static struct net_device *setup_pre_routing(struct sk_buff *skb)
586 {
587         struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb);
588
589         if (skb->pkt_type == PACKET_OTHERHOST) {
590                 skb->pkt_type = PACKET_HOST;
591                 nf_bridge->pkt_otherhost = true;
592         }
593
594         nf_bridge->mask |= BRNF_NF_BRIDGE_PREROUTING;
595         nf_bridge->physindev = skb->dev;
596         skb->dev = brnf_get_logical_dev(skb, skb->dev);
597
598         if (skb->protocol == htons(ETH_P_8021Q))
599                 nf_bridge->orig_proto = BRNF_PROTO_8021Q;
600         else if (skb->protocol == htons(ETH_P_PPP_SES))
601                 nf_bridge->orig_proto = BRNF_PROTO_PPPOE;
602
603         /* Must drop socket now because of tproxy. */
604         skb_orphan(skb);
605         return skb->dev;
606 }
607
608 /* Replicate the checks that IPv6 does on packet reception and pass the packet
609  * to ip6tables, which doesn't support NAT, so things are fairly simple. */
610 static unsigned int br_nf_pre_routing_ipv6(const struct nf_hook_ops *ops,
611                                            struct sk_buff *skb,
612                                            const struct nf_hook_state *state)
613 {
614         struct nf_bridge_info *nf_bridge;
615         const struct ipv6hdr *hdr;
616         u32 pkt_len;
617
618         if (skb->len < sizeof(struct ipv6hdr))
619                 return NF_DROP;
620
621         if (!pskb_may_pull(skb, sizeof(struct ipv6hdr)))
622                 return NF_DROP;
623
624         hdr = ipv6_hdr(skb);
625
626         if (hdr->version != 6)
627                 return NF_DROP;
628
629         pkt_len = ntohs(hdr->payload_len);
630
631         if (pkt_len || hdr->nexthdr != NEXTHDR_HOP) {
632                 if (pkt_len + sizeof(struct ipv6hdr) > skb->len)
633                         return NF_DROP;
634                 if (pskb_trim_rcsum(skb, pkt_len + sizeof(struct ipv6hdr)))
635                         return NF_DROP;
636         }
637         if (hdr->nexthdr == NEXTHDR_HOP && check_hbh_len(skb))
638                 return NF_DROP;
639
640         nf_bridge_put(skb->nf_bridge);
641         if (!nf_bridge_alloc(skb))
642                 return NF_DROP;
643         if (!setup_pre_routing(skb))
644                 return NF_DROP;
645
646         nf_bridge = nf_bridge_info_get(skb);
647         nf_bridge->ipv6_daddr = ipv6_hdr(skb)->daddr;
648
649         skb->protocol = htons(ETH_P_IPV6);
650         NF_HOOK(NFPROTO_IPV6, NF_INET_PRE_ROUTING, state->sk, skb,
651                 skb->dev, NULL,
652                 br_nf_pre_routing_finish_ipv6);
653
654         return NF_STOLEN;
655 }
656
657 /* Direct IPv6 traffic to br_nf_pre_routing_ipv6.
658  * Replicate the checks that IPv4 does on packet reception.
659  * Set skb->dev to the bridge device (i.e. parent of the
660  * receiving device) to make netfilter happy, the REDIRECT
661  * target in particular.  Save the original destination IP
662  * address to be able to detect DNAT afterwards. */
663 static unsigned int br_nf_pre_routing(const struct nf_hook_ops *ops,
664                                       struct sk_buff *skb,
665                                       const struct nf_hook_state *state)
666 {
667         struct nf_bridge_info *nf_bridge;
668         struct net_bridge_port *p;
669         struct net_bridge *br;
670         __u32 len = nf_bridge_encap_header_len(skb);
671
672         if (unlikely(!pskb_may_pull(skb, len)))
673                 return NF_DROP;
674
675         p = br_port_get_rcu(state->in);
676         if (p == NULL)
677                 return NF_DROP;
678         br = p->br;
679
680         if (IS_IPV6(skb) || IS_VLAN_IPV6(skb) || IS_PPPOE_IPV6(skb)) {
681                 if (!brnf_call_ip6tables && !br->nf_call_ip6tables)
682                         return NF_ACCEPT;
683
684                 nf_bridge_pull_encap_header_rcsum(skb);
685                 return br_nf_pre_routing_ipv6(ops, skb, state);
686         }
687
688         if (!brnf_call_iptables && !br->nf_call_iptables)
689                 return NF_ACCEPT;
690
691         if (!IS_IP(skb) && !IS_VLAN_IP(skb) && !IS_PPPOE_IP(skb))
692                 return NF_ACCEPT;
693
694         nf_bridge_pull_encap_header_rcsum(skb);
695
696         if (br_validate_ipv4(skb))
697                 return NF_DROP;
698
699         nf_bridge_put(skb->nf_bridge);
700         if (!nf_bridge_alloc(skb))
701                 return NF_DROP;
702         if (!setup_pre_routing(skb))
703                 return NF_DROP;
704
705         nf_bridge = nf_bridge_info_get(skb);
706         nf_bridge->ipv4_daddr = ip_hdr(skb)->daddr;
707
708         skb->protocol = htons(ETH_P_IP);
709
710         NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING, state->sk, skb,
711                 skb->dev, NULL,
712                 br_nf_pre_routing_finish);
713
714         return NF_STOLEN;
715 }
716
717
718 /* PF_BRIDGE/LOCAL_IN ************************************************/
719 /* The packet is locally destined, which requires a real
720  * dst_entry, so detach the fake one.  On the way up, the
721  * packet would pass through PRE_ROUTING again (which already
722  * took place when the packet entered the bridge), but we
723  * register an IPv4 PRE_ROUTING 'sabotage' hook that will
724  * prevent this from happening. */
725 static unsigned int br_nf_local_in(const struct nf_hook_ops *ops,
726                                    struct sk_buff *skb,
727                                    const struct nf_hook_state *state)
728 {
729         br_drop_fake_rtable(skb);
730         return NF_ACCEPT;
731 }
732
733 /* PF_BRIDGE/FORWARD *************************************************/
734 static int br_nf_forward_finish(struct sock *sk, struct sk_buff *skb)
735 {
736         struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb);
737         struct net_device *in;
738
739         if (!IS_ARP(skb) && !IS_VLAN_ARP(skb)) {
740
741                 if (skb->protocol == htons(ETH_P_IP)) {
742                         nf_bridge->frag_max_size = IPCB(skb)->frag_max_size;
743                 }
744
745                 in = nf_bridge->physindev;
746                 if (nf_bridge->pkt_otherhost) {
747                         skb->pkt_type = PACKET_OTHERHOST;
748                         nf_bridge->pkt_otherhost = false;
749                 }
750                 nf_bridge_update_protocol(skb);
751         } else {
752                 in = *((struct net_device **)(skb->cb));
753         }
754         nf_bridge_push_encap_header(skb);
755
756         NF_HOOK_THRESH(NFPROTO_BRIDGE, NF_BR_FORWARD, sk, skb,
757                        in, skb->dev, br_forward_finish, 1);
758         return 0;
759 }
760
761
762 /* This is the 'purely bridged' case.  For IP, we pass the packet to
763  * netfilter with indev and outdev set to the bridge device,
764  * but we are still able to filter on the 'real' indev/outdev
765  * because of the physdev module. For ARP, indev and outdev are the
766  * bridge ports. */
767 static unsigned int br_nf_forward_ip(const struct nf_hook_ops *ops,
768                                      struct sk_buff *skb,
769                                      const struct nf_hook_state *state)
770 {
771         struct nf_bridge_info *nf_bridge;
772         struct net_device *parent;
773         u_int8_t pf;
774
775         if (!skb->nf_bridge)
776                 return NF_ACCEPT;
777
778         /* Need exclusive nf_bridge_info since we might have multiple
779          * different physoutdevs. */
780         if (!nf_bridge_unshare(skb))
781                 return NF_DROP;
782
783         nf_bridge = nf_bridge_info_get(skb);
784         if (!nf_bridge)
785                 return NF_DROP;
786
787         parent = bridge_parent(state->out);
788         if (!parent)
789                 return NF_DROP;
790
791         if (IS_IP(skb) || IS_VLAN_IP(skb) || IS_PPPOE_IP(skb))
792                 pf = NFPROTO_IPV4;
793         else if (IS_IPV6(skb) || IS_VLAN_IPV6(skb) || IS_PPPOE_IPV6(skb))
794                 pf = NFPROTO_IPV6;
795         else
796                 return NF_ACCEPT;
797
798         nf_bridge_pull_encap_header(skb);
799
800         if (skb->pkt_type == PACKET_OTHERHOST) {
801                 skb->pkt_type = PACKET_HOST;
802                 nf_bridge->pkt_otherhost = true;
803         }
804
805         if (pf == NFPROTO_IPV4) {
806                 if (br_validate_ipv4(skb))
807                         return NF_DROP;
808                 IPCB(skb)->frag_max_size = nf_bridge->frag_max_size;
809         }
810
811         nf_bridge->physoutdev = skb->dev;
812         if (pf == NFPROTO_IPV4)
813                 skb->protocol = htons(ETH_P_IP);
814         else
815                 skb->protocol = htons(ETH_P_IPV6);
816
817         NF_HOOK(pf, NF_INET_FORWARD, NULL, skb,
818                 brnf_get_logical_dev(skb, state->in),
819                 parent, br_nf_forward_finish);
820
821         return NF_STOLEN;
822 }
823
824 static unsigned int br_nf_forward_arp(const struct nf_hook_ops *ops,
825                                       struct sk_buff *skb,
826                                       const struct nf_hook_state *state)
827 {
828         struct net_bridge_port *p;
829         struct net_bridge *br;
830         struct net_device **d = (struct net_device **)(skb->cb);
831
832         p = br_port_get_rcu(state->out);
833         if (p == NULL)
834                 return NF_ACCEPT;
835         br = p->br;
836
837         if (!brnf_call_arptables && !br->nf_call_arptables)
838                 return NF_ACCEPT;
839
840         if (!IS_ARP(skb)) {
841                 if (!IS_VLAN_ARP(skb))
842                         return NF_ACCEPT;
843                 nf_bridge_pull_encap_header(skb);
844         }
845
846         if (arp_hdr(skb)->ar_pln != 4) {
847                 if (IS_VLAN_ARP(skb))
848                         nf_bridge_push_encap_header(skb);
849                 return NF_ACCEPT;
850         }
851         *d = state->in;
852         NF_HOOK(NFPROTO_ARP, NF_ARP_FORWARD, state->sk, skb,
853                 state->in, state->out, br_nf_forward_finish);
854
855         return NF_STOLEN;
856 }
857
858 #if IS_ENABLED(CONFIG_NF_DEFRAG_IPV4)
859 static int br_nf_push_frag_xmit(struct sock *sk, struct sk_buff *skb)
860 {
861         struct brnf_frag_data *data;
862         int err;
863
864         data = this_cpu_ptr(&brnf_frag_data_storage);
865         err = skb_cow_head(skb, data->size);
866
867         if (err) {
868                 kfree_skb(skb);
869                 return 0;
870         }
871
872         skb_copy_to_linear_data_offset(skb, -data->size, data->mac, data->size);
873         __skb_push(skb, data->encap_size);
874
875         nf_bridge_info_free(skb);
876         return br_dev_queue_push_xmit(sk, skb);
877 }
878
879 static int br_nf_ip_fragment(struct sock *sk, struct sk_buff *skb,
880                              int (*output)(struct sock *, struct sk_buff *))
881 {
882         unsigned int mtu = ip_skb_dst_mtu(skb);
883         struct iphdr *iph = ip_hdr(skb);
884         struct rtable *rt = skb_rtable(skb);
885         struct net_device *dev = rt->dst.dev;
886
887         if (unlikely(((iph->frag_off & htons(IP_DF)) && !skb->ignore_df) ||
888                      (IPCB(skb)->frag_max_size &&
889                       IPCB(skb)->frag_max_size > mtu))) {
890                 IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
891                 kfree_skb(skb);
892                 return -EMSGSIZE;
893         }
894
895         return ip_do_fragment(sk, skb, output);
896 }
897
898 static int br_nf_dev_queue_xmit(struct sock *sk, struct sk_buff *skb)
899 {
900         int ret;
901         struct nf_bridge_info *nf_bridge;
902         unsigned int mtu_reserved;
903
904         if (skb_is_gso(skb) || skb->protocol != htons(ETH_P_IP)) {
905                 nf_bridge_info_free(skb);
906                 return br_dev_queue_push_xmit(sk, skb);
907         }
908
909         mtu_reserved = nf_bridge_mtu_reduction(skb);
910         nf_bridge = nf_bridge_info_get(skb);
911         /* This is wrong! We should preserve the original fragment
912          * boundaries by preserving frag_list rather than refragmenting.
913          */
914         if (skb->len + mtu_reserved > skb->dev->mtu) {
915                 struct brnf_frag_data *data;
916
917                 if (br_validate_ipv4(skb))
918                         return NF_DROP;
919
920                 IPCB(skb)->frag_max_size = nf_bridge->frag_max_size;
921
922                 nf_bridge_update_protocol(skb);
923
924                 data = this_cpu_ptr(&brnf_frag_data_storage);
925                 data->encap_size = nf_bridge_encap_header_len(skb);
926                 data->size = ETH_HLEN + data->encap_size;
927
928                 skb_copy_from_linear_data_offset(skb, -data->size, data->mac,
929                                                  data->size);
930
931                 ret = br_nf_ip_fragment(sk, skb, br_nf_push_frag_xmit);
932         } else {
933                 nf_bridge_info_free(skb);
934                 ret = br_dev_queue_push_xmit(sk, skb);
935         }
936
937         return ret;
938 }
939 #else
/* Variant built when CONFIG_NF_DEFRAG_IPV4 is not enabled: no
 * re-fragmentation is attempted, the nf_bridge info is released and the
 * skb is queued as is.  Returns the result of br_dev_queue_push_xmit().
 */
static int br_nf_dev_queue_xmit(struct sock *sk, struct sk_buff *skb)
{
	nf_bridge_info_free(skb);

	return br_dev_queue_push_xmit(sk, skb);
}
945 #endif
946
947 /* PF_BRIDGE/POST_ROUTING ********************************************/
948 static unsigned int br_nf_post_routing(const struct nf_hook_ops *ops,
949                                        struct sk_buff *skb,
950                                        const struct nf_hook_state *state)
951 {
952         struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb);
953         struct net_device *realoutdev = bridge_parent(skb->dev);
954         u_int8_t pf;
955
956         /* if nf_bridge is set, but ->physoutdev is NULL, this packet came in
957          * on a bridge, but was delivered locally and is now being routed:
958          *
959          * POST_ROUTING was already invoked from the ip stack.
960          */
961         if (!nf_bridge || !nf_bridge->physoutdev)
962                 return NF_ACCEPT;
963
964         if (!realoutdev)
965                 return NF_DROP;
966
967         if (IS_IP(skb) || IS_VLAN_IP(skb) || IS_PPPOE_IP(skb))
968                 pf = NFPROTO_IPV4;
969         else if (IS_IPV6(skb) || IS_VLAN_IPV6(skb) || IS_PPPOE_IPV6(skb))
970                 pf = NFPROTO_IPV6;
971         else
972                 return NF_ACCEPT;
973
974         /* We assume any code from br_dev_queue_push_xmit onwards doesn't care
975          * about the value of skb->pkt_type. */
976         if (skb->pkt_type == PACKET_OTHERHOST) {
977                 skb->pkt_type = PACKET_HOST;
978                 nf_bridge->pkt_otherhost = true;
979         }
980
981         nf_bridge_pull_encap_header(skb);
982         if (pf == NFPROTO_IPV4)
983                 skb->protocol = htons(ETH_P_IP);
984         else
985                 skb->protocol = htons(ETH_P_IPV6);
986
987         NF_HOOK(pf, NF_INET_POST_ROUTING, state->sk, skb,
988                 NULL, realoutdev,
989                 br_nf_dev_queue_xmit);
990
991         return NF_STOLEN;
992 }
993
994 /* IP/SABOTAGE *****************************************************/
995 /* Don't hand locally destined packets to PF_INET(6)/PRE_ROUTING
996  * for the second time. */
997 static unsigned int ip_sabotage_in(const struct nf_hook_ops *ops,
998                                    struct sk_buff *skb,
999                                    const struct nf_hook_state *state)
1000 {
1001         if (skb->nf_bridge &&
1002             !(skb->nf_bridge->mask & BRNF_NF_BRIDGE_PREROUTING)) {
1003                 return NF_STOP;
1004         }
1005
1006         return NF_ACCEPT;
1007 }
1008
1009 /* This is called when br_netfilter has called into iptables/netfilter,
1010  * and DNAT has taken place on a bridge-forwarded packet.
1011  *
1012  * neigh->output has created a new MAC header, with local br0 MAC
1013  * as saddr.
1014  *
1015  * This restores the original MAC saddr of the bridged packet
1016  * before invoking bridge forward logic to transmit the packet.
1017  */
1018 static void br_nf_pre_routing_finish_bridge_slow(struct sk_buff *skb)
1019 {
1020         struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb);
1021
1022         skb_pull(skb, ETH_HLEN);
1023         nf_bridge->mask &= ~BRNF_BRIDGED_DNAT;
1024
1025         BUILD_BUG_ON(sizeof(nf_bridge->neigh_header) != (ETH_HLEN - ETH_ALEN));
1026
1027         skb_copy_to_linear_data_offset(skb, -(ETH_HLEN - ETH_ALEN),
1028                                        nf_bridge->neigh_header,
1029                                        ETH_HLEN - ETH_ALEN);
1030         skb->dev = nf_bridge->physindev;
1031
1032         nf_bridge->physoutdev = NULL;
1033         br_handle_frame_finish(NULL, skb);
1034 }
1035
1036 static int br_nf_dev_xmit(struct sk_buff *skb)
1037 {
1038         if (skb->nf_bridge && (skb->nf_bridge->mask & BRNF_BRIDGED_DNAT)) {
1039                 br_nf_pre_routing_finish_bridge_slow(skb);
1040                 return 1;
1041         }
1042         return 0;
1043 }
1044
1045 static const struct nf_br_ops br_ops = {
1046         .br_dev_xmit_hook =     br_nf_dev_xmit,
1047 };
1048
/* Exported no-op.
 * NOTE(review): presumably exists so that other modules can reference this
 * symbol and thereby pull in br_netfilter - confirm against the callers.
 */
void br_netfilter_enable(void)
{
	/* intentionally empty */
}
EXPORT_SYMBOL_GPL(br_netfilter_enable);
1053
1054 /* For br_nf_post_routing, we need (prio = NF_BR_PRI_LAST), because
1055  * br_dev_queue_push_xmit is called afterwards */
1056 static struct nf_hook_ops br_nf_ops[] __read_mostly = {
1057         {
1058                 .hook = br_nf_pre_routing,
1059                 .owner = THIS_MODULE,
1060                 .pf = NFPROTO_BRIDGE,
1061                 .hooknum = NF_BR_PRE_ROUTING,
1062                 .priority = NF_BR_PRI_BRNF,
1063         },
1064         {
1065                 .hook = br_nf_local_in,
1066                 .owner = THIS_MODULE,
1067                 .pf = NFPROTO_BRIDGE,
1068                 .hooknum = NF_BR_LOCAL_IN,
1069                 .priority = NF_BR_PRI_BRNF,
1070         },
1071         {
1072                 .hook = br_nf_forward_ip,
1073                 .owner = THIS_MODULE,
1074                 .pf = NFPROTO_BRIDGE,
1075                 .hooknum = NF_BR_FORWARD,
1076                 .priority = NF_BR_PRI_BRNF - 1,
1077         },
1078         {
1079                 .hook = br_nf_forward_arp,
1080                 .owner = THIS_MODULE,
1081                 .pf = NFPROTO_BRIDGE,
1082                 .hooknum = NF_BR_FORWARD,
1083                 .priority = NF_BR_PRI_BRNF,
1084         },
1085         {
1086                 .hook = br_nf_post_routing,
1087                 .owner = THIS_MODULE,
1088                 .pf = NFPROTO_BRIDGE,
1089                 .hooknum = NF_BR_POST_ROUTING,
1090                 .priority = NF_BR_PRI_LAST,
1091         },
1092         {
1093                 .hook = ip_sabotage_in,
1094                 .owner = THIS_MODULE,
1095                 .pf = NFPROTO_IPV4,
1096                 .hooknum = NF_INET_PRE_ROUTING,
1097                 .priority = NF_IP_PRI_FIRST,
1098         },
1099         {
1100                 .hook = ip_sabotage_in,
1101                 .owner = THIS_MODULE,
1102                 .pf = NFPROTO_IPV6,
1103                 .hooknum = NF_INET_PRE_ROUTING,
1104                 .priority = NF_IP6_PRI_FIRST,
1105         },
1106 };
1107
1108 #ifdef CONFIG_SYSCTL
1109 static
1110 int brnf_sysctl_call_tables(struct ctl_table *ctl, int write,
1111                             void __user *buffer, size_t *lenp, loff_t *ppos)
1112 {
1113         int ret;
1114
1115         ret = proc_dointvec(ctl, write, buffer, lenp, ppos);
1116
1117         if (write && *(int *)(ctl->data))
1118                 *(int *)(ctl->data) = 1;
1119         return ret;
1120 }
1121
1122 static struct ctl_table brnf_table[] = {
1123         {
1124                 .procname       = "bridge-nf-call-arptables",
1125                 .data           = &brnf_call_arptables,
1126                 .maxlen         = sizeof(int),
1127                 .mode           = 0644,
1128                 .proc_handler   = brnf_sysctl_call_tables,
1129         },
1130         {
1131                 .procname       = "bridge-nf-call-iptables",
1132                 .data           = &brnf_call_iptables,
1133                 .maxlen         = sizeof(int),
1134                 .mode           = 0644,
1135                 .proc_handler   = brnf_sysctl_call_tables,
1136         },
1137         {
1138                 .procname       = "bridge-nf-call-ip6tables",
1139                 .data           = &brnf_call_ip6tables,
1140                 .maxlen         = sizeof(int),
1141                 .mode           = 0644,
1142                 .proc_handler   = brnf_sysctl_call_tables,
1143         },
1144         {
1145                 .procname       = "bridge-nf-filter-vlan-tagged",
1146                 .data           = &brnf_filter_vlan_tagged,
1147                 .maxlen         = sizeof(int),
1148                 .mode           = 0644,
1149                 .proc_handler   = brnf_sysctl_call_tables,
1150         },
1151         {
1152                 .procname       = "bridge-nf-filter-pppoe-tagged",
1153                 .data           = &brnf_filter_pppoe_tagged,
1154                 .maxlen         = sizeof(int),
1155                 .mode           = 0644,
1156                 .proc_handler   = brnf_sysctl_call_tables,
1157         },
1158         {
1159                 .procname       = "bridge-nf-pass-vlan-input-dev",
1160                 .data           = &brnf_pass_vlan_indev,
1161                 .maxlen         = sizeof(int),
1162                 .mode           = 0644,
1163                 .proc_handler   = brnf_sysctl_call_tables,
1164         },
1165         { }
1166 };
1167 #endif
1168
1169 static int __init br_netfilter_init(void)
1170 {
1171         int ret;
1172
1173         ret = nf_register_hooks(br_nf_ops, ARRAY_SIZE(br_nf_ops));
1174         if (ret < 0)
1175                 return ret;
1176
1177 #ifdef CONFIG_SYSCTL
1178         brnf_sysctl_header = register_net_sysctl(&init_net, "net/bridge", brnf_table);
1179         if (brnf_sysctl_header == NULL) {
1180                 printk(KERN_WARNING
1181                        "br_netfilter: can't register to sysctl.\n");
1182                 nf_unregister_hooks(br_nf_ops, ARRAY_SIZE(br_nf_ops));
1183                 return -ENOMEM;
1184         }
1185 #endif
1186         RCU_INIT_POINTER(nf_br_ops, &br_ops);
1187         printk(KERN_NOTICE "Bridge firewalling registered\n");
1188         return 0;
1189 }
1190
1191 static void __exit br_netfilter_fini(void)
1192 {
1193         RCU_INIT_POINTER(nf_br_ops, NULL);
1194         nf_unregister_hooks(br_nf_ops, ARRAY_SIZE(br_nf_ops));
1195 #ifdef CONFIG_SYSCTL
1196         unregister_net_sysctl_table(brnf_sysctl_header);
1197 #endif
1198 }
1199
1200 module_init(br_netfilter_init);
1201 module_exit(br_netfilter_fini);
1202
1203 MODULE_LICENSE("GPL");
1204 MODULE_AUTHOR("Lennert Buytenhek <buytenh@gnu.org>");
1205 MODULE_AUTHOR("Bart De Schuymer <bdschuym@pandora.be>");
1206 MODULE_DESCRIPTION("Linux ethernet netfilter firewall bridge");