2 ctdb system specific code to manage raw sockets on linux
4 Copyright (C) Ronnie Sahlberg 2007
5 Copyright (C) Andrew Tridgell 2007
6 Copyright (C) Marc Dequènes (Duck) 2009
7 Copyright (C) Volker Lendecke 2012
9 This program is free software; you can redistribute it and/or modify
10 it under the terms of the GNU General Public License as published by
11 the Free Software Foundation; either version 3 of the License, or
12 (at your option) any later version.
14 This program is distributed in the hope that it will be useful,
15 but WITHOUT ANY WARRANTY; without even the implied warranty of
16 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 GNU General Public License for more details.
19 You should have received a copy of the GNU General Public License
20 along with this program; if not, see <http://www.gnu.org/licenses/>.
26 * Use BSD struct tcphdr field names for portability. Modern glibc
27 * makes them available by default via <netinet/tcp.h> but older glibc
28 * requires __FAVOR_BSD to be defined.
30 * __FAVOR_BSD is normally defined in <features.h> if _DEFAULT_SOURCE
31 * (new) or _BSD_SOURCE (now deprecated) is set and _GNU_SOURCE is not
32 * set. Including "replace.h" above causes <features.h> to be
33 * indirectly included and this will not set __FAVOR_BSD because
34 * _GNU_SOURCE is set in Samba's "config.h" (which is included by "replace.h").
37 * Therefore, set __FAVOR_BSD by hand below.
40 #include "system/network.h"
42 #ifdef HAVE_NETINET_IF_ETHER_H
43 #include <netinet/if_ether.h>
45 #ifdef HAVE_NETINET_IP6_H
46 #include <netinet/ip6.h>
48 #ifdef HAVE_NETINET_ICMP6_H
49 #include <netinet/icmp6.h>
51 #ifdef HAVE_LINUX_IF_PACKET_H
52 #include <linux/if_packet.h>
56 #define ETHERTYPE_IP6 0x86dd
59 #include "lib/util/debug.h"
60 #include "lib/util/blocking.h"
62 #include "protocol/protocol.h"
64 #include "common/logging.h"
65 #include "common/system_socket.h"
68 uint16 checksum for n bytes
/*
 * Sum the n bytes at data as 16-bit words in network (big-endian) order.
 *
 * Returns the raw 32-bit accumulator: the value is neither folded to
 * 16 bits nor complemented here.
 *
 * The buffer is read byte by byte, so the result does not depend on host
 * endianness or on data being 2-byte aligned.  When n is odd the final
 * byte is taken as the high-order byte of a word padded with a zero byte,
 * as in RFC 1071; the previous ntohs() of a single byte produced that
 * value only on little-endian hosts.
 */
static uint32_t uint16_checksum(uint16_t *data, size_t n)
{
	const uint8_t *p = (const uint8_t *)data;
	uint32_t sum = 0;

	while (n >= 2) {
		sum += ((uint32_t)p[0] << 8) | (uint32_t)p[1];
		p += 2;
		n -= 2;
	}
	if (n == 1) {
		/* trailing odd byte: high half of a zero-padded word */
		sum += (uint32_t)p[0] << 8;
	}
	return sum;
}
85 * See if the given IP is currently on an interface
/*
 * ctdb_sys_have_ip: probe for a local address by trying to bind() to it.
 *
 * The visible statements work on a private copy of *_addr: the port is
 * zeroed in the copy, addrlen is set to the size of the matching sockaddr
 * type, a SOCK_STREAM/IPPROTO_TCP socket of the same address family is
 * created and bind() is called on it with the copy.
 *
 * NOTE(review): the case labels, the socket() error check, the close of
 * s and the return statement are not part of this listing, so how the
 * bind() result maps to the bool return value cannot be confirmed here.
 * NOTE(review): addrlen stays 0 unless one of the two visible assignments
 * runs - confirm what happens for other address families.
 * NOTE(review): __addr is spelled with a leading double underscore, which
 * C reserves for the implementation.
 */
87 bool ctdb_sys_have_ip(ctdb_sock_addr *_addr)
91 ctdb_sock_addr __addr = *_addr;
92 ctdb_sock_addr *addr = &__addr;
93 socklen_t addrlen = 0;
95 switch (addr->sa.sa_family) {
/* port 0: only the address part is of interest for the bind() below */
97 addr->ip.sin_port = 0;
98 addrlen = sizeof(struct sockaddr_in);
101 addr->ip6.sin6_port = 0;
102 addrlen = sizeof(struct sockaddr_in6);
106 s = socket(addr->sa.sa_family, SOCK_STREAM, IPPROTO_TCP);
111 ret = bind(s, (struct sockaddr *)addr, addrlen);
118 * simple TCP checksum - assumes data is multiple of 2 bytes long
/*
 * ip_checksum: checksum over n bytes at data plus fields of the IPv4
 * header ip.
 *
 * The visible statements start from uint16_checksum(data, n), add the
 * checksums of ip->ip_src and ip->ip_dst, and fold the carries above
 * bit 15 back into the low 16 bits twice.
 *
 * NOTE(review): lines between and after the visible statements are not
 * part of this listing (anything else added to the sum, the final
 * conversion and the return), so the exact returned value cannot be
 * confirmed from here.
 */
120 static uint16_t ip_checksum(uint16_t *data, size_t n, struct ip *ip)
122 uint32_t sum = uint16_checksum(data, n);
125 sum += uint16_checksum((uint16_t *)&ip->ip_src, sizeof(ip->ip_src));
126 sum += uint16_checksum((uint16_t *)&ip->ip_dst, sizeof(ip->ip_dst));
/* fold twice: the first fold can itself carry into bit 16 */
128 sum = (sum & 0xFFFF) + (sum >> 16);
129 sum = (sum & 0xFFFF) + (sum >> 16);
/*
 * ip6_checksum: checksum over n bytes at data plus fields of the IPv6
 * header ip6.
 *
 * The visible statements add the checksums of the 16-byte source and
 * destination addresses, of a three-word phdr array (the two 16-bit
 * halves of len, then htons(ip6->ip6_nxt)) and of the data itself, then
 * fold the carries above bit 15 back into the low 16 bits twice.
 *
 * NOTE(review): the declarations, the assignment of len and the
 * final conversion/return are not part of this listing; presumably len is
 * derived from n - confirm against the full file.
 */
138 static uint16_t ip6_checksum(uint16_t *data, size_t n, struct ip6_hdr *ip6)
145 sum += uint16_checksum((uint16_t *)(void *)&ip6->ip6_src, 16);
146 sum += uint16_checksum((uint16_t *)(void *)&ip6->ip6_dst, 16);
/* split the 32-bit len into two 16-bit words for the checksum */
149 phdr[0] = len & UINT16_MAX;
150 phdr[1] = (len >> 16) & UINT16_MAX;
151 /* ip6_nxt is only 8 bits, so fits comfortably into a uint16_t */
152 phdr[2] = htons(ip6->ip6_nxt);
153 sum += uint16_checksum(phdr, sizeof(phdr));
155 sum += uint16_checksum(data, n);
/* fold twice: the first fold can itself carry into bit 16 */
157 sum = (sum & 0xFFFF) + (sum >> 16);
158 sum = (sum & 0xFFFF) + (sum >> 16);
168 * Send gratuitous ARP request/reply or IPv6 neighbor advertisement
171 #ifdef HAVE_PACKETSOCKET
/*
 * ctdb_sys_send_arp: announce addr on interface iface using a raw
 * AF_PACKET socket.
 *
 * The interface index (SIOCGIFINDEX) and hardware address (SIOCGIFHWADDR)
 * of iface are looked up first; loopback and non-Ethernet hardware types
 * each get their own log message.  The switch on addr->ip.sin_family then
 * builds and sends:
 *   - for the ARP case, a broadcast ARP request followed by a broadcast
 *     ARP reply for addr->ip.sin_addr, each sent as a 64-byte frame;
 *   - for the IPv6 case, a neighbor advertisement for
 *     addr->ip6.sin6_addr carrying a target link-layer address option,
 *     addressed to ff02::1 / 33:33:00:00:00:01.
 * Any other family is logged as "Not an ipv4/ipv6 address".
 *
 * NOTE(review): case labels, error checks, the close of s, the return
 * statements and the declarations of s, ret, ah, ptr and ip6 are not part
 * of this listing, so return values and cleanup on failure cannot be
 * confirmed from here.
 */
173 int ctdb_sys_send_arp(const ctdb_sock_addr *addr, const char *iface)
176 struct sockaddr_ll sall = {0};
177 struct ether_header *eh;
180 struct nd_neighbor_advert *nd_na;
181 struct nd_opt_hdr *nd_oh;
182 struct ether_addr *ea;
183 struct ifreq if_hwaddr = {{{0}}};
184 /* Size of IPv6 neighbor advertisement (with option) */
185 unsigned char buffer[sizeof(struct ether_header) +
186 sizeof(struct ip6_hdr) +
187 sizeof(struct nd_neighbor_advert) +
188 sizeof(struct nd_opt_hdr) + ETH_ALEN];
/* NOTE(review): 0xff does not fit a signed char; unsigned char would avoid the implementation-defined conversion */
190 char bdcast[] = {0xff,0xff,0xff,0xff,0xff,0xff};
191 struct ifreq ifr = {{{0}}};
194 s = socket(AF_PACKET, SOCK_RAW, 0);
197 DBG_ERR("Failed to open raw socket\n");
200 DBG_DEBUG("Created SOCKET FD:%d for sending arp\n", s);
/* Look up the interface index for the destination address structure */
203 strlcpy(ifr.ifr_name, iface, sizeof(ifr.ifr_name));
204 if (ioctl(s, SIOCGIFINDEX, &ifr) < 0) {
206 DBG_ERR("Interface '%s' not found\n", iface);
210 /* Get MAC address */
211 strlcpy(if_hwaddr.ifr_name, iface, sizeof(if_hwaddr.ifr_name));
212 ret = ioctl(s, SIOCGIFHWADDR, &if_hwaddr);
215 DBG_ERR("ioctl failed\n");
218 if (ARPHRD_LOOPBACK == if_hwaddr.ifr_hwaddr.sa_family) {
220 D_DEBUG("Ignoring loopback arp request\n");
223 if (if_hwaddr.ifr_hwaddr.sa_family != ARPHRD_ETHER) {
225 DBG_ERR("Not an ethernet address family (0x%x)\n",
226 if_hwaddr.ifr_hwaddr.sa_family);
230 /* Set up most of destination address structure */
231 sall.sll_family = AF_PACKET;
232 sall.sll_halen = sizeof(struct ether_addr);
233 sall.sll_protocol = htons(ETH_P_ALL);
234 sall.sll_ifindex = ifr.ifr_ifindex;
236 switch (addr->ip.sin_family) {
/*
 * NOTE(review): the literal 64 is used for this memset and for both ARP
 * sendto() calls below, independently of sizeof(buffer); a named constant
 * would keep the three in step.
 */
238 memset(buffer, 0 , 64);
239 eh = (struct ether_header *)buffer;
240 memset(eh->ether_dhost, 0xff, ETH_ALEN);
241 memcpy(eh->ether_shost, if_hwaddr.ifr_hwaddr.sa_data, ETH_ALEN);
242 eh->ether_type = htons(ETHERTYPE_ARP);
244 ah = (struct arphdr *)&buffer[sizeof(struct ether_header)];
245 ah->ar_hrd = htons(ARPHRD_ETHER);
246 ah->ar_pro = htons(ETH_P_IP);
247 ah->ar_hln = ETH_ALEN;
250 /* send a gratuitous arp */
251 ah->ar_op = htons(ARPOP_REQUEST);
/* the ARP payload starts directly after the fixed ARP header */
252 ptr = (char *)&ah[1];
253 memcpy(ptr, if_hwaddr.ifr_hwaddr.sa_data, ETH_ALEN);
255 memcpy(ptr, &addr->ip.sin_addr, 4);
257 memset(ptr, 0, ETH_ALEN);
259 memcpy(ptr, &addr->ip.sin_addr, 4);
262 memcpy(&sall.sll_addr[0], bdcast, sall.sll_halen);
264 ret = sendto(s,buffer, 64, 0,
265 (struct sockaddr *)&sall, sizeof(sall));
268 DBG_ERR("Failed sendto\n");
272 /* send unsolicited arp reply broadcast */
273 ah->ar_op = htons(ARPOP_REPLY);
274 ptr = (char *)&ah[1];
275 memcpy(ptr, if_hwaddr.ifr_hwaddr.sa_data, ETH_ALEN);
277 memcpy(ptr, &addr->ip.sin_addr, 4);
279 memcpy(ptr, if_hwaddr.ifr_hwaddr.sa_data, ETH_ALEN);
281 memcpy(ptr, &addr->ip.sin_addr, 4);
284 ret = sendto(s, buffer, 64, 0,
285 (struct sockaddr *)&sall, sizeof(sall));
288 DBG_ERR("Failed sendto\n");
/* IPv6 neighbor advertisement: the whole buffer is used and zeroed first */
295 memset(buffer, 0 , sizeof(buffer));
296 eh = (struct ether_header *)buffer;
298 * Ethernet multicast: 33:33:00:00:00:01 (see RFC2464,
299 * section 7) - note zeroes above!
301 eh->ether_dhost[0] = eh->ether_dhost[1] = 0x33;
302 eh->ether_dhost[5] = 0x01;
303 memcpy(eh->ether_shost, if_hwaddr.ifr_hwaddr.sa_data, ETH_ALEN);
304 eh->ether_type = htons(ETHERTYPE_IP6);
306 ip6 = (struct ip6_hdr *)(eh+1);
308 ip6->ip6_plen = htons(sizeof(*nd_na) +
309 sizeof(struct nd_opt_hdr) +
311 ip6->ip6_nxt = IPPROTO_ICMPV6;
313 ip6->ip6_src = addr->ip6.sin6_addr;
314 /* all-nodes multicast */
316 ret = inet_pton(AF_INET6, "ff02::1", &ip6->ip6_dst);
319 DBG_ERR("Failed inet_pton\n");
323 nd_na = (struct nd_neighbor_advert *)(ip6+1);
324 nd_na->nd_na_type = ND_NEIGHBOR_ADVERT;
325 nd_na->nd_na_code = 0;
326 nd_na->nd_na_flags_reserved = ND_NA_FLAG_OVERRIDE;
327 nd_na->nd_na_target = addr->ip6.sin6_addr;
328 /* Option: Target link-layer address */
329 nd_oh = (struct nd_opt_hdr *)(nd_na+1);
330 nd_oh->nd_opt_type = ND_OPT_TARGET_LINKADDR;
331 nd_oh->nd_opt_len = 1;
333 ea = (struct ether_addr *)(nd_oh+1);
334 memcpy(ea, if_hwaddr.ifr_hwaddr.sa_data, ETH_ALEN);
/* checksum covers the advertisement plus option, i.e. ip6_plen bytes */
336 nd_na->nd_na_cksum = ip6_checksum((uint16_t *)nd_na,
337 ntohs(ip6->ip6_plen), ip6);
/* link-level destination is the multicast MAC written into the frame above */
339 memcpy(&sall.sll_addr[0], &eh->ether_dhost[0], sall.sll_halen);
341 ret = sendto(s, buffer, sizeof(buffer),
342 0, (struct sockaddr *)&sall, sizeof(sall));
345 DBG_ERR("Failed sendto\n");
353 DBG_ERR("Not an ipv4/ipv6 address (family is %u)\n",
354 addr->ip.sin_family);
365 #else /* HAVE_PACKETSOCKET */
/*
 * ctdb_sys_send_arp: variant compiled when HAVE_PACKETSOCKET is not
 * defined.  It has the same signature as the packet-socket version and
 * is marked as not implemented.
 *
 * NOTE(review): the return statement is not part of this listing, so the
 * value reported to callers cannot be confirmed from here.
 */
367 int ctdb_sys_send_arp(const ctdb_sock_addr *addr, const char *iface)
369 /* Not implemented */
373 #endif /* HAVE_PACKETSOCKET */
376 * Send tcp segment from the specified IP/port to the specified
377 * destination IP/port.
379 * This is used to trigger the receiving host into sending its own ACK,
380 * which should trigger early detection of TCP reset by the client
383 * This can also be used to send RST segments (if rst is true) and also
384 * if correct seq and ack numbers are provided.
/*
 * ctdb_sys_send_tcp: hand-build one TCP segment from src to dest and send
 * it through a raw socket.
 *
 * The switch on src->ip.sin_family fills either ip4pkt (IPv4 header plus
 * TCP header, sent from an AF_INET raw socket with IP_HDRINCL set) or
 * ip6pkt (IPv6 header plus TCP header, sent from an AF_INET6 raw socket).
 * seq and ack are stored into the TCP header as passed in.  The segment
 * always carries ACK; RST is added on the line that follows in each
 * branch.  The window is set to 1234 so the segment is easy to spot in a
 * capture.  Any other family is logged as "Not an ipv4/v6 address".
 *
 * Fix: the IPv6 branch used to OR in TH_RST where the IPv4 branch ORs in
 * TH_ACK, so every IPv6 segment was a reset and never carried ACK.  It
 * now sets TH_ACK, matching the IPv4 branch.
 *
 * NOTE(review): the remaining parameters, the local declarations, case
 * labels, error checks, close of s and return statements are not part of
 * this listing; return values and cleanup cannot be confirmed from here.
 * NOTE(review): in the IPv6 branch the const is cast away from dest so
 * that sin6_port can be zeroed around sendto() and restored afterwards;
 * the caller's structure is therefore modified for the duration of the
 * call.
 */
386 int ctdb_sys_send_tcp(const ctdb_sock_addr *dest,
387 const ctdb_sock_addr *src,
396 ctdb_sock_addr *tmpdest;
407 switch (src->ip.sin_family) {
/* header length is expressed in 32-bit words */
411 ip4pkt.ip.ip_hl = sizeof(ip4pkt.ip)/4;
412 ip4pkt.ip.ip_len = htons(sizeof(ip4pkt));
413 ip4pkt.ip.ip_ttl = 255;
414 ip4pkt.ip.ip_p = IPPROTO_TCP;
415 ip4pkt.ip.ip_src.s_addr = src->ip.sin_addr.s_addr;
416 ip4pkt.ip.ip_dst.s_addr = dest->ip.sin_addr.s_addr;
417 ip4pkt.ip.ip_sum = 0;
419 ip4pkt.tcp.th_sport = src->ip.sin_port;
420 ip4pkt.tcp.th_dport = dest->ip.sin_port;
421 ip4pkt.tcp.th_seq = seq;
422 ip4pkt.tcp.th_ack = ack;
423 ip4pkt.tcp.th_flags = 0;
424 ip4pkt.tcp.th_flags |= TH_ACK;
426 ip4pkt.tcp.th_flags |= TH_RST;
428 ip4pkt.tcp.th_off = sizeof(ip4pkt.tcp)/4;
429 /* this makes it easier to spot in a sniffer */
430 ip4pkt.tcp.th_win = htons(1234);
431 ip4pkt.tcp.th_sum = ip_checksum((uint16_t *)&ip4pkt.tcp,
435 /* open a raw socket to send this segment from */
436 s = socket(AF_INET, SOCK_RAW, IPPROTO_RAW);
438 DBG_ERR("Failed to open raw socket (%s)\n",
/* IP_HDRINCL: the IPv4 header built above is supplied by this code */
443 ret = setsockopt(s, IPPROTO_IP, IP_HDRINCL, &one, sizeof(one));
445 DBG_ERR("Failed to setup IP headers (%s)\n",
451 ret = sendto(s, &ip4pkt, sizeof(ip4pkt), 0,
452 (const struct sockaddr *)&dest->ip,
456 if (ret != sizeof(ip4pkt)) {
457 D_ERR("Failed sendto (%s)\n", strerror(saved_errno));
/* 0x60: IP version 6 in the top four bits of the first header byte */
463 ip6pkt.ip6.ip6_vfc = 0x60;
/* payload length 20: a TCP header without options */
464 ip6pkt.ip6.ip6_plen = htons(20);
465 ip6pkt.ip6.ip6_nxt = IPPROTO_TCP;
466 ip6pkt.ip6.ip6_hlim = 64;
467 ip6pkt.ip6.ip6_src = src->ip6.sin6_addr;
468 ip6pkt.ip6.ip6_dst = dest->ip6.sin6_addr;
470 ip6pkt.tcp.th_sport = src->ip6.sin6_port;
471 ip6pkt.tcp.th_dport = dest->ip6.sin6_port;
472 ip6pkt.tcp.th_seq = seq;
473 ip6pkt.tcp.th_ack = ack;
474 ip6pkt.tcp.th_flags = 0;
/* always ACK, as in the IPv4 branch; RST is added separately below */
475 ip6pkt.tcp.th_flags |= TH_ACK;
477 ip6pkt.tcp.th_flags |= TH_RST;
479 ip6pkt.tcp.th_off = sizeof(ip6pkt.tcp)/4;
480 /* this makes it easier to spot in a sniffer */
481 ip6pkt.tcp.th_win = htons(1234);
482 ip6pkt.tcp.th_sum = ip6_checksum((uint16_t *)&ip6pkt.tcp,
486 s = socket(AF_INET6, SOCK_RAW, IPPROTO_RAW);
488 DBG_ERR("Failed to open sending socket\n");
492 /* sendto() don't like if the port is set and the socket is
495 tmpdest = discard_const(dest);
496 tmpport = tmpdest->ip6.sin6_port;
498 tmpdest->ip6.sin6_port = 0;
499 ret = sendto(s, &ip6pkt, sizeof(ip6pkt), 0,
500 (const struct sockaddr *)&dest->ip6,
503 tmpdest->ip6.sin6_port = tmpport;
506 if (ret != sizeof(ip6pkt)) {
507 D_ERR("Failed sendto (%s)\n", strerror(saved_errno));
513 DBG_ERR("Not an ipv4/v6 address\n");
523 * If AF_PACKET is available then use a raw socket otherwise use pcap.
524 * wscript has checked to make sure that pcap is available if needed.
527 #ifdef HAVE_AF_PACKET
530 * This function is used to open a raw socket to capture from
/*
 * ctdb_sys_open_capture_socket (AF_PACKET variant): open a raw packet
 * socket that receives all protocols (ETH_P_ALL), switch it to
 * non-blocking mode and set close-on-exec on it.
 *
 * NOTE(review): neither iface nor private_data is referenced on the lines
 * visible in this listing, and the error branches and return statement
 * are not shown - presumably the descriptor s is returned on success;
 * confirm against the full file.
 */
532 int ctdb_sys_open_capture_socket(const char *iface, void **private_data)
536 /* Open a socket to capture all traffic */
537 s = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
539 DBG_ERR("Failed to open raw socket\n");
543 DBG_DEBUG("Created RAW SOCKET FD:%d for tcp tickle\n", s);
545 ret = set_blocking(s, false);
547 DBG_ERR("Failed to set socket non-blocking (%s)\n",
553 set_close_on_exec(s);
559 * This function is used to do any additional cleanup required when closing
561 * Note that the socket itself is closed automatically in the caller.
/*
 * ctdb_sys_close_capture_socket (AF_PACKET variant): cleanup hook that
 * takes the private_data pointer associated with a capture socket.
 *
 * NOTE(review): the body is not part of this listing, so what it does
 * with private_data and what it returns cannot be confirmed from here.
 */
563 int ctdb_sys_close_capture_socket(void *private_data)
570 * called when the raw socket becomes readable
/*
 * ctdb_sys_read_tcp_packet (AF_PACKET variant): read one frame from the
 * capture socket s and, if it is a TCP segment over IPv4 or IPv6, report
 * its endpoints and selected TCP header fields.
 *
 * On a match, src and dst receive the address family, addresses and
 * ports, *ack_seq receives th_ack, *window (when not NULL) receives
 * th_win and *rst receives th_flags masked with TH_RST.  Ports, ack and
 * window are copied from the header as they are, without byte-order
 * conversion.  IPv4 frames are skipped unless the version is 4, the
 * fragment offset is zero, the protocol is TCP and the frame is long
 * enough to contain th_ack.
 *
 * Fix: recv() returns -1 on failure, which is routine on a non-blocking
 * socket with nothing queued.  Compared directly against a size_t, a
 * negative value is converted to a very large unsigned number and passes
 * the minimum-length check, so the stale contents of pkt were parsed.
 * The check now rejects negative results explicitly.  This assumes ret
 * has a signed type; its declaration is not part of this listing.
 *
 * NOTE(review): the remaining parameters, local declarations, the bodies
 * of the rejection branches and the return statements are not part of
 * this listing, so return values cannot be confirmed from here.
 * NOTE(review): with MSG_TRUNC, recv() reports the full frame length even
 * when it exceeds RCVPKTSIZE, so ret can be larger than the number of
 * bytes actually stored in pkt.
 * NOTE(review): no length check comparable to the IPv4 short-packet test
 * is visible in the IPv6 branch before the TCP header is read.
 */
572 int ctdb_sys_read_tcp_packet(int s, void *private_data,
581 #define RCVPKTSIZE 100
582 char pkt[RCVPKTSIZE];
583 struct ether_header *eth;
588 ret = recv(s, pkt, RCVPKTSIZE, MSG_TRUNC);
/* reject recv() errors before the unsigned length comparison */
589 if (ret < 0 || (size_t)ret < sizeof(*eth)+sizeof(*ip)) {
597 eth = (struct ether_header *)pkt;
599 /* we want either IPv4 or IPv6 */
600 if (ntohs(eth->ether_type) == ETHERTYPE_IP) {
602 ip = (struct iphdr *)(eth+1);
604 /* We only want IPv4 packets */
605 if (ip->version != 4) {
608 /* Don't look at fragments */
609 if ((ntohs(ip->frag_off)&0x1fff) != 0) {
612 /* we only want TCP */
613 if (ip->protocol != IPPROTO_TCP) {
617 /* make sure it's not a short packet */
618 if (offsetof(struct tcphdr, th_ack) + 4 +
619 (ip->ihl*4) + sizeof(*eth) > ret) {
/* the TCP header follows the IPv4 header, whose length is ihl 32-bit words */
623 tcp = (struct tcphdr *)((ip->ihl*4) + (char *)ip);
625 /* tell the caller which one we've found */
626 src->ip.sin_family = AF_INET;
627 src->ip.sin_addr.s_addr = ip->saddr;
628 src->ip.sin_port = tcp->th_sport;
629 dst->ip.sin_family = AF_INET;
630 dst->ip.sin_addr.s_addr = ip->daddr;
631 dst->ip.sin_port = tcp->th_dport;
632 *ack_seq = tcp->th_ack;
634 if (window != NULL) {
635 *window = tcp->th_win;
638 *rst = tcp->th_flags & TH_RST;
642 } else if (ntohs(eth->ether_type) == ETHERTYPE_IP6) {
644 ip6 = (struct ip6_hdr *)(eth+1);
646 /* we only want TCP */
647 if (ip6->ip6_nxt != IPPROTO_TCP) {
/* the TCP header is taken to follow the fixed IPv6 header directly */
652 tcp = (struct tcphdr *)(ip6+1);
654 /* tell the caller which one we've found */
655 src->ip6.sin6_family = AF_INET6;
656 src->ip6.sin6_port = tcp->th_sport;
657 src->ip6.sin6_addr = ip6->ip6_src;
659 dst->ip6.sin6_family = AF_INET6;
660 dst->ip6.sin6_port = tcp->th_dport;
661 dst->ip6.sin6_addr = ip6->ip6_dst;
663 *ack_seq = tcp->th_ack;
665 if (window != NULL) {
666 *window = tcp->th_win;
669 *rst = tcp->th_flags & TH_RST;
678 #else /* HAVE_AF_PACKET */
/*
 * ctdb_sys_open_capture_socket (pcap variant): open a live pcap capture
 * on iface, store the pcap handle through private_data and return the
 * file descriptor reported by pcap_fileno() for that handle.
 *
 * NOTE(review): the failure check around the DBG_ERR and its return value
 * are not part of this listing.
 */
682 int ctdb_sys_open_capture_socket(const char *iface, void **private_data)
/*
 * Arguments after iface: snapshot length 100, promiscuous mode off,
 * timeout 0, error buffer NULL.
 * NOTE(review): the pcap_open_live() documentation describes errbuf as a
 * buffer of at least PCAP_ERRBUF_SIZE chars that receives error or
 * warning text; passing NULL looks unsafe on failure - confirm and pass a
 * real buffer.
 */
686 pt=pcap_open_live(iface, 100, 0, 0, NULL);
688 DBG_ERR("Failed to open capture device %s\n", iface);
/* hand the pcap handle back to the caller through private_data */
691 *((pcap_t **)private_data) = pt;
693 return pcap_fileno(pt);
/*
 * ctdb_sys_close_capture_socket (pcap variant): cleanup hook for a pcap
 * capture; private_data is interpreted as the pcap_t handle.
 *
 * NOTE(review): the statements that use pt and the return statement are
 * not part of this listing - presumably the handle is closed here;
 * confirm against the full file.
 */
696 int ctdb_sys_close_capture_socket(void *private_data)
698 pcap_t *pt = (pcap_t *)private_data;
/*
 * ctdb_sys_read_tcp_packet (pcap variant): fetch the next captured frame
 * with pcap_next() and, if it is a TCP segment over IPv4 or IPv6, report
 * its endpoints and selected TCP header fields.
 *
 * On a match, src and dst receive the address family, addresses and
 * ports, *ack_seq receives th_ack, *window (when not NULL) receives
 * th_win and *rst receives th_flags masked with TH_RST.  Ports, ack and
 * window are copied from the header as they are, without byte-order
 * conversion.
 *
 * NOTE(review): the remaining parameters, some local declarations, the
 * check of the pcap_next() result, the bodies of the rejection branches
 * and the return statements are not part of this listing, so return
 * values cannot be confirmed from here.
 * NOTE(review): conn is declared but not referenced on any line visible
 * in this listing - possibly unused.
 */
703 int ctdb_sys_read_tcp_packet(int s,
713 struct ether_header *eth;
717 struct ctdb_killtcp_connection *conn;
718 struct pcap_pkthdr pkthdr;
719 const u_char *buffer;
720 pcap_t *pt = (pcap_t *)private_data;
722 buffer=pcap_next(pt, &pkthdr);
731 eth = (struct ether_header *)buffer;
733 /* we want either IPv4 or IPv6 */
734 if (eth->ether_type == htons(ETHERTYPE_IP)) {
736 ip = (struct ip *)(eth+1);
738 /* We only want IPv4 packets */
742 /* Don't look at fragments */
743 if ((ntohs(ip->ip_off)&0x1fff) != 0) {
746 /* we only want TCP */
747 if (ip->ip_p != IPPROTO_TCP) {
751 /* make sure it's not a short packet */
/*
 * NOTE(review): unlike the AF_PACKET variant of this function, this sum
 * does not include sizeof(*eth), and it is compared with pkthdr.len (the
 * length on the wire) rather than pkthdr.caplen (the bytes actually
 * captured) - confirm whether both are intended.
 */
752 if (offsetof(struct tcphdr, th_ack) + 4 +
753 (ip->ip_hl*4) > pkthdr.len) {
/* the TCP header follows the IPv4 header, whose length is ip_hl 32-bit words */
757 tcp = (struct tcphdr *)((ip->ip_hl*4) + (char *)ip);
759 /* tell the caller which one we've found */
760 src->ip.sin_family = AF_INET;
761 src->ip.sin_addr.s_addr = ip->ip_src.s_addr;
762 src->ip.sin_port = tcp->th_sport;
763 dst->ip.sin_family = AF_INET;
764 dst->ip.sin_addr.s_addr = ip->ip_dst.s_addr;
765 dst->ip.sin_port = tcp->th_dport;
766 *ack_seq = tcp->th_ack;
768 if (window != NULL) {
769 *window = tcp->th_win;
772 *rst = tcp->th_flags & TH_RST;
776 } else if (eth->ether_type == htons(ETHERTYPE_IP6)) {
778 ip6 = (struct ip6_hdr *)(eth+1);
780 /* we only want TCP */
781 if (ip6->ip6_nxt != IPPROTO_TCP) {
/* the TCP header is taken to follow the fixed IPv6 header directly */
786 tcp = (struct tcphdr *)(ip6+1);
788 /* tell the caller which one we've found */
789 src->ip6.sin6_family = AF_INET6;
790 src->ip6.sin6_port = tcp->th_sport;
791 src->ip6.sin6_addr = ip6->ip6_src;
793 dst->ip6.sin6_family = AF_INET6;
794 dst->ip6.sin6_port = tcp->th_dport;
795 dst->ip6.sin6_addr = ip6->ip6_dst;
797 *ack_seq = tcp->th_ack;
799 if (window != NULL) {
800 *window = tcp->th_win;
803 *rst = tcp->th_flags & TH_RST;
812 #endif /* HAVE_AF_PACKET */