/*
   ctdb system specific code to manage raw sockets on linux

   Copyright (C) Ronnie Sahlberg  2007
   Copyright (C) Andrew Tridgell  2007

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, see <http://www.gnu.org/licenses/>.
*/

#include "system/network.h"
#include "system/filesys.h"
#include "system/wait.h"

#include "lib/util/debug.h"

#include "protocol/protocol.h"

#include <netinet/if_ether.h>
#include <netinet/ip6.h>
#include <netinet/icmp6.h>
#include <net/if_arp.h>
#include <netpacket/packet.h>
#include <sys/prctl.h>

#include "common/logging.h"
#include "common/system.h"

/* Ethernet frame type used for IPv6 payloads; only defined here when
   no included header has already provided it */
#ifndef ETHERTYPE_IP6
#define ETHERTYPE_IP6 0x86dd
#endif

/*
  calculate the tcp checksum for tcp over ipv6

  data points at the n bytes to be checksummed (this file passes a TCP
  header or an ICMPv6 neighbor advertisement) and ip6 is the IPv6
  header that supplies the pseudo-header fields.

  Returns the complemented 16-bit sum; a result of zero is returned
  as 0xFFFF instead.
*/
static uint16_t tcp_checksum6(uint16_t *data, size_t n, struct ip6_hdr *ip6)
{
	uint32_t phdr[2];
	uint32_t sum = 0;
	uint16_t sum2;

	/* pseudo-header part 1: the 16 byte source and destination
	   addresses */
	sum += uint16_checksum((uint16_t *)(void *)&ip6->ip6_src, 16);
	sum += uint16_checksum((uint16_t *)(void *)&ip6->ip6_dst, 16);

	/* pseudo-header part 2: payload length and next-header value,
	   each as a 32 bit big-endian word */
	phdr[0] = htonl(n);
	phdr[1] = htonl(ip6->ip6_nxt);
	sum += uint16_checksum((uint16_t *)phdr, 8);

	sum += uint16_checksum(data, n);

	/* fold the carries back into the low 16 bits; the second fold
	   absorbs a carry produced by the first */
	sum = (sum & 0xFFFF) + (sum >> 16);
	sum = (sum & 0xFFFF) + (sum >> 16);
	sum2 = htons(sum);
	sum2 = ~sum2;
	if (sum2 == 0) {
		return 0xFFFF;
	}
	return sum2;
}

73 send gratuitous arp reply after we have taken over an ip address
75 saddr is the address we are trying to claim
76 iface is the interface name we will be using to claim the address
78 int ctdb_sys_send_arp(const ctdb_sock_addr *addr, const char *iface)
81 struct sockaddr_ll sall;
82 struct ether_header *eh;
85 struct nd_neighbor_advert *nd_na;
86 struct nd_opt_hdr *nd_oh;
87 struct ifreq if_hwaddr;
88 /* Size of IPv6 neighbor advertisement (with option) */
89 unsigned char buffer[sizeof(struct ether_header) +
90 sizeof(struct ip6_hdr) +
91 sizeof(struct nd_neighbor_advert) +
92 sizeof(struct nd_opt_hdr) + ETH_ALEN];
94 char bdcast[] = {0xff,0xff,0xff,0xff,0xff,0xff};
99 ZERO_STRUCT(if_hwaddr);
101 switch (addr->ip.sin_family) {
103 s = socket(PF_PACKET, SOCK_RAW, htons(ETHERTYPE_ARP));
105 DEBUG(DEBUG_CRIT,(__location__ " failed to open raw socket\n"));
109 DEBUG(DEBUG_DEBUG, (__location__ " Created SOCKET FD:%d for sending arp\n", s));
110 strlcpy(ifr.ifr_name, iface, sizeof(ifr.ifr_name));
111 if (ioctl(s, SIOCGIFINDEX, &ifr) < 0) {
112 DEBUG(DEBUG_CRIT,(__location__ " interface '%s' not found\n", iface));
117 /* get the mac address */
118 strncpy(if_hwaddr.ifr_name, iface, sizeof(if_hwaddr.ifr_name)-1);
119 ret = ioctl(s, SIOCGIFHWADDR, &if_hwaddr);
122 DEBUG(DEBUG_CRIT,(__location__ " ioctl failed\n"));
125 if (ARPHRD_LOOPBACK == if_hwaddr.ifr_hwaddr.sa_family) {
126 DEBUG(DEBUG_DEBUG,("Ignoring loopback arp request\n"));
130 if (if_hwaddr.ifr_hwaddr.sa_family != AF_LOCAL) {
133 DEBUG(DEBUG_CRIT,(__location__ " not an ethernet address family (0x%x)\n",
134 if_hwaddr.ifr_hwaddr.sa_family));
139 memset(buffer, 0 , 64);
140 eh = (struct ether_header *)buffer;
141 memset(eh->ether_dhost, 0xff, ETH_ALEN);
142 memcpy(eh->ether_shost, if_hwaddr.ifr_hwaddr.sa_data, ETH_ALEN);
143 eh->ether_type = htons(ETHERTYPE_ARP);
145 ah = (struct arphdr *)&buffer[sizeof(struct ether_header)];
146 ah->ar_hrd = htons(ARPHRD_ETHER);
147 ah->ar_pro = htons(ETH_P_IP);
148 ah->ar_hln = ETH_ALEN;
151 /* send a gratious arp */
152 ah->ar_op = htons(ARPOP_REQUEST);
153 ptr = (char *)&ah[1];
154 memcpy(ptr, if_hwaddr.ifr_hwaddr.sa_data, ETH_ALEN);
156 memcpy(ptr, &addr->ip.sin_addr, 4);
158 memset(ptr, 0, ETH_ALEN);
160 memcpy(ptr, &addr->ip.sin_addr, 4);
163 sall.sll_family = AF_PACKET;
165 memcpy(&sall.sll_addr[0], bdcast, sall.sll_halen);
166 sall.sll_protocol = htons(ETH_P_ALL);
167 sall.sll_ifindex = ifr.ifr_ifindex;
168 ret = sendto(s, buffer, 64, 0, (struct sockaddr *)&sall, sizeof(sall));
171 DEBUG(DEBUG_CRIT,(__location__ " failed sendto\n"));
175 /* send unsolicited arp reply broadcast */
176 ah->ar_op = htons(ARPOP_REPLY);
177 ptr = (char *)&ah[1];
178 memcpy(ptr, if_hwaddr.ifr_hwaddr.sa_data, ETH_ALEN);
180 memcpy(ptr, &addr->ip.sin_addr, 4);
182 memcpy(ptr, if_hwaddr.ifr_hwaddr.sa_data, ETH_ALEN);
184 memcpy(ptr, &addr->ip.sin_addr, 4);
187 ret = sendto(s, buffer, 64, 0, (struct sockaddr *)&sall, sizeof(sall));
189 DEBUG(DEBUG_CRIT,(__location__ " failed sendto\n"));
197 s = socket(PF_PACKET, SOCK_RAW, htons(ETHERTYPE_ARP));
199 DEBUG(DEBUG_CRIT,(__location__ " failed to open raw socket\n"));
203 DEBUG(DEBUG_DEBUG, (__location__ " Created SOCKET FD:%d for sending arp\n", s));
204 strncpy(ifr.ifr_name, iface, sizeof(ifr.ifr_name));
205 if (ioctl(s, SIOCGIFINDEX, &ifr) < 0) {
206 DEBUG(DEBUG_CRIT,(__location__ " interface '%s' not found\n", iface));
211 /* get the mac address */
212 strncpy(if_hwaddr.ifr_name, iface, sizeof(if_hwaddr.ifr_name)-1);
213 ret = ioctl(s, SIOCGIFHWADDR, &if_hwaddr);
216 DEBUG(DEBUG_CRIT,(__location__ " ioctl failed\n"));
219 if (ARPHRD_LOOPBACK == if_hwaddr.ifr_hwaddr.sa_family) {
220 DEBUG(DEBUG_DEBUG,("Ignoring loopback arp request\n"));
224 if (if_hwaddr.ifr_hwaddr.sa_family != AF_LOCAL) {
227 DEBUG(DEBUG_CRIT,(__location__ " not an ethernet address family (0x%x)\n",
228 if_hwaddr.ifr_hwaddr.sa_family));
232 memset(buffer, 0 , sizeof(buffer));
233 eh = (struct ether_header *)buffer;
234 /* Ethernet multicast: 33:33:00:00:00:01 (see RFC2464,
235 * section 7) - note zeroes above! */
236 eh->ether_dhost[0] = eh->ether_dhost[1] = 0x33;
237 eh->ether_dhost[5] = 0x01;
238 memcpy(eh->ether_shost, if_hwaddr.ifr_hwaddr.sa_data, ETH_ALEN);
239 eh->ether_type = htons(ETHERTYPE_IP6);
241 ip6 = (struct ip6_hdr *)(eh+1);
243 ip6->ip6_plen = htons(sizeof(*nd_na) +
244 sizeof(struct nd_opt_hdr) +
246 ip6->ip6_nxt = IPPROTO_ICMPV6;
248 ip6->ip6_src = addr->ip6.sin6_addr;
249 /* all-nodes multicast */
251 ret = inet_pton(AF_INET6, "ff02::1", &ip6->ip6_dst);
254 DEBUG(DEBUG_CRIT,(__location__ " failed inet_pton\n"));
258 nd_na = (struct nd_neighbor_advert *)(ip6+1);
259 nd_na->nd_na_type = ND_NEIGHBOR_ADVERT;
260 nd_na->nd_na_code = 0;
261 nd_na->nd_na_flags_reserved = ND_NA_FLAG_OVERRIDE;
262 nd_na->nd_na_target = addr->ip6.sin6_addr;
263 /* Option: Target link-layer address */
264 nd_oh = (struct nd_opt_hdr *)(nd_na+1);
265 nd_oh->nd_opt_type = ND_OPT_TARGET_LINKADDR;
266 nd_oh->nd_opt_len = 1;
267 memcpy(&(nd_oh+1)[0], if_hwaddr.ifr_hwaddr.sa_data, ETH_ALEN);
269 nd_na->nd_na_cksum = tcp_checksum6((uint16_t *)nd_na,
270 ntohs(ip6->ip6_plen), ip6);
272 sall.sll_family = AF_PACKET;
274 memcpy(&sall.sll_addr[0], &eh->ether_dhost[0], sall.sll_halen);
275 sall.sll_protocol = htons(ETH_P_ALL);
276 sall.sll_ifindex = ifr.ifr_ifindex;
277 ret = sendto(s, buffer, sizeof(buffer),
278 0, (struct sockaddr *)&sall, sizeof(sall));
281 DEBUG(DEBUG_CRIT,(__location__ " failed sendto\n"));
288 DEBUG(DEBUG_CRIT,(__location__ " not an ipv4/ipv6 address (family is %u)\n", addr->ip.sin_family));
/*
  simple TCP checksum - assumes data is multiple of 2 bytes long

  data points at the n bytes of the TCP segment and ip is the IPv4
  header supplying the source address, destination address and
  protocol that are added to the sum along with the length n.

  Returns the complemented 16-bit sum; a result of zero is returned
  as 0xFFFF instead.
 */
static uint16_t tcp_checksum(uint16_t *data, size_t n, struct iphdr *ip)
{
	uint32_t sum = uint16_checksum(data, n);
	uint16_t sum2;

	sum += uint16_checksum((uint16_t *)(void *)&ip->saddr,
			       sizeof(ip->saddr));
	sum += uint16_checksum((uint16_t *)(void *)&ip->daddr,
			       sizeof(ip->daddr));
	sum += ip->protocol + n;
	/* fold the carries back into the low 16 bits; the second fold
	   absorbs a carry produced by the first */
	sum = (sum & 0xFFFF) + (sum >> 16);
	sum = (sum & 0xFFFF) + (sum >> 16);
	sum2 = htons(sum);
	sum2 = ~sum2;
	if (sum2 == 0) {
		return 0xFFFF;
	}
	return sum2;
}

319 Send tcp segment from the specified IP/port to the specified
322 This is used to trigger the receiving host into sending its own ACK,
323 which should trigger early detection of TCP reset by the client
326 This can also be used to send RST segments (if rst is true) and also
327 if correct seq and ack numbers are provided.
329 int ctdb_sys_send_tcp(const ctdb_sock_addr *dest,
330 const ctdb_sock_addr *src,
331 uint32_t seq, uint32_t ack, int rst)
337 ctdb_sock_addr *tmpdest;
347 switch (src->ip.sin_family) {
350 ip4pkt.ip.version = 4;
351 ip4pkt.ip.ihl = sizeof(ip4pkt.ip)/4;
352 ip4pkt.ip.tot_len = htons(sizeof(ip4pkt));
354 ip4pkt.ip.protocol = IPPROTO_TCP;
355 ip4pkt.ip.saddr = src->ip.sin_addr.s_addr;
356 ip4pkt.ip.daddr = dest->ip.sin_addr.s_addr;
359 ip4pkt.tcp.source = src->ip.sin_port;
360 ip4pkt.tcp.dest = dest->ip.sin_port;
361 ip4pkt.tcp.seq = seq;
362 ip4pkt.tcp.ack_seq = ack;
367 ip4pkt.tcp.doff = sizeof(ip4pkt.tcp)/4;
368 /* this makes it easier to spot in a sniffer */
369 ip4pkt.tcp.window = htons(1234);
370 ip4pkt.tcp.check = tcp_checksum((uint16_t *)&ip4pkt.tcp, sizeof(ip4pkt.tcp), &ip4pkt.ip);
372 /* open a raw socket to send this segment from */
373 s = socket(AF_INET, SOCK_RAW, htons(IPPROTO_RAW));
375 DEBUG(DEBUG_CRIT,(__location__ " failed to open raw socket (%s)\n",
380 ret = setsockopt(s, SOL_IP, IP_HDRINCL, &one, sizeof(one));
382 DEBUG(DEBUG_CRIT,(__location__ " failed to setup IP headers (%s)\n",
389 set_close_on_exec(s);
391 ret = sendto(s, &ip4pkt, sizeof(ip4pkt), 0,
392 (const struct sockaddr *)&dest->ip,
395 if (ret != sizeof(ip4pkt)) {
396 DEBUG(DEBUG_CRIT,(__location__ " failed sendto (%s)\n", strerror(errno)));
402 ip6pkt.ip6.ip6_vfc = 0x60;
403 ip6pkt.ip6.ip6_plen = htons(20);
404 ip6pkt.ip6.ip6_nxt = IPPROTO_TCP;
405 ip6pkt.ip6.ip6_hlim = 64;
406 ip6pkt.ip6.ip6_src = src->ip6.sin6_addr;
407 ip6pkt.ip6.ip6_dst = dest->ip6.sin6_addr;
409 ip6pkt.tcp.source = src->ip6.sin6_port;
410 ip6pkt.tcp.dest = dest->ip6.sin6_port;
411 ip6pkt.tcp.seq = seq;
412 ip6pkt.tcp.ack_seq = ack;
417 ip6pkt.tcp.doff = sizeof(ip6pkt.tcp)/4;
418 /* this makes it easier to spot in a sniffer */
419 ip6pkt.tcp.window = htons(1234);
420 ip6pkt.tcp.check = tcp_checksum6((uint16_t *)&ip6pkt.tcp, sizeof(ip6pkt.tcp), &ip6pkt.ip6);
422 s = socket(PF_INET6, SOCK_RAW, IPPROTO_RAW);
424 DEBUG(DEBUG_CRIT, (__location__ " Failed to open sending socket\n"));
428 /* sendto() don't like if the port is set and the socket is
431 tmpdest = discard_const(dest);
432 tmpport = tmpdest->ip6.sin6_port;
434 tmpdest->ip6.sin6_port = 0;
435 ret = sendto(s, &ip6pkt, sizeof(ip6pkt), 0,
436 (const struct sockaddr *)&dest->ip6,
438 tmpdest->ip6.sin6_port = tmpport;
441 if (ret != sizeof(ip6pkt)) {
442 DEBUG(DEBUG_CRIT,(__location__ " failed sendto (%s)\n", strerror(errno)));
448 DEBUG(DEBUG_CRIT,(__location__ " not an ipv4/v6 address\n"));
456 This function is used to open a raw socket to capture from
458 int ctdb_sys_open_capture_socket(const char *iface, void **private_data)
462 /* Open a socket to capture all traffic */
463 s = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
465 DEBUG(DEBUG_CRIT,(__location__ " failed to open raw socket\n"));
469 DEBUG(DEBUG_DEBUG, (__location__ " Created RAW SOCKET FD:%d for tcp tickle\n", s));
472 set_close_on_exec(s);
/*
  This function is used to do any additional cleanup required when closing
  a capture socket.
  Note that the socket itself is closed automatically in the caller.

  Nothing needs cleaning up in this implementation, so private_data is
  ignored and 0 is always returned.
 */
int ctdb_sys_close_capture_socket(void *private_data)
{
	return 0;
}

489 called when the raw socket becomes readable
491 int ctdb_sys_read_tcp_packet(int s, void *private_data,
492 ctdb_sock_addr *src, ctdb_sock_addr *dst,
493 uint32_t *ack_seq, uint32_t *seq)
496 #define RCVPKTSIZE 100
497 char pkt[RCVPKTSIZE];
498 struct ether_header *eth;
503 ret = recv(s, pkt, RCVPKTSIZE, MSG_TRUNC);
504 if (ret < sizeof(*eth)+sizeof(*ip)) {
509 eth = (struct ether_header *)pkt;
511 /* we want either IPv4 or IPv6 */
512 if (ntohs(eth->ether_type) == ETHERTYPE_IP) {
514 ip = (struct iphdr *)(eth+1);
516 /* We only want IPv4 packets */
517 if (ip->version != 4) {
520 /* Dont look at fragments */
521 if ((ntohs(ip->frag_off)&0x1fff) != 0) {
524 /* we only want TCP */
525 if (ip->protocol != IPPROTO_TCP) {
529 /* make sure its not a short packet */
530 if (offsetof(struct tcphdr, ack_seq) + 4 +
531 (ip->ihl*4) + sizeof(*eth) > ret) {
535 tcp = (struct tcphdr *)((ip->ihl*4) + (char *)ip);
537 /* tell the caller which one we've found */
538 src->ip.sin_family = AF_INET;
539 src->ip.sin_addr.s_addr = ip->saddr;
540 src->ip.sin_port = tcp->source;
541 dst->ip.sin_family = AF_INET;
542 dst->ip.sin_addr.s_addr = ip->daddr;
543 dst->ip.sin_port = tcp->dest;
544 *ack_seq = tcp->ack_seq;
548 } else if (ntohs(eth->ether_type) == ETHERTYPE_IP6) {
550 ip6 = (struct ip6_hdr *)(eth+1);
552 /* we only want TCP */
553 if (ip6->ip6_nxt != IPPROTO_TCP) {
558 tcp = (struct tcphdr *)(ip6+1);
560 /* tell the caller which one we've found */
561 src->ip6.sin6_family = AF_INET6;
562 src->ip6.sin6_port = tcp->source;
563 src->ip6.sin6_addr = ip6->ip6_src;
565 dst->ip6.sin6_family = AF_INET6;
566 dst->ip6.sin6_port = tcp->dest;
567 dst->ip6.sin6_addr = ip6->ip6_dst;
569 *ack_seq = tcp->ack_seq;
579 bool ctdb_sys_check_iface_exists(const char *iface)
584 s = socket(PF_PACKET, SOCK_RAW, 0);
586 /* We don't know if the interface exists, so assume yes */
587 DEBUG(DEBUG_CRIT,(__location__ " failed to open raw socket\n"));
591 strncpy(ifr.ifr_name, iface, sizeof(ifr.ifr_name)-1);
592 if (ioctl(s, SIOCGIFINDEX, &ifr) < 0 && errno == ENODEV) {
593 DEBUG(DEBUG_CRIT,(__location__ " interface '%s' not found\n", iface));
602 int ctdb_get_peer_pid(const int fd, pid_t *peer_pid)
605 socklen_t crl = sizeof(struct ucred);
607 if ((ret = getsockopt(fd, SOL_SOCKET, SO_PEERCRED, &cr, &crl) == 0)) {