2 ctdb system specific code to manage raw sockets on linux
4 Copyright (C) Ronnie Sahlberg 2007
5 Copyright (C) Andrew Tridgell 2007
7 This program is free software; you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3 of the License, or
10 (at your option) any later version.
12 This program is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with this program; if not, see <http://www.gnu.org/licenses/>.
22 #include "system/network.h"
23 #include "system/filesys.h"
24 #include "system/wait.h"
25 #include "../include/ctdb_private.h"
26 #include <netinet/if_ether.h>
27 #include <netinet/ip6.h>
28 #include <netinet/icmp6.h>
29 #include <net/if_arp.h>
30 #include <netpacket/packet.h>
33 #define ETHERTYPE_IP6 0x86dd
/*
  calculate the tcp checksum for tcp over ipv6
*/
/*
 * Accumulate a 16-bit checksum for an upper-layer payload carried over IPv6.
 * Within this file it is used for an ICMPv6 neighbour solicitation and for a
 * TCP segment.
 *
 * data - start of the upper-layer bytes to be summed
 * n    - number of bytes at data
 * ip6  - IPv6 header supplying the addresses and the next-header value
 *        that are summed in addition to the payload
 */
39 static uint16_t tcp_checksum6(uint16_t *data, size_t n, struct ip6_hdr *ip6)
/* the two 16-byte IPv6 addresses */
45 sum += uint16_checksum((uint16_t *)(void *)&ip6->ip6_src, 16);
46 sum += uint16_checksum((uint16_t *)(void *)&ip6->ip6_dst, 16);
/*
 * eight further bytes taken from phdr; phdr[1] is the next-header value in
 * network byte order
 * NOTE(review): presumably phdr[0] carries the payload length n - confirm
 */
49 phdr[1] = htonl(ip6->ip6_nxt);
50 sum += uint16_checksum((uint16_t *)phdr, 8);
/* the payload itself */
52 sum += uint16_checksum(data, n);
/*
 * fold the upper 16 bits back into the lower 16; the second pass absorbs
 * a carry left over by the first
 */
54 sum = (sum & 0xFFFF) + (sum >> 16);
55 sum = (sum & 0xFFFF) + (sum >> 16);
/*
  send gratuitous arp reply after we have taken over an ip address

  addr is the address we are trying to claim
  iface is the interface name we will be using to claim the address
 */
/*
 * Announce that this node owns an address by sending raw ethernet frames on
 * interface iface.
 *
 * addr  - the address being claimed; addr->ip.sin_family selects the path
 * iface - name of the interface used for the ifreq lookups and for sending
 *
 * Two frame-building paths are present:
 *  - one copies addr->ip.sin_addr into a 64-byte ARP frame and sends it
 *    twice, first as ARPOP_REQUEST and then as ARPOP_REPLY;
 *  - one copies addr->ip6.sin6_addr into a 78-byte IPv6/ICMPv6 neighbour
 *    solicitation frame.
 * Any other family is logged as "not an ipv4/ipv6 address".
 *
 * NOTE(review): confirm the return convention (presumably 0 on success,
 * -1 on failure) against the callers.
 */
70 int ctdb_sys_send_arp(const ctdb_sock_addr *addr, const char *iface)
73 struct sockaddr_ll sall;
74 struct ether_header *eh;
77 struct icmp6_hdr *icmp6;
78 struct ifreq if_hwaddr;
/* 78 = 14 (ethernet header) + 40 (IPv6 header) + 24 (ICMPv6 payload length set below) */
79 unsigned char buffer[78]; /* ipv6 neigh solicitation size */
/*
 * link-level broadcast address copied into sall.sll_addr
 * NOTE(review): 0xff does not fit in a signed char; unsigned char would
 * avoid the implementation-defined conversion
 */
81 char bdcast[] = {0xff,0xff,0xff,0xff,0xff,0xff};
86 switch (addr->ip.sin_family) {
/* ---- path that uses addr->ip.sin_addr (ARP) ---- */
/* packet socket: the ethernet header built below is part of what is sent */
88 s = socket(PF_PACKET, SOCK_RAW, htons(ETHERTYPE_ARP));
90 DEBUG(DEBUG_CRIT,(__location__ " failed to open raw socket\n"));
94 DEBUG(DEBUG_DEBUG, (__location__ " Created SOCKET FD:%d for sending arp\n", s));
/*
 * look up the interface index, used later as sall.sll_ifindex
 * NOTE(review): strncpy() leaves ifr_name unterminated when iface is at
 * least sizeof(ifr.ifr_name) characters long
 */
95 strncpy(ifr.ifr_name, iface, sizeof(ifr.ifr_name));
96 if (ioctl(s, SIOCGIFINDEX, &ifr) < 0) {
97 DEBUG(DEBUG_CRIT,(__location__ " interface '%s' not found\n", iface));
102 /* get the mac address */
/*
 * NOTE(review): unbounded strcpy() into the fixed-size ifr_name, unlike the
 * bounded strncpy() used for ifr above - an over-long iface overflows it
 */
103 strcpy(if_hwaddr.ifr_name, iface);
104 ret = ioctl(s, SIOCGIFHWADDR, &if_hwaddr);
107 DEBUG(DEBUG_CRIT,(__location__ " ioctl failed\n"));
/* SIOCGIFHWADDR reports the device type (ARPHRD_*) in sa_family */
110 if (ARPHRD_LOOPBACK == if_hwaddr.ifr_hwaddr.sa_family) {
111 DEBUG(DEBUG_DEBUG,("Ignoring loopback arp request\n"));
/*
 * NOTE(review): AF_LOCAL has the same numeric value as ARPHRD_ETHER on
 * Linux, which is presumably why this works as an "is ethernet" test -
 * confirm, and consider comparing with ARPHRD_ETHER
 */
115 if (if_hwaddr.ifr_hwaddr.sa_family != AF_LOCAL) {
118 DEBUG(DEBUG_CRIT,(__location__ " not an ethernet address family (0x%x)\n",
119 if_hwaddr.ifr_hwaddr.sa_family));
/* only the first 64 bytes are cleared; 64 is also the length sent below */
124 memset(buffer, 0 , 64);
/* ethernet header: broadcast destination, our hardware address as source */
125 eh = (struct ether_header *)buffer;
126 memset(eh->ether_dhost, 0xff, ETH_ALEN);
127 memcpy(eh->ether_shost, if_hwaddr.ifr_hwaddr.sa_data, ETH_ALEN);
128 eh->ether_type = htons(ETHERTYPE_ARP);
/* the ARP header sits directly behind the ethernet header */
130 ah = (struct arphdr *)&buffer[sizeof(struct ether_header)];
131 ah->ar_hrd = htons(ARPHRD_ETHER);
132 ah->ar_pro = htons(ETH_P_IP);
133 ah->ar_hln = ETH_ALEN;
136 /* send a gratuitous arp */
137 ah->ar_op = htons(ARPOP_REQUEST);
/*
 * ARP body behind the fixed header: our hardware address, the claimed IP,
 * a zeroed hardware address, the claimed IP again
 * (assumes ptr is advanced past each field between the copies - TODO confirm)
 */
138 ptr = (char *)&ah[1];
139 memcpy(ptr, if_hwaddr.ifr_hwaddr.sa_data, ETH_ALEN);
141 memcpy(ptr, &addr->ip.sin_addr, 4);
143 memset(ptr, 0, ETH_ALEN);
145 memcpy(ptr, &addr->ip.sin_addr, 4);
/* link-level destination: broadcast on the interface looked up above */
148 sall.sll_family = AF_PACKET;
150 memcpy(&sall.sll_addr[0], bdcast, sall.sll_halen);
151 sall.sll_protocol = htons(ETH_P_ALL);
152 sall.sll_ifindex = ifr.ifr_ifindex;
153 ret = sendto(s, buffer, 64, 0, (struct sockaddr *)&sall, sizeof(sall));
156 DEBUG(DEBUG_CRIT,(__location__ " failed sendto\n"));
160 /* send unsolicited arp reply broadcast */
161 ah->ar_op = htons(ARPOP_REPLY);
/*
 * same layout as the request, but the second hardware address is ours
 * instead of zero
 */
162 ptr = (char *)&ah[1];
163 memcpy(ptr, if_hwaddr.ifr_hwaddr.sa_data, ETH_ALEN);
165 memcpy(ptr, &addr->ip.sin_addr, 4);
167 memcpy(ptr, if_hwaddr.ifr_hwaddr.sa_data, ETH_ALEN);
169 memcpy(ptr, &addr->ip.sin_addr, 4);
172 ret = sendto(s, buffer, 64, 0, (struct sockaddr *)&sall, sizeof(sall));
174 DEBUG(DEBUG_CRIT,(__location__ " failed sendto\n"));
/* ---- path that uses addr->ip6.sin6_addr (neighbour solicitation) ---- */
/*
 * same socket and interface setup as the ARP path, including the
 * htons(ETHERTYPE_ARP) protocol and the "for sending arp" log text
 */
182 s = socket(PF_PACKET, SOCK_RAW, htons(ETHERTYPE_ARP));
184 DEBUG(DEBUG_CRIT,(__location__ " failed to open raw socket\n"));
188 DEBUG(DEBUG_DEBUG, (__location__ " Created SOCKET FD:%d for sending arp\n", s));
/* NOTE(review): same strncpy() termination caveat as in the ARP path */
189 strncpy(ifr.ifr_name, iface, sizeof(ifr.ifr_name));
190 if (ioctl(s, SIOCGIFINDEX, &ifr) < 0) {
191 DEBUG(DEBUG_CRIT,(__location__ " interface '%s' not found\n", iface));
196 /* get the mac address */
/* NOTE(review): same unbounded strcpy() as in the ARP path */
197 strcpy(if_hwaddr.ifr_name, iface);
198 ret = ioctl(s, SIOCGIFHWADDR, &if_hwaddr);
201 DEBUG(DEBUG_CRIT,(__location__ " ioctl failed\n"));
204 if (ARPHRD_LOOPBACK == if_hwaddr.ifr_hwaddr.sa_family) {
205 DEBUG(DEBUG_DEBUG,("Ignoring loopback arp request\n"));
209 if (if_hwaddr.ifr_hwaddr.sa_family != AF_LOCAL) {
212 DEBUG(DEBUG_CRIT,(__location__ " not an ethernet address family (0x%x)\n",
213 if_hwaddr.ifr_hwaddr.sa_family));
/* this path clears and sends the whole 78-byte buffer */
217 memset(buffer, 0 , sizeof(buffer));
/* ethernet header: broadcast destination, our hardware address as source */
218 eh = (struct ether_header *)buffer;
219 memset(eh->ether_dhost, 0xff, ETH_ALEN);
220 memcpy(eh->ether_shost, if_hwaddr.ifr_hwaddr.sa_data, ETH_ALEN);
221 eh->ether_type = htons(ETHERTYPE_IP6);
/* the IPv6 header follows the ethernet header */
223 ip6 = (struct ip6_hdr *)(eh+1);
/* 24 = 8-byte ICMPv6 header + 16-byte address copied in below */
225 ip6->ip6_plen = htons(24);
226 ip6->ip6_nxt = IPPROTO_ICMPV6;
/* the IPv6 destination is the claimed address itself */
228 ip6->ip6_dst = addr->ip6.sin6_addr;
/* the ICMPv6 header follows the IPv6 header */
230 icmp6 = (struct icmp6_hdr *)(ip6+1);
231 icmp6->icmp6_type = ND_NEIGHBOR_SOLICIT;
232 icmp6->icmp6_code = 0;
/* the claimed address is written starting at icmp6_data32[1] */
233 memcpy(&icmp6->icmp6_data32[1], &addr->ip6.sin6_addr, 16);
/* checksum over the ip6_plen bytes starting at the ICMPv6 header */
235 icmp6->icmp6_cksum = tcp_checksum6((uint16_t *)icmp6, ntohs(ip6->ip6_plen), ip6);
/* link-level destination: broadcast on the interface looked up above */
237 sall.sll_family = AF_PACKET;
239 memcpy(&sall.sll_addr[0], bdcast, sall.sll_halen);
240 sall.sll_protocol = htons(ETH_P_ALL);
241 sall.sll_ifindex = ifr.ifr_ifindex;
242 ret = sendto(s, buffer, 78, 0, (struct sockaddr *)&sall, sizeof(sall));
245 DEBUG(DEBUG_CRIT,(__location__ " failed sendto\n"));
/* ---- any other address family ---- */
252 DEBUG(DEBUG_CRIT,(__location__ " not an ipv4/ipv6 address (family is %u)\n", addr->ip.sin_family));
/*
  simple TCP checksum - assumes data is a multiple of 2 bytes long
 */
/*
 * Accumulate the checksum for a TCP segment carried over IPv4.
 *
 * data - start of the TCP bytes to be summed
 * n    - number of bytes at data
 * ip   - IPv4 header supplying saddr, daddr and protocol
 */
263 static uint16_t tcp_checksum(uint16_t *data, size_t n, struct iphdr *ip)
/* start with the TCP bytes themselves */
265 uint32_t sum = uint16_checksum(data, n);
/* then the source and destination addresses taken from the IP header */
267 sum += uint16_checksum((uint16_t *)(void *)&ip->saddr,
269 sum += uint16_checksum((uint16_t *)(void *)&ip->daddr,
/*
 * then the protocol number and the length, added as plain integers
 * NOTE(review): presumably uint16_checksum() returns a host-order sum so
 * that these can be added directly - confirm
 */
271 sum += ip->protocol + n;
/*
 * fold the upper 16 bits back into the lower 16; the second pass absorbs
 * a carry left over by the first
 */
272 sum = (sum & 0xFFFF) + (sum >> 16);
273 sum = (sum & 0xFFFF) + (sum >> 16);
/*
  Send a TCP segment from the specified source IP/port to the specified
  destination IP/port.

  This is used to trigger the receiving host into sending its own ACK,
  which should trigger early detection of a TCP reset by the client.

  This can also be used to send RST segments (if rst is true), provided
  that correct seq and ack numbers are supplied.
 */
/*
 * Build a bare TCP segment (IP header + TCP header, no payload) and send it
 * through a raw socket. src->ip.sin_family selects between an IPv4 packet
 * (ip4pkt) and an IPv6 packet (ip6pkt); any other family is logged as
 * "not an ipv4/v6 address".
 *
 * dest - destination address and port; also used as the sendto() address
 * src  - source address and port written into the packet
 * seq  - stored in the TCP seq field without byte-order conversion
 * ack  - stored in the TCP ack_seq field without byte-order conversion
 *        (presumably callers pass both in network byte order - confirm)
 * rst  - NOTE(review): presumably selects an RST segment - confirm
 *
 * NOTE(review): confirm the return convention against the callers.
 */
293 int ctdb_sys_send_tcp(const ctdb_sock_addr *dest,
294 const ctdb_sock_addr *src,
295 uint32_t seq, uint32_t ack, int rst)
301 ctdb_sock_addr *tmpdest;
311 switch (src->ip.sin_family) {
/* ---- IPv4 packet ---- */
314 ip4pkt.ip.version = 4;
/* header length is expressed in 32-bit words */
315 ip4pkt.ip.ihl = sizeof(ip4pkt.ip)/4;
316 ip4pkt.ip.tot_len = htons(sizeof(ip4pkt));
318 ip4pkt.ip.protocol = IPPROTO_TCP;
319 ip4pkt.ip.saddr = src->ip.sin_addr.s_addr;
320 ip4pkt.ip.daddr = dest->ip.sin_addr.s_addr;
/* ports are copied as stored in the sockaddr structures */
323 ip4pkt.tcp.source = src->ip.sin_port;
324 ip4pkt.tcp.dest = dest->ip.sin_port;
325 ip4pkt.tcp.seq = seq;
326 ip4pkt.tcp.ack_seq = ack;
/* data offset is expressed in 32-bit words */
331 ip4pkt.tcp.doff = sizeof(ip4pkt.tcp)/4;
332 /* this makes it easier to spot in a sniffer */
333 ip4pkt.tcp.window = htons(1234);
334 ip4pkt.tcp.check = tcp_checksum((uint16_t *)&ip4pkt.tcp, sizeof(ip4pkt.tcp), &ip4pkt.ip);
336 /* open a raw socket to send this segment from */
/*
 * NOTE(review): the IPv6 path below passes IPPROTO_RAW without htons();
 * the protocol argument of socket() for AF_INET is a plain IPPROTO_*
 * number, so the htons() here looks wrong - confirm
 */
337 s = socket(AF_INET, SOCK_RAW, htons(IPPROTO_RAW));
339 DEBUG(DEBUG_CRIT,(__location__ " failed to open raw socket (%s)\n",
/* the packet already carries its own IP header */
344 ret = setsockopt(s, SOL_IP, IP_HDRINCL, &one, sizeof(one));
346 DEBUG(DEBUG_CRIT,(__location__ " failed to setup IP headers (%s)\n",
353 set_close_on_exec(s);
355 ret = sendto(s, &ip4pkt, sizeof(ip4pkt), 0,
356 (const struct sockaddr *)&dest->ip,
/* anything other than a complete send counts as failure */
359 if (ret != sizeof(ip4pkt)) {
360 DEBUG(DEBUG_CRIT,(__location__ " failed sendto (%s)\n", strerror(errno)));
/* ---- IPv6 packet ---- */
/* version 6 in the upper four bits */
366 ip6pkt.ip6.ip6_vfc = 0x60;
/* payload length: the 20-byte TCP header only */
367 ip6pkt.ip6.ip6_plen = htons(20);
368 ip6pkt.ip6.ip6_nxt = IPPROTO_TCP;
369 ip6pkt.ip6.ip6_hlim = 64;
370 ip6pkt.ip6.ip6_src = src->ip6.sin6_addr;
371 ip6pkt.ip6.ip6_dst = dest->ip6.sin6_addr;
373 ip6pkt.tcp.source = src->ip6.sin6_port;
374 ip6pkt.tcp.dest = dest->ip6.sin6_port;
375 ip6pkt.tcp.seq = seq;
376 ip6pkt.tcp.ack_seq = ack;
381 ip6pkt.tcp.doff = sizeof(ip6pkt.tcp)/4;
382 /* this makes it easier to spot in a sniffer */
383 ip6pkt.tcp.window = htons(1234);
384 ip6pkt.tcp.check = tcp_checksum6((uint16_t *)&ip6pkt.tcp, sizeof(ip6pkt.tcp), &ip6pkt.ip6);
386 s = socket(PF_INET6, SOCK_RAW, IPPROTO_RAW);
388 DEBUG(DEBUG_CRIT, (__location__ " Failed to open sending socket\n"));
/*
 * NOTE(review): for the duration of sendto() the destination port is set
 * to zero through a const-discarding alias of dest and restored
 * afterwards, so the caller's dest object is briefly modified
 */
392 /* sendto() dont like if the port is set and the socket is
395 tmpdest = discard_const(dest);
396 tmpport = tmpdest->ip6.sin6_port;
398 tmpdest->ip6.sin6_port = 0;
399 ret = sendto(s, &ip6pkt, sizeof(ip6pkt), 0,
400 (const struct sockaddr *)&dest->ip6,
402 tmpdest->ip6.sin6_port = tmpport;
405 if (ret != sizeof(ip6pkt)) {
406 DEBUG(DEBUG_CRIT,(__location__ " failed sendto (%s)\n", strerror(errno)));
412 DEBUG(DEBUG_CRIT,(__location__ " not an ipv4/v6 address\n"));
/*
  This function is used to open a raw socket to capture packets from
 */
/*
 * Open an AF_PACKET raw socket that receives frames of every protocol
 * (ETH_P_ALL) and mark it close-on-exec.
 *
 * iface        - NOTE(review): appears unused in this implementation - confirm
 * private_data - NOTE(review): appears unused in this implementation - confirm
 *
 * NOTE(review): presumably returns the socket descriptor, or -1 when
 * socket() fails - confirm.
 */
422 int ctdb_sys_open_capture_socket(const char *iface, void **private_data)
426 /* Open a socket to capture all traffic */
427 s = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
429 DEBUG(DEBUG_CRIT,(__location__ " failed to open raw socket\n"));
433 DEBUG(DEBUG_DEBUG, (__location__ " Created RAW SOCKET FD:%d for tcp tickle\n", s));
/* keep the descriptor from leaking into exec'ed children */
436 set_close_on_exec(s);
/*
  This function is used to do any additional cleanup required when closing
  the capture socket.
  Note that the socket itself is closed automatically in the caller.
 */
/*
 * Cleanup hook for a capture socket.
 *
 * private_data - opaque pointer; NOTE(review): presumably the value handed
 *                out through ctdb_sys_open_capture_socket()'s private_data
 *                argument - confirm
 */
446 int ctdb_sys_close_capture_socket(void *private_data)
/*
  called when the raw socket becomes readable
 */
/*
 * Read one frame from the capture socket s and, when it carries a TCP
 * segment over IPv4 or IPv6, report the segment's endpoints.
 *
 * s            - raw capture socket to recv() from
 * private_data - NOTE(review): appears unused in this implementation - confirm
 * src, dst     - filled with family, address and port of the segment's
 *                source and destination; ports are copied from the TCP
 *                header without byte-order conversion
 * ack_seq      - receives tcp->ack_seq, also without conversion
 * seq          - NOTE(review): presumably receives tcp->seq - confirm
 *
 * NOTE(review): confirm the return convention against the callers.
 */
455 int ctdb_sys_read_tcp_packet(int s, void *private_data,
456 ctdb_sock_addr *src, ctdb_sock_addr *dst,
457 uint32_t *ack_seq, uint32_t *seq)
/* only the first RCVPKTSIZE bytes of each frame are kept */
460 #define RCVPKTSIZE 100
461 char pkt[RCVPKTSIZE];
462 struct ether_header *eth;
/*
 * with MSG_TRUNC a packet socket reports the real frame length, which can
 * be larger than RCVPKTSIZE
 */
467 ret = recv(s, pkt, RCVPKTSIZE, MSG_TRUNC);
/*
 * NOTE(review): the right-hand side is a size_t, so if ret is a signed int
 * a negative recv() result is converted to a huge unsigned value and is
 * not caught by this test - confirm and check for ret < 0 first
 */
468 if (ret < sizeof(*eth)+sizeof(*ip)) {
473 eth = (struct ether_header *)pkt;
475 /* we want either IPv4 or IPv6 */
476 if (ntohs(eth->ether_type) == ETHERTYPE_IP) {
/* the IPv4 header follows the ethernet header */
478 ip = (struct iphdr *)(eth+1);
480 /* We only want IPv4 packets */
481 if (ip->version != 4) {
484 /* Dont look at fragments */
/* the low 13 bits of frag_off are the fragment offset */
485 if ((ntohs(ip->frag_off)&0x1fff) != 0) {
488 /* we only want TCP */
489 if (ip->protocol != IPPROTO_TCP) {
493 /* make sure its not a short packet */
/*
 * the frame must reach at least to the end of the 4-byte ack_seq field
 * NOTE(review): ret may exceed RCVPKTSIZE because of MSG_TRUNC, so this
 * compares against the frame length, not the number of bytes held in pkt
 */
494 if (offsetof(struct tcphdr, ack_seq) + 4 +
495 (ip->ihl*4) + sizeof(*eth) > ret) {
/* ihl counts 32-bit words, so the TCP header starts ihl*4 bytes into the IP header */
499 tcp = (struct tcphdr *)((ip->ihl*4) + (char *)ip);
501 /* tell the caller which one we've found */
502 src->ip.sin_family = AF_INET;
503 src->ip.sin_addr.s_addr = ip->saddr;
504 src->ip.sin_port = tcp->source;
505 dst->ip.sin_family = AF_INET;
506 dst->ip.sin_addr.s_addr = ip->daddr;
507 dst->ip.sin_port = tcp->dest;
508 *ack_seq = tcp->ack_seq;
512 } else if (ntohs(eth->ether_type) == ETHERTYPE_IP6) {
/* the IPv6 header follows the ethernet header */
514 ip6 = (struct ip6_hdr *)(eth+1);
516 /* we only want TCP */
/* only a TCP header directly behind the fixed IPv6 header is accepted */
517 if (ip6->ip6_nxt != IPPROTO_TCP) {
/*
 * NOTE(review): no short-packet test comparable to the IPv4 one appears
 * on this path - confirm
 */
522 tcp = (struct tcphdr *)(ip6+1);
524 /* tell the caller which one we've found */
525 src->ip6.sin6_family = AF_INET6;
526 src->ip6.sin6_port = tcp->source;
527 src->ip6.sin6_addr = ip6->ip6_src;
529 dst->ip6.sin6_family = AF_INET6;
530 dst->ip6.sin6_port = tcp->dest;
531 dst->ip6.sin6_addr = ip6->ip6_dst;
533 *ack_seq = tcp->ack_seq;
/*
 * Check whether a network interface exists by asking for its index
 * (SIOCGIFINDEX) on a packet socket.
 *
 * iface - interface name to look up
 *
 * Only an ioctl() failure with errno ENODEV is reported as
 * "interface not found".
 */
543 bool ctdb_sys_check_iface_exists(const char *iface)
548 s = socket(PF_PACKET, SOCK_RAW, 0);
550 /* We dont know if the interface exists, so assume yes */
551 DEBUG(DEBUG_CRIT,(__location__ " failed to open raw socket\n"));
/*
 * NOTE(review): strncpy() leaves ifr_name unterminated when iface is at
 * least sizeof(ifr.ifr_name) characters long
 */
555 strncpy(ifr.ifr_name, iface, sizeof(ifr.ifr_name));
556 if (ioctl(s, SIOCGIFINDEX, &ifr) < 0 && errno == ENODEV) {
557 DEBUG(DEBUG_CRIT,(__location__ " interface '%s' not found\n", iface));
/*
 * Query the peer credentials (SO_PEERCRED) of socket fd.
 *
 * fd       - socket to query
 * peer_pid - NOTE(review): presumably receives the pid from the returned
 *            credentials on success - confirm
 */
566 int ctdb_get_peer_pid(const int fd, pid_t *peer_pid)
569 socklen_t crl = sizeof(struct ucred);
/*
 * NOTE(review): == binds tighter than =, so ret is assigned the result of
 * the comparison (1 when getsockopt() succeeded, 0 otherwise) instead of
 * getsockopt()'s return value; if ret is what gets returned, callers see
 * 1 on success and 0 on failure - confirm and move the parenthesis
 */
571 if ((ret = getsockopt(fd, SOL_SOCKET, SO_PEERCRED, &cr, &crl) == 0)) {
/*
 * Find the process name from process ID
 */
/*
 * Look up the executable of process pid by reading the symbolic link
 * /proc/<pid>/exe.
 *
 * pid - process ID to look up
 *
 * NOTE(review): ownership of the returned string is not documented here -
 * confirm whether the caller has to free it.
 */
580 char *ctdb_get_process_name(pid_t pid)
587 snprintf(path, sizeof(path), "/proc/%d/exe", pid);
/*
 * NOTE(review): readlink() does not append a terminating NUL and may
 * return the full sizeof(buf), leaving no room to add one - confirm that
 * the result is terminated inside the buffer before strtok() is used
 */
588 n = readlink(path, buf, sizeof(buf));
593 /* Remove any extra fields */
/*
 * keeps only the text before the first space
 * NOTE(review): a path that itself contains a space is cut short as well,
 * and strtok() keeps static state, so it is not reentrant
 */
595 ptr = strtok(buf, " ");
/*
 * Parse a line from /proc/locks.
 */
/*
 * Split one line of /proc/locks into space-separated tokens and record the
 * interesting ones in curlock. The line buffer is modified by strtok_r().
 *
 * line    - writable buffer holding one line of /proc/locks
 * pid     - NOTE(review): presumably receives the PID field of the line -
 *           confirm
 * curlock - receives waiting, read_only, inode, start and end
 *
 * Tokens are consumed in this order:
 *   1. a leading token that is skipped ("1:" in the sample lines below)
 *   2. an optional "->", which sets curlock->waiting
 *   3. the lock class, which must be "POSIX"
 *   4. a token that is skipped ("ADVISORY" in the sample lines)
 *   5. "READ" or "WRITE", which sets curlock->read_only
 *   6. one more token (the PID in the sample lines)
 *   7. MAJOR:MINOR:INODE, of which the inode is kept
 *   8. the start offset
 *   9. the end offset, where "EOF" is stored as (off_t)-1
 * A missing token makes the function return false.
 */
603 static bool parse_proc_locks_line(char *line, pid_t *pid,
604 struct ctdb_lock_info *curlock)
608 /* output of /proc/locks
611 * 1: POSIX ADVISORY WRITE 25945 fd:00:6424820 212 212
614 * 1: -> POSIX ADVISORY WRITE 25946 fd:00:6424820 212 212
618 ptr = strtok_r(line, " ", &saveptr);
619 if (ptr == NULL) return false;
622 ptr = strtok_r(NULL, " ", &saveptr);
623 if (ptr == NULL) return false;
624 if (strcmp(ptr, "->") == 0) {
625 curlock->waiting = true;
626 ptr = strtok_r(NULL, " ", &saveptr);
628 curlock->waiting = false;
632 if (ptr == NULL || strcmp(ptr, "POSIX") != 0) {
637 ptr = strtok_r(NULL, " ", &saveptr);
638 if (ptr == NULL) return false;
641 ptr = strtok_r(NULL, " ", &saveptr);
642 if (ptr == NULL) return false;
643 if (strcmp(ptr, "READ") == 0) {
644 curlock->read_only = true;
645 } else if (strcmp(ptr, "WRITE") == 0) {
646 curlock->read_only = false;
652 ptr = strtok_r(NULL, " ", &saveptr);
653 if (ptr == NULL) return false;
656 /* MAJOR:MINOR:INODE */
/* ':' joins the delimiters here so the three parts come out one by one */
657 ptr = strtok_r(NULL, " :", &saveptr);
658 if (ptr == NULL) return false;
659 ptr = strtok_r(NULL, " :", &saveptr);
660 if (ptr == NULL) return false;
661 ptr = strtok_r(NULL, " :", &saveptr);
662 if (ptr == NULL) return false;
/* NOTE(review): atol() does no error checking; malformed text yields 0 */
663 curlock->inode = atol(ptr);
/* start offset of the locked range */
666 ptr = strtok_r(NULL, " ", &saveptr);
667 if (ptr == NULL) return false;
668 curlock->start = atol(ptr);
/* end offset of the locked range; "EOF" is represented as (off_t)-1 */
671 ptr = strtok_r(NULL, " ", &saveptr);
672 if (ptr == NULL) return false;
673 if (strncmp(ptr, "EOF", 3) == 0) {
674 curlock->end = (off_t)-1;
676 curlock->end = atol(ptr);
/*
 * Find information about the lock being waited on by the given process ID
 */
/*
 * Scan /proc/locks for a lock that process req_pid is waiting on.
 *
 * req_pid   - process ID to look for
 * lock_info - receives a copy of the matching entry
 *
 * Each line is handed to parse_proc_locks_line(); an entry matches when its
 * pid equals req_pid and it is marked as waiting.
 */
685 bool ctdb_get_lock_info(pid_t req_pid, struct ctdb_lock_info *lock_info)
688 struct ctdb_lock_info curlock;
/*
 * NOTE(review): unlike the other DEBUG messages in this file, the text
 * below has no trailing newline
 */
694 if ((fp = fopen("/proc/locks", "r")) == NULL) {
695 DEBUG(DEBUG_ERR, ("Failed to read locks information"));
/* one /proc/locks entry per fgets() call */
698 while ((ptr = fgets(buf, sizeof(buf), fp)) != NULL) {
699 if (! parse_proc_locks_line(buf, &pid, &curlock)) {
/* a waiting entry that belongs to the requested process */
702 if (pid == req_pid && curlock.waiting) {
703 *lock_info = curlock;
/*
 * Find the process ID which holds an overlapping byte lock for the required
 * inode and byte range.
 */
717 bool ctdb_get_blocker_pid(struct ctdb_lock_info *reqlock, pid_t *blocker_pid)
720 struct ctdb_lock_info curlock;
726 if ((fp = fopen("/proc/locks", "r")) == NULL) {
727 DEBUG(DEBUG_ERR, ("Failed to read locks information"));
730 while ((ptr = fgets(buf, sizeof(buf), fp)) != NULL) {
731 if (! parse_proc_locks_line(buf, &pid, &curlock)) {
735 if (curlock.waiting) {
739 if (curlock.inode != reqlock->inode) {
743 if (curlock.start > reqlock->end ||
744 curlock.end < reqlock->start) {
745 /* Outside the required range */