2 ctdb system specific code to manage raw sockets on linux
4 Copyright (C) Ronnie Sahlberg 2007
5 Copyright (C) Andrew Tridgell 2007
7 This program is free software; you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3 of the License, or
10 (at your option) any later version.
12 This program is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with this program; if not, see <http://www.gnu.org/licenses/>.
22 #include "system/network.h"
23 #include "system/filesys.h"
24 #include "system/wait.h"
25 #include "../include/ctdb_private.h"
26 #include <netinet/if_ether.h>
27 #include <netinet/ip6.h>
28 #include <netinet/icmp6.h>
29 #include <net/if_arp.h>
30 #include <netpacket/packet.h>
31 #include <sys/prctl.h>
34 #define ETHERTYPE_IP6 0x86dd
38 calculate the checksum for an upper-layer protocol (tcp or icmpv6) carried over ipv6
/*
 * Sum the IPv6 source and destination addresses (16 bytes each), the
 * 8-byte phdr[] pseudo-header words and the n bytes at data with
 * uint16_checksum(), then fold the upper bits of the sum back into the
 * low 16 bits.
 *
 * NOTE(review): the local declarations, the assignment of phdr[0] and the
 * return statement are not visible in this excerpt; confirm them before
 * relying on this description.
 */
40 static uint16_t tcp_checksum6(uint16_t *data, size_t n, struct ip6_hdr *ip6)
46 sum += uint16_checksum((uint16_t *)(void *)&ip6->ip6_src, 16);
47 sum += uint16_checksum((uint16_t *)(void *)&ip6->ip6_dst, 16);
/* second pseudo-header word: the next-header value from the IPv6 header */
50 phdr[1] = htonl(ip6->ip6_nxt);
51 sum += uint16_checksum((uint16_t *)phdr, 8);
53 sum += uint16_checksum(data, n);
/* fold twice: the first addition can itself carry past 16 bits */
55 sum = (sum & 0xFFFF) + (sum >> 16);
56 sum = (sum & 0xFFFF) + (sum >> 16);
66 send gratuitous arp reply after we have taken over an ip address
68 addr is the address we are trying to claim
69 iface is the interface name we will be using to claim the address
/*
 * Announce addr on interface iface by sending raw link-layer frames.
 *
 * The code switches on addr->ip.sin_family.  The first path shown builds an
 * Ethernet + ARP frame and sends it twice, first as an ARP request and then
 * as an ARP reply.  The second path builds an Ethernet + IPv6 + ICMPv6
 * neighbour solicitation frame.  Any other family is logged as
 * "not an ipv4/ipv6 address".
 *
 * NOTE(review): the case labels, the return statements, the socket cleanup
 * and several assignments (for example the ptr advances and sll_halen) are
 * not visible in this excerpt; confirm them against the complete function.
 */
71 int ctdb_sys_send_arp(const ctdb_sock_addr *addr, const char *iface)
74 struct sockaddr_ll sall;
75 struct ether_header *eh;
78 struct icmp6_hdr *icmp6;
79 struct ifreq if_hwaddr;
80 unsigned char buffer[78]; /* ipv6 neigh solicitation size */
82 char bdcast[] = {0xff,0xff,0xff,0xff,0xff,0xff};
87 switch (addr->ip.sin_family) {
89 s = socket(PF_PACKET, SOCK_RAW, htons(ETHERTYPE_ARP));
91 DEBUG(DEBUG_CRIT,(__location__ " failed to open raw socket\n"));
95 DEBUG(DEBUG_DEBUG, (__location__ " Created SOCKET FD:%d for sending arp\n", s));
/* NOTE(review): strncpy() leaves ifr_name without a terminating NUL when
 * iface is sizeof(ifr.ifr_name) bytes or longer */
96 strncpy(ifr.ifr_name, iface, sizeof(ifr.ifr_name));
97 if (ioctl(s, SIOCGIFINDEX, &ifr) < 0) {
98 DEBUG(DEBUG_CRIT,(__location__ " interface '%s' not found\n", iface));
103 /* get the mac address */
/* NOTE(review): unbounded strcpy() into the fixed-size ifr_name field */
104 strcpy(if_hwaddr.ifr_name, iface);
105 ret = ioctl(s, SIOCGIFHWADDR, &if_hwaddr);
108 DEBUG(DEBUG_CRIT,(__location__ " ioctl failed\n"));
111 if (ARPHRD_LOOPBACK == if_hwaddr.ifr_hwaddr.sa_family) {
112 DEBUG(DEBUG_DEBUG,("Ignoring loopback arp request\n"));
/* NOTE(review): the family is compared with AF_LOCAL although the message
 * below speaks of ethernet; presumably this relies on AF_LOCAL and
 * ARPHRD_ETHER having the same value - confirm */
116 if (if_hwaddr.ifr_hwaddr.sa_family != AF_LOCAL) {
119 DEBUG(DEBUG_CRIT,(__location__ " not an ethernet address family (0x%x)\n",
120 if_hwaddr.ifr_hwaddr.sa_family));
/* only the first 64 bytes of the 78-byte buffer are cleared; 64 is also the
 * length handed to sendto() on this path */
125 memset(buffer, 0 , 64);
126 eh = (struct ether_header *)buffer;
127 memset(eh->ether_dhost, 0xff, ETH_ALEN);
128 memcpy(eh->ether_shost, if_hwaddr.ifr_hwaddr.sa_data, ETH_ALEN);
129 eh->ether_type = htons(ETHERTYPE_ARP);
/* the ARP header sits directly behind the Ethernet header */
131 ah = (struct arphdr *)&buffer[sizeof(struct ether_header)];
132 ah->ar_hrd = htons(ARPHRD_ETHER);
133 ah->ar_pro = htons(ETH_P_IP);
134 ah->ar_hln = ETH_ALEN;
137 /* send a gratuitous arp */
138 ah->ar_op = htons(ARPOP_REQUEST);
/* the address fields start right after the fixed ARP header */
139 ptr = (char *)&ah[1];
140 memcpy(ptr, if_hwaddr.ifr_hwaddr.sa_data, ETH_ALEN);
142 memcpy(ptr, &addr->ip.sin_addr, 4);
144 memset(ptr, 0, ETH_ALEN);
146 memcpy(ptr, &addr->ip.sin_addr, 4);
149 sall.sll_family = AF_PACKET;
/* destination link-layer address: bdcast, i.e. all bytes 0xff */
151 memcpy(&sall.sll_addr[0], bdcast, sall.sll_halen);
152 sall.sll_protocol = htons(ETH_P_ALL);
153 sall.sll_ifindex = ifr.ifr_ifindex;
154 ret = sendto(s, buffer, 64, 0, (struct sockaddr *)&sall, sizeof(sall));
157 DEBUG(DEBUG_CRIT,(__location__ " failed sendto\n"));
161 /* send unsolicited arp reply broadcast */
162 ah->ar_op = htons(ARPOP_REPLY);
163 ptr = (char *)&ah[1];
164 memcpy(ptr, if_hwaddr.ifr_hwaddr.sa_data, ETH_ALEN);
166 memcpy(ptr, &addr->ip.sin_addr, 4);
168 memcpy(ptr, if_hwaddr.ifr_hwaddr.sa_data, ETH_ALEN);
170 memcpy(ptr, &addr->ip.sin_addr, 4);
173 ret = sendto(s, buffer, 64, 0, (struct sockaddr *)&sall, sizeof(sall));
175 DEBUG(DEBUG_CRIT,(__location__ " failed sendto\n"));
/* NOTE(review): this second path also opens the socket with ETHERTYPE_ARP
 * and logs "for sending arp", exactly like the path above */
183 s = socket(PF_PACKET, SOCK_RAW, htons(ETHERTYPE_ARP));
185 DEBUG(DEBUG_CRIT,(__location__ " failed to open raw socket\n"));
189 DEBUG(DEBUG_DEBUG, (__location__ " Created SOCKET FD:%d for sending arp\n", s));
/* NOTE(review): same strncpy()/strcpy() concerns as on the first path */
190 strncpy(ifr.ifr_name, iface, sizeof(ifr.ifr_name));
191 if (ioctl(s, SIOCGIFINDEX, &ifr) < 0) {
192 DEBUG(DEBUG_CRIT,(__location__ " interface '%s' not found\n", iface));
197 /* get the mac address */
198 strcpy(if_hwaddr.ifr_name, iface);
199 ret = ioctl(s, SIOCGIFHWADDR, &if_hwaddr);
202 DEBUG(DEBUG_CRIT,(__location__ " ioctl failed\n"));
205 if (ARPHRD_LOOPBACK == if_hwaddr.ifr_hwaddr.sa_family) {
206 DEBUG(DEBUG_DEBUG,("Ignoring loopback arp request\n"));
210 if (if_hwaddr.ifr_hwaddr.sa_family != AF_LOCAL) {
213 DEBUG(DEBUG_CRIT,(__location__ " not an ethernet address family (0x%x)\n",
214 if_hwaddr.ifr_hwaddr.sa_family));
/* unlike the first path, the whole buffer is cleared here */
218 memset(buffer, 0 , sizeof(buffer));
219 eh = (struct ether_header *)buffer;
220 memset(eh->ether_dhost, 0xff, ETH_ALEN);
221 memcpy(eh->ether_shost, if_hwaddr.ifr_hwaddr.sa_data, ETH_ALEN);
222 eh->ether_type = htons(ETHERTYPE_IP6);
224 ip6 = (struct ip6_hdr *)(eh+1);
/* payload length 24; together with a 14-byte Ethernet header and a 40-byte
 * IPv6 header this presumably accounts for the 78 bytes sent below */
226 ip6->ip6_plen = htons(24);
227 ip6->ip6_nxt = IPPROTO_ICMPV6;
/* the packet is addressed to the very address being announced */
229 ip6->ip6_dst = addr->ip6.sin6_addr;
231 icmp6 = (struct icmp6_hdr *)(ip6+1);
232 icmp6->icmp6_type = ND_NEIGHBOR_SOLICIT;
233 icmp6->icmp6_code = 0;
/* the same address is copied into the ICMPv6 body, starting at
 * icmp6_data32[1] */
234 memcpy(&icmp6->icmp6_data32[1], &addr->ip6.sin6_addr, 16);
/* checksum over the ip6_plen payload bytes via tcp_checksum6() */
236 icmp6->icmp6_cksum = tcp_checksum6((uint16_t *)icmp6, ntohs(ip6->ip6_plen), ip6);
238 sall.sll_family = AF_PACKET;
240 memcpy(&sall.sll_addr[0], bdcast, sall.sll_halen);
241 sall.sll_protocol = htons(ETH_P_ALL);
242 sall.sll_ifindex = ifr.ifr_ifindex;
243 ret = sendto(s, buffer, 78, 0, (struct sockaddr *)&sall, sizeof(sall));
246 DEBUG(DEBUG_CRIT,(__location__ " failed sendto\n"));
253 DEBUG(DEBUG_CRIT,(__location__ " not an ipv4/ipv6 address (family is %u)\n", addr->ip.sin_family));
262 simple TCP checksum - assumes data is multiple of 2 bytes long
/*
 * Checksum helper for a TCP segment described by an IPv4 header: sums the
 * n bytes at data, the header's saddr and daddr fields and the value
 * ip->protocol + n, then folds the upper bits of the sum back into the low
 * 16 bits.
 *
 * NOTE(review): the length arguments of the two address checksums and the
 * return statement are not visible in this excerpt.
 */
264 static uint16_t tcp_checksum(uint16_t *data, size_t n, struct iphdr *ip)
266 uint32_t sum = uint16_checksum(data, n);
268 sum += uint16_checksum((uint16_t *)(void *)&ip->saddr,
270 sum += uint16_checksum((uint16_t *)(void *)&ip->daddr,
/* protocol number and segment length are added as plain integers */
272 sum += ip->protocol + n;
/* fold twice: the first addition can itself carry past 16 bits */
273 sum = (sum & 0xFFFF) + (sum >> 16);
274 sum = (sum & 0xFFFF) + (sum >> 16);
284 Send tcp segment from the specified IP/port to the specified
287 This is used to trigger the receiving host into sending its own ACK,
288 which should trigger early detection of TCP reset by the client
291 This can also be used to send RST segments (if rst is true) and also
292 if correct seq and ack numbers are provided.
/*
 * Build a TCP segment from src to dest in a local packet structure and send
 * it on a freshly opened raw socket.
 *
 * The code switches on src->ip.sin_family.  The IPv4 path fills ip4pkt
 * (IP header + TCP header), sets the IP_HDRINCL socket option and sends to
 * dest->ip.  The IPv6 path fills ip6pkt and sends to dest->ip6; around that
 * sendto() the port stored in *dest is set to 0 and then restored, writing
 * through a pointer obtained with discard_const().  Any other family is
 * logged as "not an ipv4/v6 address".
 *
 * seq and ack are stored into the TCP header exactly as passed; no byte
 * order conversion is applied in the lines shown.
 *
 * NOTE(review): how rst is used, the TCP flag assignments, the return
 * values and the socket cleanup are not visible in this excerpt - confirm.
 * NOTE(review): the IPv4 socket() call wraps the protocol in htons() while
 * the IPv6 call passes IPPROTO_RAW directly; check which is intended.
 */
294 int ctdb_sys_send_tcp(const ctdb_sock_addr *dest,
295 const ctdb_sock_addr *src,
296 uint32_t seq, uint32_t ack, int rst)
302 ctdb_sock_addr *tmpdest;
312 switch (src->ip.sin_family) {
315 ip4pkt.ip.version = 4;
/* header length expressed in 32-bit words */
316 ip4pkt.ip.ihl = sizeof(ip4pkt.ip)/4;
317 ip4pkt.ip.tot_len = htons(sizeof(ip4pkt));
319 ip4pkt.ip.protocol = IPPROTO_TCP;
320 ip4pkt.ip.saddr = src->ip.sin_addr.s_addr;
321 ip4pkt.ip.daddr = dest->ip.sin_addr.s_addr;
324 ip4pkt.tcp.source = src->ip.sin_port;
325 ip4pkt.tcp.dest = dest->ip.sin_port;
326 ip4pkt.tcp.seq = seq;
327 ip4pkt.tcp.ack_seq = ack;
/* TCP header length expressed in 32-bit words */
332 ip4pkt.tcp.doff = sizeof(ip4pkt.tcp)/4;
333 /* this makes it easier to spot in a sniffer */
334 ip4pkt.tcp.window = htons(1234);
335 ip4pkt.tcp.check = tcp_checksum((uint16_t *)&ip4pkt.tcp, sizeof(ip4pkt.tcp), &ip4pkt.ip);
337 /* open a raw socket to send this segment from */
338 s = socket(AF_INET, SOCK_RAW, htons(IPPROTO_RAW));
340 DEBUG(DEBUG_CRIT,(__location__ " failed to open raw socket (%s)\n",
345 ret = setsockopt(s, SOL_IP, IP_HDRINCL, &one, sizeof(one));
347 DEBUG(DEBUG_CRIT,(__location__ " failed to setup IP headers (%s)\n",
354 set_close_on_exec(s);
356 ret = sendto(s, &ip4pkt, sizeof(ip4pkt), 0,
357 (const struct sockaddr *)&dest->ip,
/* anything other than a complete send is treated as a failure */
360 if (ret != sizeof(ip4pkt)) {
361 DEBUG(DEBUG_CRIT,(__location__ " failed sendto (%s)\n", strerror(errno)));
367 ip6pkt.ip6.ip6_vfc = 0x60;
/* payload length 20 - presumably sizeof(ip6pkt.tcp); confirm */
368 ip6pkt.ip6.ip6_plen = htons(20);
369 ip6pkt.ip6.ip6_nxt = IPPROTO_TCP;
370 ip6pkt.ip6.ip6_hlim = 64;
371 ip6pkt.ip6.ip6_src = src->ip6.sin6_addr;
372 ip6pkt.ip6.ip6_dst = dest->ip6.sin6_addr;
374 ip6pkt.tcp.source = src->ip6.sin6_port;
375 ip6pkt.tcp.dest = dest->ip6.sin6_port;
376 ip6pkt.tcp.seq = seq;
377 ip6pkt.tcp.ack_seq = ack;
382 ip6pkt.tcp.doff = sizeof(ip6pkt.tcp)/4;
383 /* this makes it easier to spot in a sniffer */
384 ip6pkt.tcp.window = htons(1234);
385 ip6pkt.tcp.check = tcp_checksum6((uint16_t *)&ip6pkt.tcp, sizeof(ip6pkt.tcp), &ip6pkt.ip6);
387 s = socket(PF_INET6, SOCK_RAW, IPPROTO_RAW);
389 DEBUG(DEBUG_CRIT, (__location__ " Failed to open sending socket\n"));
393 /* sendto() dont like if the port is set and the socket is
396 tmpdest = discard_const(dest);
397 tmpport = tmpdest->ip6.sin6_port;
399 tmpdest->ip6.sin6_port = 0;
400 ret = sendto(s, &ip6pkt, sizeof(ip6pkt), 0,
401 (const struct sockaddr *)&dest->ip6,
403 tmpdest->ip6.sin6_port = tmpport;
406 if (ret != sizeof(ip6pkt)) {
407 DEBUG(DEBUG_CRIT,(__location__ " failed sendto (%s)\n", strerror(errno)));
413 DEBUG(DEBUG_CRIT,(__location__ " not an ipv4/v6 address\n"));
421 This function is used to open a raw socket to capture from
/*
 * Open an AF_PACKET raw socket for ETH_P_ALL, log its descriptor and mark
 * it close-on-exec.
 *
 * NOTE(review): neither iface nor private_data is referenced in the lines
 * visible here, and the failure check and return statements are not shown;
 * confirm against the complete function.
 */
423 int ctdb_sys_open_capture_socket(const char *iface, void **private_data)
427 /* Open a socket to capture all traffic */
428 s = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
430 DEBUG(DEBUG_CRIT,(__location__ " failed to open raw socket\n"));
434 DEBUG(DEBUG_DEBUG, (__location__ " Created RAW SOCKET FD:%d for tcp tickle\n", s));
437 set_close_on_exec(s);
443 This function is used to do any additional cleanup required when closing
445 Note that the socket itself is closed automatically in the caller.
/*
 * NOTE(review): only the signature is visible in this excerpt.  private_data
 * is presumably the pointer handed out through the void ** argument of
 * ctdb_sys_open_capture_socket() - confirm.
 */
447 int ctdb_sys_close_capture_socket(void *private_data)
454 called when the raw socket becomes readable
/*
 * Receive one frame from s and, when it carries TCP over IPv4 or IPv6,
 * report its endpoints and acknowledgement number.
 *
 * src, dst: filled with family, address and port taken from the packet.
 * ack_seq:  receives tcp->ack_seq.
 * Ports and ack_seq are copied as found in the packet, without byte order
 * conversion.
 *
 * NOTE(review): the assignment to *seq, the return statements, the use of
 * private_data and the declaration of ret are not visible in this excerpt.
 */
456 int ctdb_sys_read_tcp_packet(int s, void *private_data,
457 ctdb_sock_addr *src, ctdb_sock_addr *dst,
458 uint32_t *ack_seq, uint32_t *seq)
461 #define RCVPKTSIZE 100
462 char pkt[RCVPKTSIZE];
463 struct ether_header *eth;
/* NOTE(review): recv(2) documents that with MSG_TRUNC on a packet socket the
 * real packet length is returned even when it exceeds the buffer, so ret is
 * not bounded by RCVPKTSIZE */
468 ret = recv(s, pkt, RCVPKTSIZE, MSG_TRUNC);
/* NOTE(review): sizeof yields size_t; if ret is a signed int, a negative
 * recv() result converts to a large unsigned value and passes this check -
 * confirm the type of ret.  The minimum uses the IPv4 header size for both
 * address families. */
469 if (ret < sizeof(*eth)+sizeof(*ip)) {
474 eth = (struct ether_header *)pkt;
476 /* we want either IPv4 or IPv6 */
477 if (ntohs(eth->ether_type) == ETHERTYPE_IP) {
479 ip = (struct iphdr *)(eth+1);
481 /* We only want IPv4 packets */
482 if (ip->version != 4) {
485 /* Don't look at fragments */
486 if ((ntohs(ip->frag_off)&0x1fff) != 0) {
489 /* we only want TCP */
490 if (ip->protocol != IPPROTO_TCP) {
494 /* make sure it's not a short packet */
/* the TCP header up to and including ack_seq must lie within ret bytes */
495 if (offsetof(struct tcphdr, ack_seq) + 4 +
496 (ip->ihl*4) + sizeof(*eth) > ret) {
/* the TCP header starts ihl 32-bit words after the start of the IP header */
500 tcp = (struct tcphdr *)((ip->ihl*4) + (char *)ip);
502 /* tell the caller which one we've found */
503 src->ip.sin_family = AF_INET;
504 src->ip.sin_addr.s_addr = ip->saddr;
505 src->ip.sin_port = tcp->source;
506 dst->ip.sin_family = AF_INET;
507 dst->ip.sin_addr.s_addr = ip->daddr;
508 dst->ip.sin_port = tcp->dest;
509 *ack_seq = tcp->ack_seq;
513 } else if (ntohs(eth->ether_type) == ETHERTYPE_IP6) {
515 ip6 = (struct ip6_hdr *)(eth+1);
517 /* we only want TCP */
518 if (ip6->ip6_nxt != IPPROTO_TCP) {
/* the TCP header is taken to follow the fixed IPv6 header directly.
 * NOTE(review): no length check beyond the initial one is visible on this
 * path - confirm */
523 tcp = (struct tcphdr *)(ip6+1);
525 /* tell the caller which one we've found */
526 src->ip6.sin6_family = AF_INET6;
527 src->ip6.sin6_port = tcp->source;
528 src->ip6.sin6_addr = ip6->ip6_src;
530 dst->ip6.sin6_family = AF_INET6;
531 dst->ip6.sin6_port = tcp->dest;
532 dst->ip6.sin6_addr = ip6->ip6_dst;
534 *ack_seq = tcp->ack_seq;
/*
 * Check whether interface iface exists by asking for its index with the
 * SIOCGIFINDEX ioctl on a PF_PACKET raw socket.  An ioctl failure with
 * errno == ENODEV is logged as "not found"; other ioctl failures do not
 * enter that branch.
 *
 * NOTE(review): the return statements and the socket cleanup are not
 * visible in this excerpt.
 */
544 bool ctdb_sys_check_iface_exists(const char *iface)
549 s = socket(PF_PACKET, SOCK_RAW, 0);
551 /* We don't know if the interface exists, so assume yes */
552 DEBUG(DEBUG_CRIT,(__location__ " failed to open raw socket\n"));
/* NOTE(review): strncpy() leaves ifr_name without a terminating NUL when
 * iface is sizeof(ifr.ifr_name) bytes or longer */
556 strncpy(ifr.ifr_name, iface, sizeof(ifr.ifr_name));
557 if (ioctl(s, SIOCGIFINDEX, &ifr) < 0 && errno == ENODEV) {
558 DEBUG(DEBUG_CRIT,(__location__ " interface '%s' not found\n", iface));
/*
 * Query the SO_PEERCRED credentials of socket fd.
 *
 * NOTE(review): == binds tighter than =, so ret receives the result of the
 * comparison (1 when getsockopt() returned 0, otherwise 0) rather than
 * getsockopt()'s own return value.  What is done with cr, peer_pid and ret
 * after this point is not visible in this excerpt - confirm what callers
 * actually receive.
 */
567 int ctdb_get_peer_pid(const int fd, pid_t *peer_pid)
570 socklen_t crl = sizeof(struct ucred);
572 if ((ret = getsockopt(fd, SOL_SOCKET, SO_PEERCRED, &cr, &crl) == 0)) {
579 * Find the process name from process ID
/*
 * Read the target of the /proc/<pid>/exe symlink into buf and take its
 * first space-delimited token.
 *
 * NOTE(review): readlink() does not NUL-terminate what it writes and may
 * fill all sizeof(buf) bytes; the termination step, the error check on n
 * and the value returned are not visible in this excerpt - confirm.
 * strtok() keeps static state, so it is not safe against concurrent or
 * nested tokenising.
 */
581 char *ctdb_get_process_name(pid_t pid)
588 snprintf(path, sizeof(path), "/proc/%d/exe", pid);
589 n = readlink(path, buf, sizeof(buf));
594 /* Remove any extra fields */
596 ptr = strtok(buf, " ");
/*
 * Copy at most 15 bytes of name into procname and pass it to
 * prctl(PR_SET_NAME); the prctl() result is returned unchanged.
 *
 * NOTE(review): strncpy() adds no terminating NUL when name has 15 or more
 * characters; whether procname is sized and pre-cleared to cope with that
 * cannot be seen here because its declaration is not visible - confirm.
 */
603 int ctdb_set_process_name(const char *name)
607 strncpy(procname, name, 15);
608 return prctl(PR_SET_NAME, (unsigned long)procname, 0, 0, 0);
612 * Parsing a line from /proc/locks,
/*
 * Tokenise one line of the locks table in place with strtok_r() and fill
 * curlock from it.
 *
 * Tokens consumed, in order: the first token; an optional "->" marker,
 * which sets curlock->waiting; a token compared with "POSIX"; one token
 * that is only checked for presence; "READ" or "WRITE", which sets
 * curlock->read_only; one further token; the MAJOR:MINOR:INODE triple, of
 * which the third part is stored in curlock->inode; the start offset; and
 * the end offset, where a token beginning with "EOF" is stored as
 * (off_t)-1.
 *
 * Returns false as soon as one of the checked tokens is missing.
 *
 * NOTE(review): the store into *pid, the bodies of the failure branches and
 * the final return are not visible in this excerpt - confirm.  atol() gives
 * no indication of malformed numbers.
 */
614 static bool parse_proc_locks_line(char *line, pid_t *pid,
615 struct ctdb_lock_info *curlock)
619 /* output of /proc/locks
622 * 1: POSIX ADVISORY WRITE 25945 fd:00:6424820 212 212
625 * 1: -> POSIX ADVISORY WRITE 25946 fd:00:6424820 212 212
629 ptr = strtok_r(line, " ", &saveptr);
630 if (ptr == NULL) return false;
633 ptr = strtok_r(NULL, " ", &saveptr);
634 if (ptr == NULL) return false;
635 if (strcmp(ptr, "->") == 0) {
636 curlock->waiting = true;
637 ptr = strtok_r(NULL, " ", &saveptr);
639 curlock->waiting = false;
643 if (ptr == NULL || strcmp(ptr, "POSIX") != 0) {
648 ptr = strtok_r(NULL, " ", &saveptr);
649 if (ptr == NULL) return false;
652 ptr = strtok_r(NULL, " ", &saveptr);
653 if (ptr == NULL) return false;
654 if (strcmp(ptr, "READ") == 0) {
655 curlock->read_only = true;
656 } else if (strcmp(ptr, "WRITE") == 0) {
657 curlock->read_only = false;
663 ptr = strtok_r(NULL, " ", &saveptr);
664 if (ptr == NULL) return false;
667 /* MAJOR:MINOR:INODE */
668 ptr = strtok_r(NULL, " :", &saveptr);
669 if (ptr == NULL) return false;
670 ptr = strtok_r(NULL, " :", &saveptr);
671 if (ptr == NULL) return false;
672 ptr = strtok_r(NULL, " :", &saveptr);
673 if (ptr == NULL) return false;
674 curlock->inode = atol(ptr);
677 ptr = strtok_r(NULL, " ", &saveptr);
678 if (ptr == NULL) return false;
679 curlock->start = atol(ptr);
682 ptr = strtok_r(NULL, " ", &saveptr);
683 if (ptr == NULL) return false;
684 if (strncmp(ptr, "EOF", 3) == 0) {
685 curlock->end = (off_t)-1;
687 curlock->end = atol(ptr);
694 * Find information of lock being waited on for given process ID
/*
 * Scan /proc/locks line by line for an entry that belongs to req_pid and is
 * marked as waiting, and copy that entry to *lock_info.
 *
 * NOTE(review): what happens to lines rejected by parse_proc_locks_line(),
 * the return values and the fclose() of fp are not visible in this excerpt.
 * The "Failed to read locks information" message has no trailing newline,
 * unlike the other DEBUG messages in this file.
 */
696 bool ctdb_get_lock_info(pid_t req_pid, struct ctdb_lock_info *lock_info)
699 struct ctdb_lock_info curlock;
705 if ((fp = fopen("/proc/locks", "r")) == NULL) {
706 DEBUG(DEBUG_ERR, ("Failed to read locks information"));
709 while ((ptr = fgets(buf, sizeof(buf), fp)) != NULL) {
710 if (! parse_proc_locks_line(buf, &pid, &curlock)) {
/* only a waiting entry owned by the requested process is of interest */
713 if (pid == req_pid && curlock.waiting) {
714 *lock_info = curlock;
725 * Find process ID which holds an overlapping byte lock for required
726 * inode and byte range.
728 bool ctdb_get_blocker_pid(struct ctdb_lock_info *reqlock, pid_t *blocker_pid)
731 struct ctdb_lock_info curlock;
737 if ((fp = fopen("/proc/locks", "r")) == NULL) {
738 DEBUG(DEBUG_ERR, ("Failed to read locks information"));
741 while ((ptr = fgets(buf, sizeof(buf), fp)) != NULL) {
742 if (! parse_proc_locks_line(buf, &pid, &curlock)) {
746 if (curlock.waiting) {
750 if (curlock.inode != reqlock->inode) {
754 if (curlock.start > reqlock->end ||
755 curlock.end < reqlock->start) {
756 /* Outside the required range */