86cbdaab6ad2154da1fe49bcba635b01687ee4b7
[samba.git] / ctdb / common / system_socket.c
1 /*
2    ctdb system specific code to manage raw sockets on linux
3
4    Copyright (C) Ronnie Sahlberg  2007
5    Copyright (C) Andrew Tridgell  2007
6    Copyright (C) Marc Dequènes (Duck) 2009
7    Copyright (C) Volker Lendecke 2012
8
9    This program is free software; you can redistribute it and/or modify
10    it under the terms of the GNU General Public License as published by
11    the Free Software Foundation; either version 3 of the License, or
12    (at your option) any later version.
13
14    This program is distributed in the hope that it will be useful,
15    but WITHOUT ANY WARRANTY; without even the implied warranty of
16    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17    GNU General Public License for more details.
18
19    You should have received a copy of the GNU General Public License
20    along with this program; if not, see <http://www.gnu.org/licenses/>.
21 */
22
23 #include "replace.h"
24
25 /*
26  * Use BSD struct tcphdr field names for portability.  Modern glibc
27  * makes them available by default via <netinet/tcp.h> but older glibc
28  * requires __FAVOR_BSD to be defined.
29  *
30  * __FAVOR_BSD is normally defined in <features.h> if _DEFAULT_SOURCE
31  * (new) or _BSD_SOURCE (now deprecated) is set and _GNU_SOURCE is not
32  * set.  Including "replace.h" above causes <features.h> to be
33  * indirectly included and this will not set __FAVOR_BSD because
34  * _GNU_SOURCE is set in Samba's "config.h" (which is included by
35  * "replace.h").
36  *
37  * Therefore, set __FAVOR_BSD by hand below.
38  */
39 #define __FAVOR_BSD 1
40 #include "system/network.h"
41
42 #ifdef HAVE_NETINET_IF_ETHER_H
43 #include <netinet/if_ether.h>
44 #endif
45 #ifdef HAVE_NETINET_IP6_H
46 #include <netinet/ip6.h>
47 #endif
48 #ifdef HAVE_NETINET_ICMP6_H
49 #include <netinet/icmp6.h>
50 #endif
51 #ifdef HAVE_LINUX_IF_PACKET_H
52 #include <linux/if_packet.h>
53 #endif
54
55 #ifndef ETHERTYPE_IP6
56 #define ETHERTYPE_IP6 0x86dd
57 #endif
58
59 #include "lib/util/debug.h"
60 #include "lib/util/blocking.h"
61
62 #include "protocol/protocol.h"
63
64 #include "common/logging.h"
65 #include "common/system_socket.h"
66
/*
 * Sum the 16-bit network-order words in an n byte buffer.
 *
 * The buffer is read one byte at a time, so the result depends
 * neither on host byte order nor on the alignment of data.  If n is
 * odd then the final byte is taken as the high-order byte of a word
 * whose low-order byte is zero, which is the padding used by the
 * Internet checksum.
 *
 * Carries are not folded here: the raw 32-bit sum is returned.
 */
static uint32_t uint16_checksum(uint16_t *data, size_t n)
{
        const uint8_t *p = (const uint8_t *)data;
        uint32_t sum = 0;

        while (n >= 2) {
                sum += ((uint32_t)p[0] << 8) | (uint32_t)p[1];
                p += 2;
                n -= 2;
        }
        if (n == 1) {
                /* Trailing odd byte: pad with a zero low-order byte */
                sum += (uint32_t)p[0] << 8;
        }
        return sum;
}
83
84 /*
85  * See if the given IP is currently on an interface
86  */
87 bool ctdb_sys_have_ip(ctdb_sock_addr *_addr)
88 {
89         int s;
90         int ret;
91         ctdb_sock_addr __addr = *_addr;
92         ctdb_sock_addr *addr = &__addr;
93         socklen_t addrlen = 0;
94
95         switch (addr->sa.sa_family) {
96         case AF_INET:
97                 addr->ip.sin_port = 0;
98                 addrlen = sizeof(struct sockaddr_in);
99                 break;
100         case AF_INET6:
101                 addr->ip6.sin6_port = 0;
102                 addrlen = sizeof(struct sockaddr_in6);
103                 break;
104         }
105
106         s = socket(addr->sa.sa_family, SOCK_STREAM, IPPROTO_TCP);
107         if (s == -1) {
108                 return false;
109         }
110
111         ret = bind(s, (struct sockaddr *)addr, addrlen);
112
113         close(s);
114         return ret == 0;
115 }
116
117 /*
118  * simple TCP checksum - assumes data is multiple of 2 bytes long
119  */
120 static uint16_t ip_checksum(uint16_t *data, size_t n, struct ip *ip)
121 {
122         uint32_t sum = uint16_checksum(data, n);
123         uint16_t sum2;
124
125         sum += uint16_checksum((uint16_t *)&ip->ip_src, sizeof(ip->ip_src));
126         sum += uint16_checksum((uint16_t *)&ip->ip_dst, sizeof(ip->ip_dst));
127         sum += ip->ip_p + n;
128         sum = (sum & 0xFFFF) + (sum >> 16);
129         sum = (sum & 0xFFFF) + (sum >> 16);
130         sum2 = htons(sum);
131         sum2 = ~sum2;
132         if (sum2 == 0) {
133                 return 0xFFFF;
134         }
135         return sum2;
136 }
137
/*
 * Checksum for n bytes of data carried in the IPv6 packet described
 * by ip6.
 *
 * The sum covers the source and destination addresses, the 32-bit
 * length n and the next-header value from ip6, followed by data
 * itself.  The folded, complemented result is returned in network
 * byte order, with a zero result replaced by 0xFFFF.
 */
static uint16_t ip6_checksum(uint16_t *data, size_t n, struct ip6_hdr *ip6)
{
        uint16_t pseudo[3];
        uint32_t nlen = htonl(n);
        uint32_t acc;
        uint16_t result;
        int i;

        /* Length as two 16-bit halves, then the 8-bit next header */
        pseudo[0] = nlen & UINT16_MAX;
        pseudo[1] = (nlen >> 16) & UINT16_MAX;
        pseudo[2] = htons(ip6->ip6_nxt);

        acc  = uint16_checksum((uint16_t *)(void *)&ip6->ip6_src, 16);
        acc += uint16_checksum((uint16_t *)(void *)&ip6->ip6_dst, 16);
        acc += uint16_checksum(pseudo, sizeof(pseudo));
        acc += uint16_checksum(data, n);

        /* Fold the carries back in; a second pass absorbs any new carry */
        for (i = 0; i < 2; i++) {
                acc = (acc & 0xFFFF) + (acc >> 16);
        }

        result = htons(acc);
        result = ~result;

        return (result == 0) ? 0xFFFF : result;
}
166
167 /*
168  * Send gratuitous ARP request/reply or IPv6 neighbor advertisement
169  */
170
171 #ifdef HAVE_PACKETSOCKET
172
173 /*
174  * Create IPv4 ARP requests/replies or IPv6 neighbour advertisement
175  * packets
176  */
177
/*
 * Frame sizes.  The expansions are parenthesised so that they can be
 * used safely inside larger expressions.
 */

/* Ethernet header followed by an ARP packet */
#define ARP_STRUCT_SIZE (sizeof(struct ether_header) + \
                         sizeof(struct ether_arp))

/*
 * Ethernet header, IPv6 header, neighbour advertisement and one
 * option carrying an Ethernet address
 */
#define IP6_NA_STRUCT_SIZE (sizeof(struct ether_header) + \
                            sizeof(struct ip6_hdr) + \
                            sizeof(struct nd_neighbor_advert) + \
                            sizeof(struct nd_opt_hdr) + \
                            sizeof(struct ether_addr))

/*
 * Buffers are at least 64 bytes - presumably the minimum Ethernet
 * frame length; TODO confirm
 */
#define ARP_BUFFER_SIZE (MAX(ARP_STRUCT_SIZE, 64))

#define IP6_NA_BUFFER_SIZE (MAX(IP6_NA_STRUCT_SIZE, 64))
190
191 static int arp_build(uint8_t *buffer,
192                      size_t buflen,
193                      const struct sockaddr_in *addr,
194                      const struct ether_addr *hwaddr,
195                      bool reply,
196                      struct ether_addr **ether_dhost,
197                      size_t *len)
198 {
199         size_t l = ARP_BUFFER_SIZE;
200         struct ether_header *eh;
201         struct ether_arp *ea;
202         struct arphdr *ah;
203
204         if (addr->sin_family != AF_INET) {
205                 return EINVAL;
206         }
207
208         if (buflen < l) {
209                 return EMSGSIZE;
210         }
211
212         memset(buffer, 0 , l);
213
214         eh = (struct ether_header *)buffer;
215         memset(eh->ether_dhost, 0xff, ETH_ALEN);
216         memcpy(eh->ether_shost, hwaddr, ETH_ALEN);
217         eh->ether_type = htons(ETHERTYPE_ARP);
218
219         ea = (struct ether_arp *)(buffer + sizeof(struct ether_header));
220         ah = &ea->ea_hdr;
221         ah->ar_hrd = htons(ARPHRD_ETHER);
222         ah->ar_pro = htons(ETH_P_IP);
223         ah->ar_hln = ETH_ALEN;
224         ah->ar_pln = sizeof(ea->arp_spa);
225
226         if (! reply) {
227                 ah->ar_op  = htons(ARPOP_REQUEST);
228                 memcpy(ea->arp_sha, hwaddr, ETH_ALEN);
229                 memcpy(ea->arp_spa, &addr->sin_addr, sizeof(ea->arp_spa));
230                 memset(ea->arp_tha, 0, ETH_ALEN);
231                 memcpy(ea->arp_tpa, &addr->sin_addr, sizeof(ea->arp_tpa));
232         } else {
233                 ah->ar_op  = htons(ARPOP_REPLY);
234                 memcpy(ea->arp_sha, hwaddr, ETH_ALEN);
235                 memcpy(ea->arp_spa, &addr->sin_addr, sizeof(ea->arp_spa));
236                 memcpy(ea->arp_tha, hwaddr, ETH_ALEN);
237                 memcpy(ea->arp_tpa, &addr->sin_addr, sizeof(ea->arp_tpa));
238         }
239
240         *ether_dhost = (struct ether_addr *)eh->ether_dhost;
241         *len = l;
242         return 0;
243 }
244
245 static int ip6_na_build(uint8_t *buffer,
246                         size_t buflen,
247                         const struct sockaddr_in6 *addr,
248                         const struct ether_addr *hwaddr,
249                         struct ether_addr **ether_dhost,
250                         size_t *len)
251 {
252         size_t l = IP6_NA_BUFFER_SIZE;
253         struct ether_header *eh;
254         struct ip6_hdr *ip6;
255         struct nd_neighbor_advert *nd_na;
256         struct nd_opt_hdr *nd_oh;
257         struct ether_addr *ea;
258         int ret;
259
260         if (addr->sin6_family != AF_INET6) {
261                 return EINVAL;
262         }
263
264         if (buflen < l) {
265                 return EMSGSIZE;
266         }
267
268         memset(buffer, 0 , l);
269
270         eh = (struct ether_header *)buffer;
271         /*
272          * Ethernet multicast: 33:33:00:00:00:01 (see RFC2464,
273          * section 7) - note memset 0 above!
274          */
275         eh->ether_dhost[0] = 0x33;
276         eh->ether_dhost[1] = 0x33;
277         eh->ether_dhost[5] = 0x01;
278         memcpy(eh->ether_shost, hwaddr, ETH_ALEN);
279         eh->ether_type = htons(ETHERTYPE_IP6);
280
281         ip6 = (struct ip6_hdr *)(buffer + sizeof(struct ether_header));
282         ip6->ip6_vfc  = 6 << 4;
283         ip6->ip6_plen = htons(sizeof(struct nd_neighbor_advert) +
284                               sizeof(struct nd_opt_hdr) +
285                               ETH_ALEN);
286         ip6->ip6_nxt  = IPPROTO_ICMPV6;
287         ip6->ip6_hlim = 255;
288         ip6->ip6_src  = addr->sin6_addr;
289         /* all-nodes multicast */
290
291         ret = inet_pton(AF_INET6, "ff02::1", &ip6->ip6_dst);
292         if (ret != 1) {
293                 return EIO;
294         }
295
296         nd_na = (struct nd_neighbor_advert *)(buffer +
297                                               sizeof(struct ether_header) +
298                                               sizeof(struct ip6_hdr));
299         nd_na->nd_na_type = ND_NEIGHBOR_ADVERT;
300         nd_na->nd_na_code = 0;
301         nd_na->nd_na_flags_reserved = ND_NA_FLAG_OVERRIDE;
302         nd_na->nd_na_target = addr->sin6_addr;
303
304         /* Option: Target link-layer address */
305         nd_oh = (struct nd_opt_hdr *)(buffer +
306                                       sizeof(struct ether_header) +
307                                       sizeof(struct ip6_hdr) +
308                                       sizeof(struct nd_neighbor_advert));
309         nd_oh->nd_opt_type = ND_OPT_TARGET_LINKADDR;
310         nd_oh->nd_opt_len = 1;  /* multiple of 8 octets */
311
312         ea = (struct ether_addr *)(buffer +
313                                    sizeof(struct ether_header) +
314                                    sizeof(struct ip6_hdr) +
315                                    sizeof(struct nd_neighbor_advert) +
316                                    sizeof(struct nd_opt_hdr));
317         memcpy(ea, hwaddr, ETH_ALEN);
318
319         nd_na->nd_na_cksum = ip6_checksum((uint16_t *)nd_na,
320                                           ntohs(ip6->ip6_plen),
321                                           ip6);
322
323         *ether_dhost = (struct ether_addr *)eh->ether_dhost;
324         *len = l;
325         return 0;
326 }
327
328 int ctdb_sys_send_arp(const ctdb_sock_addr *addr, const char *iface)
329 {
330         int s;
331         struct sockaddr_ll sall = {0};
332         struct ifreq if_hwaddr = {
333                 .ifr_ifru = {
334                         .ifru_flags = 0
335                 },
336         };
337         uint8_t buffer[MAX(ARP_BUFFER_SIZE, IP6_NA_BUFFER_SIZE)];
338         struct ifreq ifr = {
339                 .ifr_ifru = {
340                         .ifru_flags = 0
341                 },
342         };
343         struct ether_addr *hwaddr = NULL;
344         struct ether_addr *ether_dhost = NULL;
345         size_t len = 0;
346         int ret = 0;
347
348         s = socket(AF_PACKET, SOCK_RAW, 0);
349         if (s == -1) {
350                 ret = errno;
351                 DBG_ERR("Failed to open raw socket\n");
352                 return ret;
353         }
354         DBG_DEBUG("Created SOCKET FD:%d for sending arp\n", s);
355
356         /* Find interface */
357         strlcpy(ifr.ifr_name, iface, sizeof(ifr.ifr_name));
358         if (ioctl(s, SIOCGIFINDEX, &ifr) < 0) {
359                 ret = errno;
360                 DBG_ERR("Interface '%s' not found\n", iface);
361                 goto fail;
362         }
363
364         /* Get MAC address */
365         strlcpy(if_hwaddr.ifr_name, iface, sizeof(if_hwaddr.ifr_name));
366         ret = ioctl(s, SIOCGIFHWADDR, &if_hwaddr);
367         if ( ret < 0 ) {
368                 ret = errno;
369                 DBG_ERR("ioctl failed\n");
370                 goto fail;
371         }
372         if (ARPHRD_LOOPBACK == if_hwaddr.ifr_hwaddr.sa_family) {
373                 ret = 0;
374                 D_DEBUG("Ignoring loopback arp request\n");
375                 goto fail;
376         }
377         if (if_hwaddr.ifr_hwaddr.sa_family != ARPHRD_ETHER) {
378                 ret = EINVAL;
379                 DBG_ERR("Not an ethernet address family (0x%x)\n",
380                         if_hwaddr.ifr_hwaddr.sa_family);
381                 goto fail;;
382         }
383
384         /* Set up most of destination address structure */
385         sall.sll_family = AF_PACKET;
386         sall.sll_halen = sizeof(struct ether_addr);
387         sall.sll_protocol = htons(ETH_P_ALL);
388         sall.sll_ifindex = ifr.ifr_ifindex;
389
390         /* For clarity */
391         hwaddr = (struct ether_addr *)if_hwaddr.ifr_hwaddr.sa_data;
392
393         switch (addr->ip.sin_family) {
394         case AF_INET:
395                 /* Send gratuitous ARP */
396                 ret = arp_build(buffer,
397                                 sizeof(buffer),
398                                 &addr->ip,
399                                 hwaddr,
400                                 false,
401                                 &ether_dhost,
402                                 &len);
403                 if (ret != 0) {
404                         DBG_ERR("Failed to build ARP request\n");
405                         goto fail;
406                 }
407
408                 memcpy(&sall.sll_addr[0], ether_dhost, sall.sll_halen);
409
410                 ret = sendto(s,
411                              buffer,
412                              len,
413                              0,
414                              (struct sockaddr *)&sall,
415                              sizeof(sall));
416                 if (ret < 0 ) {
417                         ret = errno;
418                         DBG_ERR("Failed sendto\n");
419                         goto fail;
420                 }
421
422                 /* Send unsolicited ARP reply */
423                 ret = arp_build(buffer,
424                                 sizeof(buffer),
425                                 &addr->ip,
426                                 hwaddr,
427                                 true,
428                                 &ether_dhost,
429                                 &len);
430                 if (ret != 0) {
431                         DBG_ERR("Failed to build ARP reply\n");
432                         goto fail;
433                 }
434
435                 memcpy(&sall.sll_addr[0], ether_dhost, sall.sll_halen);
436
437                 ret = sendto(s,
438                              buffer,
439                              len,
440                              0,
441                              (struct sockaddr *)&sall,
442                              sizeof(sall));
443                 if (ret < 0 ) {
444                         ret = errno;
445                         DBG_ERR("Failed sendto\n");
446                         goto fail;
447                 }
448
449                 close(s);
450                 break;
451
452         case AF_INET6:
453                 ret = ip6_na_build(buffer,
454                                    sizeof(buffer),
455                                    &addr->ip6,
456                                    hwaddr,
457                                    &ether_dhost,
458                                    &len);
459                 if (ret != 0) {
460                         DBG_ERR("Failed to build IPv6 neighbor advertisment\n");
461                         goto fail;
462                 }
463
464                 memcpy(&sall.sll_addr[0], ether_dhost, sall.sll_halen);
465
466                 ret = sendto(s,
467                              buffer,
468                              len,
469                              0,
470                              (struct sockaddr *)&sall,
471                              sizeof(sall));
472                 if (ret < 0 ) {
473                         ret = errno;
474                         DBG_ERR("Failed sendto\n");
475                         goto fail;
476                 }
477
478                 close(s);
479                 break;
480
481         default:
482                 ret = EINVAL;
483                 DBG_ERR("Not an ipv4/ipv6 address (family is %u)\n",
484                         addr->ip.sin_family);
485                 goto fail;
486         }
487
488         return 0;
489
490 fail:
491         close(s);
492         return ret;
493 }
494
495 #else /* HAVE_PACKETSOCKET */
496
497 int ctdb_sys_send_arp(const ctdb_sock_addr *addr, const char *iface)
498 {
499         /* Not implemented */
500         return ENOSYS;
501 }
502
503 #endif /* HAVE_PACKETSOCKET */
504
505
/*
 * Size of an IPv4/IPv6 header followed by a TCP header.  The
 * expansions are parenthesised so that they can be used safely inside
 * larger expressions.
 */
#define IP4_TCP_BUFFER_SIZE (sizeof(struct ip) + \
                             sizeof(struct tcphdr))

#define IP6_TCP_BUFFER_SIZE (sizeof(struct ip6_hdr) + \
                             sizeof(struct tcphdr))
511
512 static int tcp4_build(uint8_t *buf,
513                       size_t buflen,
514                       const struct sockaddr_in *src,
515                       const struct sockaddr_in *dst,
516                       uint32_t seq,
517                       uint32_t ack,
518                       int rst,
519                       size_t *len)
520 {
521         size_t l = IP4_TCP_BUFFER_SIZE;
522         struct {
523                 struct ip ip;
524                 struct tcphdr tcp;
525         } *ip4pkt;
526
527         if (l != sizeof(*ip4pkt)) {
528                 return EMSGSIZE;
529         }
530
531         if (buflen < l) {
532                 return EMSGSIZE;
533         }
534
535         ip4pkt = (void *)buf;
536         memset(ip4pkt, 0, l);
537
538         ip4pkt->ip.ip_v     = 4;
539         ip4pkt->ip.ip_hl    = sizeof(ip4pkt->ip)/sizeof(uint32_t);
540         ip4pkt->ip.ip_len   = htons(sizeof(ip4pkt));
541         ip4pkt->ip.ip_ttl   = 255;
542         ip4pkt->ip.ip_p     = IPPROTO_TCP;
543         ip4pkt->ip.ip_src.s_addr = src->sin_addr.s_addr;
544         ip4pkt->ip.ip_dst.s_addr = dst->sin_addr.s_addr;
545         ip4pkt->ip.ip_sum   = 0;
546
547         ip4pkt->tcp.th_sport = src->sin_port;
548         ip4pkt->tcp.th_dport = dst->sin_port;
549         ip4pkt->tcp.th_seq   = seq;
550         ip4pkt->tcp.th_ack   = ack;
551         ip4pkt->tcp.th_flags = 0;
552         ip4pkt->tcp.th_flags |= TH_ACK;
553         if (rst) {
554                 ip4pkt->tcp.th_flags |= TH_RST;
555         }
556         ip4pkt->tcp.th_off   = sizeof(ip4pkt->tcp)/sizeof(uint32_t);
557         /* this makes it easier to spot in a sniffer */
558         ip4pkt->tcp.th_win   = htons(1234);
559         ip4pkt->tcp.th_sum   = ip_checksum((uint16_t *)&ip4pkt->tcp,
560                                            sizeof(ip4pkt->tcp),
561                                            &ip4pkt->ip);
562
563         *len = l;
564         return 0;
565 }
566
567 static int tcp6_build(uint8_t *buf,
568                       size_t buflen,
569                       const struct sockaddr_in6 *src,
570                       const struct sockaddr_in6 *dst,
571                       uint32_t seq,
572                       uint32_t ack,
573                       int rst,
574                       size_t *len)
575 {
576         size_t l = IP6_TCP_BUFFER_SIZE;
577         struct {
578                 struct ip6_hdr ip6;
579                 struct tcphdr tcp;
580         } *ip6pkt;
581
582         if (l != sizeof(*ip6pkt)) {
583                 return EMSGSIZE;
584         }
585
586         if (buflen < l) {
587                 return EMSGSIZE;
588         }
589
590         ip6pkt = (void *)buf;
591         memset(ip6pkt, 0, l);
592
593         ip6pkt->ip6.ip6_vfc  = 6 << 4;
594         ip6pkt->ip6.ip6_plen = htons(sizeof(struct tcphdr));
595         ip6pkt->ip6.ip6_nxt  = IPPROTO_TCP;
596         ip6pkt->ip6.ip6_hlim = 64;
597         ip6pkt->ip6.ip6_src  = src->sin6_addr;
598         ip6pkt->ip6.ip6_dst  = dst->sin6_addr;
599
600         ip6pkt->tcp.th_sport = src->sin6_port;
601         ip6pkt->tcp.th_dport = dst->sin6_port;
602         ip6pkt->tcp.th_seq   = seq;
603         ip6pkt->tcp.th_ack   = ack;
604         ip6pkt->tcp.th_flags = 0;
605         ip6pkt->tcp.th_flags |= TH_ACK;
606         if (rst) {
607                 ip6pkt->tcp.th_flags |= TH_RST;
608         }
609         ip6pkt->tcp.th_off    = sizeof(ip6pkt->tcp)/sizeof(uint32_t);
610         /* this makes it easier to spot in a sniffer */
611         ip6pkt->tcp.th_win   = htons(1234);
612         ip6pkt->tcp.th_sum   = ip6_checksum((uint16_t *)&ip6pkt->tcp,
613                                             sizeof(ip6pkt->tcp),
614                                             &ip6pkt->ip6);
615
616         *len = l;
617         return 0;
618 }
619
620 /*
621  * Send tcp segment from the specified IP/port to the specified
622  * destination IP/port.
623  *
624  * This is used to trigger the receiving host into sending its own ACK,
625  * which should trigger early detection of TCP reset by the client
626  * after IP takeover
627  *
628  * This can also be used to send RST segments (if rst is true) and also
629  * if correct seq and ack numbers are provided.
630  */
631 int ctdb_sys_send_tcp(const ctdb_sock_addr *dest,
632                       const ctdb_sock_addr *src,
633                       uint32_t seq,
634                       uint32_t ack,
635                       int rst)
636 {
637         uint8_t buf[MAX(IP4_TCP_BUFFER_SIZE, IP6_TCP_BUFFER_SIZE)];
638         size_t len = 0;
639         int ret;
640         int s;
641         uint32_t one = 1;
642         struct sockaddr_in6 tmpdest = { 0 };
643         int saved_errno;
644
645         switch (src->ip.sin_family) {
646         case AF_INET:
647                 ret = tcp4_build(buf,
648                                  sizeof(buf),
649                                  &src->ip,
650                                  &dest->ip,
651                                  seq,
652                                  ack,
653                                  rst,
654                                  &len);
655                 if (ret != 0) {
656                         DBG_ERR("Failed to build TCP packet (%d)\n", ret);
657                         return ret;
658                 }
659
660                 /* open a raw socket to send this segment from */
661                 s = socket(AF_INET, SOCK_RAW, IPPROTO_RAW);
662                 if (s == -1) {
663                         DBG_ERR("Failed to open raw socket (%s)\n",
664                                 strerror(errno));
665                         return -1;
666                 }
667
668                 ret = setsockopt(s, IPPROTO_IP, IP_HDRINCL, &one, sizeof(one));
669                 if (ret != 0) {
670                         DBG_ERR("Failed to setup IP headers (%s)\n",
671                                 strerror(errno));
672                         close(s);
673                         return -1;
674                 }
675
676                 ret = sendto(s,
677                              buf,
678                              len,
679                              0,
680                              (const struct sockaddr *)&dest->ip,
681                              sizeof(dest->ip));
682                 saved_errno = errno;
683                 close(s);
684                 if (ret == -1) {
685                         D_ERR("Failed sendto (%s)\n", strerror(saved_errno));
686                         return -1;
687                 }
688                 if ((size_t)ret != len) {
689                         DBG_ERR("Failed sendto - didn't send full packet\n");
690                         return -1;
691                 }
692                 break;
693
694         case AF_INET6:
695                 ret = tcp6_build(buf,
696                                  sizeof(buf),
697                                  &src->ip6,
698                                  &dest->ip6,
699                                  seq,
700                                  ack,
701                                  rst,
702                                  &len);
703                 if (ret != 0) {
704                         DBG_ERR("Failed to build TCP packet (%d)\n", ret);
705                         return ret;
706                 }
707
708                 s = socket(AF_INET6, SOCK_RAW, IPPROTO_RAW);
709                 if (s == -1) {
710                         DBG_ERR("Failed to open sending socket\n");
711                         return -1;
712
713                 }
714                 /*
715                  * sendto() on an IPv6 raw socket requires the port to
716                  * be either 0 or a protocol value
717                  */
718                 tmpdest = dest->ip6;
719                 tmpdest.sin6_port = 0;
720
721                 ret = sendto(s,
722                              buf,
723                              len,
724                              0,
725                              (const struct sockaddr *)&tmpdest,
726                              sizeof(tmpdest));
727                 saved_errno = errno;
728                 close(s);
729                 if (ret == -1) {
730                         D_ERR("Failed sendto (%s)\n", strerror(saved_errno));
731                         return -1;
732                 }
733                 if ((size_t)ret != len) {
734                         DBG_ERR("Failed sendto - didn't send full packet\n");
735                         return -1;
736                 }
737                 break;
738
739         default:
740                 DBG_ERR("Not an ipv4/v6 address\n");
741                 return -1;
742         }
743
744         return 0;
745 }
746
747 /*
748  * Packet capture
749  *
750  * If AF_PACKET is available then use a raw socket otherwise use pcap.
751  * wscript has checked to make sure that pcap is available if needed.
752  */
753
/*
 * Extract connection details from an IPv4 TCP packet.
 *
 * ip_pkt points at the start of the IP header and pktlen is the
 * number of bytes available.  On success 0 is returned, src and dst
 * receive the addresses and ports, and *ack_seq, *seq and *window
 * receive the corresponding TCP header fields, all without byte-order
 * conversion.  *rst is non-zero if the RST flag is set.  window and
 * rst may be NULL.
 *
 * EMSGSIZE is returned if the packet is too short to contain the
 * required fields.  ENOMSG is returned if it is not an unfragmented
 * IPv4 TCP packet with a valid header length.
 */
static int tcp4_extract(const uint8_t *ip_pkt,
                        size_t pktlen,
                        struct sockaddr_in *src,
                        struct sockaddr_in *dst,
                        uint32_t *ack_seq,
                        uint32_t *seq,
                        int *rst,
                        uint16_t *window)
{
        const struct ip *ip;
        const struct tcphdr *tcp;
        size_t ip_hdrlen;

        if (pktlen < sizeof(struct ip)) {
                return EMSGSIZE;
        }

        ip = (const struct ip *)ip_pkt;

        /* IPv4 only */
        if (ip->ip_v != 4) {
                return ENOMSG;
        }
        /*
         * A header length shorter than the fixed header is invalid
         * and would place the TCP header inside the IP header
         */
        ip_hdrlen = ip->ip_hl * sizeof(uint32_t);
        if (ip_hdrlen < sizeof(struct ip)) {
                return ENOMSG;
        }
        /* Don't look at fragments */
        if ((ntohs(ip->ip_off)&0x1fff) != 0) {
                return ENOMSG;
        }
        /* TCP only */
        if (ip->ip_p != IPPROTO_TCP) {
                return ENOMSG;
        }

        /* Ensure there is enough of the packet to gather required fields */
        if (pktlen < ip_hdrlen + offsetof(struct tcphdr, th_sum)) {
                return EMSGSIZE;
        }

        tcp = (const struct tcphdr *)(ip_pkt + ip_hdrlen);

        src->sin_family      = AF_INET;
        src->sin_addr.s_addr = ip->ip_src.s_addr;
        src->sin_port        = tcp->th_sport;

        dst->sin_family      = AF_INET;
        dst->sin_addr.s_addr = ip->ip_dst.s_addr;
        dst->sin_port        = tcp->th_dport;

        *ack_seq             = tcp->th_ack;
        *seq                 = tcp->th_seq;
        if (window != NULL) {
                *window = tcp->th_win;
        }
        if (rst != NULL) {
                *rst = tcp->th_flags & TH_RST;
        }

        return 0;
}
812
/*
 * Extract connection details from an IPv6 TCP packet.
 *
 * ip_pkt points at the start of the IPv6 header and pktlen is the
 * number of bytes available.  The TCP header is expected directly
 * after the fixed IPv6 header.  On success 0 is returned, src and dst
 * receive the addresses and ports, and *ack_seq, *seq and *window
 * receive the corresponding TCP header fields, all without byte-order
 * conversion.  *rst is non-zero if the RST flag is set.  window and
 * rst may be NULL.
 *
 * EMSGSIZE is returned if the packet is too short and ENOMSG if it
 * is not IPv6 or its next header is not TCP.
 */
static int tcp6_extract(const uint8_t *ip_pkt,
                        size_t pktlen,
                        struct sockaddr_in6 *src,
                        struct sockaddr_in6 *dst,
                        uint32_t *ack_seq,
                        uint32_t *seq,
                        int *rst,
                        uint16_t *window)
{
        const size_t needed = sizeof(struct ip6_hdr) +
                              offsetof(struct tcphdr, th_sum);
        const struct ip6_hdr *hdr6 = (const struct ip6_hdr *)ip_pkt;
        const struct tcphdr *hdrtcp =
                (const struct tcphdr *)(ip_pkt + sizeof(struct ip6_hdr));

        /* Everything up to, but excluding, the TCP checksum is needed */
        if (pktlen < needed) {
                return EMSGSIZE;
        }

        /* Only IPv6 packets whose next header is TCP are of interest */
        if ((hdr6->ip6_vfc >> 4) != 6 || hdr6->ip6_nxt != IPPROTO_TCP) {
                return ENOMSG;
        }

        src->sin6_family = AF_INET6;
        src->sin6_addr   = hdr6->ip6_src;
        src->sin6_port   = hdrtcp->th_sport;

        dst->sin6_family = AF_INET6;
        dst->sin6_addr   = hdr6->ip6_dst;
        dst->sin6_port   = hdrtcp->th_dport;

        *seq     = hdrtcp->th_seq;
        *ack_seq = hdrtcp->th_ack;

        if (rst != NULL) {
                *rst = hdrtcp->th_flags & TH_RST;
        }
        if (window != NULL) {
                *window = hdrtcp->th_win;
        }

        return 0;
}
863
864
865 #ifdef HAVE_AF_PACKET
866
/*
 * Open a raw packet socket for capturing traffic.
 *
 * The socket receives all protocols, is set non-blocking and is
 * marked close-on-exec.  iface and private_data are not used by this
 * implementation.
 *
 * Returns the socket file descriptor, or -1 on failure.
 */
int ctdb_sys_open_capture_socket(const char *iface, void **private_data)
{
        int fd;

        /* Open a socket to capture all traffic */
        fd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
        if (fd == -1) {
                DBG_ERR("Failed to open raw socket\n");
                return -1;
        }

        DBG_DEBUG("Created RAW SOCKET FD:%d for tcp tickle\n", fd);

        if (set_blocking(fd, false) != 0) {
                DBG_ERR("Failed to set socket non-blocking (%s)\n",
                        strerror(errno));
                close(fd);
                return -1;
        }

        set_close_on_exec(fd);

        return fd;
}
895
/*
 * Hook for any extra cleanup needed when a capture socket is closed.
 *
 * Nothing is needed for this implementation, so private_data is
 * ignored and 0 is always returned.  Note that the socket itself is
 * closed automatically in the caller.
 */
int ctdb_sys_close_capture_socket(void *private_data)
{
        (void)private_data;

        return 0;
}
905
906
907 /*
908  * called when the raw socket becomes readable
909  */
910 int ctdb_sys_read_tcp_packet(int s, void *private_data,
911                              ctdb_sock_addr *src,
912                              ctdb_sock_addr *dst,
913                              uint32_t *ack_seq,
914                              uint32_t *seq,
915                              int *rst,
916                              uint16_t *window)
917 {
918         ssize_t nread;
919         uint8_t pkt[100]; /* Large enough for simple ACK/RST packets */
920         struct ether_header *eth;
921         int ret;
922
923         nread = recv(s, pkt, sizeof(pkt), MSG_TRUNC);
924         if (nread == -1) {
925                 return errno;
926         }
927         if ((size_t)nread < sizeof(*eth)) {
928                 return EMSGSIZE;
929         }
930
931         ZERO_STRUCTP(src);
932         ZERO_STRUCTP(dst);
933
934         /* Ethernet */
935         eth = (struct ether_header *)pkt;
936
937         /* we want either IPv4 or IPv6 */
938         if (ntohs(eth->ether_type) == ETHERTYPE_IP) {
939                 ret = tcp4_extract(pkt + sizeof(struct ether_header),
940                                    (size_t)nread - sizeof(struct ether_header),
941                                    &src->ip,
942                                    &dst->ip,
943                                    ack_seq,
944                                    seq,
945                                    rst,
946                                    window);
947                 return ret;
948
949         } else if (ntohs(eth->ether_type) == ETHERTYPE_IP6) {
950                 ret = tcp6_extract(pkt + sizeof(struct ether_header),
951                                    (size_t)nread - sizeof(struct ether_header),
952                                    &src->ip6,
953                                    &dst->ip6,
954                                    ack_seq,
955                                    seq,
956                                    rst,
957                                    window);
958                 return ret;
959         }
960
961         return ENOMSG;
962 }
963
964 #else /* HAVE_AF_PACKET */
965
966 #include <pcap.h>
967
968 int ctdb_sys_open_capture_socket(const char *iface, void **private_data)
969 {
970         pcap_t *pt;
971
972         pt=pcap_open_live(iface, 100, 0, 0, NULL);
973         if (pt == NULL) {
974                 DBG_ERR("Failed to open capture device %s\n", iface);
975                 return -1;
976         }
977         *((pcap_t **)private_data) = pt;
978
979         return pcap_fileno(pt);
980 }
981
982 int ctdb_sys_close_capture_socket(void *private_data)
983 {
984         pcap_t *pt = (pcap_t *)private_data;
985         pcap_close(pt);
986         return 0;
987 }
988
989 int ctdb_sys_read_tcp_packet(int s,
990                              void *private_data,
991                              ctdb_sock_addr *src,
992                              ctdb_sock_addr *dst,
993                              uint32_t *ack_seq,
994                              uint32_t *seq,
995                              int *rst,
996                              uint16_t *window)
997 {
998         int ret;
999         struct ether_header *eth;
1000         struct pcap_pkthdr pkthdr;
1001         const u_char *buffer;
1002         pcap_t *pt = (pcap_t *)private_data;
1003
1004         buffer=pcap_next(pt, &pkthdr);
1005         if (buffer==NULL) {
1006                 return ENOMSG;
1007         }
1008
1009         ZERO_STRUCTP(src);
1010         ZERO_STRUCTP(dst);
1011
1012         /* Ethernet */
1013         eth = (struct ether_header *)buffer;
1014
1015         /* we want either IPv4 or IPv6 */
1016         if (eth->ether_type == htons(ETHERTYPE_IP)) {
1017                 ret = tcp4_extract(buffer + sizeof(struct ether_header),
1018                                    (size_t)(pkthdr.caplen -
1019                                             sizeof(struct ether_header)),
1020                                    &src->ip,
1021                                    &dst->ip,
1022                                    ack_seq,
1023                                    seq,
1024                                    rst,
1025                                    window);
1026                 return ret;
1027
1028         } else if (eth->ether_type == htons(ETHERTYPE_IP6)) {
1029                 ret = tcp6_extract(buffer + sizeof(struct ether_header),
1030                                    (size_t)(pkthdr.caplen -
1031                                             sizeof(struct ether_header)),
1032                                    &src->ip6,
1033                                    &dst->ip6,
1034                                    ack_seq,
1035                                    seq,
1036                                    rst,
1037                                    window);
1038                 return ret;
1039         }
1040
1041         return ENOMSG;
1042 }
1043
1044 #endif /* HAVE_AF_PACKET */