Merge commit 'rusty/ports-from-1.0.112' into foo
[samba.git] / ctdb / common / system_linux.c
1 /* 
2    ctdb system specific code to manage raw sockets on linux
3
4    Copyright (C) Ronnie Sahlberg  2007
5    Copyright (C) Andrew Tridgell  2007
6
7    This program is free software; you can redistribute it and/or modify
8    it under the terms of the GNU General Public License as published by
9    the Free Software Foundation; either version 3 of the License, or
10    (at your option) any later version.
11    
12    This program is distributed in the hope that it will be useful,
13    but WITHOUT ANY WARRANTY; without even the implied warranty of
14    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15    GNU General Public License for more details.
16    
17    You should have received a copy of the GNU General Public License
18    along with this program; if not, see <http://www.gnu.org/licenses/>.
19 */
20
21 #include "includes.h"
22 #include "system/network.h"
23 #include "system/filesys.h"
24 #include "system/wait.h"
25 #include "../include/ctdb_private.h"
26 #include "lib/tevent/tevent.h"
27 #include <netinet/if_ether.h>
28 #include <netinet/ip6.h>
29 #include <netinet/icmp6.h>
30 #include <net/if_arp.h>
31 #include <netpacket/packet.h>
32
33 #ifndef ETHERTYPE_IP6
34 #define ETHERTYPE_IP6 0x86dd
35 #endif
36
/*
  Checksum of an upper-layer payload carried over IPv6.

  The sum covers the IPv6 pseudo-header fields (source address,
  destination address, payload length n and the next-header value taken
  from ip6) followed by the n bytes at data.  Despite the name it is
  used in this file for ICMPv6 messages as well as TCP segments.

  The result is returned ready to be stored in the checksum field; a
  computed value of 0 is reported as 0xFFFF.
*/
static uint16_t tcp_checksum6(uint16_t *data, size_t n, struct ip6_hdr *ip6)
{
	uint32_t pseudo[2];
	uint32_t acc;
	uint16_t folded;

	/* length and next-header words of the pseudo-header */
	pseudo[0] = htonl(n);
	pseudo[1] = htonl(ip6->ip6_nxt);

	acc  = uint16_checksum((uint16_t *)(void *)&ip6->ip6_src, 16);
	acc += uint16_checksum((uint16_t *)(void *)&ip6->ip6_dst, 16);
	acc += uint16_checksum((uint16_t *)pseudo, 8);
	acc += uint16_checksum(data, n);

	/* fold the carries back into the low 16 bits (two rounds) */
	acc = (acc >> 16) + (acc & 0xFFFF);
	acc = (acc >> 16) + (acc & 0xFFFF);

	folded = (uint16_t)~htons(acc);

	return (folded == 0) ? 0xFFFF : folded;
}
64
65 /*
66   send gratuitous arp reply after we have taken over an ip address
67
68   saddr is the address we are trying to claim
69   iface is the interface name we will be using to claim the address
70  */
71 int ctdb_sys_send_arp(const ctdb_sock_addr *addr, const char *iface)
72 {
73         int s, ret;
74         struct sockaddr_ll sall;
75         struct ether_header *eh;
76         struct arphdr *ah;
77         struct ip6_hdr *ip6;
78         struct icmp6_hdr *icmp6;
79         struct ifreq if_hwaddr;
80         unsigned char buffer[78]; /* ipv6 neigh solicitation size */
81         char *ptr;
82         char bdcast[] = {0xff,0xff,0xff,0xff,0xff,0xff};
83         struct ifreq ifr;
84
85         ZERO_STRUCT(sall);
86
87         switch (addr->ip.sin_family) {
88         case AF_INET:
89                 s = socket(PF_PACKET, SOCK_RAW, htons(ETHERTYPE_ARP));
90                 if (s == -1){
91                         DEBUG(DEBUG_CRIT,(__location__ " failed to open raw socket\n"));
92                         return -1;
93                 }
94
95                 DEBUG(DEBUG_DEBUG, (__location__ " Created SOCKET FD:%d for sending arp\n", s));
96                 strncpy(ifr.ifr_name, iface, sizeof(ifr.ifr_name));
97                 if (ioctl(s, SIOCGIFINDEX, &ifr) < 0) {
98                         DEBUG(DEBUG_CRIT,(__location__ " interface '%s' not found\n", iface));
99                         close(s);
100                         return -1;
101                 }
102
103                 /* get the mac address */
104                 strcpy(if_hwaddr.ifr_name, iface);
105                 ret = ioctl(s, SIOCGIFHWADDR, &if_hwaddr);
106                 if ( ret < 0 ) {
107                         close(s);
108                         DEBUG(DEBUG_CRIT,(__location__ " ioctl failed\n"));
109                         return -1;
110                 }
111                 if (ARPHRD_LOOPBACK == if_hwaddr.ifr_hwaddr.sa_family) {
112                         DEBUG(DEBUG_DEBUG,("Ignoring loopback arp request\n"));
113                         close(s);
114                         return 0;
115                 }
116                 if (if_hwaddr.ifr_hwaddr.sa_family != AF_LOCAL) {
117                         close(s);
118                         errno = EINVAL;
119                         DEBUG(DEBUG_CRIT,(__location__ " not an ethernet address family (0x%x)\n",
120                                  if_hwaddr.ifr_hwaddr.sa_family));
121                         return -1;
122                 }
123
124
125                 memset(buffer, 0 , 64);
126                 eh = (struct ether_header *)buffer;
127                 memset(eh->ether_dhost, 0xff, ETH_ALEN);
128                 memcpy(eh->ether_shost, if_hwaddr.ifr_hwaddr.sa_data, ETH_ALEN);
129                 eh->ether_type = htons(ETHERTYPE_ARP);
130         
131                 ah = (struct arphdr *)&buffer[sizeof(struct ether_header)];
132                 ah->ar_hrd = htons(ARPHRD_ETHER);
133                 ah->ar_pro = htons(ETH_P_IP);
134                 ah->ar_hln = ETH_ALEN;
135                 ah->ar_pln = 4;
136
137                 /* send a gratious arp */
138                 ah->ar_op  = htons(ARPOP_REQUEST);
139                 ptr = (char *)&ah[1];
140                 memcpy(ptr, if_hwaddr.ifr_hwaddr.sa_data, ETH_ALEN);
141                 ptr+=ETH_ALEN;
142                 memcpy(ptr, &addr->ip.sin_addr, 4);       
143                 ptr+=4;
144                 memset(ptr, 0, ETH_ALEN); 
145                 ptr+=ETH_ALEN;
146                 memcpy(ptr, &addr->ip.sin_addr, 4);       
147                 ptr+=4;
148         
149                 sall.sll_family = AF_PACKET;
150                 sall.sll_halen = 6;
151                 memcpy(&sall.sll_addr[0], bdcast, sall.sll_halen);
152                 sall.sll_protocol = htons(ETH_P_ALL);
153                 sall.sll_ifindex = ifr.ifr_ifindex;
154                 ret = sendto(s, buffer, 64, 0, (struct sockaddr *)&sall, sizeof(sall));
155                 if (ret < 0 ){
156                         close(s);
157                         DEBUG(DEBUG_CRIT,(__location__ " failed sendto\n"));
158                         return -1;
159                 }       
160
161                 /* send unsolicited arp reply broadcast */
162                 ah->ar_op  = htons(ARPOP_REPLY);
163                 ptr = (char *)&ah[1];
164                 memcpy(ptr, if_hwaddr.ifr_hwaddr.sa_data, ETH_ALEN);
165                 ptr+=ETH_ALEN;
166                 memcpy(ptr, &addr->ip.sin_addr, 4);       
167                 ptr+=4;
168                 memcpy(ptr, if_hwaddr.ifr_hwaddr.sa_data, ETH_ALEN);
169                 ptr+=ETH_ALEN;
170                 memcpy(ptr, &addr->ip.sin_addr, 4);       
171                 ptr+=4;
172
173                 ret = sendto(s, buffer, 64, 0, (struct sockaddr *)&sall, sizeof(sall));
174                 if (ret < 0 ){
175                         DEBUG(DEBUG_CRIT,(__location__ " failed sendto\n"));
176                         close(s);
177                         return -1;
178                 }
179
180                 close(s);
181                 break;
182         case AF_INET6:
183                 s = socket(PF_PACKET, SOCK_RAW, htons(ETHERTYPE_ARP));
184                 if (s == -1){
185                         DEBUG(DEBUG_CRIT,(__location__ " failed to open raw socket\n"));
186                         return -1;
187                 }
188
189                 DEBUG(DEBUG_DEBUG, (__location__ " Created SOCKET FD:%d for sending arp\n", s));
190                 strncpy(ifr.ifr_name, iface, sizeof(ifr.ifr_name));
191                 if (ioctl(s, SIOCGIFINDEX, &ifr) < 0) {
192                         DEBUG(DEBUG_CRIT,(__location__ " interface '%s' not found\n", iface));
193                         close(s);
194                         return -1;
195                 }
196
197                 /* get the mac address */
198                 strcpy(if_hwaddr.ifr_name, iface);
199                 ret = ioctl(s, SIOCGIFHWADDR, &if_hwaddr);
200                 if ( ret < 0 ) {
201                         close(s);
202                         DEBUG(DEBUG_CRIT,(__location__ " ioctl failed\n"));
203                         return -1;
204                 }
205                 if (ARPHRD_LOOPBACK == if_hwaddr.ifr_hwaddr.sa_family) {
206                         DEBUG(DEBUG_DEBUG,("Ignoring loopback arp request\n"));
207                         close(s);
208                         return 0;
209                 }
210                 if (if_hwaddr.ifr_hwaddr.sa_family != AF_LOCAL) {
211                         close(s);
212                         errno = EINVAL;
213                         DEBUG(DEBUG_CRIT,(__location__ " not an ethernet address family (0x%x)\n",
214                                  if_hwaddr.ifr_hwaddr.sa_family));
215                         return -1;
216                 }
217
218                 memset(buffer, 0 , sizeof(buffer));
219                 eh = (struct ether_header *)buffer;
220                 memset(eh->ether_dhost, 0xff, ETH_ALEN);
221                 memcpy(eh->ether_shost, if_hwaddr.ifr_hwaddr.sa_data, ETH_ALEN);
222                 eh->ether_type = htons(ETHERTYPE_IP6);
223
224                 ip6 = (struct ip6_hdr *)(eh+1);
225                 ip6->ip6_vfc  = 0x60;
226                 ip6->ip6_plen = htons(24);
227                 ip6->ip6_nxt  = IPPROTO_ICMPV6;
228                 ip6->ip6_hlim = 255;
229                 ip6->ip6_dst  = addr->ip6.sin6_addr;
230
231                 icmp6 = (struct icmp6_hdr *)(ip6+1);
232                 icmp6->icmp6_type = ND_NEIGHBOR_SOLICIT;
233                 icmp6->icmp6_code = 0;
234                 memcpy(&icmp6->icmp6_data32[1], &addr->ip6.sin6_addr, 16);
235
236                 icmp6->icmp6_cksum = tcp_checksum6((uint16_t *)icmp6, ntohs(ip6->ip6_plen), ip6);
237
238                 sall.sll_family = AF_PACKET;
239                 sall.sll_halen = 6;
240                 memcpy(&sall.sll_addr[0], bdcast, sall.sll_halen);
241                 sall.sll_protocol = htons(ETH_P_ALL);
242                 sall.sll_ifindex = ifr.ifr_ifindex;
243                 ret = sendto(s, buffer, 78, 0, (struct sockaddr *)&sall, sizeof(sall));
244                 if (ret < 0 ){
245                         close(s);
246                         DEBUG(DEBUG_CRIT,(__location__ " failed sendto\n"));
247                         return -1;
248                 }       
249
250                 close(s);
251                 break;
252         default:
253                 DEBUG(DEBUG_CRIT,(__location__ " not an ipv4/ipv6 address (family is %u)\n", addr->ip.sin_family));
254                 return -1;
255         }
256
257         return 0;
258 }
259
260
/*
  simple TCP checksum for a segment carried over IPv4 - assumes data is
  a multiple of 2 bytes long

  The sum covers the n bytes at data plus the source address,
  destination address, protocol number and length n taken from ip.
  A computed value of 0 is reported as 0xFFFF.
 */
static uint16_t tcp_checksum(uint16_t *data, size_t n, struct iphdr *ip)
{
	uint32_t acc;
	uint16_t folded;

	acc  = uint16_checksum(data, n);
	acc += uint16_checksum((uint16_t *)(void *)&ip->saddr,
			       sizeof(ip->saddr));
	acc += uint16_checksum((uint16_t *)(void *)&ip->daddr,
			       sizeof(ip->daddr));
	/* remaining pseudo-header fields: protocol and segment length */
	acc += ip->protocol + n;

	/* fold the carries back into the low 16 bits (two rounds) */
	acc = (acc >> 16) + (acc & 0xFFFF);
	acc = (acc >> 16) + (acc & 0xFFFF);

	folded = (uint16_t)~htons(acc);

	return (folded == 0) ? 0xFFFF : folded;
}
282
283 /*
284   Send tcp segment from the specified IP/port to the specified
285   destination IP/port. 
286
287   This is used to trigger the receiving host into sending its own ACK,
288   which should trigger early detection of TCP reset by the client
289   after IP takeover
290
291   This can also be used to send RST segments (if rst is true) and also
292   if correct seq and ack numbers are provided.
293  */
294 int ctdb_sys_send_tcp(const ctdb_sock_addr *dest, 
295                       const ctdb_sock_addr *src,
296                       uint32_t seq, uint32_t ack, int rst)
297 {
298         int s;
299         int ret;
300         uint32_t one = 1;
301         uint16_t tmpport;
302         ctdb_sock_addr *tmpdest;
303         struct {
304                 struct iphdr ip;
305                 struct tcphdr tcp;
306         } ip4pkt;
307         struct {
308                 struct ip6_hdr ip6;
309                 struct tcphdr tcp;
310         } ip6pkt;
311
312         switch (src->ip.sin_family) {
313         case AF_INET:
314                 ZERO_STRUCT(ip4pkt);
315                 ip4pkt.ip.version  = 4;
316                 ip4pkt.ip.ihl      = sizeof(ip4pkt.ip)/4;
317                 ip4pkt.ip.tot_len  = htons(sizeof(ip4pkt));
318                 ip4pkt.ip.ttl      = 255;
319                 ip4pkt.ip.protocol = IPPROTO_TCP;
320                 ip4pkt.ip.saddr    = src->ip.sin_addr.s_addr;
321                 ip4pkt.ip.daddr    = dest->ip.sin_addr.s_addr;
322                 ip4pkt.ip.check    = 0;
323
324                 ip4pkt.tcp.source   = src->ip.sin_port;
325                 ip4pkt.tcp.dest     = dest->ip.sin_port;
326                 ip4pkt.tcp.seq      = seq;
327                 ip4pkt.tcp.ack_seq  = ack;
328                 ip4pkt.tcp.ack      = 1;
329                 if (rst) {
330                         ip4pkt.tcp.rst      = 1;
331                 }
332                 ip4pkt.tcp.doff     = sizeof(ip4pkt.tcp)/4;
333                 /* this makes it easier to spot in a sniffer */
334                 ip4pkt.tcp.window   = htons(1234);
335                 ip4pkt.tcp.check    = tcp_checksum((uint16_t *)&ip4pkt.tcp, sizeof(ip4pkt.tcp), &ip4pkt.ip);
336
337                 /* open a raw socket to send this segment from */
338                 s = socket(AF_INET, SOCK_RAW, htons(IPPROTO_RAW));
339                 if (s == -1) {
340                         DEBUG(DEBUG_CRIT,(__location__ " failed to open raw socket (%s)\n",
341                                  strerror(errno)));
342                         return -1;
343                 }
344
345                 ret = setsockopt(s, SOL_IP, IP_HDRINCL, &one, sizeof(one));
346                 if (ret != 0) {
347                         DEBUG(DEBUG_CRIT,(__location__ " failed to setup IP headers (%s)\n",
348                                  strerror(errno)));
349                         close(s);
350                         return -1;
351                 }
352
353                 set_nonblocking(s);
354                 set_close_on_exec(s);
355
356                 ret = sendto(s, &ip4pkt, sizeof(ip4pkt), 0, &dest->ip, sizeof(dest->ip));
357                 close(s);
358                 if (ret != sizeof(ip4pkt)) {
359                         DEBUG(DEBUG_CRIT,(__location__ " failed sendto (%s)\n", strerror(errno)));
360                         return -1;
361                 }
362                 break;
363         case AF_INET6:
364                 ZERO_STRUCT(ip6pkt);
365                 ip6pkt.ip6.ip6_vfc  = 0x60;
366                 ip6pkt.ip6.ip6_plen = htons(20);
367                 ip6pkt.ip6.ip6_nxt  = IPPROTO_TCP;
368                 ip6pkt.ip6.ip6_hlim = 64;
369                 ip6pkt.ip6.ip6_src  = src->ip6.sin6_addr;
370                 ip6pkt.ip6.ip6_dst  = dest->ip6.sin6_addr;
371
372                 ip6pkt.tcp.source   = src->ip6.sin6_port;
373                 ip6pkt.tcp.dest     = dest->ip6.sin6_port;
374                 ip6pkt.tcp.seq      = seq;
375                 ip6pkt.tcp.ack_seq  = ack;
376                 ip6pkt.tcp.ack      = 1;
377                 if (rst) {
378                         ip6pkt.tcp.rst      = 1;
379                 }
380                 ip6pkt.tcp.doff     = sizeof(ip6pkt.tcp)/4;
381                 /* this makes it easier to spot in a sniffer */
382                 ip6pkt.tcp.window   = htons(1234);
383                 ip6pkt.tcp.check    = tcp_checksum6((uint16_t *)&ip6pkt.tcp, sizeof(ip6pkt.tcp), &ip6pkt.ip6);
384
385                 s = socket(PF_INET6, SOCK_RAW, IPPROTO_RAW);
386                 if (s == -1) {
387                         DEBUG(DEBUG_CRIT, (__location__ " Failed to open sending socket\n"));
388                         return -1;
389
390                 }
391                 /* sendto() dont like if the port is set and the socket is
392                    in raw mode.
393                 */
394                 tmpdest = discard_const(dest);
395                 tmpport = tmpdest->ip6.sin6_port;
396
397                 tmpdest->ip6.sin6_port = 0;
398                 ret = sendto(s, &ip6pkt, sizeof(ip6pkt), 0, &dest->ip6, sizeof(dest->ip6));
399                 tmpdest->ip6.sin6_port = tmpport;
400                 close(s);
401
402                 if (ret != sizeof(ip6pkt)) {
403                         DEBUG(DEBUG_CRIT,(__location__ " failed sendto (%s)\n", strerror(errno)));
404                         return -1;
405                 }
406                 break;
407
408         default:
409                 DEBUG(DEBUG_CRIT,(__location__ " not an ipv4/v6 address\n"));
410                 return -1;
411         }
412
413         return 0;
414 }
415
416 /* 
417    This function is used to open a raw socket to capture from
418  */
419 int ctdb_sys_open_capture_socket(const char *iface, void **private_data)
420 {
421         int s;
422
423         /* Open a socket to capture all traffic */
424         s = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
425         if (s == -1) {
426                 DEBUG(DEBUG_CRIT,(__location__ " failed to open raw socket\n"));
427                 return -1;
428         }
429
430         DEBUG(DEBUG_DEBUG, (__location__ " Created RAW SOCKET FD:%d for tcp tickle\n", s));
431
432         set_nonblocking(s);
433         set_close_on_exec(s);
434
435         return s;
436 }
437
/*
   Hook for any additional cleanup required when closing a capture
   socket.  Note that the socket itself is closed automatically in the
   caller.

   This implementation keeps no per-socket state, so private_data is
   ignored and the call always returns 0.
 */
int ctdb_sys_close_capture_socket(void *private_data)
{
	(void)private_data;	/* nothing to release here */

	return 0;
}
447
448
449 /*
450   called when the raw socket becomes readable
451  */
452 int ctdb_sys_read_tcp_packet(int s, void *private_data, 
453                         ctdb_sock_addr *src, ctdb_sock_addr *dst,
454                         uint32_t *ack_seq, uint32_t *seq)
455 {
456         int ret;
457 #define RCVPKTSIZE 100
458         char pkt[RCVPKTSIZE];
459         struct ether_header *eth;
460         struct iphdr *ip;
461         struct ip6_hdr *ip6;
462         struct tcphdr *tcp;
463
464         ret = recv(s, pkt, RCVPKTSIZE, MSG_TRUNC);
465         if (ret < sizeof(*eth)+sizeof(*ip)) {
466                 return -1;
467         }
468
469         /* Ethernet */
470         eth = (struct ether_header *)pkt;
471
472         /* we want either IPv4 or IPv6 */
473         if (ntohs(eth->ether_type) == ETHERTYPE_IP) {
474                 /* IP */
475                 ip = (struct iphdr *)(eth+1);
476
477                 /* We only want IPv4 packets */
478                 if (ip->version != 4) {
479                         return -1;
480                 }
481                 /* Dont look at fragments */
482                 if ((ntohs(ip->frag_off)&0x1fff) != 0) {
483                         return -1;
484                 }
485                 /* we only want TCP */
486                 if (ip->protocol != IPPROTO_TCP) {
487                         return -1;
488                 }
489
490                 /* make sure its not a short packet */
491                 if (offsetof(struct tcphdr, ack_seq) + 4 + 
492                     (ip->ihl*4) + sizeof(*eth) > ret) {
493                         return -1;
494                 }
495                 /* TCP */
496                 tcp = (struct tcphdr *)((ip->ihl*4) + (char *)ip);
497
498                 /* tell the caller which one we've found */
499                 src->ip.sin_family      = AF_INET;
500                 src->ip.sin_addr.s_addr = ip->saddr;
501                 src->ip.sin_port        = tcp->source;
502                 dst->ip.sin_family      = AF_INET;
503                 dst->ip.sin_addr.s_addr = ip->daddr;
504                 dst->ip.sin_port        = tcp->dest;
505                 *ack_seq                = tcp->ack_seq;
506                 *seq                    = tcp->seq;
507
508                 return 0;
509         } else if (ntohs(eth->ether_type) == ETHERTYPE_IP6) {
510                 /* IP6 */
511                 ip6 = (struct ip6_hdr *)(eth+1);
512
513                 /* we only want TCP */
514                 if (ip6->ip6_nxt != IPPROTO_TCP) {
515                         return -1;
516                 }
517
518                 /* TCP */
519                 tcp = (struct tcphdr *)(ip6+1);
520
521                 /* tell the caller which one we've found */
522                 src->ip6.sin6_family = AF_INET6;
523                 src->ip6.sin6_port   = tcp->source;
524                 src->ip6.sin6_addr   = ip6->ip6_src;
525
526                 dst->ip6.sin6_family = AF_INET6;
527                 dst->ip6.sin6_port   = tcp->dest;
528                 dst->ip6.sin6_addr   = ip6->ip6_dst;
529
530                 *ack_seq             = tcp->ack_seq;
531                 *seq                 = tcp->seq;
532
533                 return 0;
534         }
535
536         return -1;
537 }
538
539