ctdb_sys_have_ip: don't overwrite input data (setting port to 0)
[ambi/samba-autobuild/.git] / ctdb / common / system_linux.c
1 /* 
2    ctdb system specific code to manage raw sockets on linux
3
4    Copyright (C) Ronnie Sahlberg  2007
5    Copyright (C) Andrew Tridgell  2007
6
7    This program is free software; you can redistribute it and/or modify
8    it under the terms of the GNU General Public License as published by
9    the Free Software Foundation; either version 3 of the License, or
10    (at your option) any later version.
11    
12    This program is distributed in the hope that it will be useful,
13    but WITHOUT ANY WARRANTY; without even the implied warranty of
14    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15    GNU General Public License for more details.
16    
17    You should have received a copy of the GNU General Public License
18    along with this program; if not, see <http://www.gnu.org/licenses/>.
19 */
20
21 #include "includes.h"
22 #include "system/network.h"
23 #include "system/filesys.h"
24 #include "system/wait.h"
25 #include "../include/ctdb_private.h"
26 #include "lib/events/events.h"
27 #include <netinet/if_ether.h>
28 #include <netinet/ip6.h>
29 #include <netinet/icmp6.h>
30 #include <net/if_arp.h>
31
32
33 #ifndef ETHERTYPE_IP6
34 #define ETHERTYPE_IP6 0x86dd
35 #endif
36
/*
  Sum n bytes as a sequence of 16-bit words in network (big-endian) order.

  data points at the first byte to sum, n is the number of bytes.  A
  trailing odd byte is treated as the high-order byte of a final word
  whose low-order byte is zero.

  Returns the raw 32-bit sum; folding the carries and complementing the
  result is left to the caller.
 */
static uint32_t uint16_checksum(uint16_t *data, size_t n)
{
        /*
          Read byte by byte: this makes no assumption about the alignment
          of data and gives the same result on any host byte order.
         */
        const uint8_t *p = (const uint8_t *)data;
        uint32_t sum = 0;

        while (n >= 2) {
                sum += ((uint32_t)p[0] << 8) | (uint32_t)p[1];
                p += 2;
                n -= 2;
        }
        if (n == 1) {
                /* pad the odd byte with a zero low-order byte */
                sum += (uint32_t)p[0] << 8;
        }
        return sum;
}
53
/*
  Checksum for an upper-layer payload carried over IPv6.

  data/n describe the payload (header plus any data) and ip6 supplies the
  source address, destination address and next-header value that make up
  the pseudo-header.  Used in this file for both TCP and ICMPv6.

  Returns the checksum ready to be stored in the packet; a computed value
  of 0 is returned as 0xFFFF.
 */
static uint16_t tcp_checksum6(uint16_t *data, size_t n, struct ip6_hdr *ip6)
{
        /* tail of the pseudo-header: upper-layer length, then next header */
        uint32_t pseudo_tail[2];
        uint32_t acc;
        uint16_t result;
        int pass;

        pseudo_tail[0] = htonl(n);
        pseudo_tail[1] = htonl(ip6->ip6_nxt);

        acc  = uint16_checksum((uint16_t *)(void *)&ip6->ip6_src, 16);
        acc += uint16_checksum((uint16_t *)(void *)&ip6->ip6_dst, 16);
        acc += uint16_checksum((uint16_t *)pseudo_tail, 8);
        acc += uint16_checksum(data, n);

        /* fold the carries back into the low 16 bits, twice */
        for (pass = 0; pass < 2; pass++) {
                acc = (acc & 0xFFFF) + (acc >> 16);
        }

        result = htons(acc);
        result = ~result;

        return (result == 0) ? 0xFFFF : result;
}
81
82 /*
83   send gratuitous arp reply after we have taken over an ip address
84
85   saddr is the address we are trying to claim
86   iface is the interface name we will be using to claim the address
87  */
88 int ctdb_sys_send_arp(const ctdb_sock_addr *addr, const char *iface)
89 {
90         int s, ret;
91         struct sockaddr sa;
92         struct ether_header *eh;
93         struct arphdr *ah;
94         struct ip6_hdr *ip6;
95         struct icmp6_hdr *icmp6;
96         struct ifreq if_hwaddr;
97         unsigned char buffer[78]; /* ipv6 neigh solicitation size */
98         char *ptr;
99
100         ZERO_STRUCT(sa);
101
102         switch (addr->ip.sin_family) {
103         case AF_INET:
104                 s = socket(AF_INET, SOCK_PACKET, htons(ETHERTYPE_ARP));
105                 if (s == -1){
106                         DEBUG(DEBUG_CRIT,(__location__ " failed to open raw socket\n"));
107                         return -1;
108                 }
109
110                 /* get the mac address */
111                 strcpy(if_hwaddr.ifr_name, iface);
112                 ret = ioctl(s, SIOCGIFHWADDR, &if_hwaddr);
113                 if ( ret < 0 ) {
114                         close(s);
115                         DEBUG(DEBUG_CRIT,(__location__ " ioctl failed\n"));
116                         return -1;
117                 }
118                 if (ARPHRD_LOOPBACK == if_hwaddr.ifr_hwaddr.sa_family) {
119                         DEBUG(DEBUG_DEBUG,("Ignoring loopback arp request\n"));
120                         close(s);
121                         return 0;
122                 }
123                 if (if_hwaddr.ifr_hwaddr.sa_family != AF_LOCAL) {
124                         close(s);
125                         errno = EINVAL;
126                         DEBUG(DEBUG_CRIT,(__location__ " not an ethernet address family (0x%x)\n",
127                                  if_hwaddr.ifr_hwaddr.sa_family));
128                         return -1;
129                 }
130
131
132                 memset(buffer, 0 , 64);
133                 eh = (struct ether_header *)buffer;
134                 memset(eh->ether_dhost, 0xff, ETH_ALEN);
135                 memcpy(eh->ether_shost, if_hwaddr.ifr_hwaddr.sa_data, ETH_ALEN);
136                 eh->ether_type = htons(ETHERTYPE_ARP);
137         
138                 ah = (struct arphdr *)&buffer[sizeof(struct ether_header)];
139                 ah->ar_hrd = htons(ARPHRD_ETHER);
140                 ah->ar_pro = htons(ETH_P_IP);
141                 ah->ar_hln = ETH_ALEN;
142                 ah->ar_pln = 4;
143
144                 /* send a gratious arp */
145                 ah->ar_op  = htons(ARPOP_REQUEST);
146                 ptr = (char *)&ah[1];
147                 memcpy(ptr, if_hwaddr.ifr_hwaddr.sa_data, ETH_ALEN);
148                 ptr+=ETH_ALEN;
149                 memcpy(ptr, &addr->ip.sin_addr, 4);       
150                 ptr+=4;
151                 memset(ptr, 0, ETH_ALEN); 
152                 ptr+=ETH_ALEN;
153                 memcpy(ptr, &addr->ip.sin_addr, 4);       
154                 ptr+=4;
155         
156                 strncpy(sa.sa_data, iface, sizeof(sa.sa_data));
157                 ret = sendto(s, buffer, 64, 0, &sa, sizeof(sa));
158                 if (ret < 0 ){
159                         close(s);
160                         DEBUG(DEBUG_CRIT,(__location__ " failed sendto\n"));
161                         return -1;
162                 }       
163
164                 /* send unsolicited arp reply broadcast */
165                 ah->ar_op  = htons(ARPOP_REPLY);
166                 ptr = (char *)&ah[1];
167                 memcpy(ptr, if_hwaddr.ifr_hwaddr.sa_data, ETH_ALEN);
168                 ptr+=ETH_ALEN;
169                 memcpy(ptr, &addr->ip.sin_addr, 4);       
170                 ptr+=4;
171                 memcpy(ptr, if_hwaddr.ifr_hwaddr.sa_data, ETH_ALEN);
172                 ptr+=ETH_ALEN;
173                 memcpy(ptr, &addr->ip.sin_addr, 4);       
174                 ptr+=4;
175
176                 strncpy(sa.sa_data, iface, sizeof(sa.sa_data));
177                 ret = sendto(s, buffer, 64, 0, &sa, sizeof(sa));
178                 if (ret < 0 ){
179                         DEBUG(DEBUG_CRIT,(__location__ " failed sendto\n"));
180                         return -1;
181                 }
182
183                 close(s);
184                 break;
185         case AF_INET6:
186                 s = socket(AF_INET, SOCK_PACKET, htons(ETHERTYPE_IP6));
187                 if (s == -1){
188                         DEBUG(DEBUG_CRIT,(__location__ " failed to open raw socket\n"));
189                         return -1;
190                 }
191
192                 /* get the mac address */
193                 strcpy(if_hwaddr.ifr_name, iface);
194                 ret = ioctl(s, SIOCGIFHWADDR, &if_hwaddr);
195                 if ( ret < 0 ) {
196                         close(s);
197                         DEBUG(DEBUG_CRIT,(__location__ " ioctl failed\n"));
198                         return -1;
199                 }
200                 if (ARPHRD_LOOPBACK == if_hwaddr.ifr_hwaddr.sa_family) {
201                         DEBUG(DEBUG_DEBUG,("Ignoring loopback arp request\n"));
202                         close(s);
203                         return 0;
204                 }
205                 if (if_hwaddr.ifr_hwaddr.sa_family != AF_LOCAL) {
206                         close(s);
207                         errno = EINVAL;
208                         DEBUG(DEBUG_CRIT,(__location__ " not an ethernet address family (0x%x)\n",
209                                  if_hwaddr.ifr_hwaddr.sa_family));
210                         return -1;
211                 }
212
213                 memset(buffer, 0 , sizeof(buffer));
214                 eh = (struct ether_header *)buffer;
215                 memset(eh->ether_dhost, 0xff, ETH_ALEN);
216                 memcpy(eh->ether_shost, if_hwaddr.ifr_hwaddr.sa_data, ETH_ALEN);
217                 eh->ether_type = htons(ETHERTYPE_IP6);
218
219                 ip6 = (struct ip6_hdr *)(eh+1);
220                 ip6->ip6_vfc  = 0x60;
221                 ip6->ip6_plen = htons(24);
222                 ip6->ip6_nxt  = IPPROTO_ICMPV6;
223                 ip6->ip6_hlim = 255;
224                 ip6->ip6_dst  = addr->ip6.sin6_addr;
225
226                 icmp6 = (struct icmp6_hdr *)(ip6+1);
227                 icmp6->icmp6_type = ND_NEIGHBOR_SOLICIT;
228                 icmp6->icmp6_code = 0;
229                 memcpy(&icmp6->icmp6_data32[1], &addr->ip6.sin6_addr, 16);
230
231                 icmp6->icmp6_cksum = tcp_checksum6((uint16_t *)icmp6, ntohs(ip6->ip6_plen), ip6);
232
233                 strncpy(sa.sa_data, iface, sizeof(sa.sa_data));
234                 ret = sendto(s, buffer, 78, 0, &sa, sizeof(sa));
235                 if (ret < 0 ){
236                         close(s);
237                         DEBUG(DEBUG_CRIT,(__location__ " failed sendto\n"));
238                         return -1;
239                 }       
240
241                 close(s);
242                 break;
243         default:
244                 DEBUG(DEBUG_CRIT,(__location__ " not an ipv4/ipv6 address (family is %u)\n", addr->ip.sin_family));
245                 return -1;
246         }
247
248         return 0;
249 }
250
251
/*
  TCP checksum for a segment carried over IPv4.

  data/n describe the TCP header plus payload and ip supplies the source
  address, destination address and protocol of the pseudo-header.  The
  length is expected to be a multiple of 2 bytes.

  Returns the checksum ready to be stored in the packet; a computed value
  of 0 is returned as 0xFFFF.
 */
static uint16_t tcp_checksum(uint16_t *data, size_t n, struct iphdr *ip)
{
        uint32_t acc;
        uint16_t result;
        int pass;

        /* segment first, then the pseudo-header fields */
        acc  = uint16_checksum(data, n);
        acc += uint16_checksum((uint16_t *)(void *)&ip->saddr,
                               sizeof(ip->saddr));
        acc += uint16_checksum((uint16_t *)(void *)&ip->daddr,
                               sizeof(ip->daddr));
        acc += ip->protocol + n;

        /* fold the carries back into the low 16 bits, twice */
        for (pass = 0; pass < 2; pass++) {
                acc = (acc & 0xFFFF) + (acc >> 16);
        }

        result = htons(acc);
        result = ~result;

        return (result == 0) ? 0xFFFF : result;
}
273
274 /*
275   Send tcp segment from the specified IP/port to the specified
276   destination IP/port. 
277
278   This is used to trigger the receiving host into sending its own ACK,
279   which should trigger early detection of TCP reset by the client
280   after IP takeover
281
282   This can also be used to send RST segments (if rst is true) and also
283   if correct seq and ack numbers are provided.
284  */
285 int ctdb_sys_send_tcp(const ctdb_sock_addr *dest, 
286                       const ctdb_sock_addr *src,
287                       uint32_t seq, uint32_t ack, int rst)
288 {
289         int s;
290         int ret;
291         uint32_t one = 1;
292         uint16_t tmpport;
293         ctdb_sock_addr *tmpdest;
294         struct {
295                 struct iphdr ip;
296                 struct tcphdr tcp;
297         } ip4pkt;
298         struct {
299                 struct ip6_hdr ip6;
300                 struct tcphdr tcp;
301         } ip6pkt;
302
303         switch (src->ip.sin_family) {
304         case AF_INET:
305                 ZERO_STRUCT(ip4pkt);
306                 ip4pkt.ip.version  = 4;
307                 ip4pkt.ip.ihl      = sizeof(ip4pkt.ip)/4;
308                 ip4pkt.ip.tot_len  = htons(sizeof(ip4pkt));
309                 ip4pkt.ip.ttl      = 255;
310                 ip4pkt.ip.protocol = IPPROTO_TCP;
311                 ip4pkt.ip.saddr    = src->ip.sin_addr.s_addr;
312                 ip4pkt.ip.daddr    = dest->ip.sin_addr.s_addr;
313                 ip4pkt.ip.check    = 0;
314
315                 ip4pkt.tcp.source   = src->ip.sin_port;
316                 ip4pkt.tcp.dest     = dest->ip.sin_port;
317                 ip4pkt.tcp.seq      = seq;
318                 ip4pkt.tcp.ack_seq  = ack;
319                 ip4pkt.tcp.ack      = 1;
320                 if (rst) {
321                         ip4pkt.tcp.rst      = 1;
322                 }
323                 ip4pkt.tcp.doff     = sizeof(ip4pkt.tcp)/4;
324                 /* this makes it easier to spot in a sniffer */
325                 ip4pkt.tcp.window   = htons(1234);
326                 ip4pkt.tcp.check    = tcp_checksum((uint16_t *)&ip4pkt.tcp, sizeof(ip4pkt.tcp), &ip4pkt.ip);
327
328                 /* open a raw socket to send this segment from */
329                 s = socket(AF_INET, SOCK_RAW, htons(IPPROTO_RAW));
330                 if (s == -1) {
331                         DEBUG(DEBUG_CRIT,(__location__ " failed to open raw socket (%s)\n",
332                                  strerror(errno)));
333                         return -1;
334                 }
335
336                 ret = setsockopt(s, SOL_IP, IP_HDRINCL, &one, sizeof(one));
337                 if (ret != 0) {
338                         DEBUG(DEBUG_CRIT,(__location__ " failed to setup IP headers (%s)\n",
339                                  strerror(errno)));
340                         close(s);
341                         return -1;
342                 }
343
344                 set_nonblocking(s);
345                 set_close_on_exec(s);
346
347                 ret = sendto(s, &ip4pkt, sizeof(ip4pkt), 0, &dest->ip, sizeof(dest->ip));
348                 close(s);
349                 if (ret != sizeof(ip4pkt)) {
350                         DEBUG(DEBUG_CRIT,(__location__ " failed sendto (%s)\n", strerror(errno)));
351                         return -1;
352                 }
353                 break;
354         case AF_INET6:
355                 ZERO_STRUCT(ip6pkt);
356                 ip6pkt.ip6.ip6_vfc  = 0x60;
357                 ip6pkt.ip6.ip6_plen = htons(20);
358                 ip6pkt.ip6.ip6_nxt  = IPPROTO_TCP;
359                 ip6pkt.ip6.ip6_hlim = 64;
360                 ip6pkt.ip6.ip6_src  = src->ip6.sin6_addr;
361                 ip6pkt.ip6.ip6_dst  = dest->ip6.sin6_addr;
362
363                 ip6pkt.tcp.source   = src->ip6.sin6_port;
364                 ip6pkt.tcp.dest     = dest->ip6.sin6_port;
365                 ip6pkt.tcp.seq      = seq;
366                 ip6pkt.tcp.ack_seq  = ack;
367                 ip6pkt.tcp.ack      = 1;
368                 if (rst) {
369                         ip6pkt.tcp.rst      = 1;
370                 }
371                 ip6pkt.tcp.doff     = sizeof(ip6pkt.tcp)/4;
372                 /* this makes it easier to spot in a sniffer */
373                 ip6pkt.tcp.window   = htons(1234);
374                 ip6pkt.tcp.check    = tcp_checksum6((uint16_t *)&ip6pkt.tcp, sizeof(ip6pkt.tcp), &ip6pkt.ip6);
375
376                 s = socket(PF_INET6, SOCK_RAW, IPPROTO_RAW);
377                 if (s == -1) {
378                         DEBUG(DEBUG_CRIT, (__location__ " Failed to open sending socket\n"));
379                         return -1;
380
381                 }
382                 /* sendto() dont like if the port is set and the socket is
383                    in raw mode.
384                 */
385                 tmpdest = discard_const(dest);
386                 tmpport = tmpdest->ip6.sin6_port;
387
388                 tmpdest->ip6.sin6_port = 0;
389                 ret = sendto(s, &ip6pkt, sizeof(ip6pkt), 0, &dest->ip6, sizeof(dest->ip6));
390                 tmpdest->ip6.sin6_port = tmpport;
391                 close(s);
392
393                 if (ret != sizeof(ip6pkt)) {
394                         DEBUG(DEBUG_CRIT,(__location__ " failed sendto (%s)\n", strerror(errno)));
395                         return -1;
396                 }
397                 break;
398
399         default:
400                 DEBUG(DEBUG_CRIT,(__location__ " not an ipv4/v6 address\n"));
401                 return -1;
402         }
403
404         return 0;
405 }
406
407
408 /*
409   see if we currently have an interface with the given IP
410
411   we try to bind to it, and if that fails then we don't have that IP
412   on an interface
413
414   ifname, if non-NULL, will return the name of the interface this ip is tied to
415  */
416 bool ctdb_sys_have_ip(ctdb_sock_addr *_addr)
417 {
418         int s;
419         int ret;
420         ctdb_sock_addr __addr = *_addr;
421         ctdb_sock_addr *addr = &__addr;
422
423         switch (addr->sa.sa_family) {
424         case AF_INET:
425                 addr->ip.sin_port = 0;
426                 break;
427         case AF_INET6:
428                 addr->ip6.sin6_port = 0;
429                 break;
430         }
431         s = socket(addr->sa.sa_family, SOCK_STREAM, IPPROTO_TCP);
432         if (s == -1) {
433                 return false;
434         }
435         ret = bind(s, (struct sockaddr *)addr, sizeof(ctdb_sock_addr));
436
437         close(s);
438         return ret == 0;
439 }
440
441 /* 
442    This function is used to open a raw socket to capture from
443  */
444 int ctdb_sys_open_capture_socket(const char *iface, void **private_data)
445 {
446         int s;
447
448         /* Open a socket to capture all traffic */
449         s = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
450         if (s == -1) {
451                 DEBUG(DEBUG_CRIT,(__location__ " failed to open raw socket\n"));
452                 return -1;
453         }
454
455         set_nonblocking(s);
456         set_close_on_exec(s);
457
458         return s;
459 }
460
/* 
   Hook for any extra cleanup needed when a capture socket is closed.
   Note that the socket itself is closed automatically in the caller.

   Nothing needs to be released here, so private_data is ignored and the
   function always returns 0.
 */
int ctdb_sys_close_capture_socket(void *private_data)
{
        (void)private_data;

        return 0;
}
470
471
472 /*
473   called when the raw socket becomes readable
474  */
475 int ctdb_sys_read_tcp_packet(int s, void *private_data, 
476                         ctdb_sock_addr *src, ctdb_sock_addr *dst,
477                         uint32_t *ack_seq, uint32_t *seq)
478 {
479         int ret;
480 #define RCVPKTSIZE 100
481         char pkt[RCVPKTSIZE];
482         struct ether_header *eth;
483         struct iphdr *ip;
484         struct ip6_hdr *ip6;
485         struct tcphdr *tcp;
486
487         ret = recv(s, pkt, RCVPKTSIZE, MSG_TRUNC);
488         if (ret < sizeof(*eth)+sizeof(*ip)) {
489                 return -1;
490         }
491
492         /* Ethernet */
493         eth = (struct ether_header *)pkt;
494
495         /* we want either IPv4 or IPv6 */
496         if (ntohs(eth->ether_type) == ETHERTYPE_IP) {
497                 /* IP */
498                 ip = (struct iphdr *)(eth+1);
499
500                 /* We only want IPv4 packets */
501                 if (ip->version != 4) {
502                         return -1;
503                 }
504                 /* Dont look at fragments */
505                 if ((ntohs(ip->frag_off)&0x1fff) != 0) {
506                         return -1;
507                 }
508                 /* we only want TCP */
509                 if (ip->protocol != IPPROTO_TCP) {
510                         return -1;
511                 }
512
513                 /* make sure its not a short packet */
514                 if (offsetof(struct tcphdr, ack_seq) + 4 + 
515                     (ip->ihl*4) + sizeof(*eth) > ret) {
516                         return -1;
517                 }
518                 /* TCP */
519                 tcp = (struct tcphdr *)((ip->ihl*4) + (char *)ip);
520
521                 /* tell the caller which one we've found */
522                 src->ip.sin_family      = AF_INET;
523                 src->ip.sin_addr.s_addr = ip->saddr;
524                 src->ip.sin_port        = tcp->source;
525                 dst->ip.sin_family      = AF_INET;
526                 dst->ip.sin_addr.s_addr = ip->daddr;
527                 dst->ip.sin_port        = tcp->dest;
528                 *ack_seq                = tcp->ack_seq;
529                 *seq                    = tcp->seq;
530
531                 return 0;
532         } else if (ntohs(eth->ether_type) == ETHERTYPE_IP6) {
533                 /* IP6 */
534                 ip6 = (struct ip6_hdr *)(eth+1);
535
536                 /* we only want TCP */
537                 if (ip6->ip6_nxt != IPPROTO_TCP) {
538                         return -1;
539                 }
540
541                 /* TCP */
542                 tcp = (struct tcphdr *)(ip6+1);
543
544                 /* tell the caller which one we've found */
545                 src->ip6.sin6_family = AF_INET6;
546                 src->ip6.sin6_port   = tcp->source;
547                 src->ip6.sin6_addr   = ip6->ip6_src;
548
549                 dst->ip6.sin6_family = AF_INET6;
550                 dst->ip6.sin6_port   = tcp->dest;
551                 dst->ip6.sin6_addr   = ip6->ip6_dst;
552
553                 *ack_seq             = tcp->ack_seq;
554                 *seq                 = tcp->seq;
555
556                 return 0;
557         }
558
559         return -1;
560 }
561
562