Merge branch 'master' of ssh://git.samba.org/data/git/samba into noejs
[amitay/samba.git] / source4 / cluster / ctdb / takeover / ctdb_takeover.c
1 /* 
2    ctdb ip takeover code
3
4    Copyright (C) Ronnie Sahlberg  2007
5    Copyright (C) Andrew Tridgell  2007
6
7    This library is free software; you can redistribute it and/or
8    modify it under the terms of the GNU Lesser General Public
9    License as published by the Free Software Foundation; either
10    version 3 of the License, or (at your option) any later version.
11
12    This library is distributed in the hope that it will be useful,
13    but WITHOUT ANY WARRANTY; without even the implied warranty of
14    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15    Lesser General Public License for more details.
16
17    You should have received a copy of the GNU Lesser General Public
18    License along with this library; if not, see <http://www.gnu.org/licenses/>.
19 */
20 #include "includes.h"
21 #include "lib/events/events.h"
22 #include "../tdb/include/tdb.h"
23 #include "system/network.h"
24 #include "system/filesys.h"
25 #include "system/wait.h"
26 #include "../include/ctdb_private.h"
27
28
/* timeout value handed to the remote release/takeover controls;
   presumably "5 seconds from now" - confirm against timeval_current_ofs() */
#define TAKEOVER_TIMEOUT() timeval_current_ofs(5,0)

/* first argument given to timeval_current_ofs() when the arp timer is re-armed */
#define CTDB_ARP_INTERVAL 1
/* number of arp rounds sent for one takeover before the request is freed */
#define CTDB_ARP_REPEAT   3
33
/*
  state of one pending "announce this address" request; it is passed as
  private data to the arp timer handler
 */
struct ctdb_takeover_arp {
	/* daemon context the request belongs to */
	struct ctdb_context *ctdb;

	/* number of arp rounds already sent */
	uint32_t count;

	/* the address being announced */
	struct sockaddr_in sin;

	/* private copies of the tcp connections to tickle on every round */
	struct ctdb_tcp_list *tcp_list;
};
40
/*
  one tcp endpoint pair, kept on a doubly linked list
 */
struct ctdb_tcp_list {
	/* list linkage, maintained by the DLIST_* macros */
	struct ctdb_tcp_list *prev;
	struct ctdb_tcp_list *next;

	/* vnn of the node that registered the connection */
	uint32_t vnn;

	/* source and destination of the connection; the destination
	   address is the one matched against a public IP */
	struct sockaddr_in saddr;
	struct sockaddr_in daddr;
};
50
51
52
53 /*
54   send a gratuitous arp
55  */
56 static void ctdb_control_send_arp(struct event_context *ev, struct timed_event *te, 
57                                   struct timeval t, void *private_data)
58 {
59         struct ctdb_takeover_arp *arp = talloc_get_type(private_data, 
60                                                         struct ctdb_takeover_arp);
61         int ret;
62         struct ctdb_tcp_list *tcp;
63
64         ret = ctdb_sys_send_arp(&arp->sin, arp->ctdb->takeover.interface);
65         if (ret != 0) {
66                 DEBUG(0,(__location__ "sending of arp failed (%s)\n", strerror(errno)));
67         }
68
69         for (tcp=arp->tcp_list;tcp;tcp=tcp->next) {
70                 DEBUG(2,("sending tcp tickle ack for %u->%s:%u\n",
71                          (unsigned)ntohs(tcp->daddr.sin_port), 
72                          inet_ntoa(tcp->saddr.sin_addr),
73                          (unsigned)ntohs(tcp->saddr.sin_port)));
74                 ret = ctdb_sys_send_ack(&tcp->saddr, &tcp->daddr);
75                 if (ret != 0) {
76                         DEBUG(0,(__location__ " Failed to send tcp tickle ack for %s\n",
77                                  inet_ntoa(tcp->saddr.sin_addr)));
78                 }
79         }
80
81         arp->count++;
82
83         if (arp->count == CTDB_ARP_REPEAT) {
84                 talloc_free(arp);
85                 return;
86         }
87         
88         event_add_timed(arp->ctdb->ev, arp->ctdb->takeover.last_ctx, 
89                         timeval_current_ofs(CTDB_ARP_INTERVAL, 0), 
90                         ctdb_control_send_arp, arp);
91 }
92
93
94 /*
95   take over an ip address
96  */
97 int32_t ctdb_control_takeover_ip(struct ctdb_context *ctdb, TDB_DATA indata)
98 {
99         int ret;
100         struct sockaddr_in *sin = (struct sockaddr_in *)indata.dptr;
101         struct ctdb_takeover_arp *arp;
102         char *ip = inet_ntoa(sin->sin_addr);
103         struct ctdb_tcp_list *tcp;
104
105         if (ctdb_sys_have_ip(ip)) {
106                 return 0;
107         }
108
109         DEBUG(0,("Takover of IP %s/%u on interface %s\n", 
110                  ip, ctdb->nodes[ctdb->vnn]->public_netmask_bits, 
111                  ctdb->takeover.interface));
112         ret = ctdb_event_script(ctdb, "takeip %s %s %u",
113                                 ctdb->takeover.interface, 
114                                 ip,
115                                 ctdb->nodes[ctdb->vnn]->public_netmask_bits);
116         if (ret != 0) {
117                 DEBUG(0,(__location__ " Failed to takeover IP %s on interface %s\n",
118                          ip, ctdb->takeover.interface));
119                 return -1;
120         }
121
122         if (!ctdb->takeover.last_ctx) {
123                 ctdb->takeover.last_ctx = talloc_new(ctdb);
124                 CTDB_NO_MEMORY(ctdb, ctdb->takeover.last_ctx);
125         }
126
127         arp = talloc_zero(ctdb->takeover.last_ctx, struct ctdb_takeover_arp);
128         CTDB_NO_MEMORY(ctdb, arp);
129         
130         arp->ctdb = ctdb;
131         arp->sin = *sin;
132
133         /* add all of the known tcp connections for this IP to the
134            list of tcp connections to send tickle acks for */
135         for (tcp=ctdb->tcp_list;tcp;tcp=tcp->next) {
136                 if (sin->sin_addr.s_addr == tcp->daddr.sin_addr.s_addr) {
137                         struct ctdb_tcp_list *t2 = talloc(arp, struct ctdb_tcp_list);
138                         CTDB_NO_MEMORY(ctdb, t2);
139                         *t2 = *tcp;
140                         DLIST_ADD(arp->tcp_list, t2);
141                 }
142         }
143
144         event_add_timed(arp->ctdb->ev, arp->ctdb->takeover.last_ctx, 
145                         timeval_zero(), ctdb_control_send_arp, arp);
146
147         return ret;
148 }
149
150 /*
151   release an ip address
152  */
153 int32_t ctdb_control_release_ip(struct ctdb_context *ctdb, TDB_DATA indata)
154 {
155         struct sockaddr_in *sin = (struct sockaddr_in *)indata.dptr;
156         TDB_DATA data;
157         char *ip = inet_ntoa(sin->sin_addr);
158         int ret;
159         struct ctdb_tcp_list *tcp;
160
161         if (!ctdb_sys_have_ip(ip)) {
162                 return 0;
163         }
164
165         DEBUG(0,("Release of IP %s/%u on interface %s\n", 
166                  ip, ctdb->nodes[ctdb->vnn]->public_netmask_bits, 
167                  ctdb->takeover.interface));
168
169         /* stop any previous arps */
170         talloc_free(ctdb->takeover.last_ctx);
171         ctdb->takeover.last_ctx = NULL;
172
173         ret = ctdb_event_script(ctdb, "releaseip %s %s %u",
174                                 ctdb->takeover.interface, 
175                                 ip,
176                                 ctdb->nodes[ctdb->vnn]->public_netmask_bits);
177         if (ret != 0) {
178                 DEBUG(0,(__location__ " Failed to release IP %s on interface %s\n",
179                          ip, ctdb->takeover.interface));
180                 return -1;
181         }
182
183         /* send a message to all clients of this node telling them
184            that the cluster has been reconfigured and they should
185            release any sockets on this IP */
186         data.dptr = (uint8_t *)ip;
187         data.dsize = strlen(ip)+1;
188
189         ctdb_daemon_send_message(ctdb, ctdb->vnn, CTDB_SRVID_RELEASE_IP, data);
190
191         /* tell other nodes about any tcp connections we were holding with this IP */
192         for (tcp=ctdb->tcp_list;tcp;tcp=tcp->next) {
193                 if (tcp->vnn == ctdb->vnn && 
194                     sin->sin_addr.s_addr == tcp->daddr.sin_addr.s_addr) {
195                         struct ctdb_control_tcp_vnn t;
196
197                         t.vnn  = ctdb->vnn;
198                         t.src  = tcp->saddr;
199                         t.dest = tcp->daddr;
200
201                         data.dptr = (uint8_t *)&t;
202                         data.dsize = sizeof(t);
203
204                         ctdb_daemon_send_control(ctdb, CTDB_BROADCAST_VNNMAP, 0, 
205                                                  CTDB_CONTROL_TCP_ADD,
206                                                  0, CTDB_CTRL_FLAG_NOREPLY, data, NULL, NULL);
207                 }
208         }
209
210
211         return 0;
212 }
213
214
215 /*
216   setup the event script
217 */
218 int ctdb_set_event_script(struct ctdb_context *ctdb, const char *script)
219 {
220         ctdb->takeover.event_script = talloc_strdup(ctdb, script);
221         CTDB_NO_MEMORY(ctdb, ctdb->takeover.event_script);
222         return 0;
223 }
224
225 /*
226   setup the public address list from a file
227 */
228 int ctdb_set_public_addresses(struct ctdb_context *ctdb, const char *alist)
229 {
230         char **lines;
231         int nlines;
232         int i;
233
234         lines = file_lines_load(alist, &nlines, ctdb);
235         if (lines == NULL) {
236                 ctdb_set_error(ctdb, "Failed to load public address list '%s'\n", alist);
237                 return -1;
238         }
239         while (nlines > 0 && strcmp(lines[nlines-1], "") == 0) {
240                 nlines--;
241         }
242
243         if (nlines != ctdb->num_nodes) {
244                 DEBUG(0,("Number of lines in %s does not match number of nodes!\n", alist));
245                 talloc_free(lines);
246                 return -1;
247         }
248
249         for (i=0;i<nlines;i++) {
250                 char *p;
251                 struct in_addr in;
252
253                 ctdb->nodes[i]->public_address = talloc_strdup(ctdb->nodes[i], lines[i]);
254                 CTDB_NO_MEMORY(ctdb, ctdb->nodes[i]->public_address);
255                 ctdb->nodes[i]->takeover_vnn = -1;
256
257                 /* see if they supplied a netmask length */
258                 p = strchr(ctdb->nodes[i]->public_address, '/');
259                 if (!p) {
260                         DEBUG(0,("You must supply a netmask for public address %s\n",
261                                  ctdb->nodes[i]->public_address));
262                         return -1;
263                 }
264                 *p = 0;
265                 ctdb->nodes[i]->public_netmask_bits = atoi(p+1);
266
267                 if (ctdb->nodes[i]->public_netmask_bits > 32) {
268                         DEBUG(0, ("Illegal netmask for IP %s\n", ctdb->nodes[i]->public_address));
269                         return -1;
270                 }
271
272                 if (inet_aton(ctdb->nodes[i]->public_address, &in) == 0) {
273                         DEBUG(0,("Badly formed IP '%s' in public address list\n", ctdb->nodes[i]->public_address));
274                         return -1;
275                 }
276         }
277
278         talloc_free(lines);
279         return 0;
280 }
281
/*
  see if two IPs are on the same subnet

  ip1 and ip2 are IPv4 address strings as accepted by inet_aton();
  netmask_bits is the prefix length (0..32).

  returns true when both addresses parse and agree in their leading
  netmask_bits bits.  An unparsable address or a prefix length above 32
  yields false.
 */
static bool ctdb_same_subnet(const char *ip1, const char *ip2, uint8_t netmask_bits)
{
	struct in_addr in1, in2;
	uint32_t mask;

	if (netmask_bits > 32) {
		return false;
	}

	/* never compare addresses that failed to parse */
	if (inet_aton(ip1, &in1) == 0 || inet_aton(ip2, &in2) == 0) {
		return false;
	}

	/* a /0 prefix needs special casing: shifting a 32 bit value by 32
	   is undefined */
	if (netmask_bits == 0) {
		mask = 0;
	} else {
		mask = (uint32_t)(UINT32_C(0xFFFFFFFF) << (32 - netmask_bits));
	}

	return (ntohl(in1.s_addr) & mask) == (ntohl(in2.s_addr) & mask);
}
301
302 /*
303   make any IP alias changes for public addresses that are necessary 
304  */
305 int ctdb_takeover_run(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap)
306 {
307         int i, j;
308         int ret;
309
310         /* work out which node will look after each public IP */
311         for (i=0;i<nodemap->num;i++) {
312                 if (nodemap->nodes[i].flags & NODE_FLAGS_CONNECTED) {
313                         ctdb->nodes[i]->takeover_vnn = nodemap->nodes[i].vnn;
314                 } else {
315                         /* assign this dead nodes IP to the next higher node */
316                         for (j=(i+1)%nodemap->num;
317                              j != i;
318                              j=(j+1)%nodemap->num) {
319                                 if ((nodemap->nodes[j].flags & NODE_FLAGS_CONNECTED) &&
320                                     ctdb_same_subnet(ctdb->nodes[j]->public_address, 
321                                                      ctdb->nodes[i]->public_address, 
322                                                      ctdb->nodes[j]->public_netmask_bits)) {
323                                         ctdb->nodes[i]->takeover_vnn = nodemap->nodes[j].vnn;
324                                         break;
325                                 }
326                         }
327                         if (j == i) {
328                                 DEBUG(0,(__location__ " No node available on same network to take %s\n",
329                                          ctdb->nodes[i]->public_address));
330                                 ctdb->nodes[i]->takeover_vnn = -1;      
331                         }
332                 }
333         }       
334
335         /* at this point ctdb->nodes[i]->takeover_vnn is the vnn which will own each IP */
336
337
338         /* now tell all nodes to delete any alias that they should not
339            have.  This will be a NOOP on nodes that don't currently
340            hold the given alias */
341         for (i=0;i<nodemap->num;i++) {
342                 /* don't talk to unconnected nodes */
343                 if (!(nodemap->nodes[i].flags & NODE_FLAGS_CONNECTED)) continue;
344
345                 /* tell this node to delete all of the aliases that it should not have */
346                 for (j=0;j<nodemap->num;j++) {
347                         if (ctdb->nodes[j]->takeover_vnn != nodemap->nodes[i].vnn) {
348                                 ret = ctdb_ctrl_release_ip(ctdb, TAKEOVER_TIMEOUT(),
349                                                            nodemap->nodes[i].vnn, 
350                                                            ctdb->nodes[j]->public_address);
351                                 if (ret != 0) {
352                                         DEBUG(0,("Failed to tell vnn %u to release IP %s\n",
353                                                  nodemap->nodes[i].vnn,
354                                                  ctdb->nodes[j]->public_address));
355                                         return -1;
356                                 }
357                         }
358                 }
359         }
360
361         /* tell all nodes to get their own IPs */
362         for (i=0;i<nodemap->num;i++) {
363                 ret = ctdb_ctrl_takeover_ip(ctdb, TAKEOVER_TIMEOUT(), 
364                                             ctdb->nodes[i]->takeover_vnn, 
365                                             ctdb->nodes[i]->public_address);
366                 if (ret != 0) {
367                         DEBUG(0,("Failed asking vnn %u to take over IP %s\n",
368                                  ctdb->nodes[i]->takeover_vnn, 
369                                  ctdb->nodes[i]->public_address));
370                         return -1;
371                 }
372         }
373
374         return 0;
375 }
376
377
378 /*
379   called by a client to inform us of a TCP connection that it is managing
380   that should tickled with an ACK when IP takeover is done
381  */
382 int32_t ctdb_control_tcp_client(struct ctdb_context *ctdb, uint32_t client_id, uint32_t vnn,
383                                 TDB_DATA indata)
384 {
385         struct ctdb_client *client = ctdb_reqid_find(ctdb, client_id, struct ctdb_client);
386         struct ctdb_control_tcp *p = (struct ctdb_control_tcp *)indata.dptr;
387         struct ctdb_tcp_list *tcp;
388         struct ctdb_control_tcp_vnn t;
389         int ret;
390         TDB_DATA data;
391
392         tcp = talloc(client, struct ctdb_tcp_list);
393         CTDB_NO_MEMORY(ctdb, tcp);
394
395         tcp->vnn   = vnn;
396         tcp->saddr = p->src;
397         tcp->daddr = p->dest;
398
399         DLIST_ADD(client->tcp_list, tcp);
400
401         t.vnn  = vnn;
402         t.src  = p->src;
403         t.dest = p->dest;
404
405         data.dptr = (uint8_t *)&t;
406         data.dsize = sizeof(t);
407
408         /* tell all nodes about this tcp connection */
409         ret = ctdb_daemon_send_control(ctdb, CTDB_BROADCAST_VNNMAP, 0, 
410                                        CTDB_CONTROL_TCP_ADD,
411                                        0, CTDB_CTRL_FLAG_NOREPLY, data, NULL, NULL);
412         if (ret != 0) {
413                 DEBUG(0,(__location__ " Failed to send CTDB_CONTROL_TCP_ADD\n"));
414                 return -1;
415         }
416
417         return 0;
418 }
419
/*
  compare two sockaddr_in: true only when address family, port and
  IPv4 address all match
 */
static bool same_sockaddr_in(struct sockaddr_in *in1, struct sockaddr_in *in2)
{
	if (in1->sin_family != in2->sin_family) {
		return false;
	}
	if (in1->sin_port != in2->sin_port) {
		return false;
	}
	return in1->sin_addr.s_addr == in2->sin_addr.s_addr;
}
429
430 /*
431   find a tcp address on a list
432  */
433 static struct ctdb_tcp_list *ctdb_tcp_find(struct ctdb_tcp_list *list, 
434                                            struct ctdb_tcp_list *tcp)
435 {
436         while (list) {
437                 if (same_sockaddr_in(&list->saddr, &tcp->saddr) &&
438                     same_sockaddr_in(&list->daddr, &tcp->daddr)) {
439                         return list;
440                 }
441                 list = list->next;
442         }
443         return NULL;
444 }
445
446 /*
447   called by a daemon to inform us of a TCP connection that one of its
448   clients managing that should tickled with an ACK when IP takeover is
449   done
450  */
451 int32_t ctdb_control_tcp_add(struct ctdb_context *ctdb, TDB_DATA indata)
452 {
453         struct ctdb_control_tcp_vnn *p = (struct ctdb_control_tcp_vnn *)indata.dptr;
454         struct ctdb_tcp_list *tcp;
455
456         tcp = talloc(ctdb, struct ctdb_tcp_list);
457         CTDB_NO_MEMORY(ctdb, tcp);
458
459         tcp->vnn   = p->vnn;
460         tcp->saddr = p->src;
461         tcp->daddr = p->dest;
462
463         if (NULL == ctdb_tcp_find(ctdb->tcp_list, tcp)) {
464                 DLIST_ADD(ctdb->tcp_list, tcp);
465                 DEBUG(2,("Added tickle info for %s:%u from vnn %u\n",
466                          inet_ntoa(tcp->daddr.sin_addr), ntohs(tcp->daddr.sin_port),
467                          tcp->vnn));
468         } else {
469                 DEBUG(4,("Already had tickle info for %s:%u from vnn %u\n",
470                          inet_ntoa(tcp->daddr.sin_addr), ntohs(tcp->daddr.sin_port),
471                          tcp->vnn));
472         }
473
474         return 0;
475 }
476
477 /*
478   called by a daemon to inform us of a TCP connection that one of its
479   clients managing that should tickled with an ACK when IP takeover is
480   done
481  */
482 int32_t ctdb_control_tcp_remove(struct ctdb_context *ctdb, TDB_DATA indata)
483 {
484         struct ctdb_control_tcp_vnn *p = (struct ctdb_control_tcp_vnn *)indata.dptr;
485         struct ctdb_tcp_list t, *tcp;
486
487         t.vnn   = p->vnn;
488         t.saddr = p->src;
489         t.daddr = p->dest;
490
491         tcp = ctdb_tcp_find(ctdb->tcp_list, &t);
492         if (tcp) {
493                 DEBUG(2,("Removed tickle info for %s:%u from vnn %u\n",
494                          inet_ntoa(tcp->daddr.sin_addr), ntohs(tcp->daddr.sin_port),
495                          tcp->vnn));
496                 DLIST_REMOVE(ctdb->tcp_list, tcp);
497                 talloc_free(tcp);
498         }
499
500         return 0;
501 }
502
503
504 /*
505   called when a daemon restarts - wipes all tcp entries from that vnn
506  */
507 int32_t ctdb_control_startup(struct ctdb_context *ctdb, uint32_t vnn)
508 {
509         struct ctdb_tcp_list *tcp, *next;       
510         for (tcp=ctdb->tcp_list;tcp;tcp=next) {
511                 next = tcp->next;
512                 if (tcp->vnn == vnn) {
513                         DLIST_REMOVE(ctdb->tcp_list, tcp);
514                         talloc_free(tcp);
515                 }
516
517                 /* and tell the new guy about any that he should have
518                    from us */
519                 if (tcp->vnn == ctdb->vnn) {
520                         struct ctdb_control_tcp_vnn t;
521                         TDB_DATA data;
522
523                         t.vnn  = tcp->vnn;
524                         t.src  = tcp->saddr;
525                         t.dest = tcp->daddr;
526
527                         data.dptr = (uint8_t *)&t;
528                         data.dsize = sizeof(t);
529
530                         ctdb_daemon_send_control(ctdb, vnn, 0, 
531                                                  CTDB_CONTROL_TCP_ADD,
532                                                  0, CTDB_CTRL_FLAG_NOREPLY, data, NULL, NULL);
533                 }
534         }
535         return 0;
536 }
537
538
539 /*
540   called when a client structure goes away - hook to remove
541   elements from the tcp_list in all daemons
542  */
543 void ctdb_takeover_client_destructor_hook(struct ctdb_client *client)
544 {
545         while (client->tcp_list) {
546                 TDB_DATA data;
547                 struct ctdb_control_tcp_vnn p;
548                 struct ctdb_tcp_list *tcp = client->tcp_list;
549                 DLIST_REMOVE(client->tcp_list, tcp);
550                 p.vnn = tcp->vnn;
551                 p.src = tcp->saddr;
552                 p.dest = tcp->daddr;
553                 data.dptr = (uint8_t *)&p;
554                 data.dsize = sizeof(p);
555                 ctdb_daemon_send_control(client->ctdb, CTDB_BROADCAST_VNNMAP, 0, 
556                                          CTDB_CONTROL_TCP_REMOVE,
557                                          0, CTDB_CTRL_FLAG_NOREPLY, data, NULL, NULL);
558                 talloc_free(tcp);
559         }
560 }
561
562
563 /*
564   release all IPs on shutdown
565  */
566 void ctdb_release_all_ips(struct ctdb_context *ctdb)
567 {
568         int i;
569
570         if (!ctdb->takeover.enabled) {
571                 return;
572         }
573
574         for (i=0;i<ctdb->num_nodes;i++) {
575                 struct ctdb_node *node = ctdb->nodes[i];
576                 if (ctdb_sys_have_ip(node->public_address)) {
577                         ctdb_event_script(ctdb, "releaseip %s %s %u",
578                                           ctdb->takeover.interface, 
579                                           node->public_address,
580                                           node->public_netmask_bits);
581                 }
582         }
583 }