bc9c6fe06c37adc67605110f25a9fb7956f63f7d
[martins/samba.git] / ctdb / server / ctdb_update_record.c
1 /* 
2    implementation of the update record control
3
4    Copyright (C) Andrew Tridgell  2007
5    Copyright (C) Ronnie Sahlberg  2007
6
7    This program is free software; you can redistribute it and/or modify
8    it under the terms of the GNU General Public License as published by
9    the Free Software Foundation; either version 3 of the License, or
10    (at your option) any later version.
11    
12    This program is distributed in the hope that it will be useful,
13    but WITHOUT ANY WARRANTY; without even the implied warranty of
14    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15    GNU General Public License for more details.
16    
17    You should have received a copy of the GNU General Public License
18    along with this program; if not, see <http://www.gnu.org/licenses/>.
19 */
20
21 #include "replace.h"
22 #include "system/network.h"
23 #include "system/time.h"
24
25 #include <talloc.h>
26 #include <tevent.h>
27
28 #include "lib/tdb_wrap/tdb_wrap.h"
29 #include "lib/util/debug.h"
30 #include "lib/util/samba_util.h"
31 #include "lib/util/util_process.h"
32
33 #include "ctdb_private.h"
34 #include "ctdb_client.h"
35
36 #include "common/system.h"
37 #include "common/common.h"
38 #include "common/logging.h"
39
/*
  state carried through one update-record request: filled in by
  ctdb_control_update_record, read by the writing child in
  ctdb_persistent_store, and freed once the reply has been sent
 */
struct ctdb_persistent_write_state {
        /* database the records are written to */
        struct ctdb_db_context *ctdb_db;
        /* marshalled records to store */
        struct ctdb_marshall_buffer *m;
        /* control request that receives the reply */
        struct ctdb_req_control_old *c;
        /* UPDATE_FLAGS_* bits */
        uint32_t flags;
};
46
/* only update records that already exist locally; never create new ones */
#define UPDATE_FLAGS_REPLACE_ONLY       0x1
49
50 /*
51   called from a child process to write the data
52  */
53 static int ctdb_persistent_store(struct ctdb_persistent_write_state *state)
54 {
55         int ret, i;
56         struct ctdb_rec_data_old *rec = NULL;
57         struct ctdb_marshall_buffer *m = state->m;
58
59         ret = tdb_transaction_start(state->ctdb_db->ltdb->tdb);
60         if (ret == -1) {
61                 DEBUG(DEBUG_ERR,("Failed to start transaction for db_id 0x%08x in ctdb_persistent_store\n",
62                                  state->ctdb_db->db_id));
63                 return -1;
64         }
65
66         for (i=0;i<m->count;i++) {
67                 struct ctdb_ltdb_header oldheader;
68                 struct ctdb_ltdb_header header;
69                 TDB_DATA key, data, olddata;
70                 TALLOC_CTX *tmp_ctx = talloc_new(state);
71
72                 rec = ctdb_marshall_loop_next(m, rec, NULL, &header, &key, &data);
73
74                 if (rec == NULL) {
75                         DEBUG(DEBUG_ERR,("Failed to get next record %d for db_id 0x%08x in ctdb_persistent_store\n",
76                                          i, state->ctdb_db->db_id));
77                         talloc_free(tmp_ctx);
78                         goto failed;
79                 }
80
81                 /* we must check if the record exists or not because
82                    ctdb_ltdb_fetch will unconditionally create a record
83                  */
84                 if (state->flags & UPDATE_FLAGS_REPLACE_ONLY) {
85                         TDB_DATA trec;
86                         trec = tdb_fetch(state->ctdb_db->ltdb->tdb, key);
87                         if (trec.dsize == 0) {
88                                 talloc_free(tmp_ctx);
89                                 continue;
90                         }
91                         free(trec.dptr);
92                 }
93
94                 /* fetch the old header and ensure the rsn is less than the new rsn */
95                 ret = ctdb_ltdb_fetch(state->ctdb_db, key, &oldheader, tmp_ctx, &olddata);
96                 if (ret != 0) {
97                         DEBUG(DEBUG_ERR,("Failed to fetch old record for db_id 0x%08x in ctdb_persistent_store\n",
98                                          state->ctdb_db->db_id));
99                         talloc_free(tmp_ctx);
100                         goto failed;
101                 }
102
103                 if (oldheader.rsn >= header.rsn &&
104                     (olddata.dsize != data.dsize ||
105                      memcmp(olddata.dptr, data.dptr, data.dsize) != 0)) {
106                         DEBUG(DEBUG_CRIT,("existing header for db_id 0x%08x has larger RSN %llu than new RSN %llu in ctdb_persistent_store\n",
107                                           state->ctdb_db->db_id,
108                                           (unsigned long long)oldheader.rsn, (unsigned long long)header.rsn));
109                         talloc_free(tmp_ctx);
110                         goto failed;
111                 }
112
113                 talloc_free(tmp_ctx);
114
115                 ret = ctdb_ltdb_store(state->ctdb_db, key, &header, data);
116                 if (ret != 0) {
117                         DEBUG(DEBUG_CRIT,("Failed to store record for db_id 0x%08x in ctdb_persistent_store\n",
118                                           state->ctdb_db->db_id));
119                         goto failed;
120                 }
121         }
122
123         ret = tdb_transaction_commit(state->ctdb_db->ltdb->tdb);
124         if (ret == -1) {
125                 DEBUG(DEBUG_ERR,("Failed to commit transaction for db_id 0x%08x in ctdb_persistent_store\n",
126                                  state->ctdb_db->db_id));
127                 return -1;
128         }
129
130         return 0;
131
132 failed:
133         tdb_transaction_cancel(state->ctdb_db->ltdb->tdb);
134         return -1;
135 }
136
137
138 /*
139   called when we the child has completed the persistent write
140   on our behalf
141  */
142 static void ctdb_persistent_write_callback(int status, void *private_data)
143 {
144         struct ctdb_persistent_write_state *state = talloc_get_type(private_data,
145                                                                    struct ctdb_persistent_write_state);
146
147
148         ctdb_request_control_reply(state->ctdb_db->ctdb, state->c, NULL, status, NULL);
149
150         talloc_free(state);
151 }
152
153 /*
154   called if our lockwait child times out
155  */
156 static void ctdb_persistent_lock_timeout(struct tevent_context *ev,
157                                          struct tevent_timer *te,
158                                          struct timeval t, void *private_data)
159 {
160         struct ctdb_persistent_write_state *state = talloc_get_type(private_data,
161                                                                    struct ctdb_persistent_write_state);
162         ctdb_request_control_reply(state->ctdb_db->ctdb, state->c, NULL, -1, "timeout in ctdb_persistent_lock");
163         talloc_free(state);
164 }
165
/*
  bookkeeping for one forked writer child, created by ctdb_childwrite
 */
struct childwrite_handle {
        /* daemon context, used for statistics and for killing the child */
        struct ctdb_context *ctdb;
        /* database the child writes to */
        struct ctdb_db_context *ctdb_db;
        /* read event registered on fd[0] */
        struct tevent_fd *fde;
        /* pipe: the child writes its status byte to fd[1],
           the parent reads it from fd[0] */
        int fd[2];
        /* pid of the writer child */
        pid_t child;
        /* opaque argument handed back to callback */
        void *private_data;
        /* completion callback; receives the status and private_data */
        void (*callback)(int, void *);
        /* when the handle was set up, used for the latency statistic */
        struct timeval start_time;
};
176
177 static int childwrite_destructor(struct childwrite_handle *h)
178 {
179         CTDB_DECREMENT_STAT(h->ctdb, pending_childwrite_calls);
180         ctdb_kill(h->ctdb, h->child, SIGKILL);
181         return 0;
182 }
183
184 /* called when the child process has finished writing the record to the
185    database
186 */
187 static void childwrite_handler(struct tevent_context *ev,
188                                struct tevent_fd *fde,
189                                uint16_t flags, void *private_data)
190 {
191         struct childwrite_handle *h = talloc_get_type(private_data,
192                                                      struct childwrite_handle);
193         void *p = h->private_data;
194         void (*callback)(int, void *) = h->callback;
195         pid_t child = h->child;
196         TALLOC_CTX *tmp_ctx = talloc_new(ev);
197         int ret;
198         char c;
199
200         CTDB_UPDATE_LATENCY(h->ctdb, h->ctdb_db, "persistent", childwrite_latency, h->start_time);
201         CTDB_DECREMENT_STAT(h->ctdb, pending_childwrite_calls);
202
203         /* the handle needs to go away when the context is gone - when
204            the handle goes away this implicitly closes the pipe, which
205            kills the child */
206         talloc_steal(tmp_ctx, h);
207
208         talloc_set_destructor(h, NULL);
209
210         ret = sys_read(h->fd[0], &c, 1);
211         if (ret < 1) {
212                 DEBUG(DEBUG_ERR, (__location__ " Read returned %d. Childwrite failed\n", ret));
213                 c = 1;
214         }
215
216         callback(c, p);
217
218         ctdb_kill(h->ctdb, child, SIGKILL);
219         talloc_free(tmp_ctx);
220 }
221
222 /* this creates a child process which will take out a tdb transaction
223    and write the record to the database.
224 */
225 static struct childwrite_handle *ctdb_childwrite(
226                                 struct ctdb_db_context *ctdb_db,
227                                 void (*callback)(int, void *private_data),
228                                 struct ctdb_persistent_write_state *state)
229 {
230         struct childwrite_handle *result;
231         int ret;
232         pid_t parent = getpid();
233
234         CTDB_INCREMENT_STAT(ctdb_db->ctdb, childwrite_calls);
235         CTDB_INCREMENT_STAT(ctdb_db->ctdb, pending_childwrite_calls);
236
237         if (!(result = talloc_zero(state, struct childwrite_handle))) {
238                 CTDB_DECREMENT_STAT(ctdb_db->ctdb, pending_childwrite_calls);
239                 return NULL;
240         }
241
242         ret = pipe(result->fd);
243
244         if (ret != 0) {
245                 talloc_free(result);
246                 CTDB_DECREMENT_STAT(ctdb_db->ctdb, pending_childwrite_calls);
247                 return NULL;
248         }
249
250         result->child = ctdb_fork(ctdb_db->ctdb);
251
252         if (result->child == (pid_t)-1) {
253                 close(result->fd[0]);
254                 close(result->fd[1]);
255                 talloc_free(result);
256                 CTDB_DECREMENT_STAT(ctdb_db->ctdb, pending_childwrite_calls);
257                 return NULL;
258         }
259
260         result->callback = callback;
261         result->private_data = state;
262         result->ctdb = ctdb_db->ctdb;
263         result->ctdb_db = ctdb_db;
264
265         if (result->child == 0) {
266                 char c = 0;
267
268                 close(result->fd[0]);
269                 prctl_set_comment("ctdb_write_persistent");
270                 debug_extra = talloc_asprintf(NULL, "childwrite-%s:", ctdb_db->db_name);
271                 ret = ctdb_persistent_store(state);
272                 if (ret != 0) {
273                         DEBUG(DEBUG_ERR, (__location__ " Failed to write persistent data\n"));
274                         c = 1;
275                 }
276
277                 sys_write(result->fd[1], &c, 1);
278
279                 /* make sure we die when our parent dies */
280                 while (ctdb_kill(ctdb_db->ctdb, parent, 0) == 0 || errno != ESRCH) {
281                         sleep(5);
282                 }
283                 _exit(0);
284         }
285
286         close(result->fd[1]);
287         set_close_on_exec(result->fd[0]);
288
289         talloc_set_destructor(result, childwrite_destructor);
290
291         DEBUG(DEBUG_DEBUG, (__location__ " Created PIPE FD:%d for ctdb_childwrite\n", result->fd[0]));
292
293         result->fde = tevent_add_fd(ctdb_db->ctdb->ev, result, result->fd[0],
294                                     TEVENT_FD_READ, childwrite_handler,
295                                     (void *)result);
296         if (result->fde == NULL) {
297                 talloc_free(result);
298                 CTDB_DECREMENT_STAT(ctdb_db->ctdb, pending_childwrite_calls);
299                 return NULL;
300         }
301         tevent_fd_set_auto_close(result->fde);
302
303         result->start_time = timeval_current();
304
305         return result;
306 }
307
308 /*
309    update a record on this node if the new record has a higher rsn than the
310    current record
311  */
312 int32_t ctdb_control_update_record(struct ctdb_context *ctdb,
313                                    struct ctdb_req_control_old *c, TDB_DATA recdata,
314                                    bool *async_reply)
315 {
316         struct ctdb_db_context *ctdb_db;
317         struct ctdb_persistent_write_state *state;
318         struct childwrite_handle *handle;
319         struct ctdb_marshall_buffer *m = (struct ctdb_marshall_buffer *)recdata.dptr;
320
321         if (ctdb->recovery_mode != CTDB_RECOVERY_NORMAL) {
322                 DEBUG(DEBUG_INFO,("rejecting ctdb_control_update_record when recovery active\n"));
323                 return -1;
324         }
325
326         ctdb_db = find_ctdb_db(ctdb, m->db_id);
327         if (ctdb_db == NULL) {
328                 DEBUG(DEBUG_ERR,("Unknown database 0x%08x in ctdb_control_update_record\n", m->db_id));
329                 return -1;
330         }
331
332         if (ctdb_db->unhealthy_reason) {
333                 DEBUG(DEBUG_ERR,("db(%s) unhealty in ctdb_control_update_record: %s\n",
334                                  ctdb_db->db_name, ctdb_db->unhealthy_reason));
335                 return -1;
336         }
337
338         state = talloc(ctdb, struct ctdb_persistent_write_state);
339         CTDB_NO_MEMORY(ctdb, state);
340
341         state->ctdb_db = ctdb_db;
342         state->c       = c;
343         state->m       = m;
344         state->flags   = 0;
345         if (!ctdb_db->persistent) {
346                 state->flags   = UPDATE_FLAGS_REPLACE_ONLY;
347         }
348
349         /* create a child process to take out a transaction and
350            write the data.
351         */
352         handle = ctdb_childwrite(ctdb_db, ctdb_persistent_write_callback, state);
353         if (handle == NULL) {
354                 DEBUG(DEBUG_ERR,("Failed to setup childwrite handler in ctdb_control_update_record\n"));
355                 talloc_free(state);
356                 return -1;
357         }
358
359         /* we need to wait for the replies */
360         *async_reply = true;
361
362         /* need to keep the control structure around */
363         talloc_steal(state, c);
364
365         /* but we won't wait forever */
366         tevent_add_timer(ctdb->ev, state,
367                          timeval_current_ofs(ctdb->tunable.control_timeout, 0),
368                          ctdb_persistent_lock_timeout, state);
369
370         return 0;
371 }
372