ctdb-common: Drop CTDB's copy of sys_read() and sys_write()
[samba.git] / ctdb / server / ctdb_update_record.c
1 /* 
2    implementation of the update record control
3
4    Copyright (C) Andrew Tridgell  2007
5    Copyright (C) Ronnie Sahlberg  2007
6
7    This program is free software; you can redistribute it and/or modify
8    it under the terms of the GNU General Public License as published by
9    the Free Software Foundation; either version 3 of the License, or
10    (at your option) any later version.
11    
12    This program is distributed in the hope that it will be useful,
13    but WITHOUT ANY WARRANTY; without even the implied warranty of
14    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15    GNU General Public License for more details.
16    
17    You should have received a copy of the GNU General Public License
18    along with this program; if not, see <http://www.gnu.org/licenses/>.
19 */
20
21 #include "replace.h"
22 #include "system/network.h"
23 #include "system/time.h"
24
25 #include <talloc.h>
26 #include <tevent.h>
27
28 #include "lib/tdb_wrap/tdb_wrap.h"
29 #include "lib/util/debug.h"
30 #include "lib/util/samba_util.h"
31 #include "lib/util/sys_rw.h"
32 #include "lib/util/util_process.h"
33
34 #include "ctdb_private.h"
35 #include "ctdb_client.h"
36
37 #include "common/system.h"
38 #include "common/common.h"
39 #include "common/logging.h"
40
/*
  state for one update-record control; it is handed to the forked writer
  (ctdb_persistent_store) and to the completion and timeout callbacks
 */
struct ctdb_persistent_write_state {
        /* database the records are written to */
        struct ctdb_db_context *ctdb_db;
        /* marshalled records to store; points at the control's payload */
        struct ctdb_marshall_buffer *m;
        /* the control request that is answered once the write is done */
        struct ctdb_req_control_old *c;
        /* UPDATE_FLAGS_* bits */
        uint32_t flags;
};
47
/* don't create or update records that do not already exist locally */
#define UPDATE_FLAGS_REPLACE_ONLY       1
50
51 /*
52   called from a child process to write the data
53  */
54 static int ctdb_persistent_store(struct ctdb_persistent_write_state *state)
55 {
56         int ret, i;
57         struct ctdb_rec_data_old *rec = NULL;
58         struct ctdb_marshall_buffer *m = state->m;
59
60         ret = tdb_transaction_start(state->ctdb_db->ltdb->tdb);
61         if (ret == -1) {
62                 DEBUG(DEBUG_ERR,("Failed to start transaction for db_id 0x%08x in ctdb_persistent_store\n",
63                                  state->ctdb_db->db_id));
64                 return -1;
65         }
66
67         for (i=0;i<m->count;i++) {
68                 struct ctdb_ltdb_header oldheader;
69                 struct ctdb_ltdb_header header;
70                 TDB_DATA key, data, olddata;
71                 TALLOC_CTX *tmp_ctx = talloc_new(state);
72
73                 rec = ctdb_marshall_loop_next(m, rec, NULL, &header, &key, &data);
74
75                 if (rec == NULL) {
76                         DEBUG(DEBUG_ERR,("Failed to get next record %d for db_id 0x%08x in ctdb_persistent_store\n",
77                                          i, state->ctdb_db->db_id));
78                         talloc_free(tmp_ctx);
79                         goto failed;
80                 }
81
82                 /* we must check if the record exists or not because
83                    ctdb_ltdb_fetch will unconditionally create a record
84                  */
85                 if (state->flags & UPDATE_FLAGS_REPLACE_ONLY) {
86                         TDB_DATA trec;
87                         trec = tdb_fetch(state->ctdb_db->ltdb->tdb, key);
88                         if (trec.dsize == 0) {
89                                 talloc_free(tmp_ctx);
90                                 continue;
91                         }
92                         free(trec.dptr);
93                 }
94
95                 /* fetch the old header and ensure the rsn is less than the new rsn */
96                 ret = ctdb_ltdb_fetch(state->ctdb_db, key, &oldheader, tmp_ctx, &olddata);
97                 if (ret != 0) {
98                         DEBUG(DEBUG_ERR,("Failed to fetch old record for db_id 0x%08x in ctdb_persistent_store\n",
99                                          state->ctdb_db->db_id));
100                         talloc_free(tmp_ctx);
101                         goto failed;
102                 }
103
104                 if (oldheader.rsn >= header.rsn &&
105                     (olddata.dsize != data.dsize ||
106                      memcmp(olddata.dptr, data.dptr, data.dsize) != 0)) {
107                         DEBUG(DEBUG_CRIT,("existing header for db_id 0x%08x has larger RSN %llu than new RSN %llu in ctdb_persistent_store\n",
108                                           state->ctdb_db->db_id,
109                                           (unsigned long long)oldheader.rsn, (unsigned long long)header.rsn));
110                         talloc_free(tmp_ctx);
111                         goto failed;
112                 }
113
114                 talloc_free(tmp_ctx);
115
116                 ret = ctdb_ltdb_store(state->ctdb_db, key, &header, data);
117                 if (ret != 0) {
118                         DEBUG(DEBUG_CRIT,("Failed to store record for db_id 0x%08x in ctdb_persistent_store\n",
119                                           state->ctdb_db->db_id));
120                         goto failed;
121                 }
122         }
123
124         ret = tdb_transaction_commit(state->ctdb_db->ltdb->tdb);
125         if (ret == -1) {
126                 DEBUG(DEBUG_ERR,("Failed to commit transaction for db_id 0x%08x in ctdb_persistent_store\n",
127                                  state->ctdb_db->db_id));
128                 return -1;
129         }
130
131         return 0;
132
133 failed:
134         tdb_transaction_cancel(state->ctdb_db->ltdb->tdb);
135         return -1;
136 }
137
138
139 /*
140   called when we the child has completed the persistent write
141   on our behalf
142  */
143 static void ctdb_persistent_write_callback(int status, void *private_data)
144 {
145         struct ctdb_persistent_write_state *state = talloc_get_type(private_data,
146                                                                    struct ctdb_persistent_write_state);
147
148
149         ctdb_request_control_reply(state->ctdb_db->ctdb, state->c, NULL, status, NULL);
150
151         talloc_free(state);
152 }
153
154 /*
155   called if our lockwait child times out
156  */
157 static void ctdb_persistent_lock_timeout(struct tevent_context *ev,
158                                          struct tevent_timer *te,
159                                          struct timeval t, void *private_data)
160 {
161         struct ctdb_persistent_write_state *state = talloc_get_type(private_data,
162                                                                    struct ctdb_persistent_write_state);
163         ctdb_request_control_reply(state->ctdb_db->ctdb, state->c, NULL, -1, "timeout in ctdb_persistent_lock");
164         talloc_free(state);
165 }
166
/*
  bookkeeping for one forked writer child, created by ctdb_childwrite()
 */
struct childwrite_handle {
        struct ctdb_context *ctdb;
        struct ctdb_db_context *ctdb_db;
        /* fd event watching fd[0] for the child's status byte */
        struct tevent_fd *fde;
        /* pipe from child to parent: the parent reads fd[0], the child writes fd[1] */
        int fd[2];
        /* pid of the forked writer */
        pid_t child;
        /* opaque argument handed to callback */
        void *private_data;
        /* invoked with the child's status byte and private_data */
        void (*callback)(int, void *);
        /* when the child was started; used for the childwrite latency statistic */
        struct timeval start_time;
};
177
178 static int childwrite_destructor(struct childwrite_handle *h)
179 {
180         CTDB_DECREMENT_STAT(h->ctdb, pending_childwrite_calls);
181         ctdb_kill(h->ctdb, h->child, SIGKILL);
182         return 0;
183 }
184
185 /* called when the child process has finished writing the record to the
186    database
187 */
188 static void childwrite_handler(struct tevent_context *ev,
189                                struct tevent_fd *fde,
190                                uint16_t flags, void *private_data)
191 {
192         struct childwrite_handle *h = talloc_get_type(private_data,
193                                                      struct childwrite_handle);
194         void *p = h->private_data;
195         void (*callback)(int, void *) = h->callback;
196         pid_t child = h->child;
197         TALLOC_CTX *tmp_ctx = talloc_new(ev);
198         int ret;
199         char c;
200
201         CTDB_UPDATE_LATENCY(h->ctdb, h->ctdb_db, "persistent", childwrite_latency, h->start_time);
202         CTDB_DECREMENT_STAT(h->ctdb, pending_childwrite_calls);
203
204         /* the handle needs to go away when the context is gone - when
205            the handle goes away this implicitly closes the pipe, which
206            kills the child */
207         talloc_steal(tmp_ctx, h);
208
209         talloc_set_destructor(h, NULL);
210
211         ret = sys_read(h->fd[0], &c, 1);
212         if (ret < 1) {
213                 DEBUG(DEBUG_ERR, (__location__ " Read returned %d. Childwrite failed\n", ret));
214                 c = 1;
215         }
216
217         callback(c, p);
218
219         ctdb_kill(h->ctdb, child, SIGKILL);
220         talloc_free(tmp_ctx);
221 }
222
223 /* this creates a child process which will take out a tdb transaction
224    and write the record to the database.
225 */
226 static struct childwrite_handle *ctdb_childwrite(
227                                 struct ctdb_db_context *ctdb_db,
228                                 void (*callback)(int, void *private_data),
229                                 struct ctdb_persistent_write_state *state)
230 {
231         struct childwrite_handle *result;
232         int ret;
233         pid_t parent = getpid();
234
235         CTDB_INCREMENT_STAT(ctdb_db->ctdb, childwrite_calls);
236         CTDB_INCREMENT_STAT(ctdb_db->ctdb, pending_childwrite_calls);
237
238         if (!(result = talloc_zero(state, struct childwrite_handle))) {
239                 CTDB_DECREMENT_STAT(ctdb_db->ctdb, pending_childwrite_calls);
240                 return NULL;
241         }
242
243         ret = pipe(result->fd);
244
245         if (ret != 0) {
246                 talloc_free(result);
247                 CTDB_DECREMENT_STAT(ctdb_db->ctdb, pending_childwrite_calls);
248                 return NULL;
249         }
250
251         result->child = ctdb_fork(ctdb_db->ctdb);
252
253         if (result->child == (pid_t)-1) {
254                 close(result->fd[0]);
255                 close(result->fd[1]);
256                 talloc_free(result);
257                 CTDB_DECREMENT_STAT(ctdb_db->ctdb, pending_childwrite_calls);
258                 return NULL;
259         }
260
261         result->callback = callback;
262         result->private_data = state;
263         result->ctdb = ctdb_db->ctdb;
264         result->ctdb_db = ctdb_db;
265
266         if (result->child == 0) {
267                 char c = 0;
268
269                 close(result->fd[0]);
270                 prctl_set_comment("ctdb_write_persistent");
271                 debug_extra = talloc_asprintf(NULL, "childwrite-%s:", ctdb_db->db_name);
272                 ret = ctdb_persistent_store(state);
273                 if (ret != 0) {
274                         DEBUG(DEBUG_ERR, (__location__ " Failed to write persistent data\n"));
275                         c = 1;
276                 }
277
278                 sys_write(result->fd[1], &c, 1);
279
280                 ctdb_wait_for_process_to_exit(parent);
281                 _exit(0);
282         }
283
284         close(result->fd[1]);
285         set_close_on_exec(result->fd[0]);
286
287         talloc_set_destructor(result, childwrite_destructor);
288
289         DEBUG(DEBUG_DEBUG, (__location__ " Created PIPE FD:%d for ctdb_childwrite\n", result->fd[0]));
290
291         result->fde = tevent_add_fd(ctdb_db->ctdb->ev, result, result->fd[0],
292                                     TEVENT_FD_READ, childwrite_handler,
293                                     (void *)result);
294         if (result->fde == NULL) {
295                 talloc_free(result);
296                 CTDB_DECREMENT_STAT(ctdb_db->ctdb, pending_childwrite_calls);
297                 return NULL;
298         }
299         tevent_fd_set_auto_close(result->fde);
300
301         result->start_time = timeval_current();
302
303         return result;
304 }
305
306 /*
307    update a record on this node if the new record has a higher rsn than the
308    current record
309  */
310 int32_t ctdb_control_update_record(struct ctdb_context *ctdb,
311                                    struct ctdb_req_control_old *c, TDB_DATA recdata,
312                                    bool *async_reply)
313 {
314         struct ctdb_db_context *ctdb_db;
315         struct ctdb_persistent_write_state *state;
316         struct childwrite_handle *handle;
317         struct ctdb_marshall_buffer *m = (struct ctdb_marshall_buffer *)recdata.dptr;
318
319         if (ctdb->recovery_mode != CTDB_RECOVERY_NORMAL) {
320                 DEBUG(DEBUG_INFO,("rejecting ctdb_control_update_record when recovery active\n"));
321                 return -1;
322         }
323
324         ctdb_db = find_ctdb_db(ctdb, m->db_id);
325         if (ctdb_db == NULL) {
326                 DEBUG(DEBUG_ERR,("Unknown database 0x%08x in ctdb_control_update_record\n", m->db_id));
327                 return -1;
328         }
329
330         if (ctdb_db->unhealthy_reason) {
331                 DEBUG(DEBUG_ERR,("db(%s) unhealty in ctdb_control_update_record: %s\n",
332                                  ctdb_db->db_name, ctdb_db->unhealthy_reason));
333                 return -1;
334         }
335
336         state = talloc(ctdb, struct ctdb_persistent_write_state);
337         CTDB_NO_MEMORY(ctdb, state);
338
339         state->ctdb_db = ctdb_db;
340         state->c       = c;
341         state->m       = m;
342         state->flags   = 0;
343         if (!ctdb_db->persistent) {
344                 state->flags   = UPDATE_FLAGS_REPLACE_ONLY;
345         }
346
347         /* create a child process to take out a transaction and
348            write the data.
349         */
350         handle = ctdb_childwrite(ctdb_db, ctdb_persistent_write_callback, state);
351         if (handle == NULL) {
352                 DEBUG(DEBUG_ERR,("Failed to setup childwrite handler in ctdb_control_update_record\n"));
353                 talloc_free(state);
354                 return -1;
355         }
356
357         /* we need to wait for the replies */
358         *async_reply = true;
359
360         /* need to keep the control structure around */
361         talloc_steal(state, c);
362
363         /* but we won't wait forever */
364         tevent_add_timer(ctdb->ev, state,
365                          timeval_current_ofs(ctdb->tunable.control_timeout, 0),
366                          ctdb_persistent_lock_timeout, state);
367
368         return 0;
369 }
370