tests: Fix calling of ctdb tool from test
[ctdb.git] / utils / pmda / pmda_ctdb.c
1 /*
2  * CTDB Performance Metrics Domain Agent (PMDA) for Performance Co-Pilot (PCP)
3  *
4  * Copyright (c) 1995,2004 Silicon Graphics, Inc.  All Rights Reserved.
5  * Copyright (c) 2011 David Disseldorp
6  *
7  * This program is free software; you can redistribute it and/or modify it
8  * under the terms of the GNU General Public License as published by the
9  * Free Software Foundation; either version 2 of the License, or (at your
10  * option) any later version.
11  *
12  * This program is distributed in the hope that it will be useful, but
13  * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
14  * or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
15  * for more details.
16  *
17  * You should have received a copy of the GNU General Public License along
18  * with this program; if not, write to the Free Software Foundation, Inc.,
19  * 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
20  */
21
22 #include <pcp/pmapi.h>
23 #include <pcp/impl.h>
24 #include <pcp/pmda.h>
25 #include "../../include/includes.h"
26 #include "../../include/ctdb.h"
27 #include "../../include/ctdb_private.h"
28 #include "../../include/ctdb_protocol.h"
29 #include "domain.h"
30
31 /*
32  * CTDB PMDA
33  *
34  * This PMDA connects to the locally running ctdbd daemon and pulls
35  * statistics for export via PCP. The ctdbd Unix domain socket path can be
36  * specified with the CTDB_SOCKET environment variable, otherwise the default
37  * path is used.
38  */
39
40 /*
41  * All metrics supported in this PMDA - one table entry for each.
42  * The 4th field specifies the serial number of the instance domain
43  * for the metric, and must be either PM_INDOM_NULL (denoting a
44  * metric that only ever has a single value), or the serial number
45  * of one of the instance domains declared in the instance domain table
46  * (i.e. in indomtab, above).
47  */
48 static pmdaMetric metrictab[] = {
49         /* num_clients */
50         { NULL, { PMDA_PMID(0,0), PM_TYPE_U32, PM_INDOM_NULL, PM_SEM_INSTANT,
51                 PMDA_PMUNITS(0,0,0,0,0,0) }, },
52         /* frozen */
53         { NULL, { PMDA_PMID(1,2), PM_TYPE_U32, PM_INDOM_NULL, PM_SEM_INSTANT,
54                 PMDA_PMUNITS(0,0,0,0,0,0) }, },
55         /* recovering */
56         { NULL, { PMDA_PMID(3,3), PM_TYPE_U32, PM_INDOM_NULL, PM_SEM_INSTANT,
57                 PMDA_PMUNITS(0,0,0,0,0,0) }, },
58         /* client_packets_sent */
59         { NULL, { PMDA_PMID(4,4), PM_TYPE_U32, PM_INDOM_NULL, PM_SEM_COUNTER,
60                 PMDA_PMUNITS(0,0,1,0,0,PM_COUNT_ONE) }, },
61         /* client_packets_recv */
62         { NULL, { PMDA_PMID(5,5), PM_TYPE_U32, PM_INDOM_NULL, PM_SEM_COUNTER,
63                 PMDA_PMUNITS(0,0,1,0,0,PM_COUNT_ONE) }, },
64         /* node_packets_sent */
65         { NULL, { PMDA_PMID(6,6), PM_TYPE_U32, PM_INDOM_NULL, PM_SEM_COUNTER,
66                 PMDA_PMUNITS(0,0,1,0,0,PM_COUNT_ONE) }, },
67         /* node_packets_recv */
68         { NULL, { PMDA_PMID(7,7), PM_TYPE_U32, PM_INDOM_NULL, PM_SEM_COUNTER,
69                 PMDA_PMUNITS(0,0,1,0,0,PM_COUNT_ONE) }, },
70         /* keepalive_packets_sent */
71         { NULL, { PMDA_PMID(8,8), PM_TYPE_U32, PM_INDOM_NULL, PM_SEM_COUNTER,
72                 PMDA_PMUNITS(0,0,1,0,0,PM_COUNT_ONE) }, },
73         /* keepalive_packets_recv */
74         { NULL, { PMDA_PMID(9,9), PM_TYPE_U32, PM_INDOM_NULL, PM_SEM_COUNTER,
75                 PMDA_PMUNITS(0,0,1,0,0,PM_COUNT_ONE) }, },
76         /* req_call */
77         { NULL, { PMDA_PMID(10,10), PM_TYPE_U32, PM_INDOM_NULL, PM_SEM_COUNTER,
78                 PMDA_PMUNITS(0,0,1,0,0,PM_COUNT_ONE) }, },
79         /* reply_call */
80         { NULL, { PMDA_PMID(10,11), PM_TYPE_U32, PM_INDOM_NULL, PM_SEM_COUNTER,
81                 PMDA_PMUNITS(0,0,1,0,0,PM_COUNT_ONE) }, },
82         /* req_dmaster */
83         { NULL, { PMDA_PMID(10,12), PM_TYPE_U32, PM_INDOM_NULL, PM_SEM_COUNTER,
84                 PMDA_PMUNITS(0,0,1,0,0,PM_COUNT_ONE) }, },
85         /* reply_dmaster */
86         { NULL, { PMDA_PMID(10,13), PM_TYPE_U32, PM_INDOM_NULL, PM_SEM_COUNTER,
87                 PMDA_PMUNITS(0,0,1,0,0,PM_COUNT_ONE) }, },
88         /* reply_error */
89         { NULL, { PMDA_PMID(10,14), PM_TYPE_U32, PM_INDOM_NULL, PM_SEM_COUNTER,
90                 PMDA_PMUNITS(0,0,1,0,0,PM_COUNT_ONE) }, },
91         /* req_message */
92         { NULL, { PMDA_PMID(10,15), PM_TYPE_U32, PM_INDOM_NULL, PM_SEM_COUNTER,
93                 PMDA_PMUNITS(0,0,1,0,0,PM_COUNT_ONE) }, },
94         /* req_control */
95         { NULL, { PMDA_PMID(10,16), PM_TYPE_U32, PM_INDOM_NULL, PM_SEM_COUNTER,
96                 PMDA_PMUNITS(0,0,1,0,0,PM_COUNT_ONE) }, },
97         /* reply_control */
98         { NULL, { PMDA_PMID(10,17), PM_TYPE_U32, PM_INDOM_NULL, PM_SEM_COUNTER,
99                 PMDA_PMUNITS(0,0,1,0,0,PM_COUNT_ONE) }, },
100         /* req_call */
101         { NULL, { PMDA_PMID(11,18), PM_TYPE_U32, PM_INDOM_NULL, PM_SEM_COUNTER,
102                 PMDA_PMUNITS(0,0,1,0,0,PM_COUNT_ONE) }, },
103         /* req_message */
104         { NULL, { PMDA_PMID(11,19), PM_TYPE_U32, PM_INDOM_NULL, PM_SEM_COUNTER,
105                 PMDA_PMUNITS(0,0,1,0,0,PM_COUNT_ONE) }, },
106         /* req_control */
107         { NULL, { PMDA_PMID(11,20), PM_TYPE_U32, PM_INDOM_NULL, PM_SEM_COUNTER,
108                 PMDA_PMUNITS(0,0,1,0,0,PM_COUNT_ONE) }, },
109         /* call */
110         { NULL, { PMDA_PMID(12,21), PM_TYPE_U32, PM_INDOM_NULL, PM_SEM_COUNTER,
111                 PMDA_PMUNITS(0,0,1,0,0,0) }, },
112         /* control */
113         { NULL, { PMDA_PMID(12,22), PM_TYPE_U32, PM_INDOM_NULL, PM_SEM_COUNTER,
114                 PMDA_PMUNITS(0,0,1,0,0,0) }, },
115         /* traverse */
116         { NULL, { PMDA_PMID(12,23), PM_TYPE_U32, PM_INDOM_NULL, PM_SEM_COUNTER,
117                 PMDA_PMUNITS(0,0,1,0,0,0) }, },
118         /* total_calls */
119         { NULL, { PMDA_PMID(13,24), PM_TYPE_U32, PM_INDOM_NULL, PM_SEM_COUNTER,
120                 PMDA_PMUNITS(0,0,1,0,0,PM_COUNT_ONE) }, },
121         /* pending_calls */
122         { NULL, { PMDA_PMID(14,25), PM_TYPE_U32, PM_INDOM_NULL, PM_SEM_INSTANT,
123                 PMDA_PMUNITS(0,0,0,0,0,0) }, },
124         /* locks.num_calls */
125         { NULL, { PMDA_PMID(15,27), PM_TYPE_U32, PM_INDOM_NULL, PM_SEM_COUNTER,
126                 PMDA_PMUNITS(0,0,1,0,0,PM_COUNT_ONE) }, },
127         /* locks.pending_calls */
128         { NULL, { PMDA_PMID(16,27), PM_TYPE_U32, PM_INDOM_NULL, PM_SEM_INSTANT,
129                 PMDA_PMUNITS(0,0,0,0,0,0) }, },
130         /* childwrite_calls */
131         { NULL, { PMDA_PMID(17,28), PM_TYPE_U32, PM_INDOM_NULL, PM_SEM_COUNTER,
132                 PMDA_PMUNITS(0,0,1,0,0,PM_COUNT_ONE) }, },
133         /* pending_childwrite_calls */
134         { NULL, { PMDA_PMID(18,29), PM_TYPE_U32, PM_INDOM_NULL, PM_SEM_INSTANT,
135                 PMDA_PMUNITS(0,0,0,0,0,0) }, },
136         /* memory_used */
137         { NULL, { PMDA_PMID(19,30), PM_TYPE_U32, PM_INDOM_NULL, PM_SEM_INSTANT,
138                 PMDA_PMUNITS(1,0,0,PM_SPACE_BYTE,0,0) }, },
139         /* max_hop_count */
140         { NULL, { PMDA_PMID(20,31), PM_TYPE_U32, PM_INDOM_NULL, PM_SEM_INSTANT,
141                 PMDA_PMUNITS(0,0,0,0,0,0) }, },
142         /* max_reclock_ctdbd */
143         { NULL, { PMDA_PMID(21,32), PM_TYPE_DOUBLE, PM_INDOM_NULL, PM_SEM_INSTANT,
144                 PMDA_PMUNITS(0,1,0,0,PM_TIME_SEC,0) }, },
145         /* max_reclock_recd */
146         { NULL, { PMDA_PMID(22,33), PM_TYPE_DOUBLE, PM_INDOM_NULL, PM_SEM_INSTANT,
147                 PMDA_PMUNITS(0,1,0,0,PM_TIME_SEC,0) }, },
148         /* max_call_latency */
149         { NULL, { PMDA_PMID(23,34), PM_TYPE_DOUBLE, PM_INDOM_NULL, PM_SEM_INSTANT,
150                 PMDA_PMUNITS(0,1,0,0,PM_TIME_SEC,0) }, },
151         /* locks.latency.max */
152         { NULL, { PMDA_PMID(24,35), PM_TYPE_DOUBLE, PM_INDOM_NULL, PM_SEM_INSTANT,
153                 PMDA_PMUNITS(0,1,0,0,PM_TIME_SEC,0) }, },
154         /* childwrite_latency.max */
155         { NULL, { PMDA_PMID(25,36), PM_TYPE_DOUBLE, PM_INDOM_NULL, PM_SEM_INSTANT,
156                 PMDA_PMUNITS(0,1,0,0,PM_TIME_SEC,0) }, },
157         /* num_recoveries */
158         { NULL, { PMDA_PMID(26,37), PM_TYPE_U32, PM_INDOM_NULL, PM_SEM_INSTANT,
159                 PMDA_PMUNITS(0,0,0,0,0,0) }, },
160 };
161
/* Event context created by pmda_ctdb_daemon_connect() and passed to ctdb_init(). */
static struct event_context *ev = NULL;

/* Client handle for the local ctdbd; NULL while no connection is established. */
static struct ctdb_context *ctdb = NULL;

/* Reply of the latest statistics control, read by pmda_ctdb_fetch_cb(). */
static struct ctdb_statistics *stats = NULL;
165
/*
 * Read callback installed on the ctdbd queue by pmda_ctdb_daemon_connect().
 *
 * A zero-length read is only reported on stderr; any other read is handed
 * to ctdb_client_read_cb() with the arguments unchanged.
 */
static void
pmda_ctdb_q_read_cb(uint8_t *data, size_t cnt, void *args)
{
        if (cnt != 0) {
                ctdb_client_read_cb(data, cnt, args);
                return;
        }

        /*
         * Nothing is torn down here: pmda_ctdb_fetch() disconnects once
         * its statistics control fails.
         */
        fprintf(stderr, "ctdbd unreachable\n");
}
177
178
179 static int
180 pmda_ctdb_daemon_connect(void)
181 {
182         const char *socket_name;
183         int ret;
184         struct sockaddr_un addr;
185
186         ev = event_context_init(NULL);
187         if (ev == NULL) {
188                 fprintf(stderr, "Failed to init event ctx\n");
189                 return -1;
190         }
191
192         ctdb = ctdb_init(ev);
193         if (ctdb == NULL) {
194                 fprintf(stderr, "Failed to init ctdb\n");
195                 goto err_ev;
196         }
197
198         socket_name = getenv("CTDB_SOCKET");
199         if (socket_name == NULL) {
200                 socket_name = CTDB_PATH;
201         }
202
203         ret = ctdb_set_socketname(ctdb, socket_name);
204         if (ret == -1) {
205                 fprintf(stderr, "ctdb_set_socketname failed - %s\n",
206                                 ctdb_errstr(ctdb));
207                 goto err_ctdb;
208         }
209
210         /*
211          * ctdb_socket_connect() sets a default queue callback handler that
212          * calls exit() if ctdbd is unavailable on recv, use our own wrapper to
213          * work around this
214          */
215
216         memset(&addr, 0, sizeof(addr));
217         addr.sun_family = AF_UNIX;
218         strncpy(addr.sun_path, ctdb->daemon.name, sizeof(addr.sun_path));
219
220         ctdb->daemon.sd = socket(AF_UNIX, SOCK_STREAM, 0);
221         if (ctdb->daemon.sd == -1) {
222                 fprintf(stderr, "Failed to open client socket\n");
223                 goto err_ctdb;
224         }
225
226         set_nonblocking(ctdb->daemon.sd);
227         set_close_on_exec(ctdb->daemon.sd);
228
229         if (connect(ctdb->daemon.sd, (struct sockaddr *)&addr, sizeof(addr)) == -1) {
230                 fprintf(stderr, "Failed to connect to ctdb daemon via %s\n",
231                         ctdb->daemon.name);
232                 goto err_sd;
233         }
234
235         ctdb->daemon.queue = ctdb_queue_setup(ctdb, ctdb, ctdb->daemon.sd,
236                                               CTDB_DS_ALIGNMENT,
237                                               pmda_ctdb_q_read_cb, ctdb,
238                                               "to-ctdbd");
239         if (ctdb->daemon.queue == NULL) {
240                 fprintf(stderr, "Failed to setup queue\n");
241                 goto err_sd;
242         }
243
244         ctdb->pnn = ctdb_ctrl_getpnn(ctdb, timeval_current_ofs(3, 0),
245                                      CTDB_CURRENT_NODE);
246         if (ctdb->pnn == (uint32_t)-1) {
247                 fprintf(stderr, "Failed to get ctdb pnn\n");
248                 goto err_sd;
249         }
250
251         return 0;
252 err_sd:
253         close(ctdb->daemon.sd);
254 err_ctdb:
255         talloc_free(ctdb);
256 err_ev:
257         talloc_free(ev);
258         ctdb = NULL;
259         return -1;
260 }
261
262 static void
263 pmda_ctdb_daemon_disconnect(void)
264 {
265         if (ctdb->methods) {
266                 ctdb->methods->shutdown(ctdb);
267         }
268
269         if (ctdb->daemon.sd != -1) {
270                 close(ctdb->daemon.sd);
271         }
272
273         talloc_free(ctdb);
274         talloc_free(ev);
275         ctdb = NULL;
276 }
277
278 static int
279 fill_node(unsigned int item, pmAtomValue *atom)
280 {
281         switch (item) {
282         case 10:
283                 atom->ul = stats->node.req_call;
284                 break;
285         case 11:
286                 atom->ul = stats->node.reply_call;
287                 break;
288         case 12:
289                 atom->ul = stats->node.req_dmaster;
290                 break;
291         case 13:
292                 atom->ul = stats->node.reply_dmaster;
293                 break;
294         case 14:
295                 atom->ul = stats->node.reply_error;
296                 break;
297         case 15:
298                 atom->ul = stats->node.req_message;
299                 break;
300         case 16:
301                 atom->ul = stats->node.req_control;
302                 break;
303         case 17:
304                 atom->ul = stats->node.reply_control;
305                 break;
306         default:
307                 return PM_ERR_PMID;
308         }
309
310         return 0;
311 }
312
313 static int
314 fill_client(unsigned int item, pmAtomValue *atom)
315 {
316         switch (item) {
317         case 18:
318                 atom->ul = stats->client.req_call;
319                 break;
320         case 19:
321                 atom->ul = stats->client.req_message;
322                 break;
323         case 20:
324                 atom->ul = stats->client.req_control;
325                 break;
326         default:
327                 return PM_ERR_PMID;
328         }
329
330         return 0;
331 }
332
333 static int
334 fill_timeout(unsigned int item, pmAtomValue *atom)
335 {
336         switch (item) {
337         case 21:
338                 atom->ul = stats->timeouts.call;
339                 break;
340         case 22:
341                 atom->ul = stats->timeouts.control;
342                 break;
343         case 23:
344                 atom->ul = stats->timeouts.traverse;
345                 break;
346         default:
347                 return PM_ERR_PMID;
348         }
349
350         return 0;
351 }
352
353 /*
354  * callback provided to pmdaFetch
355  */
356 static int
357 pmda_ctdb_fetch_cb(pmdaMetric *mdesc, unsigned int inst, pmAtomValue *atom)
358 {
359         int ret;
360         __pmID_int *id = (__pmID_int *)&(mdesc->m_desc.pmid);
361
362         if (inst != PM_IN_NULL) {
363                 return PM_ERR_INST;
364         }
365
366         if (stats == NULL) {
367                 fprintf(stderr, "stats not available\n");
368                 ret = PM_ERR_VALUE;
369                 goto err_out;
370         }
371
372
373         switch (id->cluster) {
374         case 0:
375                 atom->ul = stats->num_clients;
376                 break;
377         case 1:
378                 atom->ul = stats->frozen;
379                 break;
380         case 3:
381                 atom->ul = stats->recovering;
382                 break;
383         case 4:
384                 atom->ul = stats->client_packets_sent;
385                 break;
386         case 5:
387                 atom->ul = stats->client_packets_recv;
388                 break;
389         case 6:
390                 atom->ul = stats->node_packets_sent;
391                 break;
392         case 7:
393                 atom->ul = stats->node_packets_recv;
394                 break;
395         case 8:
396                 atom->ul = stats->keepalive_packets_sent;
397                 break;
398         case 9:
399                 atom->ul = stats->keepalive_packets_recv;
400                 break;
401         case 10:
402                 ret = fill_node(id->item, atom);
403                 if (ret) {
404                         goto err_out;
405                 }
406                 break;
407         case 11:
408                 ret = fill_client(id->item, atom);
409                 if (ret) {
410                         goto err_out;
411                 }
412                 break;
413         case 12:
414                 ret = fill_timeout(id->item, atom);
415                 if (ret) {
416                         goto err_out;
417                 }
418                 break;
419         case 13:
420                 atom->ul = stats->total_calls;
421                 break;
422         case 14:
423                 atom->ul = stats->pending_calls;
424                 break;
425         case 15:
426                 atom->ul = stats->locks.num_calls;
427                 break;
428         case 16:
429                 atom->ul = stats->locks.num_pending;
430                 break;
431         case 17:
432                 atom->ul = stats->childwrite_calls;
433                 break;
434         case 18:
435                 atom->ul = stats->pending_childwrite_calls;
436                 break;
437         case 19:
438                 atom->ul = stats->memory_used;
439                 break;
440         case 20:
441                 atom->ul = stats->max_hop_count;
442                 break;
443         case 21:
444                 atom->d = stats->reclock.ctdbd.max;
445                 break;
446         case 22:
447                 atom->d = stats->reclock.recd.max;
448                 break;
449         case 23:
450                 atom->d = stats->call_latency.max;
451                 break;
452         case 24:
453                 atom->d = stats->locks.latency.max;
454                 break;
455         case 25:
456                 atom->d = stats->childwrite_latency.max;
457                 break;
458         case 26:
459                 atom->d = stats->num_recoveries;
460                 break;
461         default:
462                 return PM_ERR_PMID;
463         }
464
465         ret = 0;
466 err_out:
467         return ret;
468 }
469
470 /*
471  * This routine is called once for each pmFetch(3) operation, so is a
472  * good place to do once-per-fetch functions, such as value caching or
473  * instance domain evaluation.
474  */
475 static int
476 pmda_ctdb_fetch(int numpmid, pmID pmidlist[], pmResult **resp, pmdaExt *pmda)
477 {
478         int ret;
479         TDB_DATA data;
480         int32_t res;
481         struct timeval ctdb_timeout;
482
483         if (ctdb == NULL) {
484                 fprintf(stderr, "attempting reconnect to ctdbd\n");
485                 ret = pmda_ctdb_daemon_connect();
486                 if (ret < 0) {
487                         fprintf(stderr, "reconnect failed\n");
488                         return PM_ERR_VALUE;
489                 }
490         }
491
492         ctdb_timeout = timeval_current_ofs(1, 0);
493         ret = ctdb_control(ctdb, ctdb->pnn, 0,
494                            CTDB_CONTROL_STATISTICS, 0, tdb_null,
495                            ctdb, &data, &res, &ctdb_timeout, NULL);
496
497         if (ret != 0 || res != 0) {
498                 fprintf(stderr, "ctdb control for statistics failed, reconnecting\n");
499                 pmda_ctdb_daemon_disconnect();
500                 ret = PM_ERR_VALUE;
501                 goto err_out;
502         }
503
504         stats = (struct ctdb_statistics *)data.dptr;
505
506         if (data.dsize != sizeof(struct ctdb_statistics)) {
507                 fprintf(stderr, "incorrect statistics size %zu - not %zu\n",
508                         data.dsize, sizeof(struct ctdb_statistics));
509                 ret = PM_ERR_VALUE;
510                 goto err_stats;
511         }
512
513         ret = pmdaFetch(numpmid, pmidlist, resp, pmda);
514
515 err_stats:
516         talloc_free(stats);
517 err_out:
518         return ret;
519 }
520
521 /*
522  * Initialise the agent
523  */
524 void
525 pmda_ctdb_init(pmdaInterface *dp)
526 {
527         if (dp->status != 0) {
528                 return;
529         }
530
531         dp->version.two.fetch = pmda_ctdb_fetch;
532         pmdaSetFetchCallBack(dp, pmda_ctdb_fetch_cb);
533
534         pmdaInit(dp, NULL, 0, metrictab,
535                  (sizeof(metrictab) / sizeof(metrictab[0])));
536 }
537
/*
 * Return the path of the help text file, "<PCP_PMDAS_DIR>/ctdb/help".
 *
 * The path is formatted into a static buffer the first time it is needed
 * (while the buffer is still empty) and the same buffer is returned on
 * every call.
 */
static char *
helpfile(void)
{
        static char path[MAXPATHLEN];

        if (path[0] == '\0') {
                snprintf(path, sizeof(path), "%s/ctdb/help",
                         pmGetConfig("PCP_PMDAS_DIR"));
        }

        return path;
}
549
550 static void
551 usage(void)
552 {
553         fprintf(stderr, "Usage: %s [options]\n\n", pmProgname);
554         fputs("Options:\n"
555           "  -d domain        use domain (numeric) for metrics domain of PMDA\n"
556           "  -l logfile       write log into logfile rather than using default log name\n"
557           "\nExactly one of the following options may appear:\n"
558           "  -i port          expect PMCD to connect on given inet port (number or name)\n"
559           "  -p               expect PMCD to supply stdin/stdout (pipe)\n"
560           "  -u socket        expect PMCD to connect on given unix domain socket\n",
561           stderr);
562         exit(1);
563 }
564
565 /*
566  * Set up the agent if running as a daemon.
567  */
568 int
569 main(int argc, char **argv)
570 {
571         int err = 0;
572         char log_file[] = "pmda_ctdb.log";
573         pmdaInterface dispatch;
574
575         __pmSetProgname(argv[0]);
576
577         pmdaDaemon(&dispatch, PMDA_INTERFACE_2, pmProgname, CTDB,
578                    log_file, helpfile());
579
580         if (pmdaGetOpt(argc, argv, "d:i:l:pu:?", &dispatch, &err) != EOF) {
581                 err++;
582         }
583
584         if (err) {
585                 usage();
586         }
587
588         pmdaOpenLog(&dispatch);
589         pmda_ctdb_init(&dispatch);
590         pmdaConnect(&dispatch);
591         pmdaMain(&dispatch);
592
593         exit(0);
594 }
595