ctdb: cluster mutex helper using Ceph RADOS
author David Disseldorp <ddiss@samba.org>
Thu, 1 Dec 2016 12:33:22 +0000 (13:33 +0100)
committer Amitay Isaacs <amitay@samba.org>
Fri, 9 Dec 2016 03:10:20 +0000 (04:10 +0100)
ctdb_mutex_ceph_rados_helper implements the cluster mutex helper API
atop Ceph using the librados rados_lock_exclusive()/rados_unlock()
functionality.

Once configured, split brain avoidance during CTDB recovery will be
handled using locks against an object located in a Ceph RADOS pool.

Signed-off-by: David Disseldorp <ddiss@samba.org>
Reviewed-by: Amitay Isaacs <amitay@gmail.com>
ctdb/utils/ceph/ctdb_mutex_ceph_rados_helper.c [new file with mode: 0644]
ctdb/wscript

diff --git a/ctdb/utils/ceph/ctdb_mutex_ceph_rados_helper.c b/ctdb/utils/ceph/ctdb_mutex_ceph_rados_helper.c
new file mode 100644 (file)
index 0000000..326a0b0
--- /dev/null
@@ -0,0 +1,328 @@
+/*
+   CTDB mutex helper using Ceph librados locks
+
+   Copyright (C) David Disseldorp 2016
+
+   Based on ctdb_mutex_fcntl_helper.c, which is:
+   Copyright (C) Martin Schwenke 2015
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "replace.h"
+
+#include "tevent.h"
+#include "talloc.h"
+#include "rados/librados.h"
+
+#define CTDB_MUTEX_CEPH_LOCK_NAME      "ctdb_reclock_mutex"    /* lock name handed to rados_lock_exclusive() / rados_unlock() */
+#define CTDB_MUTEX_CEPH_LOCK_COOKIE    CTDB_MUTEX_CEPH_LOCK_NAME       /* the cookie reuses the lock name */
+#define CTDB_MUTEX_CEPH_LOCK_DESC      "CTDB recovery lock"    /* description handed to rados_lock_exclusive() */
+
+#define CTDB_MUTEX_STATUS_HOLDING "0"  /* written to stdout once the lock has been obtained */
+#define CTDB_MUTEX_STATUS_CONTENDED "1"        /* written to stdout when locking fails with -EEXIST or -EBUSY */
+#define CTDB_MUTEX_STATUS_TIMEOUT "2"  /* not emitted by this helper */
+#define CTDB_MUTEX_STATUS_ERROR "3"    /* written to stdout for allocation, event setup, Ceph
+                                        * connection and unexpected lock failures */
+
+static char *progname = NULL;  /* set to argv[0] in main(); prefixes stderr diagnostics */
+
+/*
+ * Create a Ceph cluster handle for ceph_cluster_name / ceph_auth_name, load
+ * the Ceph configuration, connect, and open an I/O context on pool_name.
+ *
+ * On success 0 is returned and the handles are stored in *_ceph_cluster and
+ * *_ioctx.  On failure a message is written to stderr, any cluster handle
+ * created so far is shut down, the output parameters are left untouched and
+ * the negative librados error code is returned.
+ */
+static int ctdb_mutex_rados_ctx_create(const char *ceph_cluster_name,
+                                      const char *ceph_auth_name,
+                                      const char *pool_name,
+                                      rados_t *_ceph_cluster,
+                                      rados_ioctx_t *_ioctx)
+{
+       rados_t cluster = NULL;
+       rados_ioctx_t pool_ioctx = NULL;
+       int rc;
+
+       rc = rados_create2(&cluster, ceph_cluster_name, ceph_auth_name, 0);
+       if (rc < 0) {
+               /* nothing to shut down yet */
+               fprintf(stderr, "%s: failed to initialise Ceph cluster %s as %s"
+                       " - (%s)\n", progname, ceph_cluster_name, ceph_auth_name,
+                       strerror(-rc));
+               return rc;
+       }
+
+       /* a NULL path lets librados look in its default config locations */
+       rc = rados_conf_read_file(cluster, NULL);
+       if (rc < 0) {
+               fprintf(stderr, "%s: failed to parse Ceph cluster config"
+                       " - (%s)\n", progname, strerror(-rc));
+               goto fail_shutdown;
+       }
+
+       rc = rados_connect(cluster);
+       if (rc < 0) {
+               fprintf(stderr, "%s: failed to connect to Ceph cluster %s as %s"
+                       " - (%s)\n", progname, ceph_cluster_name, ceph_auth_name,
+                       strerror(-rc));
+               goto fail_shutdown;
+       }
+
+       rc = rados_ioctx_create(cluster, pool_name, &pool_ioctx);
+       if (rc < 0) {
+               fprintf(stderr, "%s: failed to create Ceph ioctx for pool %s"
+                       " - (%s)\n", progname, pool_name, strerror(-rc));
+               goto fail_shutdown;
+       }
+
+       /* hand both handles over to the caller only once everything worked */
+       *_ceph_cluster = cluster;
+       *_ioctx = pool_ioctx;
+       return 0;
+
+fail_shutdown:
+       rados_shutdown(cluster);
+       return rc;
+}
+
+/*
+ * Release the I/O context and cluster handle obtained from
+ * ctdb_mutex_rados_ctx_create().
+ *
+ * NULL handles are skipped: the helper state is zero-initialised and the
+ * handles are only filled in once context creation has succeeded, so a
+ * cleanup path that runs in an unexpected state must not hand NULL to
+ * librados.
+ */
+static void ctdb_mutex_rados_ctx_destroy(rados_t ceph_cluster,
+                                        rados_ioctx_t ioctx)
+{
+       /* destroy the ioctx first, then the cluster handle it was created from */
+       if (ioctx != NULL) {
+               rados_ioctx_destroy(ioctx);
+       }
+       if (ceph_cluster != NULL) {
+               rados_shutdown(ceph_cluster);
+       }
+}
+
+/*
+ * Try to take the exclusive CTDB recovery lock on RADOS object oid.
+ *
+ * The I/O context handle is taken by value: rados_ioctx_t is the handle
+ * type that callers hold and that rados_lock_exclusive() is given.
+ *
+ * Returns 0 when the lock was obtained.  -EEXIST and -EBUSY are passed back
+ * silently so that the caller can report contention; any other negative
+ * error is logged to stderr before being returned.
+ */
+static int ctdb_mutex_rados_lock(rados_ioctx_t ioctx,
+                                const char *oid)
+{
+       int ret;
+
+       ret = rados_lock_exclusive(ioctx, oid,
+                                  CTDB_MUTEX_CEPH_LOCK_NAME,
+                                  CTDB_MUTEX_CEPH_LOCK_COOKIE,
+                                  CTDB_MUTEX_CEPH_LOCK_DESC,
+                                  NULL, /* infinite duration */
+                                  0);
+       if (ret >= 0) {
+               /* lock obtained */
+               return 0;
+       }
+
+       if ((ret != -EEXIST) && (ret != -EBUSY)) {
+               /* unexpected failure, as opposed to plain lock contention */
+               fprintf(stderr,
+                       "%s: Failed to get lock on RADOS object '%s' - (%s)\n",
+                       progname, oid, strerror(-ret));
+       }
+
+       return ret;
+}
+
+/*
+ * Drop the CTDB recovery lock held on RADOS object oid.
+ *
+ * The I/O context handle is taken by value: rados_ioctx_t is the handle
+ * type that callers hold and that rados_unlock() is given.
+ *
+ * Returns 0 on success.  On failure the error is logged to stderr and the
+ * negative librados error code is returned.
+ */
+static int ctdb_mutex_rados_unlock(rados_ioctx_t ioctx,
+                                  const char *oid)
+{
+       int ret;
+
+       ret = rados_unlock(ioctx, oid,
+                          CTDB_MUTEX_CEPH_LOCK_NAME,
+                          CTDB_MUTEX_CEPH_LOCK_COOKIE);
+       if (ret < 0) {
+               fprintf(stderr,
+                       "%s: Failed to drop lock on RADOS object '%s' - (%s)\n",
+                       progname, oid, strerror(-ret));
+               return ret;
+       }
+
+       return 0;
+}
+
+/*
+ * State shared between main() and the tevent callbacks.  It is the talloc
+ * parent of the event context and is passed to both callbacks as their
+ * private data.
+ */
+struct ctdb_mutex_rados_state {
+       bool holding_mutex;             /* set once the RADOS lock was obtained */
+       const char *ceph_cluster_name;  /* argv[1] */
+       const char *ceph_auth_name;     /* argv[2] */
+       const char *pool_name;          /* argv[3] */
+       const char *object;             /* argv[4]: RADOS object that is locked */
+       pid_t ppid;                     /* getppid() at startup, probed via kill() */
+       struct tevent_context *ev;
+       struct tevent_signal *sig_ev;   /* SIGTERM handler */
+       struct tevent_timer *timer_ev;  /* periodic parent liveness check */
+       rados_t ceph_cluster;           /* filled in by ctdb_mutex_rados_ctx_create() */
+       rados_ioctx_t ioctx;            /* filled in by ctdb_mutex_rados_ctx_create() */
+};
+
+/*
+ * SIGTERM handler: drop the lock if it is held, tear down the Ceph context,
+ * free the helper state and terminate the process.  This function never
+ * returns.  The exit status is 0 when the unlock succeeded and 1 when it
+ * failed or the handler ran without the mutex being held.
+ */
+static void ctdb_mutex_rados_sigterm_cb(struct tevent_context *ev,
+                                       struct tevent_signal *se,
+                                       int signum,
+                                       int count,
+                                       void *siginfo,
+                                       void *private_data)
+{
+       struct ctdb_mutex_rados_state *state = private_data;
+       int status;
+
+       if (state->holding_mutex) {
+               status = ctdb_mutex_rados_unlock(state->ioctx, state->object);
+       } else {
+               fprintf(stderr, "Sigterm callback invoked without mutex!\n");
+               status = -EINVAL;
+       }
+
+       /* shared teardown for both outcomes */
+       ctdb_mutex_rados_ctx_destroy(state->ceph_cluster, state->ioctx);
+       talloc_free(state);
+       exit((status != 0) ? 1 : 0);
+}
+
+/*
+ * Timer handler: check whether the parent process still exists.  While it
+ * does, re-arm the timer for another 5 seconds and return.  Once kill()
+ * reports ESRCH for the parent, drop the lock, tear down the Ceph context,
+ * free the helper state and exit (status 0 if the unlock succeeded, 1
+ * otherwise).  Running without the mutex held is treated as an error and
+ * also ends the process with status 1.
+ */
+static void ctdb_mutex_rados_timer_cb(struct tevent_context *ev,
+                                     struct tevent_timer *te,
+                                     struct timeval current_time,
+                                     void *private_data)
+{
+       struct ctdb_mutex_rados_state *state = private_data;
+       bool parent_gone;
+       int status;
+
+       if (!state->holding_mutex) {
+               fprintf(stderr, "Timer callback invoked without mutex!\n");
+               status = -EINVAL;
+       } else {
+               /* only an ESRCH failure from kill() counts as "parent ended" */
+               parent_gone = (kill(state->ppid, 0) != 0) && (errno == ESRCH);
+               if (!parent_gone) {
+                       /* parent still around, schedule the next check */
+                       state->timer_ev = tevent_add_timer(state->ev, state,
+                                               tevent_timeval_current_ofs(5, 0),
+                                               ctdb_mutex_rados_timer_cb,
+                                               state);
+                       if (state->timer_ev == NULL) {
+                               fprintf(stderr, "Failed to create timer event\n");
+                               /* rely on signal cb */
+                       }
+                       return;
+               }
+
+               /* parent ended, drop lock and exit */
+               status = ctdb_mutex_rados_unlock(state->ioctx, state->object);
+       }
+
+       ctdb_mutex_rados_ctx_destroy(state->ceph_cluster, state->ioctx);
+       talloc_free(state);
+       exit((status != 0) ? 1 : 0);
+}
+
+/*
+ * Usage: <progname> <Ceph Cluster> <Ceph user> <RADOS pool> <RADOS object>
+ *
+ * Connects to the Ceph cluster, tries to take the exclusive recovery lock on
+ * the given object and writes one CTDB_MUTEX_STATUS_* character to stdout.
+ * While the lock is held the process sits in the tevent loop; the SIGTERM
+ * and timer callbacks release the lock and exit.
+ *
+ * Returns 0 on success and 1 on any failure, including lock contention.
+ */
+int main(int argc, char *argv[])
+{
+       int ret;
+       int unlock_ret;
+       struct ctdb_mutex_rados_state *cmr_state;
+
+       progname = argv[0];
+
+       if (argc != 5) {
+               fprintf(stderr, "Usage: %s <Ceph Cluster> <Ceph user> "
+                               "<RADOS pool> <RADOS object>\n",
+                       progname);
+               ret = -EINVAL;
+               goto err_out;
+       }
+
+       /* the status character is written without a newline, so don't buffer it */
+       ret = setvbuf(stdout, NULL, _IONBF, 0);
+       if (ret != 0) {
+               fprintf(stderr, "Failed to configure unbuffered stdout I/O\n");
+       }
+
+       cmr_state = talloc_zero(NULL, struct ctdb_mutex_rados_state);
+       if (cmr_state == NULL) {
+               fprintf(stdout, CTDB_MUTEX_STATUS_ERROR);
+               ret = -ENOMEM;
+               goto err_out;
+       }
+
+       cmr_state->ceph_cluster_name = argv[1];
+       cmr_state->ceph_auth_name = argv[2];
+       cmr_state->pool_name = argv[3];
+       cmr_state->object = argv[4];
+
+       cmr_state->ppid = getppid();
+       if (cmr_state->ppid == 1) {
+               /*
+                * The original parent is gone and the process has
+                * been reparented to init.  This can happen if the
+                * helper is started just as the parent is killed
+                * during shutdown.  The error message doesn't need to
+                * be stellar, since there won't be anything around to
+                * capture and log it...
+                */
+               fprintf(stderr, "%s: PPID == 1\n", progname);
+               ret = -EPIPE;
+               goto err_state_free;
+       }
+
+       cmr_state->ev = tevent_context_init(cmr_state);
+       if (cmr_state->ev == NULL) {
+               fprintf(stderr, "tevent_context_init failed\n");
+               fprintf(stdout, CTDB_MUTEX_STATUS_ERROR);
+               ret = -ENOMEM;
+               goto err_state_free;
+       }
+
+       /* wait for sigterm */
+       cmr_state->sig_ev = tevent_add_signal(cmr_state->ev, cmr_state, SIGTERM, 0,
+                                             ctdb_mutex_rados_sigterm_cb,
+                                             cmr_state);
+       if (cmr_state->sig_ev == NULL) {
+               fprintf(stderr, "Failed to create signal event\n");
+               fprintf(stdout, CTDB_MUTEX_STATUS_ERROR);
+               ret = -ENOMEM;
+               goto err_state_free;
+       }
+
+       /* periodically check parent */
+       cmr_state->timer_ev = tevent_add_timer(cmr_state->ev, cmr_state,
+                                              tevent_timeval_current_ofs(5, 0),
+                                              ctdb_mutex_rados_timer_cb,
+                                              cmr_state);
+       if (cmr_state->timer_ev == NULL) {
+               fprintf(stderr, "Failed to create timer event\n");
+               fprintf(stdout, CTDB_MUTEX_STATUS_ERROR);
+               ret = -ENOMEM;
+               goto err_state_free;
+       }
+
+       ret = ctdb_mutex_rados_ctx_create(cmr_state->ceph_cluster_name,
+                                         cmr_state->ceph_auth_name,
+                                         cmr_state->pool_name,
+                                         &cmr_state->ceph_cluster,
+                                         &cmr_state->ioctx);
+       if (ret < 0) {
+               fprintf(stdout, CTDB_MUTEX_STATUS_ERROR);
+               goto err_state_free;
+       }
+
+       ret = ctdb_mutex_rados_lock(cmr_state->ioctx, cmr_state->object);
+       if ((ret == -EEXIST) || (ret == -EBUSY)) {
+               fprintf(stdout, CTDB_MUTEX_STATUS_CONTENDED);
+               goto err_ctx_cleanup;
+       } else if (ret < 0) {
+               fprintf(stdout, CTDB_MUTEX_STATUS_ERROR);
+               goto err_ctx_cleanup;
+       }
+
+       cmr_state->holding_mutex = true;
+       fprintf(stdout, CTDB_MUTEX_STATUS_HOLDING);
+
+       /* wait for the signal / timer events to do their work */
+       ret = tevent_loop_wait(cmr_state->ev);
+
+       /*
+        * Both callbacks exit() the process after dropping the lock, so
+        * control only comes back here if the loop failed or had nothing
+        * left to wait for.  The lock was requested with an infinite
+        * duration, so release it explicitly instead of leaving it behind
+        * when the process ends.
+        */
+       unlock_ret = ctdb_mutex_rados_unlock(cmr_state->ioctx,
+                                            cmr_state->object);
+       cmr_state->holding_mutex = false;
+       if (ret == 0) {
+               /* the loop itself was fine, so report the unlock result */
+               ret = unlock_ret;
+       }
+err_ctx_cleanup:
+       ctdb_mutex_rados_ctx_destroy(cmr_state->ceph_cluster,
+                                    cmr_state->ioctx);
+err_state_free:
+       talloc_free(cmr_state);
+err_out:
+       return ret ? 1 : 0;
+}
index d7b189172a14538313704c9fc38955672552f7c0..59bd8e2e7d5f90e608520536947626f83e6a7bf5 100644 (file)
@@ -79,6 +79,9 @@ def set_options(opt):
     opt.add_option('--enable-etcd-reclock',
                    help=("Enable etcd recovery lock helper (default=no)"),
                    action="store_true", dest='ctdb_etcd_reclock', default=False)
+    opt.add_option('--enable-ceph-reclock',
+                   help=("Enable Ceph CTDB recovery lock helper (default=no)"),
+                   action="store_true", dest='ctdb_ceph_reclock', default=False)
 
     opt.add_option('--with-logdir',
                    help=("Path to log directory"),
@@ -201,6 +204,15 @@ def configure(conf):
         Logs.info('Building with etcd support')
     conf.env.etcd_reclock = have_etcd_reclock
 
+    if Options.options.ctdb_ceph_reclock:
+        if (conf.CHECK_HEADERS('rados/librados.h', False, False, 'rados') and
+                                       conf.CHECK_LIB('rados', shlib=True)):
+            Logs.info('Building with Ceph librados recovery lock support')
+            conf.define('HAVE_LIBRADOS', 1)
+        else:
+            Logs.error("Missing librados for Ceph recovery lock support")
+            sys.exit(1)
+
     conf.env.CTDB_BINDIR = os.path.join(conf.env.EXEC_PREFIX, 'bin')
     conf.env.CTDB_ETCDIR = os.path.join(conf.env.SYSCONFDIR, 'ctdb')
     conf.env.CTDB_VARDIR = os.path.join(conf.env.LOCALSTATEDIR, 'lib/ctdb')
@@ -540,6 +552,13 @@ def build(bld):
         bld.INSTALL_FILES('${CTDB_PMDADIR}', 'utils/pmda/README',
                           destname='README')
 
+    if bld.env.HAVE_LIBRADOS:
+        bld.SAMBA_BINARY('ctdb_mutex_ceph_rados_helper',
+                         source='utils/ceph/ctdb_mutex_ceph_rados_helper.c',
+                        deps='talloc tevent rados',
+                        includes='include',
+                        install_path='${CTDB_HELPER_BINDIR}')
+
     sed_expr1 = 's|/usr/local/var/lib/ctdb|%s|g'  % (bld.env.CTDB_VARDIR)
     sed_expr2 = 's|/usr/local/etc/ctdb|%s|g'      % (bld.env.CTDB_ETCDIR)
     sed_expr3 = 's|/usr/local/var/log|%s|g'       % (bld.env.CTDB_LOGDIR)